/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 */

/*
 * MAC Services Module
 *
 * The GLDv3 framework locking - The MAC layer
 * --------------------------------------------
 *
 * The MAC layer is central to the GLD framework and can provide the locking
 * framework needed for itself and for the use of MAC clients. MAC end points
 * are fairly disjoint and don't share a lot of state. So a coarse-grained
 * multi-threading scheme is to single-thread all create/modify/delete (set)
 * type control operations on a per mac end point basis while allowing data
 * threads to proceed concurrently.
 *
 * Control operations (set) that modify a mac end point are always serialized
 * on a per mac end point basis; we have at most one such thread per mac end
 * point at a time.
 *
 * All other operations that are not serialized are essentially multi-threaded.
 * For example, a control operation (get) such as reading statistics may not
 * care about reading values atomically, and data threads may send or receive
 * data concurrently. Mostly these types of operations don't modify the
 * control state. Any state these operations care about is protected using
 * traditional locks.
 *
 * The perimeter only serializes serial operations. It does not imply there
 * aren't any other concurrent operations. However a serialized operation may
 * sometimes need to make sure it is the only thread. In this case it needs
 * to use reference counting mechanisms to cv_wait until any current data
 * threads are done.
 *
 * The mac layer itself does not hold any locks across a call to another layer.
 * The perimeter is however held across a down call to the driver to make the
 * whole control operation atomic with respect to other control operations.
 * Also the data path and get type control operations may proceed concurrently.
 * These operations synchronize with the single serial operation on a given mac
 * end point using regular locks. The perimeter ensures that conflicting
 * operations like say a mac_multicast_add and a mac_multicast_remove on the
 * same mac end point don't interfere with each other and also ensures that the
 * changes in the mac layer and the call to the underlying driver to say add a
 * multicast address are done atomically without interference from a thread
 * trying to delete the same address.
 *
 * For example, consider
 * mac_multicast_add()
 * {
 *	mac_perimeter_enter();	serialize all control operations
 *
 *	grab list lock		protect against access by data threads
 *	add to list
 *	drop list lock
 *
 *	call driver's mi_multicst
 *
 *	mac_perimeter_exit();
 * }
 *
 * To lessen the number of serialization locks and simplify the lock hierarchy,
 * we serialize all the control operations on a per mac end point by using a
 * single serialization lock called the perimeter. We allow recursive entry into
 * the perimeter to facilitate use of this mechanism by both the mac client and
 * the MAC layer itself.
 *
 * MAC client means an entity that does an operation on a mac handle
 * obtained from a mac_open/mac_client_open. Similarly MAC driver means
 * an entity that does an operation on a mac handle obtained from a
 * mac_register. An entity could be both client and driver but on different
 * handles (e.g. aggr) and should only make the corresponding mac interface
 * calls, i.e. mac driver interface or mac client interface, as appropriate
 * for that mac handle.
 *
 * General rules.
 * -------------
 *
 * R1. The lock order of upcall threads is naturally opposite to downcall
 * threads. Hence upcalls must not hold any locks across layers for fear of
 * recursive lock enter and lock order violation. This applies to all layers.
 *
 * R2. The perimeter is just another lock. Since it is held in the down
 * direction, acquiring the perimeter in an upcall is prohibited as it would
 * cause a deadlock. This applies to all layers.
 *
 * Note that upcalls that need to grab the mac perimeter (for example
 * mac_notify upcalls) can still achieve that by posting the request to a
 * thread, which can then grab all the required perimeters and locks in the
 * right global order. Note that in the above example the mac layer itself
 * won't grab the mac perimeter in the mac_notify upcall, instead the upcall
 * to the client must do that. Please see the aggr code for an example.
 *
 * MAC client rules
 * ----------------
 *
 * R3. A MAC client may use the MAC provided perimeter facility to serialize
 * control operations on a per mac end point. It does this by acquiring
 * and holding the perimeter across a sequence of calls to the mac layer.
 * This ensures atomicity across the entire block of mac calls. In this
 * model the MAC client must not hold any client locks across the calls to
 * the mac layer. This model is the preferred solution. (A sketch of this
 * model follows R4 below.)
 *
 * R4. However if a MAC client has a lot of global state across all mac end
 * points the per mac end point serialization may not be sufficient. In this
 * case the client may choose to use global locks or use its own serialization.
 * To avoid deadlocks, these client layer locks held across the mac calls
 * in the control path must never be acquired by the data path for the reason
 * mentioned below.
 *
 * (Assume that a control operation that holds a client lock blocks in the
 * mac layer waiting for upcall reference counts to drop to zero. If an upcall
 * data thread that holds this reference count tries to acquire the same
 * client lock subsequently it will deadlock).
 *
 * A MAC client may follow either the R3 model or the R4 model, but can't
 * mix both. In the former, the hierarchy is Perim -> client locks, but in
 * the latter it is client locks -> Perim.
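 *
 * As an illustrative sketch of the R3 model (the calls bracketed by the
 * perimeter are placeholders for whatever sequence of mac calls must be
 * atomic; no client locks are held across them):
 *
 *	mac_perim_handle_t mph;
 *
 *	mac_perim_enter_by_mh(mh, &mph);
 *	... a sequence of mac calls, e.g. a mac_unicast_add() followed
 *	    by a mac_multicast_add(), made atomic as a block ...
 *	mac_perim_exit(mph);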
 *
 * R5. MAC clients must make MAC calls (excluding data calls) in a cv_wait'able
 * context since they may block while trying to acquire the perimeter.
 * In addition some calls may block waiting for upcall refcnts to come down to
 * zero.
 *
 * R6. MAC clients must make sure that they are single threaded and all threads
 * from the top (in particular data threads) have finished before calling
 * mac_client_close. The MAC framework does not track the number of client
 * threads using the mac client handle. Also mac clients must make sure
 * they have undone all the control operations before calling mac_client_close.
 * For example mac_unicast_remove/mac_multicast_remove to undo the corresponding
 * mac_unicast_add/mac_multicast_add.
 *
 * MAC framework rules
 * -------------------
 *
 * R7. The mac layer itself must not hold any mac layer locks (except the mac
 * perimeter) across a call to any other layer from the mac layer. The call to
 * any other layer could be via mi_* entry points, classifier entry points into
 * the driver or via upcall pointers into layers above. The mac perimeter may
 * be acquired or held only in the down direction, e.g. when calling into
 * a mi_* driver entry point to provide atomicity of the operation.
 *
 * R8. Since it is not guaranteed (see R14) that drivers won't hold locks across
 * mac driver interfaces, the MAC layer must provide a cut out for control
 * interfaces like upcall notifications and start them in a separate thread.
 *
 * R9. Note that locking order also implies a plumbing order. For example
 * VNICs are allowed to be created over aggrs, but not vice-versa. An attempt
 * to plumb in any other order must be failed at mac_open time, otherwise it
 * could lead to deadlocks due to inverse locking order.
 *
 * R10. MAC driver interfaces must not block since the driver could call them
 * in interrupt context.
 *
 * R11. Walkers must preferably not hold any locks while calling walker
 * callbacks. Instead these can operate on reference counts. In simple
 * callbacks it may be ok to hold a lock and call the callbacks, but this is
 * harder to maintain in the general case of arbitrary callbacks.
 *
 * R12. The MAC layer must protect upcall notification callbacks using reference
 * counts rather than holding locks across the callbacks.
 *
 * R13. Given the variety of drivers, it is preferable if the MAC layer can make
 * sure that any pointers (such as mac ring pointers) it passes to the driver
 * remain valid until mac unregister time. Currently the mac layer achieves
 * this by using generation numbers for rings and freeing the mac rings only
 * at unregister time.  The MAC layer must provide a layer of indirection and
 * must not expose underlying driver rings or driver data structures/pointers
 * directly to MAC clients.
 *
 * MAC driver rules
 * ----------------
 *
 * R14. It would be preferable if MAC drivers don't hold any locks across any
 * mac call. However at a minimum they must not hold any locks across data
 * upcalls. They must also make sure that all references to mac data structures
 * are cleaned up and that it is single threaded at mac_unregister time.
 *
 * R15. MAC driver interfaces don't block and so the action may be done
 * asynchronously in a separate thread as for example handling notifications.
 * The driver must not assume that the action is complete when the call
 * returns.
 *
 * R16. Drivers must maintain a generation number per Rx ring, and pass it
 * back to mac_rx_ring(). They are expected to increment the generation
 * number whenever the ring's stop routine is invoked.
 * See comments in mac_rx_ring().
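 *
 * As a hedged sketch of R16 (the driver-side ring structure and field
 * names below are made up; the real contract is described in
 * mac_rx_ring()):
 *
 *	my_ring_stop(...)
 *	{
 *		... quiesce the hardware ring ...
 *		ringp->my_gen_num++;
 *	}
 *
 *	my_rx_intr(...)
 *	{
 *		... collect mp_chain from the hardware ring ...
 *		mac_rx_ring(mh, ringp->my_mac_ring_handle, mp_chain,
 *		    ringp->my_gen_num);
 *	}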
 *
 * R17. Similarly mi_stop is another synchronization point and the driver must
 * ensure that all upcalls are done and there won't be any future upcall
 * before returning from mi_stop.
 *
 * R18. The driver may assume that all set/modify control operations via
 * the mi_* entry points are single threaded on a per mac end point basis.
 *
 * Lock and Perimeter hierarchy scenarios
 * ---------------------------------------
 *
 * i_mac_impl_lock -> mi_rw_lock -> srs_lock -> s_ring_lock[i_mac_tx_srs_notify]
 *
 * ft_lock -> fe_lock [mac_flow_lookup]
 *
 * mi_rw_lock -> fe_lock [mac_bcast_send]
 *
 * srs_lock -> mac_bw_lock [mac_rx_srs_drain_bw]
 *
 * cpu_lock -> mac_srs_g_lock -> srs_lock -> s_ring_lock [mac_walk_srs_and_bind]
 *
 * i_dls_devnet_lock -> mac layer locks [dls_devnet_rename]
 *
 * Perimeters are ordered P1 -> P2 -> P3 from top to bottom in order of mac
 * client to driver. In the case of clients that explicitly use the mac
 * provided perimeter mechanism for their serialization, the hierarchy is
 * Perimeter -> mac layer locks, since the client never holds any locks across
 * the mac calls. In the case of clients that use their own locks the hierarchy
 * is Client locks -> Mac Perim -> Mac layer locks. The client never explicitly
 * calls mac_perim_enter/exit in this case.
 *
 * Subflow creation rules
 * ---------------------------
 * o If a user-specified cpulist is present on both the underlying link and
 * the flow, the flow's cpulist must be a subset of the underlying link's.
 * o If a user-specified fanout mode is present on both link and flow, the
 * subflow fanout count has to be less than or equal to that of the
 * underlying link. The cpu-bindings for the subflows will be a subset of
 * those of the underlying link.
 * o If no cpulist is specified on either the underlying link or the flow,
 * the underlying link relies on a MAC tunable to provide out-of-the-box
 * fanout. The subflow will have no cpulist (the subflow will be unbound).
 * o If no cpulist is specified on the underlying link, a subflow can
 * carry either a user-specified cpulist or fanout count. The cpu-bindings
 * for the subflow will not adhere to the restriction that they need to be
 * a subset of the underlying link's.
 * o If the underlying link is carrying either a user-specified cpulist
 * or fanout mode and the subflow is unspecified, the subflow will be
 * created unbound.
 * o While creating unbound subflows, bandwidth mode changes attempt to
 * figure out an appropriate fanout count. In such cases the fanout count
 * will override the unbound cpu-binding behavior.
 * o In addition to this, while cycling between flow and link properties, we
 * impose a restriction that if a link property has a subflow with
 * user-specified attributes, we will not allow changing the link property.
 * The administrator needs to reset all the user-specified properties for the
 * subflows before attempting a link property change.
 * Some of the above rules can be overridden by specifying additional command
 * line options while creating or modifying link or subflow properties.
 */

#include <sys/types.h>
#include <sys/conf.h>
#include <sys/id_space.h>
#include <sys/esunddi.h>
#include <sys/stat.h>
#include <sys/mkdev.h>
#include <sys/stream.h>
#include <sys/strsun.h>
#include <sys/strsubr.h>
#include <sys/dlpi.h>
#include <sys/list.h>
#include <sys/modhash.h>
#include <sys/mac_provider.h>
#include <sys/mac_client_impl.h>
#include <sys/mac_soft_ring.h>
#include <sys/mac_stat.h>
#include <sys/mac_impl.h>
#include <sys/mac.h>
#include <sys/dls.h>
#include <sys/dld.h>
#include <sys/modctl.h>
#include <sys/fs/dv_node.h>
#include <sys/thread.h>
#include <sys/proc.h>
#include <sys/callb.h>
#include <sys/cpuvar.h>
#include <sys/atomic.h>
#include <sys/bitmap.h>
#include <sys/sdt.h>
#include <sys/mac_flow.h>
#include <sys/ddi_intr_impl.h>
#include <sys/disp.h>
#include <sys/vnic.h>
#include <sys/vnic_impl.h>
#include <sys/vlan.h>
#include <inet/ip.h>
#include <inet/ip6.h>
#include <sys/exacct.h>
#include <sys/exacct_impl.h>
#include <inet/nd.h>
#include <sys/ethernet.h>
#include <sys/pool.h>
#include <sys/pool_pset.h>
#include <sys/cpupart.h>
#include <inet/wifi_ioctl.h>
#include <net/wpa.h>

#define	IMPL_HASHSZ	67	/* prime */

kmem_cache_t		*i_mac_impl_cachep;
mod_hash_t		*i_mac_impl_hash;
krwlock_t		i_mac_impl_lock;
uint_t			i_mac_impl_count;
static kmem_cache_t	*mac_ring_cache;
static id_space_t	*minor_ids;
static uint32_t		minor_count;
static pool_event_cb_t	mac_pool_event_reg;

/*
 * Logging stuff. Perhaps mac_logging_interval could be broken into
 * mac_flow_log_interval and mac_link_log_interval if we want to be
 * able to schedule them differently.
 */
uint_t			mac_logging_interval;
boolean_t		mac_flow_log_enable;
boolean_t		mac_link_log_enable;
timeout_id_t		mac_logging_timer;

/* for debugging, see MAC_DBG_PRT() in mac_impl.h */
int mac_dbg = 0;

#define	MACTYPE_KMODDIR	"mac"
#define	MACTYPE_HASHSZ	67
static mod_hash_t	*i_mactype_hash;
/*
 * i_mactype_lock synchronizes threads that obtain references to mactype_t
 * structures through i_mactype_getplugin().
 */
static kmutex_t		i_mactype_lock;

/*
 * mac_tx_percpu_cnt
 *
 * Number of per cpu locks per mac_client_impl_t. Used by the transmit side
 * in mac_tx to reduce lock contention. This is sized at boot time in mac_init.
 * mac_tx_percpu_cnt_max is settable in /etc/system and must be a power of 2.
 * Per cpu locks may be disabled by setting mac_tx_percpu_cnt_max to 1.
 */
int mac_tx_percpu_cnt;
int mac_tx_percpu_cnt_max = 128;
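
/*
 * For example, to cap the number of per cpu Tx locks at 16, an administrator
 * could add the following (illustrative) line to /etc/system and reboot:
 *
 *	set mac:mac_tx_percpu_cnt_max = 16
 */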

/*
 * Call back functions for the bridge module.  These are guaranteed to be valid
 * when holding a reference on a link or when holding mip->mi_bridge_lock and
 * mi_bridge_link is non-NULL.
 */
mac_bridge_tx_t mac_bridge_tx_cb;
mac_bridge_rx_t mac_bridge_rx_cb;
mac_bridge_ref_t mac_bridge_ref_cb;
mac_bridge_ls_t mac_bridge_ls_cb;

static int i_mac_constructor(void *, void *, int);
static void i_mac_destructor(void *, void *);
static int i_mac_ring_ctor(void *, void *, int);
static void i_mac_ring_dtor(void *, void *);
static mblk_t *mac_rx_classify(mac_impl_t *, mac_resource_handle_t, mblk_t *);
void mac_tx_client_flush(mac_client_impl_t *);
void mac_tx_client_block(mac_client_impl_t *);
static void mac_rx_ring_quiesce(mac_ring_t *, uint_t);
static int mac_start_group_and_rings(mac_group_t *);
static void mac_stop_group_and_rings(mac_group_t *);
static void mac_pool_event_cb(pool_event_t, int, void *);

typedef struct netinfo_s {
	list_node_t	ni_link;
	void		*ni_record;
	int		ni_size;
	int		ni_type;
} netinfo_t;

/*
 * Module initialization functions.
 */

void
mac_init(void)
{
	mac_tx_percpu_cnt = ((boot_max_ncpus == -1) ? max_ncpus :
	    boot_max_ncpus);

	/* Upper bound is mac_tx_percpu_cnt_max */
	if (mac_tx_percpu_cnt > mac_tx_percpu_cnt_max)
		mac_tx_percpu_cnt = mac_tx_percpu_cnt_max;

	if (mac_tx_percpu_cnt < 1) {
		/* Someone set mac_tx_percpu_cnt_max to 0 or less */
		mac_tx_percpu_cnt = 1;
	}

	ASSERT(mac_tx_percpu_cnt >= 1);
	mac_tx_percpu_cnt = (1 << highbit(mac_tx_percpu_cnt - 1));
	/*
	 * Make it of the form 2**N - 1 in the range
	 * [0 .. mac_tx_percpu_cnt_max - 1]
	 */
	mac_tx_percpu_cnt--;

	i_mac_impl_cachep = kmem_cache_create("mac_impl_cache",
	    sizeof (mac_impl_t), 0, i_mac_constructor, i_mac_destructor,
	    NULL, NULL, NULL, 0);
	ASSERT(i_mac_impl_cachep != NULL);

	mac_ring_cache = kmem_cache_create("mac_ring_cache",
	    sizeof (mac_ring_t), 0, i_mac_ring_ctor, i_mac_ring_dtor, NULL,
	    NULL, NULL, 0);
	ASSERT(mac_ring_cache != NULL);

	i_mac_impl_hash = mod_hash_create_extended("mac_impl_hash",
	    IMPL_HASHSZ, mod_hash_null_keydtor, mod_hash_null_valdtor,
	    mod_hash_bystr, NULL, mod_hash_strkey_cmp, KM_SLEEP);
	rw_init(&i_mac_impl_lock, NULL, RW_DEFAULT, NULL);

	mac_flow_init();
	mac_soft_ring_init();
	mac_bcast_init();
	mac_client_init();

	i_mac_impl_count = 0;

	i_mactype_hash = mod_hash_create_extended("mactype_hash",
	    MACTYPE_HASHSZ,
	    mod_hash_null_keydtor, mod_hash_null_valdtor,
	    mod_hash_bystr, NULL, mod_hash_strkey_cmp, KM_SLEEP);

	/*
	 * Allocate an id space to manage minor numbers. The range of the
	 * space will be from MAC_MAX_MINOR+1 to MAC_PRIVATE_MINOR-1.  This
	 * leaves half of the 32-bit minors available for driver private use.
	 */
	minor_ids = id_space_create("mac_minor_ids", MAC_MAX_MINOR+1,
	    MAC_PRIVATE_MINOR-1);
	ASSERT(minor_ids != NULL);
	minor_count = 0;

	/* Let's default to 20 seconds */
	mac_logging_interval = 20;
	mac_flow_log_enable = B_FALSE;
	mac_link_log_enable = B_FALSE;
	mac_logging_timer = 0;

	/* Register to be notified of noteworthy pools events */
	mac_pool_event_reg.pec_func = mac_pool_event_cb;
	mac_pool_event_reg.pec_arg = NULL;
	pool_event_cb_register(&mac_pool_event_reg);
}

int
mac_fini(void)
{
	if (i_mac_impl_count > 0 || minor_count > 0)
		return (EBUSY);

	pool_event_cb_unregister(&mac_pool_event_reg);

	id_space_destroy(minor_ids);
	mac_flow_fini();

	mod_hash_destroy_hash(i_mac_impl_hash);
	rw_destroy(&i_mac_impl_lock);

	mac_client_fini();
	kmem_cache_destroy(mac_ring_cache);

	mod_hash_destroy_hash(i_mactype_hash);
	mac_soft_ring_finish();

	return (0);
}

/*
 * Initialize a GLDv3 driver's device ops.  A driver that manages its own ops
 * (e.g. softmac) may pass in a NULL ops argument.
 */
void
mac_init_ops(struct dev_ops *ops, const char *name)
{
	major_t major = ddi_name_to_major((char *)name);

	/*
	 * By returning on error below, we are not letting the driver continue
	 * in an undefined context.  The mac_register() function will fail if
	 * DN_GLDV3_DRIVER isn't set.
	 */
	if (major == DDI_MAJOR_T_NONE)
		return;
	LOCK_DEV_OPS(&devnamesp[major].dn_lock);
	devnamesp[major].dn_flags |= (DN_GLDV3_DRIVER | DN_NETWORK_DRIVER);
	UNLOCK_DEV_OPS(&devnamesp[major].dn_lock);
	if (ops != NULL)
		dld_init_ops(ops, name);
}

void
mac_fini_ops(struct dev_ops *ops)
{
	dld_fini_ops(ops);
}

/*ARGSUSED*/
static int
i_mac_constructor(void *buf, void *arg, int kmflag)
{
	mac_impl_t	*mip = buf;

	bzero(buf, sizeof (mac_impl_t));

	mip->mi_linkstate = LINK_STATE_UNKNOWN;

	rw_init(&mip->mi_rw_lock, NULL, RW_DRIVER, NULL);
	mutex_init(&mip->mi_notify_lock, NULL, MUTEX_DRIVER, NULL);
	mutex_init(&mip->mi_promisc_lock, NULL, MUTEX_DRIVER, NULL);
	mutex_init(&mip->mi_ring_lock, NULL, MUTEX_DEFAULT, NULL);

	mip->mi_notify_cb_info.mcbi_lockp = &mip->mi_notify_lock;
	cv_init(&mip->mi_notify_cb_info.mcbi_cv, NULL, CV_DRIVER, NULL);
	mip->mi_promisc_cb_info.mcbi_lockp = &mip->mi_promisc_lock;
	cv_init(&mip->mi_promisc_cb_info.mcbi_cv, NULL, CV_DRIVER, NULL);

	mutex_init(&mip->mi_bridge_lock, NULL, MUTEX_DEFAULT, NULL);

	return (0);
}

/*ARGSUSED*/
static void
i_mac_destructor(void *buf, void *arg)
{
	mac_impl_t	*mip = buf;
	mac_cb_info_t	*mcbi;

	ASSERT(mip->mi_ref == 0);
	ASSERT(mip->mi_active == 0);
	ASSERT(mip->mi_linkstate == LINK_STATE_UNKNOWN);
	ASSERT(mip->mi_devpromisc == 0);
	ASSERT(mip->mi_ksp == NULL);
	ASSERT(mip->mi_kstat_count == 0);
	ASSERT(mip->mi_nclients == 0);
	ASSERT(mip->mi_nactiveclients == 0);
	ASSERT(mip->mi_single_active_client == NULL);
	ASSERT(mip->mi_state_flags == 0);
	ASSERT(mip->mi_factory_addr == NULL);
	ASSERT(mip->mi_factory_addr_num == 0);
	ASSERT(mip->mi_default_tx_ring == NULL);

	mcbi = &mip->mi_notify_cb_info;
	ASSERT(mcbi->mcbi_del_cnt == 0 && mcbi->mcbi_walker_cnt == 0);
	ASSERT(mip->mi_notify_bits == 0);
	ASSERT(mip->mi_notify_thread == NULL);
	ASSERT(mcbi->mcbi_lockp == &mip->mi_notify_lock);
	mcbi->mcbi_lockp = NULL;

	mcbi = &mip->mi_promisc_cb_info;
	ASSERT(mcbi->mcbi_del_cnt == 0 && mip->mi_promisc_list == NULL);
	ASSERT(mip->mi_promisc_list == NULL);
	ASSERT(mcbi->mcbi_lockp == &mip->mi_promisc_lock);
	mcbi->mcbi_lockp = NULL;

	ASSERT(mip->mi_bcast_ngrps == 0 && mip->mi_bcast_grp == NULL);
	ASSERT(mip->mi_perim_owner == NULL && mip->mi_perim_ocnt == 0);

	rw_destroy(&mip->mi_rw_lock);

	mutex_destroy(&mip->mi_promisc_lock);
	cv_destroy(&mip->mi_promisc_cb_info.mcbi_cv);
	mutex_destroy(&mip->mi_notify_lock);
	cv_destroy(&mip->mi_notify_cb_info.mcbi_cv);
	mutex_destroy(&mip->mi_ring_lock);

	ASSERT(mip->mi_bridge_link == NULL);
}

/* ARGSUSED */
static int
i_mac_ring_ctor(void *buf, void *arg, int kmflag)
{
	mac_ring_t *ring = (mac_ring_t *)buf;

	bzero(ring, sizeof (mac_ring_t));
	cv_init(&ring->mr_cv, NULL, CV_DEFAULT, NULL);
	mutex_init(&ring->mr_lock, NULL, MUTEX_DEFAULT, NULL);
	ring->mr_state = MR_FREE;
	return (0);
}

/* ARGSUSED */
static void
i_mac_ring_dtor(void *buf, void *arg)
{
	mac_ring_t *ring = (mac_ring_t *)buf;

	cv_destroy(&ring->mr_cv);
	mutex_destroy(&ring->mr_lock);
}

/*
 * Common functions to do mac callback addition and deletion. Currently this is
 * used by promisc callbacks and notify callbacks. List addition and deletion
 * need to take care of list walkers. List walkers in general can't hold list
 * locks and make upcall callbacks due to potential lock order and recursive
 * reentry issues. Instead list walkers increment the list walker count to mark
 * the presence of a walker thread. Addition can be carefully done to ensure
 * that the list walker always sees either the old list or the new list.
 * However the deletion can't be done while the walker is active, instead the
 * deleting thread simply marks the entry as logically deleted. The last walker
 * physically deletes and frees up the logically deleted entries when the walk
 * is complete.
 */
void
mac_callback_add(mac_cb_info_t *mcbi, mac_cb_t **mcb_head,
    mac_cb_t *mcb_elem)
{
	mac_cb_t	*p;
	mac_cb_t	**pp;

	/* Verify it is not already in the list */
	for (pp = mcb_head; (p = *pp) != NULL; pp = &p->mcb_nextp) {
		if (p == mcb_elem)
			break;
	}
	VERIFY(p == NULL);

	/*
	 * Add it to the head of the callback list. The membar ensures that
	 * the following list pointer manipulations reach global visibility
	 * in exactly the program order below.
	 */
	ASSERT(MUTEX_HELD(mcbi->mcbi_lockp));

	mcb_elem->mcb_nextp = *mcb_head;
	membar_producer();
	*mcb_head = mcb_elem;
}

/*
 * Mark the entry as logically deleted. If there aren't any walkers unlink
 * it from the list. In either case return the corresponding status.
 */
boolean_t
mac_callback_remove(mac_cb_info_t *mcbi, mac_cb_t **mcb_head,
    mac_cb_t *mcb_elem)
{
	mac_cb_t	*p;
	mac_cb_t	**pp;

	ASSERT(MUTEX_HELD(mcbi->mcbi_lockp));
	/*
	 * Search the callback list for the entry to be removed
	 */
	for (pp = mcb_head; (p = *pp) != NULL; pp = &p->mcb_nextp) {
		if (p == mcb_elem)
			break;
	}
	VERIFY(p != NULL);

	/*
	 * If there are walkers just mark it as deleted and the last walker
	 * will remove from the list and free it.
	 */
	if (mcbi->mcbi_walker_cnt != 0) {
		p->mcb_flags |= MCB_CONDEMNED;
		mcbi->mcbi_del_cnt++;
		return (B_FALSE);
	}

	ASSERT(mcbi->mcbi_del_cnt == 0);
	*pp = p->mcb_nextp;
	p->mcb_nextp = NULL;
	return (B_TRUE);
}

/*
 * Wait for all pending callback removals to be completed
 */
void
mac_callback_remove_wait(mac_cb_info_t *mcbi)
{
	ASSERT(MUTEX_HELD(mcbi->mcbi_lockp));
	while (mcbi->mcbi_del_cnt != 0) {
		DTRACE_PROBE1(need_wait, mac_cb_info_t *, mcbi);
		cv_wait(&mcbi->mcbi_cv, mcbi->mcbi_lockp);
	}
}

/*
 * The last mac callback walker does the cleanup. Walk the list and unlink
 * all the logically deleted entries and construct a temporary list of
 * removed entries. Return the list of removed entries to the caller.
 */
mac_cb_t *
mac_callback_walker_cleanup(mac_cb_info_t *mcbi, mac_cb_t **mcb_head)
{
	mac_cb_t	*p;
	mac_cb_t	**pp;
	mac_cb_t	*rmlist = NULL;		/* List of removed elements */
	int	cnt = 0;

	ASSERT(MUTEX_HELD(mcbi->mcbi_lockp));
	ASSERT(mcbi->mcbi_del_cnt != 0 && mcbi->mcbi_walker_cnt == 0);

	pp = mcb_head;
	while (*pp != NULL) {
		if ((*pp)->mcb_flags & MCB_CONDEMNED) {
			p = *pp;
			*pp = p->mcb_nextp;
			p->mcb_nextp = rmlist;
			rmlist = p;
			cnt++;
			continue;
		}
		pp = &(*pp)->mcb_nextp;
	}

	ASSERT(mcbi->mcbi_del_cnt == cnt);
	mcbi->mcbi_del_cnt = 0;
	return (rmlist);
}
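
/*
 * A hedged sketch of the walker-side protocol these helpers assume (the
 * exact bookkeeping varies per callback list; see the promisc and notify
 * dispatch code for the real thing):
 *
 *	mutex_enter(mcbi->mcbi_lockp);
 *	mcbi->mcbi_walker_cnt++;
 *	mutex_exit(mcbi->mcbi_lockp);
 *
 *	for (mcb = *mcb_head; mcb != NULL; mcb = mcb->mcb_nextp) {
 *		if (mcb->mcb_flags & MCB_CONDEMNED)
 *			continue;	skip logically deleted entries
 *		... make the upcall without holding the list lock ...
 *	}
 *
 *	mutex_enter(mcbi->mcbi_lockp);
 *	if (--mcbi->mcbi_walker_cnt == 0 && mcbi->mcbi_del_cnt != 0) {
 *		rmlist = mac_callback_walker_cleanup(mcbi, mcb_head);
 *		mac_callback_free(rmlist);
 *		cv_broadcast(&mcbi->mcbi_cv);	wake remove_wait'ers
 *	}
 *	mutex_exit(mcbi->mcbi_lockp);
 */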

boolean_t
mac_callback_lookup(mac_cb_t **mcb_headp, mac_cb_t *mcb_elem)
{
	mac_cb_t	*mcb;

	/* Search the list for the element */
	for (mcb = *mcb_headp; mcb != NULL; mcb = mcb->mcb_nextp) {
		if (mcb == mcb_elem)
			return (B_TRUE);
	}

	return (B_FALSE);
}

boolean_t
mac_callback_find(mac_cb_info_t *mcbi, mac_cb_t **mcb_headp, mac_cb_t *mcb_elem)
{
	boolean_t	found;

	mutex_enter(mcbi->mcbi_lockp);
	found = mac_callback_lookup(mcb_headp, mcb_elem);
	mutex_exit(mcbi->mcbi_lockp);

	return (found);
}

/* Free the list of removed callbacks */
void
mac_callback_free(mac_cb_t *rmlist)
{
	mac_cb_t	*mcb;
	mac_cb_t	*mcb_next;

	for (mcb = rmlist; mcb != NULL; mcb = mcb_next) {
		mcb_next = mcb->mcb_nextp;
		kmem_free(mcb->mcb_objp, mcb->mcb_objsize);
	}
}

/*
 * The promisc callbacks are in 2 lists, one off the 'mip' and another off the
 * 'mcip' threaded by mpi_mi_link and mpi_mci_link respectively. However there
 * is only a single shared total walker count, and an entry can't be physically
 * unlinked if a walker is active on either list. The last walker does this
 * cleanup of logically deleted entries.
 */
void
i_mac_promisc_walker_cleanup(mac_impl_t *mip)
{
	mac_cb_t	*rmlist;
	mac_cb_t	*mcb;
	mac_cb_t	*mcb_next;
	mac_promisc_impl_t	*mpip;

	/*
	 * Construct a temporary list of deleted callbacks by walking the
	 * mi_promisc_list. Then for each entry in the temporary list,
	 * remove it from the mci_promisc_list and free the entry.
	 */
	rmlist = mac_callback_walker_cleanup(&mip->mi_promisc_cb_info,
	    &mip->mi_promisc_list);

	for (mcb = rmlist; mcb != NULL; mcb = mcb_next) {
		mcb_next = mcb->mcb_nextp;
		mpip = (mac_promisc_impl_t *)mcb->mcb_objp;
		VERIFY(mac_callback_remove(&mip->mi_promisc_cb_info,
		    &mpip->mpi_mcip->mci_promisc_list, &mpip->mpi_mci_link));
		mcb->mcb_flags = 0;
		mcb->mcb_nextp = NULL;
		kmem_cache_free(mac_promisc_impl_cache, mpip);
	}
}

void
i_mac_notify(mac_impl_t *mip, mac_notify_type_t type)
{
	mac_cb_info_t	*mcbi;

	/*
	 * Signal the notify thread even after mi_ref has become zero and
	 * mi_disabled is set. The synchronization with the notify thread
	 * happens in mac_unregister and that implies the driver must make
	 * sure it is single-threaded (with respect to mac calls) and that
	 * all pending mac calls have returned before it calls mac_unregister.
	 */
	rw_enter(&i_mac_impl_lock, RW_READER);
	if (mip->mi_state_flags & MIS_DISABLED)
		goto exit;

	/*
	 * Guard against incorrect notifications.  (Running a newer
	 * mac client against an older implementation?)
	 */
	if (type >= MAC_NNOTE)
		goto exit;

	mcbi = &mip->mi_notify_cb_info;
	mutex_enter(mcbi->mcbi_lockp);
	mip->mi_notify_bits |= (1 << type);
	cv_broadcast(&mcbi->mcbi_cv);
	mutex_exit(mcbi->mcbi_lockp);

exit:
	rw_exit(&i_mac_impl_lock);
}

/*
 * Mac serialization primitives. Please see the block comment at the
 * top of the file.
 */
void
i_mac_perim_enter(mac_impl_t *mip)
{
	mac_client_impl_t	*mcip;

	if (mip->mi_state_flags & MIS_IS_VNIC) {
		/*
		 * This is a VNIC. Use the lower mac since that is what
		 * we want to serialize on.
		 */
		mcip = mac_vnic_lower(mip);
		mip = mcip->mci_mip;
	}

	mutex_enter(&mip->mi_perim_lock);
	if (mip->mi_perim_owner == curthread) {
		mip->mi_perim_ocnt++;
		mutex_exit(&mip->mi_perim_lock);
		return;
	}

	while (mip->mi_perim_owner != NULL)
		cv_wait(&mip->mi_perim_cv, &mip->mi_perim_lock);

	mip->mi_perim_owner = curthread;
	ASSERT(mip->mi_perim_ocnt == 0);
	mip->mi_perim_ocnt++;
#ifdef DEBUG
	mip->mi_perim_stack_depth = getpcstack(mip->mi_perim_stack,
	    MAC_PERIM_STACK_DEPTH);
#endif
	mutex_exit(&mip->mi_perim_lock);
}

int
i_mac_perim_enter_nowait(mac_impl_t *mip)
{
	/*
	 * The vnic is a special case, since the serialization is done based
	 * on the lower mac. If the lower mac is busy, it does not imply the
	 * vnic can't be unregistered. But in the case of other drivers,
	 * a busy perimeter or open mac handles implies that the mac is busy
	 * and can't be unregistered.
	 */
	if (mip->mi_state_flags & MIS_IS_VNIC) {
		i_mac_perim_enter(mip);
		return (0);
	}

	mutex_enter(&mip->mi_perim_lock);
	if (mip->mi_perim_owner != NULL) {
		mutex_exit(&mip->mi_perim_lock);
		return (EBUSY);
	}
	ASSERT(mip->mi_perim_ocnt == 0);
	mip->mi_perim_owner = curthread;
	mip->mi_perim_ocnt++;
	mutex_exit(&mip->mi_perim_lock);

	return (0);
}

void
i_mac_perim_exit(mac_impl_t *mip)
{
	mac_client_impl_t *mcip;

	if (mip->mi_state_flags & MIS_IS_VNIC) {
		/*
		 * This is a VNIC. Use the lower mac since that is what
		 * we want to serialize on.
		 */
		mcip = mac_vnic_lower(mip);
		mip = mcip->mci_mip;
	}

	ASSERT(mip->mi_perim_owner == curthread && mip->mi_perim_ocnt != 0);

	mutex_enter(&mip->mi_perim_lock);
	if (--mip->mi_perim_ocnt == 0) {
		mip->mi_perim_owner = NULL;
		cv_signal(&mip->mi_perim_cv);
	}
	mutex_exit(&mip->mi_perim_lock);
}

/*
 * Returns whether the current thread holds the mac perimeter. Used in making
 * assertions.
 */
boolean_t
mac_perim_held(mac_handle_t mh)
{
	mac_impl_t	*mip = (mac_impl_t *)mh;
	mac_client_impl_t *mcip;

	if (mip->mi_state_flags & MIS_IS_VNIC) {
		/*
		 * This is a VNIC. Use the lower mac since that is what
		 * we want to serialize on.
		 */
		mcip = mac_vnic_lower(mip);
		mip = mcip->mci_mip;
	}
	return (mip->mi_perim_owner == curthread);
}

/*
 * mac client interfaces to enter the mac perimeter of a mac end point, given
 * its mac handle, or macname or linkid.
 */
void
mac_perim_enter_by_mh(mac_handle_t mh, mac_perim_handle_t *mphp)
{
	mac_impl_t	*mip = (mac_impl_t *)mh;

	i_mac_perim_enter(mip);
	/*
	 * The mac_perim_handle_t returned encodes the 'mip' and whether a
	 * mac_open has been done internally while entering the perimeter.
	 * This information is used in mac_perim_exit.
	 */
	MAC_ENCODE_MPH(*mphp, mip, 0);
}

int
mac_perim_enter_by_macname(const char *name, mac_perim_handle_t *mphp)
{
	int	err;
	mac_handle_t	mh;

	if ((err = mac_open(name, &mh)) != 0)
		return (err);

	mac_perim_enter_by_mh(mh, mphp);
	MAC_ENCODE_MPH(*mphp, mh, 1);
	return (0);
}

int
mac_perim_enter_by_linkid(datalink_id_t linkid, mac_perim_handle_t *mphp)
{
	int	err;
	mac_handle_t	mh;

	if ((err = mac_open_by_linkid(linkid, &mh)) != 0)
		return (err);

	mac_perim_enter_by_mh(mh, mphp);
	MAC_ENCODE_MPH(*mphp, mh, 1);
	return (0);
}

void
mac_perim_exit(mac_perim_handle_t mph)
{
	mac_impl_t	*mip;
	boolean_t	need_close;

	MAC_DECODE_MPH(mph, mip, need_close);
	i_mac_perim_exit(mip);
	if (need_close)
		mac_close((mac_handle_t)mip);
}
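
/*
 * A minimal usage sketch for the client-side perimeter interfaces above,
 * assuming a caller that already knows the datalink_id_t of the end point:
 *
 *	mac_perim_handle_t mph;
 *	int err;
 *
 *	if ((err = mac_perim_enter_by_linkid(linkid, &mph)) != 0)
 *		return (err);
 *	... a block of mac control calls, atomic with respect to other
 *	    set operations on this mac end point ...
 *	mac_perim_exit(mph);
 */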

int
mac_hold(const char *macname, mac_impl_t **pmip)
{
	mac_impl_t	*mip;
	int		err;

	/*
	 * Check the device name length to make sure it won't overflow our
	 * buffer.
	 */
	if (strlen(macname) >= MAXNAMELEN)
		return (EINVAL);

	/*
	 * Look up its entry in the global hash table.
	 */
	rw_enter(&i_mac_impl_lock, RW_WRITER);
	err = mod_hash_find(i_mac_impl_hash, (mod_hash_key_t)macname,
	    (mod_hash_val_t *)&mip);

	if (err != 0) {
		rw_exit(&i_mac_impl_lock);
		return (ENOENT);
	}

	if (mip->mi_state_flags & MIS_DISABLED) {
		rw_exit(&i_mac_impl_lock);
		return (ENOENT);
	}

	if (mip->mi_state_flags & MIS_EXCLUSIVE_HELD) {
		rw_exit(&i_mac_impl_lock);
		return (EBUSY);
	}

	mip->mi_ref++;
	rw_exit(&i_mac_impl_lock);

	*pmip = mip;
	return (0);
}

void
mac_rele(mac_impl_t *mip)
{
	rw_enter(&i_mac_impl_lock, RW_WRITER);
	ASSERT(mip->mi_ref != 0);
	if (--mip->mi_ref == 0) {
		ASSERT(mip->mi_nactiveclients == 0 &&
		    !(mip->mi_state_flags & MIS_EXCLUSIVE));
	}
	rw_exit(&i_mac_impl_lock);
}

/*
 * Private GLDv3 function to start a MAC instance.
 */
int
mac_start(mac_handle_t mh)
{
	mac_impl_t	*mip = (mac_impl_t *)mh;
	int		err = 0;
	mac_group_t	*defgrp;

	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
	ASSERT(mip->mi_start != NULL);

	/*
	 * Check whether the device is already started.
	 */
	if (mip->mi_active++ == 0) {
		mac_ring_t *ring = NULL;

		/*
		 * Start the device.
		 */
		err = mip->mi_start(mip->mi_driver);
		if (err != 0) {
			mip->mi_active--;
			return (err);
		}

		/*
		 * Start the default tx ring.
		 */
		if (mip->mi_default_tx_ring != NULL) {

			ring = (mac_ring_t *)mip->mi_default_tx_ring;
			if (ring->mr_state != MR_INUSE) {
				err = mac_start_ring(ring);
				if (err != 0) {
					mip->mi_active--;
					return (err);
				}
			}
		}

		if ((defgrp = MAC_DEFAULT_RX_GROUP(mip)) != NULL) {
			/*
			 * Start the default ring, since it will be needed
			 * to receive broadcast and multicast traffic for
			 * both primary and non-primary MAC clients.
			 */
			ASSERT(defgrp->mrg_state == MAC_GROUP_STATE_REGISTERED);
			err = mac_start_group_and_rings(defgrp);
			if (err != 0) {
				mip->mi_active--;
				if ((ring != NULL) &&
				    (ring->mr_state == MR_INUSE))
					mac_stop_ring(ring);
				return (err);
			}
			mac_set_group_state(defgrp, MAC_GROUP_STATE_SHARED);
		}
	}

	return (err);
}

/*
 * Private GLDv3 function to stop a MAC instance.
 */
void
mac_stop(mac_handle_t mh)
{
	mac_impl_t	*mip = (mac_impl_t *)mh;
	mac_group_t	*grp;

	ASSERT(mip->mi_stop != NULL);
	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));

	/*
	 * Check whether the device is still needed.
	 */
	ASSERT(mip->mi_active != 0);
	if (--mip->mi_active == 0) {
		if ((grp = MAC_DEFAULT_RX_GROUP(mip)) != NULL) {
			/*
			 * There should be no more active clients since the
			 * MAC is being stopped. Stop the default RX group
			 * and transition it back to registered state.
			 *
			 * When clients are torn down, the groups
			 * are released via mac_release_rx_group which
			 * knows that the default group is always in
			 * started mode since broadcast uses it. So
			 * we can assert that there are no clients
			 * (since mac_bcast_add doesn't register itself
			 * as a client) and the group is in SHARED state.
			 */
			ASSERT(grp->mrg_state == MAC_GROUP_STATE_SHARED);
			ASSERT(MAC_GROUP_NO_CLIENT(grp) &&
			    mip->mi_nactiveclients == 0);
			mac_stop_group_and_rings(grp);
			mac_set_group_state(grp, MAC_GROUP_STATE_REGISTERED);
		}

		if (mip->mi_default_tx_ring != NULL) {
			mac_ring_t *ring;

			ring = (mac_ring_t *)mip->mi_default_tx_ring;
			if (ring->mr_state == MR_INUSE) {
				mac_stop_ring(ring);
				ring->mr_flag = 0;
			}
		}

		/*
		 * Stop the device.
		 */
		mip->mi_stop(mip->mi_driver);
	}
}

int
i_mac_promisc_set(mac_impl_t *mip, boolean_t on)
{
	int		err = 0;

	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
	ASSERT(mip->mi_setpromisc != NULL);

	if (on) {
		/*
		 * Enable promiscuous mode on the device if not yet enabled.
		 */
		if (mip->mi_devpromisc++ == 0) {
			err = mip->mi_setpromisc(mip->mi_driver, B_TRUE);
			if (err != 0) {
				mip->mi_devpromisc--;
				return (err);
			}
			i_mac_notify(mip, MAC_NOTE_DEVPROMISC);
		}
	} else {
		if (mip->mi_devpromisc == 0)
			return (EPROTO);

		/*
		 * Disable promiscuous mode on the device if this is the last
		 * enabling.
		 */
		if (--mip->mi_devpromisc == 0) {
			err = mip->mi_setpromisc(mip->mi_driver, B_FALSE);
			if (err != 0) {
				mip->mi_devpromisc++;
				return (err);
			}
			i_mac_notify(mip, MAC_NOTE_DEVPROMISC);
		}
	}

	return (0);
}

/*
 * The promiscuity state can change at any time. If the caller needs to take
 * actions that are atomic with the promiscuity state, then the caller needs
 * to bracket the entire sequence with mac_perim_enter/exit.
 */
boolean_t
mac_promisc_get(mac_handle_t mh)
{
	mac_impl_t		*mip = (mac_impl_t *)mh;

	/*
	 * Return the current promiscuity.
	 */
	return (mip->mi_devpromisc != 0);
}

/*
 * Invoked at MAC instance attach time to initialize the list
 * of factory MAC addresses supported by a MAC instance. This function
 * builds a local cache in the mac_impl_t for the MAC addresses
 * supported by the underlying hardware. The MAC clients themselves
 * use the mac_addr_factory*() functions to query and reserve
 * factory MAC addresses.
 */
void
mac_addr_factory_init(mac_impl_t *mip)
{
	mac_capab_multifactaddr_t capab;
	uint8_t *addr;
	int i;

	/*
	 * First round to see how many factory MAC addresses are available.
	 */
	bzero(&capab, sizeof (capab));
	if (!i_mac_capab_get((mac_handle_t)mip, MAC_CAPAB_MULTIFACTADDR,
	    &capab) || (capab.mcm_naddr == 0)) {
		/*
		 * The MAC instance doesn't support multiple factory
		 * MAC addresses, we're done here.
		 */
		return;
	}

	/*
	 * Allocate the space and get all the factory addresses.
	 */
	addr = kmem_alloc(capab.mcm_naddr * MAXMACADDRLEN, KM_SLEEP);
	capab.mcm_getaddr(mip->mi_driver, capab.mcm_naddr, addr);

	mip->mi_factory_addr_num = capab.mcm_naddr;
	mip->mi_factory_addr = kmem_zalloc(mip->mi_factory_addr_num *
	    sizeof (mac_factory_addr_t), KM_SLEEP);

	for (i = 0; i < capab.mcm_naddr; i++) {
		bcopy(addr + i * MAXMACADDRLEN,
		    mip->mi_factory_addr[i].mfa_addr,
		    mip->mi_type->mt_addr_length);
		mip->mi_factory_addr[i].mfa_in_use = B_FALSE;
	}

	kmem_free(addr, capab.mcm_naddr * MAXMACADDRLEN);
}

void
mac_addr_factory_fini(mac_impl_t *mip)
{
	if (mip->mi_factory_addr == NULL) {
		ASSERT(mip->mi_factory_addr_num == 0);
		return;
	}

	kmem_free(mip->mi_factory_addr, mip->mi_factory_addr_num *
	    sizeof (mac_factory_addr_t));

	mip->mi_factory_addr = NULL;
	mip->mi_factory_addr_num = 0;
}

/*
 * Reserve a factory MAC address. If *slot is set to -1, the function
 * attempts to reserve any of the available factory MAC addresses and
 * returns the reserved slot id. If no slots are available, the function
 * returns ENOSPC. If *slot is not set to -1, the function reserves
 * the specified slot if it is available, or returns EBUSY if the slot
 * is already used. Returns ENOTSUP if the underlying MAC does not
 * support multiple factory addresses. If the slot number is not -1 but
 * is invalid, returns EINVAL.
 */
int
mac_addr_factory_reserve(mac_client_handle_t mch, int *slot)
{
	mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
	mac_impl_t *mip = mcip->mci_mip;
	int i, ret = 0;

	i_mac_perim_enter(mip);
	/*
	 * Protect against concurrent readers that may need a self-consistent
	 * view of the factory addresses
	 */
	rw_enter(&mip->mi_rw_lock, RW_WRITER);

	if (mip->mi_factory_addr_num == 0) {
		ret = ENOTSUP;
		goto bail;
	}

	if (*slot != -1) {
		/* check the specified slot */
		if (*slot < 1 || *slot > mip->mi_factory_addr_num) {
			ret = EINVAL;
			goto bail;
		}
		if (mip->mi_factory_addr[*slot-1].mfa_in_use) {
			ret = EBUSY;
			goto bail;
		}
	} else {
		/* pick the next available slot */
		for (i = 0; i < mip->mi_factory_addr_num; i++) {
			if (!mip->mi_factory_addr[i].mfa_in_use)
				break;
		}

		if (i == mip->mi_factory_addr_num) {
			ret = ENOSPC;
			goto bail;
		}
		*slot = i+1;
	}

	mip->mi_factory_addr[*slot-1].mfa_in_use = B_TRUE;
	mip->mi_factory_addr[*slot-1].mfa_client = mcip;

bail:
	rw_exit(&mip->mi_rw_lock);
	i_mac_perim_exit(mip);
	return (ret);
}
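
/*
 * A hedged sketch of the slot reservation protocol from a client's point
 * of view (the client handle 'mch' is assumed to have been obtained via
 * mac_client_open()):
 *
 *	int slot = -1;			pick any available slot
 *
 *	if (mac_addr_factory_reserve(mch, &slot) == 0) {
 *		... slot now holds the 1-based id of the reserved
 *		    factory address; use it, then release it ...
 *		mac_addr_factory_release(mch, slot);
 *	}
 */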

/*
 * Release the specified factory MAC address slot.
 */
void
mac_addr_factory_release(mac_client_handle_t mch, uint_t slot)
{
	mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
	mac_impl_t *mip = mcip->mci_mip;

	i_mac_perim_enter(mip);
	/*
	 * Protect against concurrent readers that may need a self-consistent
	 * view of the factory addresses
	 */
	rw_enter(&mip->mi_rw_lock, RW_WRITER);

	ASSERT(slot > 0 && slot <= mip->mi_factory_addr_num);
	ASSERT(mip->mi_factory_addr[slot-1].mfa_in_use);

	mip->mi_factory_addr[slot-1].mfa_in_use = B_FALSE;

	rw_exit(&mip->mi_rw_lock);
	i_mac_perim_exit(mip);
}

/*
 * Stores in mac_addr the value of the specified MAC address. The slot
 * number must be valid for the MAC (this is asserted, not returned as an
 * error). If the address is in use and client_name is non-NULL, the name
 * of the owning client is copied out; the caller must provide a buffer of
 * at least MAXNAMELEN bytes for it.
 */
void
mac_addr_factory_value(mac_handle_t mh, int slot, uchar_t *mac_addr,
    uint_t *addr_len, char *client_name, boolean_t *in_use_arg)
{
	mac_impl_t *mip = (mac_impl_t *)mh;
	boolean_t in_use;

	ASSERT(slot > 0 && slot <= mip->mi_factory_addr_num);

	/*
	 * Readers need to hold mi_rw_lock. Writers need to hold the mac
	 * perimeter and mi_rw_lock.
	 */
	rw_enter(&mip->mi_rw_lock, RW_READER);
	bcopy(mip->mi_factory_addr[slot-1].mfa_addr, mac_addr, MAXMACADDRLEN);
	*addr_len = mip->mi_type->mt_addr_length;
	in_use = mip->mi_factory_addr[slot-1].mfa_in_use;
	if (in_use && client_name != NULL) {
		bcopy(mip->mi_factory_addr[slot-1].mfa_client->mci_name,
		    client_name, MAXNAMELEN);
	}
	if (in_use_arg != NULL)
		*in_use_arg = in_use;
	rw_exit(&mip->mi_rw_lock);
}

/*
 * Returns the number of factory MAC addresses (in addition to the
 * primary MAC address), or 0 if the underlying MAC doesn't support
 * that feature.
 */
uint_t
mac_addr_factory_num(mac_handle_t mh)
{
	mac_impl_t *mip = (mac_impl_t *)mh;

	return (mip->mi_factory_addr_num);
}

void
mac_rx_group_unmark(mac_group_t *grp, uint_t flag)
{
	mac_ring_t	*ring;

	for (ring = grp->mrg_rings; ring != NULL; ring = ring->mr_next)
		ring->mr_flag &= ~flag;
}

/*
 * The following mac_hwrings_xxx() functions are private mac client functions
 * used by the aggr driver to access and control the underlying HW Rx group
 * and rings. In this case, the aggr driver has exclusive control of the
 * underlying HW Rx group/rings. It calls the following functions to
 * start/stop the HW Rx rings, disable/enable polling, add/remove MAC
 * addresses, or set up the Rx callback.
 */
/* ARGSUSED */
static void
mac_hwrings_rx_process(void *arg, mac_resource_handle_t srs,
    mblk_t *mp_chain, boolean_t loopback)
{
	mac_soft_ring_set_t	*mac_srs = (mac_soft_ring_set_t *)srs;
	mac_srs_rx_t		*srs_rx = &mac_srs->srs_rx;
	mac_direct_rx_t		proc;
	void			*arg1;
	mac_resource_handle_t	arg2;

	proc = srs_rx->sr_func;
	arg1 = srs_rx->sr_arg1;
	arg2 = mac_srs->srs_mrh;

	proc(arg1, arg2, mp_chain, NULL);
}

/*
 * This function is called to get the list of HW rings that are reserved by
 * an exclusive mac client.
 *
 * Return value: the number of HW rings.
 */
int
mac_hwrings_get(mac_client_handle_t mch, mac_group_handle_t *hwgh,
    mac_ring_handle_t *hwrh, mac_ring_type_t rtype)
{
	mac_client_impl_t	*mcip = (mac_client_impl_t *)mch;
	flow_entry_t		*flent = mcip->mci_flent;
	mac_group_t		*grp;
	mac_ring_t		*ring;
	int			cnt = 0;

	if (rtype == MAC_RING_TYPE_RX) {
		grp = flent->fe_rx_ring_group;
	} else if (rtype == MAC_RING_TYPE_TX) {
		grp = flent->fe_tx_ring_group;
	} else {
		ASSERT(B_FALSE);
		return (-1);
	}
	/*
	 * The mac client did not reserve any RX group, return directly.
	 * This is probably because the underlying MAC does not support
	 * any groups.
	 */
	if (hwgh != NULL)
		*hwgh = NULL;
	if (grp == NULL)
		return (0);
	/*
	 * This group must be reserved by this mac client.
	 */
	ASSERT((grp->mrg_state == MAC_GROUP_STATE_RESERVED) &&
	    (mcip == MAC_GROUP_ONLY_CLIENT(grp)));

	for (ring = grp->mrg_rings; ring != NULL; ring = ring->mr_next, cnt++) {
		ASSERT(cnt < MAX_RINGS_PER_GROUP);
		hwrh[cnt] = (mac_ring_handle_t)ring;
	}
	if (hwgh != NULL)
		*hwgh = (mac_group_handle_t)grp;

	return (cnt);
}
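
/*
 * As a hedged usage sketch, an exclusive client such as aggr might collect
 * and start all of its reserved Rx rings roughly as follows:
 *
 *	mac_ring_handle_t	hwrh[MAX_RINGS_PER_GROUP];
 *	mac_group_handle_t	hwgh;
 *	int			cnt, i;
 *
 *	cnt = mac_hwrings_get(mch, &hwgh, hwrh, MAC_RING_TYPE_RX);
 *	for (i = 0; i < cnt; i++)
 *		(void) mac_hwring_start(hwrh[i]);
 */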

/*
 * This function is called to get info about Tx/Rx rings.
 *
 * Return value: returns uint_t which will have various bits set
 * that indicate different properties of the ring.
 */
uint_t
mac_hwring_getinfo(mac_ring_handle_t rh)
{
	mac_ring_t *ring = (mac_ring_t *)rh;
	mac_ring_info_t *info = &ring->mr_info;

	return (info->mri_flags);
}

/*
 * Export ddi interrupt handles from the HW ring to the pseudo ring and
 * set up the RX callback of the mac client which exclusively controls
 * the HW ring.
 */
void
mac_hwring_setup(mac_ring_handle_t hwrh, mac_resource_handle_t prh,
    mac_ring_handle_t pseudo_rh)
{
	mac_ring_t		*hw_ring = (mac_ring_t *)hwrh;
	mac_ring_t		*pseudo_ring;
	mac_soft_ring_set_t	*mac_srs = hw_ring->mr_srs;

	if (pseudo_rh != NULL) {
		pseudo_ring = (mac_ring_t *)pseudo_rh;
		/* Export the ddi handles to pseudo ring */
		pseudo_ring->mr_info.mri_intr.mi_ddi_handle =
		    hw_ring->mr_info.mri_intr.mi_ddi_handle;
		pseudo_ring->mr_info.mri_intr.mi_ddi_shared =
		    hw_ring->mr_info.mri_intr.mi_ddi_shared;
		/*
		 * Save a pointer to pseudo ring in the hw ring. If
		 * interrupt handle changes, the hw ring will be
		 * notified of the change (see mac_ring_intr_set())
		 * and the appropriate change has to be made to
		 * the pseudo ring that has exported the ddi handle.
		 */
		hw_ring->mr_prh = pseudo_rh;
	}

	if (hw_ring->mr_type == MAC_RING_TYPE_RX) {
		ASSERT(!(mac_srs->srs_type & SRST_TX));
		mac_srs->srs_mrh = prh;
		mac_srs->srs_rx.sr_lower_proc = mac_hwrings_rx_process;
	}
}

void
mac_hwring_teardown(mac_ring_handle_t hwrh)
{
	mac_ring_t		*hw_ring = (mac_ring_t *)hwrh;
	mac_soft_ring_set_t	*mac_srs;

	if (hw_ring == NULL)
		return;
	hw_ring->mr_prh = NULL;
	if (hw_ring->mr_type == MAC_RING_TYPE_RX) {
		mac_srs = hw_ring->mr_srs;
		ASSERT(!(mac_srs->srs_type & SRST_TX));
		mac_srs->srs_rx.sr_lower_proc = mac_rx_srs_process;
		mac_srs->srs_mrh = NULL;
	}
}

int
mac_hwring_disable_intr(mac_ring_handle_t rh)
{
	mac_ring_t *rr_ring = (mac_ring_t *)rh;
	mac_intr_t *intr = &rr_ring->mr_info.mri_intr;

	return (intr->mi_disable(intr->mi_handle));
}

int
mac_hwring_enable_intr(mac_ring_handle_t rh)
{
	mac_ring_t *rr_ring = (mac_ring_t *)rh;
	mac_intr_t *intr = &rr_ring->mr_info.mri_intr;

	return (intr->mi_enable(intr->mi_handle));
}

int
mac_hwring_start(mac_ring_handle_t rh)
{
	mac_ring_t *rr_ring = (mac_ring_t *)rh;

	MAC_RING_UNMARK(rr_ring, MR_QUIESCE);
	return (0);
}

void
mac_hwring_stop(mac_ring_handle_t rh)
{
	mac_ring_t *rr_ring = (mac_ring_t *)rh;

	mac_rx_ring_quiesce(rr_ring, MR_QUIESCE);
}

mblk_t *
mac_hwring_poll(mac_ring_handle_t rh, int bytes_to_pickup)
{
	mac_ring_t *rr_ring = (mac_ring_t *)rh;
	mac_ring_info_t *info = &rr_ring->mr_info;

	return (info->mri_poll(info->mri_driver, bytes_to_pickup));
}
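
/*
 * A hedged sketch of how an exclusive client could switch a ring into poll
 * mode using the interfaces above (POLL_BYTES is a made-up byte budget):
 *
 *	mblk_t *mp;
 *
 *	if (mac_hwring_disable_intr(rh) == 0) {
 *		while ((mp = mac_hwring_poll(rh, POLL_BYTES)) != NULL)
 *			... process the chain ...
 *		(void) mac_hwring_enable_intr(rh);
 *	}
 */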

/*
 * Send packets through a selected tx ring.
 */
mblk_t *
mac_hwring_tx(mac_ring_handle_t rh, mblk_t *mp)
{
	mac_ring_t *ring = (mac_ring_t *)rh;
	mac_ring_info_t *info = &ring->mr_info;

	ASSERT(ring->mr_type == MAC_RING_TYPE_TX &&
	    ring->mr_state >= MR_INUSE);
	return (info->mri_tx(info->mri_driver, mp));
}

/*
 * Query stats for a particular rx/tx ring
 */
int
mac_hwring_getstat(mac_ring_handle_t rh, uint_t stat, uint64_t *val)
{
	mac_ring_t	*ring = (mac_ring_t *)rh;
	mac_ring_info_t *info = &ring->mr_info;

	return (info->mri_stat(info->mri_driver, stat, val));
}

/*
 * Private function that is only used by aggr to send packets through
 * a port/Tx ring. Since aggr exposes a pseudo Tx ring even for ports
 * that do not expose Tx rings, the aggr_ring_tx() entry point needs
 * access to mac_impl_t to send packets through the m_tx() entry point.
 * It accomplishes this by calling mac_hwring_send_priv().
 */
mblk_t *
mac_hwring_send_priv(mac_client_handle_t mch, mac_ring_handle_t rh, mblk_t *mp)
{
	mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
	mac_impl_t *mip = mcip->mci_mip;

	MAC_TX(mip, rh, mp, mcip);
	return (mp);
}

int
mac_hwgroup_addmac(mac_group_handle_t gh, const uint8_t *addr)
{
	mac_group_t *group = (mac_group_t *)gh;

	return (mac_group_addmac(group, addr));
}

int
mac_hwgroup_remmac(mac_group_handle_t gh, const uint8_t *addr)
{
	mac_group_t *group = (mac_group_t *)gh;

	return (mac_group_remmac(group, addr));
}

/*
 * Set the RX group to be shared/reserved. Note that the group must be
 * started/stopped outside of this function.
 */
void
mac_set_group_state(mac_group_t *grp, mac_group_state_t state)
{
	/*
	 * If there is no change in the group state, just return.
	 */
	if (grp->mrg_state == state)
		return;

	switch (state) {
	case MAC_GROUP_STATE_RESERVED:
		/*
		 * Successfully reserved the group.
		 *
		 * Given that there is an exclusive client controlling this
		 * group, we enable the group level polling when available,
		 * so that SRSs get to turn on/off individual rings they're
		 * assigned to.
		 */
		ASSERT(MAC_PERIM_HELD(grp->mrg_mh));

		if (grp->mrg_type == MAC_RING_TYPE_RX &&
		    GROUP_INTR_DISABLE_FUNC(grp) != NULL) {
			GROUP_INTR_DISABLE_FUNC(grp)(GROUP_INTR_HANDLE(grp));
		}
		break;

	case MAC_GROUP_STATE_SHARED:
		/*
		 * Set all rings of this group to software classified.
		 * If the group has an overriding interrupt, then re-enable it.
		 */
		ASSERT(MAC_PERIM_HELD(grp->mrg_mh));

		if (grp->mrg_type == MAC_RING_TYPE_RX &&
		    GROUP_INTR_ENABLE_FUNC(grp) != NULL) {
			GROUP_INTR_ENABLE_FUNC(grp)(GROUP_INTR_HANDLE(grp));
		}
		/* The ring is not available for reservations any more */
		break;

	case MAC_GROUP_STATE_REGISTERED:
		/* Also callable from mac_register, perim is not held */
		break;

	default:
		ASSERT(B_FALSE);
		break;
	}

	grp->mrg_state = state;
}
1751
1752/*
1753 * Quiesce future hardware classified packets for the specified Rx ring
1754 */
1755static void
1756mac_rx_ring_quiesce(mac_ring_t *rx_ring, uint_t ring_flag)
1757{
1758	ASSERT(rx_ring->mr_classify_type == MAC_HW_CLASSIFIER);
1759	ASSERT(ring_flag == MR_CONDEMNED || ring_flag  == MR_QUIESCE);
1760
1761	mutex_enter(&rx_ring->mr_lock);
1762	rx_ring->mr_flag |= ring_flag;
1763	while (rx_ring->mr_refcnt != 0)
1764		cv_wait(&rx_ring->mr_cv, &rx_ring->mr_lock);
1765	mutex_exit(&rx_ring->mr_lock);
1766}
1767
1768/*
1769 * Please see mac_tx for details about the per cpu locking scheme
1770 */
1771static void
1772mac_tx_lock_all(mac_client_impl_t *mcip)
1773{
1774	int	i;
1775
1776	for (i = 0; i <= mac_tx_percpu_cnt; i++)
1777		mutex_enter(&mcip->mci_tx_pcpu[i].pcpu_tx_lock);
1778}
1779
1780static void
1781mac_tx_unlock_all(mac_client_impl_t *mcip)
1782{
1783	int	i;
1784
1785	for (i = mac_tx_percpu_cnt; i >= 0; i--)
1786		mutex_exit(&mcip->mci_tx_pcpu[i].pcpu_tx_lock);
1787}
1788
1789static void
1790mac_tx_unlock_allbutzero(mac_client_impl_t *mcip)
1791{
1792	int	i;
1793
1794	for (i = mac_tx_percpu_cnt; i > 0; i--)
1795		mutex_exit(&mcip->mci_tx_pcpu[i].pcpu_tx_lock);
1796}
1797
1798static int
1799mac_tx_sum_refcnt(mac_client_impl_t *mcip)
1800{
1801	int	i;
1802	int	refcnt = 0;
1803
1804	for (i = 0; i <= mac_tx_percpu_cnt; i++)
1805		refcnt += mcip->mci_tx_pcpu[i].pcpu_tx_refcnt;
1806
1807	return (refcnt);
1808}
1809
1810/*
1811 * Stop future Tx packets coming down from the client in preparation for
1812 * quiescing the Tx side. This is needed for dynamic reclaim and reassignment
1813 * of rings between clients
1814 */
1815void
1816mac_tx_client_block(mac_client_impl_t *mcip)
1817{
1818	mac_tx_lock_all(mcip);
1819	mcip->mci_tx_flag |= MCI_TX_QUIESCE;
1820	while (mac_tx_sum_refcnt(mcip) != 0) {
1821		mac_tx_unlock_allbutzero(mcip);
1822		cv_wait(&mcip->mci_tx_cv, &mcip->mci_tx_pcpu[0].pcpu_tx_lock);
1823		mutex_exit(&mcip->mci_tx_pcpu[0].pcpu_tx_lock);
1824		mac_tx_lock_all(mcip);
1825	}
1826	mac_tx_unlock_all(mcip);
1827}
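/*
 * To illustrate the interplay above: a data thread takes only its own
 * per cpu lock, while mac_tx_client_block() must hold every per cpu
 * lock before it can trust the summed refcnt. A sketch of the data
 * path side (the real logic lives in mac_tx(); "ptx" is shorthand for
 * the caller's mci_tx_pcpu slot):
 *
 *	mutex_enter(&ptx->pcpu_tx_lock);
 *	if (mcip->mci_tx_flag & MCI_TX_QUIESCE)
 *		bail out, the client is blocked
 *	ptx->pcpu_tx_refcnt++;
 *	mutex_exit(&ptx->pcpu_tx_lock);
 *
 *	... transmit ...
 *
 *	mutex_enter(&ptx->pcpu_tx_lock);
 *	if (--ptx->pcpu_tx_refcnt == 0 &&
 *	    (mcip->mci_tx_flag & MCI_TX_QUIESCE))
 *		cv_signal(&mcip->mci_tx_cv);	wake the blocked thread
 *	mutex_exit(&ptx->pcpu_tx_lock);
 */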
1828
1829void
1830mac_tx_client_unblock(mac_client_impl_t *mcip)
1831{
1832	mac_tx_lock_all(mcip);
1833	mcip->mci_tx_flag &= ~MCI_TX_QUIESCE;
1834	mac_tx_unlock_all(mcip);
1835	/*
1836	 * We may fail to disable flow control for the last MAC_NOTE_TX
1837	 * notification because the MAC client is quiesced. Send the
1838	 * notification again.
1839	 */
1840	i_mac_notify(mcip->mci_mip, MAC_NOTE_TX);
1841}
1842
1843/*
1844 * Wait for an SRS to quiesce. The SRS worker will signal us when the
1845 * quiesce is done.
1846 */
1847static void
1848mac_srs_quiesce_wait(mac_soft_ring_set_t *srs, uint_t srs_flag)
1849{
1850	mutex_enter(&srs->srs_lock);
1851	while (!(srs->srs_state & srs_flag))
1852		cv_wait(&srs->srs_quiesce_done_cv, &srs->srs_lock);
1853	mutex_exit(&srs->srs_lock);
1854}
1855
1856/*
1857 * Quiescing an Rx SRS is achieved by the following sequence. The protocol
1858 * works bottom up by cutting off packet flow from the bottommost point in the
1859 * mac, then the SRS, and then the soft rings. There are 2 use cases of this
1860 * mechanism. One is a temporary quiesce of the SRS, for instance while
1861 * changing the Rx callbacks. The other is Rx SRS teardown. In the former case
1862 * the QUIESCE prefix/suffix is used and in the latter CONDEMNED is used
1863 * for the SRS and MR flags. In the former case the threads pause waiting for
1864 * a restart, while in the latter case the threads exit. The Tx SRS teardown
1865 * is also mostly similar to the above.
1866 *
1867 * 1. Stop future hardware classified packets at the lowest level in the mac.
1868 *    Remove any hardware classification rule (CONDEMNED case) and mark the
1869 *    rings as CONDEMNED or QUIESCE as appropriate. This prevents the mr_refcnt
1870 *    from increasing. Upcalls from the driver that come through hardware
1871 *    classification will be dropped in mac_rx from now on. Then we wait for
1872 *    the mr_refcnt to drop to zero. When the mr_refcnt reaches zero we are
1873 *    sure there aren't any upcall threads from the driver through hardware
1874 *    classification. In the case of SRS teardown we also remove the
1875 *    classification rule in the driver.
1876 *
1877 * 2. Stop future software classified packets by marking the flow entry with
1878 *    FE_QUIESCE or FE_CONDEMNED as appropriate which prevents the refcnt from
1879 *    increasing. We also remove the flow entry from the table in the latter
1880 *    case. Then wait for the fe_refcnt to reach an appropriate quiescent value
1881 *    that indicates there aren't any active threads using that flow entry.
1882 *
1883 * 3. Quiesce the SRS and softrings by signaling the SRS. The SRS poll thread,
1884 *    SRS worker thread, and the soft ring threads are quiesced in sequence
1885 *    with the SRS worker thread serving as a master controller. This
1886 *    mechanism is explained in mac_srs_worker_quiesce().
1887 *
1888 * The restart mechanism to reactivate the SRS and softrings is explained
1889 * in mac_srs_worker_restart(). Here we just signal the SRS worker to start the
1890 * restart sequence.
1891 */
1892void
1893mac_rx_srs_quiesce(mac_soft_ring_set_t *srs, uint_t srs_quiesce_flag)
1894{
1895	flow_entry_t	*flent = srs->srs_flent;
1896	uint_t	mr_flag, srs_done_flag;
1897
1898	ASSERT(MAC_PERIM_HELD((mac_handle_t)FLENT_TO_MIP(flent)));
1899	ASSERT(!(srs->srs_type & SRST_TX));
1900
1901	if (srs_quiesce_flag == SRS_CONDEMNED) {
1902		mr_flag = MR_CONDEMNED;
1903		srs_done_flag = SRS_CONDEMNED_DONE;
1904		if (srs->srs_type & SRST_CLIENT_POLL_ENABLED)
1905			mac_srs_client_poll_disable(srs->srs_mcip, srs);
1906	} else {
1907		ASSERT(srs_quiesce_flag == SRS_QUIESCE);
1908		mr_flag = MR_QUIESCE;
1909		srs_done_flag = SRS_QUIESCE_DONE;
1910		if (srs->srs_type & SRST_CLIENT_POLL_ENABLED)
1911			mac_srs_client_poll_quiesce(srs->srs_mcip, srs);
1912	}
1913
1914	if (srs->srs_ring != NULL) {
1915		mac_rx_ring_quiesce(srs->srs_ring, mr_flag);
1916	} else {
1917		/*
1918		 * SRS is driven by software classification. In case
1919		 * of CONDEMNED, the top level teardown functions will
1920		 * deal with flow removal.
1921		 */
1922		if (srs_quiesce_flag != SRS_CONDEMNED) {
1923			FLOW_MARK(flent, FE_QUIESCE);
1924			mac_flow_wait(flent, FLOW_DRIVER_UPCALL);
1925		}
1926	}
1927
1928	/*
1929	 * Signal the SRS to quiesce itself, and then cv_wait for the
1930	 * SRS quiesce to complete. The SRS worker thread will wake us
1931	 * up when the quiesce is complete
1932	 */
1933	mac_srs_signal(srs, srs_quiesce_flag);
1934	mac_srs_quiesce_wait(srs, srs_done_flag);
1935}
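/*
 * A sketch of how the above is used for a temporary pause, for example
 * around changing the Rx callbacks (the perimeter must be held; "srs"
 * is any Rx SRS of the client):
 *
 *	mac_rx_srs_quiesce(srs, SRS_QUIESCE);	ring/flow marked QUIESCE,
 *						threads parked
 *	... update the Rx callbacks ...
 *	mac_rx_srs_restart(srs);		flags cleared, packets flow
 *
 * Passing SRS_CONDEMNED instead makes the worker, poll and soft ring
 * threads exit rather than park; that is the teardown path used by
 * mac_rx_srs_remove() below.
 */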
1936
1937/*
1938 * Remove an SRS.
1939 */
1940void
1941mac_rx_srs_remove(mac_soft_ring_set_t *srs)
1942{
1943	flow_entry_t *flent = srs->srs_flent;
1944	int i;
1945
1946	mac_rx_srs_quiesce(srs, SRS_CONDEMNED);
1947	/*
1948	 * Locate and remove our entry in the fe_rx_srs[] array, and
1949	 * adjust the fe_rx_srs array entries and array count by
1950	 * moving the last entry into the vacated spot.
1951	 */
1952	mutex_enter(&flent->fe_lock);
1953	for (i = 0; i < flent->fe_rx_srs_cnt; i++) {
1954		if (flent->fe_rx_srs[i] == srs)
1955			break;
1956	}
1957
1958	ASSERT(i != 0 && i < flent->fe_rx_srs_cnt);
1959	if (i != flent->fe_rx_srs_cnt - 1) {
1960		flent->fe_rx_srs[i] =
1961		    flent->fe_rx_srs[flent->fe_rx_srs_cnt - 1];
1962		i = flent->fe_rx_srs_cnt - 1;
1963	}
1964
1965	flent->fe_rx_srs[i] = NULL;
1966	flent->fe_rx_srs_cnt--;
1967	mutex_exit(&flent->fe_lock);
1968
1969	mac_srs_free(srs);
1970}
1971
1972static void
1973mac_srs_clear_flag(mac_soft_ring_set_t *srs, uint_t flag)
1974{
1975	mutex_enter(&srs->srs_lock);
1976	srs->srs_state &= ~flag;
1977	mutex_exit(&srs->srs_lock);
1978}
1979
1980void
1981mac_rx_srs_restart(mac_soft_ring_set_t *srs)
1982{
1983	flow_entry_t	*flent = srs->srs_flent;
1984	mac_ring_t	*mr;
1985
1986	ASSERT(MAC_PERIM_HELD((mac_handle_t)FLENT_TO_MIP(flent)));
1987	ASSERT((srs->srs_type & SRST_TX) == 0);
1988
1989	/*
1990	 * This handles a change in the number of SRSs between the quiesce
1991	 * and restart operations of a flow.
1992	 */
1993	if (!SRS_QUIESCED(srs))
1994		return;
1995
1996	/*
1997	 * Signal the SRS to restart itself. Wait for the restart to complete.
1998	 * Note that we only restart the SRS if it is not marked as
1999	 * permanently quiesced.
2000	 */
2001	if (!SRS_QUIESCED_PERMANENT(srs)) {
2002		mac_srs_signal(srs, SRS_RESTART);
2003		mac_srs_quiesce_wait(srs, SRS_RESTART_DONE);
2004		mac_srs_clear_flag(srs, SRS_RESTART_DONE);
2005
2006		mac_srs_client_poll_restart(srs->srs_mcip, srs);
2007	}
2008
2009	/* Finally clear the flags to let the packets in */
2010	mr = srs->srs_ring;
2011	if (mr != NULL) {
2012		MAC_RING_UNMARK(mr, MR_QUIESCE);
2013		/* In case the ring was stopped, safely restart it */
2014		if (mr->mr_state != MR_INUSE)
2015			(void) mac_start_ring(mr);
2016	} else {
2017		FLOW_UNMARK(flent, FE_QUIESCE);
2018	}
2019}
2020
2021/*
2022 * Temporary quiesce of a flow and associated Rx SRS.
2023 * Please see block comment above mac_rx_classify_flow_rem.
2024 */
2025/* ARGSUSED */
2026int
2027mac_rx_classify_flow_quiesce(flow_entry_t *flent, void *arg)
2028{
2029	int		i;
2030
2031	for (i = 0; i < flent->fe_rx_srs_cnt; i++) {
2032		mac_rx_srs_quiesce((mac_soft_ring_set_t *)flent->fe_rx_srs[i],
2033		    SRS_QUIESCE);
2034	}
2035	return (0);
2036}
2037
2038/*
2039 * Restart a flow and associated Rx SRS that has been quiesced temporarily.
2040 * Please see the block comment above mac_rx_classify_flow_rem.
2041 */
2042/* ARGSUSED */
2043int
2044mac_rx_classify_flow_restart(flow_entry_t *flent, void *arg)
2045{
2046	int		i;
2047
2048	for (i = 0; i < flent->fe_rx_srs_cnt; i++)
2049		mac_rx_srs_restart((mac_soft_ring_set_t *)flent->fe_rx_srs[i]);
2050
2051	return (0);
2052}
2053
2054void
2055mac_srs_perm_quiesce(mac_client_handle_t mch, boolean_t on)
2056{
2057	mac_client_impl_t	*mcip = (mac_client_impl_t *)mch;
2058	flow_entry_t		*flent = mcip->mci_flent;
2059	mac_impl_t		*mip = mcip->mci_mip;
2060	mac_soft_ring_set_t	*mac_srs;
2061	int			i;
2062
2063	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
2064
2065	if (flent == NULL)
2066		return;
2067
2068	for (i = 0; i < flent->fe_rx_srs_cnt; i++) {
2069		mac_srs = flent->fe_rx_srs[i];
2070		mutex_enter(&mac_srs->srs_lock);
2071		if (on)
2072			mac_srs->srs_state |= SRS_QUIESCE_PERM;
2073		else
2074			mac_srs->srs_state &= ~SRS_QUIESCE_PERM;
2075		mutex_exit(&mac_srs->srs_lock);
2076	}
2077}
2078
2079void
2080mac_rx_client_quiesce(mac_client_handle_t mch)
2081{
2082	mac_client_impl_t	*mcip = (mac_client_impl_t *)mch;
2083	mac_impl_t		*mip = mcip->mci_mip;
2084
2085	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
2086
2087	if (MCIP_DATAPATH_SETUP(mcip)) {
2088		(void) mac_rx_classify_flow_quiesce(mcip->mci_flent,
2089		    NULL);
2090		(void) mac_flow_walk_nolock(mcip->mci_subflow_tab,
2091		    mac_rx_classify_flow_quiesce, NULL);
2092	}
2093}
2094
2095void
2096mac_rx_client_restart(mac_client_handle_t mch)
2097{
2098	mac_client_impl_t	*mcip = (mac_client_impl_t *)mch;
2099	mac_impl_t		*mip = mcip->mci_mip;
2100
2101	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
2102
2103	if (MCIP_DATAPATH_SETUP(mcip)) {
2104		(void) mac_rx_classify_flow_restart(mcip->mci_flent, NULL);
2105		(void) mac_flow_walk_nolock(mcip->mci_subflow_tab,
2106		    mac_rx_classify_flow_restart, NULL);
2107	}
2108}
2109
2110/*
2111 * This function only quiesces the Tx SRS and softring worker threads. Callers
2112 * need to make sure that there aren't any mac client threads doing current or
2113 * future transmits in the mac before calling this function.
2114 */
2115void
2116mac_tx_srs_quiesce(mac_soft_ring_set_t *srs, uint_t srs_quiesce_flag)
2117{
2118	mac_client_impl_t	*mcip = srs->srs_mcip;
2119
2120	ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip));
2121
2122	ASSERT(srs->srs_type & SRST_TX);
2123	ASSERT(srs_quiesce_flag == SRS_CONDEMNED ||
2124	    srs_quiesce_flag == SRS_QUIESCE);
2125
2126	/*
2127	 * Signal the SRS to quiesce itself, and then cv_wait for the
2128	 * SRS quiesce to complete. The SRS worker thread will wake us
2129	 * up when the quiesce is complete
2130	 */
2131	mac_srs_signal(srs, srs_quiesce_flag);
2132	mac_srs_quiesce_wait(srs, srs_quiesce_flag == SRS_QUIESCE ?
2133	    SRS_QUIESCE_DONE : SRS_CONDEMNED_DONE);
2134}
2135
2136void
2137mac_tx_srs_restart(mac_soft_ring_set_t *srs)
2138{
2139	/*
2140	 * Resizing the fanout could result in creation of new SRSs.
2141	 * They may not necessarily be in the quiesced state, in which
2142	 * case they need not be restarted.
2143	 */
2144	if (!SRS_QUIESCED(srs))
2145		return;
2146
2147	mac_srs_signal(srs, SRS_RESTART);
2148	mac_srs_quiesce_wait(srs, SRS_RESTART_DONE);
2149	mac_srs_clear_flag(srs, SRS_RESTART_DONE);
2150}
2151
2152/*
2153 * Temporary quiesce of a flow and its associated Tx SRS.
2154 * Please see the block comment above mac_rx_srs_quiesce.
2155 */
2156/* ARGSUSED */
2157int
2158mac_tx_flow_quiesce(flow_entry_t *flent, void *arg)
2159{
2160	/*
2161	 * The fe_tx_srs is null for a subflow on an interface that is
2162	 * not plumbed
2163	 */
2164	if (flent->fe_tx_srs != NULL)
2165		mac_tx_srs_quiesce(flent->fe_tx_srs, SRS_QUIESCE);
2166	return (0);
2167}
2168
2169/* ARGSUSED */
2170int
2171mac_tx_flow_restart(flow_entry_t *flent, void *arg)
2172{
2173	/*
2174	 * The fe_tx_srs is null for a subflow on an interface that is
2175	 * not plumbed
2176	 */
2177	if (flent->fe_tx_srs != NULL)
2178		mac_tx_srs_restart(flent->fe_tx_srs);
2179	return (0);
2180}
2181
2182static void
2183i_mac_tx_client_quiesce(mac_client_handle_t mch, uint_t srs_quiesce_flag)
2184{
2185	mac_client_impl_t	*mcip = (mac_client_impl_t *)mch;
2186
2187	ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip));
2188
2189	mac_tx_client_block(mcip);
2190	if (MCIP_TX_SRS(mcip) != NULL) {
2191		mac_tx_srs_quiesce(MCIP_TX_SRS(mcip), srs_quiesce_flag);
2192		(void) mac_flow_walk_nolock(mcip->mci_subflow_tab,
2193		    mac_tx_flow_quiesce, NULL);
2194	}
2195}
2196
2197void
2198mac_tx_client_quiesce(mac_client_handle_t mch)
2199{
2200	i_mac_tx_client_quiesce(mch, SRS_QUIESCE);
2201}
2202
2203void
2204mac_tx_client_condemn(mac_client_handle_t mch)
2205{
2206	i_mac_tx_client_quiesce(mch, SRS_CONDEMNED);
2207}
2208
2209void
2210mac_tx_client_restart(mac_client_handle_t mch)
2211{
2212	mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
2213
2214	ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip));
2215
2216	mac_tx_client_unblock(mcip);
2217	if (MCIP_TX_SRS(mcip) != NULL) {
2218		mac_tx_srs_restart(MCIP_TX_SRS(mcip));
2219		(void) mac_flow_walk_nolock(mcip->mci_subflow_tab,
2220		    mac_tx_flow_restart, NULL);
2221	}
2222}
2223
2224void
2225mac_tx_client_flush(mac_client_impl_t *mcip)
2226{
2227	ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip));
2228
2229	mac_tx_client_quiesce((mac_client_handle_t)mcip);
2230	mac_tx_client_restart((mac_client_handle_t)mcip);
2231}
2232
2233void
2234mac_client_quiesce(mac_client_impl_t *mcip)
2235{
2236	mac_rx_client_quiesce((mac_client_handle_t)mcip);
2237	mac_tx_client_quiesce((mac_client_handle_t)mcip);
2238}
2239
2240void
2241mac_client_restart(mac_client_impl_t *mcip)
2242{
2243	mac_rx_client_restart((mac_client_handle_t)mcip);
2244	mac_tx_client_restart((mac_client_handle_t)mcip);
2245}
2246
2247/*
2248 * Allocate a minor number.
2249 */
2250minor_t
2251mac_minor_hold(boolean_t sleep)
2252{
2253	minor_t	minor;
2254
2255	/*
2256	 * Grab a value from the arena.
2257	 */
2258	atomic_add_32(&minor_count, 1);
2259
2260	if (sleep)
2261		minor = (uint_t)id_alloc(minor_ids);
2262	else
2263		minor = (uint_t)id_alloc_nosleep(minor_ids);
2264
2265	if (minor == 0) {
2266		atomic_add_32(&minor_count, -1);
2267		return (0);
2268	}
2269
2270	return (minor);
2271}
2272
2273/*
2274 * Release a previously allocated minor number.
2275 */
2276void
2277mac_minor_rele(minor_t minor)
2278{
2279	/*
2280	 * Return the value to the arena.
2281	 */
2282	id_free(minor_ids, minor);
2283	atomic_add_32(&minor_count, -1);
2284}
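/*
 * A sketch of the intended pairing (illustrative; a return value of 0
 * indicates that the nosleep allocation failed):
 *
 *	if ((minor = mac_minor_hold(B_FALSE)) == 0)
 *		return (ENOSPC);
 *	... create and use the minor node ...
 *	mac_minor_rele(minor);
 */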
2285
2286uint32_t
2287mac_no_notification(mac_handle_t mh)
2288{
2289	mac_impl_t *mip = (mac_impl_t *)mh;
2290
2291	return (((mip->mi_state_flags & MIS_LEGACY) != 0) ?
2292	    mip->mi_capab_legacy.ml_unsup_note : 0);
2293}
2294
2295/*
2296 * Prevent any new opens of this mac in preparation for unregister
2297 */
2298int
2299i_mac_disable(mac_impl_t *mip)
2300{
2301	mac_client_impl_t	*mcip;
2302
2303	rw_enter(&i_mac_impl_lock, RW_WRITER);
2304	if (mip->mi_state_flags & MIS_DISABLED) {
2305		/* Already disabled, return success */
2306		rw_exit(&i_mac_impl_lock);
2307		return (0);
2308	}
2309	/*
2310	 * See if there are any other references to this mac_t (e.g., VLANs).
2311	 * If so return failure. If all the other checks below pass, then
2312	 * set mi_disabled atomically under the i_mac_impl_lock to prevent
2313	 * any new VLANs from being created or new mac client opens of this
2314	 * mac end point.
2315	 */
2316	if (mip->mi_ref > 0) {
2317		rw_exit(&i_mac_impl_lock);
2318		return (EBUSY);
2319	}
2320
2321	/*
2322	 * MAC clients must delete all multicast groups they join before
2323	 * closing. Broadcast groups are reference counted; the last client
2324	 * to delete the group will wait till the group is physically
2325	 * deleted. Since all clients have closed this mac end point,
2326	 * mi_bcast_ngrps must be zero at this point.
2327	 */
2328	ASSERT(mip->mi_bcast_ngrps == 0);
2329
2330	/*
2331	 * Don't let go of this if it has some flows.
2332	 * All other code guarantees no flows are added to a disabled
2333	 * mac, therefore it is sufficient to check for the flow table
2334	 * only here.
2335	 */
2336	mcip = mac_primary_client_handle(mip);
2337	if ((mcip != NULL) && mac_link_has_flows((mac_client_handle_t)mcip)) {
2338		rw_exit(&i_mac_impl_lock);
2339		return (ENOTEMPTY);
2340	}
2341
2342	mip->mi_state_flags |= MIS_DISABLED;
2343	rw_exit(&i_mac_impl_lock);
2344	return (0);
2345}
2346
2347int
2348mac_disable_nowait(mac_handle_t mh)
2349{
2350	mac_impl_t	*mip = (mac_impl_t *)mh;
2351	int err;
2352
2353	if ((err = i_mac_perim_enter_nowait(mip)) != 0)
2354		return (err);
2355	err = i_mac_disable(mip);
2356	i_mac_perim_exit(mip);
2357	return (err);
2358}
2359
2360int
2361mac_disable(mac_handle_t mh)
2362{
2363	mac_impl_t	*mip = (mac_impl_t *)mh;
2364	int err;
2365
2366	i_mac_perim_enter(mip);
2367	err = i_mac_disable(mip);
2368	i_mac_perim_exit(mip);
2369
2370	/*
2371	 * Clean up notification thread and wait for it to exit.
2372	 */
2373	if (err == 0)
2374		i_mac_notify_exit(mip);
2375
2376	return (err);
2377}
2378
2379/*
2380 * Called when the MAC instance has a non-empty flow table, to de-multiplex
2381 * incoming packets to the right flow.
2382 * The MAC's rw lock is assumed held as a READER.
2383 */
2384/* ARGSUSED */
2385static mblk_t *
2386mac_rx_classify(mac_impl_t *mip, mac_resource_handle_t mrh, mblk_t *mp)
2387{
2388	flow_entry_t	*flent = NULL;
2389	uint_t		flags = FLOW_INBOUND;
2390	int		err;
2391
2392	/*
2393	 * If the mac is a port of an aggregation, pass FLOW_IGNORE_VLAN
2394	 * to mac_flow_lookup() so that the VLAN packets can be successfully
2395	 * passed to the non-VLAN aggregation flows.
2396	 *
2397	 * Note that there is possibly a race between this and
2398	 * mac_unicast_remove/add() and VLAN packets could be incorrectly
2399	 * classified to non-VLAN flows of non-aggregation mac clients. These
2400	 * VLAN packets will be then filtered out by the mac module.
2401	 */
2402	if ((mip->mi_state_flags & MIS_EXCLUSIVE) != 0)
2403		flags |= FLOW_IGNORE_VLAN;
2404
2405	err = mac_flow_lookup(mip->mi_flow_tab, mp, flags, &flent);
2406	if (err != 0) {
2407		/* no registered receive function */
2408		return (mp);
2409	} else {
2410		mac_client_impl_t	*mcip;
2411
2412		/*
2413		 * This flent might just be an additional one on the MAC client,
2414		 * i.e. for classification purposes (different fdesc); however,
2415		 * the resources, SRS et al., are in the mci_flent, so if
2416		 * this isn't the mci_flent, we need to get it.
2417		 */
2418		if ((mcip = flent->fe_mcip) != NULL &&
2419		    mcip->mci_flent != flent) {
2420			FLOW_REFRELE(flent);
2421			flent = mcip->mci_flent;
2422			FLOW_TRY_REFHOLD(flent, err);
2423			if (err != 0)
2424				return (mp);
2425		}
2426		(flent->fe_cb_fn)(flent->fe_cb_arg1, flent->fe_cb_arg2, mp,
2427		    B_FALSE);
2428		FLOW_REFRELE(flent);
2429	}
2430	return (NULL);
2431}
2432
2433mblk_t *
2434mac_rx_flow(mac_handle_t mh, mac_resource_handle_t mrh, mblk_t *mp_chain)
2435{
2436	mac_impl_t	*mip = (mac_impl_t *)mh;
2437	mblk_t		*bp, *bp1, **bpp, *list = NULL;
2438
2439	/*
2440	 * We walk the chain and attempt to classify each packet.
2441	 * The packets that couldn't be classified are returned
2442	 * to the caller.
2443	 */
2444	bp = mp_chain;
2445	bpp = &list;
2446	while (bp != NULL) {
2447		bp1 = bp;
2448		bp = bp->b_next;
2449		bp1->b_next = NULL;
2450
2451		if (mac_rx_classify(mip, mrh, bp1) != NULL) {
2452			*bpp = bp1;
2453			bpp = &bp1->b_next;
2454		}
2455	}
2456	return (list);
2457}
2458
2459static int
2460mac_tx_flow_srs_wakeup(flow_entry_t *flent, void *arg)
2461{
2462	mac_ring_handle_t ring = arg;
2463
2464	if (flent->fe_tx_srs)
2465		mac_tx_srs_wakeup(flent->fe_tx_srs, ring);
2466	return (0);
2467}
2468
2469void
2470i_mac_tx_srs_notify(mac_impl_t *mip, mac_ring_handle_t ring)
2471{
2472	mac_client_impl_t	*cclient;
2473	mac_soft_ring_set_t	*mac_srs;
2474
2475	/*
2476	 * After grabbing the mi_rw_lock, the list of clients can't change.
2477	 * If there are any clients, mi_disabled must be B_FALSE and can't
2478	 * become set while they exist. If there aren't any clients, we
2479	 * don't do anything. In any case the mip has to be valid. The driver
2480	 * must make sure that it goes single threaded (with respect to mac
2481	 * calls) and wait for all pending mac calls to finish before calling
2482	 * mac_unregister.
2483	 */
2484	rw_enter(&i_mac_impl_lock, RW_READER);
2485	if (mip->mi_state_flags & MIS_DISABLED) {
2486		rw_exit(&i_mac_impl_lock);
2487		return;
2488	}
2489
2490	/*
2491	 * Get each MAC client's Tx SRS by walking the mac client list.
2492	 */
2493	rw_enter(&mip->mi_rw_lock, RW_READER);
2494	for (cclient = mip->mi_clients_list; cclient != NULL;
2495	    cclient = cclient->mci_client_next) {
2496		if ((mac_srs = MCIP_TX_SRS(cclient)) != NULL) {
2497			mac_tx_srs_wakeup(mac_srs, ring);
2498		} else {
2499			/*
2500			 * Aggr opens underlying ports in exclusive mode
2501			 * and registers flow control callbacks using
2502			 * mac_tx_client_notify(). When opened in
2503			 * exclusive mode, Tx SRS won't be created
2504			 * during mac_unicast_add().
2505			 */
2506			if (cclient->mci_state_flags & MCIS_EXCLUSIVE) {
2507				mac_tx_invoke_callbacks(cclient,
2508				    (mac_tx_cookie_t)ring);
2509			}
2510		}
2511		(void) mac_flow_walk(cclient->mci_subflow_tab,
2512		    mac_tx_flow_srs_wakeup, ring);
2513	}
2514	rw_exit(&mip->mi_rw_lock);
2515	rw_exit(&i_mac_impl_lock);
2516}
2517
2518/* ARGSUSED */
2519void
2520mac_multicast_refresh(mac_handle_t mh, mac_multicst_t refresh, void *arg,
2521    boolean_t add)
2522{
2523	mac_impl_t *mip = (mac_impl_t *)mh;
2524
2525	i_mac_perim_enter((mac_impl_t *)mh);
2526	/*
2527	 * If no specific refresh function was given then default to the
2528	 * driver's m_multicst entry point.
2529	 */
2530	if (refresh == NULL) {
2531		refresh = mip->mi_multicst;
2532		arg = mip->mi_driver;
2533	}
2534
2535	mac_bcast_refresh(mip, refresh, arg, add);
2536	i_mac_perim_exit((mac_impl_t *)mh);
2537}
2538
2539void
2540mac_promisc_refresh(mac_handle_t mh, mac_setpromisc_t refresh, void *arg)
2541{
2542	mac_impl_t	*mip = (mac_impl_t *)mh;
2543
2544	/*
2545	 * If no specific refresh function was given then default to the
2546	 * driver's m_promisc entry point.
2547	 */
2548	if (refresh == NULL) {
2549		refresh = mip->mi_setpromisc;
2550		arg = mip->mi_driver;
2551	}
2552	ASSERT(refresh != NULL);
2553
2554	/*
2555	 * Call the refresh function with the current promiscuity.
2556	 */
2557	refresh(arg, (mip->mi_devpromisc != 0));
2558}
2559
2560/*
2561 * The mac client requests that the mac not change its margin size to
2562 * less than the specified value.  If "current" is B_TRUE, then the client
2563 * requests the mac not to change its margin size to be smaller than the
2564 * current size. Further, return the current margin size value in this case.
2565 *
2566 * We keep every requested size in an ordered list from largest to smallest.
2567 */
2568int
2569mac_margin_add(mac_handle_t mh, uint32_t *marginp, boolean_t current)
2570{
2571	mac_impl_t		*mip = (mac_impl_t *)mh;
2572	mac_margin_req_t	**pp, *p;
2573	int			err = 0;
2574
2575	rw_enter(&(mip->mi_rw_lock), RW_WRITER);
2576	if (current)
2577		*marginp = mip->mi_margin;
2578
2579	/*
2580	 * If the current margin value cannot satisfy the margin requested,
2581	 * return ENOTSUP directly.
2582	 */
2583	if (*marginp > mip->mi_margin) {
2584		err = ENOTSUP;
2585		goto done;
2586	}
2587
2588	/*
2589	 * Check whether the given margin is already in the list. If so,
2590	 * bump the reference count.
2591	 */
2592	for (pp = &mip->mi_mmrp; (p = *pp) != NULL; pp = &p->mmr_nextp) {
2593		if (p->mmr_margin == *marginp) {
2594			/*
2595			 * The margin requested is already in the list,
2596			 * so just bump the reference count.
2597			 */
2598			p->mmr_ref++;
2599			goto done;
2600		}
2601		if (p->mmr_margin < *marginp)
2602			break;
2603	}
2604
2606	p = kmem_zalloc(sizeof (mac_margin_req_t), KM_SLEEP);
2607	p->mmr_margin = *marginp;
2608	p->mmr_ref++;
2609	p->mmr_nextp = *pp;
2610	*pp = p;
2611
2612done:
2613	rw_exit(&(mip->mi_rw_lock));
2614	return (err);
2615}
2616
2617/*
2618 * The mac client requests to cancel its previous mac_margin_add() request.
2619 * We remove the requested margin size from the list.
2620 */
2621int
2622mac_margin_remove(mac_handle_t mh, uint32_t margin)
2623{
2624	mac_impl_t		*mip = (mac_impl_t *)mh;
2625	mac_margin_req_t	**pp, *p;
2626	int			err = 0;
2627
2628	rw_enter(&(mip->mi_rw_lock), RW_WRITER);
2629	/*
2630	 * Find the entry in the list for the given margin.
2631	 */
2632	for (pp = &(mip->mi_mmrp); (p = *pp) != NULL; pp = &(p->mmr_nextp)) {
2633		if (p->mmr_margin == margin) {
2634			if (--p->mmr_ref == 0)
2635				break;
2636
2637			/*
2638			 * There is still a reference to this margin so
2639			 * there's nothing more to do.
2640			 */
2641			goto done;
2642		}
2643	}
2644
2645	/*
2646	 * We did not find an entry for the given margin.
2647	 */
2648	if (p == NULL) {
2649		err = ENOENT;
2650		goto done;
2651	}
2652
2653	ASSERT(p->mmr_ref == 0);
2654
2655	/*
2656	 * Remove it from the list.
2657	 */
2658	*pp = p->mmr_nextp;
2659	kmem_free(p, sizeof (mac_margin_req_t));
2660done:
2661	rw_exit(&(mip->mi_rw_lock));
2662	return (err);
2663}
2664
2665boolean_t
2666mac_margin_update(mac_handle_t mh, uint32_t margin)
2667{
2668	mac_impl_t	*mip = (mac_impl_t *)mh;
2669	uint32_t	margin_needed = 0;
2670
2671	rw_enter(&(mip->mi_rw_lock), RW_WRITER);
2672
2673	if (mip->mi_mmrp != NULL)
2674		margin_needed = mip->mi_mmrp->mmr_margin;
2675
2676	if (margin_needed <= margin)
2677		mip->mi_margin = margin;
2678
2679	rw_exit(&(mip->mi_rw_lock));
2680
2681	if (margin_needed <= margin)
2682		i_mac_notify(mip, MAC_NOTE_MARGIN);
2683
2684	return (margin_needed <= margin);
2685}
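/*
 * A worked example of the ordered list maintained above, with entries
 * shown as margin(ref) and illustrative values: after mac_margin_add()
 * requests of 4, 22 and 4, the list is 22(1) -> 4(2) and mi_mmrp holds
 * the largest outstanding request. mac_margin_update(mh, 8) then fails
 * since 22 > 8; once mac_margin_remove(mh, 22) drops the head, the
 * list is 4(1) and the same update succeeds, setting mi_margin to 8
 * and generating a MAC_NOTE_MARGIN notification.
 */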
2686
2687/*
2688 * MAC Type Plugin functions.
2689 */
2690
2691mactype_t *
2692mactype_getplugin(const char *pname)
2693{
2694	mactype_t	*mtype = NULL;
2695	boolean_t	tried_modload = B_FALSE;
2696
2697	mutex_enter(&i_mactype_lock);
2698
2699find_registered_mactype:
2700	if (mod_hash_find(i_mactype_hash, (mod_hash_key_t)pname,
2701	    (mod_hash_val_t *)&mtype) != 0) {
2702		if (!tried_modload) {
2703			/*
2704			 * If the plugin has not yet been loaded, then
2705			 * attempt to load it now.  If modload() succeeds,
2706			 * the plugin should have registered using
2707			 * mactype_register(), in which case we can go back
2708			 * and attempt to find it again.
2709			 */
2710			if (modload(MACTYPE_KMODDIR, (char *)pname) != -1) {
2711				tried_modload = B_TRUE;
2712				goto find_registered_mactype;
2713			}
2714		}
2715	} else {
2716		/*
2717		 * Note that there's no danger that the plugin we've loaded
2718		 * could be unloaded between the modload() step and the
2719		 * reference count bump here, as we're holding
2720		 * i_mactype_lock, which mactype_unregister() also holds.
2721		 */
2722		atomic_inc_32(&mtype->mt_ref);
2723	}
2724
2725	mutex_exit(&i_mactype_lock);
2726	return (mtype);
2727}
2728
2729mactype_register_t *
2730mactype_alloc(uint_t mactype_version)
2731{
2732	mactype_register_t *mtrp;
2733
2734	/*
2735	 * Make sure there isn't a version mismatch between the plugin and
2736	 * the framework.  In the future, if multiple versions are
2737	 * supported, this check could become more sophisticated.
2738	 */
2739	if (mactype_version != MACTYPE_VERSION)
2740		return (NULL);
2741
2742	mtrp = kmem_zalloc(sizeof (mactype_register_t), KM_SLEEP);
2743	mtrp->mtr_version = mactype_version;
2744	return (mtrp);
2745}
2746
2747void
2748mactype_free(mactype_register_t *mtrp)
2749{
2750	kmem_free(mtrp, sizeof (mactype_register_t));
2751}
2752
2753int
2754mactype_register(mactype_register_t *mtrp)
2755{
2756	mactype_t	*mtp;
2757	mactype_ops_t	*ops = mtrp->mtr_ops;
2758
2759	/* Do some sanity checking before we register this MAC type. */
2760	if (mtrp->mtr_ident == NULL || ops == NULL)
2761		return (EINVAL);
2762
2763	/*
2764	 * Verify that all mandatory callbacks are set in the ops
2765	 * vector.
2766	 */
2767	if (ops->mtops_unicst_verify == NULL ||
2768	    ops->mtops_multicst_verify == NULL ||
2769	    ops->mtops_sap_verify == NULL ||
2770	    ops->mtops_header == NULL ||
2771	    ops->mtops_header_info == NULL) {
2772		return (EINVAL);
2773	}
2774
2775	mtp = kmem_zalloc(sizeof (*mtp), KM_SLEEP);
2776	mtp->mt_ident = mtrp->mtr_ident;
2777	mtp->mt_ops = *ops;
2778	mtp->mt_type = mtrp->mtr_mactype;
2779	mtp->mt_nativetype = mtrp->mtr_nativetype;
2780	mtp->mt_addr_length = mtrp->mtr_addrlen;
2781	if (mtrp->mtr_brdcst_addr != NULL) {
2782		mtp->mt_brdcst_addr = kmem_alloc(mtrp->mtr_addrlen, KM_SLEEP);
2783		bcopy(mtrp->mtr_brdcst_addr, mtp->mt_brdcst_addr,
2784		    mtrp->mtr_addrlen);
2785	}
2786
2787	mtp->mt_stats = mtrp->mtr_stats;
2788	mtp->mt_statcount = mtrp->mtr_statcount;
2789
2790	mtp->mt_mapping = mtrp->mtr_mapping;
2791	mtp->mt_mappingcount = mtrp->mtr_mappingcount;
2792
2793	if (mod_hash_insert(i_mactype_hash,
2794	    (mod_hash_key_t)mtp->mt_ident, (mod_hash_val_t)mtp) != 0) {
2795		kmem_free(mtp->mt_brdcst_addr, mtp->mt_addr_length);
2796		kmem_free(mtp, sizeof (*mtp));
2797		return (EEXIST);
2798	}
2799	return (0);
2800}
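/*
 * A sketch of how a MAC type plugin registers itself using the
 * interfaces above, typically from its _init() routine (the plugin
 * name and ops vector are hypothetical; the ops callbacks checked as
 * mandatory in mactype_register() must all be set):
 *
 *	mactype_register_t *mtrp = mactype_alloc(MACTYPE_VERSION);
 *
 *	mtrp->mtr_ident = "myplugin";
 *	mtrp->mtr_ops = &myplugin_mactype_ops;
 *	mtrp->mtr_mactype = DL_ETHER;
 *	mtrp->mtr_nativetype = DL_ETHER;
 *	mtrp->mtr_addrlen = ETHERADDRL;
 *	mtrp->mtr_brdcst_addr = myplugin_brdcst_addr;
 *
 *	err = mactype_register(mtrp);
 *	mactype_free(mtrp);	registration copied the broadcast address
 */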
2801
2802int
2803mactype_unregister(const char *ident)
2804{
2805	mactype_t	*mtp;
2806	mod_hash_val_t	val;
2807	int 		err;
2808
2809	/*
2810	 * Let's not allow MAC drivers to use this plugin while we're
2811	 * trying to unregister it.  Holding i_mactype_lock also prevents a
2812	 * plugin from unregistering while a MAC driver is attempting to
2813	 * hold a reference to it in mactype_getplugin().
2814	 */
2815	mutex_enter(&i_mactype_lock);
2816
2817	if ((err = mod_hash_find(i_mactype_hash, (mod_hash_key_t)ident,
2818	    (mod_hash_val_t *)&mtp)) != 0) {
2819		/* A plugin is trying to unregister, but it never registered. */
2820		err = ENXIO;
2821		goto done;
2822	}
2823
2824	if (mtp->mt_ref != 0) {
2825		err = EBUSY;
2826		goto done;
2827	}
2828
2829	err = mod_hash_remove(i_mactype_hash, (mod_hash_key_t)ident, &val);
2830	ASSERT(err == 0);
2831	if (err != 0) {
2832		/* This should never happen, thus the ASSERT() above. */
2833		err = EINVAL;
2834		goto done;
2835	}
2836	ASSERT(mtp == (mactype_t *)val);
2837
2838	if (mtp->mt_brdcst_addr != NULL)
2839		kmem_free(mtp->mt_brdcst_addr, mtp->mt_addr_length);
2840	kmem_free(mtp, sizeof (mactype_t));
2841done:
2842	mutex_exit(&i_mactype_lock);
2843	return (err);
2844}
2845
2846/*
2847 * Checks the size of the value specified for a property as
2848 * part of a property operation. Returns B_TRUE if the size is
2849 * correct, B_FALSE otherwise.
2850 */
2851boolean_t
2852mac_prop_check_size(mac_prop_id_t id, uint_t valsize, boolean_t is_range)
2853{
2854	uint_t minsize = 0;
2855
2856	if (is_range)
2857		return (valsize >= sizeof (mac_propval_range_t));
2858
2859	switch (id) {
2860	case MAC_PROP_ZONE:
2861		minsize = sizeof (dld_ioc_zid_t);
2862		break;
2863	case MAC_PROP_AUTOPUSH:
2864		if (valsize != 0)
2865			minsize = sizeof (struct dlautopush);
2866		break;
2867	case MAC_PROP_TAGMODE:
2868		minsize = sizeof (link_tagmode_t);
2869		break;
2870	case MAC_PROP_RESOURCE:
2871	case MAC_PROP_RESOURCE_EFF:
2872		minsize = sizeof (mac_resource_props_t);
2873		break;
2874	case MAC_PROP_DUPLEX:
2875		minsize = sizeof (link_duplex_t);
2876		break;
2877	case MAC_PROP_SPEED:
2878		minsize = sizeof (uint64_t);
2879		break;
2880	case MAC_PROP_STATUS:
2881		minsize = sizeof (link_state_t);
2882		break;
2883	case MAC_PROP_AUTONEG:
2884	case MAC_PROP_EN_AUTONEG:
2885		minsize = sizeof (uint8_t);
2886		break;
2887	case MAC_PROP_MTU:
2888	case MAC_PROP_LLIMIT:
2889	case MAC_PROP_LDECAY:
2890		minsize = sizeof (uint32_t);
2891		break;
2892	case MAC_PROP_FLOWCTRL:
2893		minsize = sizeof (link_flowctrl_t);
2894		break;
2895	case MAC_PROP_ADV_10GFDX_CAP:
2896	case MAC_PROP_EN_10GFDX_CAP:
2897	case MAC_PROP_ADV_1000HDX_CAP:
2898	case MAC_PROP_EN_1000HDX_CAP:
2899	case MAC_PROP_ADV_100FDX_CAP:
2900	case MAC_PROP_EN_100FDX_CAP:
2901	case MAC_PROP_ADV_100HDX_CAP:
2902	case MAC_PROP_EN_100HDX_CAP:
2903	case MAC_PROP_ADV_10FDX_CAP:
2904	case MAC_PROP_EN_10FDX_CAP:
2905	case MAC_PROP_ADV_10HDX_CAP:
2906	case MAC_PROP_EN_10HDX_CAP:
2907	case MAC_PROP_ADV_100T4_CAP:
2908	case MAC_PROP_EN_100T4_CAP:
2909		minsize = sizeof (uint8_t);
2910		break;
2911	case MAC_PROP_PVID:
2912		minsize = sizeof (uint16_t);
2913		break;
2914	case MAC_PROP_IPTUN_HOPLIMIT:
2915		minsize = sizeof (uint32_t);
2916		break;
2917	case MAC_PROP_IPTUN_ENCAPLIMIT:
2918		minsize = sizeof (uint32_t);
2919		break;
2920	case MAC_PROP_MAX_TX_RINGS_AVAIL:
2921	case MAC_PROP_MAX_RX_RINGS_AVAIL:
2922	case MAC_PROP_MAX_RXHWCLNT_AVAIL:
2923	case MAC_PROP_MAX_TXHWCLNT_AVAIL:
2924		minsize = sizeof (uint_t);
2925		break;
2926	case MAC_PROP_WL_ESSID:
2927		minsize = sizeof (wl_linkstatus_t);
2928		break;
2929	case MAC_PROP_WL_BSSID:
2930		minsize = sizeof (wl_bssid_t);
2931		break;
2932	case MAC_PROP_WL_BSSTYPE:
2933		minsize = sizeof (wl_bss_type_t);
2934		break;
2935	case MAC_PROP_WL_LINKSTATUS:
2936		minsize = sizeof (wl_linkstatus_t);
2937		break;
2938	case MAC_PROP_WL_DESIRED_RATES:
2939		minsize = sizeof (wl_rates_t);
2940		break;
2941	case MAC_PROP_WL_SUPPORTED_RATES:
2942		minsize = sizeof (wl_rates_t);
2943		break;
2944	case MAC_PROP_WL_AUTH_MODE:
2945		minsize = sizeof (wl_authmode_t);
2946		break;
2947	case MAC_PROP_WL_ENCRYPTION:
2948		minsize = sizeof (wl_encryption_t);
2949		break;
2950	case MAC_PROP_WL_RSSI:
2951		minsize = sizeof (wl_rssi_t);
2952		break;
2953	case MAC_PROP_WL_PHY_CONFIG:
2954		minsize = sizeof (wl_phy_conf_t);
2955		break;
2956	case MAC_PROP_WL_CAPABILITY:
2957		minsize = sizeof (wl_capability_t);
2958		break;
2959	case MAC_PROP_WL_WPA:
2960		minsize = sizeof (wl_wpa_t);
2961		break;
2962	case MAC_PROP_WL_SCANRESULTS:
2963		minsize = sizeof (wl_wpa_ess_t);
2964		break;
2965	case MAC_PROP_WL_POWER_MODE:
2966		minsize = sizeof (wl_ps_mode_t);
2967		break;
2968	case MAC_PROP_WL_RADIO:
2969		minsize = sizeof (wl_radio_t);
2970		break;
2971	case MAC_PROP_WL_ESS_LIST:
2972		minsize = sizeof (wl_ess_list_t);
2973		break;
2974	case MAC_PROP_WL_KEY_TAB:
2975		minsize = sizeof (wl_wep_key_tab_t);
2976		break;
2977	case MAC_PROP_WL_CREATE_IBSS:
2978		minsize = sizeof (wl_create_ibss_t);
2979		break;
2980	case MAC_PROP_WL_SETOPTIE:
2981		minsize = sizeof (wl_wpa_ie_t);
2982		break;
2983	case MAC_PROP_WL_DELKEY:
2984		minsize = sizeof (wl_del_key_t);
2985		break;
2986	case MAC_PROP_WL_KEY:
2987		minsize = sizeof (wl_key_t);
2988		break;
2989	case MAC_PROP_WL_MLME:
2990		minsize = sizeof (wl_mlme_t);
2991		break;
2992	}
2993
2994	return (valsize >= minsize);
2995}
2996
2997/*
2998 * mac_set_prop() sets MAC or hardware driver properties:
2999 *
3000 * - MAC-managed properties such as resource properties include maxbw,
3001 *   priority, and cpu binding list, as well as the default port VID
3002 *   used by bridging. These properties are consumed by the MAC layer
3003 *   itself and not passed down to the driver. For resource control
3004 *   properties, this function invokes mac_set_resources() which will
3005 *   cache the property value in mac_impl_t and may call
3006 *   mac_client_set_resource() to update property value of the primary
3007 *   mac client, if it exists.
3008 *
3009 * - Properties which act on the hardware and must be passed to the
3010 *   driver, such as MTU, through the driver's mc_setprop() entry point.
3011 */
3012int
3013mac_set_prop(mac_handle_t mh, mac_prop_id_t id, char *name, void *val,
3014    uint_t valsize)
3015{
3016	int err = ENOTSUP;
3017	mac_impl_t *mip = (mac_impl_t *)mh;
3018
3019	ASSERT(MAC_PERIM_HELD(mh));
3020
3021	switch (id) {
3022	case MAC_PROP_RESOURCE: {
3023		mac_resource_props_t *mrp;
3024
3025		/* call mac_set_resources() for MAC properties */
3026		ASSERT(valsize >= sizeof (mac_resource_props_t));
3027		mrp = kmem_zalloc(sizeof (*mrp), KM_SLEEP);
3028		bcopy(val, mrp, sizeof (*mrp));
3029		err = mac_set_resources(mh, mrp);
3030		kmem_free(mrp, sizeof (*mrp));
3031		break;
3032	}
3033
3034	case MAC_PROP_PVID:
3035		ASSERT(valsize >= sizeof (uint16_t));
3036		if (mip->mi_state_flags & MIS_IS_VNIC)
3037			return (EINVAL);
3038		err = mac_set_pvid(mh, *(uint16_t *)val);
3039		break;
3040
3041	case MAC_PROP_MTU: {
3042		uint32_t mtu;
3043
3044		ASSERT(valsize >= sizeof (uint32_t));
3045		bcopy(val, &mtu, sizeof (mtu));
3046		err = mac_set_mtu(mh, mtu, NULL);
3047		break;
3048	}
3049
3050	case MAC_PROP_LLIMIT:
3051	case MAC_PROP_LDECAY: {
3052		uint32_t learnval;
3053
3054		if (valsize < sizeof (learnval) ||
3055		    (mip->mi_state_flags & MIS_IS_VNIC))
3056			return (EINVAL);
3057		bcopy(val, &learnval, sizeof (learnval));
3058		if (learnval == 0 && id == MAC_PROP_LDECAY)
3059			return (EINVAL);
3060		if (id == MAC_PROP_LLIMIT)
3061			mip->mi_llimit = learnval;
3062		else
3063			mip->mi_ldecay = learnval;
3064		err = 0;
3065		break;
3066	}
3067
3068	default:
3069		/* For other driver properties, call driver's callback */
3070		if (mip->mi_callbacks->mc_callbacks & MC_SETPROP) {
3071			err = mip->mi_callbacks->mc_setprop(mip->mi_driver,
3072			    name, id, valsize, val);
3073		}
3074	}
3075	return (err);
3076}
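/*
 * A sketch of a typical call into mac_set_prop(); the perimeter must
 * be held across the call, as asserted above (the MTU value here is
 * illustrative):
 *
 *	uint32_t mtu = 9000;
 *
 *	i_mac_perim_enter(mip);
 *	err = mac_set_prop(mh, MAC_PROP_MTU, "mtu", &mtu, sizeof (mtu));
 *	i_mac_perim_exit(mip);
 */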
3077
3078/*
3079 * mac_get_prop() gets MAC or device driver properties.
3080 *
3081 * If the property is a driver property, mac_get_prop() calls driver's callback
3082 * entry point to get it.
3083 * If the property is a MAC property, mac_get_prop() invokes mac_get_resources()
3084 * which returns the cached value in mac_impl_t.
3085 */
3086int
3087mac_get_prop(mac_handle_t mh, mac_prop_id_t id, char *name, void *val,
3088    uint_t valsize)
3089{
3090	int err = ENOTSUP;
3091	mac_impl_t *mip = (mac_impl_t *)mh;
3092	uint_t	rings;
3093	uint_t	vlinks;
3094
3095	bzero(val, valsize);
3096
3097	switch (id) {
3098	case MAC_PROP_RESOURCE: {
3099		mac_resource_props_t *mrp;
3100
3101		/* If mac property, read from cache */
3102		ASSERT(valsize >= sizeof (mac_resource_props_t));
3103		mrp = kmem_zalloc(sizeof (*mrp), KM_SLEEP);
3104		mac_get_resources(mh, mrp);
3105		bcopy(mrp, val, sizeof (*mrp));
3106		kmem_free(mrp, sizeof (*mrp));
3107		return (0);
3108	}
3109	case MAC_PROP_RESOURCE_EFF: {
3110		mac_resource_props_t *mrp;
3111
3112		/* If mac effective property, read from client */
3113		ASSERT(valsize >= sizeof (mac_resource_props_t));
3114		mrp = kmem_zalloc(sizeof (*mrp), KM_SLEEP);
3115		mac_get_effective_resources(mh, mrp);
3116		bcopy(mrp, val, sizeof (*mrp));
3117		kmem_free(mrp, sizeof (*mrp));
3118		return (0);
3119	}
3120
3121	case MAC_PROP_PVID:
3122		ASSERT(valsize >= sizeof (uint16_t));
3123		if (mip->mi_state_flags & MIS_IS_VNIC)
3124			return (EINVAL);
3125		*(uint16_t *)val = mac_get_pvid(mh);
3126		return (0);
3127
3128	case MAC_PROP_LLIMIT:
3129	case MAC_PROP_LDECAY:
3130		ASSERT(valsize >= sizeof (uint32_t));
3131		if (mip->mi_state_flags & MIS_IS_VNIC)
3132			return (EINVAL);
3133		if (id == MAC_PROP_LLIMIT)
3134			bcopy(&mip->mi_llimit, val, sizeof (mip->mi_llimit));
3135		else
3136			bcopy(&mip->mi_ldecay, val, sizeof (mip->mi_ldecay));
3137		return (0);
3138
3139	case MAC_PROP_MTU: {
3140		uint32_t sdu;
3141
3142		ASSERT(valsize >= sizeof (uint32_t));
3143		mac_sdu_get2(mh, NULL, &sdu, NULL);
3144		bcopy(&sdu, val, sizeof (sdu));
3145
3146		return (0);
3147	}
3148	case MAC_PROP_STATUS: {
3149		link_state_t link_state;
3150
3151		if (valsize < sizeof (link_state))
3152			return (EINVAL);
3153		link_state = mac_link_get(mh);
3154		bcopy(&link_state, val, sizeof (link_state));
3155
3156		return (0);
3157	}
3158
3159	case MAC_PROP_MAX_RX_RINGS_AVAIL:
3160	case MAC_PROP_MAX_TX_RINGS_AVAIL:
3161		ASSERT(valsize >= sizeof (uint_t));
3162		rings = id == MAC_PROP_MAX_RX_RINGS_AVAIL ?
3163		    mac_rxavail_get(mh) : mac_txavail_get(mh);
3164		bcopy(&rings, val, sizeof (uint_t));
3165		return (0);
3166
3167	case MAC_PROP_MAX_RXHWCLNT_AVAIL:
3168	case MAC_PROP_MAX_TXHWCLNT_AVAIL:
3169		ASSERT(valsize >= sizeof (uint_t));
3170		vlinks = id == MAC_PROP_MAX_RXHWCLNT_AVAIL ?
3171		    mac_rxhwlnksavail_get(mh) : mac_txhwlnksavail_get(mh);
3172		bcopy(&vlinks, val, sizeof (uint_t));
3173		return (0);
3174
3175	case MAC_PROP_RXRINGSRANGE:
3176	case MAC_PROP_TXRINGSRANGE:
3177		/*
3178		 * The values for these properties are returned through
3179		 * the MAC_PROP_RESOURCE property.
3180		 */
3181		return (0);
3182
3183	default:
3184		break;
3185
3186	}
3187
3188	/* If driver property, request from driver */
3189	if (mip->mi_callbacks->mc_callbacks & MC_GETPROP) {
3190		err = mip->mi_callbacks->mc_getprop(mip->mi_driver, name, id,
3191		    valsize, val);
3192	}
3193
3194	return (err);
3195}
3196
3197/*
3198 * Helper function to initialize the range structure for use in
3199 * mac_prop_info. If the type can be other than uint32, we can
3200 * pass that as an arg.
3201 */
3202static void
3203_mac_set_range(mac_propval_range_t *range, uint32_t min, uint32_t max)
3204{
3205	range->mpr_count = 1;
3206	range->mpr_type = MAC_PROPVAL_UINT32;
3207	range->mpr_range_uint32[0].mpur_min = min;
3208	range->mpr_range_uint32[0].mpur_max = max;
3209}
3210
3211/*
3212 * Returns information about the specified property, such as default
3213 * values or permissions.
3214 */
3215int
3216mac_prop_info(mac_handle_t mh, mac_prop_id_t id, char *name,
3217    void *default_val, uint_t default_size, mac_propval_range_t *range,
3218    uint_t *perm)
3219{
3220	mac_prop_info_state_t state;
3221	mac_impl_t *mip = (mac_impl_t *)mh;
3222	uint_t	max;
3223
3224	/*
3225	 * A property is read/write by default unless the driver says
3226	 * otherwise.
3227	 */
3228	if (perm != NULL)
3229		*perm = MAC_PROP_PERM_RW;
3230
3231	if (default_val != NULL)
3232		bzero(default_val, default_size);
3233
3234	/*
3235	 * First, handle framework properties for which we don't need to
3236	 * involve the driver.
3237	 */
3238	switch (id) {
3239	case MAC_PROP_RESOURCE:
3240	case MAC_PROP_PVID:
3241	case MAC_PROP_LLIMIT:
3242	case MAC_PROP_LDECAY:
3243		return (0);
3244
3245	case MAC_PROP_MAX_RX_RINGS_AVAIL:
3246	case MAC_PROP_MAX_TX_RINGS_AVAIL:
3247	case MAC_PROP_MAX_RXHWCLNT_AVAIL:
3248	case MAC_PROP_MAX_TXHWCLNT_AVAIL:
3249		if (perm != NULL)
3250			*perm = MAC_PROP_PERM_READ;
3251		return (0);
3252
3253	case MAC_PROP_RXRINGSRANGE:
3254	case MAC_PROP_TXRINGSRANGE:
3255		/*
3256		 * Currently, we support ranges for the RX and TX rings properties.
3257		 * When we extend this support to maxbw, cpus and priority,
3258		 * we should move this to mac_get_resources.
3259		 * There is no default value for RX or TX rings.
3260		 */
3261		if ((mip->mi_state_flags & MIS_IS_VNIC) &&
3262		    mac_is_vnic_primary(mh)) {
3263			/*
3264			 * We don't support setting rings for a VLAN
3265			 * data link because it shares its ring with the
3266			 * primary MAC client.
3267			 */
3268			if (perm != NULL)
3269				*perm = MAC_PROP_PERM_READ;
3270			if (range != NULL)
3271				range->mpr_count = 0;
3272		} else if (range != NULL) {
3273			if (mip->mi_state_flags & MIS_IS_VNIC)
3274				mh = mac_get_lower_mac_handle(mh);
3275			mip = (mac_impl_t *)mh;
3276			if ((id == MAC_PROP_RXRINGSRANGE &&
3277			    mip->mi_rx_group_type == MAC_GROUP_TYPE_STATIC) ||
3278			    (id == MAC_PROP_TXRINGSRANGE &&
3279			    mip->mi_tx_group_type == MAC_GROUP_TYPE_STATIC)) {
3280				if (id == MAC_PROP_RXRINGSRANGE) {
3281					if ((mac_rxhwlnksavail_get(mh) +
3282					    mac_rxhwlnksrsvd_get(mh)) <= 1) {
3283						/*
3284						 * doesn't support groups or
3285						 * rings
3286						 */
3287						range->mpr_count = 0;
3288					} else {
3289						/*
3290						 * supports specifying groups,
3291						 * but not rings
3292						 */
3293						_mac_set_range(range, 0, 0);
3294					}
3295				} else {
3296					if ((mac_txhwlnksavail_get(mh) +
3297					    mac_txhwlnksrsvd_get(mh)) <= 1) {
3298						/*
3299						 * doesn't support groups or
3300						 * rings
3301						 */
3302						range->mpr_count = 0;
3303					} else {
3304						/*
3305						 * supports specifying groups,
3306						 * but not rings
3307						 */
3308						_mac_set_range(range, 0, 0);
3309					}
3310				}
3311			} else {
3312				max = id == MAC_PROP_RXRINGSRANGE ?
3313				    mac_rxavail_get(mh) + mac_rxrsvd_get(mh) :
3314				    mac_txavail_get(mh) + mac_txrsvd_get(mh);
3315				if (max <= 1) {
3316					/*
3317					 * doesn't support groups or
3318					 * rings
3319					 */
3320					range->mpr_count = 0;
3321				} else  {
3322					/*
3323					 * -1 because we have to leave out the
3324					 * default ring.
3325					 */
3326					_mac_set_range(range, 1, max - 1);
3327				}
3328			}
3329		}
3330		return (0);
3331
3332	case MAC_PROP_STATUS:
3333		if (perm != NULL)
3334			*perm = MAC_PROP_PERM_READ;
3335		return (0);
3336	}
3337
3338	/*
3339	 * Get the property info from the driver if it implements the
3340	 * property info entry point.
3341	 */
3342	bzero(&state, sizeof (state));
3343
3344	if (mip->mi_callbacks->mc_callbacks & MC_PROPINFO) {
3345		state.pr_default = default_val;
3346		state.pr_default_size = default_size;
3347
3348		/*
3349		 * The caller specifies the maximum number of ranges
3350		 * it can accommodate using mpr_count. We don't touch
3351		 * this value until the driver returns from its
3352		 * mc_propinfo() callback, and ensure we don't exceed
3353		 * that number of ranges as the driver defines the
3354		 * supported ranges from its mc_propinfo().
3355		 *
3356		 * pr_range_cur_count keeps track of how many ranges
3357		 * were defined by the driver from its mc_propinfo()
3358		 * entry point.
3359		 *
3360		 * On exit, the user-specified range mpr_count returns
3361		 * the number of ranges specified by the driver on
3362		 * success, or the number of ranges it wanted to
3363		 * define if that number of ranges could not be
3364		 * accommodated by the specified range structure.  In
3365		 * the latter case, the caller will be able to
3366		 * allocate a larger range structure, and query the
3367		 * property again.
3368		 */
3369		state.pr_range_cur_count = 0;
3370		state.pr_range = range;
3371
3372		mip->mi_callbacks->mc_propinfo(mip->mi_driver, name, id,
3373		    (mac_prop_info_handle_t)&state);
3374
3375		if (state.pr_flags & MAC_PROP_INFO_RANGE)
3376			range->mpr_count = state.pr_range_cur_count;
3377
3378		/*
3379		 * The operation could fail if the buffer supplied by
3380		 * the user was too small for the range or default
3381		 * value of the property.
3382		 */
3383		if (state.pr_errno != 0)
3384			return (state.pr_errno);
3385
3386		if (perm != NULL && state.pr_flags & MAC_PROP_INFO_PERM)
3387			*perm = state.pr_perm;
3388	}
3389
3390	/*
3391	 * The MAC layer may want to provide default values or allowed
3392	 * ranges for properties if the driver does not provide a
3393	 * property info entry point, or that entry point exists, but
3394	 * it did not provide a default value or allowed ranges for
3395	 * that property.
3396	 */
3397	switch (id) {
3398	case MAC_PROP_MTU: {
3399		uint32_t sdu;
3400
3401		mac_sdu_get2(mh, NULL, &sdu, NULL);
3402
3403		if (range != NULL && !(state.pr_flags &
3404		    MAC_PROP_INFO_RANGE)) {
3405			/* MTU range */
3406			_mac_set_range(range, sdu, sdu);
3407		}
3408
3409		if (default_val != NULL && !(state.pr_flags &
3410		    MAC_PROP_INFO_DEFAULT)) {
3411			if (mip->mi_info.mi_media == DL_ETHER)
3412				sdu = ETHERMTU;
3413			/* default MTU value */
3414			bcopy(&sdu, default_val, sizeof (sdu));
3415		}
3416	}
3417	}
3418
3419	return (0);
3420}
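/*
 * A sketch of the mpr_count retry protocol described above (the
 * initial single-range buffer is illustrative):
 *
 *	range->mpr_count = 1;	ranges the caller can accommodate
 *	err = mac_prop_info(mh, id, name, NULL, 0, range, &perm);
 *	if (err != 0) {
 *		on a range overflow, mpr_count now holds the number
 *		of ranges the driver wanted; allocate a structure
 *		large enough for that many and query again
 *	}
 */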
3421
3422int
3423mac_fastpath_disable(mac_handle_t mh)
3424{
3425	mac_impl_t	*mip = (mac_impl_t *)mh;
3426
3427	if ((mip->mi_state_flags & MIS_LEGACY) == 0)
3428		return (0);
3429
3430	return (mip->mi_capab_legacy.ml_fastpath_disable(mip->mi_driver));
3431}
3432
3433void
3434mac_fastpath_enable(mac_handle_t mh)
3435{
3436	mac_impl_t	*mip = (mac_impl_t *)mh;
3437
3438	if ((mip->mi_state_flags & MIS_LEGACY) == 0)
3439		return;
3440
3441	mip->mi_capab_legacy.ml_fastpath_enable(mip->mi_driver);
3442}
3443
3444void
3445mac_register_priv_prop(mac_impl_t *mip, char **priv_props)
3446{
3447	uint_t nprops, i;
3448
3449	if (priv_props == NULL)
3450		return;
3451
3452	nprops = 0;
3453	while (priv_props[nprops] != NULL)
3454		nprops++;
3455	if (nprops == 0)
3456		return;
3457
3459	mip->mi_priv_prop = kmem_zalloc(nprops * sizeof (char *), KM_SLEEP);
3460
3461	for (i = 0; i < nprops; i++) {
3462		mip->mi_priv_prop[i] = kmem_zalloc(MAXLINKPROPNAME, KM_SLEEP);
3463		(void) strlcpy(mip->mi_priv_prop[i], priv_props[i],
3464		    MAXLINKPROPNAME);
3465	}
3466
3467	mip->mi_priv_prop_count = nprops;
3468}
3469
3470void
3471mac_unregister_priv_prop(mac_impl_t *mip)
3472{
3473	uint_t i;
3474
3475	if (mip->mi_priv_prop_count == 0) {
3476		ASSERT(mip->mi_priv_prop == NULL);
3477		return;
3478	}
3479
3480	for (i = 0; i < mip->mi_priv_prop_count; i++)
3481		kmem_free(mip->mi_priv_prop[i], MAXLINKPROPNAME);
3482	kmem_free(mip->mi_priv_prop, mip->mi_priv_prop_count *
3483	    sizeof (char *));
3484
3485	mip->mi_priv_prop = NULL;
3486	mip->mi_priv_prop_count = 0;
3487}
3488
3489/*
3490 * mac_ring_t 'mr' macros. Some rogue drivers may access the ring structure
3491 * (by invoking mac_rx()) even after processing mac_stop_ring(). In such
3492 * cases, if MAC frees the ring structure after mac_stop_ring(), any
3493 * illegal access to the ring structure coming from the driver will panic
3494 * the system. In order to protect the system from such inadvertent access,
3495 * we maintain a cache of rings in the mac_impl_t after they get freed up.
3496 * When packets are received on freed rings, MAC (through the generation
3497 * count mechanism) will drop such packets.
3498 */
3499static mac_ring_t *
3500mac_ring_alloc(mac_impl_t *mip)
3501{
3502	mac_ring_t *ring;
3503
3504	mutex_enter(&mip->mi_ring_lock);
3505	if (mip->mi_ring_freelist != NULL) {
3506		ring = mip->mi_ring_freelist;
3507		mip->mi_ring_freelist = ring->mr_next;
3508		bzero(ring, sizeof (mac_ring_t));
3509		mutex_exit(&mip->mi_ring_lock);
3510	} else {
3511		mutex_exit(&mip->mi_ring_lock);
3512		ring = kmem_cache_alloc(mac_ring_cache, KM_SLEEP);
3513	}
3514	ASSERT((ring != NULL) && (ring->mr_state == MR_FREE));
3515	return (ring);
3516}
3517
3518static void
3519mac_ring_free(mac_impl_t *mip, mac_ring_t *ring)
3520{
3521	ASSERT(ring->mr_state == MR_FREE);
3522
3523	mutex_enter(&mip->mi_ring_lock);
3524	ring->mr_state = MR_FREE;
3525	ring->mr_flag = 0;
3526	ring->mr_next = mip->mi_ring_freelist;
3527	ring->mr_mip = NULL;
3528	mip->mi_ring_freelist = ring;
3529	mac_ring_stat_delete(ring);
3530	mutex_exit(&mip->mi_ring_lock);
3531}
3532
3533static void
3534mac_ring_freeall(mac_impl_t *mip)
3535{
3536	mac_ring_t *ring_next;
3537	mutex_enter(&mip->mi_ring_lock);
3538	mac_ring_t *ring = mip->mi_ring_freelist;
3539	while (ring != NULL) {
3540		ring_next = ring->mr_next;
3541		kmem_cache_free(mac_ring_cache, ring);
3542		ring = ring_next;
3543	}
3544	mip->mi_ring_freelist = NULL;
3545	mutex_exit(&mip->mi_ring_lock);
3546}
3547
3548int
3549mac_start_ring(mac_ring_t *ring)
3550{
3551	int rv = 0;
3552
3553	ASSERT(ring->mr_state == MR_FREE);
3554
3555	if (ring->mr_start != NULL) {
3556		rv = ring->mr_start(ring->mr_driver, ring->mr_gen_num);
3557		if (rv != 0)
3558			return (rv);
3559	}
3560
3561	ring->mr_state = MR_INUSE;
3562	return (rv);
3563}
3564
3565void
3566mac_stop_ring(mac_ring_t *ring)
3567{
3568	ASSERT(ring->mr_state == MR_INUSE);
3569
3570	if (ring->mr_stop != NULL)
3571		ring->mr_stop(ring->mr_driver);
3572
3573	ring->mr_state = MR_FREE;
3574
3575	/*
3576	 * Increment the ring generation number for this ring.
3577	 */
3578	ring->mr_gen_num++;
3579}
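/*
 * A sketch of the generation count mechanism mentioned earlier: the
 * driver is handed mr_gen_num through its mr_start entry point and
 * tags its subsequent receive upcalls with that value. Once the ring
 * is stopped, mr_gen_num no longer matches, so a stale upcall from a
 * rogue driver is detected and its packets are dropped:
 *
 *	mac_start_ring()	mr_start(driver, mr_gen_num)
 *	...			driver upcalls carry mr_gen_num
 *	mac_stop_ring()		mr_gen_num++
 *	...			stale upcalls no longer match; dropped
 */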
3580
3581int
3582mac_start_group(mac_group_t *group)
3583{
3584	int rv = 0;
3585
3586	if (group->mrg_start != NULL)
3587		rv = group->mrg_start(group->mrg_driver);
3588
3589	return (rv);
3590}
3591
3592void
3593mac_stop_group(mac_group_t *group)
3594{
3595	if (group->mrg_stop != NULL)
3596		group->mrg_stop(group->mrg_driver);
3597}
3598
3599/*
3600 * Called from mac_start() on the default Rx group. Broadcast and multicast
3601 * packets are received only on the default group. Hence the default group
3602 * needs to be up even if the primary client is not up, for the other groups
3603 * to be functional. We do this by calling this function at mac_start time
3604 * itself. However the broadcast packets that are received can't make their
3605 * way beyond mac_rx until a mac client creates a broadcast flow.
3606 */
3607static int
3608mac_start_group_and_rings(mac_group_t *group)
3609{
3610	mac_ring_t	*ring;
3611	int		rv = 0;
3612
3613	ASSERT(group->mrg_state == MAC_GROUP_STATE_REGISTERED);
3614	if ((rv = mac_start_group(group)) != 0)
3615		return (rv);
3616
3617	for (ring = group->mrg_rings; ring != NULL; ring = ring->mr_next) {
3618		ASSERT(ring->mr_state == MR_FREE);
3619		if ((rv = mac_start_ring(ring)) != 0)
3620			goto error;
3621		ring->mr_classify_type = MAC_SW_CLASSIFIER;
3622	}
3623	return (0);
3624
3625error:
3626	mac_stop_group_and_rings(group);
3627	return (rv);
3628}
3629
3630/* Called from mac_stop on the default Rx group */
3631static void
3632mac_stop_group_and_rings(mac_group_t *group)
3633{
3634	mac_ring_t	*ring;
3635
3636	for (ring = group->mrg_rings; ring != NULL; ring = ring->mr_next) {
3637		if (ring->mr_state != MR_FREE) {
3638			mac_stop_ring(ring);
3639			ring->mr_flag = 0;
3640			ring->mr_classify_type = MAC_NO_CLASSIFIER;
3641		}
3642	}
3643	mac_stop_group(group);
3644}
3645
3646
3647static mac_ring_t *
3648mac_init_ring(mac_impl_t *mip, mac_group_t *group, int index,
3649    mac_capab_rings_t *cap_rings)
3650{
3651	mac_ring_t *ring, *rnext;
3652	mac_ring_info_t ring_info;
3653	ddi_intr_handle_t ddi_handle;
3654
3655	ring = mac_ring_alloc(mip);
3656
3657	/* Prepare basic information of ring */
3658
3659	/*
3660	 * Ring index is numbered to be unique across a particular device.
3661	 * Ring index computation makes following assumptions:
3662	 *	- For drivers with static grouping (e.g. ixgbe, bge), the
3663	 *	ring index exchanged with the driver (e.g. during mr_rget)
3664	 *	is unique only within the group the ring belongs to.
3665	 *	- Drivers with dynamic grouping (e.g. nxge) start
3666	 *	with a single group (mrg_index = 0).
3667	 */
3668	ring->mr_index = group->mrg_index * group->mrg_info.mgi_count + index;
3669	ring->mr_type = group->mrg_type;
3670	ring->mr_gh = (mac_group_handle_t)group;
3671
3672	/* Insert the new ring to the list. */
3673	ring->mr_next = group->mrg_rings;
3674	group->mrg_rings = ring;
3675
3676	/* Zero to reuse the info data structure */
3677	bzero(&ring_info, sizeof (ring_info));
3678
3679	/* Query ring information from driver */
3680	cap_rings->mr_rget(mip->mi_driver, group->mrg_type, group->mrg_index,
3681	    index, &ring_info, (mac_ring_handle_t)ring);
3682
3683	ring->mr_info = ring_info;
3684
3685	/*
3686	 * The interrupt handle could be shared among multiple rings.
3687	 * Thus if there is a bunch of rings that are sharing an
3688	 * interrupt, then only one ring among the bunch will be made
3689	 * available for interrupt re-targeting; the rest will have
3690	 * the ddi_shared flag set to TRUE and will not be available
3691	 * for interrupt re-targeting.
3692	 */
3693	if ((ddi_handle = ring_info.mri_intr.mi_ddi_handle) != NULL) {
3694		rnext = ring->mr_next;
3695		while (rnext != NULL) {
3696			if (rnext->mr_info.mri_intr.mi_ddi_handle ==
3697			    ddi_handle) {
3698				/*
3699				 * If default ring (mr_index == 0) is part
3700				 * of a group of rings sharing an
3701				 * interrupt, then set ddi_shared flag for
3702				 * the default ring and give another ring
3703				 * the chance to be re-targeted.
3704				 */
3705				if (rnext->mr_index == 0 &&
3706				    !rnext->mr_info.mri_intr.mi_ddi_shared) {
3707					rnext->mr_info.mri_intr.mi_ddi_shared =
3708					    B_TRUE;
3709				} else {
3710					ring->mr_info.mri_intr.mi_ddi_shared =
3711					    B_TRUE;
3712				}
3713				break;
3714			}
3715			rnext = rnext->mr_next;
3716		}
3717		/*
3718		 * If rnext is NULL, then no matching ddi_handle was found.
3719		 * Rx rings get registered first. So if this is a Tx ring,
3720		 * then go through all the Rx rings and see if there is a
3721		 * matching ddi handle.
3722		 */
3723		if (rnext == NULL && ring->mr_type == MAC_RING_TYPE_TX) {
3724			mac_compare_ddi_handle(mip->mi_rx_groups,
3725			    mip->mi_rx_group_count, ring);
3726		}
3727	}
3728
3729	/* Update ring's status */
3730	ring->mr_state = MR_FREE;
3731	ring->mr_flag = 0;
3732
3733	/* Update the ring count of the group */
3734	group->mrg_cur_count++;
3735
3736	/* Create per ring kstats */
3737	if (ring->mr_stat != NULL) {
3738		ring->mr_mip = mip;
3739		mac_ring_stat_create(ring);
3740	}
3741
3742	return (ring);
3743}
3744
3745/*
3746 * Rings are chained together for easy regrouping.
3747 */
3748static void
3749mac_init_group(mac_impl_t *mip, mac_group_t *group, int size,
3750    mac_capab_rings_t *cap_rings)
3751{
3752	int index;
3753
3754	/*
3755	 * Initialize all ring members of this group. Size of zero will not
3756	 * enter the loop, so it's safe for initializing an empty group.
3757	 */
3758	for (index = size - 1; index >= 0; index--)
3759		(void) mac_init_ring(mip, group, index, cap_rings);
3760}
3761
3762int
3763mac_init_rings(mac_impl_t *mip, mac_ring_type_t rtype)
3764{
3765	mac_capab_rings_t	*cap_rings;
3766	mac_group_t		*group;
3767	mac_group_t		*groups;
3768	mac_group_info_t	group_info;
3769	uint_t			group_free = 0;
3770	uint_t			ring_left;
3771	mac_ring_t		*ring;
3772	int			g;
3773	int			err = 0;
3774	uint_t			grpcnt;
3775	boolean_t		pseudo_txgrp = B_FALSE;
3776
3777	switch (rtype) {
3778	case MAC_RING_TYPE_RX:
3779		ASSERT(mip->mi_rx_groups == NULL);
3780
3781		cap_rings = &mip->mi_rx_rings_cap;
3782		cap_rings->mr_type = MAC_RING_TYPE_RX;
3783		break;
3784	case MAC_RING_TYPE_TX:
3785		ASSERT(mip->mi_tx_groups == NULL);
3786
3787		cap_rings = &mip->mi_tx_rings_cap;
3788		cap_rings->mr_type = MAC_RING_TYPE_TX;
3789		break;
3790	default:
3791		ASSERT(B_FALSE);
3792	}
3793
3794	if (!i_mac_capab_get((mac_handle_t)mip, MAC_CAPAB_RINGS, cap_rings))
3795		return (0);
3796	grpcnt = cap_rings->mr_gnum;
3797
3798	/*
3799	 * If we have multiple TX rings, but only one TX group, we can
3800	 * create pseudo TX groups (one per TX ring) in the MAC layer,
3801	 * except for an aggr. For an aggr currently we maintain only
3802	 * one group with all the rings (for all its ports), going
3803	 * forwards we might change this.
3804	 */
3805	if (rtype == MAC_RING_TYPE_TX &&
	    cap_rings->mr_gnum == 0 && cap_rings->mr_rnum > 0 &&
3807	    (mip->mi_state_flags & MIS_IS_AGGR) == 0) {
3808		/*
3809		 * The -1 here is because we create a default TX group
3810		 * with all the rings in it.
3811		 */
3812		grpcnt = cap_rings->mr_rnum - 1;
3813		pseudo_txgrp = B_TRUE;
3814	}
3815
3816	/*
3817	 * Allocate a contiguous buffer for all groups.
3818	 */
	groups = kmem_zalloc(sizeof (mac_group_t) * (grpcnt + 1), KM_SLEEP);
3820
3821	ring_left = cap_rings->mr_rnum;
3822
3823	/*
3824	 * Get all ring groups if any, and get their ring members
3825	 * if any.
3826	 */
3827	for (g = 0; g < grpcnt; g++) {
3828		group = groups + g;
3829
3830		/* Prepare basic information of the group */
3831		group->mrg_index = g;
3832		group->mrg_type = rtype;
3833		group->mrg_state = MAC_GROUP_STATE_UNINIT;
3834		group->mrg_mh = (mac_handle_t)mip;
3835		group->mrg_next = group + 1;
3836
3837		/* Zero to reuse the info data structure */
3838		bzero(&group_info, sizeof (group_info));
3839
3840		if (pseudo_txgrp) {
3841			/*
3842			 * This is a pseudo group that we created, apart
3843			 * from setting the state there is nothing to be
3844			 * done.
3845			 */
3846			group->mrg_state = MAC_GROUP_STATE_REGISTERED;
3847			group_free++;
3848			continue;
3849		}
3850		/* Query group information from driver */
3851		cap_rings->mr_gget(mip->mi_driver, rtype, g, &group_info,
3852		    (mac_group_handle_t)group);
3853
3854		switch (cap_rings->mr_group_type) {
3855		case MAC_GROUP_TYPE_DYNAMIC:
3856			if (cap_rings->mr_gaddring == NULL ||
3857			    cap_rings->mr_gremring == NULL) {
3858				DTRACE_PROBE3(
3859				    mac__init__rings_no_addremring,
3860				    char *, mip->mi_name,
3861				    mac_group_add_ring_t,
3862				    cap_rings->mr_gaddring,
3863				    mac_group_add_ring_t,
3864				    cap_rings->mr_gremring);
3865				err = EINVAL;
3866				goto bail;
3867			}
3868
3869			switch (rtype) {
3870			case MAC_RING_TYPE_RX:
3871				/*
3872				 * The first RX group must have non-zero
3873				 * rings, and the following groups must
3874				 * have zero rings.
3875				 */
3876				if (g == 0 && group_info.mgi_count == 0) {
3877					DTRACE_PROBE1(
3878					    mac__init__rings__rx__def__zero,
3879					    char *, mip->mi_name);
3880					err = EINVAL;
3881					goto bail;
3882				}
3883				if (g > 0 && group_info.mgi_count != 0) {
3884					DTRACE_PROBE3(
3885					    mac__init__rings__rx__nonzero,
3886					    char *, mip->mi_name,
3887					    int, g, int, group_info.mgi_count);
3888					err = EINVAL;
3889					goto bail;
3890				}
3891				break;
3892			case MAC_RING_TYPE_TX:
3893				/*
3894				 * All TX ring groups must have zero rings.
3895				 */
3896				if (group_info.mgi_count != 0) {
3897					DTRACE_PROBE3(
3898					    mac__init__rings__tx__nonzero,
3899					    char *, mip->mi_name,
3900					    int, g, int, group_info.mgi_count);
3901					err = EINVAL;
3902					goto bail;
3903				}
3904				break;
3905			}
3906			break;
3907		case MAC_GROUP_TYPE_STATIC:
3908			/*
3909			 * Note that an empty group is allowed, e.g., an aggr
3910			 * would start with an empty group.
3911			 */
3912			break;
3913		default:
3914			/* unknown group type */
3915			DTRACE_PROBE2(mac__init__rings__unknown__type,
3916			    char *, mip->mi_name,
3917			    int, cap_rings->mr_group_type);
3918			err = EINVAL;
3919			goto bail;
3920		}
3921
3922
3923		/*
3924		 * Driver must register group->mgi_addmac/remmac() for rx groups
3925		 * to support multiple MAC addresses.
3926		 */
3927		if (rtype == MAC_RING_TYPE_RX) {
3928			if ((group_info.mgi_addmac == NULL) ||
3929			    (group_info.mgi_addmac == NULL)) {
3930				goto bail;
3931			}
3932		}
3933
3934		/* Cache driver-supplied information */
3935		group->mrg_info = group_info;
3936
3937		/* Update the group's status and group count. */
3938		mac_set_group_state(group, MAC_GROUP_STATE_REGISTERED);
3939		group_free++;
3940
3941		group->mrg_rings = NULL;
3942		group->mrg_cur_count = 0;
3943		mac_init_group(mip, group, group_info.mgi_count, cap_rings);
3944		ring_left -= group_info.mgi_count;
3945
		/* The current group size should be equal to the default value */
3947		ASSERT(group->mrg_cur_count == group_info.mgi_count);
3948	}
3949
3950	/* Build up a dummy group for free resources as a pool */
3951	group = groups + grpcnt;
3952
3953	/* Prepare basic information of the group */
3954	group->mrg_index = -1;
3955	group->mrg_type = rtype;
3956	group->mrg_state = MAC_GROUP_STATE_UNINIT;
3957	group->mrg_mh = (mac_handle_t)mip;
3958	group->mrg_next = NULL;
3959
3960	/*
3961	 * If there are ungrouped rings, allocate a continuous buffer for
3962	 * remaining resources.
3963	 */
3964	if (ring_left != 0) {
3965		group->mrg_rings = NULL;
3966		group->mrg_cur_count = 0;
3967		mac_init_group(mip, group, ring_left, cap_rings);
3968
3969		/* The current group size should be equal to ring_left */
3970		ASSERT(group->mrg_cur_count == ring_left);
3971
3972		ring_left = 0;
3973
3974		/* Update this group's status */
3975		mac_set_group_state(group, MAC_GROUP_STATE_REGISTERED);
3976	} else
3977		group->mrg_rings = NULL;
3978
3979	ASSERT(ring_left == 0);
3980
3981bail:
3982
3983	/* Cache other important information to finalize the initialization */
3984	switch (rtype) {
3985	case MAC_RING_TYPE_RX:
3986		mip->mi_rx_group_type = cap_rings->mr_group_type;
3987		mip->mi_rx_group_count = cap_rings->mr_gnum;
3988		mip->mi_rx_groups = groups;
3989		mip->mi_rx_donor_grp = groups;
3990		if (mip->mi_rx_group_type == MAC_GROUP_TYPE_DYNAMIC) {
3991			/*
3992			 * The default ring is reserved since it is
3993			 * used for sending the broadcast etc. packets.
3994			 */
3995			mip->mi_rxrings_avail =
3996			    mip->mi_rx_groups->mrg_cur_count - 1;
3997			mip->mi_rxrings_rsvd = 1;
3998		}
3999		/*
4000		 * The default group cannot be reserved. It is used by
4001		 * all the clients that do not have an exclusive group.
4002		 */
4003		mip->mi_rxhwclnt_avail = mip->mi_rx_group_count - 1;
4004		mip->mi_rxhwclnt_used = 1;
4005		break;
4006	case MAC_RING_TYPE_TX:
4007		mip->mi_tx_group_type = pseudo_txgrp ? MAC_GROUP_TYPE_DYNAMIC :
4008		    cap_rings->mr_group_type;
4009		mip->mi_tx_group_count = grpcnt;
4010		mip->mi_tx_group_free = group_free;
4011		mip->mi_tx_groups = groups;
4012
4013		group = groups + grpcnt;
4014		ring = group->mrg_rings;
4015		/*
4016		 * The ring can be NULL in the case of aggr. Aggr will
4017		 * have an empty Tx group which will get populated
4018		 * later when pseudo Tx rings are added after
4019		 * mac_register() is done.
4020		 */
4021		if (ring == NULL) {
4022			ASSERT(mip->mi_state_flags & MIS_IS_AGGR);
4023			/*
4024			 * pass the group to aggr so it can add Tx
4025			 * rings to the group later.
4026			 */
4027			cap_rings->mr_gget(mip->mi_driver, rtype, 0, NULL,
4028			    (mac_group_handle_t)group);
4029			/*
4030			 * Even though there are no rings at this time
4031			 * (rings will come later), set the group
4032			 * state to registered.
4033			 */
4034			group->mrg_state = MAC_GROUP_STATE_REGISTERED;
4035		} else {
4036			/*
4037			 * Ring 0 is used as the default one and it could be
4038			 * assigned to a client as well.
4039			 */
4040			while ((ring->mr_index != 0) && (ring->mr_next != NULL))
4041				ring = ring->mr_next;
4042			ASSERT(ring->mr_index == 0);
4043			mip->mi_default_tx_ring = (mac_ring_handle_t)ring;
4044		}
		if (mip->mi_tx_group_type == MAC_GROUP_TYPE_DYNAMIC) {
			mip->mi_txrings_avail = group->mrg_cur_count - 1;
			/*
			 * The default ring cannot be reserved.
			 */
			mip->mi_txrings_rsvd = 1;
		}
4051		/*
4052		 * The default group cannot be reserved. It will be shared
4053		 * by clients that do not have an exclusive group.
4054		 */
4055		mip->mi_txhwclnt_avail = mip->mi_tx_group_count;
4056		mip->mi_txhwclnt_used = 1;
4057		break;
4058	default:
4059		ASSERT(B_FALSE);
4060	}
4061
4062	if (err != 0)
4063		mac_free_rings(mip, rtype);
4064
4065	return (err);
4066}
4067
4068/*
 * The ddi interrupt handle could be shared among rings. If so, compare
4070 * the new ring's ddi handle with the existing ones and set ddi_shared
4071 * flag.
4072 */
4073void
4074mac_compare_ddi_handle(mac_group_t *groups, uint_t grpcnt, mac_ring_t *cring)
4075{
4076	mac_group_t *group;
4077	mac_ring_t *ring;
4078	ddi_intr_handle_t ddi_handle;
4079	int g;
4080
4081	ddi_handle = cring->mr_info.mri_intr.mi_ddi_handle;
4082	for (g = 0; g < grpcnt; g++) {
4083		group = groups + g;
4084		for (ring = group->mrg_rings; ring != NULL;
4085		    ring = ring->mr_next) {
4086			if (ring == cring)
4087				continue;
4088			if (ring->mr_info.mri_intr.mi_ddi_handle ==
4089			    ddi_handle) {
4090				if (cring->mr_type == MAC_RING_TYPE_RX &&
4091				    ring->mr_index == 0 &&
4092				    !ring->mr_info.mri_intr.mi_ddi_shared) {
4093					ring->mr_info.mri_intr.mi_ddi_shared =
4094					    B_TRUE;
4095				} else {
4096					cring->mr_info.mri_intr.mi_ddi_shared =
4097					    B_TRUE;
4098				}
4099				return;
4100			}
4101		}
4102	}
4103}
4104
4105/*
 * Called to free all groups of a particular type (RX or TX). It's assumed that
4107 * no clients are using these groups.
4108 */
4109void
4110mac_free_rings(mac_impl_t *mip, mac_ring_type_t rtype)
4111{
4112	mac_group_t *group, *groups;
4113	uint_t group_count;
4114
4115	switch (rtype) {
4116	case MAC_RING_TYPE_RX:
4117		if (mip->mi_rx_groups == NULL)
4118			return;
4119
4120		groups = mip->mi_rx_groups;
4121		group_count = mip->mi_rx_group_count;
4122
4123		mip->mi_rx_groups = NULL;
4124		mip->mi_rx_donor_grp = NULL;
4125		mip->mi_rx_group_count = 0;
4126		break;
4127	case MAC_RING_TYPE_TX:
4128		ASSERT(mip->mi_tx_group_count == mip->mi_tx_group_free);
4129
4130		if (mip->mi_tx_groups == NULL)
4131			return;
4132
4133		groups = mip->mi_tx_groups;
4134		group_count = mip->mi_tx_group_count;
4135
4136		mip->mi_tx_groups = NULL;
4137		mip->mi_tx_group_count = 0;
4138		mip->mi_tx_group_free = 0;
4139		mip->mi_default_tx_ring = NULL;
4140		break;
4141	default:
4142		ASSERT(B_FALSE);
4143	}
4144
4145	for (group = groups; group != NULL; group = group->mrg_next) {
4146		mac_ring_t *ring;
4147
4148		if (group->mrg_cur_count == 0)
4149			continue;
4150
4151		ASSERT(group->mrg_rings != NULL);
4152
4153		while ((ring = group->mrg_rings) != NULL) {
4154			group->mrg_rings = ring->mr_next;
4155			mac_ring_free(mip, ring);
4156		}
4157	}
4158
4159	/* Free all the cached rings */
4160	mac_ring_freeall(mip);
	/* Free the block of group data structures */
4162	kmem_free(groups, sizeof (mac_group_t) * (group_count + 1));
4163}
4164
4165/*
4166 * Associate a MAC address with a receive group.
4167 *
 * The return value of this function should always be checked properly, because
 * any type of failure could cause unexpected results. A MAC address can be
 * added to or removed from a group only after the group has been reserved.
 * Ideally, a successful reservation always leads to calling mac_group_addmac()
 * to steer desired traffic. Failure to add a unicast MAC address doesn't
 * always imply that the group is functioning abnormally.
4174 *
4175 * Currently this function is called everywhere, and it reflects assumptions
4176 * about MAC addresses in the implementation. CR 6735196.
4177 */
4178int
4179mac_group_addmac(mac_group_t *group, const uint8_t *addr)
4180{
4181	ASSERT(group->mrg_type == MAC_RING_TYPE_RX);
4182	ASSERT(group->mrg_info.mgi_addmac != NULL);
4183
4184	return (group->mrg_info.mgi_addmac(group->mrg_info.mgi_driver, addr));
4185}
4186
4187/*
4188 * Remove the association between MAC address and receive group.
4189 */
4190int
4191mac_group_remmac(mac_group_t *group, const uint8_t *addr)
4192{
4193	ASSERT(group->mrg_type == MAC_RING_TYPE_RX);
4194	ASSERT(group->mrg_info.mgi_remmac != NULL);
4195
4196	return (group->mrg_info.mgi_remmac(group->mrg_info.mgi_driver, addr));
4197}
4198
4199/*
4200 * This is the entry point for packets transmitted through the bridging code.
 * If no bridge is in place, MAC_RING_TX transmits via the given tx ring. The
 * 'rh' pointer may be NULL to select the default ring.
4203 */
4204mblk_t *
4205mac_bridge_tx(mac_impl_t *mip, mac_ring_handle_t rh, mblk_t *mp)
4206{
4207	mac_handle_t mh;
4208
4209	/*
4210	 * Once we take a reference on the bridge link, the bridge
4211	 * module itself can't unload, so the callback pointers are
4212	 * stable.
4213	 */
4214	mutex_enter(&mip->mi_bridge_lock);
4215	if ((mh = mip->mi_bridge_link) != NULL)
4216		mac_bridge_ref_cb(mh, B_TRUE);
4217	mutex_exit(&mip->mi_bridge_lock);
4218	if (mh == NULL) {
4219		MAC_RING_TX(mip, rh, mp, mp);
4220	} else {
4221		mp = mac_bridge_tx_cb(mh, rh, mp);
4222		mac_bridge_ref_cb(mh, B_FALSE);
4223	}
4224
4225	return (mp);
4226}
4227
4228/*
4229 * Find a ring from its index.
4230 */
4231mac_ring_handle_t
4232mac_find_ring(mac_group_handle_t gh, int index)
4233{
4234	mac_group_t *group = (mac_group_t *)gh;
	mac_ring_t *ring;
4236
4237	for (ring = group->mrg_rings; ring != NULL; ring = ring->mr_next)
4238		if (ring->mr_index == index)
4239			break;
4240
4241	return ((mac_ring_handle_t)ring);
4242}
4243/*
4244 * Add a ring to an existing group.
4245 *
4246 * The ring must be either passed directly (for example if the ring
4247 * movement is initiated by the framework), or specified through a driver
4248 * index (for example when the ring is added by the driver.
4249 *
4250 * The caller needs to call mac_perim_enter() before calling this function.
4251 */
4252int
4253i_mac_group_add_ring(mac_group_t *group, mac_ring_t *ring, int index)
4254{
4255	mac_impl_t *mip = (mac_impl_t *)group->mrg_mh;
4256	mac_capab_rings_t *cap_rings;
4257	boolean_t driver_call = (ring == NULL);
4258	mac_group_type_t group_type;
4259	int ret = 0;
4260	flow_entry_t *flent;
4261
4262	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
4263
4264	switch (group->mrg_type) {
4265	case MAC_RING_TYPE_RX:
4266		cap_rings = &mip->mi_rx_rings_cap;
4267		group_type = mip->mi_rx_group_type;
4268		break;
4269	case MAC_RING_TYPE_TX:
4270		cap_rings = &mip->mi_tx_rings_cap;
4271		group_type = mip->mi_tx_group_type;
4272		break;
4273	default:
4274		ASSERT(B_FALSE);
4275	}
4276
4277	/*
4278	 * There should be no ring with the same ring index in the target
4279	 * group.
4280	 */
4281	ASSERT(mac_find_ring((mac_group_handle_t)group,
4282	    driver_call ? index : ring->mr_index) == NULL);
4283
4284	if (driver_call) {
4285		/*
4286		 * The function is called as a result of a request from
4287		 * a driver to add a ring to an existing group, for example
4288		 * from the aggregation driver. Allocate a new mac_ring_t
4289		 * for that ring.
4290		 */
4291		ring = mac_init_ring(mip, group, index, cap_rings);
4292		ASSERT(group->mrg_state > MAC_GROUP_STATE_UNINIT);
4293	} else {
4294		/*
4295		 * The function is called as a result of a MAC layer request
4296		 * to add a ring to an existing group. In this case the
4297		 * ring is being moved between groups, which requires
4298		 * the underlying driver to support dynamic grouping,
4299		 * and the mac_ring_t already exists.
4300		 */
4301		ASSERT(group_type == MAC_GROUP_TYPE_DYNAMIC);
4302		ASSERT(group->mrg_driver == NULL ||
4303		    cap_rings->mr_gaddring != NULL);
4304		ASSERT(ring->mr_gh == NULL);
4305	}
4306
4307	/*
4308	 * At this point the ring should not be in use, and it should be
	 * of the right type for the target group.
4310	 */
4311	ASSERT(ring->mr_state < MR_INUSE);
4312	ASSERT(ring->mr_srs == NULL);
4313	ASSERT(ring->mr_type == group->mrg_type);
4314
4315	if (!driver_call) {
4316		/*
		 * Add the driver level hardware ring if the process was not
		 * initiated by the driver, and the target group is not the
		 * default group.
4320		 */
4321		if (group->mrg_driver != NULL) {
4322			cap_rings->mr_gaddring(group->mrg_driver,
4323			    ring->mr_driver, ring->mr_type);
4324		}
4325
4326		/*
		 * Insert the ring ahead of the existing rings.
4328		 */
4329		ring->mr_next = group->mrg_rings;
4330		group->mrg_rings = ring;
4331		ring->mr_gh = (mac_group_handle_t)group;
4332		group->mrg_cur_count++;
4333	}
4334
4335	/*
4336	 * If the group has not been actively used, we're done.
4337	 */
4338	if (group->mrg_index != -1 &&
4339	    group->mrg_state < MAC_GROUP_STATE_RESERVED)
4340		return (0);
4341
4342	/*
	 * Start the ring if needed. On failure, undo the grouping action.
4344	 */
4345	if (ring->mr_state != MR_INUSE) {
4346		if ((ret = mac_start_ring(ring)) != 0) {
4347			if (!driver_call) {
4348				cap_rings->mr_gremring(group->mrg_driver,
4349				    ring->mr_driver, ring->mr_type);
4350			}
4351			group->mrg_cur_count--;
4352			group->mrg_rings = ring->mr_next;
4353
4354			ring->mr_gh = NULL;
4355
4356			if (driver_call)
4357				mac_ring_free(mip, ring);
4358
4359			return (ret);
4360		}
4361	}
4362
4363	/*
4364	 * Set up SRS/SR according to the ring type.
4365	 */
4366	switch (ring->mr_type) {
4367	case MAC_RING_TYPE_RX:
4368		/*
		 * Set up an SRS on top of the new ring if the group is
		 * reserved for someone's exclusive use.
4371		 */
4372		if (group->mrg_state == MAC_GROUP_STATE_RESERVED) {
4373			mac_client_impl_t *mcip;
4374
4375			mcip = MAC_GROUP_ONLY_CLIENT(group);
4376			/*
			 * Even though this group is reserved we might still
			 * have multiple clients, e.g. a VLAN shares the
4379			 * group with the primary mac client.
4380			 */
4381			if (mcip != NULL) {
4382				flent = mcip->mci_flent;
4383				ASSERT(flent->fe_rx_srs_cnt > 0);
4384				mac_rx_srs_group_setup(mcip, flent, SRST_LINK);
4385				mac_fanout_setup(mcip, flent,
4386				    MCIP_RESOURCE_PROPS(mcip), mac_rx_deliver,
4387				    mcip, NULL, NULL);
4388			} else {
4389				ring->mr_classify_type = MAC_SW_CLASSIFIER;
4390			}
4391		}
4392		break;
4393	case MAC_RING_TYPE_TX:
4394	{
4395		mac_grp_client_t	*mgcp = group->mrg_clients;
4396		mac_client_impl_t	*mcip;
4397		mac_soft_ring_set_t	*mac_srs;
4398		mac_srs_tx_t		*tx;
4399
4400		if (MAC_GROUP_NO_CLIENT(group)) {
4401			if (ring->mr_state == MR_INUSE)
4402				mac_stop_ring(ring);
4403			ring->mr_flag = 0;
4404			break;
4405		}
4406		/*
4407		 * If the rings are being moved to a group that has
4408		 * clients using it, then add the new rings to the
4409		 * clients SRS.
4410		 */
4411		while (mgcp != NULL) {
4412			boolean_t	is_aggr;
4413
4414			mcip = mgcp->mgc_client;
4415			flent = mcip->mci_flent;
4416			is_aggr = (mcip->mci_state_flags & MCIS_IS_AGGR);
4417			mac_srs = MCIP_TX_SRS(mcip);
4418			tx = &mac_srs->srs_tx;
4419			mac_tx_client_quiesce((mac_client_handle_t)mcip);
4420			/*
4421			 * If we are  growing from 1 to multiple rings.
4422			 */
4423			if (tx->st_mode == SRS_TX_BW ||
4424			    tx->st_mode == SRS_TX_SERIALIZE ||
4425			    tx->st_mode == SRS_TX_DEFAULT) {
4426				mac_ring_t	*tx_ring = tx->st_arg2;
4427
4428				tx->st_arg2 = NULL;
4429				mac_tx_srs_stat_recreate(mac_srs, B_TRUE);
4430				mac_tx_srs_add_ring(mac_srs, tx_ring);
4431				if (mac_srs->srs_type & SRST_BW_CONTROL) {
4432					tx->st_mode = is_aggr ? SRS_TX_BW_AGGR :
4433					    SRS_TX_BW_FANOUT;
4434				} else {
4435					tx->st_mode = is_aggr ? SRS_TX_AGGR :
4436					    SRS_TX_FANOUT;
4437				}
4438				tx->st_func = mac_tx_get_func(tx->st_mode);
4439			}
4440			mac_tx_srs_add_ring(mac_srs, ring);
4441			mac_fanout_setup(mcip, flent, MCIP_RESOURCE_PROPS(mcip),
4442			    mac_rx_deliver, mcip, NULL, NULL);
4443			mac_tx_client_restart((mac_client_handle_t)mcip);
4444			mgcp = mgcp->mgc_next;
4445		}
4446		break;
4447	}
4448	default:
4449		ASSERT(B_FALSE);
4450	}
4451	/*
4452	 * For aggr, the default ring will be NULL to begin with. If it
4453	 * is NULL, then pick the first ring that gets added as the
4454	 * default ring. Any ring in an aggregation can be removed at
4455	 * any time (by the user action of removing a link) and if the
4456	 * current default ring gets removed, then a new one gets
4457	 * picked (see i_mac_group_rem_ring()).
4458	 */
4459	if (mip->mi_state_flags & MIS_IS_AGGR &&
4460	    mip->mi_default_tx_ring == NULL &&
4461	    ring->mr_type == MAC_RING_TYPE_TX) {
4462		mip->mi_default_tx_ring = (mac_ring_handle_t)ring;
4463	}
4464
4465	MAC_RING_UNMARK(ring, MR_INCIPIENT);
4466	return (0);
4467}
4468
4469/*
 * Remove a ring from its current group. MAC internal function for dynamic
4471 * grouping.
4472 *
4473 * The caller needs to call mac_perim_enter() before calling this function.
4474 */
4475void
4476i_mac_group_rem_ring(mac_group_t *group, mac_ring_t *ring,
4477    boolean_t driver_call)
4478{
4479	mac_impl_t *mip = (mac_impl_t *)group->mrg_mh;
4480	mac_capab_rings_t *cap_rings = NULL;
4481	mac_group_type_t group_type;
4482
4483	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
4484
4485	ASSERT(mac_find_ring((mac_group_handle_t)group,
4486	    ring->mr_index) == (mac_ring_handle_t)ring);
4487	ASSERT((mac_group_t *)ring->mr_gh == group);
4488	ASSERT(ring->mr_type == group->mrg_type);
4489
4490	if (ring->mr_state == MR_INUSE)
4491		mac_stop_ring(ring);
4492	switch (ring->mr_type) {
4493	case MAC_RING_TYPE_RX:
4494		group_type = mip->mi_rx_group_type;
4495		cap_rings = &mip->mi_rx_rings_cap;
4496
4497		/*
4498		 * Only hardware classified packets hold a reference to the
4499		 * ring all the way up the Rx path. mac_rx_srs_remove()
4500		 * will take care of quiescing the Rx path and removing the
4501		 * SRS. The software classified path neither holds a reference
4502		 * nor any association with the ring in mac_rx.
4503		 */
4504		if (ring->mr_srs != NULL) {
4505			mac_rx_srs_remove(ring->mr_srs);
4506			ring->mr_srs = NULL;
4507		}
4508
4509		break;
4510	case MAC_RING_TYPE_TX:
4511	{
4512		mac_grp_client_t	*mgcp;
4513		mac_client_impl_t	*mcip;
4514		mac_soft_ring_set_t	*mac_srs;
4515		mac_srs_tx_t		*tx;
4516		mac_ring_t		*rem_ring;
4517		mac_group_t		*defgrp;
4518		uint_t			ring_info = 0;
4519
4520		/*
4521		 * For TX this function is invoked in three
4522		 * cases:
4523		 *
4524		 * 1) In the case of a failure during the
4525		 * initial creation of a group when a share is
4526		 * associated with a MAC client. So the SRS is not
4527		 * yet setup, and will be setup later after the
4528		 * group has been reserved and populated.
4529		 *
4530		 * 2) From mac_release_tx_group() when freeing
4531		 * a TX SRS.
4532		 *
4533		 * 3) In the case of aggr, when a port gets removed,
4534		 * the pseudo Tx rings that it exposed gets removed.
4535		 *
4536		 * In the first two cases the SRS and its soft
4537		 * rings are already quiesced.
4538		 */
4539		if (driver_call) {
4540			mac_client_impl_t *mcip;
4541			mac_soft_ring_set_t *mac_srs;
4542			mac_soft_ring_t *sringp;
4543			mac_srs_tx_t *srs_tx;
4544
4545			if (mip->mi_state_flags & MIS_IS_AGGR &&
4546			    mip->mi_default_tx_ring ==
4547			    (mac_ring_handle_t)ring) {
4548				/* pick a new default Tx ring */
4549				mip->mi_default_tx_ring =
4550				    (group->mrg_rings != ring) ?
4551				    (mac_ring_handle_t)group->mrg_rings :
4552				    (mac_ring_handle_t)(ring->mr_next);
4553			}
			/* Presently only the aggr case comes here */
4555			if (group->mrg_state != MAC_GROUP_STATE_RESERVED)
4556				break;
4557
4558			mcip = MAC_GROUP_ONLY_CLIENT(group);
4559			ASSERT(mcip != NULL);
4560			ASSERT(mcip->mci_state_flags & MCIS_IS_AGGR);
4561			mac_srs = MCIP_TX_SRS(mcip);
4562			ASSERT(mac_srs->srs_tx.st_mode == SRS_TX_AGGR ||
4563			    mac_srs->srs_tx.st_mode == SRS_TX_BW_AGGR);
4564			srs_tx = &mac_srs->srs_tx;
4565			/*
4566			 * Wakeup any callers blocked on this
4567			 * Tx ring due to flow control.
4568			 */
4569			sringp = srs_tx->st_soft_rings[ring->mr_index];
4570			ASSERT(sringp != NULL);
4571			mac_tx_invoke_callbacks(mcip, (mac_tx_cookie_t)sringp);
4572			mac_tx_client_quiesce((mac_client_handle_t)mcip);
4573			mac_tx_srs_del_ring(mac_srs, ring);
4574			mac_tx_client_restart((mac_client_handle_t)mcip);
4575			break;
4576		}
4577		ASSERT(ring != (mac_ring_t *)mip->mi_default_tx_ring);
4578		group_type = mip->mi_tx_group_type;
4579		cap_rings = &mip->mi_tx_rings_cap;
4580		/*
4581		 * See if we need to take it out of the MAC clients using
4582		 * this group
4583		 */
4584		if (MAC_GROUP_NO_CLIENT(group))
4585			break;
4586		mgcp = group->mrg_clients;
4587		defgrp = MAC_DEFAULT_TX_GROUP(mip);
4588		while (mgcp != NULL) {
4589			mcip = mgcp->mgc_client;
4590			mac_srs = MCIP_TX_SRS(mcip);
4591			tx = &mac_srs->srs_tx;
4592			mac_tx_client_quiesce((mac_client_handle_t)mcip);
4593			/*
4594			 * If we are here when removing rings from the
4595			 * defgroup, mac_reserve_tx_ring would have
4596			 * already deleted the ring from the MAC
4597			 * clients in the group.
4598			 */
4599			if (group != defgrp) {
4600				mac_tx_invoke_callbacks(mcip,
4601				    (mac_tx_cookie_t)
4602				    mac_tx_srs_get_soft_ring(mac_srs, ring));
4603				mac_tx_srs_del_ring(mac_srs, ring);
4604			}
4605			/*
4606			 * Additionally, if  we are left with only
4607			 * one ring in the group after this, we need
4608			 * to modify the mode etc. to. (We haven't
4609			 * yet taken the ring out, so we check with 2).
4610			 */
4611			if (group->mrg_cur_count == 2) {
4612				if (ring->mr_next == NULL)
4613					rem_ring = group->mrg_rings;
4614				else
4615					rem_ring = ring->mr_next;
4616				mac_tx_invoke_callbacks(mcip,
4617				    (mac_tx_cookie_t)
4618				    mac_tx_srs_get_soft_ring(mac_srs,
4619				    rem_ring));
4620				mac_tx_srs_del_ring(mac_srs, rem_ring);
4621				if (rem_ring->mr_state != MR_INUSE) {
4622					(void) mac_start_ring(rem_ring);
4623				}
4624				tx->st_arg2 = (void *)rem_ring;
4625				mac_tx_srs_stat_recreate(mac_srs, B_FALSE);
4626				ring_info = mac_hwring_getinfo(
4627				    (mac_ring_handle_t)rem_ring);
4628				/*
4629				 * We are  shrinking from multiple
4630				 * to 1 ring.
4631				 */
4632				if (mac_srs->srs_type & SRST_BW_CONTROL) {
4633					tx->st_mode = SRS_TX_BW;
4634				} else if (mac_tx_serialize ||
4635				    (ring_info & MAC_RING_TX_SERIALIZE)) {
4636					tx->st_mode = SRS_TX_SERIALIZE;
4637				} else {
4638					tx->st_mode = SRS_TX_DEFAULT;
4639				}
4640				tx->st_func = mac_tx_get_func(tx->st_mode);
4641			}
4642			mac_tx_client_restart((mac_client_handle_t)mcip);
4643			mgcp = mgcp->mgc_next;
4644		}
4645		break;
4646	}
4647	default:
4648		ASSERT(B_FALSE);
4649	}
4650
4651	/*
4652	 * Remove the ring from the group.
4653	 */
4654	if (ring == group->mrg_rings)
4655		group->mrg_rings = ring->mr_next;
4656	else {
4657		mac_ring_t *pre;
4658
4659		pre = group->mrg_rings;
4660		while (pre->mr_next != ring)
4661			pre = pre->mr_next;
4662		pre->mr_next = ring->mr_next;
4663	}
4664	group->mrg_cur_count--;
4665
4666	if (!driver_call) {
4667		ASSERT(group_type == MAC_GROUP_TYPE_DYNAMIC);
4668		ASSERT(group->mrg_driver == NULL ||
4669		    cap_rings->mr_gremring != NULL);
4670
4671		/*
4672		 * Remove the driver level hardware ring.
4673		 */
4674		if (group->mrg_driver != NULL) {
4675			cap_rings->mr_gremring(group->mrg_driver,
4676			    ring->mr_driver, ring->mr_type);
4677		}
4678	}
4679
4680	ring->mr_gh = NULL;
4681	if (driver_call)
4682		mac_ring_free(mip, ring);
4683	else
4684		ring->mr_flag = 0;
4685}
4686
4687/*
4688 * Move a ring to the target group. If needed, remove the ring from the group
4689 * that it currently belongs to.
4690 *
 * The caller needs to enter MAC's perimeter by calling mac_perim_enter().
4692 */
4693static int
4694mac_group_mov_ring(mac_impl_t *mip, mac_group_t *d_group, mac_ring_t *ring)
4695{
4696	mac_group_t *s_group = (mac_group_t *)ring->mr_gh;
4697	int rv;
4698
4699	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
4700	ASSERT(d_group != NULL);
4701	ASSERT(s_group->mrg_mh == d_group->mrg_mh);
4702
4703	if (s_group == d_group)
4704		return (0);
4705
4706	/*
4707	 * Remove it from current group first.
4708	 */
4709	if (s_group != NULL)
4710		i_mac_group_rem_ring(s_group, ring, B_FALSE);
4711
4712	/*
4713	 * Add it to the new group.
4714	 */
4715	rv = i_mac_group_add_ring(d_group, ring, 0);
4716	if (rv != 0) {
4717		/*
4718		 * Failed to add ring back to source group. If
4719		 * that fails, the ring is stuck in limbo, log message.
4720		 */
4721		if (i_mac_group_add_ring(s_group, ring, 0)) {
4722			cmn_err(CE_WARN, "%s: failed to move ring %p\n",
4723			    mip->mi_name, (void *)ring);
4724		}
4725	}
4726
4727	return (rv);
4728}
4729
4730/*
4731 * Find a MAC address according to its value.
4732 */
4733mac_address_t *
4734mac_find_macaddr(mac_impl_t *mip, uint8_t *mac_addr)
4735{
4736	mac_address_t *map;
4737
4738	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
4739
4740	for (map = mip->mi_addresses; map != NULL; map = map->ma_next) {
4741		if (bcmp(mac_addr, map->ma_addr, map->ma_len) == 0)
4742			break;
4743	}
4744
4745	return (map);
4746}
4747
4748/*
4749 * Check whether the MAC address is shared by multiple clients.
4750 */
4751boolean_t
4752mac_check_macaddr_shared(mac_address_t *map)
4753{
4754	ASSERT(MAC_PERIM_HELD((mac_handle_t)map->ma_mip));
4755
4756	return (map->ma_nusers > 1);
4757}
4758
4759/*
4760 * Remove the specified MAC address from the MAC address list and free it.
4761 */
4762static void
4763mac_free_macaddr(mac_address_t *map)
4764{
4765	mac_impl_t *mip = map->ma_mip;
4766
4767	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
4768	ASSERT(mip->mi_addresses != NULL);
4769
4770	map = mac_find_macaddr(mip, map->ma_addr);
4771
4772	ASSERT(map != NULL);
4773	ASSERT(map->ma_nusers == 0);
4774
4775	if (map == mip->mi_addresses) {
4776		mip->mi_addresses = map->ma_next;
4777	} else {
4778		mac_address_t *pre;
4779
4780		pre = mip->mi_addresses;
4781		while (pre->ma_next != map)
4782			pre = pre->ma_next;
4783		pre->ma_next = map->ma_next;
4784	}
4785
4786	kmem_free(map, sizeof (mac_address_t));
4787}
4788
4789/*
4790 * Add a MAC address reference for a client. If the desired MAC address
4791 * exists, add a reference to it. Otherwise, add the new address by adding
 * it to a reserved group or setting promiscuous mode. Won't try a different
 * group if the given group is non-NULL, so the caller must explicitly share
 * the default group when needed.
 *
 * Note, the primary MAC address is initialized at registration time, so
 * adding it to the default group only requires activating it if its
 * reference count is still zero. Also, some drivers may not have advertised
 * the RINGS capability.
4800 */
4801int
4802mac_add_macaddr(mac_impl_t *mip, mac_group_t *group, uint8_t *mac_addr,
4803    boolean_t use_hw)
4804{
4805	mac_address_t *map;
4806	int err = 0;
4807	boolean_t allocated_map = B_FALSE;
4808
4809	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
4810
4811	map = mac_find_macaddr(mip, mac_addr);
4812
4813	/*
	 * If the new MAC address has not been added, allocate a new
	 * entry and set it up.
4816	 */
4817	if (map == NULL) {
4818		map = kmem_zalloc(sizeof (mac_address_t), KM_SLEEP);
4819		map->ma_len = mip->mi_type->mt_addr_length;
4820		bcopy(mac_addr, map->ma_addr, map->ma_len);
4821		map->ma_nusers = 0;
4822		map->ma_group = group;
4823		map->ma_mip = mip;
4824
4825		/* add the new MAC address to the head of the address list */
4826		map->ma_next = mip->mi_addresses;
4827		mip->mi_addresses = map;
4828
4829		allocated_map = B_TRUE;
4830	}
4831
4832	ASSERT(map->ma_group == NULL || map->ma_group == group);
4833	if (map->ma_group == NULL)
4834		map->ma_group = group;
4835
4836	/*
4837	 * If the MAC address is already in use, simply account for the
4838	 * new client.
4839	 */
4840	if (map->ma_nusers++ > 0)
4841		return (0);
4842
4843	/*
4844	 * Activate this MAC address by adding it to the reserved group.
4845	 */
4846	if (group != NULL) {
4847		err = mac_group_addmac(group, (const uint8_t *)mac_addr);
4848		if (err == 0) {
4849			map->ma_type = MAC_ADDRESS_TYPE_UNICAST_CLASSIFIED;
4850			return (0);
4851		}
4852	}
4853
4854	/*
4855	 * The MAC address addition failed. If the client requires a
4856	 * hardware classified MAC address, fail the operation.
4857	 */
4858	if (use_hw) {
4859		err = ENOSPC;
4860		goto bail;
4861	}
4862
4863	/*
4864	 * Try promiscuous mode.
4865	 *
4866	 * For drivers that don't advertise RINGS capability, do
4867	 * nothing for the primary address.
4868	 */
4869	if ((group == NULL) &&
4870	    (bcmp(map->ma_addr, mip->mi_addr, map->ma_len) == 0)) {
4871		map->ma_type = MAC_ADDRESS_TYPE_UNICAST_CLASSIFIED;
4872		return (0);
4873	}
4874
4875	/*
4876	 * Enable promiscuous mode in order to receive traffic
4877	 * to the new MAC address.
4878	 */
4879	if ((err = i_mac_promisc_set(mip, B_TRUE)) == 0) {
4880		map->ma_type = MAC_ADDRESS_TYPE_UNICAST_PROMISC;
4881		return (0);
4882	}
4883
4884	/*
4885	 * Free the MAC address that could not be added. Don't free
4886	 * a pre-existing address, it could have been the entry
4887	 * for the primary MAC address which was pre-allocated by
4888	 * mac_init_macaddr(), and which must remain on the list.
4889	 */
4890bail:
4891	map->ma_nusers--;
4892	if (allocated_map)
4893		mac_free_macaddr(map);
4894	return (err);
4895}
4896
4897/*
 * Remove a reference to a MAC address. This may cause the MAC address to be
 * removed from an associated group, or promiscuous mode to be turned off.
 * The caller needs to handle failures properly.
4901 */
4902int
4903mac_remove_macaddr(mac_address_t *map)
4904{
4905	mac_impl_t *mip = map->ma_mip;
4906	int err = 0;
4907
4908	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
4909
4910	ASSERT(map == mac_find_macaddr(mip, map->ma_addr));
4911
4912	/*
4913	 * If it's not the last client using this MAC address, only update
4914	 * the MAC clients count.
4915	 */
4916	if (--map->ma_nusers > 0)
4917		return (0);
4918
4919	/*
4920	 * The MAC address is no longer used by any MAC client, so remove
4921	 * it from its associated group, or turn off promiscuous mode
4922	 * if it was enabled for the MAC address.
4923	 */
4924	switch (map->ma_type) {
4925	case MAC_ADDRESS_TYPE_UNICAST_CLASSIFIED:
4926		/*
4927		 * Don't free the preset primary address for drivers that
4928		 * don't advertise RINGS capability.
4929		 */
4930		if (map->ma_group == NULL)
4931			return (0);
4932
4933		err = mac_group_remmac(map->ma_group, map->ma_addr);
4934		if (err == 0)
4935			map->ma_group = NULL;
4936		break;
4937	case MAC_ADDRESS_TYPE_UNICAST_PROMISC:
4938		err = i_mac_promisc_set(mip, B_FALSE);
4939		break;
4940	default:
4941		ASSERT(B_FALSE);
4942	}
4943
4944	if (err != 0)
4945		return (err);
4946
4947	/*
	 * We created the entry for the primary MAC address at registration,
	 * so we won't free it here. mac_fini_macaddr() will take care of it.
4950	 */
4951	if (bcmp(map->ma_addr, mip->mi_addr, map->ma_len) != 0)
4952		mac_free_macaddr(map);
4953
4954	return (0);
4955}
4956
4957/*
 * Update an existing MAC address. The caller needs to make sure that the new
4959 * value has not been used.
4960 */
4961int
4962mac_update_macaddr(mac_address_t *map, uint8_t *mac_addr)
4963{
4964	mac_impl_t *mip = map->ma_mip;
4965	int err = 0;
4966
4967	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
4968	ASSERT(mac_find_macaddr(mip, mac_addr) == NULL);
4969
4970	switch (map->ma_type) {
4971	case MAC_ADDRESS_TYPE_UNICAST_CLASSIFIED:
4972		/*
4973		 * Update the primary address for drivers that are not
4974		 * RINGS capable.
4975		 */
4976		if (mip->mi_rx_groups == NULL) {
4977			err = mip->mi_unicst(mip->mi_driver, (const uint8_t *)
4978			    mac_addr);
4979			if (err != 0)
4980				return (err);
4981			break;
4982		}
4983
4984		/*
4985		 * If this MAC address is not currently in use,
4986		 * simply break out and update the value.
4987		 */
4988		if (map->ma_nusers == 0)
4989			break;
4990
4991		/*
4992		 * Need to replace the MAC address associated with a group.
4993		 */
4994		err = mac_group_remmac(map->ma_group, map->ma_addr);
4995		if (err != 0)
4996			return (err);
4997
4998		err = mac_group_addmac(map->ma_group, mac_addr);
4999
5000		/*
		 * Failure hints at a hardware error. The MAC layer needs
		 * an error notification facility to handle this.
		 * For now, simply try to restore the value.
5004		 */
5005		if (err != 0)
5006			(void) mac_group_addmac(map->ma_group, map->ma_addr);
5007
5008		break;
5009	case MAC_ADDRESS_TYPE_UNICAST_PROMISC:
5010		/*
		 * Nothing more to do if in promiscuous mode.
5012		 */
5013		break;
5014	default:
5015		ASSERT(B_FALSE);
5016	}
5017
5018	/*
5019	 * Successfully replaced the MAC address.
5020	 */
5021	if (err == 0)
5022		bcopy(mac_addr, map->ma_addr, map->ma_len);
5023
5024	return (err);
5025}
5026
5027/*
 * Freshen the MAC address with a new value. The caller must have updated the
 * hardware MAC address before calling this function.
 * This function is meant to handle the MAC address change notification from
 * underlying drivers.
5032 */
5033void
5034mac_freshen_macaddr(mac_address_t *map, uint8_t *mac_addr)
5035{
5036	mac_impl_t *mip = map->ma_mip;
5037
5038	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
5039	ASSERT(mac_find_macaddr(mip, mac_addr) == NULL);
5040
5041	/*
5042	 * Freshen the MAC address with new value.
5043	 */
5044	bcopy(mac_addr, map->ma_addr, map->ma_len);
5045	bcopy(mac_addr, mip->mi_addr, map->ma_len);
5046
5047	/*
5048	 * Update all MAC clients that share this MAC address.
5049	 */
5050	mac_unicast_update_clients(mip, map);
5051}
5052
5053/*
5054 * Set up the primary MAC address.
5055 */
5056void
5057mac_init_macaddr(mac_impl_t *mip)
5058{
5059	mac_address_t *map;
5060
5061	/*
	 * The reference count is initialized to zero and remains zero
	 * until the address is actually activated.
5064	 */
5065	map = kmem_zalloc(sizeof (mac_address_t), KM_SLEEP);
5066	map->ma_len = mip->mi_type->mt_addr_length;
5067	bcopy(mip->mi_addr, map->ma_addr, map->ma_len);
5068
5069	/*
	 * If the driver advertises the RINGS capability, it shouldn't have
	 * initialized its primary MAC address. For other drivers, including
	 * VNIC, the primary address must work after registration.
5073	 */
5074	if (mip->mi_rx_groups == NULL)
5075		map->ma_type = MAC_ADDRESS_TYPE_UNICAST_CLASSIFIED;
5076
5077	map->ma_mip = mip;
5078
5079	mip->mi_addresses = map;
5080}
5081
5082/*
5083 * Clean up the primary MAC address. Note, only one primary MAC address
5084 * is allowed. All other MAC addresses must have been freed appropriately.
5085 */
5086void
5087mac_fini_macaddr(mac_impl_t *mip)
5088{
5089	mac_address_t *map = mip->mi_addresses;
5090
5091	if (map == NULL)
5092		return;
5093
5094	/*
5095	 * If mi_addresses is initialized, there should be exactly one
5096	 * entry left on the list with no users.
5097	 */
5098	ASSERT(map->ma_nusers == 0);
5099	ASSERT(map->ma_next == NULL);
5100
5101	kmem_free(map, sizeof (mac_address_t));
5102	mip->mi_addresses = NULL;
5103}
5104
5105/*
5106 * Logging related functions.
5107 *
5108 * Note that Kernel statistics have been extended to maintain fine
5109 * granularity of statistics viz. hardware lane, software lane, fanout
5110 * stats etc. However, extended accounting continues to support only
5111 * aggregate statistics like before.
5112 */
5113
5114/* Write the flow description to a netinfo_t record */
5115static netinfo_t *
5116mac_write_flow_desc(flow_entry_t *flent, mac_client_impl_t *mcip)
5117{
5118	netinfo_t		*ninfo;
5119	net_desc_t		*ndesc;
5120	flow_desc_t		*fdesc;
5121	mac_resource_props_t	*mrp;
5122
5123	ninfo = kmem_zalloc(sizeof (netinfo_t), KM_NOSLEEP);
5124	if (ninfo == NULL)
5125		return (NULL);
5126	ndesc = kmem_zalloc(sizeof (net_desc_t), KM_NOSLEEP);
5127	if (ndesc == NULL) {
5128		kmem_free(ninfo, sizeof (netinfo_t));
5129		return (NULL);
5130	}
5131
5132	/*
5133	 * Grab the fe_lock to see a self-consistent fe_flow_desc.
5134	 * Updates to the fe_flow_desc are done under the fe_lock
5135	 */
5136	mutex_enter(&flent->fe_lock);
5137	fdesc = &flent->fe_flow_desc;
5138	mrp = &flent->fe_resource_props;
5139
5140	ndesc->nd_name = flent->fe_flow_name;
5141	ndesc->nd_devname = mcip->mci_name;
5142	bcopy(fdesc->fd_src_mac, ndesc->nd_ehost, ETHERADDRL);
5143	bcopy(fdesc->fd_dst_mac, ndesc->nd_edest, ETHERADDRL);
5144	ndesc->nd_sap = htonl(fdesc->fd_sap);
	ndesc->nd_isv4 = ((uint8_t)fdesc->fd_ipversion == IPV4_VERSION);
5146	ndesc->nd_bw_limit = mrp->mrp_maxbw;
5147	if (ndesc->nd_isv4) {
5148		ndesc->nd_saddr[3] = htonl(fdesc->fd_local_addr.s6_addr32[3]);
5149		ndesc->nd_daddr[3] = htonl(fdesc->fd_remote_addr.s6_addr32[3]);
5150	} else {
5151		bcopy(&fdesc->fd_local_addr, ndesc->nd_saddr, IPV6_ADDR_LEN);
5152		bcopy(&fdesc->fd_remote_addr, ndesc->nd_daddr, IPV6_ADDR_LEN);
5153	}
5154	ndesc->nd_sport = htons(fdesc->fd_local_port);
5155	ndesc->nd_dport = htons(fdesc->fd_remote_port);
5156	ndesc->nd_protocol = (uint8_t)fdesc->fd_protocol;
5157	mutex_exit(&flent->fe_lock);
5158
5159	ninfo->ni_record = ndesc;
5160	ninfo->ni_size = sizeof (net_desc_t);
5161	ninfo->ni_type = EX_NET_FLDESC_REC;
5162
5163	return (ninfo);
5164}
5165
5166/* Write the flow statistics to a netinfo_t record */
5167static netinfo_t *
5168mac_write_flow_stats(flow_entry_t *flent)
5169{
5170	netinfo_t		*ninfo;
5171	net_stat_t		*nstat;
5172	mac_soft_ring_set_t	*mac_srs;
5173	mac_rx_stats_t		*mac_rx_stat;
5174	mac_tx_stats_t		*mac_tx_stat;
5175	int			i;
5176
5177	ninfo = kmem_zalloc(sizeof (netinfo_t), KM_NOSLEEP);
5178	if (ninfo == NULL)
5179		return (NULL);
5180	nstat = kmem_zalloc(sizeof (net_stat_t), KM_NOSLEEP);
5181	if (nstat == NULL) {
5182		kmem_free(ninfo, sizeof (netinfo_t));
5183		return (NULL);
5184	}
5185
5186	nstat->ns_name = flent->fe_flow_name;
5187	for (i = 0; i < flent->fe_rx_srs_cnt; i++) {
5188		mac_srs = (mac_soft_ring_set_t *)flent->fe_rx_srs[i];
5189		mac_rx_stat = &mac_srs->srs_rx.sr_stat;
5190
5191		nstat->ns_ibytes += mac_rx_stat->mrs_intrbytes +
5192		    mac_rx_stat->mrs_pollbytes + mac_rx_stat->mrs_lclbytes;
5193		nstat->ns_ipackets += mac_rx_stat->mrs_intrcnt +
5194		    mac_rx_stat->mrs_pollcnt + mac_rx_stat->mrs_lclcnt;
		nstat->ns_ierrors += mac_rx_stat->mrs_ierrors;
5196	}
5197
5198	mac_srs = (mac_soft_ring_set_t *)(flent->fe_tx_srs);
5199	if (mac_srs != NULL) {
5200		mac_tx_stat = &mac_srs->srs_tx.st_stat;
5201
5202		nstat->ns_obytes = mac_tx_stat->mts_obytes;
5203		nstat->ns_opackets = mac_tx_stat->mts_opackets;
5204		nstat->ns_oerrors = mac_tx_stat->mts_oerrors;
5205	}
5206
5207	ninfo->ni_record = nstat;
5208	ninfo->ni_size = sizeof (net_stat_t);
5209	ninfo->ni_type = EX_NET_FLSTAT_REC;
5210
5211	return (ninfo);
5212}
5213
5214/* Write the link description to a netinfo_t record */
5215static netinfo_t *
5216mac_write_link_desc(mac_client_impl_t *mcip)
5217{
5218	netinfo_t		*ninfo;
5219	net_desc_t		*ndesc;
5220	flow_entry_t		*flent = mcip->mci_flent;
5221
5222	ninfo = kmem_zalloc(sizeof (netinfo_t), KM_NOSLEEP);
5223	if (ninfo == NULL)
5224		return (NULL);
5225	ndesc = kmem_zalloc(sizeof (net_desc_t), KM_NOSLEEP);
5226	if (ndesc == NULL) {
5227		kmem_free(ninfo, sizeof (netinfo_t));
5228		return (NULL);
5229	}
5230
5231	ndesc->nd_name = mcip->mci_name;
5232	ndesc->nd_devname = mcip->mci_name;
5233	ndesc->nd_isv4 = B_TRUE;
5234	/*
5235	 * Grab the fe_lock to see a self-consistent fe_flow_desc.
5236	 * Updates to the fe_flow_desc are done under the fe_lock
5237	 * after removing the flent from the flow table.
5238	 */
5239	mutex_enter(&flent->fe_lock);
5240	bcopy(flent->fe_flow_desc.fd_src_mac, ndesc->nd_ehost, ETHERADDRL);
5241	mutex_exit(&flent->fe_lock);
5242
5243	ninfo->ni_record = ndesc;
5244	ninfo->ni_size = sizeof (net_desc_t);
5245	ninfo->ni_type = EX_NET_LNDESC_REC;
5246
5247	return (ninfo);
5248}
5249
5250/* Write the link statistics to a netinfo_t record */
5251static netinfo_t *
5252mac_write_link_stats(mac_client_impl_t *mcip)
5253{
5254	netinfo_t		*ninfo;
5255	net_stat_t		*nstat;
5256	flow_entry_t		*flent;
5257	mac_soft_ring_set_t	*mac_srs;
5258	mac_rx_stats_t		*mac_rx_stat;
5259	mac_tx_stats_t		*mac_tx_stat;
5260	int			i;
5261
5262	ninfo = kmem_zalloc(sizeof (netinfo_t), KM_NOSLEEP);
5263	if (ninfo == NULL)
5264		return (NULL);
5265	nstat = kmem_zalloc(sizeof (net_stat_t), KM_NOSLEEP);
5266	if (nstat == NULL) {
5267		kmem_free(ninfo, sizeof (netinfo_t));
5268		return (NULL);
5269	}
5270
5271	nstat->ns_name = mcip->mci_name;
5272	flent = mcip->mci_flent;
5273	if (flent != NULL)  {
5274		for (i = 0; i < flent->fe_rx_srs_cnt; i++) {
5275			mac_srs = (mac_soft_ring_set_t *)flent->fe_rx_srs[i];
5276			mac_rx_stat = &mac_srs->srs_rx.sr_stat;
5277
5278			nstat->ns_ibytes += mac_rx_stat->mrs_intrbytes +
5279			    mac_rx_stat->mrs_pollbytes +
5280			    mac_rx_stat->mrs_lclbytes;
5281			nstat->ns_ipackets += mac_rx_stat->mrs_intrcnt +
5282			    mac_rx_stat->mrs_pollcnt + mac_rx_stat->mrs_lclcnt;
			nstat->ns_ierrors += mac_rx_stat->mrs_ierrors;
5284		}
5285	}
5286
	mac_srs = (flent != NULL) ?
	    (mac_soft_ring_set_t *)flent->fe_tx_srs : NULL;
5288	if (mac_srs != NULL) {
5289		mac_tx_stat = &mac_srs->srs_tx.st_stat;
5290
5291		nstat->ns_obytes = mac_tx_stat->mts_obytes;
5292		nstat->ns_opackets = mac_tx_stat->mts_opackets;
5293		nstat->ns_oerrors = mac_tx_stat->mts_oerrors;
5294	}
5295
5296	ninfo->ni_record = nstat;
5297	ninfo->ni_size = sizeof (net_stat_t);
5298	ninfo->ni_type = EX_NET_LNSTAT_REC;
5299
5300	return (ninfo);
5301}
5302
5303typedef struct i_mac_log_state_s {
5304	boolean_t	mi_last;
5305	int		mi_fenable;
5306	int		mi_lenable;
5307	list_t		*mi_list;
5308} i_mac_log_state_t;
5309
5310/*
5311 * For a given flow, if the description has not been logged before, do it now.
5312 * If it is a VNIC, then we have collected information about it from the MAC
5313 * table, so skip it.
5314 *
5315 * Called through mac_flow_walk_nolock()
5316 *
5317 * Return 0 if successful.
5318 */
5319static int
5320mac_log_flowinfo(flow_entry_t *flent, void *arg)
5321{
5322	mac_client_impl_t	*mcip = flent->fe_mcip;
5323	i_mac_log_state_t	*lstate = arg;
5324	netinfo_t		*ninfo;
5325
5326	if (mcip == NULL)
5327		return (0);
5328
5329	/*
	 * If the name starts with "vnic", and the FLOW_USER flag is set (to
	 * exclude the mcast and active flow entries created implicitly for
	 * a vnic), it is a VNIC flow; i.e. vnic1 is a vnic flow,
	 * vnic/bge1/mcast1 is not and neither is vnic/bge1/active.
5334	 */
5335	if (strncasecmp(flent->fe_flow_name, "vnic", 4) == 0 &&
5336	    (flent->fe_type & FLOW_USER) != 0) {
5337		return (0);
5338	}
5339
5340	if (!flent->fe_desc_logged) {
5341		/*
5342		 * We don't return error because we want to continue the
5343		 * walk in case this is the last walk which means we
5344		 * need to reset fe_desc_logged in all the flows.
5345		 */
5346		if ((ninfo = mac_write_flow_desc(flent, mcip)) == NULL)
5347			return (0);
5348		list_insert_tail(lstate->mi_list, ninfo);
5349		flent->fe_desc_logged = B_TRUE;
5350	}
5351
5352	/*
5353	 * Regardless of the error, we want to proceed in case we have to
5354	 * reset fe_desc_logged.
5355	 */
5356	ninfo = mac_write_flow_stats(flent);
5357	if (ninfo == NULL)
5358		return (-1);
5359
5360	list_insert_tail(lstate->mi_list, ninfo);
5361
5362	if (mcip != NULL && !(mcip->mci_state_flags & MCIS_DESC_LOGGED))
5363		flent->fe_desc_logged = B_FALSE;
5364
5365	return (0);
5366}
5367
5368/*
5369 * Log the description for each mac client of this mac_impl_t, if it
5370 * hasn't already been done. Additionally, log statistics for the link as
5371 * well. Walk the flow table and log information for each flow as well.
 * If it is the last walk (mi_last), then we turn off MCIS_DESC_LOGGED (and
 * also fe_desc_logged, if flow logging is on) since we want to log the
5374 * description if and when logging is restarted.
5375 *
5376 * Return 0 upon success or -1 upon failure
5377 */
5378static int
5379i_mac_impl_log(mac_impl_t *mip, i_mac_log_state_t *lstate)
5380{
5381	mac_client_impl_t	*mcip;
5382	netinfo_t		*ninfo;
5383
5384	i_mac_perim_enter(mip);
5385	/*
5386	 * Only walk the client list for NIC and etherstub
5387	 */
5388	if ((mip->mi_state_flags & MIS_DISABLED) ||
5389	    ((mip->mi_state_flags & MIS_IS_VNIC) &&
5390	    (mac_get_lower_mac_handle((mac_handle_t)mip) != NULL))) {
5391		i_mac_perim_exit(mip);
5392		return (0);
5393	}
5394
5395	for (mcip = mip->mi_clients_list; mcip != NULL;
5396	    mcip = mcip->mci_client_next) {
5397		if (!MCIP_DATAPATH_SETUP(mcip))
5398			continue;
5399		if (lstate->mi_lenable) {
5400			if (!(mcip->mci_state_flags & MCIS_DESC_LOGGED)) {
5401				ninfo = mac_write_link_desc(mcip);
5402				if (ninfo == NULL) {
5403				/*
5404				 * We can't terminate it if this is the last
5405				 * walk, else there might be some links with
				 * MCIS_DESC_LOGGED set, which means
5407				 * their description won't be logged the next
5408				 * time logging is started (similarly for the
5409				 * flows within such links). We can continue
5410				 * without walking the flow table (i.e. to
5411				 * set fe_desc_logged to false) because we
5412				 * won't have written any flow stuff for this
5413				 * link as we haven't logged the link itself.
5414				 */
5415					i_mac_perim_exit(mip);
5416					if (lstate->mi_last)
5417						return (0);
5418					else
5419						return (-1);
5420				}
5421				mcip->mci_state_flags |= MCIS_DESC_LOGGED;
5422				list_insert_tail(lstate->mi_list, ninfo);
5423			}
5424		}
5425
		ninfo = mac_write_link_stats(mcip);
		if (ninfo == NULL) {
			if (!lstate->mi_last) {
				i_mac_perim_exit(mip);
				return (-1);
			}
		} else {
			list_insert_tail(lstate->mi_list, ninfo);
		}
5432
5433		if (lstate->mi_last)
5434			mcip->mci_state_flags &= ~MCIS_DESC_LOGGED;
5435
5436		if (lstate->mi_fenable) {
5437			if (mcip->mci_subflow_tab != NULL) {
5438				(void) mac_flow_walk_nolock(
5439				    mcip->mci_subflow_tab, mac_log_flowinfo,
5440				    lstate);
5441			}
5442		}
5443	}
5444	i_mac_perim_exit(mip);
5445	return (0);
5446}
5447
5448/*
5449 * modhash walker function to add a mac_impl_t to a list
5450 */
5451/*ARGSUSED*/
5452static uint_t
5453i_mac_impl_list_walker(mod_hash_key_t key, mod_hash_val_t *val, void *arg)
5454{
5455	list_t			*list = (list_t *)arg;
5456	mac_impl_t		*mip = (mac_impl_t *)val;
5457
5458	if ((mip->mi_state_flags & MIS_DISABLED) == 0) {
5459		list_insert_tail(list, mip);
5460		mip->mi_ref++;
5461	}
5462
5463	return (MH_WALK_CONTINUE);
5464}
5465
5466void
5467i_mac_log_info(list_t *net_log_list, i_mac_log_state_t *lstate)
5468{
5469	list_t			mac_impl_list;
5470	mac_impl_t		*mip;
5471	netinfo_t		*ninfo;
5472
5473	/* Create list of mac_impls */
5474	ASSERT(RW_LOCK_HELD(&i_mac_impl_lock));
5475	list_create(&mac_impl_list, sizeof (mac_impl_t), offsetof(mac_impl_t,
5476	    mi_node));
5477	mod_hash_walk(i_mac_impl_hash, i_mac_impl_list_walker, &mac_impl_list);
5478	rw_exit(&i_mac_impl_lock);
5479
5480	/* Create log entries for each mac_impl */
5481	for (mip = list_head(&mac_impl_list); mip != NULL;
5482	    mip = list_next(&mac_impl_list, mip)) {
5483		if (i_mac_impl_log(mip, lstate) != 0)
5484			continue;
5485	}
5486
5487	/* Remove elements and destroy list of mac_impls */
5488	rw_enter(&i_mac_impl_lock, RW_WRITER);
5489	while ((mip = list_remove_tail(&mac_impl_list)) != NULL) {
5490		mip->mi_ref--;
5491	}
5492	rw_exit(&i_mac_impl_lock);
5493	list_destroy(&mac_impl_list);
5494
5495	/*
5496	 * Write log entries to files outside of locks, free associated
5497	 * structures, and remove entries from the list.
5498	 */
5499	while ((ninfo = list_head(net_log_list)) != NULL) {
5500		(void) exacct_commit_netinfo(ninfo->ni_record, ninfo->ni_type);
5501		list_remove(net_log_list, ninfo);
5502		kmem_free(ninfo->ni_record, ninfo->ni_size);
5503		kmem_free(ninfo, sizeof (*ninfo));
5504	}
5505	list_destroy(net_log_list);
5506}
5507
5508/*
 * The timer callback that runs every mac_logging_interval seconds and logs
5510 * link and/or flow information.
5511 */
5512/* ARGSUSED */
5513void
5514mac_log_linkinfo(void *arg)
5515{
5516	i_mac_log_state_t	lstate;
5517	list_t			net_log_list;
5518
5519	list_create(&net_log_list, sizeof (netinfo_t),
5520	    offsetof(netinfo_t, ni_link));
5521
5522	rw_enter(&i_mac_impl_lock, RW_READER);
5523	if (!mac_flow_log_enable && !mac_link_log_enable) {
5524		rw_exit(&i_mac_impl_lock);
5525		return;
5526	}
5527	lstate.mi_fenable = mac_flow_log_enable;
5528	lstate.mi_lenable = mac_link_log_enable;
5529	lstate.mi_last = B_FALSE;
5530	lstate.mi_list = &net_log_list;
5531
5532	/* Write log entries for each mac_impl in the list */
5533	i_mac_log_info(&net_log_list, &lstate);
5534
5535	if (mac_flow_log_enable || mac_link_log_enable) {
5536		mac_logging_timer = timeout(mac_log_linkinfo, NULL,
5537		    SEC_TO_TICK(mac_logging_interval));
5538	}
5539}
5540
5541typedef struct i_mac_fastpath_state_s {
5542	boolean_t	mf_disable;
5543	int		mf_err;
5544} i_mac_fastpath_state_t;
5545
5546/* modhash walker function to enable or disable fastpath */
5547/*ARGSUSED*/
5548static uint_t
5549i_mac_fastpath_walker(mod_hash_key_t key, mod_hash_val_t *val,
5550    void *arg)
5551{
5552	i_mac_fastpath_state_t	*state = arg;
5553	mac_handle_t		mh = (mac_handle_t)val;
5554
5555	if (state->mf_disable)
5556		state->mf_err = mac_fastpath_disable(mh);
5557	else
5558		mac_fastpath_enable(mh);
5559
5560	return (state->mf_err == 0 ? MH_WALK_CONTINUE : MH_WALK_TERMINATE);
5561}
5562
5563/*
5564 * Start the logging timer.
5565 */
5566int
5567mac_start_logusage(mac_logtype_t type, uint_t interval)
5568{
5569	i_mac_fastpath_state_t	dstate = {B_TRUE, 0};
5570	i_mac_fastpath_state_t	estate = {B_FALSE, 0};
5571	int			err;
5572
5573	rw_enter(&i_mac_impl_lock, RW_WRITER);
5574	switch (type) {
5575	case MAC_LOGTYPE_FLOW:
5576		if (mac_flow_log_enable) {
5577			rw_exit(&i_mac_impl_lock);
5578			return (0);
5579		}
5580		/* FALLTHRU */
5581	case MAC_LOGTYPE_LINK:
5582		if (mac_link_log_enable) {
5583			rw_exit(&i_mac_impl_lock);
5584			return (0);
5585		}
5586		break;
5587	default:
5588		ASSERT(0);
5589	}
5590
5591	/* Disable fastpath */
5592	mod_hash_walk(i_mac_impl_hash, i_mac_fastpath_walker, &dstate);
5593	if ((err = dstate.mf_err) != 0) {
5594		/* Reenable fastpath  */
5595		mod_hash_walk(i_mac_impl_hash, i_mac_fastpath_walker, &estate);
5596		rw_exit(&i_mac_impl_lock);
5597		return (err);
5598	}
5599
5600	switch (type) {
5601	case MAC_LOGTYPE_FLOW:
5602		mac_flow_log_enable = B_TRUE;
5603		/* FALLTHRU */
5604	case MAC_LOGTYPE_LINK:
5605		mac_link_log_enable = B_TRUE;
5606		break;
5607	}
5608
5609	mac_logging_interval = interval;
5610	rw_exit(&i_mac_impl_lock);
5611	mac_log_linkinfo(NULL);
5612	return (0);
5613}
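
/*
 * A hypothetical caller sketch (the real callers live in the usage
 * accounting path outside this file; the 20 second interval is just an
 * example):
 *
 *	err = mac_start_logusage(MAC_LOGTYPE_LINK, 20);
 *	...
 *	mac_stop_logusage(MAC_LOGTYPE_LINK);	final walk, stop the timer
 *
 * On success the first walk has already run via the direct call to
 * mac_log_linkinfo() above, and the timeout rearms itself every
 * mac_logging_interval seconds until mac_stop_logusage() is called.
 */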
5614
5615/*
5616 * Stop the logging timer if both link and flow logging are turned off.
5617 */
5618void
5619mac_stop_logusage(mac_logtype_t type)
5620{
5621	i_mac_log_state_t	lstate;
5622	i_mac_fastpath_state_t	estate = {B_FALSE, 0};
5623	list_t			net_log_list;
5624
5625	list_create(&net_log_list, sizeof (netinfo_t),
5626	    offsetof(netinfo_t, ni_link));
5627
5628	rw_enter(&i_mac_impl_lock, RW_WRITER);
5629
5630	lstate.mi_fenable = mac_flow_log_enable;
5631	lstate.mi_lenable = mac_link_log_enable;
5632	lstate.mi_list = &net_log_list;
5633
5634	/* Last walk */
5635	lstate.mi_last = B_TRUE;
5636
5637	switch (type) {
5638	case MAC_LOGTYPE_FLOW:
5639		if (lstate.mi_fenable) {
5640			ASSERT(mac_link_log_enable);
5641			mac_flow_log_enable = B_FALSE;
5642			mac_link_log_enable = B_FALSE;
5643			break;
5644		}
5645		/* FALLTHRU */
5646	case MAC_LOGTYPE_LINK:
5647		if (!lstate.mi_lenable || mac_flow_log_enable) {
5648			rw_exit(&i_mac_impl_lock);
5649			return;
5650		}
5651		mac_link_log_enable = B_FALSE;
5652		break;
5653	default:
5654		ASSERT(0);
5655	}
5656
5657	/* Reenable fastpath */
5658	mod_hash_walk(i_mac_impl_hash, i_mac_fastpath_walker, &estate);
5659
5660	(void) untimeout(mac_logging_timer);
5661	mac_logging_timer = 0;
5662
5663	/* Write log entries for each mac_impl in the list */
5664	i_mac_log_info(&net_log_list, &lstate);
5665}
5666
5667/*
5668 * Walk the Rx and Tx SRSes for a flow and update the priority value.
5669 */
5670void
5671mac_flow_update_priority(mac_client_impl_t *mcip, flow_entry_t *flent)
5672{
5673	pri_t			pri;
5674	int			count;
5675	mac_soft_ring_set_t	*mac_srs;
5676
5677	if (flent->fe_rx_srs_cnt <= 0)
5678		return;
5679
5680	if (((mac_soft_ring_set_t *)flent->fe_rx_srs[0])->srs_type ==
5681	    SRST_FLOW) {
5682		pri = FLOW_PRIORITY(mcip->mci_min_pri,
5683		    mcip->mci_max_pri,
5684		    flent->fe_resource_props.mrp_priority);
5685	} else {
5686		pri = mcip->mci_max_pri;
5687	}
5688
5689	for (count = 0; count < flent->fe_rx_srs_cnt; count++) {
5690		mac_srs = flent->fe_rx_srs[count];
5691		mac_update_srs_priority(mac_srs, pri);
5692	}
5693	/*
5694	 * If we have a Tx SRS, we need to modify all the threads associated
5695	 * with it.
5696	 */
5697	if (flent->fe_tx_srs != NULL)
5698		mac_update_srs_priority(flent->fe_tx_srs, pri);
5699}
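
/*
 * For example (a sketch only; the exact mapping is defined by the
 * FLOW_PRIORITY() macro elsewhere): a user-defined flow (SRST_FLOW) with
 * mrp_priority set gets a value clamped into [mci_min_pri, mci_max_pri],
 * while a link SRS simply runs its threads at mci_max_pri.
 */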
5700
5701/*
5702 * RX and TX rings are reserved according to different semantics depending
5703 * on the requests from the MAC clients and type of rings:
5704 *
5705 * On the Tx side, by default we reserve individual rings, independently of
5706 * the groups.
5707 *
5708 * On the Rx side, the reservation is at the granularity of the group
5709 * of rings, and used for v12n level 1 only. It has a special case for the
5710 * primary client.
5711 *
5712 * If a share is allocated to a MAC client, we allocate a TX group and an
5713 * RX group to the client, and assign TX rings and RX rings to these
5714 * groups according to information gathered from the driver through
5715 * the share capability.
5716 *
5717 * The foreseeable evolution of Rx rings is to handle v12n level 2 and
5718 * higher, allocating individual rings out of a group and programming the
5719 * hw classifier based on IP address or higher level criteria.
5720 */
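
/*
 * Sketch of the share-based reservation flow described above (illustrative
 * only; the real logic is in i_mac_group_allocate_rings() below):
 *
 *	ms_squery(share, type, NULL, &nrings)	how many rings does the
 *						share want?
 *	rings = alloc(nrings)
 *	ms_squery(share, type, rings, &nrings)	which rings exactly?
 *	for each ring already given away:
 *		mac_reclaim_ring_from_grp()	swap it back to the default
 *	move the rings into the client's new group
 *	ms_sadd(share, group)			bind the group to the share
 */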
5721
5722/*
5723 * mac_reserve_tx_ring()
5724 * Reserve an unused ring by marking it with the MR_INUSE state.
5725 * Once reserved, the ring is ready to function.
5726 *
5727 * Notes for Hybrid I/O:
5728 *
5729 * The ring to reserve is specified through the desired_ring argument,
5730 * which must not be NULL.
5731 * If the desired ring was previously allocated to another client, this
5732 * function swaps it with a new ring from the group of unassigned rings.
5733 */
5734mac_ring_t *
5735mac_reserve_tx_ring(mac_impl_t *mip, mac_ring_t *desired_ring)
5736{
5737	mac_group_t		*group;
5738	mac_grp_client_t	*mgcp;
5739	mac_client_impl_t	*mcip;
5740	mac_soft_ring_set_t	*srs;
5741
5742	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
5743
5744	/*
5745	 * The unassigned rings belong to the default TX group; start the
5746	 * desired ring before changing its status.
5748	 */
5749	group = MAC_DEFAULT_TX_GROUP(mip);
5750
5751	/* Can't take the default ring out of the default group */
5752	ASSERT(desired_ring != (mac_ring_t *)mip->mi_default_tx_ring);
5753
5754	if (desired_ring->mr_state == MR_FREE) {
5755		ASSERT(MAC_GROUP_NO_CLIENT(group));
5756		if (mac_start_ring(desired_ring) != 0)
5757			return (NULL);
5758		return (desired_ring);
5759	}
5760	/*
5761	 * There are clients using this ring, so let's move the clients
5762	 * away from using this ring.
5763	 */
5764	for (mgcp = group->mrg_clients; mgcp != NULL; mgcp = mgcp->mgc_next) {
5765		mcip = mgcp->mgc_client;
5766		mac_tx_client_quiesce((mac_client_handle_t)mcip);
5767		srs = MCIP_TX_SRS(mcip);
5768		ASSERT(mac_tx_srs_ring_present(srs, desired_ring));
5769		mac_tx_invoke_callbacks(mcip,
5770		    (mac_tx_cookie_t)mac_tx_srs_get_soft_ring(srs,
5771		    desired_ring));
5772		mac_tx_srs_del_ring(srs, desired_ring);
5773		mac_tx_client_restart((mac_client_handle_t)mcip);
5774	}
5775	return (desired_ring);
5776}
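
/*
 * Typical use (see i_mac_group_allocate_rings() below): the caller reserves
 * each ring it wants in a new TX group; clients still using the ring have
 * their soft rings detached above before the ring is handed over:
 *
 *	tmp_ring = mac_reserve_tx_ring(mip, rings[i]);
 *	ASSERT(tmp_ring == rings[i]);
 *	(void) mac_group_mov_ring(mip, new_group, rings[i]);
 */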
5777
5778/*
5779 * For a reserved group with multiple clients, return the primary client.
5780 */
5781static mac_client_impl_t *
5782mac_get_grp_primary(mac_group_t *grp)
5783{
5784	mac_grp_client_t	*mgcp = grp->mrg_clients;
5785	mac_client_impl_t	*mcip;
5786
5787	while (mgcp != NULL) {
5788		mcip = mgcp->mgc_client;
5789		if (mcip->mci_flent->fe_type & FLOW_PRIMARY_MAC)
5790			return (mcip);
5791		mgcp = mgcp->mgc_next;
5792	}
5793	return (NULL);
5794}
5795
5796/*
5797 * Hybrid I/O specifies the ring that should be given to a share.
5798 * If the ring is already used by clients, then we need to release
5799 * the ring back to the default group so that we can give it to
5800 * the share. This means the clients using this ring now get a
5801 * replacement ring. If there aren't any replacement rings, this
5802 * function returns a failure.
5803 */
5804static int
5805mac_reclaim_ring_from_grp(mac_impl_t *mip, mac_ring_type_t ring_type,
5806    mac_ring_t *ring, mac_ring_t **rings, int nrings)
5807{
5808	mac_group_t		*group = (mac_group_t *)ring->mr_gh;
5809	mac_resource_props_t	*mrp;
5810	mac_client_impl_t	*mcip;
5811	mac_group_t		*defgrp;
5812	mac_ring_t		*tring;
5813	mac_group_t		*tgrp;
5814	int			i;
5815	int			j;
5816
5817	mcip = MAC_GROUP_ONLY_CLIENT(group);
5818	if (mcip == NULL)
5819		mcip = mac_get_grp_primary(group);
5820	ASSERT(mcip != NULL);
5821	ASSERT(mcip->mci_share == NULL);
5822
5823	mrp = MCIP_RESOURCE_PROPS(mcip);
5824	if (ring_type == MAC_RING_TYPE_RX) {
5825		defgrp = mip->mi_rx_donor_grp;
5826		if ((mrp->mrp_mask & MRP_RX_RINGS) == 0) {
5827			/* Need to put this mac client in the default group */
5828			if (mac_rx_switch_group(mcip, group, defgrp) != 0)
5829				return (ENOSPC);
5830		} else {
5831			/*
5832			 * Switch this ring with some other ring from
5833			 * the default group.
5834			 */
5835			for (tring = defgrp->mrg_rings; tring != NULL;
5836			    tring = tring->mr_next) {
5837				if (tring->mr_index == 0)
5838					continue;
5839				for (j = 0; j < nrings; j++) {
5840					if (rings[j] == tring)
5841						break;
5842				}
5843				if (j >= nrings)
5844					break;
5845			}
5846			if (tring == NULL)
5847				return (ENOSPC);
5848			if (mac_group_mov_ring(mip, group, tring) != 0)
5849				return (ENOSPC);
5850			if (mac_group_mov_ring(mip, defgrp, ring) != 0) {
5851				(void) mac_group_mov_ring(mip, defgrp, tring);
5852				return (ENOSPC);
5853			}
5854		}
5855		ASSERT(ring->mr_gh == (mac_group_handle_t)defgrp);
5856		return (0);
5857	}
5858
5859	defgrp = MAC_DEFAULT_TX_GROUP(mip);
5860	if (ring == (mac_ring_t *)mip->mi_default_tx_ring) {
5861		/*
5862		 * See if we can get a spare ring to replace the default
5863		 * ring.
5864		 */
5865		if (defgrp->mrg_cur_count == 1) {
5866			/*
5867			 * Need to get a ring from another client, see if
5868			 * there are any clients that can be moved to
5869			 * the default group, thereby freeing some rings.
5870			 */
5871			for (i = 0; i < mip->mi_tx_group_count; i++) {
5872				tgrp = &mip->mi_tx_groups[i];
5873				if (tgrp->mrg_state ==
5874				    MAC_GROUP_STATE_REGISTERED) {
5875					continue;
5876				}
5877				mcip = MAC_GROUP_ONLY_CLIENT(tgrp);
5878				if (mcip == NULL)
5879					mcip = mac_get_grp_primary(tgrp);
5880				ASSERT(mcip != NULL);
5881				mrp = MCIP_RESOURCE_PROPS(mcip);
5882				if ((mrp->mrp_mask & MRP_TX_RINGS) == 0) {
5883					ASSERT(tgrp->mrg_cur_count == 1);
5884					/*
5885					 * If this ring is part of the
5886					 * rings asked for by the share, we cannot
5887					 * use it as the default ring.
5888					 */
5889					for (j = 0; j < nrings; j++) {
5890						if (rings[j] == tgrp->mrg_rings)
5891							break;
5892					}
5893					if (j < nrings)
5894						continue;
5895					mac_tx_client_quiesce(
5896					    (mac_client_handle_t)mcip);
5897					mac_tx_switch_group(mcip, tgrp,
5898					    defgrp);
5899					mac_tx_client_restart(
5900					    (mac_client_handle_t)mcip);
5901					break;
5902				}
5903			}
5904			/*
5905			 * All the rings are reserved; we can't give up the
5906			 * default ring.
5907			 */
5908			if (defgrp->mrg_cur_count <= 1)
5909				return (ENOSPC);
5910		}
5911		/*
5912		 * Swap the default ring with another.
5913		 */
5914		for (tring = defgrp->mrg_rings; tring != NULL;
5915		    tring = tring->mr_next) {
5916			/*
5917			 * If this ring is part of the rings asked for by the
5918			 * share, we cannot use it as the default ring.
5919			 */
5920			for (j = 0; j < nrings; j++) {
5921				if (rings[j] == tring)
5922					break;
5923			}
5924			if (j >= nrings)
5925				break;
5926		}
5927		ASSERT(tring != NULL);
5928		mip->mi_default_tx_ring = (mac_ring_handle_t)tring;
5929		return (0);
5930	}
5931	/*
5932	 * The Tx ring belongs to a group reserved by a MAC client. See if
5933	 * we can swap it.
5934	 */
5935	ASSERT(group->mrg_state == MAC_GROUP_STATE_RESERVED);
5936	mcip = MAC_GROUP_ONLY_CLIENT(group);
5937	if (mcip == NULL)
5938		mcip = mac_get_grp_primary(group);
5939	ASSERT(mcip !=  NULL);
5940	mrp = MCIP_RESOURCE_PROPS(mcip);
5941	mac_tx_client_quiesce((mac_client_handle_t)mcip);
5942	if ((mrp->mrp_mask & MRP_TX_RINGS) == 0) {
5943		ASSERT(group->mrg_cur_count == 1);
5944		/* Put this mac client in the default group */
5945		mac_tx_switch_group(mcip, group, defgrp);
5946	} else {
5947		/*
5948		 * Switch this ring with some other ring from
5949		 * the default group.
5950		 */
5951		for (tring = defgrp->mrg_rings; tring != NULL;
5952		    tring = tring->mr_next) {
5953			if (tring == (mac_ring_t *)mip->mi_default_tx_ring)
5954				continue;
5955			/*
5956			 * If this ring is part of the rings asked for by the
5957			 * share, we cannot use it for swapping.
5958			 */
5959			for (j = 0; j < nrings; j++) {
5960				if (rings[j] == tring)
5961					break;
5962			}
5963			if (j >= nrings)
5964				break;
5965		}
5966		if (tring == NULL) {
5967			mac_tx_client_restart((mac_client_handle_t)mcip);
5968			return (ENOSPC);
5969		}
5970		if (mac_group_mov_ring(mip, group, tring) != 0) {
5971			mac_tx_client_restart((mac_client_handle_t)mcip);
5972			return (ENOSPC);
5973		}
5974		if (mac_group_mov_ring(mip, defgrp, ring) != 0) {
5975			(void) mac_group_mov_ring(mip, defgrp, tring);
5976			mac_tx_client_restart((mac_client_handle_t)mcip);
5977			return (ENOSPC);
5978		}
5979	}
5980	mac_tx_client_restart((mac_client_handle_t)mcip);
5981	ASSERT(ring->mr_gh == (mac_group_handle_t)defgrp);
5982	return (0);
5983}
5984
5985/*
5986 * Populate a zero-ring group with rings. If the share is non-NULL,
5987 * the rings are chosen according to that share.
5988 * Invoked after allocating a new RX or TX group through
5989 * mac_reserve_rx_group() or mac_reserve_tx_group(), respectively.
5990 * Returns zero on success, an errno otherwise.
5991 */
5992int
5993i_mac_group_allocate_rings(mac_impl_t *mip, mac_ring_type_t ring_type,
5994    mac_group_t *src_group, mac_group_t *new_group, mac_share_handle_t share,
5995    uint32_t ringcnt)
5996{
5997	mac_ring_t **rings, *ring;
5998	uint_t nrings;
5999	int rv = 0, i = 0, j;
6000
6001	ASSERT((ring_type == MAC_RING_TYPE_RX &&
6002	    mip->mi_rx_group_type == MAC_GROUP_TYPE_DYNAMIC) ||
6003	    (ring_type == MAC_RING_TYPE_TX &&
6004	    mip->mi_tx_group_type == MAC_GROUP_TYPE_DYNAMIC));
6005
6006	/*
6007	 * First find the rings to allocate to the group.
6008	 */
6009	if (share != NULL) {
6010		/* get rings through ms_squery() */
6011		mip->mi_share_capab.ms_squery(share, ring_type, NULL, &nrings);
6012		ASSERT(nrings != 0);
6013		rings = kmem_alloc(nrings * sizeof (mac_ring_handle_t),
6014		    KM_SLEEP);
6015		mip->mi_share_capab.ms_squery(share, ring_type,
6016		    (mac_ring_handle_t *)rings, &nrings);
6017		for (i = 0; i < nrings; i++) {
6018			/*
6019			 * If we have given this ring to a non-default
6020			 * group, we need to check if we can get this
6021			 * ring.
6022			 */
6023			ring = rings[i];
6024			if (ring->mr_gh != (mac_group_handle_t)src_group ||
6025			    ring == (mac_ring_t *)mip->mi_default_tx_ring) {
6026				if (mac_reclaim_ring_from_grp(mip, ring_type,
6027				    ring, rings, nrings) != 0) {
6028					rv = ENOSPC;
6029					goto bail;
6030				}
6031			}
6032		}
6033	} else {
6034		/*
6035		 * Pick ring(s) from the default group.
6036		 *
6037		 * For now pick the second ring, which requires the first ring
6038		 * at index 0 to stay in the default group, since it is the
6039		 * ring which carries the multicast traffic.
6040		 * We need a better way for a driver to indicate this,
6041		 * for example a per-ring flag.
6042		 */
6043		rings = kmem_alloc(ringcnt * sizeof (mac_ring_handle_t),
6044		    KM_SLEEP);
6045		for (ring = src_group->mrg_rings; ring != NULL;
6046		    ring = ring->mr_next) {
6047			if (ring_type == MAC_RING_TYPE_RX &&
6048			    ring->mr_index == 0) {
6049				continue;
6050			}
6051			if (ring_type == MAC_RING_TYPE_TX &&
6052			    ring == (mac_ring_t *)mip->mi_default_tx_ring) {
6053				continue;
6054			}
6055			rings[i++] = ring;
6056			if (i == ringcnt)
6057				break;
6058		}
6059		ASSERT(ring != NULL);
6060		nrings = i;
6061		/* Not as many rings as required */
6062		if (nrings != ringcnt) {
6063			rv = ENOSPC;
6064			goto bail;
6065		}
6066	}
6067
6068	switch (ring_type) {
6069	case MAC_RING_TYPE_RX:
6070		if (src_group->mrg_cur_count - nrings < 1) {
6071			/* we ran out of rings */
6072			rv = ENOSPC;
6073			goto bail;
6074		}
6075
6076		/* move receive rings to new group */
6077		for (i = 0; i < nrings; i++) {
6078			rv = mac_group_mov_ring(mip, new_group, rings[i]);
6079			if (rv != 0) {
6080				/* move rings back on failure */
6081				for (j = 0; j < i; j++) {
6082					(void) mac_group_mov_ring(mip,
6083					    src_group, rings[j]);
6084				}
6085				goto bail;
6086			}
6087		}
6088		break;
6089
6090	case MAC_RING_TYPE_TX: {
6091		mac_ring_t *tmp_ring;
6092
6093		/* move the TX rings to the new group */
6094		for (i = 0; i < nrings; i++) {
6095			/* get the desired ring */
6096			tmp_ring = mac_reserve_tx_ring(mip, rings[i]);
6097			if (tmp_ring == NULL) {
6098				rv = ENOSPC;
6099				goto bail;
6100			}
6101			ASSERT(tmp_ring == rings[i]);
6102			rv = mac_group_mov_ring(mip, new_group, rings[i]);
6103			if (rv != 0) {
6104				/* cleanup on failure */
6105				for (j = 0; j < i; j++) {
6106					(void) mac_group_mov_ring(mip,
6107					    MAC_DEFAULT_TX_GROUP(mip),
6108					    rings[j]);
6109				}
6110				goto bail;
6111			}
6112		}
6113		break;
6114	}
6115	}
6116
6117	/* add group to share */
6118	if (share != NULL)
6119		mip->mi_share_capab.ms_sadd(share, new_group->mrg_driver);
6120
6121bail:
6122	/* free temporary array of rings */
6123	kmem_free(rings, nrings * sizeof (mac_ring_handle_t));
6124
6125	return (rv);
6126}
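
/*
 * Example invocation (from mac_reserve_rx_group() below): populate a newly
 * started RX group with nrings rings taken from the donor group:
 *
 *	err = i_mac_group_allocate_rings(mip, MAC_RING_TYPE_RX,
 *	    donorgrp, grp, share, nrings);
 *	if (err != 0)
 *		mac_stop_group(grp);	couldn't populate; give the group back
 */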
6127
6128void
6129mac_group_add_client(mac_group_t *grp, mac_client_impl_t *mcip)
6130{
6131	mac_grp_client_t *mgcp;
6132
6133	for (mgcp = grp->mrg_clients; mgcp != NULL; mgcp = mgcp->mgc_next) {
6134		if (mgcp->mgc_client == mcip)
6135			break;
6136	}
6137
6138	VERIFY(mgcp == NULL);
6139
6140	mgcp = kmem_zalloc(sizeof (mac_grp_client_t), KM_SLEEP);
6141	mgcp->mgc_client = mcip;
6142	mgcp->mgc_next = grp->mrg_clients;
6143	grp->mrg_clients = mgcp;
6145}
6146
6147void
6148mac_group_remove_client(mac_group_t *grp, mac_client_impl_t *mcip)
6149{
6150	mac_grp_client_t *mgcp, **pprev;
6151
6152	for (pprev = &grp->mrg_clients, mgcp = *pprev; mgcp != NULL;
6153	    pprev = &mgcp->mgc_next, mgcp = *pprev) {
6154		if (mgcp->mgc_client == mcip)
6155			break;
6156	}
6157
6158	ASSERT(mgcp != NULL);
6159
6160	*pprev = mgcp->mgc_next;
6161	kmem_free(mgcp, sizeof (mac_grp_client_t));
6162}
6163
6164/*
6165 * mac_reserve_rx_group()
6166 *
6167 * Finds an available group and exclusively reserves it for a client.
6168 * The group is chosen to suit the flow's resource controls (bandwidth and
6169 * fanout requirements) and the address type.
6170 * If the requestor is the primary MAC, then return the group with the
6171 * largest number of rings, otherwise the default ring when available.
6172 */
6173mac_group_t *
6174mac_reserve_rx_group(mac_client_impl_t *mcip, uint8_t *mac_addr, boolean_t move)
6175{
6176	mac_share_handle_t	share = mcip->mci_share;
6177	mac_impl_t		*mip = mcip->mci_mip;
6178	mac_group_t		*grp = NULL;
6179	int			i;
6180	int			err = 0;
6181	mac_address_t		*map;
6182	mac_resource_props_t	*mrp = MCIP_RESOURCE_PROPS(mcip);
6183	int			nrings;
6184	int			donor_grp_rcnt;
6185	boolean_t		need_exclgrp = B_FALSE;
6186	int			need_rings = 0;
6187	mac_group_t		*candidate_grp = NULL;
6188	mac_client_impl_t	*gclient;
6189	mac_resource_props_t	*gmrp;
6190	mac_group_t		*donorgrp = NULL;
6191	boolean_t		rxhw = mrp->mrp_mask & MRP_RX_RINGS;
6192	boolean_t		unspec = mrp->mrp_mask & MRP_RXRINGS_UNSPEC;
6193	boolean_t		isprimary;
6194
6195	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
6196
6197	isprimary = mcip->mci_flent->fe_type & FLOW_PRIMARY_MAC;
6198
6199	/*
6200	 * Check if a group already has this mac address (case of VLANs)
6201	 * unless we are moving this MAC client from one group to another.
6202	 */
6203	if (!move && (map = mac_find_macaddr(mip, mac_addr)) != NULL) {
6204		if (map->ma_group != NULL)
6205			return (map->ma_group);
6206	}
6207	if (mip->mi_rx_groups == NULL || mip->mi_rx_group_count == 0)
6208		return (NULL);
6209	/*
6210	 * If exclusive open, return NULL which will enable the
6211	 * caller to use the default group.
6212	 */
6213	if (mcip->mci_state_flags & MCIS_EXCLUSIVE)
6214		return (NULL);
6215
6216	/* For dynamic groups default unspecified to 1 */
6217	if (rxhw && unspec &&
6218	    mip->mi_rx_group_type == MAC_GROUP_TYPE_DYNAMIC) {
6219		mrp->mrp_nrxrings = 1;
6220	}
6221	/*
6222	 * For static grouping we allow only specifying rings=0 or leaving the
6223	 * ring count unspecified.
6224	 */
6225	if (rxhw && mrp->mrp_nrxrings > 0 &&
6226	    mip->mi_rx_group_type == MAC_GROUP_TYPE_STATIC) {
6227		return (NULL);
6228	}
6229	if (rxhw) {
6230		/*
6231		 * We have explicitly asked for a group, with nrxrings rings
6232		 * (defaulted to 1 above if unspecified).
6233		 */
6234		if (unspec || mrp->mrp_nrxrings > 0) {
6235			need_exclgrp = B_TRUE;
6236			need_rings = mrp->mrp_nrxrings;
6237		} else if (mrp->mrp_nrxrings == 0) {
6238			/*
6239			 * We have asked for a software group.
6240			 */
6241			return (NULL);
6242		}
6243	} else if (isprimary && mip->mi_nactiveclients == 1 &&
6244	    mip->mi_rx_group_type == MAC_GROUP_TYPE_DYNAMIC) {
6245		/*
6246		 * If the primary is the only active client on this
6247		 * mip and we have not asked for any rings, we give
6248		 * it the default group so that the primary gets to
6249		 * use all the rings.
6250		 */
6251		return (NULL);
6252	}
6253
6254	/* The group that can donate rings */
6255	donorgrp = mip->mi_rx_donor_grp;
6256
6257	/*
6258	 * The number of rings that the default group can donate.
6259	 * We need to leave at least one ring.
6260	 */
6261	donor_grp_rcnt = donorgrp->mrg_cur_count - 1;
6262
6263	/*
6264	 * Try to exclusively reserve a RX group.
6265	 *
6266	 * For flows requiring HW_DEFAULT_RING (unicast flow of the primary
6267	 * client), try to reserve a non-default RX group and give
6268	 * it all the rings from the donor group, except the default ring.
6269	 *
6270	 * For flows requiring HW_RING (unicast flow of other clients), try
6271	 * to reserve non-default RX group with the specified number of
6272	 * rings, if available.
6273	 *
6274	 * For flows that have not asked for software or hardware ring,
6275	 * try to reserve a non-default group with 1 ring, if available.
6276	 */
6277	for (i = 1; i < mip->mi_rx_group_count; i++) {
6278		grp = &mip->mi_rx_groups[i];
6279
6280		DTRACE_PROBE3(rx__group__trying, char *, mip->mi_name,
6281		    int, grp->mrg_index, mac_group_state_t, grp->mrg_state);
6282
6283		/*
6284		 * Check if this group could be a candidate group for
6285		 * eviction if we need a group for this MAC client,
6286		 * but there aren't any. A candidate group is one
6287		 * that didn't ask for an exclusive group, but got
6288		 * one and it has enough rings (combined with what
6289		 * the donor group can donate) for the new MAC
6290		 * client.
6291		 */
6292		if (grp->mrg_state >= MAC_GROUP_STATE_RESERVED) {
6293			/*
6294			 * If the primary/donor group is not the default
6295			 * group, don't bother looking for a candidate group.
6296			 * If we don't have enough rings we will check
6297			 * if the primary group can be vacated.
6298			 */
6299			if (candidate_grp == NULL &&
6300			    donorgrp == MAC_DEFAULT_RX_GROUP(mip)) {
6301				ASSERT(!MAC_GROUP_NO_CLIENT(grp));
6302				gclient = MAC_GROUP_ONLY_CLIENT(grp);
6303				if (gclient == NULL)
6304					gclient = mac_get_grp_primary(grp);
6305				ASSERT(gclient != NULL);
6306				gmrp = MCIP_RESOURCE_PROPS(gclient);
6307				if (gclient->mci_share == NULL &&
6308				    (gmrp->mrp_mask & MRP_RX_RINGS) == 0 &&
6309				    (unspec ||
6310				    (grp->mrg_cur_count + donor_grp_rcnt >=
6311				    need_rings))) {
6312					candidate_grp = grp;
6313				}
6314			}
6315			continue;
6316		}
6317		/*
6318		 * This group could already be SHARED by other multicast
6319		 * flows on this client. In that case, the group has
6320		 * already been started.
6321		 */
6322		ASSERT(grp->mrg_state != MAC_GROUP_STATE_UNINIT);
6323
6324		if ((grp->mrg_state == MAC_GROUP_STATE_REGISTERED) &&
6325		    (mac_start_group(grp) != 0)) {
6326			continue;
6327		}
6328
6329		if (mip->mi_rx_group_type != MAC_GROUP_TYPE_DYNAMIC)
6330			break;
6331		ASSERT(grp->mrg_cur_count == 0);
6332
6333		/*
6334		 * Populate the group. Rings should be taken
6335		 * from the donor group.
6336		 */
6337		nrings = rxhw ? need_rings : isprimary ? donor_grp_rcnt: 1;
6338
6339		/*
6340		 * If the donor group can't donate, let's just walk and
6341		 * see if someone can vacate a group, so that we have
6342		 * enough rings for this, unless we already have
6343		 * identified a candidate group.
6344		 */
6345		if (nrings <= donor_grp_rcnt) {
6346			err = i_mac_group_allocate_rings(mip, MAC_RING_TYPE_RX,
6347			    donorgrp, grp, share, nrings);
6348			if (err == 0) {
6349				/*
6350				 * For a share i_mac_group_allocate_rings gets
6351				 * the rings from the driver, let's populate
6352				 * the property for the client now.
6353				 */
6354				if (share != NULL) {
6355					mac_client_set_rings(
6356					    (mac_client_handle_t)mcip,
6357					    grp->mrg_cur_count, -1);
6358				}
6359				if (mac_is_primary_client(mcip) && !rxhw)
6360					mip->mi_rx_donor_grp = grp;
6361				break;
6362			}
6363		}
6364
6365		DTRACE_PROBE3(rx__group__reserve__alloc__rings, char *,
6366		    mip->mi_name, int, grp->mrg_index, int, err);
6367
6368		/*
6369		 * It's a dynamic group but the grouping operation
6370		 * failed.
6371		 */
6372		mac_stop_group(grp);
6373	}
6374	/* We didn't find an exclusive group for this MAC client */
6375	if (i >= mip->mi_rx_group_count) {
6376
6377		if (!need_exclgrp)
6378			return (NULL);
6379
6380		/*
6381		 * If we found a candidate group then we switch the
6382		 * MAC client from the candidate group to the default
6383		 * group and give the group to this MAC client. If
6384		 * we didn't find a candidate_group, check if the
6385		 * primary is in its own group and if it can make way
6386		 * for this MAC client.
6387		 */
6388		if (candidate_grp == NULL &&
6389		    donorgrp != MAC_DEFAULT_RX_GROUP(mip) &&
6390		    donorgrp->mrg_cur_count >= need_rings) {
6391			candidate_grp = donorgrp;
6392		}
6393		if (candidate_grp != NULL) {
6394			boolean_t	prim_grp = B_FALSE;
6395
6396			/*
6397			 * Switch the MAC client from the candidate group
6398			 * to the default group. If this group was the
6399			 * donor group, then after the switch we need
6400			 * to update the donor group too.
6401			 */
6402			grp = candidate_grp;
6403			gclient = MAC_GROUP_ONLY_CLIENT(grp);
6404			if (gclient == NULL)
6405				gclient = mac_get_grp_primary(grp);
6406			if (grp == mip->mi_rx_donor_grp)
6407				prim_grp = B_TRUE;
6408			if (mac_rx_switch_group(gclient, grp,
6409			    MAC_DEFAULT_RX_GROUP(mip)) != 0) {
6410				return (NULL);
6411			}
6412			if (prim_grp) {
6413				mip->mi_rx_donor_grp =
6414				    MAC_DEFAULT_RX_GROUP(mip);
6415				donorgrp = MAC_DEFAULT_RX_GROUP(mip);
6416			}
6417
6419			/*
6420			 * Now give this group with the required rings
6421			 * to this MAC client.
6422			 */
6423			ASSERT(grp->mrg_state == MAC_GROUP_STATE_REGISTERED);
6424			if (mac_start_group(grp) != 0)
6425				return (NULL);
6426
6427			if (mip->mi_rx_group_type != MAC_GROUP_TYPE_DYNAMIC)
6428				return (grp);
6429
6430			donor_grp_rcnt = donorgrp->mrg_cur_count - 1;
6431			ASSERT(grp->mrg_cur_count == 0);
6432			ASSERT(donor_grp_rcnt >= need_rings);
6433			err = i_mac_group_allocate_rings(mip, MAC_RING_TYPE_RX,
6434			    donorgrp, grp, share, need_rings);
6435			if (err == 0) {
6436				/*
6437				 * For a share i_mac_group_allocate_rings gets
6438				 * the rings from the driver, let's populate
6439				 * the property for the client now.
6440				 */
6441				if (share != NULL) {
6442					mac_client_set_rings(
6443					    (mac_client_handle_t)mcip,
6444					    grp->mrg_cur_count, -1);
6445				}
6446				DTRACE_PROBE2(rx__group__reserved,
6447				    char *, mip->mi_name, int, grp->mrg_index);
6448				return (grp);
6449			}
6450			DTRACE_PROBE3(rx__group__reserve__alloc__rings, char *,
6451			    mip->mi_name, int, grp->mrg_index, int, err);
6452			mac_stop_group(grp);
6453		}
6454		return (NULL);
6455	}
6456	ASSERT(grp != NULL);
6457
6458	DTRACE_PROBE2(rx__group__reserved,
6459	    char *, mip->mi_name, int, grp->mrg_index);
6460	return (grp);
6461}
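
/*
 * To summarize, mac_reserve_rx_group() returns (a sketch):
 *
 *	NULL	the caller falls back to the default group: software
 *		classification was requested (rings=0), the open is
 *		exclusive, the primary is the only active client, or no
 *		exclusive group could be reserved
 *	grp	an exclusively reserved and started group, populated from
 *		the donor group (dynamic grouping) or used as-is (static)
 */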
6462
6463/*
6464 * mac_release_rx_group()
6465 *
6466 * This is called when there are no clients left for the group.
6467 * The group is stopped and marked MAC_GROUP_STATE_REGISTERED,
6468 * and if it is a non-default group, the share is removed and
6469 * all rings are assigned back to the default group.
6470 */
6471void
6472mac_release_rx_group(mac_client_impl_t *mcip, mac_group_t *group)
6473{
6474	mac_impl_t		*mip = mcip->mci_mip;
6475	mac_ring_t		*ring;
6476
6477	ASSERT(group != MAC_DEFAULT_RX_GROUP(mip));
6478
6479	if (mip->mi_rx_donor_grp == group)
6480		mip->mi_rx_donor_grp = MAC_DEFAULT_RX_GROUP(mip);
6481
6482	/*
6483	 * This is the case where there are no clients left. Any
6484	 * SRSes etc. on this group have also been quiesced.
6485	 */
6486	for (ring = group->mrg_rings; ring != NULL; ring = ring->mr_next) {
6487		if (ring->mr_classify_type == MAC_HW_CLASSIFIER) {
6488			ASSERT(group->mrg_state == MAC_GROUP_STATE_RESERVED);
6489			/*
6490			 * Remove the SRS associated with the HW ring.
6491			 * As a result, polling will be disabled.
6492			 */
6493			ring->mr_srs = NULL;
6494		}
6495		ASSERT(group->mrg_state < MAC_GROUP_STATE_RESERVED ||
6496		    ring->mr_state == MR_INUSE);
6497		if (ring->mr_state == MR_INUSE) {
6498			mac_stop_ring(ring);
6499			ring->mr_flag = 0;
6500		}
6501	}
6502
6503	/* remove group from share */
6504	if (mcip->mci_share != NULL) {
6505		mip->mi_share_capab.ms_sremove(mcip->mci_share,
6506		    group->mrg_driver);
6507	}
6508
6509	if (mip->mi_rx_group_type == MAC_GROUP_TYPE_DYNAMIC) {
6510		mac_ring_t *ring;
6511
6512		/*
6513		 * Rings were dynamically allocated to group.
6514		 * Move rings back to default group.
6515		 */
6516		while ((ring = group->mrg_rings) != NULL) {
6517			(void) mac_group_mov_ring(mip, mip->mi_rx_donor_grp,
6518			    ring);
6519		}
6520	}
6521	mac_stop_group(group);
6522	/*
6523	 * Possible improvement: See if we can assign the group just released
6524	 * to another client of the mip.
6525	 */
6526}
6527
6528/*
6529 * When we move the primary's mac address between groups, we need to also
6530 * take all the clients sharing the same mac address along with it (VLANs).
6531 * We remove the mac address for such clients from the group after quiescing
6532 * them. When we add the mac address we restart the client. Note that
6533 * the primary's mac address is removed from the group after all the
6534 * other clients sharing the address are removed. Similarly, the primary's
6535 * mac address is added before all the other clients' mac addresses are
6536 * added. While grp is the group where the clients reside, tgrp is
6537 * the group where the addresses have to be added.
6538 */
6539static void
6540mac_rx_move_macaddr_prim(mac_client_impl_t *mcip, mac_group_t *grp,
6541    mac_group_t *tgrp, uint8_t *maddr, boolean_t add)
6542{
6543	mac_impl_t		*mip = mcip->mci_mip;
6544	mac_grp_client_t	*mgcp = grp->mrg_clients;
6545	mac_client_impl_t	*gmcip;
6546	boolean_t		prim;
6547
6548	prim = (mcip->mci_state_flags & MCIS_UNICAST_HW) != 0;
6549
6550	/*
6551	 * If the clients are in a non-default group, we just have to
6552	 * walk the group's client list. If it is in the default group
6553	 * (which will be shared by other clients as well), we need to
6554	 * check if the unicast address matches mcip's unicast.
6555	 */
6556	while (mgcp != NULL) {
6557		gmcip = mgcp->mgc_client;
6558		if (gmcip != mcip &&
6559		    (grp != MAC_DEFAULT_RX_GROUP(mip) ||
6560		    mcip->mci_unicast == gmcip->mci_unicast)) {
6561			if (!add) {
6562				mac_rx_client_quiesce(
6563				    (mac_client_handle_t)gmcip);
6564				(void) mac_remove_macaddr(mcip->mci_unicast);
6565			} else {
6566				(void) mac_add_macaddr(mip, tgrp, maddr, prim);
6567				mac_rx_client_restart(
6568				    (mac_client_handle_t)gmcip);
6569			}
6570		}
6571		mgcp = mgcp->mgc_next;
6572	}
6573}
6574
6575
6576/*
6577 * Move the MAC address from fgrp to tgrp. If this is the primary client,
6578 * we need to take any VLANs etc. together too.
6579 */
6580static int
6581mac_rx_move_macaddr(mac_client_impl_t *mcip, mac_group_t *fgrp,
6582    mac_group_t *tgrp)
6583{
6584	mac_impl_t		*mip = mcip->mci_mip;
6585	uint8_t			maddr[MAXMACADDRLEN];
6586	int			err = 0;
6587	boolean_t		prim;
6588	boolean_t		multiclnt = B_FALSE;
6589
6590	mac_rx_client_quiesce((mac_client_handle_t)mcip);
6591	ASSERT(mcip->mci_unicast != NULL);
6592	bcopy(mcip->mci_unicast->ma_addr, maddr, mcip->mci_unicast->ma_len);
6593
6594	prim = (mcip->mci_state_flags & MCIS_UNICAST_HW) != 0;
6595	if (mcip->mci_unicast->ma_nusers > 1) {
6596		mac_rx_move_macaddr_prim(mcip, fgrp, NULL, maddr, B_FALSE);
6597		multiclnt = B_TRUE;
6598	}
6599	ASSERT(mcip->mci_unicast->ma_nusers == 1);
6600	err = mac_remove_macaddr(mcip->mci_unicast);
6601	if (err != 0) {
6602		mac_rx_client_restart((mac_client_handle_t)mcip);
6603		if (multiclnt) {
6604			mac_rx_move_macaddr_prim(mcip, fgrp, fgrp, maddr,
6605			    B_TRUE);
6606		}
6607		return (err);
6608	}
6609	/*
6610	 * Program the H/W Classifier first; if this fails, we need
6611	 * not proceed with the rest.
6612	 */
6613	if ((err = mac_add_macaddr(mip, tgrp, maddr, prim)) != 0) {
6614		/* Revert the H/W Classifier */
6615		if ((err = mac_add_macaddr(mip, fgrp, maddr, prim)) != 0) {
6616			/*
6617			 * This should not fail now since it worked earlier,
6618			 * should we panic?
6619			 */
6620			cmn_err(CE_WARN,
6621			    "mac_rx_switch_group: switching %p back"
6622			    " to group %p failed!!", (void *)mcip,
6623			    (void *)fgrp);
6624		}
6625		mac_rx_client_restart((mac_client_handle_t)mcip);
6626		if (multiclnt) {
6627			mac_rx_move_macaddr_prim(mcip, fgrp, fgrp, maddr,
6628			    B_TRUE);
6629		}
6630		return (err);
6631	}
6632	mcip->mci_unicast = mac_find_macaddr(mip, maddr);
6633	mac_rx_client_restart((mac_client_handle_t)mcip);
6634	if (multiclnt)
6635		mac_rx_move_macaddr_prim(mcip, fgrp, tgrp, maddr, B_TRUE);
6636	return (err);
6637}
6638
6639/*
6640 * Switch the MAC client from one group to another. This means we need
6641 * to remove the MAC address from the group, remove the MAC client,
6642 * teardown the SRSs and revert the group state. Then, we add the client
6643 * to the destination group, set the SRSs, and add the MAC address to the
6644 * group.
6645 */
6646int
6647mac_rx_switch_group(mac_client_impl_t *mcip, mac_group_t *fgrp,
6648    mac_group_t *tgrp)
6649{
6650	int			err;
6651	mac_group_state_t	next_state;
6652	mac_client_impl_t	*group_only_mcip;
6653	mac_client_impl_t	*gmcip;
6654	mac_impl_t		*mip = mcip->mci_mip;
6655	mac_grp_client_t	*mgcp;
6656
6657	ASSERT(fgrp == mcip->mci_flent->fe_rx_ring_group);
6658
6659	if ((err = mac_rx_move_macaddr(mcip, fgrp, tgrp)) != 0)
6660		return (err);
6661
6662	/*
6663	 * The group might be reserved, but SRSs may not be set up, e.g.
6664	 * primary and its vlans using a reserved group.
6665	 */
6666	if (fgrp->mrg_state == MAC_GROUP_STATE_RESERVED &&
6667	    MAC_GROUP_ONLY_CLIENT(fgrp) != NULL) {
6668		mac_rx_srs_group_teardown(mcip->mci_flent, B_TRUE);
6669	}
6670	if (fgrp != MAC_DEFAULT_RX_GROUP(mip)) {
6671		mgcp = fgrp->mrg_clients;
6672		while (mgcp != NULL) {
6673			gmcip = mgcp->mgc_client;
6674			mgcp = mgcp->mgc_next;
6675			mac_group_remove_client(fgrp, gmcip);
6676			mac_group_add_client(tgrp, gmcip);
6677			gmcip->mci_flent->fe_rx_ring_group = tgrp;
6678		}
6679		mac_release_rx_group(mcip, fgrp);
6680		ASSERT(MAC_GROUP_NO_CLIENT(fgrp));
6681		mac_set_group_state(fgrp, MAC_GROUP_STATE_REGISTERED);
6682	} else {
6683		mac_group_remove_client(fgrp, mcip);
6684		mac_group_add_client(tgrp, mcip);
6685		mcip->mci_flent->fe_rx_ring_group = tgrp;
6686		/*
6687		 * If there are other clients (VLANs) sharing this address
6688		 * we should be here only for the primary.
6689		 */
6690		if (mcip->mci_unicast->ma_nusers > 1) {
6691			/*
6692			 * We need to move all the clients that are using
6693			 * this h/w address.
6694			 */
6695			mgcp = fgrp->mrg_clients;
6696			while (mgcp != NULL) {
6697				gmcip = mgcp->mgc_client;
6698				mgcp = mgcp->mgc_next;
6699				if (mcip->mci_unicast == gmcip->mci_unicast) {
6700					mac_group_remove_client(fgrp, gmcip);
6701					mac_group_add_client(tgrp, gmcip);
6702					gmcip->mci_flent->fe_rx_ring_group =
6703					    tgrp;
6704				}
6705			}
6706		}
6707		/*
6708		 * The default group will still take the multicast,
6709		 * broadcast traffic etc., so it won't go to
6710		 * MAC_GROUP_STATE_REGISTERED.
6711		 */
6712		if (fgrp->mrg_state == MAC_GROUP_STATE_RESERVED)
6713			mac_rx_group_unmark(fgrp, MR_CONDEMNED);
6714		mac_set_group_state(fgrp, MAC_GROUP_STATE_SHARED);
6715	}
6716	next_state = mac_group_next_state(tgrp, &group_only_mcip,
6717	    MAC_DEFAULT_RX_GROUP(mip), B_TRUE);
6718	mac_set_group_state(tgrp, next_state);
6719	/*
6720	 * If the destination group is reserved, setup the SRSs etc.
6721	 */
6722	if (tgrp->mrg_state == MAC_GROUP_STATE_RESERVED) {
6723		mac_rx_srs_group_setup(mcip, mcip->mci_flent, SRST_LINK);
6724		mac_fanout_setup(mcip, mcip->mci_flent,
6725		    MCIP_RESOURCE_PROPS(mcip), mac_rx_deliver, mcip, NULL,
6726		    NULL);
6727		mac_rx_group_unmark(tgrp, MR_INCIPIENT);
6728	} else {
6729		mac_rx_switch_grp_to_sw(tgrp);
6730	}
6731	return (0);
6732}
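
/*
 * The overall switch sequence, in the style of the examples above:
 *
 * mac_rx_switch_group()
 * {
 *	mac_rx_move_macaddr();		quiesce, reprogram the classifier
 *
 *	move the client(s) fgrp -> tgrp	including VLANs sharing the address
 *	revert fgrp's state		REGISTERED or SHARED
 *
 *	mac_set_group_state(tgrp);	then set up the SRSes and fanout
 *					if tgrp became RESERVED
 * }
 */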
6733
6734/*
6735 * Reserves a TX group for the specified share. Invoked by mac_tx_srs_setup()
6736 * when a share was allocated to the client.
6737 */
6738mac_group_t *
6739mac_reserve_tx_group(mac_client_impl_t *mcip, boolean_t move)
6740{
6741	mac_impl_t		*mip = mcip->mci_mip;
6742	mac_group_t		*grp = NULL;
6743	int			rv;
6744	int			i;
6745	int			err;
6746	mac_group_t		*defgrp;
6747	mac_share_handle_t	share = mcip->mci_share;
6748	mac_resource_props_t	*mrp = MCIP_RESOURCE_PROPS(mcip);
6749	int			nrings;
6750	int			defnrings;
6751	boolean_t		need_exclgrp = B_FALSE;
6752	int			need_rings = 0;
6753	mac_group_t		*candidate_grp = NULL;
6754	mac_client_impl_t	*gclient;
6755	mac_resource_props_t	*gmrp;
6756	boolean_t		txhw = mrp->mrp_mask & MRP_TX_RINGS;
6757	boolean_t		unspec = mrp->mrp_mask & MRP_TXRINGS_UNSPEC;
6758	boolean_t		isprimary;
6759
6760	isprimary = mcip->mci_flent->fe_type & FLOW_PRIMARY_MAC;
6761	/*
6762	 * When we come here for a VLAN on the primary (dladm create-vlan),
6763	 * we need to pair it along with the primary (to keep it consistent
6764	 * with the RX side). So, we check if the primary is already assigned
6765	 * to a group and return the group if so. The other way is also
6766	 * true, i.e. the VLAN is already created and now we are plumbing
6767	 * the primary.
6768	 */
6769	if (!move && isprimary) {
6770		for (gclient = mip->mi_clients_list; gclient != NULL;
6771		    gclient = gclient->mci_client_next) {
6772			if (gclient->mci_flent->fe_type & FLOW_PRIMARY_MAC &&
6773			    gclient->mci_flent->fe_tx_ring_group != NULL) {
6774				return (gclient->mci_flent->fe_tx_ring_group);
6775			}
6776		}
6777	}
6778
6779	if (mip->mi_tx_groups == NULL || mip->mi_tx_group_count == 0)
6780		return (NULL);
6781
6782	/* For dynamic groups, default unspec to 1 */
6783	if (txhw && unspec &&
6784	    mip->mi_tx_group_type == MAC_GROUP_TYPE_DYNAMIC) {
6785		mrp->mrp_ntxrings = 1;
6786	}
6787	/*
6788	 * For static grouping we allow only specifying rings=0 or leaving the
6789	 * ring count unspecified.
6790	 */
6791	if (txhw && mrp->mrp_ntxrings > 0 &&
6792	    mip->mi_tx_group_type == MAC_GROUP_TYPE_STATIC) {
6793		return (NULL);
6794	}
6795
6796	if (txhw) {
6797		/*
6798		 * We have explicitly asked for a group, with ntxrings rings
6799		 * (defaulted to 1 above if unspecified).
6800		 */
6801		if (unspec || mrp->mrp_ntxrings > 0) {
6802			need_exclgrp = B_TRUE;
6803			need_rings = mrp->mrp_ntxrings;
6804		} else if (mrp->mrp_ntxrings == 0) {
6805			/*
6806			 * We have asked for a software group.
6807			 */
6808			return (NULL);
6809		}
6810	}
6811	defgrp = MAC_DEFAULT_TX_GROUP(mip);
6812	/*
6813	 * The number of rings that the default group can donate.
6814	 * We need to leave at least one ring - the default ring - in
6815	 * this group.
6816	 */
6817	defnrings = defgrp->mrg_cur_count - 1;
6818
6819	/*
6820	 * Primary gets default group unless explicitly told not
6821	 * to (i.e. rings > 0).
6822	 */
6823	if (isprimary && !need_exclgrp)
6824		return (NULL);
6825
6826	nrings = (mrp->mrp_mask & MRP_TX_RINGS) != 0 ? mrp->mrp_ntxrings : 1;
6827	for (i = 0; i <  mip->mi_tx_group_count; i++) {
6828		grp = &mip->mi_tx_groups[i];
6829		if ((grp->mrg_state == MAC_GROUP_STATE_RESERVED) ||
6830		    (grp->mrg_state == MAC_GROUP_STATE_UNINIT)) {
6831			/*
6832			 * Select a candidate for replacement if we don't
6833			 * get an exclusive group. A candidate group is one
6834			 * that didn't ask for an exclusive group, but got
6835			 * one and it has enough rings (combined with what
6836			 * the default group can donate) for the new MAC
6837			 * client.
6838			 */
6839			if (grp->mrg_state == MAC_GROUP_STATE_RESERVED &&
6840			    candidate_grp == NULL) {
6841				gclient = MAC_GROUP_ONLY_CLIENT(grp);
6842				if (gclient == NULL)
6843					gclient = mac_get_grp_primary(grp);
6844				gmrp = MCIP_RESOURCE_PROPS(gclient);
6845				if (gclient->mci_share == NULL &&
6846				    (gmrp->mrp_mask & MRP_TX_RINGS) == 0 &&
6847				    (unspec ||
6848				    (grp->mrg_cur_count + defnrings) >=
6849				    need_rings)) {
6850					candidate_grp = grp;
6851				}
6852			}
6853			continue;
6854		}
6855		/*
6856		 * If the default group can't donate, let's just walk and
6857		 * see if someone can vacate a group, so that we have
6858		 * enough rings for this.
6859		 */
6860		if (mip->mi_tx_group_type != MAC_GROUP_TYPE_DYNAMIC ||
6861		    nrings <= defnrings) {
6862			if (grp->mrg_state == MAC_GROUP_STATE_REGISTERED) {
6863				rv = mac_start_group(grp);
6864				ASSERT(rv == 0);
6865			}
6866			break;
6867		}
6868	}
6869
6870	/* The default group */
6871	if (i >= mip->mi_tx_group_count) {
6872		/*
6873		 * If we need an exclusive group and have identified a
6874		 * candidate group we switch the MAC client from the
6875		 * candidate group to the default group and give the
6876		 * candidate group to this client.
6877		 */
6878		if (need_exclgrp && candidate_grp != NULL) {
6879			/*
6880			 * Switch the MAC client from the candidate group
6881			 * to the default group.
6882			 */
6883			grp = candidate_grp;
6884			gclient = MAC_GROUP_ONLY_CLIENT(grp);
6885			if (gclient == NULL)
6886				gclient = mac_get_grp_primary(grp);
6887			mac_tx_client_quiesce((mac_client_handle_t)gclient);
6888			mac_tx_switch_group(gclient, grp, defgrp);
6889			mac_tx_client_restart((mac_client_handle_t)gclient);
6890
6891			/*
6892			 * Give the candidate group with the specified number
6893			 * of rings to this MAC client.
6894			 */
6895			ASSERT(grp->mrg_state == MAC_GROUP_STATE_REGISTERED);
6896			rv = mac_start_group(grp);
6897			ASSERT(rv == 0);
6898
6899			if (mip->mi_tx_group_type != MAC_GROUP_TYPE_DYNAMIC)
6900				return (grp);
6901
6902			ASSERT(grp->mrg_cur_count == 0);
6903			ASSERT(defgrp->mrg_cur_count > need_rings);
6904
6905			err = i_mac_group_allocate_rings(mip, MAC_RING_TYPE_TX,
6906			    defgrp, grp, share, need_rings);
6907			if (err == 0) {
6908				/*
6909				 * For a share i_mac_group_allocate_rings gets
6910				 * the rings from the driver, let's populate
6911				 * the property for the client now.
6912				 */
6913				if (share != NULL) {
6914					mac_client_set_rings(
6915					    (mac_client_handle_t)mcip, -1,
6916					    grp->mrg_cur_count);
6917				}
6918				mip->mi_tx_group_free--;
6919				return (grp);
6920			}
6921			DTRACE_PROBE3(tx__group__reserve__alloc__rings, char *,
6922			    mip->mi_name, int, grp->mrg_index, int, err);
6923			mac_stop_group(grp);
6924		}
6925		return (NULL);
6926	}
6927	/*
6928	 * We got an exclusive group, but it is not dynamic.
6929	 */
6930	if (mip->mi_tx_group_type != MAC_GROUP_TYPE_DYNAMIC) {
6931		mip->mi_tx_group_free--;
6932		return (grp);
6933	}
6934
6935	rv = i_mac_group_allocate_rings(mip, MAC_RING_TYPE_TX, defgrp, grp,
6936	    share, nrings);
6937	if (rv != 0) {
6938		DTRACE_PROBE3(tx__group__reserve__alloc__rings,
6939		    char *, mip->mi_name, int, grp->mrg_index, int, rv);
6940		mac_stop_group(grp);
6941		return (NULL);
6942	}
6943	/*
6944	 * For a share i_mac_group_allocate_rings gets the rings from the
6945	 * driver, let's populate the property for the client now.
6946	 */
6947	if (share != NULL) {
6948		mac_client_set_rings((mac_client_handle_t)mcip, -1,
6949		    grp->mrg_cur_count);
6950	}
6951	mip->mi_tx_group_free--;
6952	return (grp);
6953}
6954
6955void
6956mac_release_tx_group(mac_client_impl_t *mcip, mac_group_t *grp)
6957{
6958	mac_impl_t		*mip = mcip->mci_mip;
6959	mac_share_handle_t	share = mcip->mci_share;
6960	mac_ring_t		*ring;
6961	mac_soft_ring_set_t	*srs = MCIP_TX_SRS(mcip);
6962	mac_group_t		*defgrp;
6963
6964	defgrp = MAC_DEFAULT_TX_GROUP(mip);
6965	if (srs != NULL) {
6966		if (srs->srs_soft_ring_count > 0) {
6967			for (ring = grp->mrg_rings; ring != NULL;
6968			    ring = ring->mr_next) {
6969				ASSERT(mac_tx_srs_ring_present(srs, ring));
6970				mac_tx_invoke_callbacks(mcip,
6971				    (mac_tx_cookie_t)
6972				    mac_tx_srs_get_soft_ring(srs, ring));
6973				mac_tx_srs_del_ring(srs, ring);
6974			}
6975		} else {
6976			ASSERT(srs->srs_tx.st_arg2 != NULL);
6977			srs->srs_tx.st_arg2 = NULL;
6978			mac_srs_stat_delete(srs);
6979		}
6980	}
6981	if (share != NULL)
6982		mip->mi_share_capab.ms_sremove(share, grp->mrg_driver);
6983
6984	/* move the ring back to the pool */
6985	if (mip->mi_tx_group_type == MAC_GROUP_TYPE_DYNAMIC) {
6986		while ((ring = grp->mrg_rings) != NULL)
6987			(void) mac_group_mov_ring(mip, defgrp, ring);
6988	}
6989	mac_stop_group(grp);
6990	mip->mi_tx_group_free++;
6991}
6992
6993/*
6994 * Disassociate a MAC client from a group, i.e. go through the rings in the
6995 * group and delete all the soft rings tied to them.
6996 */
6997static void
6998mac_tx_dismantle_soft_rings(mac_group_t *fgrp, flow_entry_t *flent)
6999{
7000	mac_client_impl_t	*mcip = flent->fe_mcip;
7001	mac_soft_ring_set_t	*tx_srs;
7002	mac_srs_tx_t		*tx;
7003	mac_ring_t		*ring;
7004
7005	tx_srs = flent->fe_tx_srs;
7006	tx = &tx_srs->srs_tx;
7007
7008	/* Single-ring case: we haven't created any soft rings */
7009	if (tx->st_mode == SRS_TX_BW || tx->st_mode == SRS_TX_SERIALIZE ||
7010	    tx->st_mode == SRS_TX_DEFAULT) {
7011		tx->st_arg2 = NULL;
7012		mac_srs_stat_delete(tx_srs);
7013	/* Fanout case, where we have to dismantle the soft rings */
7014	} else {
7015		for (ring = fgrp->mrg_rings; ring != NULL;
7016		    ring = ring->mr_next) {
7017			ASSERT(mac_tx_srs_ring_present(tx_srs, ring));
7018			mac_tx_invoke_callbacks(mcip,
7019			    (mac_tx_cookie_t)mac_tx_srs_get_soft_ring(tx_srs,
7020			    ring));
7021			mac_tx_srs_del_ring(tx_srs, ring);
7022		}
7023		ASSERT(tx->st_arg2 == NULL);
7024	}
7025}
7026
7027/*
7028 * Switch the MAC client from one group to another. This means we need
7029 * to remove the MAC client, teardown the SRSs and revert the group state.
7030 * Then, we add the client to the destination group, set up the SRSes, etc.
7031 */
7032void
7033mac_tx_switch_group(mac_client_impl_t *mcip, mac_group_t *fgrp,
7034    mac_group_t *tgrp)
7035{
7036	mac_client_impl_t	*group_only_mcip;
7037	mac_impl_t		*mip = mcip->mci_mip;
7038	flow_entry_t		*flent = mcip->mci_flent;
7039	mac_group_t		*defgrp;
7040	mac_grp_client_t	*mgcp;
7041	mac_client_impl_t	*gmcip;
7042	flow_entry_t		*gflent;
7043
7044	defgrp = MAC_DEFAULT_TX_GROUP(mip);
7045	ASSERT(fgrp == flent->fe_tx_ring_group);
7046
7047	if (fgrp == defgrp) {
7048		/*
7049		 * If this is the primary we need to find any VLANs on
7050		 * the primary and move them too.
7051		 */
7052		mac_group_remove_client(fgrp, mcip);
7053		mac_tx_dismantle_soft_rings(fgrp, flent);
7054		if (mcip->mci_unicast->ma_nusers > 1) {
7055			mgcp = fgrp->mrg_clients;
7056			while (mgcp != NULL) {
7057				gmcip = mgcp->mgc_client;
7058				mgcp = mgcp->mgc_next;
7059				if (mcip->mci_unicast != gmcip->mci_unicast)
7060					continue;
7061				mac_tx_client_quiesce(
7062				    (mac_client_handle_t)gmcip);
7063
7064				gflent = gmcip->mci_flent;
7065				mac_group_remove_client(fgrp, gmcip);
7066				mac_tx_dismantle_soft_rings(fgrp, gflent);
7067
7068				mac_group_add_client(tgrp, gmcip);
7069				gflent->fe_tx_ring_group = tgrp;
7070				/* We could directly set this to SHARED */
7071				tgrp->mrg_state = mac_group_next_state(tgrp,
7072				    &group_only_mcip, defgrp, B_FALSE);
7073
7074				mac_tx_srs_group_setup(gmcip, gflent,
7075				    SRST_LINK);
7076				mac_fanout_setup(gmcip, gflent,
7077				    MCIP_RESOURCE_PROPS(gmcip), mac_rx_deliver,
7078				    gmcip, NULL, NULL);
7079
7080				mac_tx_client_restart(
7081				    (mac_client_handle_t)gmcip);
7082			}
7083		}
7084		if (MAC_GROUP_NO_CLIENT(fgrp)) {
7085			mac_ring_t	*ring;
7086			int		cnt;
7087			int		ringcnt;
7088
7089			fgrp->mrg_state = MAC_GROUP_STATE_REGISTERED;
7090			/*
7091			 * Additionally, we also need to stop all
7092			 * the rings in the default group, except
7093			 * the default ring. This group won't be
7094			 * released, since it is the default group,
7095			 * so the rings would not otherwise be
7096			 * stopped.
7097			 */
7098			ringcnt = fgrp->mrg_cur_count;
7099			ring = fgrp->mrg_rings;
7100			for (cnt = 0; cnt < ringcnt; cnt++) {
7101				if (ring->mr_state == MR_INUSE &&
7102				    ring !=
7103				    (mac_ring_t *)mip->mi_default_tx_ring) {
7104					mac_stop_ring(ring);
7105					ring->mr_flag = 0;
7106				}
7107				ring = ring->mr_next;
7108			}
7109		} else if (MAC_GROUP_ONLY_CLIENT(fgrp) != NULL) {
7110			fgrp->mrg_state = MAC_GROUP_STATE_RESERVED;
7111		} else {
7112			ASSERT(fgrp->mrg_state == MAC_GROUP_STATE_SHARED);
7113		}
7114	} else {
7115		/*
7116		 * We could have VLANs sharing the non-default group with
7117		 * the primary.
7118		 */
7119		mgcp = fgrp->mrg_clients;
7120		while (mgcp != NULL) {
7121			gmcip = mgcp->mgc_client;
7122			mgcp = mgcp->mgc_next;
7123			if (gmcip == mcip)
7124				continue;
7125			mac_tx_client_quiesce((mac_client_handle_t)gmcip);
7126			gflent = gmcip->mci_flent;
7127
7128			mac_group_remove_client(fgrp, gmcip);
7129			mac_tx_dismantle_soft_rings(fgrp, gflent);
7130
7131			mac_group_add_client(tgrp, gmcip);
7132			gflent->fe_tx_ring_group = tgrp;
7133			/* We could directly set this to SHARED */
7134			tgrp->mrg_state = mac_group_next_state(tgrp,
7135			    &group_only_mcip, defgrp, B_FALSE);
7136			mac_tx_srs_group_setup(gmcip, gflent, SRST_LINK);
7137			mac_fanout_setup(gmcip, gflent,
7138			    MCIP_RESOURCE_PROPS(gmcip), mac_rx_deliver,
7139			    gmcip, NULL, NULL);
7140
7141			mac_tx_client_restart((mac_client_handle_t)gmcip);
7142		}
7143		mac_group_remove_client(fgrp, mcip);
7144		mac_release_tx_group(mcip, fgrp);
7145		fgrp->mrg_state = MAC_GROUP_STATE_REGISTERED;
7146	}
7147
7148	/* Add it to the tgroup */
7149	mac_group_add_client(tgrp, mcip);
7150	flent->fe_tx_ring_group = tgrp;
7151	tgrp->mrg_state = mac_group_next_state(tgrp, &group_only_mcip,
7152	    defgrp, B_FALSE);
7153
7154	mac_tx_srs_group_setup(mcip, flent, SRST_LINK);
7155	mac_fanout_setup(mcip, flent, MCIP_RESOURCE_PROPS(mcip),
7156	    mac_rx_deliver, mcip, NULL, NULL);
7157}
7158
7159/*
7160 * This is a 1-time control path activity initiated by the client (IP).
7161 * The mac perimeter protects against other simultaneous control activities,
7162 * for example an ioctl that attempts to change the degree of fanout and
7163 * increase or decrease the number of softrings associated with this Tx SRS.
7164 */
7165static mac_tx_notify_cb_t *
7166mac_client_tx_notify_add(mac_client_impl_t *mcip,
7167    mac_tx_notify_t notify, void *arg)
7168{
7169	mac_cb_info_t *mcbi;
7170	mac_tx_notify_cb_t *mtnfp;
7171
7172	ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip));
7173
7174	mtnfp = kmem_zalloc(sizeof (mac_tx_notify_cb_t), KM_SLEEP);
7175	mtnfp->mtnf_fn = notify;
7176	mtnfp->mtnf_arg = arg;
7177	mtnfp->mtnf_link.mcb_objp = mtnfp;
7178	mtnfp->mtnf_link.mcb_objsize = sizeof (mac_tx_notify_cb_t);
7179	mtnfp->mtnf_link.mcb_flags = MCB_TX_NOTIFY_CB_T;
7180
7181	mcbi = &mcip->mci_tx_notify_cb_info;
7182	mutex_enter(mcbi->mcbi_lockp);
7183	mac_callback_add(mcbi, &mcip->mci_tx_notify_cb_list, &mtnfp->mtnf_link);
7184	mutex_exit(mcbi->mcbi_lockp);
7185	return (mtnfp);
7186}
7187
7188static void
7189mac_client_tx_notify_remove(mac_client_impl_t *mcip, mac_tx_notify_cb_t *mtnfp)
7190{
7191	mac_cb_info_t	*mcbi;
7192	mac_cb_t	**cblist;
7193
7194	ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip));
7195
7196	if (!mac_callback_find(&mcip->mci_tx_notify_cb_info,
7197	    &mcip->mci_tx_notify_cb_list, &mtnfp->mtnf_link)) {
7198		cmn_err(CE_WARN,
7199		    "mac_client_tx_notify_remove: callback not "
7200		    "found, mcip 0x%p mtnfp 0x%p", (void *)mcip, (void *)mtnfp);
7201		return;
7202	}
7203
7204	mcbi = &mcip->mci_tx_notify_cb_info;
7205	cblist = &mcip->mci_tx_notify_cb_list;
7206	mutex_enter(mcbi->mcbi_lockp);
7207	if (mac_callback_remove(mcbi, cblist, &mtnfp->mtnf_link))
7208		kmem_free(mtnfp, sizeof (mac_tx_notify_cb_t));
7209	else
7210		mac_callback_remove_wait(&mcip->mci_tx_notify_cb_info);
7211	mutex_exit(mcbi->mcbi_lockp);
7212}
7213
7214/*
7215 * mac_client_tx_notify():
7216 * Called to add or remove a flow control callback routine.
7217 */
7218mac_tx_notify_handle_t
7219mac_client_tx_notify(mac_client_handle_t mch, mac_tx_notify_t callb_func,
7220    void *ptr)
7221{
7222	mac_client_impl_t	*mcip = (mac_client_impl_t *)mch;
7223	mac_tx_notify_cb_t	*mtnfp = NULL;
7224
7225	i_mac_perim_enter(mcip->mci_mip);
7226
7227	if (callb_func != NULL) {
7228		/* Add a notify callback */
7229		mtnfp = mac_client_tx_notify_add(mcip, callb_func, ptr);
7230	} else {
7231		mac_client_tx_notify_remove(mcip, (mac_tx_notify_cb_t *)ptr);
7232	}
7233	i_mac_perim_exit(mcip->mci_mip);
7234
7235	return ((mac_tx_notify_handle_t)mtnfp);
7236}
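
/*
 * For example, a client would register and later remove a flow control
 * callback as follows (my_tx_notify_fn and my_arg are hypothetical):
 *
 *	mac_tx_notify_handle_t	h;
 *
 *	h = mac_client_tx_notify(mch, my_tx_notify_fn, my_arg);    add
 *	...
 *	(void) mac_client_tx_notify(mch, NULL, (void *)h);	   remove
 */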
7237
7238void
7239mac_bridge_vectors(mac_bridge_tx_t txf, mac_bridge_rx_t rxf,
7240    mac_bridge_ref_t reff, mac_bridge_ls_t lsf)
7241{
7242	mac_bridge_tx_cb = txf;
7243	mac_bridge_rx_cb = rxf;
7244	mac_bridge_ref_cb = reff;
7245	mac_bridge_ls_cb = lsf;
7246}
7247
7248int
7249mac_bridge_set(mac_handle_t mh, mac_handle_t link)
7250{
7251	mac_impl_t *mip = (mac_impl_t *)mh;
7252	int retv;
7253
7254	mutex_enter(&mip->mi_bridge_lock);
7255	if (mip->mi_bridge_link == NULL) {
7256		mip->mi_bridge_link = link;
7257		retv = 0;
7258	} else {
7259		retv = EBUSY;
7260	}
7261	mutex_exit(&mip->mi_bridge_lock);
7262	if (retv == 0) {
7263		mac_poll_state_change(mh, B_FALSE);
7264		mac_capab_update(mh);
7265	}
7266	return (retv);
7267}
7268
7269/*
7270 * Disable bridging on the indicated link.
7271 */
7272void
7273mac_bridge_clear(mac_handle_t mh, mac_handle_t link)
7274{
7275	mac_impl_t *mip = (mac_impl_t *)mh;
7276
7277	mutex_enter(&mip->mi_bridge_lock);
7278	ASSERT(mip->mi_bridge_link == link);
7279	mip->mi_bridge_link = NULL;
7280	mutex_exit(&mip->mi_bridge_lock);
7281	mac_poll_state_change(mh, B_TRUE);
7282	mac_capab_update(mh);
7283}
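
/*
 * A bridge module would typically tie a link to a bridge instance and later
 * detach it (a sketch; error handling elided):
 *
 *	if (mac_bridge_set(mh, link) == EBUSY)
 *		the mac is already bridged; fail the attach
 *	...
 *	mac_bridge_clear(mh, link);	must pass the same link handle
 */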
7284
7285void
7286mac_no_active(mac_handle_t mh)
7287{
7288	mac_impl_t *mip = (mac_impl_t *)mh;
7289
7290	i_mac_perim_enter(mip);
7291	mip->mi_state_flags |= MIS_NO_ACTIVE;
7292	i_mac_perim_exit(mip);
7293}
7294
7295/*
7296 * Walk the primary VLAN clients whenever the primary's rings property
7297 * changes and update the mac_resource_props_t for the VLAN's client.
7298 * We need to do this since we don't support setting these properties
7299 * on the primary's VLAN clients, but the VLAN clients have to
7300 * follow the primary w.r.t the rings property;
7301 */
7302void
7303mac_set_prim_vlan_rings(mac_impl_t  *mip, mac_resource_props_t *mrp)
7304{
7305	mac_client_impl_t	*vmcip;
7306	mac_resource_props_t	*vmrp;
7307
7308	for (vmcip = mip->mi_clients_list; vmcip != NULL;
7309	    vmcip = vmcip->mci_client_next) {
7310		if (!(vmcip->mci_flent->fe_type & FLOW_PRIMARY_MAC) ||
7311		    mac_client_vid((mac_client_handle_t)vmcip) ==
7312		    VLAN_ID_NONE) {
7313			continue;
7314		}
7315		vmrp = MCIP_RESOURCE_PROPS(vmcip);
7316
7317		vmrp->mrp_nrxrings =  mrp->mrp_nrxrings;
7318		if (mrp->mrp_mask & MRP_RX_RINGS)
7319			vmrp->mrp_mask |= MRP_RX_RINGS;
7320		else if (vmrp->mrp_mask & MRP_RX_RINGS)
7321			vmrp->mrp_mask &= ~MRP_RX_RINGS;
7322
7323		vmrp->mrp_ntxrings =  mrp->mrp_ntxrings;
7324		if (mrp->mrp_mask & MRP_TX_RINGS)
7325			vmrp->mrp_mask |= MRP_TX_RINGS;
7326		else if (vmrp->mrp_mask & MRP_TX_RINGS)
7327			vmrp->mrp_mask &= ~MRP_TX_RINGS;
7328
7329		if (mrp->mrp_mask & MRP_RXRINGS_UNSPEC)
7330			vmrp->mrp_mask |= MRP_RXRINGS_UNSPEC;
7331		else
7332			vmrp->mrp_mask &= ~MRP_RXRINGS_UNSPEC;
7333
7334		if (mrp->mrp_mask & MRP_TXRINGS_UNSPEC)
7335			vmrp->mrp_mask |= MRP_TXRINGS_UNSPEC;
7336		else
7337			vmrp->mrp_mask &= ~MRP_TXRINGS_UNSPEC;
7338	}
7339}
7340
7341/*
7342 * We are adding or removing ring(s) from a group. The default group is
7343 * both the source when rings are taken and the destination when rings
7344 * are given back.
7345 */
7346int
7347mac_group_ring_modify(mac_client_impl_t *mcip, mac_group_t *group,
7348    mac_group_t *defgrp)
7349{
7350	mac_resource_props_t	*mrp = MCIP_RESOURCE_PROPS(mcip);
7351	uint_t			modify;
7352	int			count;
7353	mac_ring_t		*ring;
7354	mac_ring_t		*next;
7355	mac_impl_t		*mip = mcip->mci_mip;
7356	mac_ring_t		**rings;
7357	uint_t			ringcnt;
7358	int			i = 0;
7359	boolean_t		rx_group = group->mrg_type == MAC_RING_TYPE_RX;
7360	int			start;
7361	int			end;
7362	mac_group_t		*tgrp;
7363	int			j;
7364	int			rv = 0;
7365
7366	/*
7367	 * If we are asked for just a group, we give 1 ring, else
7368	 * the specified number of rings.
7369	 */
7370	if (rx_group) {
7371		ringcnt = (mrp->mrp_mask & MRP_RXRINGS_UNSPEC) ? 1:
7372		    mrp->mrp_nrxrings;
7373	} else {
7374		ringcnt = (mrp->mrp_mask & MRP_TXRINGS_UNSPEC) ? 1:
7375		    mrp->mrp_ntxrings;
7376	}

	/* don't allow modifying rings for a share for now. */
	ASSERT(mcip->mci_share == NULL);

	if (ringcnt == group->mrg_cur_count)
		return (0);

	if (group->mrg_cur_count > ringcnt) {
		modify = group->mrg_cur_count - ringcnt;
		if (rx_group) {
			if (mip->mi_rx_donor_grp == group) {
				ASSERT(mac_is_primary_client(mcip));
				mip->mi_rx_donor_grp = defgrp;
			} else {
				defgrp = mip->mi_rx_donor_grp;
			}
		}
		ring = group->mrg_rings;
		rings = kmem_alloc(modify * sizeof (mac_ring_handle_t),
		    KM_SLEEP);
		j = 0;
		for (count = 0; count < modify; count++) {
			next = ring->mr_next;
			rv = mac_group_mov_ring(mip, defgrp, ring);
			if (rv != 0) {
				/* cleanup on failure */
				for (j = 0; j < count; j++) {
					(void) mac_group_mov_ring(mip, group,
					    rings[j]);
				}
				break;
			}
			rings[j++] = ring;
			ring = next;
		}
		kmem_free(rings, modify * sizeof (mac_ring_handle_t));
		return (rv);
	}
	if (ringcnt >= MAX_RINGS_PER_GROUP)
		return (EINVAL);

	modify = ringcnt - group->mrg_cur_count;

	if (rx_group) {
		if (group != mip->mi_rx_donor_grp)
			defgrp = mip->mi_rx_donor_grp;
		else
			/*
			 * This is the donor group with all the remaining
			 * rings. The default group now becomes the donor.
			 */
			mip->mi_rx_donor_grp = defgrp;
		start = 1;
		end = mip->mi_rx_group_count;
	} else {
		start = 0;
		end = mip->mi_tx_group_count - 1;
	}
	/*
	 * If the default group doesn't have enough rings, let's see if we
	 * can take back rings given to an h/w client that doesn't need
	 * them. For now, we just look for a single client that can donate
	 * all the required rings.
	 */
	if (defgrp->mrg_cur_count < (modify + 1)) {
		for (i = start; i < end; i++) {
			if (rx_group) {
				tgrp = &mip->mi_rx_groups[i];
				if (tgrp == group || tgrp->mrg_state <
				    MAC_GROUP_STATE_RESERVED) {
					continue;
				}
				mcip = MAC_GROUP_ONLY_CLIENT(tgrp);
				if (mcip == NULL)
					mcip = mac_get_grp_primary(tgrp);
				ASSERT(mcip != NULL);
				mrp = MCIP_RESOURCE_PROPS(mcip);
				if ((mrp->mrp_mask & MRP_RX_RINGS) != 0)
					continue;
				if ((tgrp->mrg_cur_count +
				    defgrp->mrg_cur_count) < (modify + 1)) {
					continue;
				}
				if (mac_rx_switch_group(mcip, tgrp,
				    defgrp) != 0) {
					return (ENOSPC);
				}
			} else {
				tgrp = &mip->mi_tx_groups[i];
				if (tgrp == group || tgrp->mrg_state <
				    MAC_GROUP_STATE_RESERVED) {
					continue;
				}
				mcip = MAC_GROUP_ONLY_CLIENT(tgrp);
				if (mcip == NULL)
					mcip = mac_get_grp_primary(tgrp);
				mrp = MCIP_RESOURCE_PROPS(mcip);
				if ((mrp->mrp_mask & MRP_TX_RINGS) != 0)
					continue;
				if ((tgrp->mrg_cur_count +
				    defgrp->mrg_cur_count) < (modify + 1)) {
					continue;
				}
				/* OK, we can switch this to s/w */
				mac_tx_client_quiesce(
				    (mac_client_handle_t)mcip);
				mac_tx_switch_group(mcip, tgrp, defgrp);
				mac_tx_client_restart(
				    (mac_client_handle_t)mcip);
			}
		}
		if (defgrp->mrg_cur_count < (modify + 1))
			return (ENOSPC);
	}
	if ((rv = i_mac_group_allocate_rings(mip, group->mrg_type, defgrp,
	    group, mcip->mci_share, modify)) != 0) {
		return (rv);
	}
	return (0);
}
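
/*
 * For example (the counts here are hypothetical): a client owns an RX
 * group with 2 rings and sets its rings property to 4. Then
 *
 *	ringcnt = 4, modify = 4 - 2 = 2
 *
 * and two rings are moved from the donor group into the client's group
 * via i_mac_group_allocate_rings(), possibly after first reclaiming
 * rings from another h/w client that has no explicit rings property.
 * Conversely, setting the property to 1 makes modify = 2 - 1 = 1 and
 * one ring is moved back to the donor group via mac_group_mov_ring().
 */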

/*
 * Given the poolname in mac_resource_props, find the cpupart
 * associated with that pool. The cpupart will be used later for
 * finding the cpus to be bound to the networking threads.
 *
 * use_default is set to B_TRUE if pools are enabled and the cpupart
 * of pool_default is returned. This avoids a second lookup to set
 * the poolname for the effective pool property.
 *
 * Returns:
 *
 *    NULL - pools are disabled or the 'cpus' property is set.
 *    cpupart of pool_default - pools are enabled and the pool is
 *             not available or the poolname is blank.
 *    cpupart of named pool - pools are enabled and the pool is
 *             available.
 */
cpupart_t *
mac_pset_find(mac_resource_props_t *mrp, boolean_t *use_default)
{
	pool_t		*pool;
	cpupart_t	*cpupart;

	*use_default = B_FALSE;

	/* CPUs property is set */
	if (mrp->mrp_mask & MRP_CPUS)
		return (NULL);

	ASSERT(pool_lock_held());

	/* Pools are disabled, no pset */
	if (pool_state == POOL_DISABLED)
		return (NULL);

	/* Pools property is set */
	if (mrp->mrp_mask & MRP_POOL) {
		if ((pool = pool_lookup_pool_by_name(mrp->mrp_pool)) == NULL) {
			/* Pool not found */
			DTRACE_PROBE1(mac_pset_find_no_pool, char *,
			    mrp->mrp_pool);
			*use_default = B_TRUE;
			pool = pool_default;
		}
	/* Pools property is not set */
	} else {
		*use_default = B_TRUE;
		pool = pool_default;
	}

	/* Find the CPU pset that corresponds to the pool */
	mutex_enter(&cpu_lock);
	if ((cpupart = cpupart_find(pool->pool_pset->pset_id)) == NULL) {
		DTRACE_PROBE1(mac_find_pset_no_pset, psetid_t,
		    pool->pool_pset->pset_id);
	}
	mutex_exit(&cpu_lock);

	return (cpupart);
}

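/*
 * Record the effective pool in emrp: when a cpupart was found, set
 * MRP_POOL and copy in the pool name ("pool_default" when use_default
 * is set); otherwise clear MRP_POOL and zero the pool name.
 */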
void
mac_set_pool_effective(boolean_t use_default, cpupart_t *cpupart,
    mac_resource_props_t *mrp, mac_resource_props_t *emrp)
{
	ASSERT(pool_lock_held());

	if (cpupart != NULL) {
		emrp->mrp_mask |= MRP_POOL;
		if (use_default) {
			(void) strcpy(emrp->mrp_pool,
			    "pool_default");
		} else {
			ASSERT(strlen(mrp->mrp_pool) != 0);
			(void) strcpy(emrp->mrp_pool,
			    mrp->mrp_pool);
		}
	} else {
		emrp->mrp_mask &= ~MRP_POOL;
		bzero(emrp->mrp_pool, MAXPATHLEN);
	}
}
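
/*
 * A typical caller holds the pool lock across the lookup and the
 * update of the effective properties, as mac_pool_link_update() does
 * below:
 *
 *	pool_lock();
 *	cpupart = mac_pset_find(mrp, &use_default);
 *	... bind the networking threads to cpupart ...
 *	mac_set_pool_effective(use_default, cpupart, mrp, emrp);
 *	pool_unlock();
 */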

struct mac_pool_arg {
	char		mpa_poolname[MAXPATHLEN];
	pool_event_t	mpa_what;
};

/*ARGSUSED*/
static uint_t
mac_pool_link_update(mod_hash_key_t key, mod_hash_val_t *val, void *arg)
{
	struct mac_pool_arg	*mpa = arg;
	mac_impl_t		*mip = (mac_impl_t *)val;
	mac_client_impl_t	*mcip;
	mac_resource_props_t	*mrp, *emrp;
	boolean_t		pool_update = B_FALSE;
	boolean_t		pool_clear = B_FALSE;
	boolean_t		use_default = B_FALSE;
	cpupart_t		*cpupart = NULL;

	mrp = kmem_zalloc(sizeof (*mrp), KM_SLEEP);
	i_mac_perim_enter(mip);
	for (mcip = mip->mi_clients_list; mcip != NULL;
	    mcip = mcip->mci_client_next) {
		pool_update = B_FALSE;
		pool_clear = B_FALSE;
		use_default = B_FALSE;
		mac_client_get_resources((mac_client_handle_t)mcip, mrp);
		emrp = MCIP_EFFECTIVE_PROPS(mcip);

		/*
		 * Pools have been enabled; update the client's pool
		 * bindings unless the 'cpus' property is set.
		 */
		if ((mpa->mpa_what == POOL_E_ENABLE) &&
		    ((mrp->mrp_mask & MRP_CPUS) == 0)) {
			mrp->mrp_mask |= MRP_POOL;
			pool_update = B_TRUE;
		}

		/*
		 * Pools have been disabled; clear the client's pool
		 * bindings unless the 'cpus' property is set.
		 */
		if ((mpa->mpa_what == POOL_E_DISABLE) &&
		    ((mrp->mrp_mask & MRP_CPUS) == 0)) {
			mrp->mrp_mask |= MRP_POOL;
			pool_clear = B_TRUE;
		}

		/*
		 * Look for links with the pool property set and the poolname
		 * matching the one which is changing.
		 */
		if (strcmp(mrp->mrp_pool, mpa->mpa_poolname) == 0) {
			/*
			 * The pool associated with the link has changed.
			 */
			if (mpa->mpa_what == POOL_E_CHANGE) {
				mrp->mrp_mask |= MRP_POOL;
				pool_update = B_TRUE;
			}
		}

		/*
		 * This link is associated with pool_default and
		 * pool_default has changed.
		 */
		if ((mpa->mpa_what == POOL_E_CHANGE) &&
		    (strcmp(emrp->mrp_pool, "pool_default") == 0) &&
		    (strcmp(mpa->mpa_poolname, "pool_default") == 0)) {
			mrp->mrp_mask |= MRP_POOL;
			pool_update = B_TRUE;
		}

		/*
		 * Get new list of cpus for the pool, bind network
		 * threads to new list of cpus and update resources.
		 */
		if (pool_update) {
			if (MCIP_DATAPATH_SETUP(mcip)) {
				pool_lock();
				cpupart = mac_pset_find(mrp, &use_default);
				mac_fanout_setup(mcip, mcip->mci_flent, mrp,
				    mac_rx_deliver, mcip, NULL, cpupart);
				mac_set_pool_effective(use_default, cpupart,
				    mrp, emrp);
				pool_unlock();
			}
			mac_update_resources(mrp, MCIP_RESOURCE_PROPS(mcip),
			    B_FALSE);
		}

		/*
		 * Clear the effective pool and bind network threads
		 * to any available CPU.
		 */
		if (pool_clear) {
			if (MCIP_DATAPATH_SETUP(mcip)) {
				emrp->mrp_mask &= ~MRP_POOL;
				bzero(emrp->mrp_pool, MAXPATHLEN);
				mac_fanout_setup(mcip, mcip->mci_flent, mrp,
				    mac_rx_deliver, mcip, NULL, NULL);
			}
			mac_update_resources(mrp, MCIP_RESOURCE_PROPS(mcip),
			    B_FALSE);
		}
	}
	i_mac_perim_exit(mip);
	kmem_free(mrp, sizeof (*mrp));
	return (MH_WALK_CONTINUE);
}

static void
mac_pool_update(void *arg)
{
	mod_hash_walk(i_mac_impl_hash, mac_pool_link_update, arg);
	kmem_free(arg, sizeof (struct mac_pool_arg));
}

/*
 * Callback function to be executed when a noteworthy pool event
 * takes place.
 */
/* ARGSUSED */
static void
mac_pool_event_cb(pool_event_t what, poolid_t id, void *arg)
{
	pool_t			*pool;
	char			*poolname = NULL;
	struct mac_pool_arg	*mpa;

	pool_lock();
	mpa = kmem_zalloc(sizeof (struct mac_pool_arg), KM_SLEEP);

	switch (what) {
	case POOL_E_ENABLE:
	case POOL_E_DISABLE:
		break;

	case POOL_E_CHANGE:
		pool = pool_lookup_pool_by_id(id);
		if (pool == NULL) {
			kmem_free(mpa, sizeof (struct mac_pool_arg));
			pool_unlock();
			return;
		}
		pool_get_name(pool, &poolname);
		(void) strlcpy(mpa->mpa_poolname, poolname,
		    sizeof (mpa->mpa_poolname));
		break;

	default:
		kmem_free(mpa, sizeof (struct mac_pool_arg));
		pool_unlock();
		return;
	}
	pool_unlock();

	mpa->mpa_what = what;

	mac_pool_update(mpa);
}
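
/*
 * This callback is expected to be registered with the pools framework
 * at MAC module initialization time. A sketch of such a registration
 * (the pool_event_cb_t field names follow sys/pool.h and are an
 * assumption here, not taken from this file):
 *
 *	static pool_event_cb_t mac_pool_event_reg;
 *
 *	mac_pool_event_reg.pec_func = mac_pool_event_cb;
 *	mac_pool_event_reg.pec_arg = NULL;
 *	pool_event_cb_register(&mac_pool_event_reg);
 */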

/*
 * Set the effective rings property. This could be called from
 * datapath_setup/datapath_teardown or set-linkprop.
 * If the group is reserved we just go ahead and set the effective rings.
 * Additionally, for TX this could mean the default group has lost/gained
 * some rings, so if the default group is reserved, we need to adjust the
 * effective rings for the default group clients. For RX, if we are
 * working with a non-default group, we just need to reset the effective
 * props for the default group clients.
 */
void
mac_set_rings_effective(mac_client_impl_t *mcip)
{
	mac_impl_t		*mip = mcip->mci_mip;
	mac_group_t		*grp;
	mac_group_t		*defgrp;
	flow_entry_t		*flent = mcip->mci_flent;
	mac_resource_props_t	*emrp = MCIP_EFFECTIVE_PROPS(mcip);
	mac_grp_client_t	*mgcp;
	mac_client_impl_t	*gmcip;

	grp = flent->fe_rx_ring_group;
	if (grp != NULL) {
		defgrp = MAC_DEFAULT_RX_GROUP(mip);
		/*
		 * If we have reserved a group, set the effective rings
		 * to the ring count in the group.
		 */
		if (grp->mrg_state == MAC_GROUP_STATE_RESERVED) {
			emrp->mrp_mask |= MRP_RX_RINGS;
			emrp->mrp_nrxrings = grp->mrg_cur_count;
		}

		/*
		 * We go through the clients in the shared group and
		 * reset the effective properties. It is possible this
		 * might have already been done for some client (i.e.
		 * if some client is being moved to a group that is
		 * already shared). The case where the default group is
		 * RESERVED is taken care of above (note in the RX side if
		 * there is a non-default group, the default group is always
		 * SHARED).
		 */
		if (grp != defgrp || grp->mrg_state == MAC_GROUP_STATE_SHARED) {
			if (grp->mrg_state == MAC_GROUP_STATE_SHARED)
				mgcp = grp->mrg_clients;
			else
				mgcp = defgrp->mrg_clients;
			while (mgcp != NULL) {
				gmcip = mgcp->mgc_client;
				emrp = MCIP_EFFECTIVE_PROPS(gmcip);
				if (emrp->mrp_mask & MRP_RX_RINGS) {
					emrp->mrp_mask &= ~MRP_RX_RINGS;
					emrp->mrp_nrxrings = 0;
				}
				mgcp = mgcp->mgc_next;
			}
		}
	}

	/* Now the TX side */
	grp = flent->fe_tx_ring_group;
	if (grp != NULL) {
		defgrp = MAC_DEFAULT_TX_GROUP(mip);

		if (grp->mrg_state == MAC_GROUP_STATE_RESERVED) {
			emrp->mrp_mask |= MRP_TX_RINGS;
			emrp->mrp_ntxrings = grp->mrg_cur_count;
		} else if (grp->mrg_state == MAC_GROUP_STATE_SHARED) {
			mgcp = grp->mrg_clients;
			while (mgcp != NULL) {
				gmcip = mgcp->mgc_client;
				emrp = MCIP_EFFECTIVE_PROPS(gmcip);
				if (emrp->mrp_mask & MRP_TX_RINGS) {
					emrp->mrp_mask &= ~MRP_TX_RINGS;
					emrp->mrp_ntxrings = 0;
				}
				mgcp = mgcp->mgc_next;
			}
		}

		/*
		 * If the group is not the default group and the default
		 * group is reserved, the ring count in the default group
		 * might have changed; update it.
		 */
		if (grp != defgrp &&
		    defgrp->mrg_state == MAC_GROUP_STATE_RESERVED) {
			gmcip = MAC_GROUP_ONLY_CLIENT(defgrp);
			emrp = MCIP_EFFECTIVE_PROPS(gmcip);
			emrp->mrp_ntxrings = defgrp->mrg_cur_count;
		}
	}
}
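
/*
 * For example (the counts here are hypothetical): a client that has
 * reserved an RX group with 4 rings ends up with
 *
 *	emrp->mrp_mask |= MRP_RX_RINGS and emrp->mrp_nrxrings = 4
 *
 * in its effective properties, while every client on a SHARED group
 * has MRP_RX_RINGS cleared and mrp_nrxrings reset to 0, since it no
 * longer owns any rings exclusively. The TX side is handled the same
 * way.
 */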

/*
 * Check if the primary is in the default group. If so, see if we
 * can give it an exclusive group now that another client is
 * being configured. We take the primary out of the default group
 * because the multicast/broadcast packets for all the clients
 * will land in the default ring in the default group, which means
 * any client in the default group, even if it is the only one in
 * the group, will lose exclusive access to the rings and hence
 * the ability to poll them.
 */
mac_client_impl_t *
mac_check_primary_relocation(mac_client_impl_t *mcip, boolean_t rxhw)
{
	mac_impl_t		*mip = mcip->mci_mip;
	mac_group_t		*defgrp = MAC_DEFAULT_RX_GROUP(mip);
	flow_entry_t		*flent = mcip->mci_flent;
	mac_resource_props_t	*mrp = MCIP_RESOURCE_PROPS(mcip);
	uint8_t			*mac_addr;
	mac_group_t		*ngrp;

	/*
	 * Return if the primary is not in the default group, or if its
	 * RX rings property has been explicitly set (i.e. it has been
	 * explicitly configured).
	 */
	if (flent->fe_rx_ring_group != defgrp || mrp->mrp_mask & MRP_RX_RINGS)
		return (NULL);

	/*
	 * If the new client needs an exclusive group and we
	 * don't have another for the primary, return.
	 */
	if (rxhw && mip->mi_rxhwclnt_avail < 2)
		return (NULL);

	mac_addr = flent->fe_flow_desc.fd_dst_mac;
	/*
	 * We call this when we are setting up the datapath for
	 * the first non-primary client.
	 */
	ASSERT(mip->mi_nactiveclients == 2);
	/*
	 * OK, now we have the primary that needs to be relocated.
	 */
	ngrp = mac_reserve_rx_group(mcip, mac_addr, B_TRUE);
	if (ngrp == NULL)
		return (NULL);
	if (mac_rx_switch_group(mcip, defgrp, ngrp) != 0) {
		mac_stop_group(ngrp);
		return (NULL);
	}
	return (mcip);
}
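
/*
 * A hypothetical caller sketch (the surrounding logic is an assumption,
 * not taken from this file): during datapath setup of the first
 * non-primary client,
 *
 *	if (mac_check_primary_relocation(prim_mcip, rxhw) != NULL) {
 *		the primary now owns an exclusive RX group
 *	}
 *
 * A NULL return means the primary was left in the default group.
 */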