vsw_switching.c revision 9336:7c07eb9c53fb
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24 * Use is subject to license terms.
25 */
26
27#include <sys/types.h>
28#include <sys/errno.h>
29#include <sys/debug.h>
30#include <sys/time.h>
31#include <sys/sysmacros.h>
32#include <sys/systm.h>
33#include <sys/user.h>
34#include <sys/stropts.h>
35#include <sys/stream.h>
36#include <sys/strlog.h>
37#include <sys/strsubr.h>
38#include <sys/cmn_err.h>
39#include <sys/cpu.h>
40#include <sys/kmem.h>
41#include <sys/conf.h>
42#include <sys/ddi.h>
43#include <sys/sunddi.h>
44#include <sys/ksynch.h>
45#include <sys/stat.h>
46#include <sys/kstat.h>
47#include <sys/vtrace.h>
48#include <sys/strsun.h>
49#include <sys/dlpi.h>
50#include <sys/ethernet.h>
51#include <net/if.h>
52#include <sys/varargs.h>
53#include <sys/machsystm.h>
54#include <sys/modctl.h>
55#include <sys/modhash.h>
56#include <sys/mac.h>
57#include <sys/mac_ether.h>
58#include <sys/taskq.h>
59#include <sys/note.h>
60#include <sys/mach_descrip.h>
61#include <sys/mdeg.h>
62#include <sys/ldc.h>
63#include <sys/vsw_fdb.h>
64#include <sys/vsw.h>
65#include <sys/vio_mailbox.h>
66#include <sys/vnet_mailbox.h>
67#include <sys/vnet_common.h>
68#include <sys/vio_util.h>
69#include <sys/sdt.h>
70#include <sys/atomic.h>
71#include <sys/vlan.h>
72
73/* Switching setup routines */
74void vsw_setup_switching_thread(void *arg);
75int vsw_setup_switching_start(vsw_t *vswp);
76void vsw_setup_switching_stop(vsw_t *vswp);
77int vsw_setup_switching(vsw_t *);
78void vsw_setup_layer2_post_process(vsw_t *vswp);
79void vsw_switch_frame_nop(vsw_t *vswp, mblk_t *mp, int caller,
80    vsw_port_t *port, mac_resource_handle_t mrh);
81static	int vsw_setup_layer2(vsw_t *);
82static	int vsw_setup_layer3(vsw_t *);
83
84/* Switching/data transmit routines */
85static	void vsw_switch_l2_frame_mac_client(vsw_t *vswp, mblk_t *mp, int caller,
86    vsw_port_t *port, mac_resource_handle_t);
87static	void vsw_switch_l2_frame(vsw_t *vswp, mblk_t *mp, int caller,
88	vsw_port_t *port, mac_resource_handle_t);
89static	void vsw_switch_l3_frame(vsw_t *vswp, mblk_t *mp, int caller,
90	vsw_port_t *port, mac_resource_handle_t);
91static	int vsw_forward_all(vsw_t *vswp, mblk_t *mp,
92	int caller, vsw_port_t *port);
93static	int vsw_forward_grp(vsw_t *vswp, mblk_t *mp,
94    int caller, vsw_port_t *port);
95
96/* VLAN routines */
97void vsw_create_vlans(void *arg, int type);
98void vsw_destroy_vlans(void *arg, int type);
99void vsw_vlan_add_ids(void *arg, int type);
100void vsw_vlan_remove_ids(void *arg, int type);
101static	void vsw_vlan_create_hash(void *arg, int type);
102static	void vsw_vlan_destroy_hash(void *arg, int type);
103boolean_t vsw_frame_lookup_vid(void *arg, int caller, struct ether_header *ehp,
104	uint16_t *vidp);
105mblk_t *vsw_vlan_frame_pretag(void *arg, int type, mblk_t *mp);
106uint32_t vsw_vlan_frames_untag(void *arg, int type, mblk_t **np, mblk_t **npt);
107boolean_t vsw_vlan_lookup(mod_hash_t *vlan_hashp, uint16_t vid);
108
109/* Forwarding database (FDB) routines */
110void vsw_fdbe_add(vsw_t *vswp, void *port);
111void vsw_fdbe_del(vsw_t *vswp, struct ether_addr *eaddr);
112static	vsw_fdbe_t *vsw_fdbe_find(vsw_t *vswp, struct ether_addr *);
113static void vsw_fdbe_find_cb(mod_hash_key_t key, mod_hash_val_t val);
114
115int vsw_add_rem_mcst(vnet_mcast_msg_t *, vsw_port_t *);
116int vsw_add_mcst(vsw_t *, uint8_t, uint64_t, void *);
117int vsw_del_mcst(vsw_t *, uint8_t, uint64_t, void *);
118void vsw_del_mcst_vsw(vsw_t *);
119
120/* Support functions */
121static mblk_t *vsw_dupmsgchain(mblk_t *mp);
122static mblk_t *vsw_get_same_dest_list(struct ether_header *ehp, mblk_t **mpp);
123
124
125/*
126 * Functions imported from other files.
127 */
128extern mblk_t *vsw_tx_msg(vsw_t *, mblk_t *, int, vsw_port_t *);
129extern mcst_addr_t *vsw_del_addr(uint8_t, void *, uint64_t);
130extern int vsw_mac_open(vsw_t *vswp);
131extern void vsw_mac_close(vsw_t *vswp);
132extern void vsw_mac_rx(vsw_t *vswp, mac_resource_handle_t mrh,
133    mblk_t *mp, vsw_macrx_flags_t flags);
134extern void vsw_set_addrs(vsw_t *vswp);
135extern int vsw_portsend(vsw_port_t *port, mblk_t *mp);
136extern void vsw_hio_init(vsw_t *vswp);
137extern void vsw_hio_start_ports(vsw_t *vswp);
138extern int vsw_mac_multicast_add(vsw_t *vswp, vsw_port_t *port,
139    mcst_addr_t *mcst_p, int type);
140extern void vsw_mac_multicast_remove(vsw_t *vswp, vsw_port_t *port,
141    mcst_addr_t *mcst_p, int type);
142extern void vsw_physlink_state_update(vsw_t *vswp);
143
144/*
145 * Tunables used in this file.
146 */
147extern	int vsw_setup_switching_delay;
148extern	uint32_t vsw_vlan_nchains;
149extern	uint32_t vsw_fdbe_refcnt_delay;
150
/*
 * Take a reference on an FDB entry: atomically increment its refcnt and
 * assert that the count did not wrap to zero.
 *
 * The body is wrapped in do/while (0) so that the macro expands to exactly
 * one statement and can be used safely in an unbraced if/else. Callers must
 * terminate the invocation with a semicolon.
 */
#define	VSW_FDBE_REFHOLD(p)						\
do {									\
	atomic_inc_32(&(p)->refcnt);					\
	ASSERT((p)->refcnt != 0);					\
} while (0)
156
/*
 * Drop a reference on an FDB entry: assert that a reference is currently
 * held, then atomically decrement refcnt.
 *
 * The body is wrapped in do/while (0) so that the macro expands to exactly
 * one statement and can be used safely in an unbraced if/else. Callers must
 * terminate the invocation with a semicolon.
 */
#define	VSW_FDBE_REFRELE(p)						\
do {									\
	ASSERT((p)->refcnt != 0);					\
	atomic_dec_32(&(p)->refcnt);					\
} while (0)
162
163/*
164 * Thread to setup switching mode. This thread is created during vsw_attach()
165 * initially. It invokes vsw_setup_switching() and keeps retrying while the
166 * returned value is EAGAIN. The thread exits when the switching mode setup is
167 * done successfully or when the error returned is not EAGAIN. This thread may
168 * also get created from vsw_update_md_prop() if the switching mode needs to be
169 * updated.
170 */
171void
172vsw_setup_switching_thread(void *arg)
173{
174	callb_cpr_t	cprinfo;
175	vsw_t		*vswp =  (vsw_t *)arg;
176	clock_t		wait_time;
177	clock_t		xwait;
178	clock_t		wait_rv;
179	int		rv;
180
181	/* wait time used on successive retries */
182	xwait = drv_usectohz(vsw_setup_switching_delay * MICROSEC);
183
184	CALLB_CPR_INIT(&cprinfo, &vswp->sw_thr_lock, callb_generic_cpr,
185	    "vsw_setup_sw_thread");
186
187	mutex_enter(&vswp->sw_thr_lock);
188
189	while ((vswp->sw_thr_flags & VSW_SWTHR_STOP) == 0) {
190
191		CALLB_CPR_SAFE_BEGIN(&cprinfo);
192
193		/* Wait for sometime before (re)trying setup_switching() */
194		wait_time = ddi_get_lbolt() + xwait;
195		while ((vswp->sw_thr_flags & VSW_SWTHR_STOP) == 0) {
196			wait_rv = cv_timedwait(&vswp->sw_thr_cv,
197			    &vswp->sw_thr_lock, wait_time);
198			if (wait_rv == -1) {	/* timed out */
199				break;
200			}
201		}
202
203		CALLB_CPR_SAFE_END(&cprinfo, &vswp->sw_thr_lock)
204
205		if ((vswp->sw_thr_flags & VSW_SWTHR_STOP) != 0) {
206			/*
207			 * If there is a stop request, process that first and
208			 * exit the loop. Continue to hold the mutex which gets
209			 * released in CALLB_CPR_EXIT().
210			 */
211			break;
212		}
213
214		mutex_exit(&vswp->sw_thr_lock);
215		rv = vsw_setup_switching(vswp);
216		if (rv == 0) {
217			vsw_setup_layer2_post_process(vswp);
218		}
219		mutex_enter(&vswp->sw_thr_lock);
220		if (rv != EAGAIN) {
221			break;
222		}
223
224	}
225
226	vswp->sw_thr_flags &= ~VSW_SWTHR_STOP;
227	vswp->sw_thread = NULL;
228	CALLB_CPR_EXIT(&cprinfo);
229	thread_exit();
230}
231
232/*
233 * Create a thread to setup the switching mode.
234 * Returns 0 on success; 1 on failure.
235 */
236int
237vsw_setup_switching_start(vsw_t *vswp)
238{
239	mutex_enter(&vswp->sw_thr_lock);
240
241	vswp->sw_thread = thread_create(NULL, 2 * DEFAULTSTKSZ,
242	    vsw_setup_switching_thread, vswp, 0, &p0, TS_RUN, minclsyspri);
243
244	if (vswp->sw_thread == NULL) {
245		mutex_exit(&vswp->sw_thr_lock);
246		return (1);
247	}
248
249	mutex_exit(&vswp->sw_thr_lock);
250	return (0);
251}
252
253/*
254 * Stop the thread to setup switching mode.
255 */
256void
257vsw_setup_switching_stop(vsw_t *vswp)
258{
259	kt_did_t	tid = 0;
260
261	/*
262	 * Signal the setup_switching thread to stop and wait until it stops.
263	 */
264	mutex_enter(&vswp->sw_thr_lock);
265
266	if (vswp->sw_thread != NULL) {
267		tid = vswp->sw_thread->t_did;
268		vswp->sw_thr_flags |= VSW_SWTHR_STOP;
269		cv_signal(&vswp->sw_thr_cv);
270	}
271
272	mutex_exit(&vswp->sw_thr_lock);
273
274	if (tid != 0)
275		thread_join(tid);
276
277	(void) atomic_swap_32(&vswp->switching_setup_done, B_FALSE);
278
279	vswp->mac_open_retries = 0;
280}
281
282/*
283 * Setup the required switching mode.
284 * Returns:
285 *  0 on success.
286 *  EAGAIN if retry is needed.
287 *  1 on all other failures.
288 */
289int
290vsw_setup_switching(vsw_t *vswp)
291{
292	int	rv = 1;
293
294	D1(vswp, "%s: enter", __func__);
295
296	/*
297	 * Select best switching mode.
298	 * This is done as this routine can be called from the timeout
299	 * handler to retry setting up a specific mode. Currently only
300	 * the function which sets up layer2/promisc mode returns EAGAIN
301	 * if the underlying network device is not available yet, causing
302	 * retries.
303	 */
304	if (vswp->smode & VSW_LAYER2) {
305		rv = vsw_setup_layer2(vswp);
306	} else if (vswp->smode & VSW_LAYER3) {
307		rv = vsw_setup_layer3(vswp);
308	} else {
309		DERR(vswp, "unknown switch mode");
310		rv = 1;
311	}
312
313	if (rv && (rv != EAGAIN)) {
314		cmn_err(CE_WARN, "!vsw%d: Unable to setup specified "
315		    "switching mode", vswp->instance);
316	} else if (rv == 0) {
317		(void) atomic_swap_32(&vswp->switching_setup_done, B_TRUE);
318	}
319
320	D2(vswp, "%s: Operating in mode %d", __func__,
321	    vswp->smode);
322
323	D1(vswp, "%s: exit", __func__);
324
325	return (rv);
326}
327
328/*
329 * Setup for layer 2 switching.
330 *
331 * Returns:
332 *  0 on success.
333 *  EAGAIN if retry is needed.
334 *  EIO on all other failures.
335 */
336static int
337vsw_setup_layer2(vsw_t *vswp)
338{
339	int	rv;
340
341	D1(vswp, "%s: enter", __func__);
342
343	/*
344	 * Until the network device is successfully opened,
345	 * set the switching to use vsw_switch_l2_frame.
346	 */
347	vswp->vsw_switch_frame = vsw_switch_l2_frame;
348	vswp->mac_cl_switching = B_FALSE;
349
350	rv = strlen(vswp->physname);
351	if (rv == 0) {
352		/*
353		 * Physical device name is NULL, which is
354		 * required for layer 2.
355		 */
356		cmn_err(CE_WARN, "!vsw%d: no network device name specified",
357		    vswp->instance);
358		return (EIO);
359	}
360
361	mutex_enter(&vswp->mac_lock);
362
363	rv = vsw_mac_open(vswp);
364	if (rv != 0) {
365		if (rv != EAGAIN) {
366			cmn_err(CE_WARN, "!vsw%d: Unable to open network "
367			    "device: %s\n", vswp->instance, vswp->physname);
368		}
369		mutex_exit(&vswp->mac_lock);
370		return (rv);
371	}
372
373	/*
374	 * Now we can use the mac client switching, so set the switching
375	 * function to use vsw_switch_l2_frame_mac_client(), which simply
376	 * sends the packets to MAC layer for switching.
377	 */
378	vswp->vsw_switch_frame = vsw_switch_l2_frame_mac_client;
379	vswp->mac_cl_switching = B_TRUE;
380
381	D1(vswp, "%s: exit", __func__);
382
383	/* Initialize HybridIO related stuff */
384	vsw_hio_init(vswp);
385
386	mutex_exit(&vswp->mac_lock);
387	return (0);
388
389exit_error:
390	vsw_mac_close(vswp);
391	mutex_exit(&vswp->mac_lock);
392	return (EIO);
393}
394
395static int
396vsw_setup_layer3(vsw_t *vswp)
397{
398	D1(vswp, "%s: enter", __func__);
399
400	D2(vswp, "%s: operating in layer 3 mode", __func__);
401	vswp->vsw_switch_frame = vsw_switch_l3_frame;
402
403	D1(vswp, "%s: exit", __func__);
404
405	return (0);
406}
407
408/* ARGSUSED */
409void
410vsw_switch_frame_nop(vsw_t *vswp, mblk_t *mp, int caller, vsw_port_t *port,
411			mac_resource_handle_t mrh)
412{
413	freemsgchain(mp);
414}
415
416/*
417 * Use mac client for layer 2 switching .
418 */
419static void
420vsw_switch_l2_frame_mac_client(vsw_t *vswp, mblk_t *mp, int caller,
421    vsw_port_t *port, mac_resource_handle_t mrh)
422{
423	_NOTE(ARGUNUSED(mrh))
424
425	mblk_t		*ret_m;
426
427	/*
428	 * This switching function is expected to be called by
429	 * the ports or the interface only. The packets from
430	 * physical interface already switched.
431	 */
432	ASSERT((caller == VSW_VNETPORT) || (caller == VSW_LOCALDEV));
433
434	if ((ret_m = vsw_tx_msg(vswp, mp, caller, port)) != NULL) {
435		DERR(vswp, "%s: drop mblks to "
436		    "phys dev", __func__);
437		freemsgchain(ret_m);
438	}
439}
440
441/*
442 * Switch the given ethernet frame when operating in layer 2 mode.
443 *
444 * vswp: pointer to the vsw instance
445 * mp: pointer to chain of ethernet frame(s) to be switched
446 * caller: identifies the source of this frame as:
447 * 		1. VSW_VNETPORT - a vsw port (connected to a vnet).
448 *		2. VSW_PHYSDEV - the physical ethernet device
449 *		3. VSW_LOCALDEV - vsw configured as a virtual interface
450 * arg: argument provided by the caller.
451 *		1. for VNETPORT - pointer to the corresponding vsw_port_t.
452 *		2. for PHYSDEV - NULL
453 *		3. for LOCALDEV - pointer to to this vsw_t(self)
454 */
455void
456vsw_switch_l2_frame(vsw_t *vswp, mblk_t *mp, int caller,
457			vsw_port_t *arg, mac_resource_handle_t mrh)
458{
459	struct ether_header	*ehp;
460	mblk_t			*bp, *ret_m;
461	vsw_fdbe_t		*fp;
462
463	D1(vswp, "%s: enter (caller %d)", __func__, caller);
464
465	/*
466	 * PERF: rather than breaking up the chain here, scan it
467	 * to find all mblks heading to same destination and then
468	 * pass that sub-chain to the lower transmit functions.
469	 */
470
471	/* process the chain of packets */
472	bp = mp;
473	while (bp) {
474		ehp = (struct ether_header *)bp->b_rptr;
475		mp = vsw_get_same_dest_list(ehp, &bp);
476		ASSERT(mp != NULL);
477
478		D2(vswp, "%s: mblk data buffer %lld : actual data size %lld",
479		    __func__, MBLKSIZE(mp), MBLKL(mp));
480
481		if (ether_cmp(&ehp->ether_dhost, &vswp->if_addr) == 0) {
482			/*
483			 * If destination is VSW_LOCALDEV (vsw as an eth
484			 * interface) and if the device is up & running,
485			 * send the packet up the stack on this host.
486			 * If the virtual interface is down, drop the packet.
487			 */
488			if (caller != VSW_LOCALDEV) {
489				vsw_mac_rx(vswp, mrh, mp, VSW_MACRX_FREEMSG);
490			} else {
491				freemsgchain(mp);
492			}
493			continue;
494		}
495
496		/*
497		 * Find fdb entry for the destination
498		 * and hold a reference to it.
499		 */
500		fp = vsw_fdbe_find(vswp, &ehp->ether_dhost);
501		if (fp != NULL) {
502
503			/*
504			 * If plumbed and in promisc mode then copy msg
505			 * and send up the stack.
506			 */
507			vsw_mac_rx(vswp, mrh, mp,
508			    VSW_MACRX_PROMISC | VSW_MACRX_COPYMSG);
509
510			/*
511			 * If the destination is in FDB, the packet
512			 * should be forwarded to the correponding
513			 * vsw_port (connected to a vnet device -
514			 * VSW_VNETPORT)
515			 */
516			(void) vsw_portsend(fp->portp, mp);
517
518			/* Release the reference on the fdb entry */
519			VSW_FDBE_REFRELE(fp);
520		} else {
521			/*
522			 * Destination not in FDB.
523			 *
524			 * If the destination is broadcast or
525			 * multicast forward the packet to all
526			 * (VNETPORTs, PHYSDEV, LOCALDEV),
527			 * except the caller.
528			 */
529			if (IS_BROADCAST(ehp)) {
530				D2(vswp, "%s: BROADCAST pkt", __func__);
531				(void) vsw_forward_all(vswp, mp, caller, arg);
532			} else if (IS_MULTICAST(ehp)) {
533				D2(vswp, "%s: MULTICAST pkt", __func__);
534				(void) vsw_forward_grp(vswp, mp, caller, arg);
535			} else {
536				/*
537				 * If the destination is unicast, and came
538				 * from either a logical network device or
539				 * the switch itself when it is plumbed, then
540				 * send it out on the physical device and also
541				 * up the stack if the logical interface is
542				 * in promiscious mode.
543				 *
544				 * NOTE:  The assumption here is that if we
545				 * cannot find the destination in our fdb, its
546				 * a unicast address, and came from either a
547				 * vnet or down the stack (when plumbed) it
548				 * must be destinded for an ethernet device
549				 * outside our ldoms.
550				 */
551				if (caller == VSW_VNETPORT) {
552					/* promisc check copy etc */
553					vsw_mac_rx(vswp, mrh, mp,
554					    VSW_MACRX_PROMISC |
555					    VSW_MACRX_COPYMSG);
556
557					if ((ret_m = vsw_tx_msg(vswp, mp,
558					    caller, arg)) != NULL) {
559						DERR(vswp, "%s: drop mblks to "
560						    "phys dev", __func__);
561						freemsgchain(ret_m);
562					}
563
564				} else if (caller == VSW_PHYSDEV) {
565					/*
566					 * Pkt seen because card in promisc
567					 * mode. Send up stack if plumbed in
568					 * promisc mode, else drop it.
569					 */
570					vsw_mac_rx(vswp, mrh, mp,
571					    VSW_MACRX_PROMISC |
572					    VSW_MACRX_FREEMSG);
573
574				} else if (caller == VSW_LOCALDEV) {
575					/*
576					 * Pkt came down the stack, send out
577					 * over physical device.
578					 */
579					if ((ret_m = vsw_tx_msg(vswp, mp,
580					    caller, NULL)) != NULL) {
581						DERR(vswp, "%s: drop mblks to "
582						    "phys dev", __func__);
583						freemsgchain(ret_m);
584					}
585				}
586			}
587		}
588	}
589	D1(vswp, "%s: exit\n", __func__);
590}
591
592/*
593 * Switch ethernet frame when in layer 3 mode (i.e. using IP
594 * layer to do the routing).
595 *
596 * There is a large amount of overlap between this function and
597 * vsw_switch_l2_frame. At some stage we need to revisit and refactor
598 * both these functions.
599 */
600void
601vsw_switch_l3_frame(vsw_t *vswp, mblk_t *mp, int caller,
602			vsw_port_t *arg, mac_resource_handle_t mrh)
603{
604	struct ether_header	*ehp;
605	mblk_t			*bp = NULL;
606	vsw_fdbe_t		*fp;
607
608	D1(vswp, "%s: enter (caller %d)", __func__, caller);
609
610	/*
611	 * In layer 3 mode should only ever be switching packets
612	 * between IP layer and vnet devices. So make sure thats
613	 * who is invoking us.
614	 */
615	if ((caller != VSW_LOCALDEV) && (caller != VSW_VNETPORT)) {
616		DERR(vswp, "%s: unexpected caller (%d)", __func__, caller);
617		freemsgchain(mp);
618		return;
619	}
620
621	/* process the chain of packets */
622	bp = mp;
623	while (bp) {
624		ehp = (struct ether_header *)bp->b_rptr;
625		mp = vsw_get_same_dest_list(ehp, &bp);
626		ASSERT(mp != NULL);
627
628		D2(vswp, "%s: mblk data buffer %lld : actual data size %lld",
629		    __func__, MBLKSIZE(mp), MBLKL(mp));
630
631		/*
632		 * Find fdb entry for the destination
633		 * and hold a reference to it.
634		 */
635		fp = vsw_fdbe_find(vswp, &ehp->ether_dhost);
636		if (fp != NULL) {
637
638			D2(vswp, "%s: sending to target port", __func__);
639			(void) vsw_portsend(fp->portp, mp);
640
641			/* Release the reference on the fdb entry */
642			VSW_FDBE_REFRELE(fp);
643		} else {
644			/*
645			 * Destination not in FDB
646			 *
647			 * If the destination is broadcast or
648			 * multicast forward the packet to all
649			 * (VNETPORTs, PHYSDEV, LOCALDEV),
650			 * except the caller.
651			 */
652			if (IS_BROADCAST(ehp)) {
653				D2(vswp, "%s: BROADCAST pkt", __func__);
654				(void) vsw_forward_all(vswp, mp, caller, arg);
655			} else if (IS_MULTICAST(ehp)) {
656				D2(vswp, "%s: MULTICAST pkt", __func__);
657				(void) vsw_forward_grp(vswp, mp, caller, arg);
658			} else {
659				/*
660				 * Unicast pkt from vnet that we don't have
661				 * an FDB entry for, so must be destinded for
662				 * the outside world. Attempt to send up to the
663				 * IP layer to allow it to deal with it.
664				 */
665				if (caller == VSW_VNETPORT) {
666					vsw_mac_rx(vswp, mrh,
667					    mp, VSW_MACRX_FREEMSG);
668				}
669			}
670		}
671	}
672
673	D1(vswp, "%s: exit", __func__);
674}
675
676/*
677 * Setup mac addrs and hio resources for layer 2 switching only.
678 */
679void
680vsw_setup_layer2_post_process(vsw_t *vswp)
681{
682	if (vswp->smode & VSW_LAYER2) {
683		/*
684		 * Program unicst, mcst addrs of vsw
685		 * interface and ports in the physdev.
686		 */
687		vsw_set_addrs(vswp);
688
689		/* Start HIO for ports that have already connected */
690		vsw_hio_start_ports(vswp);
691
692		/* Update physical link info to any ports already connected */
693		vsw_physlink_state_update(vswp);
694	}
695}
696
697/*
698 * Forward the ethernet frame to all ports (VNETPORTs, PHYSDEV, LOCALDEV),
699 * except the caller (port on which frame arrived).
700 */
701static int
702vsw_forward_all(vsw_t *vswp, mblk_t *mp, int caller, vsw_port_t *arg)
703{
704	vsw_port_list_t	*plist = &vswp->plist;
705	vsw_port_t	*portp;
706	mblk_t		*nmp = NULL;
707	mblk_t		*ret_m = NULL;
708	int		skip_port = 0;
709
710	D1(vswp, "vsw_forward_all: enter\n");
711
712	/*
713	 * Broadcast message from inside ldoms so send to outside
714	 * world if in either of layer 2 modes.
715	 */
716	if ((vswp->smode & VSW_LAYER2) &&
717	    ((caller == VSW_LOCALDEV) || (caller == VSW_VNETPORT))) {
718
719		nmp = vsw_dupmsgchain(mp);
720		if (nmp) {
721			if ((ret_m = vsw_tx_msg(vswp, nmp, caller, arg))
722			    != NULL) {
723				DERR(vswp, "%s: dropping pkt(s) "
724				    "consisting of %ld bytes of data for"
725				    " physical device", __func__, MBLKL(ret_m));
726				freemsgchain(ret_m);
727			}
728		}
729	}
730
731	if (caller == VSW_VNETPORT)
732		skip_port = 1;
733
734	/*
735	 * Broadcast message from other vnet (layer 2 or 3) or outside
736	 * world (layer 2 only), send up stack if plumbed.
737	 */
738	if ((caller == VSW_PHYSDEV) || (caller == VSW_VNETPORT)) {
739		vsw_mac_rx(vswp, NULL, mp, VSW_MACRX_COPYMSG);
740	}
741
742	/* send it to all VNETPORTs */
743	READ_ENTER(&plist->lockrw);
744	for (portp = plist->head; portp != NULL; portp = portp->p_next) {
745		D2(vswp, "vsw_forward_all: port %d", portp->p_instance);
746		/*
747		 * Caution ! - don't reorder these two checks as arg
748		 * will be NULL if the caller is PHYSDEV. skip_port is
749		 * only set if caller is VNETPORT.
750		 */
751		if ((skip_port) && (portp == arg)) {
752			continue;
753		} else {
754			nmp = vsw_dupmsgchain(mp);
755			if (nmp) {
756				/*
757				 * The plist->lockrw is protecting the
758				 * portp from getting destroyed here.
759				 * So, no ref_cnt is incremented here.
760				 */
761				(void) vsw_portsend(portp, nmp);
762			} else {
763				DERR(vswp, "vsw_forward_all: nmp NULL");
764			}
765		}
766	}
767	RW_EXIT(&plist->lockrw);
768
769	freemsgchain(mp);
770
771	D1(vswp, "vsw_forward_all: exit\n");
772	return (0);
773}
774
775/*
776 * Forward pkts to any devices or interfaces which have registered
777 * an interest in them (i.e. multicast groups).
778 */
779static int
780vsw_forward_grp(vsw_t *vswp, mblk_t *mp, int caller, vsw_port_t *arg)
781{
782	struct ether_header	*ehp = (struct ether_header *)mp->b_rptr;
783	mfdb_ent_t		*entp = NULL;
784	mfdb_ent_t		*tpp = NULL;
785	vsw_port_t 		*port;
786	uint64_t		key = 0;
787	mblk_t			*nmp = NULL;
788	mblk_t			*ret_m = NULL;
789	boolean_t		check_if = B_TRUE;
790
791	/*
792	 * Convert address to hash table key
793	 */
794	KEY_HASH(key, &ehp->ether_dhost);
795
796	D1(vswp, "%s: key 0x%llx", __func__, key);
797
798	/*
799	 * If pkt came from either a vnet or down the stack (if we are
800	 * plumbed) and we are in layer 2 mode, then we send the pkt out
801	 * over the physical adapter, and then check to see if any other
802	 * vnets are interested in it.
803	 */
804	if ((vswp->smode & VSW_LAYER2) &&
805	    ((caller == VSW_VNETPORT) || (caller == VSW_LOCALDEV))) {
806		nmp = vsw_dupmsgchain(mp);
807		if (nmp) {
808			if ((ret_m = vsw_tx_msg(vswp, nmp, caller, arg))
809			    != NULL) {
810				DERR(vswp, "%s: dropping pkt(s) consisting of "
811				    "%ld bytes of data for physical device",
812				    __func__, MBLKL(ret_m));
813				freemsgchain(ret_m);
814			}
815		}
816	}
817
818	READ_ENTER(&vswp->mfdbrw);
819	if (mod_hash_find(vswp->mfdb, (mod_hash_key_t)key,
820	    (mod_hash_val_t *)&entp) != 0) {
821		D3(vswp, "%s: no table entry found for addr 0x%llx",
822		    __func__, key);
823	} else {
824		/*
825		 * Send to list of devices associated with this address...
826		 */
827		for (tpp = entp; tpp != NULL; tpp = tpp->nextp) {
828
829			/* dont send to ourselves */
830			if ((caller == VSW_VNETPORT) &&
831			    (tpp->d_addr == (void *)arg)) {
832				port = (vsw_port_t *)tpp->d_addr;
833				D3(vswp, "%s: not sending to ourselves"
834				    " : port %d", __func__, port->p_instance);
835				continue;
836
837			} else if ((caller == VSW_LOCALDEV) &&
838			    (tpp->d_type == VSW_LOCALDEV)) {
839				D2(vswp, "%s: not sending back up stack",
840				    __func__);
841				continue;
842			}
843
844			if (tpp->d_type == VSW_VNETPORT) {
845				port = (vsw_port_t *)tpp->d_addr;
846				D3(vswp, "%s: sending to port %ld for addr "
847				    "0x%llx", __func__, port->p_instance, key);
848
849				nmp = vsw_dupmsgchain(mp);
850				if (nmp) {
851					/*
852					 * The vswp->mfdbrw is protecting the
853					 * portp from getting destroyed here.
854					 * So, no ref_cnt is incremented here.
855					 */
856					(void) vsw_portsend(port, nmp);
857				}
858			} else {
859				vsw_mac_rx(vswp, NULL,
860				    mp, VSW_MACRX_COPYMSG);
861				D2(vswp, "%s: sending up stack"
862				    " for addr 0x%llx", __func__, key);
863				check_if = B_FALSE;
864			}
865		}
866	}
867
868	RW_EXIT(&vswp->mfdbrw);
869
870	/*
871	 * If the pkt came from either a vnet or from physical device,
872	 * and if we havent already sent the pkt up the stack then we
873	 * check now if we can/should (i.e. the interface is plumbed
874	 * and in promisc mode).
875	 */
876	if ((check_if) &&
877	    ((caller == VSW_VNETPORT) || (caller == VSW_PHYSDEV))) {
878		vsw_mac_rx(vswp, NULL, mp,
879		    VSW_MACRX_PROMISC | VSW_MACRX_COPYMSG);
880	}
881
882	freemsgchain(mp);
883
884	D1(vswp, "%s: exit", __func__);
885
886	return (0);
887}
888
/*
 * Build the vlan state of a vsw device or port: create its vlan id hash
 * table, then insert every vlan id assigned to it.
 *
 * Arguments:
 *   arg:  vsw device or port.
 *   type: type of arg; VSW_LOCALDEV(vsw device) or VSW_VNETPORT(port).
 */
void
vsw_create_vlans(void *arg, int type)
{
	/* the table has to exist before ids can be added to it */
	vsw_vlan_create_hash(arg, type);
	vsw_vlan_add_ids(arg, type);
}
906
/*
 * Tear down the vlan state of a vsw device or port: remove its vlan ids
 * from the hash table, then destroy the table itself.
 *
 * Arguments:
 *   arg:  vsw device or port.
 *   type: type of arg; VSW_LOCALDEV(vsw device) or VSW_VNETPORT(port).
 */
void
vsw_destroy_vlans(void *arg, int type)
{
	/* empty the table first, then get rid of it */
	vsw_vlan_remove_ids(arg, type);
	vsw_vlan_destroy_hash(arg, type);
}
923
924/*
925 * Create a vlan-id hash table for the given vsw device or port.
926 */
927static void
928vsw_vlan_create_hash(void *arg, int type)
929{
930	char		hashname[MAXNAMELEN];
931
932	if (type == VSW_LOCALDEV) {
933		vsw_t		*vswp = (vsw_t *)arg;
934
935		(void) snprintf(hashname, MAXNAMELEN, "vsw%d-vlan-hash",
936		    vswp->instance);
937
938		vswp->vlan_nchains = vsw_vlan_nchains;
939		vswp->vlan_hashp = mod_hash_create_idhash(hashname,
940		    vswp->vlan_nchains, mod_hash_null_valdtor);
941
942	} else if (type == VSW_VNETPORT) {
943		vsw_port_t	*portp = (vsw_port_t *)arg;
944
945		(void) snprintf(hashname, MAXNAMELEN, "port%d-vlan-hash",
946		    portp->p_instance);
947
948		portp->vlan_nchains = vsw_vlan_nchains;
949		portp->vlan_hashp = mod_hash_create_idhash(hashname,
950		    portp->vlan_nchains, mod_hash_null_valdtor);
951
952	} else {
953		return;
954	}
955}
956
957/*
958 * Destroy the vlan-id hash table for the given vsw device or port.
959 */
960static void
961vsw_vlan_destroy_hash(void *arg, int type)
962{
963	if (type == VSW_LOCALDEV) {
964		vsw_t		*vswp = (vsw_t *)arg;
965
966		mod_hash_destroy_hash(vswp->vlan_hashp);
967		vswp->vlan_nchains = 0;
968	} else if (type == VSW_VNETPORT) {
969		vsw_port_t	*portp = (vsw_port_t *)arg;
970
971		mod_hash_destroy_hash(portp->vlan_hashp);
972		portp->vlan_nchains = 0;
973	} else {
974		return;
975	}
976}
977
978/*
979 * Add vlan ids of the given vsw device or port into its hash table.
980 */
981void
982vsw_vlan_add_ids(void *arg, int type)
983{
984	int	rv;
985	int	i;
986
987	if (type == VSW_LOCALDEV) {
988		vsw_t		*vswp = (vsw_t *)arg;
989
990		rv = mod_hash_insert(vswp->vlan_hashp,
991		    (mod_hash_key_t)VLAN_ID_KEY(vswp->pvid),
992		    (mod_hash_val_t)B_TRUE);
993		if (rv != 0) {
994			cmn_err(CE_WARN, "vsw%d: Duplicate vlan-id(%d) for "
995			    "the interface", vswp->instance, vswp->pvid);
996		}
997
998		for (i = 0; i < vswp->nvids; i++) {
999			rv = mod_hash_insert(vswp->vlan_hashp,
1000			    (mod_hash_key_t)VLAN_ID_KEY(vswp->vids[i].vl_vid),
1001			    (mod_hash_val_t)B_TRUE);
1002			if (rv != 0) {
1003				cmn_err(CE_WARN, "vsw%d: Duplicate vlan-id(%d)"
1004				    " for the interface", vswp->instance,
1005				    vswp->pvid);
1006			}
1007		}
1008
1009	} else if (type == VSW_VNETPORT) {
1010		vsw_port_t	*portp = (vsw_port_t *)arg;
1011		vsw_t		*vswp = portp->p_vswp;
1012
1013		rv = mod_hash_insert(portp->vlan_hashp,
1014		    (mod_hash_key_t)VLAN_ID_KEY(portp->pvid),
1015		    (mod_hash_val_t)B_TRUE);
1016		if (rv != 0) {
1017			cmn_err(CE_WARN, "vsw%d: Duplicate vlan-id(%d) for "
1018			    "the port(%d)", vswp->instance, vswp->pvid,
1019			    portp->p_instance);
1020		}
1021
1022		for (i = 0; i < portp->nvids; i++) {
1023			rv = mod_hash_insert(portp->vlan_hashp,
1024			    (mod_hash_key_t)VLAN_ID_KEY(portp->vids[i].vl_vid),
1025			    (mod_hash_val_t)B_TRUE);
1026			if (rv != 0) {
1027				cmn_err(CE_WARN, "vsw%d: Duplicate vlan-id(%d)"
1028				    " for the port(%d)", vswp->instance,
1029				    vswp->pvid, portp->p_instance);
1030			}
1031		}
1032
1033	}
1034}
1035
1036/*
1037 * Remove vlan ids of the given vsw device or port from its hash table.
1038 */
1039void
1040vsw_vlan_remove_ids(void *arg, int type)
1041{
1042	mod_hash_val_t	vp;
1043	int		rv;
1044	int		i;
1045
1046	if (type == VSW_LOCALDEV) {
1047		vsw_t		*vswp = (vsw_t *)arg;
1048
1049		rv = vsw_vlan_lookup(vswp->vlan_hashp, vswp->pvid);
1050		if (rv == B_TRUE) {
1051			rv = mod_hash_remove(vswp->vlan_hashp,
1052			    (mod_hash_key_t)VLAN_ID_KEY(vswp->pvid),
1053			    (mod_hash_val_t *)&vp);
1054			ASSERT(rv == 0);
1055		}
1056
1057		for (i = 0; i < vswp->nvids; i++) {
1058			rv = vsw_vlan_lookup(vswp->vlan_hashp,
1059			    vswp->vids[i].vl_vid);
1060			if (rv == B_TRUE) {
1061				rv = mod_hash_remove(vswp->vlan_hashp,
1062				    (mod_hash_key_t)VLAN_ID_KEY(
1063				    vswp->vids[i].vl_vid),
1064				    (mod_hash_val_t *)&vp);
1065				ASSERT(rv == 0);
1066			}
1067		}
1068
1069	} else if (type == VSW_VNETPORT) {
1070		vsw_port_t	*portp = (vsw_port_t *)arg;
1071
1072		portp = (vsw_port_t *)arg;
1073		rv = vsw_vlan_lookup(portp->vlan_hashp, portp->pvid);
1074		if (rv == B_TRUE) {
1075			rv = mod_hash_remove(portp->vlan_hashp,
1076			    (mod_hash_key_t)VLAN_ID_KEY(portp->pvid),
1077			    (mod_hash_val_t *)&vp);
1078			ASSERT(rv == 0);
1079		}
1080
1081		for (i = 0; i < portp->nvids; i++) {
1082			rv = vsw_vlan_lookup(portp->vlan_hashp,
1083			    portp->vids[i].vl_vid);
1084			if (rv == B_TRUE) {
1085				rv = mod_hash_remove(portp->vlan_hashp,
1086				    (mod_hash_key_t)VLAN_ID_KEY(
1087				    portp->vids[i].vl_vid),
1088				    (mod_hash_val_t *)&vp);
1089				ASSERT(rv == 0);
1090			}
1091		}
1092
1093	} else {
1094		return;
1095	}
1096}
1097
1098/*
1099 * Find the given vlan id in the hash table.
1100 * Return: B_TRUE if the id is found; B_FALSE if not found.
1101 */
1102boolean_t
1103vsw_vlan_lookup(mod_hash_t *vlan_hashp, uint16_t vid)
1104{
1105	int		rv;
1106	mod_hash_val_t	vp;
1107
1108	rv = mod_hash_find(vlan_hashp, VLAN_ID_KEY(vid), (mod_hash_val_t *)&vp);
1109
1110	if (rv != 0)
1111		return (B_FALSE);
1112
1113	return (B_TRUE);
1114}
1115
1116/*
1117 * Add an entry into FDB for the given vsw.
1118 */
1119void
1120vsw_fdbe_add(vsw_t *vswp, void *port)
1121{
1122	uint64_t	addr = 0;
1123	vsw_port_t	*portp;
1124	vsw_fdbe_t	*fp;
1125	int		rv;
1126
1127	portp = (vsw_port_t *)port;
1128	KEY_HASH(addr, &portp->p_macaddr);
1129
1130	fp = kmem_zalloc(sizeof (vsw_fdbe_t), KM_SLEEP);
1131	fp->portp = port;
1132
1133	/*
1134	 * Note: duplicate keys will be rejected by mod_hash.
1135	 */
1136	rv = mod_hash_insert(vswp->fdb_hashp, (mod_hash_key_t)addr,
1137	    (mod_hash_val_t)fp);
1138	if (rv != 0) {
1139		cmn_err(CE_WARN, "vsw%d: Duplicate mac-address(%s) for "
1140		    "the port(%d)", vswp->instance,
1141		    ether_sprintf(&portp->p_macaddr), portp->p_instance);
1142	}
1143}
1144
1145/*
1146 * Remove an entry from FDB.
1147 */
1148void
1149vsw_fdbe_del(vsw_t *vswp, struct ether_addr *eaddr)
1150{
1151	uint64_t	addr = 0;
1152	vsw_fdbe_t	*fp;
1153	int		rv;
1154
1155	KEY_HASH(addr, eaddr);
1156
1157	/*
1158	 * Remove the entry from fdb hash table.
1159	 * This prevents further references to this fdb entry.
1160	 */
1161	rv = mod_hash_remove(vswp->fdb_hashp, (mod_hash_key_t)addr,
1162	    (mod_hash_val_t *)&fp);
1163	if (rv != 0) {
1164		/* invalid key? */
1165		return;
1166	}
1167
1168	/*
1169	 * If there are threads already ref holding before the entry was
1170	 * removed from hash table, then wait for ref count to drop to zero.
1171	 */
1172	while (fp->refcnt != 0) {
1173		delay(drv_usectohz(vsw_fdbe_refcnt_delay));
1174	}
1175
1176	kmem_free(fp, sizeof (*fp));
1177}
1178
1179/*
1180 * Search fdb for a given mac address. If an entry is found, hold
1181 * a reference to it and return the entry, else returns NULL.
1182 */
1183static vsw_fdbe_t *
1184vsw_fdbe_find(vsw_t *vswp, struct ether_addr *addrp)
1185{
1186	uint64_t	key = 0;
1187	vsw_fdbe_t	*fp;
1188	int		rv;
1189
1190	KEY_HASH(key, addrp);
1191
1192	rv = mod_hash_find_cb(vswp->fdb_hashp, (mod_hash_key_t)key,
1193	    (mod_hash_val_t *)&fp, vsw_fdbe_find_cb);
1194
1195	if (rv != 0)
1196		return (NULL);
1197
1198	return (fp);
1199}
1200
1201/*
1202 * Callback function provided to mod_hash_find_cb(). After finding the fdb
1203 * entry corresponding to the key (macaddr), this callback will be invoked by
1204 * mod_hash_find_cb() to atomically increment the reference count on the fdb
1205 * entry before returning the found entry.
1206 */
1207static void
1208vsw_fdbe_find_cb(mod_hash_key_t key, mod_hash_val_t val)
1209{
1210	_NOTE(ARGUNUSED(key))
1211	VSW_FDBE_REFHOLD((vsw_fdbe_t *)val);
1212}
1213
1214/*
1215 * A given frame must be always tagged with the appropriate vlan id (unless it
1216 * is in the default-vlan) before the mac address switching function is called.
1217 * Otherwise, after switching function determines the destination, we cannot
1218 * figure out if the destination belongs to the the same vlan that the frame
1219 * originated from and if it needs tag/untag. Frames which are inbound from
1220 * the external(physical) network over a vlan trunk link are always tagged.
1221 * However frames which are received from a vnet-port over ldc or frames which
1222 * are coming down the stack on the service domain over vsw interface may be
1223 * untagged. These frames must be tagged with the appropriate pvid of the
1224 * sender (vnet-port or vsw device), before invoking the switching function.
1225 *
1226 * Arguments:
1227 *   arg:    caller of the function.
1228 *   type:   type of arg(caller): VSW_LOCALDEV(vsw) or VSW_VNETPORT(port)
1229 *   mp:     frame(s) to be tagged.
1230 */
1231mblk_t *
1232vsw_vlan_frame_pretag(void *arg, int type, mblk_t *mp)
1233{
1234	vsw_t			*vswp;
1235	vsw_port_t		*portp;
1236	struct ether_header	*ehp;
1237	mblk_t			*bp;
1238	mblk_t			*bpt;
1239	mblk_t			*bph;
1240	mblk_t			*bpn;
1241	uint16_t		pvid;
1242
1243	ASSERT((type == VSW_LOCALDEV) || (type == VSW_VNETPORT));
1244
1245	if (type == VSW_LOCALDEV) {
1246		vswp = (vsw_t *)arg;
1247		pvid = vswp->pvid;
1248		portp = NULL;
1249	} else {
1250		/* VSW_VNETPORT */
1251		portp = (vsw_port_t *)arg;
1252		pvid = portp->pvid;
1253		vswp = portp->p_vswp;
1254	}
1255
1256	bpn = bph = bpt = NULL;
1257
1258	for (bp = mp; bp != NULL; bp = bpn) {
1259
1260		bpn = bp->b_next;
1261		bp->b_next = bp->b_prev = NULL;
1262
1263		/* Determine if it is an untagged frame */
1264		ehp = (struct ether_header *)bp->b_rptr;
1265
1266		if (ehp->ether_type != ETHERTYPE_VLAN) {	/* untagged */
1267
1268			/* no need to tag if the frame is in default vlan */
1269			if (pvid != vswp->default_vlan_id) {
1270				bp = vnet_vlan_insert_tag(bp, pvid);
1271				if (bp == NULL) {
1272					continue;
1273				}
1274			}
1275		}
1276
1277		/* build a chain of processed packets */
1278		if (bph == NULL) {
1279			bph = bpt = bp;
1280		} else {
1281			bpt->b_next = bp;
1282			bpt = bp;
1283		}
1284
1285	}
1286
1287	return (bph);
1288}
1289
1290/*
1291 * Frames destined to a vnet-port or to the local vsw interface, must be
1292 * untagged if necessary before sending. This function first checks that the
1293 * frame can be sent to the destination in the vlan identified by the frame
1294 * tag. Note that when this function is invoked the frame must have been
1295 * already tagged (unless it is in the default-vlan). Because, this function is
1296 * called when the switching function determines the destination and invokes
1297 * its send function (vnet-port or vsw interface) and all frames would have
1298 * been tagged by this time (see comments in vsw_vlan_frame_pretag()).
1299 *
1300 * Arguments:
1301 *   arg:    destination device.
1302 *   type:   type of arg(destination): VSW_LOCALDEV(vsw) or VSW_VNETPORT(port)
1303 *   np:     head of pkt chain to be validated and untagged.
1304 *   npt:    tail of pkt chain to be validated and untagged.
1305 *
1306 * Returns:
1307 *   np:     head of updated chain of packets
1308 *   npt:    tail of updated chain of packets
1309 *   rv:     count of the packets in the returned list
1310 */
1311uint32_t
1312vsw_vlan_frame_untag(void *arg, int type, mblk_t **np, mblk_t **npt)
1313{
1314	mblk_t			*bp;
1315	mblk_t			*bpt;
1316	mblk_t			*bph;
1317	mblk_t			*bpn;
1318	vsw_port_t		*portp;
1319	vsw_t			*vswp;
1320	uint32_t		count;
1321	struct ether_header	*ehp;
1322	boolean_t		is_tagged;
1323	boolean_t		rv;
1324	uint16_t		vlan_id;
1325	uint16_t		pvid;
1326	mod_hash_t		*vlan_hashp;
1327
1328	ASSERT((type == VSW_LOCALDEV) || (type == VSW_VNETPORT));
1329
1330
1331	if (type == VSW_LOCALDEV) {
1332		vswp = (vsw_t *)arg;
1333		pvid = vswp->pvid;
1334		vlan_hashp = vswp->vlan_hashp;
1335		portp = NULL;
1336	} else {
1337		/* type == VSW_VNETPORT */
1338		portp = (vsw_port_t *)arg;
1339		vswp = portp->p_vswp;
1340		vlan_hashp = portp->vlan_hashp;
1341		pvid = portp->pvid;
1342	}
1343
1344	/*
1345	 * If the MAC layer switching in place, then
1346	 * untagging required only if the pvid is not
1347	 * the same as default_vlan_id. This is because,
1348	 * the MAC layer will send packets for the
1349	 * registered vlans only.
1350	 */
1351	if ((vswp->mac_cl_switching == B_TRUE) &&
1352	    (pvid == vswp->default_vlan_id)) {
1353		/* simply count and set the tail */
1354		count = 1;
1355		bp = *np;
1356		ASSERT(bp != NULL);
1357		while (bp->b_next != NULL) {
1358			bp = bp->b_next;
1359			count++;
1360		}
1361		*npt = bp;
1362		return (count);
1363	}
1364
1365	bpn = bph = bpt = NULL;
1366	count = 0;
1367
1368	for (bp = *np; bp != NULL; bp = bpn) {
1369
1370		bpn = bp->b_next;
1371		bp->b_next = bp->b_prev = NULL;
1372
1373		/*
1374		 * Determine the vlan id that the frame belongs to.
1375		 */
1376		ehp = (struct ether_header *)bp->b_rptr;
1377		is_tagged = vsw_frame_lookup_vid(arg, type, ehp, &vlan_id);
1378
1379		/*
1380		 * If MAC layer switching in place, then we
1381		 * need to untag only if the tagged packet has
1382		 * vlan-id same as the pvid.
1383		 */
1384		if (vswp->mac_cl_switching == B_TRUE) {
1385
1386			/* only tagged packets expected here */
1387			ASSERT(is_tagged == B_TRUE);
1388			if (vlan_id == pvid) {
1389				bp = vnet_vlan_remove_tag(bp);
1390				if (bp == NULL) {
1391					/* packet dropped */
1392					continue;
1393				}
1394			}
1395		} else { /* No MAC layer switching */
1396
1397			/*
1398			 * Check the frame header if tag/untag is  needed.
1399			 */
1400			if (is_tagged == B_FALSE) {
1401				/*
1402				 * Untagged frame. We shouldn't have an
1403				 * untagged packet at this point, unless
1404				 * the destination's  vlan id is
1405				 * default-vlan-id; if it is not the
1406				 * default-vlan-id, we drop the packet.
1407				 */
1408				if (vlan_id != vswp->default_vlan_id) {
1409					/* drop the packet */
1410					freemsg(bp);
1411					continue;
1412				}
1413			} else {	/* Tagged */
1414				/*
1415				 * Tagged frame, untag if it's the
1416				 * destination's pvid.
1417				 */
1418				if (vlan_id == pvid) {
1419
1420					bp = vnet_vlan_remove_tag(bp);
1421					if (bp == NULL) {
1422						/* packet dropped */
1423						continue;
1424					}
1425				} else {
1426
1427					/*
1428					 * Check if the destination is in the
1429					 * same vlan.
1430					 */
1431					rv = vsw_vlan_lookup(vlan_hashp,
1432					    vlan_id);
1433					if (rv == B_FALSE) {
1434						/* drop the packet */
1435						freemsg(bp);
1436						continue;
1437					}
1438				}
1439
1440			}
1441		}
1442
1443		/* build a chain of processed packets */
1444		if (bph == NULL) {
1445			bph = bpt = bp;
1446		} else {
1447			bpt->b_next = bp;
1448			bpt = bp;
1449		}
1450		count++;
1451	}
1452
1453	*np = bph;
1454	*npt = bpt;
1455	return (count);
1456}
1457
1458/*
1459 * Lookup the vlan id of the given frame. If it is a vlan-tagged frame,
1460 * then the vlan-id is available in the tag; otherwise, its vlan id is
1461 * implicitly obtained based on the caller (destination of the frame:
1462 * VSW_VNETPORT or VSW_LOCALDEV).
1463 * The vlan id determined is returned in vidp.
1464 * Returns: B_TRUE if it is a tagged frame; B_FALSE if it is untagged.
1465 */
1466boolean_t
1467vsw_frame_lookup_vid(void *arg, int caller, struct ether_header *ehp,
1468	uint16_t *vidp)
1469{
1470	struct ether_vlan_header	*evhp;
1471	vsw_t				*vswp;
1472	vsw_port_t			*portp;
1473
1474	/* If it's a tagged frame, get the vid from vlan header */
1475	if (ehp->ether_type == ETHERTYPE_VLAN) {
1476
1477		evhp = (struct ether_vlan_header *)ehp;
1478		*vidp = VLAN_ID(ntohs(evhp->ether_tci));
1479		return (B_TRUE);
1480	}
1481
1482	/* Untagged frame; determine vlan id based on caller */
1483	switch (caller) {
1484
1485	case VSW_VNETPORT:
1486		/*
1487		 * packet destined to a vnet; vlan-id is pvid of vnet-port.
1488		 */
1489		portp = (vsw_port_t *)arg;
1490		*vidp = portp->pvid;
1491		break;
1492
1493	case VSW_LOCALDEV:
1494
1495		/*
1496		 * packet destined to vsw interface;
1497		 * vlan-id is port-vlan-id of vsw device.
1498		 */
1499		vswp = (vsw_t *)arg;
1500		*vidp = vswp->pvid;
1501		break;
1502	}
1503
1504	return (B_FALSE);
1505}
1506
1507/*
1508 * Add or remove multicast address(es).
1509 *
1510 * Returns 0 on success, 1 on failure.
1511 */
1512int
1513vsw_add_rem_mcst(vnet_mcast_msg_t *mcst_pkt, vsw_port_t *port)
1514{
1515	mcst_addr_t		*mcst_p = NULL;
1516	vsw_t			*vswp = port->p_vswp;
1517	uint64_t		addr = 0x0;
1518	int			i;
1519
1520	D1(vswp, "%s: enter", __func__);
1521
1522	D2(vswp, "%s: %d addresses", __func__, mcst_pkt->count);
1523
1524	for (i = 0; i < mcst_pkt->count; i++) {
1525		/*
1526		 * Convert address into form that can be used
1527		 * as hash table key.
1528		 */
1529		KEY_HASH(addr, &(mcst_pkt->mca[i]));
1530
1531		/*
1532		 * Add or delete the specified address/port combination.
1533		 */
1534		if (mcst_pkt->set == 0x1) {
1535			D3(vswp, "%s: adding multicast address 0x%llx for "
1536			    "port %ld", __func__, addr, port->p_instance);
1537			if (vsw_add_mcst(vswp, VSW_VNETPORT, addr, port) == 0) {
1538				/*
1539				 * Update the list of multicast
1540				 * addresses contained within the
1541				 * port structure to include this new
1542				 * one.
1543				 */
1544				mcst_p = kmem_zalloc(sizeof (mcst_addr_t),
1545				    KM_NOSLEEP);
1546				if (mcst_p == NULL) {
1547					DERR(vswp, "%s: unable to alloc mem",
1548					    __func__);
1549					(void) vsw_del_mcst(vswp,
1550					    VSW_VNETPORT, addr, port);
1551					return (1);
1552				}
1553
1554				mcst_p->nextp = NULL;
1555				mcst_p->addr = addr;
1556				ether_copy(&mcst_pkt->mca[i], &mcst_p->mca);
1557
1558				/*
1559				 * Program the address into HW. If the addr
1560				 * has already been programmed then the MAC
1561				 * just increments a ref counter (which is
1562				 * used when the address is being deleted)
1563				 */
1564				if (vsw_mac_multicast_add(vswp, port, mcst_p,
1565				    VSW_VNETPORT)) {
1566					(void) vsw_del_mcst(vswp,
1567					    VSW_VNETPORT, addr, port);
1568					kmem_free(mcst_p, sizeof (*mcst_p));
1569					return (1);
1570				}
1571
1572				mutex_enter(&port->mca_lock);
1573				mcst_p->nextp = port->mcap;
1574				port->mcap = mcst_p;
1575				mutex_exit(&port->mca_lock);
1576
1577			} else {
1578				DERR(vswp, "%s: error adding multicast "
1579				    "address 0x%llx for port %ld",
1580				    __func__, addr, port->p_instance);
1581				return (1);
1582			}
1583		} else {
1584			/*
1585			 * Delete an entry from the multicast hash
1586			 * table and update the address list
1587			 * appropriately.
1588			 */
1589			if (vsw_del_mcst(vswp, VSW_VNETPORT, addr, port) == 0) {
1590				D3(vswp, "%s: deleting multicast address "
1591				    "0x%llx for port %ld", __func__, addr,
1592				    port->p_instance);
1593
1594				mcst_p = vsw_del_addr(VSW_VNETPORT, port, addr);
1595				ASSERT(mcst_p != NULL);
1596
1597				/*
1598				 * Remove the address from HW. The address
1599				 * will actually only be removed once the ref
1600				 * count within the MAC layer has dropped to
1601				 * zero. I.e. we can safely call this fn even
1602				 * if other ports are interested in this
1603				 * address.
1604				 */
1605				vsw_mac_multicast_remove(vswp, port, mcst_p,
1606				    VSW_VNETPORT);
1607				kmem_free(mcst_p, sizeof (*mcst_p));
1608
1609			} else {
1610				DERR(vswp, "%s: error deleting multicast "
1611				    "addr 0x%llx for port %ld",
1612				    __func__, addr, port->p_instance);
1613				return (1);
1614			}
1615		}
1616	}
1617	D1(vswp, "%s: exit", __func__);
1618	return (0);
1619}
1620
1621/*
1622 * Add a new multicast entry.
1623 *
1624 * Search hash table based on address. If match found then
1625 * update associated val (which is chain of ports), otherwise
1626 * create new key/val (addr/port) pair and insert into table.
1627 */
1628int
1629vsw_add_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg)
1630{
1631	int		dup = 0;
1632	int		rv = 0;
1633	mfdb_ent_t	*ment = NULL;
1634	mfdb_ent_t	*tmp_ent = NULL;
1635	mfdb_ent_t	*new_ent = NULL;
1636	void		*tgt = NULL;
1637
1638	if (devtype == VSW_VNETPORT) {
1639		/*
1640		 * Being invoked from a vnet.
1641		 */
1642		ASSERT(arg != NULL);
1643		tgt = arg;
1644		D2(NULL, "%s: port %d : address 0x%llx", __func__,
1645		    ((vsw_port_t *)arg)->p_instance, addr);
1646	} else {
1647		/*
1648		 * We are being invoked via the m_multicst mac entry
1649		 * point.
1650		 */
1651		D2(NULL, "%s: address 0x%llx", __func__, addr);
1652		tgt = (void *)vswp;
1653	}
1654
1655	WRITE_ENTER(&vswp->mfdbrw);
1656	if (mod_hash_find(vswp->mfdb, (mod_hash_key_t)addr,
1657	    (mod_hash_val_t *)&ment) != 0) {
1658
1659		/* address not currently in table */
1660		ment = kmem_alloc(sizeof (mfdb_ent_t), KM_SLEEP);
1661		ment->d_addr = (void *)tgt;
1662		ment->d_type = devtype;
1663		ment->nextp = NULL;
1664
1665		if (mod_hash_insert(vswp->mfdb, (mod_hash_key_t)addr,
1666		    (mod_hash_val_t)ment) != 0) {
1667			DERR(vswp, "%s: hash table insertion failed", __func__);
1668			kmem_free(ment, sizeof (mfdb_ent_t));
1669			rv = 1;
1670		} else {
1671			D2(vswp, "%s: added initial entry for 0x%llx to "
1672			    "table", __func__, addr);
1673		}
1674	} else {
1675		/*
1676		 * Address in table. Check to see if specified port
1677		 * is already associated with the address. If not add
1678		 * it now.
1679		 */
1680		tmp_ent = ment;
1681		while (tmp_ent != NULL) {
1682			if (tmp_ent->d_addr == (void *)tgt) {
1683				if (devtype == VSW_VNETPORT) {
1684					DERR(vswp, "%s: duplicate port entry "
1685					    "found for portid %ld and key "
1686					    "0x%llx", __func__,
1687					    ((vsw_port_t *)arg)->p_instance,
1688					    addr);
1689				} else {
1690					DERR(vswp, "%s: duplicate entry found"
1691					    "for key 0x%llx", __func__, addr);
1692				}
1693				rv = 1;
1694				dup = 1;
1695				break;
1696			}
1697			tmp_ent = tmp_ent->nextp;
1698		}
1699
1700		/*
1701		 * Port not on list so add it to end now.
1702		 */
1703		if (0 == dup) {
1704			D2(vswp, "%s: added entry for 0x%llx to table",
1705			    __func__, addr);
1706			new_ent = kmem_alloc(sizeof (mfdb_ent_t), KM_SLEEP);
1707			new_ent->d_addr = (void *)tgt;
1708			new_ent->d_type = devtype;
1709			new_ent->nextp = NULL;
1710
1711			tmp_ent = ment;
1712			while (tmp_ent->nextp != NULL)
1713				tmp_ent = tmp_ent->nextp;
1714
1715			tmp_ent->nextp = new_ent;
1716		}
1717	}
1718
1719	RW_EXIT(&vswp->mfdbrw);
1720	return (rv);
1721}
1722
1723/*
1724 * Remove a multicast entry from the hashtable.
1725 *
1726 * Search hash table based on address. If match found, scan
1727 * list of ports associated with address. If specified port
1728 * found remove it from list.
1729 */
1730int
1731vsw_del_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg)
1732{
1733	mfdb_ent_t	*ment = NULL;
1734	mfdb_ent_t	*curr_p, *prev_p;
1735	void		*tgt = NULL;
1736
1737	D1(vswp, "%s: enter", __func__);
1738
1739	if (devtype == VSW_VNETPORT) {
1740		tgt = (vsw_port_t *)arg;
1741		D2(vswp, "%s: removing port %d from mFDB for address"
1742		    " 0x%llx", __func__, ((vsw_port_t *)tgt)->p_instance, addr);
1743	} else {
1744		D2(vswp, "%s: removing entry", __func__);
1745		tgt = (void *)vswp;
1746	}
1747
1748	WRITE_ENTER(&vswp->mfdbrw);
1749	if (mod_hash_find(vswp->mfdb, (mod_hash_key_t)addr,
1750	    (mod_hash_val_t *)&ment) != 0) {
1751		D2(vswp, "%s: address 0x%llx not in table", __func__, addr);
1752		RW_EXIT(&vswp->mfdbrw);
1753		return (1);
1754	}
1755
1756	prev_p = curr_p = ment;
1757
1758	while (curr_p != NULL) {
1759		if (curr_p->d_addr == (void *)tgt) {
1760			if (devtype == VSW_VNETPORT) {
1761				D2(vswp, "%s: port %d found", __func__,
1762				    ((vsw_port_t *)tgt)->p_instance);
1763			} else {
1764				D2(vswp, "%s: instance found", __func__);
1765			}
1766
1767			if (prev_p == curr_p) {
1768				/*
1769				 * head of list, if no other element is in
1770				 * list then destroy this entry, otherwise
1771				 * just replace it with updated value.
1772				 */
1773				ment = curr_p->nextp;
1774				if (ment == NULL) {
1775					(void) mod_hash_destroy(vswp->mfdb,
1776					    (mod_hash_val_t)addr);
1777				} else {
1778					(void) mod_hash_replace(vswp->mfdb,
1779					    (mod_hash_key_t)addr,
1780					    (mod_hash_val_t)ment);
1781				}
1782			} else {
1783				/*
1784				 * Not head of list, no need to do
1785				 * replacement, just adjust list pointers.
1786				 */
1787				prev_p->nextp = curr_p->nextp;
1788			}
1789			break;
1790		}
1791
1792		prev_p = curr_p;
1793		curr_p = curr_p->nextp;
1794	}
1795
1796	RW_EXIT(&vswp->mfdbrw);
1797
1798	D1(vswp, "%s: exit", __func__);
1799
1800	if (curr_p == NULL)
1801		return (1);
1802	kmem_free(curr_p, sizeof (mfdb_ent_t));
1803	return (0);
1804}
1805
1806/*
1807 * Port is being deleted, but has registered an interest in one
1808 * or more multicast groups. Using the list of addresses maintained
1809 * within the port structure find the appropriate entry in the hash
1810 * table and remove this port from the list of interested ports.
1811 */
1812void
1813vsw_del_mcst_port(vsw_port_t *port)
1814{
1815	mcst_addr_t	*mcap = NULL;
1816	vsw_t		*vswp = port->p_vswp;
1817
1818	D1(vswp, "%s: enter", __func__);
1819
1820	mutex_enter(&port->mca_lock);
1821
1822	while ((mcap = port->mcap) != NULL) {
1823
1824		port->mcap = mcap->nextp;
1825
1826		mutex_exit(&port->mca_lock);
1827
1828		(void) vsw_del_mcst(vswp, VSW_VNETPORT,
1829		    mcap->addr, port);
1830
1831		/*
1832		 * Remove the address from HW. The address
1833		 * will actually only be removed once the ref
1834		 * count within the MAC layer has dropped to
1835		 * zero. I.e. we can safely call this fn even
1836		 * if other ports are interested in this
1837		 * address.
1838		 */
1839		vsw_mac_multicast_remove(vswp, port, mcap, VSW_VNETPORT);
1840		kmem_free(mcap, sizeof (*mcap));
1841
1842		mutex_enter(&port->mca_lock);
1843
1844	}
1845
1846	mutex_exit(&port->mca_lock);
1847
1848	D1(vswp, "%s: exit", __func__);
1849}
1850
1851/*
1852 * This vsw instance is detaching, but has registered an interest in one
1853 * or more multicast groups. Using the list of addresses maintained
1854 * within the vsw structure find the appropriate entry in the hash
1855 * table and remove this instance from the list of interested ports.
1856 */
1857void
1858vsw_del_mcst_vsw(vsw_t *vswp)
1859{
1860	mcst_addr_t	*next_p = NULL;
1861
1862	D1(vswp, "%s: enter", __func__);
1863
1864	mutex_enter(&vswp->mca_lock);
1865
1866	while (vswp->mcap != NULL) {
1867		DERR(vswp, "%s: deleting addr 0x%llx",
1868		    __func__, vswp->mcap->addr);
1869		(void) vsw_del_mcst(vswp, VSW_LOCALDEV, vswp->mcap->addr, NULL);
1870
1871		next_p = vswp->mcap->nextp;
1872		kmem_free(vswp->mcap, sizeof (mcst_addr_t));
1873		vswp->mcap = next_p;
1874	}
1875
1876	vswp->mcap = NULL;
1877	mutex_exit(&vswp->mca_lock);
1878
1879	D1(vswp, "%s: exit", __func__);
1880}
1881
1882mblk_t *
1883vsw_get_same_dest_list(struct ether_header *ehp, mblk_t **mpp)
1884{
1885	mblk_t			*bp;
1886	mblk_t			*nbp;
1887	mblk_t			*head = NULL;
1888	mblk_t			*tail = NULL;
1889	mblk_t			*prev = NULL;
1890	struct ether_header	*behp;
1891
1892	/* process the chain of packets */
1893	bp = *mpp;
1894	while (bp) {
1895		nbp = bp->b_next;
1896		behp = (struct ether_header *)bp->b_rptr;
1897		bp->b_prev = NULL;
1898		if (ether_cmp(&ehp->ether_dhost, &behp->ether_dhost) == 0) {
1899			if (prev == NULL) {
1900				*mpp = nbp;
1901			} else {
1902				prev->b_next = nbp;
1903			}
1904			bp->b_next =  NULL;
1905			if (head == NULL) {
1906				head = tail = bp;
1907			} else {
1908				tail->b_next = bp;
1909				tail = bp;
1910			}
1911		} else {
1912			prev = bp;
1913		}
1914		bp = nbp;
1915	}
1916	return (head);
1917}
1918
1919static mblk_t *
1920vsw_dupmsgchain(mblk_t *mp)
1921{
1922	mblk_t	*nmp = NULL;
1923	mblk_t	**nmpp = &nmp;
1924
1925	for (; mp != NULL; mp = mp->b_next) {
1926		if ((*nmpp = dupmsg(mp)) == NULL) {
1927			freemsgchain(nmp);
1928			return (NULL);
1929		}
1930
1931		nmpp = &((*nmpp)->b_next);
1932	}
1933
1934	return (nmp);
1935}
1936