vsw_switching.c revision 9819:82205569275c
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24 * Use is subject to license terms.
25 */
26
27#include <sys/types.h>
28#include <sys/errno.h>
29#include <sys/debug.h>
30#include <sys/time.h>
31#include <sys/sysmacros.h>
32#include <sys/systm.h>
33#include <sys/user.h>
34#include <sys/stropts.h>
35#include <sys/stream.h>
36#include <sys/strlog.h>
37#include <sys/strsubr.h>
38#include <sys/cmn_err.h>
39#include <sys/cpu.h>
40#include <sys/kmem.h>
41#include <sys/conf.h>
42#include <sys/ddi.h>
43#include <sys/sunddi.h>
44#include <sys/ksynch.h>
45#include <sys/stat.h>
46#include <sys/kstat.h>
47#include <sys/vtrace.h>
48#include <sys/strsun.h>
49#include <sys/dlpi.h>
50#include <sys/ethernet.h>
51#include <net/if.h>
52#include <sys/varargs.h>
53#include <sys/machsystm.h>
54#include <sys/modctl.h>
55#include <sys/modhash.h>
56#include <sys/mac.h>
57#include <sys/mac_ether.h>
58#include <sys/taskq.h>
59#include <sys/note.h>
60#include <sys/mach_descrip.h>
61#include <sys/mdeg.h>
62#include <sys/ldc.h>
63#include <sys/vsw_fdb.h>
64#include <sys/vsw.h>
65#include <sys/vio_mailbox.h>
66#include <sys/vnet_mailbox.h>
67#include <sys/vnet_common.h>
68#include <sys/vio_util.h>
69#include <sys/sdt.h>
70#include <sys/atomic.h>
71#include <sys/vlan.h>
72
73/* Switching setup routines */
74void vsw_setup_switching_thread(void *arg);
75int vsw_setup_switching_start(vsw_t *vswp);
76void vsw_setup_switching_stop(vsw_t *vswp);
77int vsw_setup_switching(vsw_t *);
78void vsw_setup_switching_post_process(vsw_t *vswp);
79void vsw_switch_frame_nop(vsw_t *vswp, mblk_t *mp, int caller,
80    vsw_port_t *port, mac_resource_handle_t mrh);
81static	int vsw_setup_layer2(vsw_t *);
82static	int vsw_setup_layer3(vsw_t *);
83
84/* Switching/data transmit routines */
85static	void vsw_switch_l2_frame_mac_client(vsw_t *vswp, mblk_t *mp, int caller,
86    vsw_port_t *port, mac_resource_handle_t);
87static	void vsw_switch_l2_frame(vsw_t *vswp, mblk_t *mp, int caller,
88	vsw_port_t *port, mac_resource_handle_t);
89static	void vsw_switch_l3_frame(vsw_t *vswp, mblk_t *mp, int caller,
90	vsw_port_t *port, mac_resource_handle_t);
91static	int vsw_forward_all(vsw_t *vswp, mblk_t *mp,
92	int caller, vsw_port_t *port);
93static	int vsw_forward_grp(vsw_t *vswp, mblk_t *mp,
94    int caller, vsw_port_t *port);
95
96/* VLAN routines */
97void vsw_create_vlans(void *arg, int type);
98void vsw_destroy_vlans(void *arg, int type);
99void vsw_vlan_add_ids(void *arg, int type);
100void vsw_vlan_remove_ids(void *arg, int type);
101static	void vsw_vlan_create_hash(void *arg, int type);
102static	void vsw_vlan_destroy_hash(void *arg, int type);
103boolean_t vsw_frame_lookup_vid(void *arg, int caller, struct ether_header *ehp,
104	uint16_t *vidp);
105mblk_t *vsw_vlan_frame_pretag(void *arg, int type, mblk_t *mp);
106uint32_t vsw_vlan_frames_untag(void *arg, int type, mblk_t **np, mblk_t **npt);
107boolean_t vsw_vlan_lookup(mod_hash_t *vlan_hashp, uint16_t vid);
108
109/* Forwarding database (FDB) routines */
110void vsw_fdbe_add(vsw_t *vswp, void *port);
111void vsw_fdbe_del(vsw_t *vswp, struct ether_addr *eaddr);
112static	vsw_fdbe_t *vsw_fdbe_find(vsw_t *vswp, struct ether_addr *);
113static void vsw_fdbe_find_cb(mod_hash_key_t key, mod_hash_val_t val);
114
115int vsw_add_rem_mcst(vnet_mcast_msg_t *, vsw_port_t *);
116int vsw_add_mcst(vsw_t *, uint8_t, uint64_t, void *);
117int vsw_del_mcst(vsw_t *, uint8_t, uint64_t, void *);
118void vsw_del_mcst_vsw(vsw_t *);
119
120/* Support functions */
121static mblk_t *vsw_dupmsgchain(mblk_t *mp);
122static mblk_t *vsw_get_same_dest_list(struct ether_header *ehp, mblk_t **mpp);
123
124
125/*
126 * Functions imported from other files.
127 */
128extern mblk_t *vsw_tx_msg(vsw_t *, mblk_t *, int, vsw_port_t *);
129extern mcst_addr_t *vsw_del_addr(uint8_t, void *, uint64_t);
130extern int vsw_mac_open(vsw_t *vswp);
131extern void vsw_mac_close(vsw_t *vswp);
132extern void vsw_mac_rx(vsw_t *vswp, mac_resource_handle_t mrh,
133    mblk_t *mp, vsw_macrx_flags_t flags);
134extern void vsw_set_addrs(vsw_t *vswp);
135extern int vsw_portsend(vsw_port_t *port, mblk_t *mp);
136extern void vsw_hio_init(vsw_t *vswp);
137extern void vsw_hio_start_ports(vsw_t *vswp);
138extern int vsw_mac_multicast_add(vsw_t *vswp, vsw_port_t *port,
139    mcst_addr_t *mcst_p, int type);
140extern void vsw_mac_multicast_remove(vsw_t *vswp, vsw_port_t *port,
141    mcst_addr_t *mcst_p, int type);
142extern void vsw_mac_link_update(vsw_t *vswp, link_state_t link_state);
143extern void vsw_physlink_update_ports(vsw_t *vswp);
144
145/*
146 * Tunables used in this file.
147 */
148extern	int vsw_setup_switching_delay;
149extern	uint32_t vsw_vlan_nchains;
150extern	uint32_t vsw_fdbe_refcnt_delay;
151
/*
 * Take / drop a reference on a forwarding database entry. The argument
 * is a pointer to a structure with a 32-bit 'refcnt' member, which is
 * updated atomically.
 *
 * Each macro is wrapped in do { } while (0) so that it expands to exactly
 * one statement and can be used safely as the unbraced body of an
 * if/else. Invocations must be followed by a semicolon.
 */
#define	VSW_FDBE_REFHOLD(p)						\
do {									\
	atomic_inc_32(&(p)->refcnt);					\
	ASSERT((p)->refcnt != 0);					\
	_NOTE(CONSTCOND)						\
} while (0)

#define	VSW_FDBE_REFRELE(p)						\
do {									\
	ASSERT((p)->refcnt != 0);					\
	atomic_dec_32(&(p)->refcnt);					\
	_NOTE(CONSTCOND)						\
} while (0)
163
164/*
165 * Thread to setup switching mode. This thread is created during vsw_attach()
166 * initially. It invokes vsw_setup_switching() and keeps retrying while the
167 * returned value is EAGAIN. The thread exits when the switching mode setup is
168 * done successfully or when the error returned is not EAGAIN. This thread may
169 * also get created from vsw_update_md_prop() if the switching mode needs to be
170 * updated.
171 */
172void
173vsw_setup_switching_thread(void *arg)
174{
175	callb_cpr_t	cprinfo;
176	vsw_t		*vswp =  (vsw_t *)arg;
177	clock_t		wait_time;
178	clock_t		xwait;
179	clock_t		wait_rv;
180	int		rv;
181
182	/* wait time used on successive retries */
183	xwait = drv_usectohz(vsw_setup_switching_delay * MICROSEC);
184
185	CALLB_CPR_INIT(&cprinfo, &vswp->sw_thr_lock, callb_generic_cpr,
186	    "vsw_setup_sw_thread");
187
188	mutex_enter(&vswp->sw_thr_lock);
189
190	while ((vswp->sw_thr_flags & VSW_SWTHR_STOP) == 0) {
191
192		CALLB_CPR_SAFE_BEGIN(&cprinfo);
193
194		/* Wait for sometime before (re)trying setup_switching() */
195		wait_time = ddi_get_lbolt() + xwait;
196		while ((vswp->sw_thr_flags & VSW_SWTHR_STOP) == 0) {
197			wait_rv = cv_timedwait(&vswp->sw_thr_cv,
198			    &vswp->sw_thr_lock, wait_time);
199			if (wait_rv == -1) {	/* timed out */
200				break;
201			}
202		}
203
204		CALLB_CPR_SAFE_END(&cprinfo, &vswp->sw_thr_lock)
205
206		if ((vswp->sw_thr_flags & VSW_SWTHR_STOP) != 0) {
207			/*
208			 * If there is a stop request, process that first and
209			 * exit the loop. Continue to hold the mutex which gets
210			 * released in CALLB_CPR_EXIT().
211			 */
212			break;
213		}
214
215		mutex_exit(&vswp->sw_thr_lock);
216		rv = vsw_setup_switching(vswp);
217		if (rv == 0) {
218			vsw_setup_switching_post_process(vswp);
219		}
220		mutex_enter(&vswp->sw_thr_lock);
221		if (rv != EAGAIN) {
222			break;
223		}
224
225	}
226
227	vswp->sw_thr_flags &= ~VSW_SWTHR_STOP;
228	vswp->sw_thread = NULL;
229	CALLB_CPR_EXIT(&cprinfo);
230	thread_exit();
231}
232
233/*
234 * Create a thread to setup the switching mode.
235 * Returns 0 on success; 1 on failure.
236 */
237int
238vsw_setup_switching_start(vsw_t *vswp)
239{
240	mutex_enter(&vswp->sw_thr_lock);
241
242	vswp->sw_thread = thread_create(NULL, 2 * DEFAULTSTKSZ,
243	    vsw_setup_switching_thread, vswp, 0, &p0, TS_RUN, minclsyspri);
244
245	if (vswp->sw_thread == NULL) {
246		mutex_exit(&vswp->sw_thr_lock);
247		return (1);
248	}
249
250	mutex_exit(&vswp->sw_thr_lock);
251	return (0);
252}
253
254/*
255 * Stop the thread to setup switching mode.
256 */
257void
258vsw_setup_switching_stop(vsw_t *vswp)
259{
260	kt_did_t	tid = 0;
261
262	/*
263	 * Signal the setup_switching thread to stop and wait until it stops.
264	 */
265	mutex_enter(&vswp->sw_thr_lock);
266
267	if (vswp->sw_thread != NULL) {
268		tid = vswp->sw_thread->t_did;
269		vswp->sw_thr_flags |= VSW_SWTHR_STOP;
270		cv_signal(&vswp->sw_thr_cv);
271	}
272
273	mutex_exit(&vswp->sw_thr_lock);
274
275	if (tid != 0)
276		thread_join(tid);
277
278	(void) atomic_swap_32(&vswp->switching_setup_done, B_FALSE);
279
280	vswp->mac_open_retries = 0;
281}
282
283/*
284 * Setup the required switching mode.
285 * Returns:
286 *  0 on success.
287 *  EAGAIN if retry is needed.
288 *  1 on all other failures.
289 */
290int
291vsw_setup_switching(vsw_t *vswp)
292{
293	int	rv = 1;
294
295	D1(vswp, "%s: enter", __func__);
296
297	/*
298	 * Select best switching mode.
299	 * This is done as this routine can be called from the timeout
300	 * handler to retry setting up a specific mode. Currently only
301	 * the function which sets up layer2/promisc mode returns EAGAIN
302	 * if the underlying network device is not available yet, causing
303	 * retries.
304	 */
305	if (vswp->smode & VSW_LAYER2) {
306		rv = vsw_setup_layer2(vswp);
307	} else if (vswp->smode & VSW_LAYER3) {
308		rv = vsw_setup_layer3(vswp);
309	} else {
310		DERR(vswp, "unknown switch mode");
311		rv = 1;
312	}
313
314	if (rv && (rv != EAGAIN)) {
315		cmn_err(CE_WARN, "!vsw%d: Unable to setup specified "
316		    "switching mode", vswp->instance);
317	} else if (rv == 0) {
318		(void) atomic_swap_32(&vswp->switching_setup_done, B_TRUE);
319	}
320
321	D2(vswp, "%s: Operating in mode %d", __func__,
322	    vswp->smode);
323
324	D1(vswp, "%s: exit", __func__);
325
326	return (rv);
327}
328
329/*
330 * Setup for layer 2 switching.
331 *
332 * Returns:
333 *  0 on success.
334 *  EAGAIN if retry is needed.
335 *  EIO on all other failures.
336 */
337static int
338vsw_setup_layer2(vsw_t *vswp)
339{
340	int	rv;
341
342	D1(vswp, "%s: enter", __func__);
343
344	/*
345	 * Until the network device is successfully opened,
346	 * set the switching to use vsw_switch_l2_frame.
347	 */
348	vswp->vsw_switch_frame = vsw_switch_l2_frame;
349	vswp->mac_cl_switching = B_FALSE;
350
351	rv = strlen(vswp->physname);
352	if (rv == 0) {
353		/*
354		 * Physical device name is NULL, which is
355		 * required for layer 2.
356		 */
357		cmn_err(CE_WARN, "!vsw%d: no network device name specified",
358		    vswp->instance);
359		return (EIO);
360	}
361
362	mutex_enter(&vswp->mac_lock);
363
364	rv = vsw_mac_open(vswp);
365	if (rv != 0) {
366		if (rv != EAGAIN) {
367			cmn_err(CE_WARN, "!vsw%d: Unable to open network "
368			    "device: %s\n", vswp->instance, vswp->physname);
369		}
370		mutex_exit(&vswp->mac_lock);
371		return (rv);
372	}
373
374	/*
375	 * Now we can use the mac client switching, so set the switching
376	 * function to use vsw_switch_l2_frame_mac_client(), which simply
377	 * sends the packets to MAC layer for switching.
378	 */
379	vswp->vsw_switch_frame = vsw_switch_l2_frame_mac_client;
380	vswp->mac_cl_switching = B_TRUE;
381
382	D1(vswp, "%s: exit", __func__);
383
384	/* Initialize HybridIO related stuff */
385	vsw_hio_init(vswp);
386
387	mutex_exit(&vswp->mac_lock);
388	return (0);
389
390exit_error:
391	vsw_mac_close(vswp);
392	mutex_exit(&vswp->mac_lock);
393	return (EIO);
394}
395
396static int
397vsw_setup_layer3(vsw_t *vswp)
398{
399	D1(vswp, "%s: enter", __func__);
400
401	D2(vswp, "%s: operating in layer 3 mode", __func__);
402	vswp->vsw_switch_frame = vsw_switch_l3_frame;
403
404	D1(vswp, "%s: exit", __func__);
405
406	return (0);
407}
408
409/* ARGSUSED */
410void
411vsw_switch_frame_nop(vsw_t *vswp, mblk_t *mp, int caller, vsw_port_t *port,
412			mac_resource_handle_t mrh)
413{
414	freemsgchain(mp);
415}
416
417/*
418 * Use mac client for layer 2 switching .
419 */
420static void
421vsw_switch_l2_frame_mac_client(vsw_t *vswp, mblk_t *mp, int caller,
422    vsw_port_t *port, mac_resource_handle_t mrh)
423{
424	_NOTE(ARGUNUSED(mrh))
425
426	mblk_t		*ret_m;
427
428	/*
429	 * This switching function is expected to be called by
430	 * the ports or the interface only. The packets from
431	 * physical interface already switched.
432	 */
433	ASSERT((caller == VSW_VNETPORT) || (caller == VSW_LOCALDEV));
434
435	if ((ret_m = vsw_tx_msg(vswp, mp, caller, port)) != NULL) {
436		DERR(vswp, "%s: drop mblks to "
437		    "phys dev", __func__);
438		freemsgchain(ret_m);
439	}
440}
441
442/*
443 * Switch the given ethernet frame when operating in layer 2 mode.
444 *
445 * vswp: pointer to the vsw instance
446 * mp: pointer to chain of ethernet frame(s) to be switched
447 * caller: identifies the source of this frame as:
448 * 		1. VSW_VNETPORT - a vsw port (connected to a vnet).
449 *		2. VSW_PHYSDEV - the physical ethernet device
450 *		3. VSW_LOCALDEV - vsw configured as a virtual interface
451 * arg: argument provided by the caller.
452 *		1. for VNETPORT - pointer to the corresponding vsw_port_t.
453 *		2. for PHYSDEV - NULL
454 *		3. for LOCALDEV - pointer to to this vsw_t(self)
455 */
456void
457vsw_switch_l2_frame(vsw_t *vswp, mblk_t *mp, int caller,
458			vsw_port_t *arg, mac_resource_handle_t mrh)
459{
460	struct ether_header	*ehp;
461	mblk_t			*bp, *ret_m;
462	vsw_fdbe_t		*fp;
463
464	D1(vswp, "%s: enter (caller %d)", __func__, caller);
465
466	/*
467	 * PERF: rather than breaking up the chain here, scan it
468	 * to find all mblks heading to same destination and then
469	 * pass that sub-chain to the lower transmit functions.
470	 */
471
472	/* process the chain of packets */
473	bp = mp;
474	while (bp) {
475		ehp = (struct ether_header *)bp->b_rptr;
476		mp = vsw_get_same_dest_list(ehp, &bp);
477		ASSERT(mp != NULL);
478
479		D2(vswp, "%s: mblk data buffer %lld : actual data size %lld",
480		    __func__, MBLKSIZE(mp), MBLKL(mp));
481
482		if (ether_cmp(&ehp->ether_dhost, &vswp->if_addr) == 0) {
483			/*
484			 * If destination is VSW_LOCALDEV (vsw as an eth
485			 * interface) and if the device is up & running,
486			 * send the packet up the stack on this host.
487			 * If the virtual interface is down, drop the packet.
488			 */
489			if (caller != VSW_LOCALDEV) {
490				vsw_mac_rx(vswp, mrh, mp, VSW_MACRX_FREEMSG);
491			} else {
492				freemsgchain(mp);
493			}
494			continue;
495		}
496
497		/*
498		 * Find fdb entry for the destination
499		 * and hold a reference to it.
500		 */
501		fp = vsw_fdbe_find(vswp, &ehp->ether_dhost);
502		if (fp != NULL) {
503
504			/*
505			 * If plumbed and in promisc mode then copy msg
506			 * and send up the stack.
507			 */
508			vsw_mac_rx(vswp, mrh, mp,
509			    VSW_MACRX_PROMISC | VSW_MACRX_COPYMSG);
510
511			/*
512			 * If the destination is in FDB, the packet
513			 * should be forwarded to the correponding
514			 * vsw_port (connected to a vnet device -
515			 * VSW_VNETPORT)
516			 */
517			(void) vsw_portsend(fp->portp, mp);
518
519			/* Release the reference on the fdb entry */
520			VSW_FDBE_REFRELE(fp);
521		} else {
522			/*
523			 * Destination not in FDB.
524			 *
525			 * If the destination is broadcast or
526			 * multicast forward the packet to all
527			 * (VNETPORTs, PHYSDEV, LOCALDEV),
528			 * except the caller.
529			 */
530			if (IS_BROADCAST(ehp)) {
531				D2(vswp, "%s: BROADCAST pkt", __func__);
532				(void) vsw_forward_all(vswp, mp, caller, arg);
533			} else if (IS_MULTICAST(ehp)) {
534				D2(vswp, "%s: MULTICAST pkt", __func__);
535				(void) vsw_forward_grp(vswp, mp, caller, arg);
536			} else {
537				/*
538				 * If the destination is unicast, and came
539				 * from either a logical network device or
540				 * the switch itself when it is plumbed, then
541				 * send it out on the physical device and also
542				 * up the stack if the logical interface is
543				 * in promiscious mode.
544				 *
545				 * NOTE:  The assumption here is that if we
546				 * cannot find the destination in our fdb, its
547				 * a unicast address, and came from either a
548				 * vnet or down the stack (when plumbed) it
549				 * must be destinded for an ethernet device
550				 * outside our ldoms.
551				 */
552				if (caller == VSW_VNETPORT) {
553					/* promisc check copy etc */
554					vsw_mac_rx(vswp, mrh, mp,
555					    VSW_MACRX_PROMISC |
556					    VSW_MACRX_COPYMSG);
557
558					if ((ret_m = vsw_tx_msg(vswp, mp,
559					    caller, arg)) != NULL) {
560						DERR(vswp, "%s: drop mblks to "
561						    "phys dev", __func__);
562						freemsgchain(ret_m);
563					}
564
565				} else if (caller == VSW_PHYSDEV) {
566					/*
567					 * Pkt seen because card in promisc
568					 * mode. Send up stack if plumbed in
569					 * promisc mode, else drop it.
570					 */
571					vsw_mac_rx(vswp, mrh, mp,
572					    VSW_MACRX_PROMISC |
573					    VSW_MACRX_FREEMSG);
574
575				} else if (caller == VSW_LOCALDEV) {
576					/*
577					 * Pkt came down the stack, send out
578					 * over physical device.
579					 */
580					if ((ret_m = vsw_tx_msg(vswp, mp,
581					    caller, NULL)) != NULL) {
582						DERR(vswp, "%s: drop mblks to "
583						    "phys dev", __func__);
584						freemsgchain(ret_m);
585					}
586				}
587			}
588		}
589	}
590	D1(vswp, "%s: exit\n", __func__);
591}
592
593/*
594 * Switch ethernet frame when in layer 3 mode (i.e. using IP
595 * layer to do the routing).
596 *
597 * There is a large amount of overlap between this function and
598 * vsw_switch_l2_frame. At some stage we need to revisit and refactor
599 * both these functions.
600 */
601void
602vsw_switch_l3_frame(vsw_t *vswp, mblk_t *mp, int caller,
603			vsw_port_t *arg, mac_resource_handle_t mrh)
604{
605	struct ether_header	*ehp;
606	mblk_t			*bp = NULL;
607	vsw_fdbe_t		*fp;
608
609	D1(vswp, "%s: enter (caller %d)", __func__, caller);
610
611	/*
612	 * In layer 3 mode should only ever be switching packets
613	 * between IP layer and vnet devices. So make sure thats
614	 * who is invoking us.
615	 */
616	if ((caller != VSW_LOCALDEV) && (caller != VSW_VNETPORT)) {
617		DERR(vswp, "%s: unexpected caller (%d)", __func__, caller);
618		freemsgchain(mp);
619		return;
620	}
621
622	/* process the chain of packets */
623	bp = mp;
624	while (bp) {
625		ehp = (struct ether_header *)bp->b_rptr;
626		mp = vsw_get_same_dest_list(ehp, &bp);
627		ASSERT(mp != NULL);
628
629		D2(vswp, "%s: mblk data buffer %lld : actual data size %lld",
630		    __func__, MBLKSIZE(mp), MBLKL(mp));
631
632		/*
633		 * Find fdb entry for the destination
634		 * and hold a reference to it.
635		 */
636		fp = vsw_fdbe_find(vswp, &ehp->ether_dhost);
637		if (fp != NULL) {
638
639			D2(vswp, "%s: sending to target port", __func__);
640			(void) vsw_portsend(fp->portp, mp);
641
642			/* Release the reference on the fdb entry */
643			VSW_FDBE_REFRELE(fp);
644		} else {
645			/*
646			 * Destination not in FDB
647			 *
648			 * If the destination is broadcast or
649			 * multicast forward the packet to all
650			 * (VNETPORTs, PHYSDEV, LOCALDEV),
651			 * except the caller.
652			 */
653			if (IS_BROADCAST(ehp)) {
654				D2(vswp, "%s: BROADCAST pkt", __func__);
655				(void) vsw_forward_all(vswp, mp, caller, arg);
656			} else if (IS_MULTICAST(ehp)) {
657				D2(vswp, "%s: MULTICAST pkt", __func__);
658				(void) vsw_forward_grp(vswp, mp, caller, arg);
659			} else {
660				/*
661				 * Unicast pkt from vnet that we don't have
662				 * an FDB entry for, so must be destinded for
663				 * the outside world. Attempt to send up to the
664				 * IP layer to allow it to deal with it.
665				 */
666				if (caller == VSW_VNETPORT) {
667					vsw_mac_rx(vswp, mrh,
668					    mp, VSW_MACRX_FREEMSG);
669				}
670			}
671		}
672	}
673
674	D1(vswp, "%s: exit", __func__);
675}
676
677/*
678 * Additional initializations that are needed for the specific switching mode.
679 */
680void
681vsw_setup_switching_post_process(vsw_t *vswp)
682{
683	link_state_t	link_state = LINK_STATE_UP;
684
685	if (vswp->smode & VSW_LAYER2) {
686		/*
687		 * Program unicst, mcst addrs of vsw
688		 * interface and ports in the physdev.
689		 */
690		vsw_set_addrs(vswp);
691
692		/* Start HIO for ports that have already connected */
693		vsw_hio_start_ports(vswp);
694
695		if (vswp->pls_update == B_TRUE) {
696			link_state = vswp->phys_link_state;
697		}
698
699		/* Update physical link info to any ports already connected */
700		vsw_physlink_update_ports(vswp);
701	}
702
703	vsw_mac_link_update(vswp, link_state);
704}
705
706/*
707 * Forward the ethernet frame to all ports (VNETPORTs, PHYSDEV, LOCALDEV),
708 * except the caller (port on which frame arrived).
709 */
710static int
711vsw_forward_all(vsw_t *vswp, mblk_t *mp, int caller, vsw_port_t *arg)
712{
713	vsw_port_list_t	*plist = &vswp->plist;
714	vsw_port_t	*portp;
715	mblk_t		*nmp = NULL;
716	mblk_t		*ret_m = NULL;
717	int		skip_port = 0;
718
719	D1(vswp, "vsw_forward_all: enter\n");
720
721	/*
722	 * Broadcast message from inside ldoms so send to outside
723	 * world if in either of layer 2 modes.
724	 */
725	if ((vswp->smode & VSW_LAYER2) &&
726	    ((caller == VSW_LOCALDEV) || (caller == VSW_VNETPORT))) {
727
728		nmp = vsw_dupmsgchain(mp);
729		if (nmp) {
730			if ((ret_m = vsw_tx_msg(vswp, nmp, caller, arg))
731			    != NULL) {
732				DERR(vswp, "%s: dropping pkt(s) "
733				    "consisting of %ld bytes of data for"
734				    " physical device", __func__, MBLKL(ret_m));
735				freemsgchain(ret_m);
736			}
737		}
738	}
739
740	if (caller == VSW_VNETPORT)
741		skip_port = 1;
742
743	/*
744	 * Broadcast message from other vnet (layer 2 or 3) or outside
745	 * world (layer 2 only), send up stack if plumbed.
746	 */
747	if ((caller == VSW_PHYSDEV) || (caller == VSW_VNETPORT)) {
748		vsw_mac_rx(vswp, NULL, mp, VSW_MACRX_COPYMSG);
749	}
750
751	/* send it to all VNETPORTs */
752	READ_ENTER(&plist->lockrw);
753	for (portp = plist->head; portp != NULL; portp = portp->p_next) {
754		D2(vswp, "vsw_forward_all: port %d", portp->p_instance);
755		/*
756		 * Caution ! - don't reorder these two checks as arg
757		 * will be NULL if the caller is PHYSDEV. skip_port is
758		 * only set if caller is VNETPORT.
759		 */
760		if ((skip_port) && (portp == arg)) {
761			continue;
762		} else {
763			nmp = vsw_dupmsgchain(mp);
764			if (nmp) {
765				/*
766				 * The plist->lockrw is protecting the
767				 * portp from getting destroyed here.
768				 * So, no ref_cnt is incremented here.
769				 */
770				(void) vsw_portsend(portp, nmp);
771			} else {
772				DERR(vswp, "vsw_forward_all: nmp NULL");
773			}
774		}
775	}
776	RW_EXIT(&plist->lockrw);
777
778	freemsgchain(mp);
779
780	D1(vswp, "vsw_forward_all: exit\n");
781	return (0);
782}
783
784/*
785 * Forward pkts to any devices or interfaces which have registered
786 * an interest in them (i.e. multicast groups).
787 */
788static int
789vsw_forward_grp(vsw_t *vswp, mblk_t *mp, int caller, vsw_port_t *arg)
790{
791	struct ether_header	*ehp = (struct ether_header *)mp->b_rptr;
792	mfdb_ent_t		*entp = NULL;
793	mfdb_ent_t		*tpp = NULL;
794	vsw_port_t 		*port;
795	uint64_t		key = 0;
796	mblk_t			*nmp = NULL;
797	mblk_t			*ret_m = NULL;
798	boolean_t		check_if = B_TRUE;
799
800	/*
801	 * Convert address to hash table key
802	 */
803	KEY_HASH(key, &ehp->ether_dhost);
804
805	D1(vswp, "%s: key 0x%llx", __func__, key);
806
807	/*
808	 * If pkt came from either a vnet or down the stack (if we are
809	 * plumbed) and we are in layer 2 mode, then we send the pkt out
810	 * over the physical adapter, and then check to see if any other
811	 * vnets are interested in it.
812	 */
813	if ((vswp->smode & VSW_LAYER2) &&
814	    ((caller == VSW_VNETPORT) || (caller == VSW_LOCALDEV))) {
815		nmp = vsw_dupmsgchain(mp);
816		if (nmp) {
817			if ((ret_m = vsw_tx_msg(vswp, nmp, caller, arg))
818			    != NULL) {
819				DERR(vswp, "%s: dropping pkt(s) consisting of "
820				    "%ld bytes of data for physical device",
821				    __func__, MBLKL(ret_m));
822				freemsgchain(ret_m);
823			}
824		}
825	}
826
827	READ_ENTER(&vswp->mfdbrw);
828	if (mod_hash_find(vswp->mfdb, (mod_hash_key_t)key,
829	    (mod_hash_val_t *)&entp) != 0) {
830		D3(vswp, "%s: no table entry found for addr 0x%llx",
831		    __func__, key);
832	} else {
833		/*
834		 * Send to list of devices associated with this address...
835		 */
836		for (tpp = entp; tpp != NULL; tpp = tpp->nextp) {
837
838			/* dont send to ourselves */
839			if ((caller == VSW_VNETPORT) &&
840			    (tpp->d_addr == (void *)arg)) {
841				port = (vsw_port_t *)tpp->d_addr;
842				D3(vswp, "%s: not sending to ourselves"
843				    " : port %d", __func__, port->p_instance);
844				continue;
845
846			} else if ((caller == VSW_LOCALDEV) &&
847			    (tpp->d_type == VSW_LOCALDEV)) {
848				D2(vswp, "%s: not sending back up stack",
849				    __func__);
850				continue;
851			}
852
853			if (tpp->d_type == VSW_VNETPORT) {
854				port = (vsw_port_t *)tpp->d_addr;
855				D3(vswp, "%s: sending to port %ld for addr "
856				    "0x%llx", __func__, port->p_instance, key);
857
858				nmp = vsw_dupmsgchain(mp);
859				if (nmp) {
860					/*
861					 * The vswp->mfdbrw is protecting the
862					 * portp from getting destroyed here.
863					 * So, no ref_cnt is incremented here.
864					 */
865					(void) vsw_portsend(port, nmp);
866				}
867			} else {
868				vsw_mac_rx(vswp, NULL,
869				    mp, VSW_MACRX_COPYMSG);
870				D2(vswp, "%s: sending up stack"
871				    " for addr 0x%llx", __func__, key);
872				check_if = B_FALSE;
873			}
874		}
875	}
876
877	RW_EXIT(&vswp->mfdbrw);
878
879	/*
880	 * If the pkt came from either a vnet or from physical device,
881	 * and if we havent already sent the pkt up the stack then we
882	 * check now if we can/should (i.e. the interface is plumbed
883	 * and in promisc mode).
884	 */
885	if ((check_if) &&
886	    ((caller == VSW_VNETPORT) || (caller == VSW_PHYSDEV))) {
887		vsw_mac_rx(vswp, NULL, mp,
888		    VSW_MACRX_PROMISC | VSW_MACRX_COPYMSG);
889	}
890
891	freemsgchain(mp);
892
893	D1(vswp, "%s: exit", __func__);
894
895	return (0);
896}
897
/*
 * Create the vlan id hash table of a vsw device or port and fill it with
 * every vlan id assigned to that device or port.
 *
 * arg:  the vsw device (vsw_t) or the port (vsw_port_t).
 * type: what arg is; VSW_LOCALDEV (vsw device) or VSW_VNETPORT (port).
 */
void
vsw_create_vlans(void *arg, int type)
{
	/* the table has to exist before ids can be added to it */
	vsw_vlan_create_hash(arg, type);
	vsw_vlan_add_ids(arg, type);
}
915
/*
 * Empty the vlan id hash table of a vsw device or port and then destroy
 * the table itself.
 *
 * arg:  the vsw device (vsw_t) or the port (vsw_port_t).
 * type: what arg is; VSW_LOCALDEV (vsw device) or VSW_VNETPORT (port).
 */
void
vsw_destroy_vlans(void *arg, int type)
{
	/* take the ids out first, then tear down the table */
	vsw_vlan_remove_ids(arg, type);
	vsw_vlan_destroy_hash(arg, type);
}
932
933/*
934 * Create a vlan-id hash table for the given vsw device or port.
935 */
936static void
937vsw_vlan_create_hash(void *arg, int type)
938{
939	char		hashname[MAXNAMELEN];
940
941	if (type == VSW_LOCALDEV) {
942		vsw_t		*vswp = (vsw_t *)arg;
943
944		(void) snprintf(hashname, MAXNAMELEN, "vsw%d-vlan-hash",
945		    vswp->instance);
946
947		vswp->vlan_nchains = vsw_vlan_nchains;
948		vswp->vlan_hashp = mod_hash_create_idhash(hashname,
949		    vswp->vlan_nchains, mod_hash_null_valdtor);
950
951	} else if (type == VSW_VNETPORT) {
952		vsw_port_t	*portp = (vsw_port_t *)arg;
953
954		(void) snprintf(hashname, MAXNAMELEN, "port%d-vlan-hash",
955		    portp->p_instance);
956
957		portp->vlan_nchains = vsw_vlan_nchains;
958		portp->vlan_hashp = mod_hash_create_idhash(hashname,
959		    portp->vlan_nchains, mod_hash_null_valdtor);
960
961	} else {
962		return;
963	}
964}
965
966/*
967 * Destroy the vlan-id hash table for the given vsw device or port.
968 */
969static void
970vsw_vlan_destroy_hash(void *arg, int type)
971{
972	if (type == VSW_LOCALDEV) {
973		vsw_t		*vswp = (vsw_t *)arg;
974
975		mod_hash_destroy_hash(vswp->vlan_hashp);
976		vswp->vlan_nchains = 0;
977	} else if (type == VSW_VNETPORT) {
978		vsw_port_t	*portp = (vsw_port_t *)arg;
979
980		mod_hash_destroy_hash(portp->vlan_hashp);
981		portp->vlan_nchains = 0;
982	} else {
983		return;
984	}
985}
986
987/*
988 * Add vlan ids of the given vsw device or port into its hash table.
989 */
990void
991vsw_vlan_add_ids(void *arg, int type)
992{
993	int	rv;
994	int	i;
995
996	if (type == VSW_LOCALDEV) {
997		vsw_t		*vswp = (vsw_t *)arg;
998
999		rv = mod_hash_insert(vswp->vlan_hashp,
1000		    (mod_hash_key_t)VLAN_ID_KEY(vswp->pvid),
1001		    (mod_hash_val_t)B_TRUE);
1002		if (rv != 0) {
1003			cmn_err(CE_WARN, "vsw%d: Duplicate vlan-id(%d) for "
1004			    "the interface", vswp->instance, vswp->pvid);
1005		}
1006
1007		for (i = 0; i < vswp->nvids; i++) {
1008			rv = mod_hash_insert(vswp->vlan_hashp,
1009			    (mod_hash_key_t)VLAN_ID_KEY(vswp->vids[i].vl_vid),
1010			    (mod_hash_val_t)B_TRUE);
1011			if (rv != 0) {
1012				cmn_err(CE_WARN, "vsw%d: Duplicate vlan-id(%d)"
1013				    " for the interface", vswp->instance,
1014				    vswp->pvid);
1015			}
1016		}
1017
1018	} else if (type == VSW_VNETPORT) {
1019		vsw_port_t	*portp = (vsw_port_t *)arg;
1020		vsw_t		*vswp = portp->p_vswp;
1021
1022		rv = mod_hash_insert(portp->vlan_hashp,
1023		    (mod_hash_key_t)VLAN_ID_KEY(portp->pvid),
1024		    (mod_hash_val_t)B_TRUE);
1025		if (rv != 0) {
1026			cmn_err(CE_WARN, "vsw%d: Duplicate vlan-id(%d) for "
1027			    "the port(%d)", vswp->instance, vswp->pvid,
1028			    portp->p_instance);
1029		}
1030
1031		for (i = 0; i < portp->nvids; i++) {
1032			rv = mod_hash_insert(portp->vlan_hashp,
1033			    (mod_hash_key_t)VLAN_ID_KEY(portp->vids[i].vl_vid),
1034			    (mod_hash_val_t)B_TRUE);
1035			if (rv != 0) {
1036				cmn_err(CE_WARN, "vsw%d: Duplicate vlan-id(%d)"
1037				    " for the port(%d)", vswp->instance,
1038				    vswp->pvid, portp->p_instance);
1039			}
1040		}
1041
1042	}
1043}
1044
1045/*
1046 * Remove vlan ids of the given vsw device or port from its hash table.
1047 */
1048void
1049vsw_vlan_remove_ids(void *arg, int type)
1050{
1051	mod_hash_val_t	vp;
1052	int		rv;
1053	int		i;
1054
1055	if (type == VSW_LOCALDEV) {
1056		vsw_t		*vswp = (vsw_t *)arg;
1057
1058		rv = vsw_vlan_lookup(vswp->vlan_hashp, vswp->pvid);
1059		if (rv == B_TRUE) {
1060			rv = mod_hash_remove(vswp->vlan_hashp,
1061			    (mod_hash_key_t)VLAN_ID_KEY(vswp->pvid),
1062			    (mod_hash_val_t *)&vp);
1063			ASSERT(rv == 0);
1064		}
1065
1066		for (i = 0; i < vswp->nvids; i++) {
1067			rv = vsw_vlan_lookup(vswp->vlan_hashp,
1068			    vswp->vids[i].vl_vid);
1069			if (rv == B_TRUE) {
1070				rv = mod_hash_remove(vswp->vlan_hashp,
1071				    (mod_hash_key_t)VLAN_ID_KEY(
1072				    vswp->vids[i].vl_vid),
1073				    (mod_hash_val_t *)&vp);
1074				ASSERT(rv == 0);
1075			}
1076		}
1077
1078	} else if (type == VSW_VNETPORT) {
1079		vsw_port_t	*portp = (vsw_port_t *)arg;
1080
1081		portp = (vsw_port_t *)arg;
1082		rv = vsw_vlan_lookup(portp->vlan_hashp, portp->pvid);
1083		if (rv == B_TRUE) {
1084			rv = mod_hash_remove(portp->vlan_hashp,
1085			    (mod_hash_key_t)VLAN_ID_KEY(portp->pvid),
1086			    (mod_hash_val_t *)&vp);
1087			ASSERT(rv == 0);
1088		}
1089
1090		for (i = 0; i < portp->nvids; i++) {
1091			rv = vsw_vlan_lookup(portp->vlan_hashp,
1092			    portp->vids[i].vl_vid);
1093			if (rv == B_TRUE) {
1094				rv = mod_hash_remove(portp->vlan_hashp,
1095				    (mod_hash_key_t)VLAN_ID_KEY(
1096				    portp->vids[i].vl_vid),
1097				    (mod_hash_val_t *)&vp);
1098				ASSERT(rv == 0);
1099			}
1100		}
1101
1102	} else {
1103		return;
1104	}
1105}
1106
1107/*
1108 * Find the given vlan id in the hash table.
1109 * Return: B_TRUE if the id is found; B_FALSE if not found.
1110 */
1111boolean_t
1112vsw_vlan_lookup(mod_hash_t *vlan_hashp, uint16_t vid)
1113{
1114	int		rv;
1115	mod_hash_val_t	vp;
1116
1117	rv = mod_hash_find(vlan_hashp, VLAN_ID_KEY(vid), (mod_hash_val_t *)&vp);
1118
1119	if (rv != 0)
1120		return (B_FALSE);
1121
1122	return (B_TRUE);
1123}
1124
1125/*
1126 * Add an entry into FDB for the given vsw.
1127 */
1128void
1129vsw_fdbe_add(vsw_t *vswp, void *port)
1130{
1131	uint64_t	addr = 0;
1132	vsw_port_t	*portp;
1133	vsw_fdbe_t	*fp;
1134	int		rv;
1135
1136	portp = (vsw_port_t *)port;
1137	KEY_HASH(addr, &portp->p_macaddr);
1138
1139	fp = kmem_zalloc(sizeof (vsw_fdbe_t), KM_SLEEP);
1140	fp->portp = port;
1141
1142	/*
1143	 * Note: duplicate keys will be rejected by mod_hash.
1144	 */
1145	rv = mod_hash_insert(vswp->fdb_hashp, (mod_hash_key_t)addr,
1146	    (mod_hash_val_t)fp);
1147	if (rv != 0) {
1148		cmn_err(CE_WARN, "vsw%d: Duplicate mac-address(%s) for "
1149		    "the port(%d)", vswp->instance,
1150		    ether_sprintf(&portp->p_macaddr), portp->p_instance);
1151	}
1152}
1153
1154/*
1155 * Remove an entry from FDB.
1156 */
1157void
1158vsw_fdbe_del(vsw_t *vswp, struct ether_addr *eaddr)
1159{
1160	uint64_t	addr = 0;
1161	vsw_fdbe_t	*fp;
1162	int		rv;
1163
1164	KEY_HASH(addr, eaddr);
1165
1166	/*
1167	 * Remove the entry from fdb hash table.
1168	 * This prevents further references to this fdb entry.
1169	 */
1170	rv = mod_hash_remove(vswp->fdb_hashp, (mod_hash_key_t)addr,
1171	    (mod_hash_val_t *)&fp);
1172	if (rv != 0) {
1173		/* invalid key? */
1174		return;
1175	}
1176
1177	/*
1178	 * If there are threads already ref holding before the entry was
1179	 * removed from hash table, then wait for ref count to drop to zero.
1180	 */
1181	while (fp->refcnt != 0) {
1182		delay(drv_usectohz(vsw_fdbe_refcnt_delay));
1183	}
1184
1185	kmem_free(fp, sizeof (*fp));
1186}
1187
1188/*
1189 * Search fdb for a given mac address. If an entry is found, hold
1190 * a reference to it and return the entry, else returns NULL.
1191 */
1192static vsw_fdbe_t *
1193vsw_fdbe_find(vsw_t *vswp, struct ether_addr *addrp)
1194{
1195	uint64_t	key = 0;
1196	vsw_fdbe_t	*fp;
1197	int		rv;
1198
1199	KEY_HASH(key, addrp);
1200
1201	rv = mod_hash_find_cb(vswp->fdb_hashp, (mod_hash_key_t)key,
1202	    (mod_hash_val_t *)&fp, vsw_fdbe_find_cb);
1203
1204	if (rv != 0)
1205		return (NULL);
1206
1207	return (fp);
1208}
1209
1210/*
1211 * Callback function provided to mod_hash_find_cb(). After finding the fdb
1212 * entry corresponding to the key (macaddr), this callback will be invoked by
1213 * mod_hash_find_cb() to atomically increment the reference count on the fdb
1214 * entry before returning the found entry.
1215 */
1216static void
1217vsw_fdbe_find_cb(mod_hash_key_t key, mod_hash_val_t val)
1218{
1219	_NOTE(ARGUNUSED(key))
1220	VSW_FDBE_REFHOLD((vsw_fdbe_t *)val);
1221}
1222
1223/*
1224 * A given frame must be always tagged with the appropriate vlan id (unless it
1225 * is in the default-vlan) before the mac address switching function is called.
1226 * Otherwise, after switching function determines the destination, we cannot
1227 * figure out if the destination belongs to the the same vlan that the frame
1228 * originated from and if it needs tag/untag. Frames which are inbound from
1229 * the external(physical) network over a vlan trunk link are always tagged.
1230 * However frames which are received from a vnet-port over ldc or frames which
1231 * are coming down the stack on the service domain over vsw interface may be
1232 * untagged. These frames must be tagged with the appropriate pvid of the
1233 * sender (vnet-port or vsw device), before invoking the switching function.
1234 *
1235 * Arguments:
1236 *   arg:    caller of the function.
1237 *   type:   type of arg(caller): VSW_LOCALDEV(vsw) or VSW_VNETPORT(port)
1238 *   mp:     frame(s) to be tagged.
1239 */
1240mblk_t *
1241vsw_vlan_frame_pretag(void *arg, int type, mblk_t *mp)
1242{
1243	vsw_t			*vswp;
1244	vsw_port_t		*portp;
1245	struct ether_header	*ehp;
1246	mblk_t			*bp;
1247	mblk_t			*bpt;
1248	mblk_t			*bph;
1249	mblk_t			*bpn;
1250	uint16_t		pvid;
1251
1252	ASSERT((type == VSW_LOCALDEV) || (type == VSW_VNETPORT));
1253
1254	if (type == VSW_LOCALDEV) {
1255		vswp = (vsw_t *)arg;
1256		pvid = vswp->pvid;
1257		portp = NULL;
1258	} else {
1259		/* VSW_VNETPORT */
1260		portp = (vsw_port_t *)arg;
1261		pvid = portp->pvid;
1262		vswp = portp->p_vswp;
1263	}
1264
1265	bpn = bph = bpt = NULL;
1266
1267	for (bp = mp; bp != NULL; bp = bpn) {
1268
1269		bpn = bp->b_next;
1270		bp->b_next = bp->b_prev = NULL;
1271
1272		/* Determine if it is an untagged frame */
1273		ehp = (struct ether_header *)bp->b_rptr;
1274
1275		if (ehp->ether_type != ETHERTYPE_VLAN) {	/* untagged */
1276
1277			/* no need to tag if the frame is in default vlan */
1278			if (pvid != vswp->default_vlan_id) {
1279				bp = vnet_vlan_insert_tag(bp, pvid);
1280				if (bp == NULL) {
1281					continue;
1282				}
1283			}
1284		}
1285
1286		/* build a chain of processed packets */
1287		if (bph == NULL) {
1288			bph = bpt = bp;
1289		} else {
1290			bpt->b_next = bp;
1291			bpt = bp;
1292		}
1293
1294	}
1295
1296	return (bph);
1297}
1298
1299/*
1300 * Frames destined to a vnet-port or to the local vsw interface, must be
1301 * untagged if necessary before sending. This function first checks that the
1302 * frame can be sent to the destination in the vlan identified by the frame
1303 * tag. Note that when this function is invoked the frame must have been
1304 * already tagged (unless it is in the default-vlan). Because, this function is
1305 * called when the switching function determines the destination and invokes
1306 * its send function (vnet-port or vsw interface) and all frames would have
1307 * been tagged by this time (see comments in vsw_vlan_frame_pretag()).
1308 *
1309 * Arguments:
1310 *   arg:    destination device.
1311 *   type:   type of arg(destination): VSW_LOCALDEV(vsw) or VSW_VNETPORT(port)
1312 *   np:     head of pkt chain to be validated and untagged.
1313 *   npt:    tail of pkt chain to be validated and untagged.
1314 *
1315 * Returns:
1316 *   np:     head of updated chain of packets
1317 *   npt:    tail of updated chain of packets
1318 *   rv:     count of the packets in the returned list
1319 */
1320uint32_t
1321vsw_vlan_frame_untag(void *arg, int type, mblk_t **np, mblk_t **npt)
1322{
1323	mblk_t			*bp;
1324	mblk_t			*bpt;
1325	mblk_t			*bph;
1326	mblk_t			*bpn;
1327	vsw_port_t		*portp;
1328	vsw_t			*vswp;
1329	uint32_t		count;
1330	struct ether_header	*ehp;
1331	boolean_t		is_tagged;
1332	boolean_t		rv;
1333	uint16_t		vlan_id;
1334	uint16_t		pvid;
1335	mod_hash_t		*vlan_hashp;
1336
1337	ASSERT((type == VSW_LOCALDEV) || (type == VSW_VNETPORT));
1338
1339
1340	if (type == VSW_LOCALDEV) {
1341		vswp = (vsw_t *)arg;
1342		pvid = vswp->pvid;
1343		vlan_hashp = vswp->vlan_hashp;
1344		portp = NULL;
1345	} else {
1346		/* type == VSW_VNETPORT */
1347		portp = (vsw_port_t *)arg;
1348		vswp = portp->p_vswp;
1349		vlan_hashp = portp->vlan_hashp;
1350		pvid = portp->pvid;
1351	}
1352
1353	/*
1354	 * If the MAC layer switching in place, then
1355	 * untagging required only if the pvid is not
1356	 * the same as default_vlan_id. This is because,
1357	 * the MAC layer will send packets for the
1358	 * registered vlans only.
1359	 */
1360	if ((vswp->mac_cl_switching == B_TRUE) &&
1361	    (pvid == vswp->default_vlan_id)) {
1362		/* simply count and set the tail */
1363		count = 1;
1364		bp = *np;
1365		ASSERT(bp != NULL);
1366		while (bp->b_next != NULL) {
1367			bp = bp->b_next;
1368			count++;
1369		}
1370		*npt = bp;
1371		return (count);
1372	}
1373
1374	bpn = bph = bpt = NULL;
1375	count = 0;
1376
1377	for (bp = *np; bp != NULL; bp = bpn) {
1378
1379		bpn = bp->b_next;
1380		bp->b_next = bp->b_prev = NULL;
1381
1382		/*
1383		 * Determine the vlan id that the frame belongs to.
1384		 */
1385		ehp = (struct ether_header *)bp->b_rptr;
1386		is_tagged = vsw_frame_lookup_vid(arg, type, ehp, &vlan_id);
1387
1388		/*
1389		 * If MAC layer switching in place, then we
1390		 * need to untag only if the tagged packet has
1391		 * vlan-id same as the pvid.
1392		 */
1393		if (vswp->mac_cl_switching == B_TRUE) {
1394
1395			/* only tagged packets expected here */
1396			ASSERT(is_tagged == B_TRUE);
1397			if (vlan_id == pvid) {
1398				bp = vnet_vlan_remove_tag(bp);
1399				if (bp == NULL) {
1400					/* packet dropped */
1401					continue;
1402				}
1403			}
1404		} else { /* No MAC layer switching */
1405
1406			/*
1407			 * Check the frame header if tag/untag is  needed.
1408			 */
1409			if (is_tagged == B_FALSE) {
1410				/*
1411				 * Untagged frame. We shouldn't have an
1412				 * untagged packet at this point, unless
1413				 * the destination's  vlan id is
1414				 * default-vlan-id; if it is not the
1415				 * default-vlan-id, we drop the packet.
1416				 */
1417				if (vlan_id != vswp->default_vlan_id) {
1418					/* drop the packet */
1419					freemsg(bp);
1420					continue;
1421				}
1422			} else {	/* Tagged */
1423				/*
1424				 * Tagged frame, untag if it's the
1425				 * destination's pvid.
1426				 */
1427				if (vlan_id == pvid) {
1428
1429					bp = vnet_vlan_remove_tag(bp);
1430					if (bp == NULL) {
1431						/* packet dropped */
1432						continue;
1433					}
1434				} else {
1435
1436					/*
1437					 * Check if the destination is in the
1438					 * same vlan.
1439					 */
1440					rv = vsw_vlan_lookup(vlan_hashp,
1441					    vlan_id);
1442					if (rv == B_FALSE) {
1443						/* drop the packet */
1444						freemsg(bp);
1445						continue;
1446					}
1447				}
1448
1449			}
1450		}
1451
1452		/* build a chain of processed packets */
1453		if (bph == NULL) {
1454			bph = bpt = bp;
1455		} else {
1456			bpt->b_next = bp;
1457			bpt = bp;
1458		}
1459		count++;
1460	}
1461
1462	*np = bph;
1463	*npt = bpt;
1464	return (count);
1465}
1466
1467/*
1468 * Lookup the vlan id of the given frame. If it is a vlan-tagged frame,
1469 * then the vlan-id is available in the tag; otherwise, its vlan id is
1470 * implicitly obtained based on the caller (destination of the frame:
1471 * VSW_VNETPORT or VSW_LOCALDEV).
1472 * The vlan id determined is returned in vidp.
1473 * Returns: B_TRUE if it is a tagged frame; B_FALSE if it is untagged.
1474 */
1475boolean_t
1476vsw_frame_lookup_vid(void *arg, int caller, struct ether_header *ehp,
1477	uint16_t *vidp)
1478{
1479	struct ether_vlan_header	*evhp;
1480	vsw_t				*vswp;
1481	vsw_port_t			*portp;
1482
1483	/* If it's a tagged frame, get the vid from vlan header */
1484	if (ehp->ether_type == ETHERTYPE_VLAN) {
1485
1486		evhp = (struct ether_vlan_header *)ehp;
1487		*vidp = VLAN_ID(ntohs(evhp->ether_tci));
1488		return (B_TRUE);
1489	}
1490
1491	/* Untagged frame; determine vlan id based on caller */
1492	switch (caller) {
1493
1494	case VSW_VNETPORT:
1495		/*
1496		 * packet destined to a vnet; vlan-id is pvid of vnet-port.
1497		 */
1498		portp = (vsw_port_t *)arg;
1499		*vidp = portp->pvid;
1500		break;
1501
1502	case VSW_LOCALDEV:
1503
1504		/*
1505		 * packet destined to vsw interface;
1506		 * vlan-id is port-vlan-id of vsw device.
1507		 */
1508		vswp = (vsw_t *)arg;
1509		*vidp = vswp->pvid;
1510		break;
1511	}
1512
1513	return (B_FALSE);
1514}
1515
1516/*
1517 * Add or remove multicast address(es).
1518 *
1519 * Returns 0 on success, 1 on failure.
1520 */
1521int
1522vsw_add_rem_mcst(vnet_mcast_msg_t *mcst_pkt, vsw_port_t *port)
1523{
1524	mcst_addr_t		*mcst_p = NULL;
1525	vsw_t			*vswp = port->p_vswp;
1526	uint64_t		addr = 0x0;
1527	int			i;
1528
1529	D1(vswp, "%s: enter", __func__);
1530
1531	D2(vswp, "%s: %d addresses", __func__, mcst_pkt->count);
1532
1533	for (i = 0; i < mcst_pkt->count; i++) {
1534		/*
1535		 * Convert address into form that can be used
1536		 * as hash table key.
1537		 */
1538		KEY_HASH(addr, &(mcst_pkt->mca[i]));
1539
1540		/*
1541		 * Add or delete the specified address/port combination.
1542		 */
1543		if (mcst_pkt->set == 0x1) {
1544			D3(vswp, "%s: adding multicast address 0x%llx for "
1545			    "port %ld", __func__, addr, port->p_instance);
1546			if (vsw_add_mcst(vswp, VSW_VNETPORT, addr, port) == 0) {
1547				/*
1548				 * Update the list of multicast
1549				 * addresses contained within the
1550				 * port structure to include this new
1551				 * one.
1552				 */
1553				mcst_p = kmem_zalloc(sizeof (mcst_addr_t),
1554				    KM_NOSLEEP);
1555				if (mcst_p == NULL) {
1556					DERR(vswp, "%s: unable to alloc mem",
1557					    __func__);
1558					(void) vsw_del_mcst(vswp,
1559					    VSW_VNETPORT, addr, port);
1560					return (1);
1561				}
1562
1563				mcst_p->nextp = NULL;
1564				mcst_p->addr = addr;
1565				ether_copy(&mcst_pkt->mca[i], &mcst_p->mca);
1566
1567				/*
1568				 * Program the address into HW. If the addr
1569				 * has already been programmed then the MAC
1570				 * just increments a ref counter (which is
1571				 * used when the address is being deleted)
1572				 */
1573				if (vsw_mac_multicast_add(vswp, port, mcst_p,
1574				    VSW_VNETPORT)) {
1575					(void) vsw_del_mcst(vswp,
1576					    VSW_VNETPORT, addr, port);
1577					kmem_free(mcst_p, sizeof (*mcst_p));
1578					return (1);
1579				}
1580
1581				mutex_enter(&port->mca_lock);
1582				mcst_p->nextp = port->mcap;
1583				port->mcap = mcst_p;
1584				mutex_exit(&port->mca_lock);
1585
1586			} else {
1587				DERR(vswp, "%s: error adding multicast "
1588				    "address 0x%llx for port %ld",
1589				    __func__, addr, port->p_instance);
1590				return (1);
1591			}
1592		} else {
1593			/*
1594			 * Delete an entry from the multicast hash
1595			 * table and update the address list
1596			 * appropriately.
1597			 */
1598			if (vsw_del_mcst(vswp, VSW_VNETPORT, addr, port) == 0) {
1599				D3(vswp, "%s: deleting multicast address "
1600				    "0x%llx for port %ld", __func__, addr,
1601				    port->p_instance);
1602
1603				mcst_p = vsw_del_addr(VSW_VNETPORT, port, addr);
1604				ASSERT(mcst_p != NULL);
1605
1606				/*
1607				 * Remove the address from HW. The address
1608				 * will actually only be removed once the ref
1609				 * count within the MAC layer has dropped to
1610				 * zero. I.e. we can safely call this fn even
1611				 * if other ports are interested in this
1612				 * address.
1613				 */
1614				vsw_mac_multicast_remove(vswp, port, mcst_p,
1615				    VSW_VNETPORT);
1616				kmem_free(mcst_p, sizeof (*mcst_p));
1617
1618			} else {
1619				DERR(vswp, "%s: error deleting multicast "
1620				    "addr 0x%llx for port %ld",
1621				    __func__, addr, port->p_instance);
1622				return (1);
1623			}
1624		}
1625	}
1626	D1(vswp, "%s: exit", __func__);
1627	return (0);
1628}
1629
1630/*
1631 * Add a new multicast entry.
1632 *
1633 * Search hash table based on address. If match found then
1634 * update associated val (which is chain of ports), otherwise
1635 * create new key/val (addr/port) pair and insert into table.
1636 */
1637int
1638vsw_add_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg)
1639{
1640	int		dup = 0;
1641	int		rv = 0;
1642	mfdb_ent_t	*ment = NULL;
1643	mfdb_ent_t	*tmp_ent = NULL;
1644	mfdb_ent_t	*new_ent = NULL;
1645	void		*tgt = NULL;
1646
1647	if (devtype == VSW_VNETPORT) {
1648		/*
1649		 * Being invoked from a vnet.
1650		 */
1651		ASSERT(arg != NULL);
1652		tgt = arg;
1653		D2(NULL, "%s: port %d : address 0x%llx", __func__,
1654		    ((vsw_port_t *)arg)->p_instance, addr);
1655	} else {
1656		/*
1657		 * We are being invoked via the m_multicst mac entry
1658		 * point.
1659		 */
1660		D2(NULL, "%s: address 0x%llx", __func__, addr);
1661		tgt = (void *)vswp;
1662	}
1663
1664	WRITE_ENTER(&vswp->mfdbrw);
1665	if (mod_hash_find(vswp->mfdb, (mod_hash_key_t)addr,
1666	    (mod_hash_val_t *)&ment) != 0) {
1667
1668		/* address not currently in table */
1669		ment = kmem_alloc(sizeof (mfdb_ent_t), KM_SLEEP);
1670		ment->d_addr = (void *)tgt;
1671		ment->d_type = devtype;
1672		ment->nextp = NULL;
1673
1674		if (mod_hash_insert(vswp->mfdb, (mod_hash_key_t)addr,
1675		    (mod_hash_val_t)ment) != 0) {
1676			DERR(vswp, "%s: hash table insertion failed", __func__);
1677			kmem_free(ment, sizeof (mfdb_ent_t));
1678			rv = 1;
1679		} else {
1680			D2(vswp, "%s: added initial entry for 0x%llx to "
1681			    "table", __func__, addr);
1682		}
1683	} else {
1684		/*
1685		 * Address in table. Check to see if specified port
1686		 * is already associated with the address. If not add
1687		 * it now.
1688		 */
1689		tmp_ent = ment;
1690		while (tmp_ent != NULL) {
1691			if (tmp_ent->d_addr == (void *)tgt) {
1692				if (devtype == VSW_VNETPORT) {
1693					DERR(vswp, "%s: duplicate port entry "
1694					    "found for portid %ld and key "
1695					    "0x%llx", __func__,
1696					    ((vsw_port_t *)arg)->p_instance,
1697					    addr);
1698				} else {
1699					DERR(vswp, "%s: duplicate entry found"
1700					    "for key 0x%llx", __func__, addr);
1701				}
1702				rv = 1;
1703				dup = 1;
1704				break;
1705			}
1706			tmp_ent = tmp_ent->nextp;
1707		}
1708
1709		/*
1710		 * Port not on list so add it to end now.
1711		 */
1712		if (0 == dup) {
1713			D2(vswp, "%s: added entry for 0x%llx to table",
1714			    __func__, addr);
1715			new_ent = kmem_alloc(sizeof (mfdb_ent_t), KM_SLEEP);
1716			new_ent->d_addr = (void *)tgt;
1717			new_ent->d_type = devtype;
1718			new_ent->nextp = NULL;
1719
1720			tmp_ent = ment;
1721			while (tmp_ent->nextp != NULL)
1722				tmp_ent = tmp_ent->nextp;
1723
1724			tmp_ent->nextp = new_ent;
1725		}
1726	}
1727
1728	RW_EXIT(&vswp->mfdbrw);
1729	return (rv);
1730}
1731
1732/*
1733 * Remove a multicast entry from the hashtable.
1734 *
1735 * Search hash table based on address. If match found, scan
1736 * list of ports associated with address. If specified port
1737 * found remove it from list.
1738 */
1739int
1740vsw_del_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg)
1741{
1742	mfdb_ent_t	*ment = NULL;
1743	mfdb_ent_t	*curr_p, *prev_p;
1744	void		*tgt = NULL;
1745
1746	D1(vswp, "%s: enter", __func__);
1747
1748	if (devtype == VSW_VNETPORT) {
1749		tgt = (vsw_port_t *)arg;
1750		D2(vswp, "%s: removing port %d from mFDB for address"
1751		    " 0x%llx", __func__, ((vsw_port_t *)tgt)->p_instance, addr);
1752	} else {
1753		D2(vswp, "%s: removing entry", __func__);
1754		tgt = (void *)vswp;
1755	}
1756
1757	WRITE_ENTER(&vswp->mfdbrw);
1758	if (mod_hash_find(vswp->mfdb, (mod_hash_key_t)addr,
1759	    (mod_hash_val_t *)&ment) != 0) {
1760		D2(vswp, "%s: address 0x%llx not in table", __func__, addr);
1761		RW_EXIT(&vswp->mfdbrw);
1762		return (1);
1763	}
1764
1765	prev_p = curr_p = ment;
1766
1767	while (curr_p != NULL) {
1768		if (curr_p->d_addr == (void *)tgt) {
1769			if (devtype == VSW_VNETPORT) {
1770				D2(vswp, "%s: port %d found", __func__,
1771				    ((vsw_port_t *)tgt)->p_instance);
1772			} else {
1773				D2(vswp, "%s: instance found", __func__);
1774			}
1775
1776			if (prev_p == curr_p) {
1777				/*
1778				 * head of list, if no other element is in
1779				 * list then destroy this entry, otherwise
1780				 * just replace it with updated value.
1781				 */
1782				ment = curr_p->nextp;
1783				if (ment == NULL) {
1784					(void) mod_hash_destroy(vswp->mfdb,
1785					    (mod_hash_val_t)addr);
1786				} else {
1787					(void) mod_hash_replace(vswp->mfdb,
1788					    (mod_hash_key_t)addr,
1789					    (mod_hash_val_t)ment);
1790				}
1791			} else {
1792				/*
1793				 * Not head of list, no need to do
1794				 * replacement, just adjust list pointers.
1795				 */
1796				prev_p->nextp = curr_p->nextp;
1797			}
1798			break;
1799		}
1800
1801		prev_p = curr_p;
1802		curr_p = curr_p->nextp;
1803	}
1804
1805	RW_EXIT(&vswp->mfdbrw);
1806
1807	D1(vswp, "%s: exit", __func__);
1808
1809	if (curr_p == NULL)
1810		return (1);
1811	kmem_free(curr_p, sizeof (mfdb_ent_t));
1812	return (0);
1813}
1814
1815/*
1816 * Port is being deleted, but has registered an interest in one
1817 * or more multicast groups. Using the list of addresses maintained
1818 * within the port structure find the appropriate entry in the hash
1819 * table and remove this port from the list of interested ports.
1820 */
1821void
1822vsw_del_mcst_port(vsw_port_t *port)
1823{
1824	mcst_addr_t	*mcap = NULL;
1825	vsw_t		*vswp = port->p_vswp;
1826
1827	D1(vswp, "%s: enter", __func__);
1828
1829	mutex_enter(&port->mca_lock);
1830
1831	while ((mcap = port->mcap) != NULL) {
1832
1833		port->mcap = mcap->nextp;
1834
1835		mutex_exit(&port->mca_lock);
1836
1837		(void) vsw_del_mcst(vswp, VSW_VNETPORT,
1838		    mcap->addr, port);
1839
1840		/*
1841		 * Remove the address from HW. The address
1842		 * will actually only be removed once the ref
1843		 * count within the MAC layer has dropped to
1844		 * zero. I.e. we can safely call this fn even
1845		 * if other ports are interested in this
1846		 * address.
1847		 */
1848		vsw_mac_multicast_remove(vswp, port, mcap, VSW_VNETPORT);
1849		kmem_free(mcap, sizeof (*mcap));
1850
1851		mutex_enter(&port->mca_lock);
1852
1853	}
1854
1855	mutex_exit(&port->mca_lock);
1856
1857	D1(vswp, "%s: exit", __func__);
1858}
1859
1860/*
1861 * This vsw instance is detaching, but has registered an interest in one
1862 * or more multicast groups. Using the list of addresses maintained
1863 * within the vsw structure find the appropriate entry in the hash
1864 * table and remove this instance from the list of interested ports.
1865 */
1866void
1867vsw_del_mcst_vsw(vsw_t *vswp)
1868{
1869	mcst_addr_t	*next_p = NULL;
1870
1871	D1(vswp, "%s: enter", __func__);
1872
1873	mutex_enter(&vswp->mca_lock);
1874
1875	while (vswp->mcap != NULL) {
1876		DERR(vswp, "%s: deleting addr 0x%llx",
1877		    __func__, vswp->mcap->addr);
1878		(void) vsw_del_mcst(vswp, VSW_LOCALDEV, vswp->mcap->addr, NULL);
1879
1880		next_p = vswp->mcap->nextp;
1881		kmem_free(vswp->mcap, sizeof (mcst_addr_t));
1882		vswp->mcap = next_p;
1883	}
1884
1885	vswp->mcap = NULL;
1886	mutex_exit(&vswp->mca_lock);
1887
1888	D1(vswp, "%s: exit", __func__);
1889}
1890
1891mblk_t *
1892vsw_get_same_dest_list(struct ether_header *ehp, mblk_t **mpp)
1893{
1894	mblk_t			*bp;
1895	mblk_t			*nbp;
1896	mblk_t			*head = NULL;
1897	mblk_t			*tail = NULL;
1898	mblk_t			*prev = NULL;
1899	struct ether_header	*behp;
1900
1901	/* process the chain of packets */
1902	bp = *mpp;
1903	while (bp) {
1904		nbp = bp->b_next;
1905		behp = (struct ether_header *)bp->b_rptr;
1906		bp->b_prev = NULL;
1907		if (ether_cmp(&ehp->ether_dhost, &behp->ether_dhost) == 0) {
1908			if (prev == NULL) {
1909				*mpp = nbp;
1910			} else {
1911				prev->b_next = nbp;
1912			}
1913			bp->b_next =  NULL;
1914			if (head == NULL) {
1915				head = tail = bp;
1916			} else {
1917				tail->b_next = bp;
1918				tail = bp;
1919			}
1920		} else {
1921			prev = bp;
1922		}
1923		bp = nbp;
1924	}
1925	return (head);
1926}
1927
1928static mblk_t *
1929vsw_dupmsgchain(mblk_t *mp)
1930{
1931	mblk_t	*nmp = NULL;
1932	mblk_t	**nmpp = &nmp;
1933
1934	for (; mp != NULL; mp = mp->b_next) {
1935		if ((*nmpp = dupmsg(mp)) == NULL) {
1936			freemsgchain(nmp);
1937			return (NULL);
1938		}
1939
1940		nmpp = &((*nmpp)->b_next);
1941	}
1942
1943	return (nmp);
1944}
1945