vsw_switching.c revision 8275:7c223a798022
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24 * Use is subject to license terms.
25 */
26
27#include <sys/types.h>
28#include <sys/errno.h>
29#include <sys/debug.h>
30#include <sys/time.h>
31#include <sys/sysmacros.h>
32#include <sys/systm.h>
33#include <sys/user.h>
34#include <sys/stropts.h>
35#include <sys/stream.h>
36#include <sys/strlog.h>
37#include <sys/strsubr.h>
38#include <sys/cmn_err.h>
39#include <sys/cpu.h>
40#include <sys/kmem.h>
41#include <sys/conf.h>
42#include <sys/ddi.h>
43#include <sys/sunddi.h>
44#include <sys/ksynch.h>
45#include <sys/stat.h>
46#include <sys/kstat.h>
47#include <sys/vtrace.h>
48#include <sys/strsun.h>
49#include <sys/dlpi.h>
50#include <sys/ethernet.h>
51#include <net/if.h>
52#include <sys/varargs.h>
53#include <sys/machsystm.h>
54#include <sys/modctl.h>
55#include <sys/modhash.h>
56#include <sys/mac.h>
57#include <sys/mac_ether.h>
58#include <sys/taskq.h>
59#include <sys/note.h>
60#include <sys/mach_descrip.h>
61#include <sys/mdeg.h>
62#include <sys/ldc.h>
63#include <sys/vsw_fdb.h>
64#include <sys/vsw.h>
65#include <sys/vio_mailbox.h>
66#include <sys/vnet_mailbox.h>
67#include <sys/vnet_common.h>
68#include <sys/vio_util.h>
69#include <sys/sdt.h>
70#include <sys/atomic.h>
71#include <sys/vlan.h>
72
73/* Switching setup routines */
74void vsw_setup_switching_timeout(void *arg);
75void vsw_stop_switching_timeout(vsw_t *vswp);
76int vsw_setup_switching(vsw_t *);
77void vsw_setup_layer2_post_process(vsw_t *vswp);
78void vsw_switch_frame_nop(vsw_t *vswp, mblk_t *mp, int caller,
79    vsw_port_t *port, mac_resource_handle_t mrh);
80static	int vsw_setup_layer2(vsw_t *);
81static	int vsw_setup_layer3(vsw_t *);
82
83/* Switching/data transmit routines */
84static	void vsw_switch_l2_frame_mac_client(vsw_t *vswp, mblk_t *mp, int caller,
85    vsw_port_t *port, mac_resource_handle_t);
86static	void vsw_switch_l2_frame(vsw_t *vswp, mblk_t *mp, int caller,
87	vsw_port_t *port, mac_resource_handle_t);
88static	void vsw_switch_l3_frame(vsw_t *vswp, mblk_t *mp, int caller,
89	vsw_port_t *port, mac_resource_handle_t);
90static	int vsw_forward_all(vsw_t *vswp, mblk_t *mp,
91	int caller, vsw_port_t *port);
92static	int vsw_forward_grp(vsw_t *vswp, mblk_t *mp,
93    int caller, vsw_port_t *port);
94
95/* VLAN routines */
96void vsw_create_vlans(void *arg, int type);
97void vsw_destroy_vlans(void *arg, int type);
98void vsw_vlan_add_ids(void *arg, int type);
99void vsw_vlan_remove_ids(void *arg, int type);
100static	void vsw_vlan_create_hash(void *arg, int type);
101static	void vsw_vlan_destroy_hash(void *arg, int type);
102boolean_t vsw_frame_lookup_vid(void *arg, int caller, struct ether_header *ehp,
103	uint16_t *vidp);
104mblk_t *vsw_vlan_frame_pretag(void *arg, int type, mblk_t *mp);
105uint32_t vsw_vlan_frames_untag(void *arg, int type, mblk_t **np, mblk_t **npt);
106boolean_t vsw_vlan_lookup(mod_hash_t *vlan_hashp, uint16_t vid);
107
108/* Forwarding database (FDB) routines */
109void vsw_fdbe_add(vsw_t *vswp, void *port);
110void vsw_fdbe_del(vsw_t *vswp, struct ether_addr *eaddr);
111static	vsw_fdbe_t *vsw_fdbe_find(vsw_t *vswp, struct ether_addr *);
112static void vsw_fdbe_find_cb(mod_hash_key_t key, mod_hash_val_t val);
113
114int vsw_add_rem_mcst(vnet_mcast_msg_t *, vsw_port_t *);
115int vsw_add_mcst(vsw_t *, uint8_t, uint64_t, void *);
116int vsw_del_mcst(vsw_t *, uint8_t, uint64_t, void *);
117void vsw_del_mcst_vsw(vsw_t *);
118
119/* Support functions */
120static mblk_t *vsw_dupmsgchain(mblk_t *mp);
121static mblk_t *vsw_get_same_dest_list(struct ether_header *ehp, mblk_t **mpp);
122
123
124/*
125 * Functions imported from other files.
126 */
127extern mblk_t *vsw_tx_msg(vsw_t *, mblk_t *, int, vsw_port_t *);
128extern mcst_addr_t *vsw_del_addr(uint8_t, void *, uint64_t);
129extern int vsw_mac_open(vsw_t *vswp);
130extern void vsw_mac_close(vsw_t *vswp);
131extern void vsw_mac_rx(vsw_t *vswp, mac_resource_handle_t mrh,
132    mblk_t *mp, vsw_macrx_flags_t flags);
133extern void vsw_set_addrs(vsw_t *vswp);
134extern int vsw_portsend(vsw_port_t *port, mblk_t *mp);
135extern void vsw_hio_init(vsw_t *vswp);
136extern void vsw_hio_start_ports(vsw_t *vswp);
137extern int vsw_mac_multicast_add(vsw_t *vswp, vsw_port_t *port,
138    mcst_addr_t *mcst_p, int type);
139extern void vsw_mac_multicast_remove(vsw_t *vswp, vsw_port_t *port,
140    mcst_addr_t *mcst_p, int type);
141
142/*
143 * Tunables used in this file.
144 */
145extern	int vsw_setup_switching_delay;
146extern	uint32_t vsw_vlan_nchains;
147extern	uint32_t vsw_fdbe_refcnt_delay;
148
/*
 * Take a reference on an fdb entry by atomically incrementing its
 * reference count. The ASSERT catches a count that wrapped around to zero.
 *
 * The body is wrapped in do { } while (0) so the macro expands to a single
 * statement and can be used safely in an unbraced if/else.
 */
#define	VSW_FDBE_REFHOLD(p)						\
do {									\
	atomic_inc_32(&(p)->refcnt);					\
	ASSERT((p)->refcnt != 0);					\
	_NOTE(CONSTCOND)						\
} while (0)

/*
 * Drop a reference on an fdb entry by atomically decrementing its
 * reference count. The ASSERT catches a release without a matching hold.
 */
#define	VSW_FDBE_REFRELE(p)						\
do {									\
	ASSERT((p)->refcnt != 0);					\
	atomic_dec_32(&(p)->refcnt);					\
	_NOTE(CONSTCOND)						\
} while (0)
160
161/*
162 * Timeout routine to setup switching mode:
163 * vsw_setup_switching() is invoked from vsw_attach() or vsw_update_md_prop()
164 * initially. If it fails and the error is EAGAIN, then this timeout handler
165 * is started to retry vsw_setup_switching(). vsw_setup_switching() is retried
166 * until we successfully finish it; or the returned error is not EAGAIN.
167 */
168void
169vsw_setup_switching_timeout(void *arg)
170{
171	vsw_t		*vswp = (vsw_t *)arg;
172	int		rv;
173
174	if (vswp->swtmout_enabled == B_FALSE)
175		return;
176
177	rv = vsw_setup_switching(vswp);
178
179	if (rv == 0) {
180		vsw_setup_layer2_post_process(vswp);
181	}
182
183	mutex_enter(&vswp->swtmout_lock);
184
185	if (rv == EAGAIN && vswp->swtmout_enabled == B_TRUE) {
186		/*
187		 * Reschedule timeout() if the error is EAGAIN and the
188		 * timeout is still enabled. For errors other than EAGAIN,
189		 * we simply return without rescheduling timeout().
190		 */
191		vswp->swtmout_id =
192		    timeout(vsw_setup_switching_timeout, vswp,
193		    (vsw_setup_switching_delay * drv_usectohz(MICROSEC)));
194		goto exit;
195	}
196
197	/* timeout handler completed */
198	vswp->swtmout_enabled = B_FALSE;
199	vswp->swtmout_id = 0;
200
201exit:
202	mutex_exit(&vswp->swtmout_lock);
203}
204
205/*
206 * Cancel the timeout handler to setup switching mode.
207 */
208void
209vsw_stop_switching_timeout(vsw_t *vswp)
210{
211	timeout_id_t tid;
212
213	mutex_enter(&vswp->swtmout_lock);
214
215	tid = vswp->swtmout_id;
216
217	if (tid != 0) {
218		/* signal timeout handler to stop */
219		vswp->swtmout_enabled = B_FALSE;
220		vswp->swtmout_id = 0;
221		mutex_exit(&vswp->swtmout_lock);
222
223		(void) untimeout(tid);
224	} else {
225		mutex_exit(&vswp->swtmout_lock);
226	}
227
228	(void) atomic_swap_32(&vswp->switching_setup_done, B_FALSE);
229
230	mutex_enter(&vswp->mac_lock);
231	vswp->mac_open_retries = 0;
232	mutex_exit(&vswp->mac_lock);
233}
234
235/*
236 * Setup the required switching mode.
237 * This routine is invoked from vsw_attach() or vsw_update_md_prop()
238 * initially. If it fails and the error is EAGAIN, then a timeout handler
239 * is started to retry vsw_setup_switching(), until it successfully finishes;
240 * or the returned error is not EAGAIN.
241 *
242 * Returns:
243 *  0 on success.
244 *  EAGAIN if retry is needed.
245 *  1 on all other failures.
246 */
247int
248vsw_setup_switching(vsw_t *vswp)
249{
250	int	rv = 1;
251
252	D1(vswp, "%s: enter", __func__);
253
254	/*
255	 * Select best switching mode.
256	 * This is done as this routine can be called from the timeout
257	 * handler to retry setting up a specific mode. Currently only
258	 * the function which sets up layer2/promisc mode returns EAGAIN
259	 * if the underlying network device is not available yet, causing
260	 * retries.
261	 */
262	if (vswp->smode & VSW_LAYER2) {
263		rv = vsw_setup_layer2(vswp);
264	} else if (vswp->smode & VSW_LAYER3) {
265		rv = vsw_setup_layer3(vswp);
266	} else {
267		DERR(vswp, "unknown switch mode");
268		rv = 1;
269	}
270
271	if (rv && (rv != EAGAIN)) {
272		cmn_err(CE_WARN, "!vsw%d: Unable to setup specified "
273		    "switching mode", vswp->instance);
274	} else if (rv == 0) {
275		(void) atomic_swap_32(&vswp->switching_setup_done, B_TRUE);
276	}
277
278	D2(vswp, "%s: Operating in mode %d", __func__,
279	    vswp->smode);
280
281	D1(vswp, "%s: exit", __func__);
282
283	return (rv);
284}
285
286/*
287 * Setup for layer 2 switching.
288 *
289 * Returns:
290 *  0 on success.
291 *  EAGAIN if retry is needed.
292 *  EIO on all other failures.
293 */
294static int
295vsw_setup_layer2(vsw_t *vswp)
296{
297	int	rv;
298
299	D1(vswp, "%s: enter", __func__);
300
301	/*
302	 * Until the network device is successfully opened,
303	 * set the switching to use vsw_switch_l2_frame.
304	 */
305	vswp->vsw_switch_frame = vsw_switch_l2_frame;
306	vswp->mac_cl_switching = B_FALSE;
307
308	rv = strlen(vswp->physname);
309	if (rv == 0) {
310		/*
311		 * Physical device name is NULL, which is
312		 * required for layer 2.
313		 */
314		cmn_err(CE_WARN, "!vsw%d: no network device name specified",
315		    vswp->instance);
316		return (EIO);
317	}
318
319	mutex_enter(&vswp->mac_lock);
320
321	rv = vsw_mac_open(vswp);
322	if (rv != 0) {
323		if (rv != EAGAIN) {
324			cmn_err(CE_WARN, "!vsw%d: Unable to open network "
325			    "device: %s\n", vswp->instance, vswp->physname);
326		}
327		mutex_exit(&vswp->mac_lock);
328		return (rv);
329	}
330
331	/*
332	 * Now we can use the mac client switching, so set the switching
333	 * function to use vsw_switch_l2_frame_mac_client(), which simply
334	 * sends the packets to MAC layer for switching.
335	 */
336	vswp->vsw_switch_frame = vsw_switch_l2_frame_mac_client;
337	vswp->mac_cl_switching = B_TRUE;
338
339	D1(vswp, "%s: exit", __func__);
340
341	/* Initialize HybridIO related stuff */
342	vsw_hio_init(vswp);
343
344	mutex_exit(&vswp->mac_lock);
345	return (0);
346
347exit_error:
348	vsw_mac_close(vswp);
349	mutex_exit(&vswp->mac_lock);
350	return (EIO);
351}
352
353static int
354vsw_setup_layer3(vsw_t *vswp)
355{
356	D1(vswp, "%s: enter", __func__);
357
358	D2(vswp, "%s: operating in layer 3 mode", __func__);
359	vswp->vsw_switch_frame = vsw_switch_l3_frame;
360
361	D1(vswp, "%s: exit", __func__);
362
363	return (0);
364}
365
366/* ARGSUSED */
367void
368vsw_switch_frame_nop(vsw_t *vswp, mblk_t *mp, int caller, vsw_port_t *port,
369			mac_resource_handle_t mrh)
370{
371	freemsgchain(mp);
372}
373
374/*
375 * Use mac client for layer 2 switching .
376 */
377static void
378vsw_switch_l2_frame_mac_client(vsw_t *vswp, mblk_t *mp, int caller,
379    vsw_port_t *port, mac_resource_handle_t mrh)
380{
381	_NOTE(ARGUNUSED(mrh))
382
383	mblk_t		*ret_m;
384
385	/*
386	 * This switching function is expected to be called by
387	 * the ports or the interface only. The packets from
388	 * physical interface already switched.
389	 */
390	ASSERT((caller == VSW_VNETPORT) || (caller == VSW_LOCALDEV));
391
392	if ((ret_m = vsw_tx_msg(vswp, mp, caller, port)) != NULL) {
393		DERR(vswp, "%s: drop mblks to "
394		    "phys dev", __func__);
395		freemsgchain(ret_m);
396	}
397}
398
399/*
400 * Switch the given ethernet frame when operating in layer 2 mode.
401 *
402 * vswp: pointer to the vsw instance
403 * mp: pointer to chain of ethernet frame(s) to be switched
404 * caller: identifies the source of this frame as:
405 * 		1. VSW_VNETPORT - a vsw port (connected to a vnet).
406 *		2. VSW_PHYSDEV - the physical ethernet device
407 *		3. VSW_LOCALDEV - vsw configured as a virtual interface
408 * arg: argument provided by the caller.
409 *		1. for VNETPORT - pointer to the corresponding vsw_port_t.
410 *		2. for PHYSDEV - NULL
411 *		3. for LOCALDEV - pointer to to this vsw_t(self)
412 */
413void
414vsw_switch_l2_frame(vsw_t *vswp, mblk_t *mp, int caller,
415			vsw_port_t *arg, mac_resource_handle_t mrh)
416{
417	struct ether_header	*ehp;
418	mblk_t			*bp, *ret_m;
419	vsw_fdbe_t		*fp;
420
421	D1(vswp, "%s: enter (caller %d)", __func__, caller);
422
423	/*
424	 * PERF: rather than breaking up the chain here, scan it
425	 * to find all mblks heading to same destination and then
426	 * pass that sub-chain to the lower transmit functions.
427	 */
428
429	/* process the chain of packets */
430	bp = mp;
431	while (bp) {
432		ehp = (struct ether_header *)bp->b_rptr;
433		mp = vsw_get_same_dest_list(ehp, &bp);
434		ASSERT(mp != NULL);
435
436		D2(vswp, "%s: mblk data buffer %lld : actual data size %lld",
437		    __func__, MBLKSIZE(mp), MBLKL(mp));
438
439		if (ether_cmp(&ehp->ether_dhost, &vswp->if_addr) == 0) {
440			/*
441			 * If destination is VSW_LOCALDEV (vsw as an eth
442			 * interface) and if the device is up & running,
443			 * send the packet up the stack on this host.
444			 * If the virtual interface is down, drop the packet.
445			 */
446			if (caller != VSW_LOCALDEV) {
447				vsw_mac_rx(vswp, mrh, mp, VSW_MACRX_FREEMSG);
448			} else {
449				freemsgchain(mp);
450			}
451			continue;
452		}
453
454		/*
455		 * Find fdb entry for the destination
456		 * and hold a reference to it.
457		 */
458		fp = vsw_fdbe_find(vswp, &ehp->ether_dhost);
459		if (fp != NULL) {
460
461			/*
462			 * If plumbed and in promisc mode then copy msg
463			 * and send up the stack.
464			 */
465			vsw_mac_rx(vswp, mrh, mp,
466			    VSW_MACRX_PROMISC | VSW_MACRX_COPYMSG);
467
468			/*
469			 * If the destination is in FDB, the packet
470			 * should be forwarded to the correponding
471			 * vsw_port (connected to a vnet device -
472			 * VSW_VNETPORT)
473			 */
474			(void) vsw_portsend(fp->portp, mp);
475
476			/* Release the reference on the fdb entry */
477			VSW_FDBE_REFRELE(fp);
478		} else {
479			/*
480			 * Destination not in FDB.
481			 *
482			 * If the destination is broadcast or
483			 * multicast forward the packet to all
484			 * (VNETPORTs, PHYSDEV, LOCALDEV),
485			 * except the caller.
486			 */
487			if (IS_BROADCAST(ehp)) {
488				D2(vswp, "%s: BROADCAST pkt", __func__);
489				(void) vsw_forward_all(vswp, mp, caller, arg);
490			} else if (IS_MULTICAST(ehp)) {
491				D2(vswp, "%s: MULTICAST pkt", __func__);
492				(void) vsw_forward_grp(vswp, mp, caller, arg);
493			} else {
494				/*
495				 * If the destination is unicast, and came
496				 * from either a logical network device or
497				 * the switch itself when it is plumbed, then
498				 * send it out on the physical device and also
499				 * up the stack if the logical interface is
500				 * in promiscious mode.
501				 *
502				 * NOTE:  The assumption here is that if we
503				 * cannot find the destination in our fdb, its
504				 * a unicast address, and came from either a
505				 * vnet or down the stack (when plumbed) it
506				 * must be destinded for an ethernet device
507				 * outside our ldoms.
508				 */
509				if (caller == VSW_VNETPORT) {
510					/* promisc check copy etc */
511					vsw_mac_rx(vswp, mrh, mp,
512					    VSW_MACRX_PROMISC |
513					    VSW_MACRX_COPYMSG);
514
515					if ((ret_m = vsw_tx_msg(vswp, mp,
516					    caller, arg)) != NULL) {
517						DERR(vswp, "%s: drop mblks to "
518						    "phys dev", __func__);
519						freemsgchain(ret_m);
520					}
521
522				} else if (caller == VSW_PHYSDEV) {
523					/*
524					 * Pkt seen because card in promisc
525					 * mode. Send up stack if plumbed in
526					 * promisc mode, else drop it.
527					 */
528					vsw_mac_rx(vswp, mrh, mp,
529					    VSW_MACRX_PROMISC |
530					    VSW_MACRX_FREEMSG);
531
532				} else if (caller == VSW_LOCALDEV) {
533					/*
534					 * Pkt came down the stack, send out
535					 * over physical device.
536					 */
537					if ((ret_m = vsw_tx_msg(vswp, mp,
538					    caller, NULL)) != NULL) {
539						DERR(vswp, "%s: drop mblks to "
540						    "phys dev", __func__);
541						freemsgchain(ret_m);
542					}
543				}
544			}
545		}
546	}
547	D1(vswp, "%s: exit\n", __func__);
548}
549
550/*
551 * Switch ethernet frame when in layer 3 mode (i.e. using IP
552 * layer to do the routing).
553 *
554 * There is a large amount of overlap between this function and
555 * vsw_switch_l2_frame. At some stage we need to revisit and refactor
556 * both these functions.
557 */
558void
559vsw_switch_l3_frame(vsw_t *vswp, mblk_t *mp, int caller,
560			vsw_port_t *arg, mac_resource_handle_t mrh)
561{
562	struct ether_header	*ehp;
563	mblk_t			*bp = NULL;
564	vsw_fdbe_t		*fp;
565
566	D1(vswp, "%s: enter (caller %d)", __func__, caller);
567
568	/*
569	 * In layer 3 mode should only ever be switching packets
570	 * between IP layer and vnet devices. So make sure thats
571	 * who is invoking us.
572	 */
573	if ((caller != VSW_LOCALDEV) && (caller != VSW_VNETPORT)) {
574		DERR(vswp, "%s: unexpected caller (%d)", __func__, caller);
575		freemsgchain(mp);
576		return;
577	}
578
579	/* process the chain of packets */
580	bp = mp;
581	while (bp) {
582		ehp = (struct ether_header *)bp->b_rptr;
583		mp = vsw_get_same_dest_list(ehp, &bp);
584		ASSERT(mp != NULL);
585
586		D2(vswp, "%s: mblk data buffer %lld : actual data size %lld",
587		    __func__, MBLKSIZE(mp), MBLKL(mp));
588
589		/*
590		 * Find fdb entry for the destination
591		 * and hold a reference to it.
592		 */
593		fp = vsw_fdbe_find(vswp, &ehp->ether_dhost);
594		if (fp != NULL) {
595
596			D2(vswp, "%s: sending to target port", __func__);
597			(void) vsw_portsend(fp->portp, mp);
598
599			/* Release the reference on the fdb entry */
600			VSW_FDBE_REFRELE(fp);
601		} else {
602			/*
603			 * Destination not in FDB
604			 *
605			 * If the destination is broadcast or
606			 * multicast forward the packet to all
607			 * (VNETPORTs, PHYSDEV, LOCALDEV),
608			 * except the caller.
609			 */
610			if (IS_BROADCAST(ehp)) {
611				D2(vswp, "%s: BROADCAST pkt", __func__);
612				(void) vsw_forward_all(vswp, mp, caller, arg);
613			} else if (IS_MULTICAST(ehp)) {
614				D2(vswp, "%s: MULTICAST pkt", __func__);
615				(void) vsw_forward_grp(vswp, mp, caller, arg);
616			} else {
617				/*
618				 * Unicast pkt from vnet that we don't have
619				 * an FDB entry for, so must be destinded for
620				 * the outside world. Attempt to send up to the
621				 * IP layer to allow it to deal with it.
622				 */
623				if (caller == VSW_VNETPORT) {
624					vsw_mac_rx(vswp, mrh,
625					    mp, VSW_MACRX_FREEMSG);
626				}
627			}
628		}
629	}
630
631	D1(vswp, "%s: exit", __func__);
632}
633
634/*
635 * Setup mac addrs and hio resources for layer 2 switching only.
636 */
637void
638vsw_setup_layer2_post_process(vsw_t *vswp)
639{
640	if (vswp->smode & VSW_LAYER2) {
641		/*
642		 * Program unicst, mcst addrs of vsw
643		 * interface and ports in the physdev.
644		 */
645		vsw_set_addrs(vswp);
646
647		/* Start HIO for ports that have already connected */
648		vsw_hio_start_ports(vswp);
649	}
650}
651
652/*
653 * Forward the ethernet frame to all ports (VNETPORTs, PHYSDEV, LOCALDEV),
654 * except the caller (port on which frame arrived).
655 */
656static int
657vsw_forward_all(vsw_t *vswp, mblk_t *mp, int caller, vsw_port_t *arg)
658{
659	vsw_port_list_t	*plist = &vswp->plist;
660	vsw_port_t	*portp;
661	mblk_t		*nmp = NULL;
662	mblk_t		*ret_m = NULL;
663	int		skip_port = 0;
664
665	D1(vswp, "vsw_forward_all: enter\n");
666
667	/*
668	 * Broadcast message from inside ldoms so send to outside
669	 * world if in either of layer 2 modes.
670	 */
671	if ((vswp->smode & VSW_LAYER2) &&
672	    ((caller == VSW_LOCALDEV) || (caller == VSW_VNETPORT))) {
673
674		nmp = vsw_dupmsgchain(mp);
675		if (nmp) {
676			if ((ret_m = vsw_tx_msg(vswp, nmp, caller, arg))
677			    != NULL) {
678				DERR(vswp, "%s: dropping pkt(s) "
679				    "consisting of %ld bytes of data for"
680				    " physical device", __func__, MBLKL(ret_m));
681				freemsgchain(ret_m);
682			}
683		}
684	}
685
686	if (caller == VSW_VNETPORT)
687		skip_port = 1;
688
689	/*
690	 * Broadcast message from other vnet (layer 2 or 3) or outside
691	 * world (layer 2 only), send up stack if plumbed.
692	 */
693	if ((caller == VSW_PHYSDEV) || (caller == VSW_VNETPORT)) {
694		vsw_mac_rx(vswp, NULL, mp, VSW_MACRX_COPYMSG);
695	}
696
697	/* send it to all VNETPORTs */
698	READ_ENTER(&plist->lockrw);
699	for (portp = plist->head; portp != NULL; portp = portp->p_next) {
700		D2(vswp, "vsw_forward_all: port %d", portp->p_instance);
701		/*
702		 * Caution ! - don't reorder these two checks as arg
703		 * will be NULL if the caller is PHYSDEV. skip_port is
704		 * only set if caller is VNETPORT.
705		 */
706		if ((skip_port) && (portp == arg)) {
707			continue;
708		} else {
709			nmp = vsw_dupmsgchain(mp);
710			if (nmp) {
711				/*
712				 * The plist->lockrw is protecting the
713				 * portp from getting destroyed here.
714				 * So, no ref_cnt is incremented here.
715				 */
716				(void) vsw_portsend(portp, nmp);
717			} else {
718				DERR(vswp, "vsw_forward_all: nmp NULL");
719			}
720		}
721	}
722	RW_EXIT(&plist->lockrw);
723
724	freemsgchain(mp);
725
726	D1(vswp, "vsw_forward_all: exit\n");
727	return (0);
728}
729
730/*
731 * Forward pkts to any devices or interfaces which have registered
732 * an interest in them (i.e. multicast groups).
733 */
734static int
735vsw_forward_grp(vsw_t *vswp, mblk_t *mp, int caller, vsw_port_t *arg)
736{
737	struct ether_header	*ehp = (struct ether_header *)mp->b_rptr;
738	mfdb_ent_t		*entp = NULL;
739	mfdb_ent_t		*tpp = NULL;
740	vsw_port_t 		*port;
741	uint64_t		key = 0;
742	mblk_t			*nmp = NULL;
743	mblk_t			*ret_m = NULL;
744	boolean_t		check_if = B_TRUE;
745
746	/*
747	 * Convert address to hash table key
748	 */
749	KEY_HASH(key, &ehp->ether_dhost);
750
751	D1(vswp, "%s: key 0x%llx", __func__, key);
752
753	/*
754	 * If pkt came from either a vnet or down the stack (if we are
755	 * plumbed) and we are in layer 2 mode, then we send the pkt out
756	 * over the physical adapter, and then check to see if any other
757	 * vnets are interested in it.
758	 */
759	if ((vswp->smode & VSW_LAYER2) &&
760	    ((caller == VSW_VNETPORT) || (caller == VSW_LOCALDEV))) {
761		nmp = vsw_dupmsgchain(mp);
762		if (nmp) {
763			if ((ret_m = vsw_tx_msg(vswp, nmp, caller, arg))
764			    != NULL) {
765				DERR(vswp, "%s: dropping pkt(s) consisting of "
766				    "%ld bytes of data for physical device",
767				    __func__, MBLKL(ret_m));
768				freemsgchain(ret_m);
769			}
770		}
771	}
772
773	READ_ENTER(&vswp->mfdbrw);
774	if (mod_hash_find(vswp->mfdb, (mod_hash_key_t)key,
775	    (mod_hash_val_t *)&entp) != 0) {
776		D3(vswp, "%s: no table entry found for addr 0x%llx",
777		    __func__, key);
778	} else {
779		/*
780		 * Send to list of devices associated with this address...
781		 */
782		for (tpp = entp; tpp != NULL; tpp = tpp->nextp) {
783
784			/* dont send to ourselves */
785			if ((caller == VSW_VNETPORT) &&
786			    (tpp->d_addr == (void *)arg)) {
787				port = (vsw_port_t *)tpp->d_addr;
788				D3(vswp, "%s: not sending to ourselves"
789				    " : port %d", __func__, port->p_instance);
790				continue;
791
792			} else if ((caller == VSW_LOCALDEV) &&
793			    (tpp->d_type == VSW_LOCALDEV)) {
794				D2(vswp, "%s: not sending back up stack",
795				    __func__);
796				continue;
797			}
798
799			if (tpp->d_type == VSW_VNETPORT) {
800				port = (vsw_port_t *)tpp->d_addr;
801				D3(vswp, "%s: sending to port %ld for addr "
802				    "0x%llx", __func__, port->p_instance, key);
803
804				nmp = vsw_dupmsgchain(mp);
805				if (nmp) {
806					/*
807					 * The vswp->mfdbrw is protecting the
808					 * portp from getting destroyed here.
809					 * So, no ref_cnt is incremented here.
810					 */
811					(void) vsw_portsend(port, nmp);
812				}
813			} else {
814				vsw_mac_rx(vswp, NULL,
815				    mp, VSW_MACRX_COPYMSG);
816				D2(vswp, "%s: sending up stack"
817				    " for addr 0x%llx", __func__, key);
818				check_if = B_FALSE;
819			}
820		}
821	}
822
823	RW_EXIT(&vswp->mfdbrw);
824
825	/*
826	 * If the pkt came from either a vnet or from physical device,
827	 * and if we havent already sent the pkt up the stack then we
828	 * check now if we can/should (i.e. the interface is plumbed
829	 * and in promisc mode).
830	 */
831	if ((check_if) &&
832	    ((caller == VSW_VNETPORT) || (caller == VSW_PHYSDEV))) {
833		vsw_mac_rx(vswp, NULL, mp,
834		    VSW_MACRX_PROMISC | VSW_MACRX_COPYMSG);
835	}
836
837	freemsgchain(mp);
838
839	D1(vswp, "%s: exit", __func__);
840
841	return (0);
842}
843
/*
 * Set up vlan tracking for the given vsw device or port: first create its
 * vlan id hash table, then insert every vlan id that the device or port
 * has been assigned into that table.
 *
 * Arguments:
 *   arg:  vsw device (vsw_t *) or port (vsw_port_t *).
 *   type: type of arg; VSW_LOCALDEV(vsw device) or VSW_VNETPORT(port).
 */
void
vsw_create_vlans(void *arg, int type)
{
	/* the hash table has to exist before ids can be added to it */
	vsw_vlan_create_hash(arg, type);

	/* populate the table with the assigned vlan ids */
	vsw_vlan_add_ids(arg, type);
}
861
/*
 * Tear down vlan tracking for the given vsw device or port: first remove
 * its vlan ids from the hash table, then destroy the hash table itself.
 *
 * Arguments:
 *   arg:  vsw device (vsw_t *) or port (vsw_port_t *).
 *   type: type of arg; VSW_LOCALDEV(vsw device) or VSW_VNETPORT(port).
 */
void
vsw_destroy_vlans(void *arg, int type)
{
	/* empty the table of the assigned vlan ids */
	vsw_vlan_remove_ids(arg, type);

	/* then get rid of the table */
	vsw_vlan_destroy_hash(arg, type);
}
878
879/*
880 * Create a vlan-id hash table for the given vsw device or port.
881 */
882static void
883vsw_vlan_create_hash(void *arg, int type)
884{
885	char		hashname[MAXNAMELEN];
886
887	if (type == VSW_LOCALDEV) {
888		vsw_t		*vswp = (vsw_t *)arg;
889
890		(void) snprintf(hashname, MAXNAMELEN, "vsw%d-vlan-hash",
891		    vswp->instance);
892
893		vswp->vlan_nchains = vsw_vlan_nchains;
894		vswp->vlan_hashp = mod_hash_create_idhash(hashname,
895		    vswp->vlan_nchains, mod_hash_null_valdtor);
896
897	} else if (type == VSW_VNETPORT) {
898		vsw_port_t	*portp = (vsw_port_t *)arg;
899
900		(void) snprintf(hashname, MAXNAMELEN, "port%d-vlan-hash",
901		    portp->p_instance);
902
903		portp->vlan_nchains = vsw_vlan_nchains;
904		portp->vlan_hashp = mod_hash_create_idhash(hashname,
905		    portp->vlan_nchains, mod_hash_null_valdtor);
906
907	} else {
908		return;
909	}
910}
911
912/*
913 * Destroy the vlan-id hash table for the given vsw device or port.
914 */
915static void
916vsw_vlan_destroy_hash(void *arg, int type)
917{
918	if (type == VSW_LOCALDEV) {
919		vsw_t		*vswp = (vsw_t *)arg;
920
921		mod_hash_destroy_hash(vswp->vlan_hashp);
922		vswp->vlan_nchains = 0;
923	} else if (type == VSW_VNETPORT) {
924		vsw_port_t	*portp = (vsw_port_t *)arg;
925
926		mod_hash_destroy_hash(portp->vlan_hashp);
927		portp->vlan_nchains = 0;
928	} else {
929		return;
930	}
931}
932
933/*
934 * Add vlan ids of the given vsw device or port into its hash table.
935 */
936void
937vsw_vlan_add_ids(void *arg, int type)
938{
939	int	rv;
940	int	i;
941
942	if (type == VSW_LOCALDEV) {
943		vsw_t		*vswp = (vsw_t *)arg;
944
945		rv = mod_hash_insert(vswp->vlan_hashp,
946		    (mod_hash_key_t)VLAN_ID_KEY(vswp->pvid),
947		    (mod_hash_val_t)B_TRUE);
948		if (rv != 0) {
949			cmn_err(CE_WARN, "vsw%d: Duplicate vlan-id(%d) for "
950			    "the interface", vswp->instance, vswp->pvid);
951		}
952
953		for (i = 0; i < vswp->nvids; i++) {
954			rv = mod_hash_insert(vswp->vlan_hashp,
955			    (mod_hash_key_t)VLAN_ID_KEY(vswp->vids[i].vl_vid),
956			    (mod_hash_val_t)B_TRUE);
957			if (rv != 0) {
958				cmn_err(CE_WARN, "vsw%d: Duplicate vlan-id(%d)"
959				    " for the interface", vswp->instance,
960				    vswp->pvid);
961			}
962		}
963
964	} else if (type == VSW_VNETPORT) {
965		vsw_port_t	*portp = (vsw_port_t *)arg;
966		vsw_t		*vswp = portp->p_vswp;
967
968		rv = mod_hash_insert(portp->vlan_hashp,
969		    (mod_hash_key_t)VLAN_ID_KEY(portp->pvid),
970		    (mod_hash_val_t)B_TRUE);
971		if (rv != 0) {
972			cmn_err(CE_WARN, "vsw%d: Duplicate vlan-id(%d) for "
973			    "the port(%d)", vswp->instance, vswp->pvid,
974			    portp->p_instance);
975		}
976
977		for (i = 0; i < portp->nvids; i++) {
978			rv = mod_hash_insert(portp->vlan_hashp,
979			    (mod_hash_key_t)VLAN_ID_KEY(portp->vids[i].vl_vid),
980			    (mod_hash_val_t)B_TRUE);
981			if (rv != 0) {
982				cmn_err(CE_WARN, "vsw%d: Duplicate vlan-id(%d)"
983				    " for the port(%d)", vswp->instance,
984				    vswp->pvid, portp->p_instance);
985			}
986		}
987
988	}
989}
990
991/*
992 * Remove vlan ids of the given vsw device or port from its hash table.
993 */
994void
995vsw_vlan_remove_ids(void *arg, int type)
996{
997	mod_hash_val_t	vp;
998	int		rv;
999	int		i;
1000
1001	if (type == VSW_LOCALDEV) {
1002		vsw_t		*vswp = (vsw_t *)arg;
1003
1004		rv = vsw_vlan_lookup(vswp->vlan_hashp, vswp->pvid);
1005		if (rv == B_TRUE) {
1006			rv = mod_hash_remove(vswp->vlan_hashp,
1007			    (mod_hash_key_t)VLAN_ID_KEY(vswp->pvid),
1008			    (mod_hash_val_t *)&vp);
1009			ASSERT(rv == 0);
1010		}
1011
1012		for (i = 0; i < vswp->nvids; i++) {
1013			rv = vsw_vlan_lookup(vswp->vlan_hashp,
1014			    vswp->vids[i].vl_vid);
1015			if (rv == B_TRUE) {
1016				rv = mod_hash_remove(vswp->vlan_hashp,
1017				    (mod_hash_key_t)VLAN_ID_KEY(
1018				    vswp->vids[i].vl_vid),
1019				    (mod_hash_val_t *)&vp);
1020				ASSERT(rv == 0);
1021			}
1022		}
1023
1024	} else if (type == VSW_VNETPORT) {
1025		vsw_port_t	*portp = (vsw_port_t *)arg;
1026
1027		portp = (vsw_port_t *)arg;
1028		rv = vsw_vlan_lookup(portp->vlan_hashp, portp->pvid);
1029		if (rv == B_TRUE) {
1030			rv = mod_hash_remove(portp->vlan_hashp,
1031			    (mod_hash_key_t)VLAN_ID_KEY(portp->pvid),
1032			    (mod_hash_val_t *)&vp);
1033			ASSERT(rv == 0);
1034		}
1035
1036		for (i = 0; i < portp->nvids; i++) {
1037			rv = vsw_vlan_lookup(portp->vlan_hashp,
1038			    portp->vids[i].vl_vid);
1039			if (rv == B_TRUE) {
1040				rv = mod_hash_remove(portp->vlan_hashp,
1041				    (mod_hash_key_t)VLAN_ID_KEY(
1042				    portp->vids[i].vl_vid),
1043				    (mod_hash_val_t *)&vp);
1044				ASSERT(rv == 0);
1045			}
1046		}
1047
1048	} else {
1049		return;
1050	}
1051}
1052
1053/*
1054 * Find the given vlan id in the hash table.
1055 * Return: B_TRUE if the id is found; B_FALSE if not found.
1056 */
1057boolean_t
1058vsw_vlan_lookup(mod_hash_t *vlan_hashp, uint16_t vid)
1059{
1060	int		rv;
1061	mod_hash_val_t	vp;
1062
1063	rv = mod_hash_find(vlan_hashp, VLAN_ID_KEY(vid), (mod_hash_val_t *)&vp);
1064
1065	if (rv != 0)
1066		return (B_FALSE);
1067
1068	return (B_TRUE);
1069}
1070
1071/*
1072 * Add an entry into FDB for the given vsw.
1073 */
1074void
1075vsw_fdbe_add(vsw_t *vswp, void *port)
1076{
1077	uint64_t	addr = 0;
1078	vsw_port_t	*portp;
1079	vsw_fdbe_t	*fp;
1080	int		rv;
1081
1082	portp = (vsw_port_t *)port;
1083	KEY_HASH(addr, &portp->p_macaddr);
1084
1085	fp = kmem_zalloc(sizeof (vsw_fdbe_t), KM_SLEEP);
1086	fp->portp = port;
1087
1088	/*
1089	 * Note: duplicate keys will be rejected by mod_hash.
1090	 */
1091	rv = mod_hash_insert(vswp->fdb_hashp, (mod_hash_key_t)addr,
1092	    (mod_hash_val_t)fp);
1093	if (rv != 0) {
1094		cmn_err(CE_WARN, "vsw%d: Duplicate mac-address(%s) for "
1095		    "the port(%d)", vswp->instance,
1096		    ether_sprintf(&portp->p_macaddr), portp->p_instance);
1097	}
1098}
1099
1100/*
1101 * Remove an entry from FDB.
1102 */
1103void
1104vsw_fdbe_del(vsw_t *vswp, struct ether_addr *eaddr)
1105{
1106	uint64_t	addr = 0;
1107	vsw_fdbe_t	*fp;
1108	int		rv;
1109
1110	KEY_HASH(addr, eaddr);
1111
1112	/*
1113	 * Remove the entry from fdb hash table.
1114	 * This prevents further references to this fdb entry.
1115	 */
1116	rv = mod_hash_remove(vswp->fdb_hashp, (mod_hash_key_t)addr,
1117	    (mod_hash_val_t *)&fp);
1118	if (rv != 0) {
1119		/* invalid key? */
1120		return;
1121	}
1122
1123	/*
1124	 * If there are threads already ref holding before the entry was
1125	 * removed from hash table, then wait for ref count to drop to zero.
1126	 */
1127	while (fp->refcnt != 0) {
1128		delay(drv_usectohz(vsw_fdbe_refcnt_delay));
1129	}
1130
1131	kmem_free(fp, sizeof (*fp));
1132}
1133
1134/*
1135 * Search fdb for a given mac address. If an entry is found, hold
1136 * a reference to it and return the entry, else returns NULL.
1137 */
1138static vsw_fdbe_t *
1139vsw_fdbe_find(vsw_t *vswp, struct ether_addr *addrp)
1140{
1141	uint64_t	key = 0;
1142	vsw_fdbe_t	*fp;
1143	int		rv;
1144
1145	KEY_HASH(key, addrp);
1146
1147	rv = mod_hash_find_cb(vswp->fdb_hashp, (mod_hash_key_t)key,
1148	    (mod_hash_val_t *)&fp, vsw_fdbe_find_cb);
1149
1150	if (rv != 0)
1151		return (NULL);
1152
1153	return (fp);
1154}
1155
1156/*
1157 * Callback function provided to mod_hash_find_cb(). After finding the fdb
1158 * entry corresponding to the key (macaddr), this callback will be invoked by
1159 * mod_hash_find_cb() to atomically increment the reference count on the fdb
1160 * entry before returning the found entry.
1161 */
1162static void
1163vsw_fdbe_find_cb(mod_hash_key_t key, mod_hash_val_t val)
1164{
1165	_NOTE(ARGUNUSED(key))
1166	VSW_FDBE_REFHOLD((vsw_fdbe_t *)val);
1167}
1168
1169/*
1170 * A given frame must be always tagged with the appropriate vlan id (unless it
1171 * is in the default-vlan) before the mac address switching function is called.
1172 * Otherwise, after switching function determines the destination, we cannot
1173 * figure out if the destination belongs to the the same vlan that the frame
1174 * originated from and if it needs tag/untag. Frames which are inbound from
1175 * the external(physical) network over a vlan trunk link are always tagged.
1176 * However frames which are received from a vnet-port over ldc or frames which
1177 * are coming down the stack on the service domain over vsw interface may be
1178 * untagged. These frames must be tagged with the appropriate pvid of the
1179 * sender (vnet-port or vsw device), before invoking the switching function.
1180 *
1181 * Arguments:
1182 *   arg:    caller of the function.
1183 *   type:   type of arg(caller): VSW_LOCALDEV(vsw) or VSW_VNETPORT(port)
1184 *   mp:     frame(s) to be tagged.
1185 */
1186mblk_t *
1187vsw_vlan_frame_pretag(void *arg, int type, mblk_t *mp)
1188{
1189	vsw_t			*vswp;
1190	vsw_port_t		*portp;
1191	struct ether_header	*ehp;
1192	mblk_t			*bp;
1193	mblk_t			*bpt;
1194	mblk_t			*bph;
1195	mblk_t			*bpn;
1196	uint16_t		pvid;
1197
1198	ASSERT((type == VSW_LOCALDEV) || (type == VSW_VNETPORT));
1199
1200	if (type == VSW_LOCALDEV) {
1201		vswp = (vsw_t *)arg;
1202		pvid = vswp->pvid;
1203		portp = NULL;
1204	} else {
1205		/* VSW_VNETPORT */
1206		portp = (vsw_port_t *)arg;
1207		pvid = portp->pvid;
1208		vswp = portp->p_vswp;
1209	}
1210
1211	bpn = bph = bpt = NULL;
1212
1213	for (bp = mp; bp != NULL; bp = bpn) {
1214
1215		bpn = bp->b_next;
1216		bp->b_next = bp->b_prev = NULL;
1217
1218		/* Determine if it is an untagged frame */
1219		ehp = (struct ether_header *)bp->b_rptr;
1220
1221		if (ehp->ether_type != ETHERTYPE_VLAN) {	/* untagged */
1222
1223			/* no need to tag if the frame is in default vlan */
1224			if (pvid != vswp->default_vlan_id) {
1225				bp = vnet_vlan_insert_tag(bp, pvid);
1226				if (bp == NULL) {
1227					continue;
1228				}
1229			}
1230		}
1231
1232		/* build a chain of processed packets */
1233		if (bph == NULL) {
1234			bph = bpt = bp;
1235		} else {
1236			bpt->b_next = bp;
1237			bpt = bp;
1238		}
1239
1240	}
1241
1242	return (bph);
1243}
1244
1245/*
1246 * Frames destined to a vnet-port or to the local vsw interface, must be
1247 * untagged if necessary before sending. This function first checks that the
1248 * frame can be sent to the destination in the vlan identified by the frame
1249 * tag. Note that when this function is invoked the frame must have been
1250 * already tagged (unless it is in the default-vlan). Because, this function is
1251 * called when the switching function determines the destination and invokes
1252 * its send function (vnet-port or vsw interface) and all frames would have
1253 * been tagged by this time (see comments in vsw_vlan_frame_pretag()).
1254 *
1255 * Arguments:
1256 *   arg:    destination device.
1257 *   type:   type of arg(destination): VSW_LOCALDEV(vsw) or VSW_VNETPORT(port)
1258 *   np:     head of pkt chain to be validated and untagged.
1259 *   npt:    tail of pkt chain to be validated and untagged.
1260 *
1261 * Returns:
1262 *   np:     head of updated chain of packets
1263 *   npt:    tail of updated chain of packets
1264 *   rv:     count of the packets in the returned list
1265 */
1266uint32_t
1267vsw_vlan_frame_untag(void *arg, int type, mblk_t **np, mblk_t **npt)
1268{
1269	mblk_t			*bp;
1270	mblk_t			*bpt;
1271	mblk_t			*bph;
1272	mblk_t			*bpn;
1273	vsw_port_t		*portp;
1274	vsw_t			*vswp;
1275	uint32_t		count;
1276	struct ether_header	*ehp;
1277	boolean_t		is_tagged;
1278	boolean_t		rv;
1279	uint16_t		vlan_id;
1280	uint16_t		pvid;
1281	mod_hash_t		*vlan_hashp;
1282
1283	ASSERT((type == VSW_LOCALDEV) || (type == VSW_VNETPORT));
1284
1285
1286	if (type == VSW_LOCALDEV) {
1287		vswp = (vsw_t *)arg;
1288		pvid = vswp->pvid;
1289		vlan_hashp = vswp->vlan_hashp;
1290		portp = NULL;
1291	} else {
1292		/* type == VSW_VNETPORT */
1293		portp = (vsw_port_t *)arg;
1294		vswp = portp->p_vswp;
1295		vlan_hashp = portp->vlan_hashp;
1296		pvid = portp->pvid;
1297	}
1298
1299	/*
1300	 * If the MAC layer switching in place, then
1301	 * untagging required only if the pvid is not
1302	 * the same as default_vlan_id. This is because,
1303	 * the MAC layer will send packets for the
1304	 * registered vlans only.
1305	 */
1306	if ((vswp->mac_cl_switching == B_TRUE) &&
1307	    (pvid == vswp->default_vlan_id)) {
1308		/* simply count and set the tail */
1309		count = 1;
1310		bp = *np;
1311		ASSERT(bp != NULL);
1312		while (bp->b_next != NULL) {
1313			bp = bp->b_next;
1314			count++;
1315		}
1316		*npt = bp;
1317		return (count);
1318	}
1319
1320	bpn = bph = bpt = NULL;
1321	count = 0;
1322
1323	for (bp = *np; bp != NULL; bp = bpn) {
1324
1325		bpn = bp->b_next;
1326		bp->b_next = bp->b_prev = NULL;
1327
1328		/*
1329		 * Determine the vlan id that the frame belongs to.
1330		 */
1331		ehp = (struct ether_header *)bp->b_rptr;
1332		is_tagged = vsw_frame_lookup_vid(arg, type, ehp, &vlan_id);
1333
1334		/*
1335		 * If MAC layer switching in place, then we
1336		 * need to untag only if the tagged packet has
1337		 * vlan-id same as the pvid.
1338		 */
1339		if (vswp->mac_cl_switching == B_TRUE) {
1340
1341			/* only tagged packets expected here */
1342			ASSERT(is_tagged == B_TRUE);
1343			if (vlan_id == pvid) {
1344				bp = vnet_vlan_remove_tag(bp);
1345				if (bp == NULL) {
1346					/* packet dropped */
1347					continue;
1348				}
1349			}
1350		} else { /* No MAC layer switching */
1351
1352			/*
1353			 * Check the frame header if tag/untag is  needed.
1354			 */
1355			if (is_tagged == B_FALSE) {
1356				/*
1357				 * Untagged frame. We shouldn't have an
1358				 * untagged packet at this point, unless
1359				 * the destination's  vlan id is
1360				 * default-vlan-id; if it is not the
1361				 * default-vlan-id, we drop the packet.
1362				 */
1363				if (vlan_id != vswp->default_vlan_id) {
1364					/* drop the packet */
1365					freemsg(bp);
1366					continue;
1367				}
1368			} else {	/* Tagged */
1369				/*
1370				 * Tagged frame, untag if it's the
1371				 * destination's pvid.
1372				 */
1373				if (vlan_id == pvid) {
1374
1375					bp = vnet_vlan_remove_tag(bp);
1376					if (bp == NULL) {
1377						/* packet dropped */
1378						continue;
1379					}
1380				} else {
1381
1382					/*
1383					 * Check if the destination is in the
1384					 * same vlan.
1385					 */
1386					rv = vsw_vlan_lookup(vlan_hashp,
1387					    vlan_id);
1388					if (rv == B_FALSE) {
1389						/* drop the packet */
1390						freemsg(bp);
1391						continue;
1392					}
1393				}
1394
1395			}
1396		}
1397
1398		/* build a chain of processed packets */
1399		if (bph == NULL) {
1400			bph = bpt = bp;
1401		} else {
1402			bpt->b_next = bp;
1403			bpt = bp;
1404		}
1405		count++;
1406	}
1407
1408	*np = bph;
1409	*npt = bpt;
1410	return (count);
1411}
1412
1413/*
1414 * Lookup the vlan id of the given frame. If it is a vlan-tagged frame,
1415 * then the vlan-id is available in the tag; otherwise, its vlan id is
1416 * implicitly obtained based on the caller (destination of the frame:
1417 * VSW_VNETPORT or VSW_LOCALDEV).
1418 * The vlan id determined is returned in vidp.
1419 * Returns: B_TRUE if it is a tagged frame; B_FALSE if it is untagged.
1420 */
1421boolean_t
1422vsw_frame_lookup_vid(void *arg, int caller, struct ether_header *ehp,
1423	uint16_t *vidp)
1424{
1425	struct ether_vlan_header	*evhp;
1426	vsw_t				*vswp;
1427	vsw_port_t			*portp;
1428
1429	/* If it's a tagged frame, get the vid from vlan header */
1430	if (ehp->ether_type == ETHERTYPE_VLAN) {
1431
1432		evhp = (struct ether_vlan_header *)ehp;
1433		*vidp = VLAN_ID(ntohs(evhp->ether_tci));
1434		return (B_TRUE);
1435	}
1436
1437	/* Untagged frame; determine vlan id based on caller */
1438	switch (caller) {
1439
1440	case VSW_VNETPORT:
1441		/*
1442		 * packet destined to a vnet; vlan-id is pvid of vnet-port.
1443		 */
1444		portp = (vsw_port_t *)arg;
1445		*vidp = portp->pvid;
1446		break;
1447
1448	case VSW_LOCALDEV:
1449
1450		/*
1451		 * packet destined to vsw interface;
1452		 * vlan-id is port-vlan-id of vsw device.
1453		 */
1454		vswp = (vsw_t *)arg;
1455		*vidp = vswp->pvid;
1456		break;
1457	}
1458
1459	return (B_FALSE);
1460}
1461
1462/*
1463 * Add or remove multicast address(es).
1464 *
1465 * Returns 0 on success, 1 on failure.
1466 */
1467int
1468vsw_add_rem_mcst(vnet_mcast_msg_t *mcst_pkt, vsw_port_t *port)
1469{
1470	mcst_addr_t		*mcst_p = NULL;
1471	vsw_t			*vswp = port->p_vswp;
1472	uint64_t		addr = 0x0;
1473	int			i;
1474
1475	D1(vswp, "%s: enter", __func__);
1476
1477	D2(vswp, "%s: %d addresses", __func__, mcst_pkt->count);
1478
1479	for (i = 0; i < mcst_pkt->count; i++) {
1480		/*
1481		 * Convert address into form that can be used
1482		 * as hash table key.
1483		 */
1484		KEY_HASH(addr, &(mcst_pkt->mca[i]));
1485
1486		/*
1487		 * Add or delete the specified address/port combination.
1488		 */
1489		if (mcst_pkt->set == 0x1) {
1490			D3(vswp, "%s: adding multicast address 0x%llx for "
1491			    "port %ld", __func__, addr, port->p_instance);
1492			if (vsw_add_mcst(vswp, VSW_VNETPORT, addr, port) == 0) {
1493				/*
1494				 * Update the list of multicast
1495				 * addresses contained within the
1496				 * port structure to include this new
1497				 * one.
1498				 */
1499				mcst_p = kmem_zalloc(sizeof (mcst_addr_t),
1500				    KM_NOSLEEP);
1501				if (mcst_p == NULL) {
1502					DERR(vswp, "%s: unable to alloc mem",
1503					    __func__);
1504					(void) vsw_del_mcst(vswp,
1505					    VSW_VNETPORT, addr, port);
1506					return (1);
1507				}
1508
1509				mcst_p->nextp = NULL;
1510				mcst_p->addr = addr;
1511				ether_copy(&mcst_pkt->mca[i], &mcst_p->mca);
1512
1513				/*
1514				 * Program the address into HW. If the addr
1515				 * has already been programmed then the MAC
1516				 * just increments a ref counter (which is
1517				 * used when the address is being deleted)
1518				 */
1519				if (vsw_mac_multicast_add(vswp, port, mcst_p,
1520				    VSW_VNETPORT)) {
1521					(void) vsw_del_mcst(vswp,
1522					    VSW_VNETPORT, addr, port);
1523					kmem_free(mcst_p, sizeof (*mcst_p));
1524					return (1);
1525				}
1526
1527				mutex_enter(&port->mca_lock);
1528				mcst_p->nextp = port->mcap;
1529				port->mcap = mcst_p;
1530				mutex_exit(&port->mca_lock);
1531
1532			} else {
1533				DERR(vswp, "%s: error adding multicast "
1534				    "address 0x%llx for port %ld",
1535				    __func__, addr, port->p_instance);
1536				return (1);
1537			}
1538		} else {
1539			/*
1540			 * Delete an entry from the multicast hash
1541			 * table and update the address list
1542			 * appropriately.
1543			 */
1544			if (vsw_del_mcst(vswp, VSW_VNETPORT, addr, port) == 0) {
1545				D3(vswp, "%s: deleting multicast address "
1546				    "0x%llx for port %ld", __func__, addr,
1547				    port->p_instance);
1548
1549				mcst_p = vsw_del_addr(VSW_VNETPORT, port, addr);
1550				ASSERT(mcst_p != NULL);
1551
1552				/*
1553				 * Remove the address from HW. The address
1554				 * will actually only be removed once the ref
1555				 * count within the MAC layer has dropped to
1556				 * zero. I.e. we can safely call this fn even
1557				 * if other ports are interested in this
1558				 * address.
1559				 */
1560				vsw_mac_multicast_remove(vswp, port, mcst_p,
1561				    VSW_VNETPORT);
1562				kmem_free(mcst_p, sizeof (*mcst_p));
1563
1564			} else {
1565				DERR(vswp, "%s: error deleting multicast "
1566				    "addr 0x%llx for port %ld",
1567				    __func__, addr, port->p_instance);
1568				return (1);
1569			}
1570		}
1571	}
1572	D1(vswp, "%s: exit", __func__);
1573	return (0);
1574}
1575
1576/*
1577 * Add a new multicast entry.
1578 *
1579 * Search hash table based on address. If match found then
1580 * update associated val (which is chain of ports), otherwise
1581 * create new key/val (addr/port) pair and insert into table.
1582 */
1583int
1584vsw_add_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg)
1585{
1586	int		dup = 0;
1587	int		rv = 0;
1588	mfdb_ent_t	*ment = NULL;
1589	mfdb_ent_t	*tmp_ent = NULL;
1590	mfdb_ent_t	*new_ent = NULL;
1591	void		*tgt = NULL;
1592
1593	if (devtype == VSW_VNETPORT) {
1594		/*
1595		 * Being invoked from a vnet.
1596		 */
1597		ASSERT(arg != NULL);
1598		tgt = arg;
1599		D2(NULL, "%s: port %d : address 0x%llx", __func__,
1600		    ((vsw_port_t *)arg)->p_instance, addr);
1601	} else {
1602		/*
1603		 * We are being invoked via the m_multicst mac entry
1604		 * point.
1605		 */
1606		D2(NULL, "%s: address 0x%llx", __func__, addr);
1607		tgt = (void *)vswp;
1608	}
1609
1610	WRITE_ENTER(&vswp->mfdbrw);
1611	if (mod_hash_find(vswp->mfdb, (mod_hash_key_t)addr,
1612	    (mod_hash_val_t *)&ment) != 0) {
1613
1614		/* address not currently in table */
1615		ment = kmem_alloc(sizeof (mfdb_ent_t), KM_SLEEP);
1616		ment->d_addr = (void *)tgt;
1617		ment->d_type = devtype;
1618		ment->nextp = NULL;
1619
1620		if (mod_hash_insert(vswp->mfdb, (mod_hash_key_t)addr,
1621		    (mod_hash_val_t)ment) != 0) {
1622			DERR(vswp, "%s: hash table insertion failed", __func__);
1623			kmem_free(ment, sizeof (mfdb_ent_t));
1624			rv = 1;
1625		} else {
1626			D2(vswp, "%s: added initial entry for 0x%llx to "
1627			    "table", __func__, addr);
1628		}
1629	} else {
1630		/*
1631		 * Address in table. Check to see if specified port
1632		 * is already associated with the address. If not add
1633		 * it now.
1634		 */
1635		tmp_ent = ment;
1636		while (tmp_ent != NULL) {
1637			if (tmp_ent->d_addr == (void *)tgt) {
1638				if (devtype == VSW_VNETPORT) {
1639					DERR(vswp, "%s: duplicate port entry "
1640					    "found for portid %ld and key "
1641					    "0x%llx", __func__,
1642					    ((vsw_port_t *)arg)->p_instance,
1643					    addr);
1644				} else {
1645					DERR(vswp, "%s: duplicate entry found"
1646					    "for key 0x%llx", __func__, addr);
1647				}
1648				rv = 1;
1649				dup = 1;
1650				break;
1651			}
1652			tmp_ent = tmp_ent->nextp;
1653		}
1654
1655		/*
1656		 * Port not on list so add it to end now.
1657		 */
1658		if (0 == dup) {
1659			D2(vswp, "%s: added entry for 0x%llx to table",
1660			    __func__, addr);
1661			new_ent = kmem_alloc(sizeof (mfdb_ent_t), KM_SLEEP);
1662			new_ent->d_addr = (void *)tgt;
1663			new_ent->d_type = devtype;
1664			new_ent->nextp = NULL;
1665
1666			tmp_ent = ment;
1667			while (tmp_ent->nextp != NULL)
1668				tmp_ent = tmp_ent->nextp;
1669
1670			tmp_ent->nextp = new_ent;
1671		}
1672	}
1673
1674	RW_EXIT(&vswp->mfdbrw);
1675	return (rv);
1676}
1677
1678/*
1679 * Remove a multicast entry from the hashtable.
1680 *
1681 * Search hash table based on address. If match found, scan
1682 * list of ports associated with address. If specified port
1683 * found remove it from list.
1684 */
1685int
1686vsw_del_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg)
1687{
1688	mfdb_ent_t	*ment = NULL;
1689	mfdb_ent_t	*curr_p, *prev_p;
1690	void		*tgt = NULL;
1691
1692	D1(vswp, "%s: enter", __func__);
1693
1694	if (devtype == VSW_VNETPORT) {
1695		tgt = (vsw_port_t *)arg;
1696		D2(vswp, "%s: removing port %d from mFDB for address"
1697		    " 0x%llx", __func__, ((vsw_port_t *)tgt)->p_instance, addr);
1698	} else {
1699		D2(vswp, "%s: removing entry", __func__);
1700		tgt = (void *)vswp;
1701	}
1702
1703	WRITE_ENTER(&vswp->mfdbrw);
1704	if (mod_hash_find(vswp->mfdb, (mod_hash_key_t)addr,
1705	    (mod_hash_val_t *)&ment) != 0) {
1706		D2(vswp, "%s: address 0x%llx not in table", __func__, addr);
1707		RW_EXIT(&vswp->mfdbrw);
1708		return (1);
1709	}
1710
1711	prev_p = curr_p = ment;
1712
1713	while (curr_p != NULL) {
1714		if (curr_p->d_addr == (void *)tgt) {
1715			if (devtype == VSW_VNETPORT) {
1716				D2(vswp, "%s: port %d found", __func__,
1717				    ((vsw_port_t *)tgt)->p_instance);
1718			} else {
1719				D2(vswp, "%s: instance found", __func__);
1720			}
1721
1722			if (prev_p == curr_p) {
1723				/*
1724				 * head of list, if no other element is in
1725				 * list then destroy this entry, otherwise
1726				 * just replace it with updated value.
1727				 */
1728				ment = curr_p->nextp;
1729				if (ment == NULL) {
1730					(void) mod_hash_destroy(vswp->mfdb,
1731					    (mod_hash_val_t)addr);
1732				} else {
1733					(void) mod_hash_replace(vswp->mfdb,
1734					    (mod_hash_key_t)addr,
1735					    (mod_hash_val_t)ment);
1736				}
1737			} else {
1738				/*
1739				 * Not head of list, no need to do
1740				 * replacement, just adjust list pointers.
1741				 */
1742				prev_p->nextp = curr_p->nextp;
1743			}
1744			break;
1745		}
1746
1747		prev_p = curr_p;
1748		curr_p = curr_p->nextp;
1749	}
1750
1751	RW_EXIT(&vswp->mfdbrw);
1752
1753	D1(vswp, "%s: exit", __func__);
1754
1755	if (curr_p == NULL)
1756		return (1);
1757	kmem_free(curr_p, sizeof (mfdb_ent_t));
1758	return (0);
1759}
1760
1761/*
1762 * Port is being deleted, but has registered an interest in one
1763 * or more multicast groups. Using the list of addresses maintained
1764 * within the port structure find the appropriate entry in the hash
1765 * table and remove this port from the list of interested ports.
1766 */
1767void
1768vsw_del_mcst_port(vsw_port_t *port)
1769{
1770	mcst_addr_t	*mcap = NULL;
1771	vsw_t		*vswp = port->p_vswp;
1772
1773	D1(vswp, "%s: enter", __func__);
1774
1775	mutex_enter(&port->mca_lock);
1776
1777	while ((mcap = port->mcap) != NULL) {
1778
1779		port->mcap = mcap->nextp;
1780
1781		mutex_exit(&port->mca_lock);
1782
1783		(void) vsw_del_mcst(vswp, VSW_VNETPORT,
1784		    mcap->addr, port);
1785
1786		/*
1787		 * Remove the address from HW. The address
1788		 * will actually only be removed once the ref
1789		 * count within the MAC layer has dropped to
1790		 * zero. I.e. we can safely call this fn even
1791		 * if other ports are interested in this
1792		 * address.
1793		 */
1794		vsw_mac_multicast_remove(vswp, port, mcap, VSW_VNETPORT);
1795		kmem_free(mcap, sizeof (*mcap));
1796
1797		mutex_enter(&port->mca_lock);
1798
1799	}
1800
1801	mutex_exit(&port->mca_lock);
1802
1803	D1(vswp, "%s: exit", __func__);
1804}
1805
1806/*
1807 * This vsw instance is detaching, but has registered an interest in one
1808 * or more multicast groups. Using the list of addresses maintained
1809 * within the vsw structure find the appropriate entry in the hash
1810 * table and remove this instance from the list of interested ports.
1811 */
1812void
1813vsw_del_mcst_vsw(vsw_t *vswp)
1814{
1815	mcst_addr_t	*next_p = NULL;
1816
1817	D1(vswp, "%s: enter", __func__);
1818
1819	mutex_enter(&vswp->mca_lock);
1820
1821	while (vswp->mcap != NULL) {
1822		DERR(vswp, "%s: deleting addr 0x%llx",
1823		    __func__, vswp->mcap->addr);
1824		(void) vsw_del_mcst(vswp, VSW_LOCALDEV, vswp->mcap->addr, NULL);
1825
1826		next_p = vswp->mcap->nextp;
1827		kmem_free(vswp->mcap, sizeof (mcst_addr_t));
1828		vswp->mcap = next_p;
1829	}
1830
1831	vswp->mcap = NULL;
1832	mutex_exit(&vswp->mca_lock);
1833
1834	D1(vswp, "%s: exit", __func__);
1835}
1836
1837mblk_t *
1838vsw_get_same_dest_list(struct ether_header *ehp, mblk_t **mpp)
1839{
1840	mblk_t			*bp;
1841	mblk_t			*nbp;
1842	mblk_t			*head = NULL;
1843	mblk_t			*tail = NULL;
1844	mblk_t			*prev = NULL;
1845	struct ether_header	*behp;
1846
1847	/* process the chain of packets */
1848	bp = *mpp;
1849	while (bp) {
1850		nbp = bp->b_next;
1851		behp = (struct ether_header *)bp->b_rptr;
1852		bp->b_prev = NULL;
1853		if (ether_cmp(&ehp->ether_dhost, &behp->ether_dhost) == 0) {
1854			if (prev == NULL) {
1855				*mpp = nbp;
1856			} else {
1857				prev->b_next = nbp;
1858			}
1859			bp->b_next =  NULL;
1860			if (head == NULL) {
1861				head = tail = bp;
1862			} else {
1863				tail->b_next = bp;
1864				tail = bp;
1865			}
1866		} else {
1867			prev = bp;
1868		}
1869		bp = nbp;
1870	}
1871	return (head);
1872}
1873
1874static mblk_t *
1875vsw_dupmsgchain(mblk_t *mp)
1876{
1877	mblk_t	*nmp = NULL;
1878	mblk_t	**nmpp = &nmp;
1879
1880	for (; mp != NULL; mp = mp->b_next) {
1881		if ((*nmpp = dupmsg(mp)) == NULL) {
1882			freemsgchain(nmp);
1883			return (NULL);
1884		}
1885
1886		nmpp = &((*nmpp)->b_next);
1887	}
1888
1889	return (nmp);
1890}
1891