vsw_ldc.c revision 5464:55cb315a7aad
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/types.h>
#include <sys/errno.h>
#include <sys/debug.h>
#include <sys/time.h>
#include <sys/sysmacros.h>
#include <sys/systm.h>
#include <sys/user.h>
#include <sys/stropts.h>
#include <sys/stream.h>
#include <sys/strlog.h>
#include <sys/strsubr.h>
#include <sys/cmn_err.h>
#include <sys/cpu.h>
#include <sys/kmem.h>
#include <sys/conf.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/ksynch.h>
#include <sys/stat.h>
#include <sys/kstat.h>
#include <sys/vtrace.h>
#include <sys/strsun.h>
#include <sys/dlpi.h>
#include <sys/ethernet.h>
#include <net/if.h>
#include <sys/varargs.h>
#include <sys/machsystm.h>
#include <sys/modctl.h>
#include <sys/modhash.h>
#include <sys/mac.h>
#include <sys/mac_ether.h>
#include <sys/taskq.h>
#include <sys/note.h>
#include <sys/mach_descrip.h>
#include <sys/mdeg.h>
#include <sys/ldc.h>
#include <sys/vsw_fdb.h>
#include <sys/vsw.h>
#include <sys/vio_mailbox.h>
#include <sys/vnet_mailbox.h>
#include <sys/vnet_common.h>
#include <sys/vio_util.h>
#include <sys/sdt.h>
#include <sys/atomic.h>
#include <sys/callb.h>

/* Port add/deletion/etc routines */
static	int vsw_port_delete(vsw_port_t *port);
static	int vsw_ldc_attach(vsw_port_t *port, uint64_t ldc_id);
static	int vsw_ldc_detach(vsw_port_t *port, uint64_t ldc_id);
static	int vsw_init_ldcs(vsw_port_t *port);
static	int vsw_uninit_ldcs(vsw_port_t *port);
static	int vsw_ldc_init(vsw_ldc_t *ldcp);
static	int vsw_ldc_uninit(vsw_ldc_t *ldcp);
static	int vsw_drain_ldcs(vsw_port_t *port);
static	int vsw_drain_port_taskq(vsw_port_t *port);
static	void vsw_marker_task(void *);
static	int vsw_plist_del_node(vsw_t *, vsw_port_t *port);
int vsw_detach_ports(vsw_t *vswp);
int vsw_port_add(vsw_t *vswp, md_t *mdp, mde_cookie_t *node);
mcst_addr_t *vsw_del_addr(uint8_t devtype, void *arg, uint64_t addr);
int vsw_port_detach(vsw_t *vswp, int p_instance);
int vsw_portsend(vsw_port_t *port, mblk_t *mp, mblk_t *mpt);
int vsw_port_attach(vsw_t *vswp, int p_instance,
	uint64_t *ldcids, int nids, struct ether_addr *macaddr);
vsw_port_t *vsw_lookup_port(vsw_t *vswp, int p_instance);

/* Interrupt routines */
static	uint_t vsw_ldc_cb(uint64_t cb, caddr_t arg);

/* Handshake routines */
static	void vsw_ldc_reinit(vsw_ldc_t *);
static	void vsw_process_conn_evt(vsw_ldc_t *, uint16_t);
static	void vsw_conn_task(void *);
static	int vsw_check_flag(vsw_ldc_t *, int, uint64_t);
static	void vsw_next_milestone(vsw_ldc_t *);
static	int vsw_supported_version(vio_ver_msg_t *);

/* Data processing routines */
static void vsw_process_pkt(void *);
static void vsw_dispatch_ctrl_task(vsw_ldc_t *, void *, vio_msg_tag_t);
static void vsw_process_ctrl_pkt(void *);
static void vsw_process_ctrl_ver_pkt(vsw_ldc_t *, void *);
static void vsw_process_ctrl_attr_pkt(vsw_ldc_t *, void *);
static void vsw_process_ctrl_mcst_pkt(vsw_ldc_t *, void *);
static void vsw_process_ctrl_dring_reg_pkt(vsw_ldc_t *, void *);
static void vsw_process_ctrl_dring_unreg_pkt(vsw_ldc_t *, void *);
static void vsw_process_ctrl_rdx_pkt(vsw_ldc_t *, void *);
static void vsw_process_data_pkt(vsw_ldc_t *, void *, vio_msg_tag_t);
static void vsw_process_data_dring_pkt(vsw_ldc_t *, void *);
static void vsw_process_data_raw_pkt(vsw_ldc_t *, void *);
static void vsw_process_data_ibnd_pkt(vsw_ldc_t *, void *);
static void vsw_process_err_pkt(vsw_ldc_t *, void *, vio_msg_tag_t);

/* Switching/data transmit routines */
static	int vsw_dringsend(vsw_ldc_t *, mblk_t *);
static	int vsw_descrsend(vsw_ldc_t *, mblk_t *);
static int vsw_ldcsend(vsw_ldc_t *ldcp, mblk_t *mp, int retries);

/* Packet creation routines */
static void vsw_send_ver(void *);
static void vsw_send_attr(vsw_ldc_t *);
static vio_dring_reg_msg_t *vsw_create_dring_info_pkt(vsw_ldc_t *);
static void vsw_send_dring_info(vsw_ldc_t *);
static void vsw_send_rdx(vsw_ldc_t *);
static int vsw_send_msg(vsw_ldc_t *, void *, int, boolean_t);

/* Dring routines */
static dring_info_t *vsw_create_dring(vsw_ldc_t *);
static void vsw_create_privring(vsw_ldc_t *);
static int vsw_setup_ring(vsw_ldc_t *ldcp, dring_info_t *dp);
static int vsw_dring_find_free_desc(dring_info_t *, vsw_private_desc_t **,
    int *);
static dring_info_t *vsw_ident2dring(lane_t *, uint64_t);
static int vsw_reclaim_dring(dring_info_t *dp, int start);

static void vsw_set_lane_attr(vsw_t *, lane_t *);
static int vsw_check_attr(vnet_attr_msg_t *, vsw_port_t *);
static int vsw_dring_match(dring_info_t *dp, vio_dring_reg_msg_t *msg);
static int vsw_mem_cookie_match(ldc_mem_cookie_t *, ldc_mem_cookie_t *);
static int vsw_check_dring_info(vio_dring_reg_msg_t *);

/* Rcv/Tx thread routines */
static void vsw_stop_tx_thread(vsw_ldc_t *ldcp);
static void vsw_ldc_tx_worker(void *arg);
static uint_t vsw_rx_softintr(caddr_t arg1, caddr_t arg2);
static void vsw_stop_rx_thread(vsw_ldc_t *ldcp);
static void vsw_ldc_rx_worker(void *arg);

/* Misc support routines */
static	caddr_t vsw_print_ethaddr(uint8_t *addr, char *ebuf);
static void vsw_free_lane_resources(vsw_ldc_t *, uint64_t);
static int vsw_free_ring(dring_info_t *);
static void vsw_save_lmacaddr(vsw_t *vswp, uint64_t macaddr);
static int vsw_get_same_dest_list(struct ether_header *ehp,
    mblk_t **rhead, mblk_t **rtail, mblk_t **mpp);
static mblk_t *vsw_dupmsgchain(mblk_t *mp);
static void vsw_mac_rx(vsw_t *vswp, int caller, mac_resource_handle_t mrh,
    mblk_t *mp, mblk_t *mpt, vsw_macrx_flags_t flags);

/* Debugging routines */
static void dump_flags(uint64_t);
static void display_state(void);
static void display_lane(lane_t *);
static void display_ring(dring_info_t *);

/*
 * Functions imported from other files.
 */
extern int vsw_set_hw(vsw_t *, vsw_port_t *, int);
extern int vsw_unset_hw(vsw_t *, vsw_port_t *, int);
extern void vsw_reconfig_hw(vsw_t *);
extern int vsw_add_fdb(vsw_t *vswp, vsw_port_t *port);
extern int vsw_del_fdb(vsw_t *vswp, vsw_port_t *port);
extern int vsw_add_rem_mcst(vnet_mcast_msg_t *mcst_pkt, vsw_port_t *port);
extern void vsw_del_mcst_port(vsw_port_t *port);
extern int vsw_add_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg);
extern int vsw_del_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg);

#define	VSW_NUM_VMPOOLS		3	/* number of vio mblk pools */
#define	VSW_PORT_REF_DELAY	30	/* delay for port ref_cnt to become 0 */

/*
 * Tunables used in this file.
 */
extern int vsw_num_handshakes;
extern int vsw_wretries;
extern int vsw_desc_delay;
extern int vsw_read_attempts;
extern int vsw_ldc_tx_delay;
extern int vsw_ldc_tx_retries;
extern int vsw_ldc_tx_max_failures;
extern boolean_t vsw_ldc_rxthr_enabled;
extern boolean_t vsw_ldc_txthr_enabled;
extern uint32_t vsw_chain_len;
extern uint32_t vsw_mblk_size1;
extern uint32_t vsw_mblk_size2;
extern uint32_t vsw_mblk_size3;
extern uint32_t vsw_num_mblks1;
extern uint32_t vsw_num_mblks2;
extern uint32_t vsw_num_mblks3;

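/*
 * Lock ordering: a channel's callback, receive and transmit locks
 * are always acquired in the order cblock -> rxlock -> txlock, and
 * released in the reverse order.
 */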
#define	LDC_ENTER_LOCK(ldcp)	\
				mutex_enter(&((ldcp)->ldc_cblock));\
				mutex_enter(&((ldcp)->ldc_rxlock));\
				mutex_enter(&((ldcp)->ldc_txlock));
#define	LDC_EXIT_LOCK(ldcp)	\
				mutex_exit(&((ldcp)->ldc_txlock));\
				mutex_exit(&((ldcp)->ldc_rxlock));\
				mutex_exit(&((ldcp)->ldc_cblock));

/* supported versions */
static	ver_sup_t	vsw_versions[] = { {1, 0} };
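
/*
 * Note: vsw_supported_version() walks this array looking for an
 * exact or next-lower major match, so entries should be kept
 * sorted in decreasing major-version order.
 */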

/*
 * For the moment the state dump routines have their own
 * private flag.
 */
#define	DUMP_STATE	0

#if DUMP_STATE

#define	DUMP_TAG(tag) \
{			\
	D1(NULL, "DUMP_TAG: type 0x%llx", (tag).vio_msgtype); \
	D1(NULL, "DUMP_TAG: stype 0x%llx", (tag).vio_subtype);	\
	D1(NULL, "DUMP_TAG: senv 0x%llx", (tag).vio_subtype_env);	\
}

#define	DUMP_TAG_PTR(tag) \
{			\
	D1(NULL, "DUMP_TAG: type 0x%llx", (tag)->vio_msgtype); \
	D1(NULL, "DUMP_TAG: stype 0x%llx", (tag)->vio_subtype);	\
	D1(NULL, "DUMP_TAG: senv 0x%llx", (tag)->vio_subtype_env);	\
}

#define	DUMP_FLAGS(flags) dump_flags(flags);
#define	DISPLAY_STATE()	display_state()

#else

#define	DUMP_TAG(tag)
#define	DUMP_TAG_PTR(tag)
#define	DUMP_FLAGS(state)
#define	DISPLAY_STATE()

#endif	/* DUMP_STATE */

/*
 * Attach the specified port.
 *
 * Returns 0 on success, 1 on failure.
 */
int
vsw_port_attach(vsw_t *vswp, int p_instance, uint64_t *ldcids, int nids,
struct ether_addr *macaddr)
{
	vsw_port_list_t		*plist = &vswp->plist;
	vsw_port_t		*port, **prev_port;
	int			i;

	D1(vswp, "%s: enter : port %d", __func__, p_instance);

	/* port already exists? */
	READ_ENTER(&plist->lockrw);
	for (port = plist->head; port != NULL; port = port->p_next) {
		if (port->p_instance == p_instance) {
			DWARN(vswp, "%s: port instance %d already attached",
			    __func__, p_instance);
			RW_EXIT(&plist->lockrw);
			return (1);
		}
	}
	RW_EXIT(&plist->lockrw);

	port = kmem_zalloc(sizeof (vsw_port_t), KM_SLEEP);
	port->p_vswp = vswp;
	port->p_instance = p_instance;
	port->p_ldclist.num_ldcs = 0;
	port->p_ldclist.head = NULL;
	port->addr_set = VSW_ADDR_UNSET;

	rw_init(&port->p_ldclist.lockrw, NULL, RW_DRIVER, NULL);

	mutex_init(&port->tx_lock, NULL, MUTEX_DRIVER, NULL);
	mutex_init(&port->mca_lock, NULL, MUTEX_DRIVER, NULL);

	mutex_init(&port->state_lock, NULL, MUTEX_DRIVER, NULL);
	cv_init(&port->state_cv, NULL, CV_DRIVER, NULL);
	port->state = VSW_PORT_INIT;

	if (nids > VSW_PORT_MAX_LDCS) {
		D2(vswp, "%s: using first of %d ldc ids",
		    __func__, nids);
		nids = VSW_PORT_MAX_LDCS;
	}

	D2(vswp, "%s: %d nids", __func__, nids);
	for (i = 0; i < nids; i++) {
		D2(vswp, "%s: ldcid (%llx)", __func__, (uint64_t)ldcids[i]);
		if (vsw_ldc_attach(port, (uint64_t)ldcids[i]) != 0) {
			DERR(vswp, "%s: ldc_attach failed", __func__);

			rw_destroy(&port->p_ldclist.lockrw);

			cv_destroy(&port->state_cv);
			mutex_destroy(&port->state_lock);

			mutex_destroy(&port->tx_lock);
			mutex_destroy(&port->mca_lock);
			kmem_free(port, sizeof (vsw_port_t));
			return (1);
		}
	}

	ether_copy(macaddr, &port->p_macaddr);

	if (vswp->switching_setup_done == B_TRUE) {
		/*
		 * If the underlying physical device has been set up,
		 * program the mac address of this port in it.
		 * Otherwise, port macaddr will be set after the physical
		 * device is successfully set up by the timeout handler.
		 */
		mutex_enter(&vswp->hw_lock);
		(void) vsw_set_hw(vswp, port, VSW_VNETPORT);
		mutex_exit(&vswp->hw_lock);
	}

	WRITE_ENTER(&plist->lockrw);

	/* create the fdb entry for this port/mac address */
	(void) vsw_add_fdb(vswp, port);

	/* link it into the list of ports for this vsw instance */
	prev_port = (vsw_port_t **)(&plist->head);
	port->p_next = *prev_port;
	*prev_port = port;
	plist->num_ports++;

	RW_EXIT(&plist->lockrw);

	/*
	 * Initialise the port and any ldc's under it.
	 */
	(void) vsw_init_ldcs(port);

	D1(vswp, "%s: exit", __func__);
	return (0);
}

/*
 * Detach the specified port.
 *
 * Returns 0 on success, 1 on failure.
 */
int
vsw_port_detach(vsw_t *vswp, int p_instance)
{
	vsw_port_t	*port = NULL;
	vsw_port_list_t	*plist = &vswp->plist;

	D1(vswp, "%s: enter: port id %d", __func__, p_instance);

	WRITE_ENTER(&plist->lockrw);

	if ((port = vsw_lookup_port(vswp, p_instance)) == NULL) {
		RW_EXIT(&plist->lockrw);
		return (1);
	}

	if (vsw_plist_del_node(vswp, port)) {
		RW_EXIT(&plist->lockrw);
		return (1);
	}

	/* Remove the fdb entry for this port/mac address */
	(void) vsw_del_fdb(vswp, port);

	/* Remove any multicast addresses.. */
	vsw_del_mcst_port(port);

	/*
	 * No longer need to hold writer lock on port list now
	 * that we have unlinked the target port from the list.
	 */
	RW_EXIT(&plist->lockrw);

	/* Remove address if it was programmed into HW. */
	mutex_enter(&vswp->hw_lock);

	/*
	 * Port's address may not have been set in hardware. This could
	 * happen if the underlying physical device is not yet available and
	 * vsw_setup_switching_timeout() may be in progress.
	 * We remove its addr from hardware only if it has been set before.
	 */
	if (port->addr_set != VSW_ADDR_UNSET)
		(void) vsw_unset_hw(vswp, port, VSW_VNETPORT);

	if (vswp->recfg_reqd)
		vsw_reconfig_hw(vswp);

	mutex_exit(&vswp->hw_lock);

	if (vsw_port_delete(port)) {
		return (1);
	}

	D1(vswp, "%s: exit: p_instance(%d)", __func__, p_instance);
	return (0);
}

/*
 * Detach all active ports.
 *
 * Returns 0 on success, 1 on failure.
 */
int
vsw_detach_ports(vsw_t *vswp)
{
	vsw_port_list_t	*plist = &vswp->plist;
	vsw_port_t	*port = NULL;

	D1(vswp, "%s: enter", __func__);

	WRITE_ENTER(&plist->lockrw);

	while ((port = plist->head) != NULL) {
		if (vsw_plist_del_node(vswp, port)) {
			DERR(vswp, "%s: Error deleting port %d"
			    " from port list", __func__, port->p_instance);
			RW_EXIT(&plist->lockrw);
			return (1);
		}

		/* Remove address if it was programmed into HW. */
		mutex_enter(&vswp->hw_lock);
		(void) vsw_unset_hw(vswp, port, VSW_VNETPORT);
		mutex_exit(&vswp->hw_lock);

		/* Remove the fdb entry for this port/mac address */
		(void) vsw_del_fdb(vswp, port);

		/* Remove any multicast addresses.. */
		vsw_del_mcst_port(port);

		/*
		 * No longer need to hold the lock on the port list
		 * now that we have unlinked the target port from the
		 * list.
		 */
		RW_EXIT(&plist->lockrw);
		if (vsw_port_delete(port)) {
			DERR(vswp, "%s: Error deleting port %d",
			    __func__, port->p_instance);
			return (1);
		}
		WRITE_ENTER(&plist->lockrw);
	}
	RW_EXIT(&plist->lockrw);

	D1(vswp, "%s: exit", __func__);

	return (0);
}

/*
 * Delete the specified port.
 *
 * Returns 0 on success, 1 on failure.
 */
static int
vsw_port_delete(vsw_port_t *port)
{
	vsw_ldc_list_t	*ldcl;
	vsw_t		*vswp = port->p_vswp;

	D1(vswp, "%s: enter : port id %d", __func__, port->p_instance);

	(void) vsw_uninit_ldcs(port);

	/*
	 * Wait for any pending ctrl msg tasks which reference this
	 * port to finish.
	 */
	if (vsw_drain_port_taskq(port))
		return (1);

	/*
	 * Wait for port reference count to hit zero.
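	 * We poll, sleeping for VSW_PORT_REF_DELAY microseconds between
	 * checks.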
	 */
	while (port->ref_cnt != 0) {
		delay(drv_usectohz(VSW_PORT_REF_DELAY));
	}

	/*
	 * Wait for any active callbacks to finish
	 */
	if (vsw_drain_ldcs(port))
		return (1);

	ldcl = &port->p_ldclist;
	WRITE_ENTER(&ldcl->lockrw);
	while (ldcl->num_ldcs > 0) {
		if (vsw_ldc_detach(port, ldcl->head->ldc_id) != 0) {
			cmn_err(CE_WARN, "!vsw%d: unable to detach ldc %ld",
			    vswp->instance, ldcl->head->ldc_id);
			RW_EXIT(&ldcl->lockrw);
			return (1);
		}
	}
	RW_EXIT(&ldcl->lockrw);

	rw_destroy(&port->p_ldclist.lockrw);

	mutex_destroy(&port->mca_lock);
	mutex_destroy(&port->tx_lock);
	cv_destroy(&port->state_cv);
	mutex_destroy(&port->state_lock);

	kmem_free(port, sizeof (vsw_port_t));

	D1(vswp, "%s: exit", __func__);

	return (0);
}

/*
 * Attach a logical domain channel (ldc) under a specified port.
 *
 * Returns 0 on success, 1 on failure.
 */
static int
vsw_ldc_attach(vsw_port_t *port, uint64_t ldc_id)
{
	vsw_t		*vswp = port->p_vswp;
	vsw_ldc_list_t	*ldcl = &port->p_ldclist;
	vsw_ldc_t	*ldcp = NULL;
	ldc_attr_t	attr;
	ldc_status_t	istatus;
	int		status = DDI_FAILURE;
	int		rv;
	char		kname[MAXNAMELEN];
	enum		{ PROG_init = 0x0, PROG_mblks = 0x1,
			    PROG_callback = 0x2, PROG_rx_thread = 0x4,
			    PROG_tx_thread = 0x8}
			progress;

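	/*
	 * 'progress' records which resources have been allocated so
	 * far, so that the failure path at ldc_attach_fail only tears
	 * down what was actually set up.
	 */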
	progress = PROG_init;

	D1(vswp, "%s: enter", __func__);

	ldcp = kmem_zalloc(sizeof (vsw_ldc_t), KM_NOSLEEP);
	if (ldcp == NULL) {
		DERR(vswp, "%s: kmem_zalloc failed", __func__);
		return (1);
	}
	ldcp->ldc_id = ldc_id;

	/* Allocate pools of receive mblks */
	rv = vio_init_multipools(&ldcp->vmp, VSW_NUM_VMPOOLS,
	    vsw_mblk_size1, vsw_mblk_size2, vsw_mblk_size3,
	    vsw_num_mblks1, vsw_num_mblks2, vsw_num_mblks3);
	if (rv) {
		DWARN(vswp, "%s: unable to create free mblk pools for"
		    " channel %ld (rv %d)", __func__, ldc_id, rv);
		kmem_free(ldcp, sizeof (vsw_ldc_t));
		return (1);
	}

	progress |= PROG_mblks;

	mutex_init(&ldcp->ldc_txlock, NULL, MUTEX_DRIVER, NULL);
	mutex_init(&ldcp->ldc_rxlock, NULL, MUTEX_DRIVER, NULL);
	mutex_init(&ldcp->ldc_cblock, NULL, MUTEX_DRIVER, NULL);
	mutex_init(&ldcp->drain_cv_lock, NULL, MUTEX_DRIVER, NULL);
	cv_init(&ldcp->drain_cv, NULL, CV_DRIVER, NULL);
	rw_init(&ldcp->lane_in.dlistrw, NULL, RW_DRIVER, NULL);
	rw_init(&ldcp->lane_out.dlistrw, NULL, RW_DRIVER, NULL);

	/* required for handshake with peer */
	ldcp->local_session = (uint64_t)ddi_get_lbolt();
	ldcp->peer_session = 0;
	ldcp->session_status = 0;
	ldcp->hss_id = 1;	/* Initial handshake session id */

	/* only set for outbound lane, inbound set by peer */
	vsw_set_lane_attr(vswp, &ldcp->lane_out);

	attr.devclass = LDC_DEV_NT_SVC;
	attr.instance = ddi_get_instance(vswp->dip);
	attr.mode = LDC_MODE_UNRELIABLE;
	attr.mtu = VSW_LDC_MTU;
	status = ldc_init(ldc_id, &attr, &ldcp->ldc_handle);
	if (status != 0) {
		DERR(vswp, "%s(%lld): ldc_init failed, rv (%d)",
		    __func__, ldc_id, status);
		goto ldc_attach_fail;
	}

	if (vsw_ldc_rxthr_enabled) {
		ldcp->rx_thr_flags = 0;

		mutex_init(&ldcp->rx_thr_lock, NULL, MUTEX_DRIVER, NULL);
		cv_init(&ldcp->rx_thr_cv, NULL, CV_DRIVER, NULL);
		ldcp->rx_thread = thread_create(NULL, 2 * DEFAULTSTKSZ,
		    vsw_ldc_rx_worker, ldcp, 0, &p0, TS_RUN, maxclsyspri);

		progress |= PROG_rx_thread;
		if (ldcp->rx_thread == NULL) {
			DWARN(vswp, "%s(%lld): Failed to create worker thread",
			    __func__, ldc_id);
			goto ldc_attach_fail;
		}
	}

	if (vsw_ldc_txthr_enabled) {
		ldcp->tx_thr_flags = 0;
		ldcp->tx_mhead = ldcp->tx_mtail = NULL;

		mutex_init(&ldcp->tx_thr_lock, NULL, MUTEX_DRIVER, NULL);
		cv_init(&ldcp->tx_thr_cv, NULL, CV_DRIVER, NULL);
		ldcp->tx_thread = thread_create(NULL, 2 * DEFAULTSTKSZ,
		    vsw_ldc_tx_worker, ldcp, 0, &p0, TS_RUN, maxclsyspri);

		progress |= PROG_tx_thread;
		if (ldcp->tx_thread == NULL) {
			DWARN(vswp, "%s(%lld): Failed to create worker thread",
			    __func__, ldc_id);
			goto ldc_attach_fail;
		}
	}

	status = ldc_reg_callback(ldcp->ldc_handle, vsw_ldc_cb, (caddr_t)ldcp);
	if (status != 0) {
		DERR(vswp, "%s(%lld): ldc_reg_callback failed, rv (%d)",
		    __func__, ldc_id, status);
		(void) ldc_fini(ldcp->ldc_handle);
		goto ldc_attach_fail;
	}

	progress |= PROG_callback;

	mutex_init(&ldcp->status_lock, NULL, MUTEX_DRIVER, NULL);

	if (ldc_status(ldcp->ldc_handle, &istatus) != 0) {
		DERR(vswp, "%s: ldc_status failed", __func__);
		mutex_destroy(&ldcp->status_lock);
		goto ldc_attach_fail;
	}

	ldcp->ldc_status = istatus;
	ldcp->ldc_port = port;
	ldcp->ldc_vswp = vswp;

	(void) sprintf(kname, "%sldc0x%lx", DRV_NAME, ldcp->ldc_id);
	ldcp->ksp = vgen_setup_kstats(DRV_NAME, vswp->instance,
	    kname, &ldcp->ldc_stats);
	if (ldcp->ksp == NULL) {
		DERR(vswp, "%s: kstats setup failed", __func__);
		goto ldc_attach_fail;
	}

	/* link it into the list of channels for this port */
	WRITE_ENTER(&ldcl->lockrw);
	ldcp->ldc_next = ldcl->head;
	ldcl->head = ldcp;
	ldcl->num_ldcs++;
	RW_EXIT(&ldcl->lockrw);

	D1(vswp, "%s: exit", __func__);
	return (0);

ldc_attach_fail:

	if (progress & PROG_callback) {
		(void) ldc_unreg_callback(ldcp->ldc_handle);
	}

	if (progress & PROG_rx_thread) {
		if (ldcp->rx_thread != NULL) {
			vsw_stop_rx_thread(ldcp);
		}
		mutex_destroy(&ldcp->rx_thr_lock);
		cv_destroy(&ldcp->rx_thr_cv);
	}

	if (progress & PROG_tx_thread) {
		if (ldcp->tx_thread != NULL) {
			vsw_stop_tx_thread(ldcp);
		}
		mutex_destroy(&ldcp->tx_thr_lock);
		cv_destroy(&ldcp->tx_thr_cv);
	}
	if (ldcp->ksp != NULL) {
		vgen_destroy_kstats(ldcp->ksp);
	}
	mutex_destroy(&ldcp->ldc_txlock);
	mutex_destroy(&ldcp->ldc_rxlock);
	mutex_destroy(&ldcp->ldc_cblock);
	mutex_destroy(&ldcp->drain_cv_lock);

	cv_destroy(&ldcp->drain_cv);

	rw_destroy(&ldcp->lane_in.dlistrw);
	rw_destroy(&ldcp->lane_out.dlistrw);

	if (progress & PROG_mblks) {
		vio_destroy_multipools(&ldcp->vmp, &vswp->rxh);
	}
	kmem_free(ldcp, sizeof (vsw_ldc_t));

	return (1);
}

/*
 * Detach a logical domain channel (ldc) belonging to a
 * particular port.
 *
 * Returns 0 on success, 1 on failure.
 */
static int
vsw_ldc_detach(vsw_port_t *port, uint64_t ldc_id)
{
	vsw_t		*vswp = port->p_vswp;
	vsw_ldc_t	*ldcp, *prev_ldcp;
	vsw_ldc_list_t	*ldcl = &port->p_ldclist;
	int		rv;

	prev_ldcp = ldcl->head;
	for (; (ldcp = prev_ldcp) != NULL; prev_ldcp = ldcp->ldc_next) {
		if (ldcp->ldc_id == ldc_id) {
			break;
		}
	}

	/* specified ldc id not found */
	if (ldcp == NULL) {
		DERR(vswp, "%s: ldcp = NULL", __func__);
		return (1);
	}

	D2(vswp, "%s: detaching channel %lld", __func__, ldcp->ldc_id);

	/* Stop the receive thread */
	if (ldcp->rx_thread != NULL) {
		vsw_stop_rx_thread(ldcp);
		mutex_destroy(&ldcp->rx_thr_lock);
		cv_destroy(&ldcp->rx_thr_cv);
	}

	/* Stop the tx thread */
	if (ldcp->tx_thread != NULL) {
		vsw_stop_tx_thread(ldcp);
		mutex_destroy(&ldcp->tx_thr_lock);
		cv_destroy(&ldcp->tx_thr_cv);
		if (ldcp->tx_mhead != NULL) {
			freemsgchain(ldcp->tx_mhead);
			ldcp->tx_mhead = ldcp->tx_mtail = NULL;
		}
	}

	/* Destroy kstats */
	vgen_destroy_kstats(ldcp->ksp);

	/*
	 * Before we can close the channel we must release any mapped
	 * resources (e.g. drings).
	 */
	vsw_free_lane_resources(ldcp, INBOUND);
	vsw_free_lane_resources(ldcp, OUTBOUND);

	/*
	 * If the close fails we are in serious trouble, as we won't
	 * be able to delete the parent port.
	 */
	if ((rv = ldc_close(ldcp->ldc_handle)) != 0) {
		DERR(vswp, "%s: error %d closing channel %lld",
		    __func__, rv, ldcp->ldc_id);
		return (1);
	}

	(void) ldc_fini(ldcp->ldc_handle);

	ldcp->ldc_status = LDC_INIT;
	ldcp->ldc_handle = NULL;
	ldcp->ldc_vswp = NULL;

	/*
	 * Most likely some mblks are still in use and
	 * have not been returned to the pool. These mblks are
	 * added to the pool that is maintained in the device instance.
	 * Another attempt will be made to destroy the pool
	 * when the device detaches.
	 */
	vio_destroy_multipools(&ldcp->vmp, &vswp->rxh);

	/* unlink it from the list (only the head ldc is ever detached) */
	ldcl->head = ldcp->ldc_next;
	ldcl->num_ldcs--;

	mutex_destroy(&ldcp->ldc_txlock);
	mutex_destroy(&ldcp->ldc_rxlock);
	mutex_destroy(&ldcp->ldc_cblock);
	cv_destroy(&ldcp->drain_cv);
	mutex_destroy(&ldcp->drain_cv_lock);
	mutex_destroy(&ldcp->status_lock);
	rw_destroy(&ldcp->lane_in.dlistrw);
	rw_destroy(&ldcp->lane_out.dlistrw);

	kmem_free(ldcp, sizeof (vsw_ldc_t));

	return (0);
}

/*
 * Open and attempt to bring up the channel. Note that channel
 * can only be brought up if peer has also opened channel.
 *
 * Returns 0 if can open and bring up channel, otherwise
 * returns 1.
 */
static int
vsw_ldc_init(vsw_ldc_t *ldcp)
{
	vsw_t		*vswp = ldcp->ldc_vswp;
	ldc_status_t	istatus = 0;
	int		rv;

	D1(vswp, "%s: enter", __func__);

	LDC_ENTER_LOCK(ldcp);

	/* don't start at 0 in case clients don't like that */
	ldcp->next_ident = 1;

	rv = ldc_open(ldcp->ldc_handle);
	if (rv != 0) {
		DERR(vswp, "%s: ldc_open failed: id(%lld) rv(%d)",
		    __func__, ldcp->ldc_id, rv);
		LDC_EXIT_LOCK(ldcp);
		return (1);
	}

	if (ldc_status(ldcp->ldc_handle, &istatus) != 0) {
		DERR(vswp, "%s: unable to get status", __func__);
		LDC_EXIT_LOCK(ldcp);
		return (1);

	} else if (istatus != LDC_OPEN && istatus != LDC_READY) {
		DERR(vswp, "%s: id (%lld) status(%d) is not OPEN/READY",
		    __func__, ldcp->ldc_id, istatus);
		LDC_EXIT_LOCK(ldcp);
		return (1);
	}

	mutex_enter(&ldcp->status_lock);
	ldcp->ldc_status = istatus;
	mutex_exit(&ldcp->status_lock);

	rv = ldc_up(ldcp->ldc_handle);
	if (rv != 0) {
		/*
		 * Not a fatal error for ldc_up() to fail, as peer
		 * end point may simply not be ready yet.
		 */
		D2(vswp, "%s: ldc_up err id(%lld) rv(%d)", __func__,
		    ldcp->ldc_id, rv);
		LDC_EXIT_LOCK(ldcp);
		return (1);
	}

	/*
	 * ldc_up() call is non-blocking so need to explicitly
	 * check channel status to see if in fact the channel
	 * is UP.
	 */
	mutex_enter(&ldcp->status_lock);
	if (ldc_status(ldcp->ldc_handle, &ldcp->ldc_status) != 0) {
		DERR(vswp, "%s: unable to get status", __func__);
		mutex_exit(&ldcp->status_lock);
		LDC_EXIT_LOCK(ldcp);
		return (1);

	}

	if (ldcp->ldc_status == LDC_UP) {
		D2(vswp, "%s: channel %ld now UP (%ld)", __func__,
		    ldcp->ldc_id, istatus);
		mutex_exit(&ldcp->status_lock);
		LDC_EXIT_LOCK(ldcp);

		vsw_process_conn_evt(ldcp, VSW_CONN_UP);
		return (0);
	}

	mutex_exit(&ldcp->status_lock);
	LDC_EXIT_LOCK(ldcp);

	D1(vswp, "%s: exit", __func__);
	return (0);
}

/* disable callbacks on the channel */
static int
vsw_ldc_uninit(vsw_ldc_t *ldcp)
{
	vsw_t	*vswp = ldcp->ldc_vswp;
	int	rv;

	D1(vswp, "vsw_ldc_uninit: enter: id(%lx)\n", ldcp->ldc_id);

	LDC_ENTER_LOCK(ldcp);

	rv = ldc_set_cb_mode(ldcp->ldc_handle, LDC_CB_DISABLE);
	if (rv != 0) {
		DERR(vswp, "vsw_ldc_uninit(%lld): error disabling "
		    "interrupts (rv = %d)\n", ldcp->ldc_id, rv);
		LDC_EXIT_LOCK(ldcp);
		return (1);
	}

	mutex_enter(&ldcp->status_lock);
	ldcp->ldc_status = LDC_INIT;
	mutex_exit(&ldcp->status_lock);

	LDC_EXIT_LOCK(ldcp);

	D1(vswp, "vsw_ldc_uninit: exit: id(%lx)", ldcp->ldc_id);

	return (0);
}

static int
vsw_init_ldcs(vsw_port_t *port)
{
	vsw_ldc_list_t	*ldcl = &port->p_ldclist;
	vsw_ldc_t	*ldcp;

	READ_ENTER(&ldcl->lockrw);
	ldcp = ldcl->head;
	for (; ldcp != NULL; ldcp = ldcp->ldc_next) {
		(void) vsw_ldc_init(ldcp);
	}
	RW_EXIT(&ldcl->lockrw);

	return (0);
}

static int
vsw_uninit_ldcs(vsw_port_t *port)
{
	vsw_ldc_list_t	*ldcl = &port->p_ldclist;
	vsw_ldc_t	*ldcp;

	D1(NULL, "vsw_uninit_ldcs: enter\n");

	READ_ENTER(&ldcl->lockrw);
	ldcp = ldcl->head;
	for (; ldcp != NULL; ldcp = ldcp->ldc_next) {
		(void) vsw_ldc_uninit(ldcp);
	}
	RW_EXIT(&ldcl->lockrw);

	D1(NULL, "vsw_uninit_ldcs: exit\n");

	return (0);
}

/*
 * Wait until the callback(s) associated with the ldcs under the specified
 * port have completed.
 *
 * Prior to this function being invoked each channel under this port
 * should have been quiesced via ldc_set_cb_mode(DISABLE).
 *
 * A short explanation of what we are doing below..
 *
 * The simplest approach would be to have a reference counter in
 * the ldc structure which is incremented/decremented by the callbacks as
 * they use the channel. The drain function could then simply disable any
 * further callbacks and do a cv_wait for the ref to hit zero. Unfortunately
 * there is a tiny window here - before the callback is able to get the lock
 * on the channel it is interrupted and this function gets to execute. It
 * sees that the ref count is zero and believes it is free to delete the
 * associated data structures.
 *
 * We get around this by taking advantage of the fact that before the ldc
 * framework invokes a callback it sets a flag to indicate that there is a
 * callback active (or about to become active). If we attempt to
 * unregister a callback while this active flag is set then the unregister
 * will fail with EWOULDBLOCK.
 *
 * If the unregister fails we do a cv_timedwait. We will either be signaled
 * by the callback as it is exiting (note we have to wait a short period to
 * allow the callback to return fully to the ldc framework and it to clear
 * the active flag), or by the timer expiring. In either case we again attempt
 * the unregister. We repeat this until we can successfully unregister the
 * callback.
 *
 * The reason we use a cv_timedwait rather than a simple cv_wait is to catch
 * the case where the callback has finished but the ldc framework has not yet
 * cleared the active flag. In this case we would never get a cv_signal.
 */
static int
vsw_drain_ldcs(vsw_port_t *port)
{
	vsw_ldc_list_t	*ldcl = &port->p_ldclist;
	vsw_ldc_t	*ldcp;
	vsw_t		*vswp = port->p_vswp;

	D1(vswp, "%s: enter", __func__);

	READ_ENTER(&ldcl->lockrw);

	ldcp = ldcl->head;

	for (; ldcp != NULL; ldcp = ldcp->ldc_next) {
		/*
		 * If we can unregister the channel callback then we
		 * know that there is no callback either running or
		 * scheduled to run for this channel so move on to next
		 * channel in the list.
		 */
		mutex_enter(&ldcp->drain_cv_lock);

		/* prompt active callbacks to quit */
		ldcp->drain_state = VSW_LDC_DRAINING;

		if ((ldc_unreg_callback(ldcp->ldc_handle)) == 0) {
			D2(vswp, "%s: unreg callback for chan %ld", __func__,
			    ldcp->ldc_id);
			mutex_exit(&ldcp->drain_cv_lock);
			continue;
		} else {
			/*
			 * If we end up here we know that either 1) a callback
			 * is currently executing, 2) is about to start (i.e.
			 * the ldc framework has set the active flag but
			 * has not actually invoked the callback yet), or 3)
			 * has finished and has returned to the ldc framework
			 * but the ldc framework has not yet cleared the
			 * active bit.
			 *
			 * Wait for it to finish.
			 */
			while (ldc_unreg_callback(ldcp->ldc_handle)
			    == EWOULDBLOCK)
				(void) cv_timedwait(&ldcp->drain_cv,
				    &ldcp->drain_cv_lock, lbolt + hz);

			mutex_exit(&ldcp->drain_cv_lock);
			D2(vswp, "%s: unreg callback for chan %ld after "
			    "timeout", __func__, ldcp->ldc_id);
		}
	}
	RW_EXIT(&ldcl->lockrw);

	D1(vswp, "%s: exit", __func__);
	return (0);
}

/*
 * Wait until all tasks which reference this port have completed.
 *
 * Prior to this function being invoked each channel under this port
 * should have been quiesced via ldc_set_cb_mode(DISABLE).
 */
static int
vsw_drain_port_taskq(vsw_port_t *port)
{
	vsw_t		*vswp = port->p_vswp;

	D1(vswp, "%s: enter", __func__);

	/*
	 * Mark the port as in the process of being detached, and
	 * dispatch a marker task to the queue so we know when all
	 * relevant tasks have completed.
	 */
	mutex_enter(&port->state_lock);
	port->state = VSW_PORT_DETACHING;

	if ((vswp->taskq_p == NULL) ||
	    (ddi_taskq_dispatch(vswp->taskq_p, vsw_marker_task,
	    port, DDI_NOSLEEP) != DDI_SUCCESS)) {
		DERR(vswp, "%s: unable to dispatch marker task",
		    __func__);
		mutex_exit(&port->state_lock);
		return (1);
	}

	/*
	 * Wait for the marker task to finish.
	 */
	while (port->state != VSW_PORT_DETACHABLE)
		cv_wait(&port->state_cv, &port->state_lock);

	mutex_exit(&port->state_lock);

	D1(vswp, "%s: exit", __func__);

	return (0);
}

static void
vsw_marker_task(void *arg)
{
	vsw_port_t	*port = arg;
	vsw_t		*vswp = port->p_vswp;

	D1(vswp, "%s: enter", __func__);

	mutex_enter(&port->state_lock);

	/*
	 * No further tasks should be dispatched which reference
	 * this port so ok to mark it as safe to detach.
	 */
	port->state = VSW_PORT_DETACHABLE;

	cv_signal(&port->state_cv);

	mutex_exit(&port->state_lock);

	D1(vswp, "%s: exit", __func__);
}

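/*
 * Look up a port by instance number. Returns a pointer into the
 * port list, so the caller is expected to hold the port list lock
 * across the call (and for as long as the returned pointer is used).
 */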
vsw_port_t *
vsw_lookup_port(vsw_t *vswp, int p_instance)
{
	vsw_port_list_t	*plist = &vswp->plist;
	vsw_port_t	*port;

	for (port = plist->head; port != NULL; port = port->p_next) {
		if (port->p_instance == p_instance) {
			D2(vswp, "vsw_lookup_port: found p_instance\n");
			return (port);
		}
	}

	return (NULL);
}

/*
 * Search for and remove the specified port from the port
 * list. Returns 0 if able to locate and remove port, otherwise
 * returns 1.
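 *
 * The caller is expected to hold the port list's write lock.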
 */
static int
vsw_plist_del_node(vsw_t *vswp, vsw_port_t *port)
{
	vsw_port_list_t	*plist = &vswp->plist;
	vsw_port_t	*curr_p, *prev_p;

	if (plist->head == NULL)
		return (1);

	curr_p = prev_p = plist->head;

	while (curr_p != NULL) {
		if (curr_p == port) {
			if (prev_p == curr_p) {
				plist->head = curr_p->p_next;
			} else {
				prev_p->p_next = curr_p->p_next;
			}
			plist->num_ports--;
			break;
		} else {
			prev_p = curr_p;
			curr_p = curr_p->p_next;
		}
	}
	return (0);
}

/*
 * Interrupt handler for ldc messages.
 */
static uint_t
vsw_ldc_cb(uint64_t event, caddr_t arg)
{
	vsw_ldc_t	*ldcp = (vsw_ldc_t *)arg;
	vsw_t		*vswp = ldcp->ldc_vswp;

	D1(vswp, "%s: enter: ldcid (%lld)\n", __func__, ldcp->ldc_id);

	mutex_enter(&ldcp->ldc_cblock);
	ldcp->ldc_stats.callbacks++;

	mutex_enter(&ldcp->status_lock);
	if ((ldcp->ldc_status == LDC_INIT) || (ldcp->ldc_handle == NULL)) {
		mutex_exit(&ldcp->status_lock);
		mutex_exit(&ldcp->ldc_cblock);
		return (LDC_SUCCESS);
	}
	mutex_exit(&ldcp->status_lock);

	if (event & LDC_EVT_UP) {
		/*
		 * Channel has come up.
		 */
		D2(vswp, "%s: id(%ld) event(%llx) UP: status(%ld)",
		    __func__, ldcp->ldc_id, event, ldcp->ldc_status);

		vsw_process_conn_evt(ldcp, VSW_CONN_UP);

		ASSERT((event & (LDC_EVT_RESET | LDC_EVT_DOWN)) == 0);
	}

	if (event & LDC_EVT_READ) {
		/*
		 * Data available for reading.
		 */
		D2(vswp, "%s: id(%ld) event(%llx) data READ",
		    __func__, ldcp->ldc_id, event);

		if (ldcp->rx_thread != NULL) {
			/*
			 * If the receive thread is enabled, then
			 * wakeup the receive thread to process the
			 * LDC messages.
			 */
			mutex_exit(&ldcp->ldc_cblock);
			mutex_enter(&ldcp->rx_thr_lock);
			if (!(ldcp->rx_thr_flags & VSW_WTHR_DATARCVD)) {
				ldcp->rx_thr_flags |= VSW_WTHR_DATARCVD;
				cv_signal(&ldcp->rx_thr_cv);
			}
			mutex_exit(&ldcp->rx_thr_lock);
			mutex_enter(&ldcp->ldc_cblock);
		} else {
			vsw_process_pkt(ldcp);
		}

		ASSERT((event & (LDC_EVT_RESET | LDC_EVT_DOWN)) == 0);

		goto vsw_cb_exit;
	}

	if (event & (LDC_EVT_DOWN | LDC_EVT_RESET)) {
		D2(vswp, "%s: id(%ld) event (%lx) DOWN/RESET: status(%ld)",
		    __func__, ldcp->ldc_id, event, ldcp->ldc_status);

		vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
	}

	/*
	 * Catch either LDC_EVT_WRITE which we don't support or any
	 * unknown event.
	 */
	if (event &
	    ~(LDC_EVT_UP | LDC_EVT_RESET | LDC_EVT_DOWN | LDC_EVT_READ)) {
		DERR(vswp, "%s: id(%ld) Unexpected event=(%llx) status(%ld)",
		    __func__, ldcp->ldc_id, event, ldcp->ldc_status);
	}

vsw_cb_exit:
	mutex_exit(&ldcp->ldc_cblock);

	/*
	 * Let the drain function know we are finishing if it
	 * is waiting.
	 */
	mutex_enter(&ldcp->drain_cv_lock);
	if (ldcp->drain_state == VSW_LDC_DRAINING)
		cv_signal(&ldcp->drain_cv);
	mutex_exit(&ldcp->drain_cv_lock);

	return (LDC_SUCCESS);
}

/*
 * Reinitialise data structures associated with the channel.
 */
static void
vsw_ldc_reinit(vsw_ldc_t *ldcp)
{
	vsw_t		*vswp = ldcp->ldc_vswp;
	vsw_port_t	*port;
	vsw_ldc_list_t	*ldcl;

	D1(vswp, "%s: enter", __func__);

	port = ldcp->ldc_port;
	ldcl = &port->p_ldclist;

	READ_ENTER(&ldcl->lockrw);

	D2(vswp, "%s: in 0x%llx : out 0x%llx", __func__,
	    ldcp->lane_in.lstate, ldcp->lane_out.lstate);

	vsw_free_lane_resources(ldcp, INBOUND);
	vsw_free_lane_resources(ldcp, OUTBOUND);
	RW_EXIT(&ldcl->lockrw);

	ldcp->lane_in.lstate = 0;
	ldcp->lane_out.lstate = 0;

	/*
	 * Remove parent port from any multicast groups
	 * it may have registered with. Client must resend
	 * multicast add command after handshake completes.
	 */
	(void) vsw_del_fdb(vswp, port);

	vsw_del_mcst_port(port);

	ldcp->peer_session = 0;
	ldcp->session_status = 0;
	ldcp->hcnt = 0;
	ldcp->hphase = VSW_MILESTONE0;
	ldcp->tx_failures = 0;

	D1(vswp, "%s: exit", __func__);
}

/*
 * Process a connection event.
 *
 * Note - care must be taken to ensure that this function is
 * not called with the dlistrw lock held.
 */
static void
vsw_process_conn_evt(vsw_ldc_t *ldcp, uint16_t evt)
{
	vsw_t		*vswp = ldcp->ldc_vswp;
	vsw_conn_evt_t	*conn = NULL;

	D1(vswp, "%s: enter", __func__);

	/*
	 * Check if either a reset or restart event is pending
	 * or in progress. If so just return.
	 *
	 * A VSW_CONN_RESET event originates either with a LDC_RESET_EVT
	 * being received by the callback handler, or a ECONNRESET error
	 * code being returned from a ldc_read() or ldc_write() call.
	 *
	 * A VSW_CONN_RESTART event occurs when some error checking code
	 * decides that there is a problem with data from the channel,
	 * and that the handshake should be restarted.
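	 *
	 * ldstub() atomically sets reset_active and returns its
	 * previous value, so only the first of any concurrent
	 * reset/restart events makes it past this check.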
	 */
	if (((evt == VSW_CONN_RESET) || (evt == VSW_CONN_RESTART)) &&
	    (ldstub((uint8_t *)&ldcp->reset_active)))
		return;

	/*
	 * If it is an LDC_UP event we first check the recorded
	 * state of the channel. If this is UP then we know that
	 * the channel moving to the UP state has already been dealt
	 * with and don't need to dispatch a new task.
	 *
	 * The reason for this check is that when we do a ldc_up(),
	 * depending on the state of the peer, we may or may not get
	 * a LDC_UP event. As we can't depend on getting a LDC_UP evt
	 * every time we do ldc_up() we explicitly check the channel
	 * status to see has it come up (ldc_up() is asynch and will
	 * complete at some undefined time), and take the appropriate
	 * action.
	 *
	 * The flip side of this is that we may get a LDC_UP event
	 * when we have already seen that the channel is up and have
	 * dealt with that.
	 */
	mutex_enter(&ldcp->status_lock);
	if (evt == VSW_CONN_UP) {
		if ((ldcp->ldc_status == LDC_UP) || (ldcp->reset_active != 0)) {
			mutex_exit(&ldcp->status_lock);
			return;
		}
	}
	mutex_exit(&ldcp->status_lock);

	/*
	 * The transaction group id allows us to identify and discard
	 * any tasks which are still pending on the taskq and refer
	 * to the handshake session we are about to restart or reset.
	 * These stale messages no longer have any real meaning.
	 */
	(void) atomic_inc_32(&ldcp->hss_id);

	ASSERT(vswp->taskq_p != NULL);

	if ((conn = kmem_zalloc(sizeof (vsw_conn_evt_t), KM_NOSLEEP)) == NULL) {
		cmn_err(CE_WARN, "!vsw%d: unable to allocate memory for"
		    " connection event", vswp->instance);
		goto err_exit;
	}

	conn->evt = evt;
	conn->ldcp = ldcp;

	if (ddi_taskq_dispatch(vswp->taskq_p, vsw_conn_task, conn,
	    DDI_NOSLEEP) != DDI_SUCCESS) {
		cmn_err(CE_WARN, "!vsw%d: Can't dispatch connection task",
		    vswp->instance);

		kmem_free(conn, sizeof (vsw_conn_evt_t));
		goto err_exit;
	}

	D1(vswp, "%s: exit", __func__);
	return;

err_exit:
	/*
	 * We have most likely failed due to a memory shortage. Clear the
	 * flag so that future requests will at least be attempted and
	 * will hopefully succeed.
	 */
	if ((evt == VSW_CONN_RESET) || (evt == VSW_CONN_RESTART))
		ldcp->reset_active = 0;
}

/*
 * Deal with events relating to a connection. Invoked from a taskq.
 */
static void
vsw_conn_task(void *arg)
{
	vsw_conn_evt_t	*conn = (vsw_conn_evt_t *)arg;
	vsw_ldc_t	*ldcp = NULL;
	vsw_t		*vswp = NULL;
	uint16_t	evt;
	ldc_status_t	curr_status;

	ldcp = conn->ldcp;
	evt = conn->evt;
	vswp = ldcp->ldc_vswp;

	D1(vswp, "%s: enter", __func__);

	/* can safely free now that we have copied out the data */
	kmem_free(conn, sizeof (vsw_conn_evt_t));

	mutex_enter(&ldcp->status_lock);
	if (ldc_status(ldcp->ldc_handle, &curr_status) != 0) {
		cmn_err(CE_WARN, "!vsw%d: Unable to read status of "
		    "channel %ld", vswp->instance, ldcp->ldc_id);
		mutex_exit(&ldcp->status_lock);
		return;
	}

	/*
	 * If we wish to restart the handshake on this channel, then if
	 * the channel is UP we bring it DOWN to flush the underlying
	 * ldc queue.
	 */
	if ((evt == VSW_CONN_RESTART) && (curr_status == LDC_UP))
		(void) ldc_down(ldcp->ldc_handle);

	/*
	 * re-init all the associated data structures.
	 */
	vsw_ldc_reinit(ldcp);

	/*
	 * Bring the channel back up (note it does no harm to
	 * do this even if the channel is already UP; it just
	 * becomes effectively a no-op).
	 */
	(void) ldc_up(ldcp->ldc_handle);

	/*
	 * Check if channel is now UP. This will only happen if
	 * peer has also done a ldc_up().
	 */
	if (ldc_status(ldcp->ldc_handle, &curr_status) != 0) {
		cmn_err(CE_WARN, "!vsw%d: Unable to read status of "
		    "channel %ld", vswp->instance, ldcp->ldc_id);
		mutex_exit(&ldcp->status_lock);
		return;
	}

	ldcp->ldc_status = curr_status;

	/* channel UP so restart handshake by sending version info */
	if (curr_status == LDC_UP) {
		if (ldcp->hcnt++ > vsw_num_handshakes) {
			cmn_err(CE_WARN, "!vsw%d: exceeded number of permitted"
			    " handshake attempts (%d) on channel %ld",
			    vswp->instance, ldcp->hcnt, ldcp->ldc_id);
			mutex_exit(&ldcp->status_lock);
			return;
		}

		if (ddi_taskq_dispatch(vswp->taskq_p, vsw_send_ver, ldcp,
		    DDI_NOSLEEP) != DDI_SUCCESS) {
			cmn_err(CE_WARN, "!vsw%d: Can't dispatch version task",
			    vswp->instance);

			/*
			 * Don't count as valid restart attempt if couldn't
			 * send version msg.
			 */
			if (ldcp->hcnt > 0)
				ldcp->hcnt--;
		}
	}

	/*
	 * Mark that the process is complete by clearing the flag.
	 *
	 * Note it is possible that the taskq dispatch above may have failed,
	 * most likely due to memory shortage. We still clear the flag so
	 * that future attempts will at least be made and will hopefully
	 * succeed.
	 */
	if ((evt == VSW_CONN_RESET) || (evt == VSW_CONN_RESTART))
		ldcp->reset_active = 0;

	mutex_exit(&ldcp->status_lock);

	D1(vswp, "%s: exit", __func__);
}

/*
 * Returns 0 if it was legal for the event signified by flag to have
 * occurred at the time it did. Otherwise returns 1.
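 *
 * As a side effect, receipt of a valid ACK or NACK clears the
 * corresponding *_INFO_SENT bit in the lane state.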
 */
int
vsw_check_flag(vsw_ldc_t *ldcp, int dir, uint64_t flag)
{
	vsw_t		*vswp = ldcp->ldc_vswp;
	uint64_t	state;
	uint64_t	phase;

	if (dir == INBOUND)
		state = ldcp->lane_in.lstate;
	else
		state = ldcp->lane_out.lstate;

	phase = ldcp->hphase;

	switch (flag) {
	case VSW_VER_INFO_RECV:
		if (phase > VSW_MILESTONE0) {
			DERR(vswp, "vsw_check_flag (%d): VER_INFO_RECV"
			    " when in state %d\n", ldcp->ldc_id, phase);
			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
			return (1);
		}
		break;

	case VSW_VER_ACK_RECV:
	case VSW_VER_NACK_RECV:
		if (!(state & VSW_VER_INFO_SENT)) {
			DERR(vswp, "vsw_check_flag (%d): spurious VER_ACK or "
			    "VER_NACK when in state %d\n", ldcp->ldc_id, phase);
			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
			return (1);
		} else
			state &= ~VSW_VER_INFO_SENT;
		break;

	case VSW_ATTR_INFO_RECV:
		if ((phase < VSW_MILESTONE1) || (phase >= VSW_MILESTONE2)) {
			DERR(vswp, "vsw_check_flag (%d): ATTR_INFO_RECV"
			    " when in state %d\n", ldcp->ldc_id, phase);
			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
			return (1);
		}
		break;

	case VSW_ATTR_ACK_RECV:
	case VSW_ATTR_NACK_RECV:
		if (!(state & VSW_ATTR_INFO_SENT)) {
			DERR(vswp, "vsw_check_flag (%d): spurious ATTR_ACK"
			    " or ATTR_NACK when in state %d\n",
			    ldcp->ldc_id, phase);
			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
			return (1);
		} else
			state &= ~VSW_ATTR_INFO_SENT;
		break;

	case VSW_DRING_INFO_RECV:
		if (phase < VSW_MILESTONE1) {
			DERR(vswp, "vsw_check_flag (%d): DRING_INFO_RECV"
			    " when in state %d\n", ldcp->ldc_id, phase);
			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
			return (1);
		}
		break;

	case VSW_DRING_ACK_RECV:
	case VSW_DRING_NACK_RECV:
		if (!(state & VSW_DRING_INFO_SENT)) {
			DERR(vswp, "vsw_check_flag (%d): spurious DRING_ACK "
			    " or DRING_NACK when in state %d\n",
			    ldcp->ldc_id, phase);
			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
			return (1);
		} else
			state &= ~VSW_DRING_INFO_SENT;
		break;

	case VSW_RDX_INFO_RECV:
		if (phase < VSW_MILESTONE3) {
			DERR(vswp, "vsw_check_flag (%d): RDX_INFO_RECV"
			    " when in state %d\n", ldcp->ldc_id, phase);
			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
			return (1);
		}
		break;

	case VSW_RDX_ACK_RECV:
	case VSW_RDX_NACK_RECV:
		if (!(state & VSW_RDX_INFO_SENT)) {
			DERR(vswp, "vsw_check_flag (%d): spurious RDX_ACK or "
			    "RDX_NACK when in state %d\n", ldcp->ldc_id, phase);
			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
			return (1);
		} else
			state &= ~VSW_RDX_INFO_SENT;
		break;

	case VSW_MCST_INFO_RECV:
		if (phase < VSW_MILESTONE3) {
			DERR(vswp, "vsw_check_flag (%d): VSW_MCST_INFO_RECV"
			    " when in state %d\n", ldcp->ldc_id, phase);
			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
			return (1);
		}
		break;

	default:
		DERR(vswp, "vsw_check_flag (%lld): unknown flag (%llx)",
		    ldcp->ldc_id, flag);
		return (1);
	}

	if (dir == INBOUND)
		ldcp->lane_in.lstate = state;
	else
		ldcp->lane_out.lstate = state;

	D1(vswp, "vsw_check_flag (chan %lld): exit", ldcp->ldc_id);

	return (0);
}

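/*
 * Advance the handshake state machine: version exchange
 * (milestone 0), attribute exchange (milestone 1), dring
 * registration (milestone 2) and RDX exchange (milestone 3),
 * ending in the established state (milestone 4) in which the
 * outbound lane is marked active.
 */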
void
vsw_next_milestone(vsw_ldc_t *ldcp)
{
	vsw_t		*vswp = ldcp->ldc_vswp;

	D1(vswp, "%s (chan %lld): enter (phase %ld)", __func__,
	    ldcp->ldc_id, ldcp->hphase);

	DUMP_FLAGS(ldcp->lane_in.lstate);
	DUMP_FLAGS(ldcp->lane_out.lstate);

	switch (ldcp->hphase) {

	case VSW_MILESTONE0:
		/*
		 * If we haven't started to handshake with our peer,
		 * start to do so now.
		 */
		if (ldcp->lane_out.lstate == 0) {
			D2(vswp, "%s: (chan %lld) starting handshake "
			    "with peer", __func__, ldcp->ldc_id);
			vsw_process_conn_evt(ldcp, VSW_CONN_UP);
		}

		/*
		 * Only way to pass this milestone is to have successfully
		 * negotiated version info.
		 */
		if ((ldcp->lane_in.lstate & VSW_VER_ACK_SENT) &&
		    (ldcp->lane_out.lstate & VSW_VER_ACK_RECV)) {

			D2(vswp, "%s: (chan %lld) leaving milestone 0",
			    __func__, ldcp->ldc_id);

			/*
			 * Next milestone is passed when attribute
			 * information has been successfully exchanged.
			 */
			ldcp->hphase = VSW_MILESTONE1;
			vsw_send_attr(ldcp);

		}
		break;

	case VSW_MILESTONE1:
		/*
		 * Only way to pass this milestone is to have successfully
		 * negotiated attribute information.
		 */
		if (ldcp->lane_in.lstate & VSW_ATTR_ACK_SENT) {

			ldcp->hphase = VSW_MILESTONE2;

			/*
			 * If the peer device has said it wishes to
			 * use descriptor rings then we send it our ring
			 * info, otherwise we just set up a private ring
			 * which uses an internal buffer.
			 */
			if (ldcp->lane_in.xfer_mode == VIO_DRING_MODE)
				vsw_send_dring_info(ldcp);
		}
		break;

	case VSW_MILESTONE2:
		/*
		 * If peer has indicated in its attribute message that
		 * it wishes to use descriptor rings then the only way
		 * to pass this milestone is for us to have received
		 * valid dring info.
		 *
		 * If peer is not using descriptor rings then just fall
		 * through.
		 */
		if ((ldcp->lane_in.xfer_mode == VIO_DRING_MODE) &&
		    (!(ldcp->lane_in.lstate & VSW_DRING_ACK_SENT)))
			break;

		D2(vswp, "%s: (chan %lld) leaving milestone 2",
		    __func__, ldcp->ldc_id);

		ldcp->hphase = VSW_MILESTONE3;
		vsw_send_rdx(ldcp);
		break;

	case VSW_MILESTONE3:
		/*
		 * Pass this milestone when all parameters have been
		 * successfully exchanged and RDX sent in both directions.
		 *
		 * Mark outbound lane as available to transmit data.
		 */
		if ((ldcp->lane_out.lstate & VSW_RDX_ACK_SENT) &&
		    (ldcp->lane_in.lstate & VSW_RDX_ACK_RECV)) {

			D2(vswp, "%s: (chan %lld) leaving milestone 3",
			    __func__, ldcp->ldc_id);
			D2(vswp, "%s: ** handshake complete (0x%llx : "
			    "0x%llx) **", __func__, ldcp->lane_in.lstate,
			    ldcp->lane_out.lstate);
			ldcp->lane_out.lstate |= VSW_LANE_ACTIVE;
			ldcp->hphase = VSW_MILESTONE4;
			ldcp->hcnt = 0;
			DISPLAY_STATE();
		} else {
			D2(vswp, "%s: still in milestone 3 (0x%llx : 0x%llx)",
			    __func__, ldcp->lane_in.lstate,
			    ldcp->lane_out.lstate);
		}
		break;

	case VSW_MILESTONE4:
		D2(vswp, "%s: (chan %lld) in milestone 4", __func__,
		    ldcp->ldc_id);
		break;

	default:
		DERR(vswp, "%s: (chan %lld) Unknown Phase %x", __func__,
		    ldcp->ldc_id, ldcp->hphase);
	}

	D1(vswp, "%s (chan %lld): exit (phase %ld)", __func__, ldcp->ldc_id,
	    ldcp->hphase);
}

/*
 * Check if major version is supported.
 *
 * Returns 0 if it finds a supported major number, and if necessary
 * adjusts the minor field.
 *
 * Returns 1 if it can't match the major number exactly. Sets major/minor
 * to the next lowest supported values, or to zero if no other values are
 * possible.
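 *
 * For example, with vsw_versions[] = { {1, 0} }: a peer proposing
 * version 1.5 has its minor number adjusted and is accepted as 1.0,
 * while a peer proposing 0.9 matches nothing and is rejected with
 * both fields zeroed.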
 */
static int
vsw_supported_version(vio_ver_msg_t *vp)
{
	int	i;

	D1(NULL, "vsw_supported_version: enter");

	for (i = 0; i < VSW_NUM_VER; i++) {
		if (vsw_versions[i].ver_major == vp->ver_major) {
			/*
			 * Matching major version found. Update
			 * minor number if necessary.
			 */
			if (vp->ver_minor > vsw_versions[i].ver_minor) {
				D2(NULL, "%s: adjusting minor value from %d "
				    "to %d", __func__, vp->ver_minor,
				    vsw_versions[i].ver_minor);
				vp->ver_minor = vsw_versions[i].ver_minor;
			}

			return (0);
		}

		if (vsw_versions[i].ver_major < vp->ver_major) {
			if (vp->ver_minor > vsw_versions[i].ver_minor) {
				D2(NULL, "%s: adjusting minor value from %d "
				    "to %d", __func__, vp->ver_minor,
				    vsw_versions[i].ver_minor);
				vp->ver_minor = vsw_versions[i].ver_minor;
			}
			return (1);
		}
	}

	/* No match was possible, zero out fields */
	vp->ver_major = 0;
	vp->ver_minor = 0;

	D1(NULL, "vsw_supported_version: exit");

	return (1);
}

/*
 * Main routine for processing messages received over LDC.
 */
static void
vsw_process_pkt(void *arg)
{
	vsw_ldc_t	*ldcp = (vsw_ldc_t *)arg;
	vsw_t		*vswp = ldcp->ldc_vswp;
	size_t		msglen;
	vio_msg_tag_t	tag;
	def_msg_t	dmsg;
	int		rv = 0;

	D1(vswp, "%s enter: ldcid (%lld)\n", __func__, ldcp->ldc_id);

	ASSERT(MUTEX_HELD(&ldcp->ldc_cblock));

	/*
	 * If channel is up read messages until channel is empty.
	 */
	do {
		msglen = sizeof (dmsg);
		rv = ldc_read(ldcp->ldc_handle, (caddr_t)&dmsg, &msglen);

		if (rv != 0) {
			DERR(vswp, "%s :ldc_read err id(%lld) rv(%d) len(%d)\n",
			    __func__, ldcp->ldc_id, rv, msglen);
		}

		/* channel has been reset */
		if (rv == ECONNRESET) {
			vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
			break;
		}

		if (msglen == 0) {
			D2(vswp, "%s: ldc_read id(%lld) NODATA", __func__,
			    ldcp->ldc_id);
			break;
		}

		D2(vswp, "%s: ldc_read id(%lld): msglen(%d)", __func__,
		    ldcp->ldc_id, msglen);

		/*
		 * Figure out what sort of packet we have gotten by
		 * examining the msg tag, and then switch it appropriately.
		 */
		bcopy(&dmsg, &tag, sizeof (vio_msg_tag_t));

		switch (tag.vio_msgtype) {
		case VIO_TYPE_CTRL:
			vsw_dispatch_ctrl_task(ldcp, &dmsg, tag);
			break;
		case VIO_TYPE_DATA:
			vsw_process_data_pkt(ldcp, &dmsg, tag);
			break;
		case VIO_TYPE_ERR:
			vsw_process_err_pkt(ldcp, &dmsg, tag);
			break;
		default:
			DERR(vswp, "%s: Unknown tag(%lx) id(%lx)\n",
			    __func__, tag.vio_msgtype, ldcp->ldc_id);
			break;
		}
	} while (msglen);

	D1(vswp, "%s exit: ldcid (%lld)\n", __func__, ldcp->ldc_id);
}

/*
 * Dispatch a task to process a VIO control message.
 */
static void
vsw_dispatch_ctrl_task(vsw_ldc_t *ldcp, void *cpkt, vio_msg_tag_t tag)
{
	vsw_ctrl_task_t		*ctaskp = NULL;
	vsw_port_t		*port = ldcp->ldc_port;
	vsw_t			*vswp = port->p_vswp;

	D1(vswp, "%s: enter", __func__);

	/*
	 * We need to handle RDX ACK messages in-band as once they
	 * are exchanged it is possible that we will get an
	 * immediate (legitimate) data packet.
	 */
	if ((tag.vio_subtype_env == VIO_RDX) &&
	    (tag.vio_subtype == VIO_SUBTYPE_ACK)) {

		if (vsw_check_flag(ldcp, INBOUND, VSW_RDX_ACK_RECV))
			return;

		ldcp->lane_in.lstate |= VSW_RDX_ACK_RECV;
		D2(vswp, "%s (%ld) handling RDX_ACK in place "
		    "(ostate 0x%llx : hphase %d)", __func__,
		    ldcp->ldc_id, ldcp->lane_in.lstate, ldcp->hphase);
		vsw_next_milestone(ldcp);
		return;
	}

	ctaskp = kmem_alloc(sizeof (vsw_ctrl_task_t), KM_NOSLEEP);

	if (ctaskp == NULL) {
		DERR(vswp, "%s: unable to alloc space for ctrl msg", __func__);
		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
		return;
	}

	ctaskp->ldcp = ldcp;
	bcopy((def_msg_t *)cpkt, &ctaskp->pktp, sizeof (def_msg_t));
	ctaskp->hss_id = ldcp->hss_id;

	/*
	 * Dispatch task to processing taskq if port is not in
	 * the process of being detached.
	 */
	mutex_enter(&port->state_lock);
	if (port->state == VSW_PORT_INIT) {
		if ((vswp->taskq_p == NULL) ||
		    (ddi_taskq_dispatch(vswp->taskq_p, vsw_process_ctrl_pkt,
		    ctaskp, DDI_NOSLEEP) != DDI_SUCCESS)) {
			DERR(vswp, "%s: unable to dispatch task to taskq",
			    __func__);
			kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
			mutex_exit(&port->state_lock);
			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
			return;
		}
1966	} else {
1967		DWARN(vswp, "%s: port %d detaching, not dispatching "
1968		    "task", __func__, port->p_instance);
1969	}
1970
1971	mutex_exit(&port->state_lock);
1972
1973	D2(vswp, "%s: dispatched task to taskq for chan %d", __func__,
1974	    ldcp->ldc_id);
1975	D1(vswp, "%s: exit", __func__);
1976}
1977
1978/*
1979 * Process a VIO ctrl message. Invoked from taskq.
1980 */
1981static void
1982vsw_process_ctrl_pkt(void *arg)
1983{
1984	vsw_ctrl_task_t	*ctaskp = (vsw_ctrl_task_t *)arg;
1985	vsw_ldc_t	*ldcp = ctaskp->ldcp;
1986	vsw_t 		*vswp = ldcp->ldc_vswp;
1987	vio_msg_tag_t	tag;
1988	uint16_t	env;
1989
1990	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
1991
1992	bcopy(&ctaskp->pktp, &tag, sizeof (vio_msg_tag_t));
1993	env = tag.vio_subtype_env;
1994
1995	/* stale pkt check */
1996	if (ctaskp->hss_id < ldcp->hss_id) {
1997		DWARN(vswp, "%s: discarding stale packet belonging to earlier"
1998		    " (%ld) handshake session", __func__, ctaskp->hss_id);
		kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
1999		return;
2000	}
2001
2002	/* session id check */
2003	if (ldcp->session_status & VSW_PEER_SESSION) {
2004		if (ldcp->peer_session != tag.vio_sid) {
2005			DERR(vswp, "%s (chan %d): invalid session id (%llx)",
2006			    __func__, ldcp->ldc_id, tag.vio_sid);
2007			kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
2008			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
2009			return;
2010		}
2011	}
2012
2013	/*
2014	 * Switch on vio_subtype envelope, then let lower routines
2015	 * decide if its an INFO, ACK or NACK packet.
2016	 */
2017	switch (env) {
2018	case VIO_VER_INFO:
2019		vsw_process_ctrl_ver_pkt(ldcp, &ctaskp->pktp);
2020		break;
2021	case VIO_DRING_REG:
2022		vsw_process_ctrl_dring_reg_pkt(ldcp, &ctaskp->pktp);
2023		break;
2024	case VIO_DRING_UNREG:
2025		vsw_process_ctrl_dring_unreg_pkt(ldcp, &ctaskp->pktp);
2026		break;
2027	case VIO_ATTR_INFO:
2028		vsw_process_ctrl_attr_pkt(ldcp, &ctaskp->pktp);
2029		break;
2030	case VNET_MCAST_INFO:
2031		vsw_process_ctrl_mcst_pkt(ldcp, &ctaskp->pktp);
2032		break;
2033	case VIO_RDX:
2034		vsw_process_ctrl_rdx_pkt(ldcp, &ctaskp->pktp);
2035		break;
2036	default:
2037		DERR(vswp, "%s: unknown vio_subtype_env (%x)\n", __func__, env);
2038	}
2039
2040	kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
2041	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
2042}
2043
2044/*
2045 * Version negotiation. We can end up here either because our peer
2046 * has responded to a handshake message we have sent it, or our peer
2047 * has initiated a handshake with us. If it's the former then it can only
2048 * be an ACK or NACK; if it's the latter it can only be an INFO.
2049 *
2050 * If it's an ACK we move to the next stage of the handshake, namely
2051 * attribute exchange. If it's a NACK we see if we can specify another
2052 * version; if we can't, we stop.
2053 *
2054 * If it is an INFO we reset all params associated with communication
2055 * in that direction over this channel (remember connection is
2056 * essentially 2 independent simplex channels).
2057 */
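/*
 * Illustrative exchange (assuming both ends support version 1.0):
 *
 *	peer				vsw
 *	VER_INFO(1.0)	------------->	vsw_supported_version() == 0
 *			<-------------	VER_ACK(1.0), lane_in updated
 *
 * and if the proposed version is unsupported:
 *
 *	peer				vsw
 *	VER_INFO(5.0)	------------->	vsw_supported_version() == 1
 *			<-------------	VER_NACK(next lower pair, or 0.0)
 */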
2058void
2059vsw_process_ctrl_ver_pkt(vsw_ldc_t *ldcp, void *pkt)
2060{
2061	vio_ver_msg_t	*ver_pkt;
2062	vsw_t 		*vswp = ldcp->ldc_vswp;
2063
2064	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
2065
2066	/*
2067	 * We know this is a ctrl/version packet so
2068	 * cast it into the correct structure.
2069	 */
2070	ver_pkt = (vio_ver_msg_t *)pkt;
2071
2072	switch (ver_pkt->tag.vio_subtype) {
2073	case VIO_SUBTYPE_INFO:
2074		D2(vswp, "vsw_process_ctrl_ver_pkt: VIO_SUBTYPE_INFO\n");
2075
2076		/*
2077		 * Record the session id, which we will use from now
2078		 * until we see another VER_INFO msg. Even then the
2079		 * session id in most cases will be unchanged, except
2080		 * if the channel was reset.
2081		 */
2082		if ((ldcp->session_status & VSW_PEER_SESSION) &&
2083		    (ldcp->peer_session != ver_pkt->tag.vio_sid)) {
2084			DERR(vswp, "%s: updating session id for chan %lld "
2085			    "from %llx to %llx", __func__, ldcp->ldc_id,
2086			    ldcp->peer_session, ver_pkt->tag.vio_sid);
2087		}
2088
2089		ldcp->peer_session = ver_pkt->tag.vio_sid;
2090		ldcp->session_status |= VSW_PEER_SESSION;
2091
2092		/* Legal message at this time ? */
2093		if (vsw_check_flag(ldcp, INBOUND, VSW_VER_INFO_RECV))
2094			return;
2095
2096		/*
2097		 * First check the device class. Currently only expect
2098		 * to be talking to a network device. In the future may
2099		 * also talk to another switch.
2100		 */
2101		if (ver_pkt->dev_class != VDEV_NETWORK) {
2102			DERR(vswp, "%s: illegal device class %d", __func__,
2103			    ver_pkt->dev_class);
2104
2105			ver_pkt->tag.vio_sid = ldcp->local_session;
2106			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
2107
2108			DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt);
2109
2110			(void) vsw_send_msg(ldcp, (void *)ver_pkt,
2111			    sizeof (vio_ver_msg_t), B_TRUE);
2112
2113			ldcp->lane_in.lstate |= VSW_VER_NACK_SENT;
2114			vsw_next_milestone(ldcp);
2115			return;
2116		} else {
2117			ldcp->dev_class = ver_pkt->dev_class;
2118		}
2119
2120		/*
2121		 * Now check the version.
2122		 */
2123		if (vsw_supported_version(ver_pkt) == 0) {
2124			/*
2125			 * Support this major version and possibly
2126			 * adjusted minor version.
2127			 */
2128
2129			D2(vswp, "%s: accepted ver %d:%d", __func__,
2130			    ver_pkt->ver_major, ver_pkt->ver_minor);
2131
2132			/* Store accepted values */
2133			ldcp->lane_in.ver_major = ver_pkt->ver_major;
2134			ldcp->lane_in.ver_minor = ver_pkt->ver_minor;
2135
2136			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
2137
2138			ldcp->lane_in.lstate |= VSW_VER_ACK_SENT;
2139		} else {
2140			/*
2141			 * NACK back with the next lower major/minor
2142			 * pairing we support (if we don't support any more
2143			 * versions then they will be set to zero).
2144			 */
2145
2146			D2(vswp, "%s: replying with ver %d:%d", __func__,
2147			    ver_pkt->ver_major, ver_pkt->ver_minor);
2148
2149			/* Store updated values */
2150			ldcp->lane_in.ver_major = ver_pkt->ver_major;
2151			ldcp->lane_in.ver_minor = ver_pkt->ver_minor;
2152
2153			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
2154
2155			ldcp->lane_in.lstate |= VSW_VER_NACK_SENT;
2156		}
2157
2158		DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt);
2159		ver_pkt->tag.vio_sid = ldcp->local_session;
2160		(void) vsw_send_msg(ldcp, (void *)ver_pkt,
2161		    sizeof (vio_ver_msg_t), B_TRUE);
2162
2163		vsw_next_milestone(ldcp);
2164		break;
2165
2166	case VIO_SUBTYPE_ACK:
2167		D2(vswp, "%s: VIO_SUBTYPE_ACK\n", __func__);
2168
2169		if (vsw_check_flag(ldcp, OUTBOUND, VSW_VER_ACK_RECV))
2170			return;
2171
2172		/* Store updated values */
2173		ldcp->lane_in.ver_major = ver_pkt->ver_major;
2174		ldcp->lane_in.ver_minor = ver_pkt->ver_minor;
2175
2176		ldcp->lane_out.lstate |= VSW_VER_ACK_RECV;
2177		vsw_next_milestone(ldcp);
2178
2179		break;
2180
2181	case VIO_SUBTYPE_NACK:
2182		D2(vswp, "%s: VIO_SUBTYPE_NACK\n", __func__);
2183
2184		if (vsw_check_flag(ldcp, OUTBOUND, VSW_VER_NACK_RECV))
2185			return;
2186
2187		/*
2188		 * If our peer sent us a NACK with the ver fields set to
2189		 * zero then there is nothing more we can do. Otherwise see
2190		 * if we support either the version suggested, or a lesser
2191		 * one.
2192		 */
2193		if ((ver_pkt->ver_major == 0) && (ver_pkt->ver_minor == 0)) {
2194			DERR(vswp, "%s: peer unable to negotiate any "
2195			    "further.", __func__);
2196			ldcp->lane_out.lstate |= VSW_VER_NACK_RECV;
2197			vsw_next_milestone(ldcp);
2198			return;
2199		}
2200
2201		/*
2202		 * Check to see if we support this major version or
2203		 * a lower one. If we don't then maj/min will be set
2204		 * to zero.
2205		 */
2206		(void) vsw_supported_version(ver_pkt);
2207		if ((ver_pkt->ver_major == 0) && (ver_pkt->ver_minor == 0)) {
2208			/* Nothing more we can do */
2209			DERR(vswp, "%s: version negotiation failed.\n",
2210			    __func__);
2211			ldcp->lane_out.lstate |= VSW_VER_NACK_RECV;
2212			vsw_next_milestone(ldcp);
2213		} else {
2214			/* found a supported major version */
2215			ldcp->lane_out.ver_major = ver_pkt->ver_major;
2216			ldcp->lane_out.ver_minor = ver_pkt->ver_minor;
2217
2218			D2(vswp, "%s: resending with updated values (%x, %x)",
2219			    __func__, ver_pkt->ver_major, ver_pkt->ver_minor);
2220
2221			ldcp->lane_out.lstate |= VSW_VER_INFO_SENT;
2222			ver_pkt->tag.vio_sid = ldcp->local_session;
2223			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_INFO;
2224
2225			DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt);
2226
2227			(void) vsw_send_msg(ldcp, (void *)ver_pkt,
2228			    sizeof (vio_ver_msg_t), B_TRUE);
2229
2230			vsw_next_milestone(ldcp);
2231
2232		}
2233		break;
2234
2235	default:
2236		DERR(vswp, "%s: unknown vio_subtype %x\n", __func__,
2237		    ver_pkt->tag.vio_subtype);
2238	}
2239
2240	D1(vswp, "%s(%lld): exit\n", __func__, ldcp->ldc_id);
2241}
2242
2243/*
2244 * Process an attribute packet. We can end up here either because our peer
2245 * has ACK/NACK'ed back to an earlier ATTR msg we had sent it, or our
2246 * peer has sent us an attribute INFO message.
2247 *
2248 * If it's an ACK we then move to the next stage of the handshake, which
2249 * is to send our descriptor ring info to our peer. If it's a NACK then
2250 * there is nothing more we can (currently) do.
2251 *
2252 * If we get a valid/acceptable INFO packet (and we have already negotiated
2253 * a version) we ACK back and set channel state to ATTR_RECV, otherwise we
2254 * NACK back and reset channel state to INACTIV.
2255 *
2256 * FUTURE: in time we will probably negotiate over attributes, but for
2257 * the moment unacceptable attributes are regarded as a fatal error.
2258 *
2259 */
2260void
2261vsw_process_ctrl_attr_pkt(vsw_ldc_t *ldcp, void *pkt)
2262{
2263	vnet_attr_msg_t		*attr_pkt;
2264	vsw_t			*vswp = ldcp->ldc_vswp;
2265	vsw_port_t		*port = ldcp->ldc_port;
2266	uint64_t		macaddr = 0;
2267	int			i;
2268
2269	D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);
2270
2271	/*
2272	 * We know this is a ctrl/attr packet so
2273	 * cast it into the correct structure.
2274	 */
2275	attr_pkt = (vnet_attr_msg_t *)pkt;
2276
2277	switch (attr_pkt->tag.vio_subtype) {
2278	case VIO_SUBTYPE_INFO:
2279		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
2280
2281		if (vsw_check_flag(ldcp, INBOUND, VSW_ATTR_INFO_RECV))
2282			return;
2283
2284		/*
2285		 * If the attributes are unacceptable then we NACK back.
2286		 */
2287		if (vsw_check_attr(attr_pkt, ldcp->ldc_port)) {
2288
2289			DERR(vswp, "%s (chan %d): invalid attributes",
2290			    __func__, ldcp->ldc_id);
2291
2292			vsw_free_lane_resources(ldcp, INBOUND);
2293
2294			attr_pkt->tag.vio_sid = ldcp->local_session;
2295			attr_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
2296
2297			DUMP_TAG_PTR((vio_msg_tag_t *)attr_pkt);
2298			ldcp->lane_in.lstate |= VSW_ATTR_NACK_SENT;
2299			(void) vsw_send_msg(ldcp, (void *)attr_pkt,
2300			    sizeof (vnet_attr_msg_t), B_TRUE);
2301
2302			vsw_next_milestone(ldcp);
2303			return;
2304		}
2305
2306		/*
2307		 * Otherwise store attributes for this lane and update
2308		 * lane state.
2309		 */
2310		ldcp->lane_in.mtu = attr_pkt->mtu;
2311		ldcp->lane_in.addr = attr_pkt->addr;
2312		ldcp->lane_in.addr_type = attr_pkt->addr_type;
2313		ldcp->lane_in.xfer_mode = attr_pkt->xfer_mode;
2314		ldcp->lane_in.ack_freq = attr_pkt->ack_freq;
2315
2316		macaddr = ldcp->lane_in.addr;
2317		for (i = ETHERADDRL - 1; i >= 0; i--) {
2318			port->p_macaddr.ether_addr_octet[i] = macaddr & 0xFF;
2319			macaddr >>= 8;
2320		}
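		/*
		 * Worked example (illustrative): an addr of
		 * 0x00144FFA0203 unpacks low byte first into octet[5],
		 * giving the MAC address 00:14:4f:fa:02:03.
		 */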
2321
2322		/* create the fdb entry for this port/mac address */
2323		(void) vsw_add_fdb(vswp, port);
2324
2325		/* set up device specific xmit routines */
2326		mutex_enter(&port->tx_lock);
2327		if (ldcp->lane_in.xfer_mode == VIO_DRING_MODE) {
2328			D2(vswp, "%s: mode = VIO_DRING_MODE", __func__);
2329			port->transmit = vsw_dringsend;
2330			ldcp->lane_out.xfer_mode = VIO_DRING_MODE;
2331		} else if (ldcp->lane_in.xfer_mode == VIO_DESC_MODE) {
2332			D2(vswp, "%s: mode = VIO_DESC_MODE", __func__);
2333			vsw_create_privring(ldcp);
2334			port->transmit = vsw_descrsend;
2335			ldcp->lane_out.xfer_mode = VIO_DESC_MODE;
2336		}
2337		mutex_exit(&port->tx_lock);
2338
2339		attr_pkt->tag.vio_sid = ldcp->local_session;
2340		attr_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
2341
2342		DUMP_TAG_PTR((vio_msg_tag_t *)attr_pkt);
2343
2344		ldcp->lane_in.lstate |= VSW_ATTR_ACK_SENT;
2345
2346		(void) vsw_send_msg(ldcp, (void *)attr_pkt,
2347		    sizeof (vnet_attr_msg_t), B_TRUE);
2348
2349		vsw_next_milestone(ldcp);
2350		break;
2351
2352	case VIO_SUBTYPE_ACK:
2353		D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
2354
2355		if (vsw_check_flag(ldcp, OUTBOUND, VSW_ATTR_ACK_RECV))
2356			return;
2357
2358		ldcp->lane_out.lstate |= VSW_ATTR_ACK_RECV;
2359		vsw_next_milestone(ldcp);
2360		break;
2361
2362	case VIO_SUBTYPE_NACK:
2363		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
2364
2365		if (vsw_check_flag(ldcp, OUTBOUND, VSW_ATTR_NACK_RECV))
2366			return;
2367
2368		ldcp->lane_out.lstate |= VSW_ATTR_NACK_RECV;
2369		vsw_next_milestone(ldcp);
2370		break;
2371
2372	default:
2373		DERR(vswp, "%s: unknown vio_subtype %x\n", __func__,
2374		    attr_pkt->tag.vio_subtype);
2375	}
2376
2377	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
2378}
2379
2380/*
2381 * Process a dring info packet. We can end up here either because our peer
2382 * has ACK/NACK'ed back to an earlier DRING msg we had sent it, or our
2383 * peer has sent us a dring INFO message.
2384 *
2385 * If we get a valid/acceptable INFO packet (and we have already negotiated
2386 * a version) we ACK back and update the lane state, otherwise we NACK back.
2387 *
2388 * FUTURE: nothing stops the client from sending us info on multiple drings,
2389 * but for the moment we will just use the first one we are given.
2390 *
2391 */
2392void
2393vsw_process_ctrl_dring_reg_pkt(vsw_ldc_t *ldcp, void *pkt)
2394{
2395	vio_dring_reg_msg_t	*dring_pkt;
2396	vsw_t			*vswp = ldcp->ldc_vswp;
2397	ldc_mem_info_t		minfo;
2398	dring_info_t		*dp, *dbp;
2399	int			dring_found = 0;
2400
2401	/*
2402	 * We know this is a ctrl/dring packet so
2403	 * cast it into the correct structure.
2404	 */
2405	dring_pkt = (vio_dring_reg_msg_t *)pkt;
2406
2407	D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);
2408
2409	switch (dring_pkt->tag.vio_subtype) {
2410	case VIO_SUBTYPE_INFO:
2411		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
2412
2413		if (vsw_check_flag(ldcp, INBOUND, VSW_DRING_INFO_RECV))
2414			return;
2415
2416		/*
2417		 * If the dring params are unacceptable then we NACK back.
2418		 */
2419		if (vsw_check_dring_info(dring_pkt)) {
2420
2421			DERR(vswp, "%s (%lld): invalid dring info",
2422			    __func__, ldcp->ldc_id);
2423
2424			vsw_free_lane_resources(ldcp, INBOUND);
2425
2426			dring_pkt->tag.vio_sid = ldcp->local_session;
2427			dring_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
2428
2429			DUMP_TAG_PTR((vio_msg_tag_t *)dring_pkt);
2430
2431			ldcp->lane_in.lstate |= VSW_DRING_NACK_SENT;
2432
2433			(void) vsw_send_msg(ldcp, (void *)dring_pkt,
2434			    sizeof (vio_dring_reg_msg_t), B_TRUE);
2435
2436			vsw_next_milestone(ldcp);
2437			return;
2438		}
2439
2440		/*
2441		 * Otherwise, attempt to map in the dring using the
2442		 * cookie. If that succeeds we send back a unique dring
2443		 * identifier that the sending side will use in future
2444		 * to refer to this descriptor ring.
2445		 */
2446		dp = kmem_zalloc(sizeof (dring_info_t), KM_SLEEP);
2447
2448		dp->num_descriptors = dring_pkt->num_descriptors;
2449		dp->descriptor_size = dring_pkt->descriptor_size;
2450		dp->options = dring_pkt->options;
2451		dp->ncookies = dring_pkt->ncookies;
2452
2453		/*
2454		 * Note: should only get one cookie. Enforced in
2455		 * the ldc layer.
2456		 */
2457		bcopy(&dring_pkt->cookie[0], &dp->cookie[0],
2458		    sizeof (ldc_mem_cookie_t));
2459
2460		D2(vswp, "%s: num_desc %ld : desc_size %ld", __func__,
2461		    dp->num_descriptors, dp->descriptor_size);
2462		D2(vswp, "%s: options 0x%lx: ncookies %ld", __func__,
2463		    dp->options, dp->ncookies);
2464
2465		if ((ldc_mem_dring_map(ldcp->ldc_handle, &dp->cookie[0],
2466		    dp->ncookies, dp->num_descriptors, dp->descriptor_size,
2467		    LDC_SHADOW_MAP, &(dp->handle))) != 0) {
2468
2469			DERR(vswp, "%s: dring_map failed\n", __func__);
2470
2471			kmem_free(dp, sizeof (dring_info_t));
2472			vsw_free_lane_resources(ldcp, INBOUND);
2473
2474			dring_pkt->tag.vio_sid = ldcp->local_session;
2475			dring_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
2476
2477			DUMP_TAG_PTR((vio_msg_tag_t *)dring_pkt);
2478
2479			ldcp->lane_in.lstate |= VSW_DRING_NACK_SENT;
2480			(void) vsw_send_msg(ldcp, (void *)dring_pkt,
2481			    sizeof (vio_dring_reg_msg_t), B_TRUE);
2482
2483			vsw_next_milestone(ldcp);
2484			return;
2485		}
2486
2487		if ((ldc_mem_dring_info(dp->handle, &minfo)) != 0) {
2488
2489			DERR(vswp, "%s: dring_addr failed\n", __func__);
2490
2491			kmem_free(dp, sizeof (dring_info_t));
2492			vsw_free_lane_resources(ldcp, INBOUND);
2493
2494			dring_pkt->tag.vio_sid = ldcp->local_session;
2495			dring_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
2496
2497			DUMP_TAG_PTR((vio_msg_tag_t *)dring_pkt);
2498
2499			ldcp->lane_in.lstate |= VSW_DRING_NACK_SENT;
2500			(void) vsw_send_msg(ldcp, (void *)dring_pkt,
2501			    sizeof (vio_dring_reg_msg_t), B_TRUE);
2502
2503			vsw_next_milestone(ldcp);
2504			return;
2505		} else {
2506			/* store the address of the pub part of ring */
2507			dp->pub_addr = minfo.vaddr;
2508		}
2509
2510		/* no private section as we are importing */
2511		dp->priv_addr = NULL;
2512
2513		/*
2514		 * Using a simple monotonically increasing int for the
2515		 * ident at the moment.
2516		 */
2517		dp->ident = ldcp->next_ident;
2518		ldcp->next_ident++;
2519
2520		dp->end_idx = 0;
2521		dp->next = NULL;
2522
2523		/*
2524		 * Link it onto the end of the list of drings
2525		 * for this lane.
2526		 */
2527		if (ldcp->lane_in.dringp == NULL) {
2528			D2(vswp, "%s: adding first INBOUND dring", __func__);
2529			ldcp->lane_in.dringp = dp;
2530		} else {
2531			dbp = ldcp->lane_in.dringp;
2532
2533			while (dbp->next != NULL)
2534				dbp = dbp->next;
2535
2536			dbp->next = dp;
2537		}
2538
2539		/* acknowledge it */
2540		dring_pkt->tag.vio_sid = ldcp->local_session;
2541		dring_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
2542		dring_pkt->dring_ident = dp->ident;
2543
2544		(void) vsw_send_msg(ldcp, (void *)dring_pkt,
2545		    sizeof (vio_dring_reg_msg_t), B_TRUE);
2546
2547		ldcp->lane_in.lstate |= VSW_DRING_ACK_SENT;
2548		vsw_next_milestone(ldcp);
2549		break;
2550
2551	case VIO_SUBTYPE_ACK:
2552		D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
2553
2554		if (vsw_check_flag(ldcp, OUTBOUND, VSW_DRING_ACK_RECV))
2555			return;
2556
2557		/*
2558		 * Peer is acknowledging our dring info and will have
2559		 * sent us a dring identifier which we will use to
2560		 * refer to this ring w.r.t. our peer.
2561		 */
2562		dp = ldcp->lane_out.dringp;
2563		if (dp != NULL) {
2564			/*
2565			 * Find the ring this ident should be associated
2566			 * with.
2567			 */
2568			while (dp != NULL) {
2569				if (vsw_dring_match(dp, dring_pkt)) {
2570					dring_found = 1;
2571					break;
2572				}
2573				dp = dp->next;
2574			}
2578
2579			if (dring_found == 0) {
2580				DERR(NULL, "%s: unrecognised ring cookie",
2581				    __func__);
2582				vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
2583				return;
2584			}
2585
2586		} else {
2587			DERR(vswp, "%s: DRING ACK received but no drings "
2588			    "allocated", __func__);
2589			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
2590			return;
2591		}
2592
2593		/* store ident */
2594		dp->ident = dring_pkt->dring_ident;
2595		ldcp->lane_out.lstate |= VSW_DRING_ACK_RECV;
2596		vsw_next_milestone(ldcp);
2597		break;
2598
2599	case VIO_SUBTYPE_NACK:
2600		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
2601
2602		if (vsw_check_flag(ldcp, OUTBOUND, VSW_DRING_NACK_RECV))
2603			return;
2604
2605		ldcp->lane_out.lstate |= VSW_DRING_NACK_RECV;
2606		vsw_next_milestone(ldcp);
2607		break;
2608
2609	default:
2610		DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
2611		    dring_pkt->tag.vio_subtype);
2612	}
2613
2614	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
2615}
2616
2617/*
2618 * Process a request from peer to unregister a dring.
2619 *
2620 * For the moment we just restart the handshake if our
2621 * peer endpoint attempts to unregister a dring.
2622 */
2623void
2624vsw_process_ctrl_dring_unreg_pkt(vsw_ldc_t *ldcp, void *pkt)
2625{
2626	vsw_t			*vswp = ldcp->ldc_vswp;
2627	vio_dring_unreg_msg_t	*dring_pkt;
2628
2629	/*
2630	 * We know this is a ctrl/dring packet so
2631	 * cast it into the correct structure.
2632	 */
2633	dring_pkt = (vio_dring_unreg_msg_t *)pkt;
2634
2635	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
2636
2637	switch (dring_pkt->tag.vio_subtype) {
2638	case VIO_SUBTYPE_INFO:
2639		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
2640
2641		DWARN(vswp, "%s: restarting handshake..", __func__);
2642		break;
2643
2644	case VIO_SUBTYPE_ACK:
2645		D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
2646
2647		DWARN(vswp, "%s: restarting handshake..", __func__);
2648		break;
2649
2650	case VIO_SUBTYPE_NACK:
2651		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
2652
2653		DWARN(vswp, "%s: restarting handshake..", __func__);
2654		break;
2655
2656	default:
2657		DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
2658		    dring_pkt->tag.vio_subtype);
2659	}
2660
2661	vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
2662
2663	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
2664}
2665
2666#define	SND_MCST_NACK(ldcp, pkt) \
2667	do { \
2668		pkt->tag.vio_subtype = VIO_SUBTYPE_NACK; \
2669		pkt->tag.vio_sid = ldcp->local_session; \
2670		(void) vsw_send_msg(ldcp, (void *)pkt, \
		    sizeof (vnet_mcast_msg_t), B_TRUE); \
	} while (0)
2671
2672/*
2673 * Process a multicast request from a vnet.
2674 *
2675 * Vnets specify a multicast address that they are interested in. This
2676 * address is used as a key into the hash table which forms the multicast
2677 * forwarding database (mFDB).
2678 *
2679 * The table keys are the multicast addresses, while the table entries
2680 * are pointers to lists of ports which wish to receive packets for the
2681 * specified multicast address.
2682 *
2683 * When a multicast packet is being switched we use the address as a key
2684 * into the hash table, and then walk the appropriate port list forwarding
2685 * the pkt to each port in turn.
2686 *
2687 * If a vnet is no longer interested in a particular multicast grouping
2688 * we simply find the correct location in the hash table and then delete
2689 * the relevant port from the port list.
2690 *
2691 * To deal with the case whereby a port is being deleted without first
2692 * removing itself from the lists in the hash table, we maintain a list
2693 * of multicast addresses the port has registered an interest in, within
2694 * the port structure itself. We then simply walk that list of addresses
2695 * using them as keys into the hash table and remove the port from the
2696 * appropriate lists.
2697 */
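/*
 * Hedged sketch of the mFDB operations described above, using the
 * mod_hash interfaces (sys/modhash.h) this file already includes; the
 * vswp->mfdb field, the key derivation and the names shown here are
 * assumptions for illustration, not the driver's actual code:
 *
 *	mod_hash_val_t	val;
 *
 *	if (mod_hash_find(vswp->mfdb, (mod_hash_key_t)key, &val) != 0) {
 *		// no ports registered yet: insert a fresh port list
 *		(void) mod_hash_insert(vswp->mfdb,
 *		    (mod_hash_key_t)key, (mod_hash_val_t)port_list);
 *	} else {
 *		// append this port to the list found in 'val'
 *	}
 *
 * where 'key' is a uint64_t built from the 6-byte multicast address.
 * Removal uses the same key with mod_hash_remove(), and port teardown
 * walks the per-port list of registered addresses as described above.
 */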
2698static void
2699vsw_process_ctrl_mcst_pkt(vsw_ldc_t *ldcp, void *pkt)
2700{
2701	vnet_mcast_msg_t	*mcst_pkt;
2702	vsw_port_t		*port = ldcp->ldc_port;
2703	vsw_t			*vswp = ldcp->ldc_vswp;
2704	int			i;
2705
2706	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
2707
2708	/*
2709	 * We know this is a ctrl/mcast packet so
2710	 * cast it into the correct structure.
2711	 */
2712	mcst_pkt = (vnet_mcast_msg_t *)pkt;
2713
2714	switch (mcst_pkt->tag.vio_subtype) {
2715	case VIO_SUBTYPE_INFO:
2716		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
2717
2718		/*
2719		 * Check if in correct state to receive a multicast
2720		 * message (i.e. handshake complete). If not reset
2721		 * the handshake.
2722		 */
2723		if (vsw_check_flag(ldcp, INBOUND, VSW_MCST_INFO_RECV))
2724			return;
2725
2726		/*
2727		 * Before attempting to add or remove the addresses, check that
2728		 * they are valid multicast addresses, i.e. that the group bit
2729		 * (bit 0 of the first octet) is set. If not, then NACK back.
2730		 */
2731		for (i = 0; i < mcst_pkt->count; i++) {
2732			if ((mcst_pkt->mca[i].ether_addr_octet[0] & 01) != 1) {
2733				DERR(vswp, "%s: invalid multicast address",
2734				    __func__);
2735				SND_MCST_NACK(ldcp, mcst_pkt);
2736				return;
2737			}
2738		}
2739
2740		/*
2741		 * Now add/remove the addresses. If this fails we
2742		 * NACK back.
2743		 */
2744		if (vsw_add_rem_mcst(mcst_pkt, port) != 0) {
2745			SND_MCST_NACK(ldcp, mcst_pkt);
2746			return;
2747		}
2748
2749		mcst_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
2750		mcst_pkt->tag.vio_sid = ldcp->local_session;
2751
2752		DUMP_TAG_PTR((vio_msg_tag_t *)mcst_pkt);
2753
2754		(void) vsw_send_msg(ldcp, (void *)mcst_pkt,
2755		    sizeof (vnet_mcast_msg_t), B_TRUE);
2756		break;
2757
2758	case VIO_SUBTYPE_ACK:
2759		DWARN(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
2760
2761		/*
2762		 * We shouldn't ever get a multicast ACK message as
2763		 * at the moment we never request multicast addresses
2764		 * to be set on some other device. This may change in
2765		 * the future if we have cascading switches.
2766		 */
2767		if (vsw_check_flag(ldcp, OUTBOUND, VSW_MCST_ACK_RECV))
2768			return;
2769
2770		/* Do nothing */
2771		break;
2772
2773	case VIO_SUBTYPE_NACK:
2774		DWARN(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
2775
2776		/*
2777		 * We shouldn't get a multicast NACK packet for the
2778		 * same reasons as we shouldn't get a ACK packet.
2779		 */
2780		if (vsw_check_flag(ldcp, OUTBOUND, VSW_MCST_NACK_RECV))
2781			return;
2782
2783		/* Do nothing */
2784		break;
2785
2786	default:
2787		DERR(vswp, "%s: unknown vio_subtype %x\n", __func__,
2788		    mcst_pkt->tag.vio_subtype);
2789	}
2790
2791	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
2792}
2793
2794static void
2795vsw_process_ctrl_rdx_pkt(vsw_ldc_t *ldcp, void *pkt)
2796{
2797	vio_rdx_msg_t	*rdx_pkt;
2798	vsw_t		*vswp = ldcp->ldc_vswp;
2799
2800	/*
2801	 * We know this is a ctrl/rdx packet so
2802	 * cast it into the correct structure.
2803	 */
2804	rdx_pkt = (vio_rdx_msg_t *)pkt;
2805
2806	D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);
2807
2808	switch (rdx_pkt->tag.vio_subtype) {
2809	case VIO_SUBTYPE_INFO:
2810		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
2811
2812		if (vsw_check_flag(ldcp, OUTBOUND, VSW_RDX_INFO_RECV))
2813			return;
2814
2815		rdx_pkt->tag.vio_sid = ldcp->local_session;
2816		rdx_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
2817
2818		DUMP_TAG_PTR((vio_msg_tag_t *)rdx_pkt);
2819
2820		ldcp->lane_out.lstate |= VSW_RDX_ACK_SENT;
2821
2822		(void) vsw_send_msg(ldcp, (void *)rdx_pkt,
2823		    sizeof (vio_rdx_msg_t), B_TRUE);
2824
2825		vsw_next_milestone(ldcp);
2826		break;
2827
2828	case VIO_SUBTYPE_ACK:
2829		/*
2830		 * Should be handled in-band by callback handler.
2831		 */
2832		DERR(vswp, "%s: Unexpected VIO_SUBTYPE_ACK", __func__);
2833		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
2834		break;
2835
2836	case VIO_SUBTYPE_NACK:
2837		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
2838
2839		if (vsw_check_flag(ldcp, INBOUND, VSW_RDX_NACK_RECV))
2840			return;
2841
2842		ldcp->lane_in.lstate |= VSW_RDX_NACK_RECV;
2843		vsw_next_milestone(ldcp);
2844		break;
2845
2846	default:
2847		DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
2848		    rdx_pkt->tag.vio_subtype);
2849	}
2850
2851	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
2852}
2853
2854static void
2855vsw_process_data_pkt(vsw_ldc_t *ldcp, void *dpkt, vio_msg_tag_t tag)
2856{
2857	uint16_t	env = tag.vio_subtype_env;
2858	vsw_t		*vswp = ldcp->ldc_vswp;
2859
2860	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
2861
2862	/* session id check */
2863	if (ldcp->session_status & VSW_PEER_SESSION) {
2864		if (ldcp->peer_session != tag.vio_sid) {
2865			DERR(vswp, "%s (chan %d): invalid session id (%llx)",
2866			    __func__, ldcp->ldc_id, tag.vio_sid);
2867			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
2868			return;
2869		}
2870	}
2871
2872	/*
2873	 * It is an error for us to be getting data packets
2874	 * before the handshake has completed.
2875	 */
2876	if (ldcp->hphase != VSW_MILESTONE4) {
2877		DERR(vswp, "%s: got data packet before handshake complete "
2878		    "hphase %d (%x: %x)", __func__, ldcp->hphase,
2879		    ldcp->lane_in.lstate, ldcp->lane_out.lstate);
2880		DUMP_FLAGS(ldcp->lane_in.lstate);
2881		DUMP_FLAGS(ldcp->lane_out.lstate);
2882		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
2883		return;
2884	}
2885
2886	/*
2887	 * To reduce the locking contention, release the
2888	 * ldc_cblock here and re-acquire it once we are done
2889	 * receiving packets.
2890	 */
2891	mutex_exit(&ldcp->ldc_cblock);
2892	mutex_enter(&ldcp->ldc_rxlock);
2893
2894	/*
2895	 * Switch on vio_subtype envelope, then let lower routines
2896	 * decide if its an INFO, ACK or NACK packet.
2897	 */
2898	if (env == VIO_DRING_DATA) {
2899		vsw_process_data_dring_pkt(ldcp, dpkt);
2900	} else if (env == VIO_PKT_DATA) {
2901		vsw_process_data_raw_pkt(ldcp, dpkt);
2902	} else if (env == VIO_DESC_DATA) {
2903		vsw_process_data_ibnd_pkt(ldcp, dpkt);
2904	} else {
2905		DERR(vswp, "%s: unknown vio_subtype_env (%x)\n", __func__, env);
2906	}
2907
2908	mutex_exit(&ldcp->ldc_rxlock);
2909	mutex_enter(&ldcp->ldc_cblock);
2910
2911	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
2912}
2913
2914#define	SND_DRING_NACK(ldcp, pkt) \
2915	do { \
2916		pkt->tag.vio_subtype = VIO_SUBTYPE_NACK; \
2917		pkt->tag.vio_sid = ldcp->local_session; \
2918		(void) vsw_send_msg(ldcp, (void *)pkt, \
		    sizeof (vio_dring_msg_t), B_TRUE); \
	} while (0)
2919
2920static void
2921vsw_process_data_dring_pkt(vsw_ldc_t *ldcp, void *dpkt)
2922{
2923	vio_dring_msg_t		*dring_pkt;
2924	vnet_public_desc_t	*pub_addr = NULL;
2925	vsw_private_desc_t	*priv_addr = NULL;
2926	dring_info_t		*dp = NULL;
2927	vsw_t			*vswp = ldcp->ldc_vswp;
2928	mblk_t			*mp = NULL;
2929	mblk_t			*bp = NULL;
2930	mblk_t			*bpt = NULL;
2931	size_t			nbytes = 0;
2932	uint64_t		ncookies = 0;
2933	uint64_t		chain = 0;
2934	uint64_t		len;
2935	uint32_t		pos, start, datalen;
2936	uint32_t		range_start, range_end;
2937	int32_t			end, num, cnt = 0;
2938	int			i, rv, msg_rv = 0;
2939	boolean_t		ack_needed = B_FALSE;
2940	boolean_t		prev_desc_ack = B_FALSE;
2941	int			read_attempts = 0;
2942	struct ether_header	*ehp;
2943
2944	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
2945
2946	/*
2947	 * We know this is a data/dring packet so
2948	 * cast it into the correct structure.
2949	 */
2950	dring_pkt = (vio_dring_msg_t *)dpkt;
2951
2952	/*
2953	 * Switch on the vio_subtype. If it's an INFO then we need to
2954	 * process the data. If it's an ACK we need to make sure
2955	 * it makes sense (i.e. did we send an earlier data/info),
2956	 * and if it's a NACK then we may attempt a retry.
2957	 */
2958	switch (dring_pkt->tag.vio_subtype) {
2959	case VIO_SUBTYPE_INFO:
2960		D2(vswp, "%s(%lld): VIO_SUBTYPE_INFO", __func__, ldcp->ldc_id);
2961
2962		READ_ENTER(&ldcp->lane_in.dlistrw);
2963		if ((dp = vsw_ident2dring(&ldcp->lane_in,
2964		    dring_pkt->dring_ident)) == NULL) {
2965			RW_EXIT(&ldcp->lane_in.dlistrw);
2966
2967			DERR(vswp, "%s(%lld): unable to find dring from "
2968			    "ident 0x%llx", __func__, ldcp->ldc_id,
2969			    dring_pkt->dring_ident);
2970
2971			SND_DRING_NACK(ldcp, dring_pkt);
2972			return;
2973		}
2974
2975		start = pos = dring_pkt->start_idx;
2976		end = dring_pkt->end_idx;
2977		len = dp->num_descriptors;
2978
2979		range_start = range_end = pos;
2980
2981		D2(vswp, "%s(%lld): start index %ld : end %ld\n",
2982		    __func__, ldcp->ldc_id, start, end);
2983
2984		if (end == -1) {
2985			num = -1;
2986		} else if (end >= 0) {
2987			num = end >= pos ? end - pos + 1: (len - pos + 1) + end;
2988
2989			/* basic sanity check: valid indices are 0 .. len - 1 */
2990			if (end >= len) {
2991				RW_EXIT(&ldcp->lane_in.dlistrw);
2992				DERR(vswp, "%s(%lld): endpoint %lld outside "
2993				    "ring length %lld", __func__,
2994				    ldcp->ldc_id, end, len);
2995
2996				SND_DRING_NACK(ldcp, dring_pkt);
2997				return;
2998			}
2999		} else {
3000			RW_EXIT(&ldcp->lane_in.dlistrw);
3001			DERR(vswp, "%s(%lld): invalid endpoint %lld",
3002			    __func__, ldcp->ldc_id, end);
3003			SND_DRING_NACK(ldcp, dring_pkt);
3004			return;
3005		}
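		/*
		 * Worked example of the count above (illustrative):
		 * with len = 10, start = pos = 5 and end = 2 the peer
		 * has readied descriptors 5..9 and 0..2, and
		 * num = (10 - 5 + 1) + 2 = 8.
		 */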
3006
3007		while (cnt != num) {
3008vsw_recheck_desc:
3009			if ((rv = ldc_mem_dring_acquire(dp->handle,
3010			    pos, pos)) != 0) {
3011				RW_EXIT(&ldcp->lane_in.dlistrw);
3012				DERR(vswp, "%s(%lld): unable to acquire "
3013				    "descriptor at pos %d: err %d",
3014				    __func__, ldcp->ldc_id, pos, rv);
3015				SND_DRING_NACK(ldcp, dring_pkt);
3016				ldcp->ldc_stats.ierrors++;
3017				return;
3018			}
3019
3020			pub_addr = (vnet_public_desc_t *)dp->pub_addr + pos;
3021
3022			/*
3023			 * When given a bounded range of descriptors
3024			 * to process, it's an error to hit a descriptor
3025			 * which is not ready. In the non-bounded case
3026			 * (end_idx == -1) this simply indicates we have
3027			 * reached the end of the current active range.
3028			 */
3029			if (pub_addr->hdr.dstate != VIO_DESC_READY) {
3030				/* unbound - no error */
3031				if (end == -1) {
3032					if (read_attempts == vsw_read_attempts)
3033						break;
3034
3035					delay(drv_usectohz(vsw_desc_delay));
3036					read_attempts++;
3037					goto vsw_recheck_desc;
3038				}
3039
3040				/* bounded - error - so NACK back */
3041				RW_EXIT(&ldcp->lane_in.dlistrw);
3042				DERR(vswp, "%s(%lld): descriptor not READY "
3043				    "(%d)", __func__, ldcp->ldc_id,
3044				    pub_addr->hdr.dstate);
3045				SND_DRING_NACK(ldcp, dring_pkt);
3046				return;
3047			}
3048
3049			DTRACE_PROBE1(read_attempts, int, read_attempts);
3050
3051			range_end = pos;
3052
3053			/*
3054			 * If we ACK'd the previous descriptor then now
3055			 * record the new range start position for later
3056			 * ACK's.
3057			 */
3058			if (prev_desc_ack) {
3059				range_start = pos;
3060
3061				D2(vswp, "%s(%lld): updating range start to be "
3062				    "%d", __func__, ldcp->ldc_id, range_start);
3063
3064				prev_desc_ack = B_FALSE;
3065			}
3066
3067			/*
3068			 * Data is padded to align on 8 byte boundary,
3069			 * datalen is actual data length, i.e. minus that
3070			 * padding.
3071			 */
3072			datalen = pub_addr->nbytes;
3073
3074			/*
3075			 * Does the peer wish us to ACK when we have finished
3076			 * with this descriptor?
3077			 */
3078			if (pub_addr->hdr.ack)
3079				ack_needed = B_TRUE;
3080
3081			D2(vswp, "%s(%lld): processing desc at pos %lld"
3082			    " addr 0x%llx : dstate 0x%lx : datalen 0x%lx",
3083			    __func__, ldcp->ldc_id, pos, pub_addr,
3084			    pub_addr->hdr.dstate, datalen);
3085
3086			/*
3087			 * Mark that we are starting to process descriptor.
3088			 */
3089			pub_addr->hdr.dstate = VIO_DESC_ACCEPTED;
3090
3091			/*
3092			 * Ensure that we ask ldc for an aligned
3093			 * number of bytes.
3094			 */
3095			nbytes = (datalen + VNET_IPALIGN + 7) & ~7;
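			/*
			 * e.g. datalen + VNET_IPALIGN = 61 gives
			 * nbytes = (61 + 7) & ~7 = 64, the next
			 * multiple of 8 as ldc_mem_copy requires.
			 */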
3096
3097			mp = vio_multipool_allocb(&ldcp->vmp, nbytes);
3098			if (mp == NULL) {
3099				ldcp->ldc_stats.rx_vio_allocb_fail++;
3100				/*
3101				 * No free receive buffers available, so
3102				 * fallback onto allocb(9F). Make sure that
3103				 * we get a data buffer which is a multiple
3104				 * of 8 as this is required by ldc_mem_copy.
3105				 */
3106				DTRACE_PROBE(allocb);
3107				if ((mp = allocb(datalen + VNET_IPALIGN + 8,
3108				    BPRI_MED)) == NULL) {
3109					DERR(vswp, "%s(%ld): allocb failed",
3110					    __func__, ldcp->ldc_id);
3111					pub_addr->hdr.dstate = VIO_DESC_DONE;
3112					(void) ldc_mem_dring_release(dp->handle,
3113					    pos, pos);
3114					ldcp->ldc_stats.ierrors++;
3115					ldcp->ldc_stats.rx_allocb_fail++;
3116					break;
3117				}
3118			}
3119
3120			ncookies = pub_addr->ncookies;
3121			rv = ldc_mem_copy(ldcp->ldc_handle,
3122			    (caddr_t)mp->b_rptr, 0, &nbytes,
3123			    pub_addr->memcookie, ncookies, LDC_COPY_IN);
3124
3125			if (rv != 0) {
3126				DERR(vswp, "%s(%d): unable to copy in data "
3127				    "from %d cookies in desc %d (rv %d)",
3128				    __func__, ldcp->ldc_id, ncookies, pos, rv);
3129				freemsg(mp);
3130
3131				pub_addr->hdr.dstate = VIO_DESC_DONE;
3132				(void) ldc_mem_dring_release(dp->handle,
3133				    pos, pos);
3134				ldcp->ldc_stats.ierrors++;
3135				break;
3136			} else {
3137				D2(vswp, "%s(%d): copied in %ld bytes"
3138				    " using %d cookies", __func__,
3139				    ldcp->ldc_id, nbytes, ncookies);
3140			}
3141
3142			/* adjust the read pointer to skip over the padding */
3143			mp->b_rptr += VNET_IPALIGN;
3144
3145			/* point to the actual end of data */
3146			mp->b_wptr = mp->b_rptr + datalen;
3147
3148			/* update statistics */
3149			ehp = (struct ether_header *)mp->b_rptr;
3150			if (IS_BROADCAST(ehp))
3151				ldcp->ldc_stats.brdcstrcv++;
3152			else if (IS_MULTICAST(ehp))
3153				ldcp->ldc_stats.multircv++;
3154
3155			ldcp->ldc_stats.ipackets++;
3156			ldcp->ldc_stats.rbytes += datalen;
3157
3158			/* build a chain of received packets */
3159			if (bp == NULL) {
3160				/* first pkt */
3161				bp = mp;
3162				bp->b_next = bp->b_prev = NULL;
3163				bpt = bp;
3164				chain = 1;
3165			} else {
3166				mp->b_next = mp->b_prev = NULL;
3167				bpt->b_next = mp;
3168				bpt = mp;
3169				chain++;
3170			}
3171
3172			/* mark we are finished with this descriptor */
3173			pub_addr->hdr.dstate = VIO_DESC_DONE;
3174
3175			(void) ldc_mem_dring_release(dp->handle, pos, pos);
3176
3177			/*
3178			 * Send an ACK back to peer if requested.
3179			 */
3180			if (ack_needed) {
3181				ack_needed = B_FALSE;
3182
3183				dring_pkt->start_idx = range_start;
3184				dring_pkt->end_idx = range_end;
3185
3186				DERR(vswp, "%s(%lld): processed %d %d, ACK"
3187				    " requested", __func__, ldcp->ldc_id,
3188				    dring_pkt->start_idx, dring_pkt->end_idx);
3189
3190				dring_pkt->dring_process_state = VIO_DP_ACTIVE;
3191				dring_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
3192				dring_pkt->tag.vio_sid = ldcp->local_session;
3193
3194				msg_rv = vsw_send_msg(ldcp, (void *)dring_pkt,
3195				    sizeof (vio_dring_msg_t), B_FALSE);
3196
3197				/*
3198				 * Check if ACK was successfully sent. If not
3199				 * we break and deal with that below.
3200				 */
3201				if (msg_rv != 0)
3202					break;
3203
3204				prev_desc_ack = B_TRUE;
3205				range_start = pos;
3206			}
3207
3208			/* next descriptor */
3209			pos = (pos + 1) % len;
3210			cnt++;
3211
3212			/*
3213			 * Break out of loop here and stop processing to
3214			 * allow some other network device (or disk) to
3215			 * get access to the cpu.
3216			 */
3217			if (chain > vsw_chain_len) {
3218				D3(vswp, "%s(%lld): switching chain of %d "
3219				    "msgs", __func__, ldcp->ldc_id, chain);
3220				break;
3221			}
3222		}
3223		RW_EXIT(&ldcp->lane_in.dlistrw);
3224
3225		/*
3226		 * If when we attempted to send the ACK we found that the
3227		 * channel had been reset then now handle this. We deal with
3228		 * it here as we cannot reset the channel while holding the
3229		 * dlistrw lock, and we don't want to acquire/release it
3230		 * continuously in the above loop, as a channel reset should
3231		 * be a rare event.
3232		 */
3233		if (msg_rv == ECONNRESET) {
3234			vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
3235			break;
3236		}
3237
3238		/* send the chain of packets to be switched */
3239		if (bp != NULL) {
3240			DTRACE_PROBE1(vsw_rcv_msgs, int, chain);
3241			D3(vswp, "%s(%lld): switching chain of %d msgs",
3242			    __func__, ldcp->ldc_id, chain);
3243			vswp->vsw_switch_frame(vswp, bp, VSW_VNETPORT,
3244			    ldcp->ldc_port, NULL);
3245		}
3246
3247		DTRACE_PROBE1(msg_cnt, int, cnt);
3248
3249		/*
3250		 * We are now finished so ACK back with the state
3251		 * set to STOPPING so our peer knows we are finished
3252		 */
3253		dring_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
3254		dring_pkt->tag.vio_sid = ldcp->local_session;
3255
3256		dring_pkt->dring_process_state = VIO_DP_STOPPED;
3257
3258		DTRACE_PROBE(stop_process_sent);
3259
3260		/*
3261		 * We have not processed any more descriptors beyond
3262		 * the last one we ACK'd.
3263		 */
3264		if (prev_desc_ack)
3265			range_start = range_end;
3266
3267		dring_pkt->start_idx = range_start;
3268		dring_pkt->end_idx = range_end;
3269
3270		D2(vswp, "%s(%lld) processed : %d : %d, now stopping",
3271		    __func__, ldcp->ldc_id, dring_pkt->start_idx,
3272		    dring_pkt->end_idx);
3273
3274		(void) vsw_send_msg(ldcp, (void *)dring_pkt,
3275		    sizeof (vio_dring_msg_t), B_TRUE);
3276		break;
3277
3278	case VIO_SUBTYPE_ACK:
3279		D2(vswp, "%s(%lld): VIO_SUBTYPE_ACK", __func__, ldcp->ldc_id);
3280		/*
3281		 * Verify that the relevant descriptors are all
3282		 * marked as DONE
3283		 */
3284		READ_ENTER(&ldcp->lane_out.dlistrw);
3285		if ((dp = vsw_ident2dring(&ldcp->lane_out,
3286		    dring_pkt->dring_ident)) == NULL) {
3287			RW_EXIT(&ldcp->lane_out.dlistrw);
3288			DERR(vswp, "%s: unknown ident in ACK", __func__);
3289			return;
3290		}
3291
3293		start = dring_pkt->start_idx;
3294		end = dring_pkt->end_idx;
3295		len = dp->num_descriptors;
3296
3298		mutex_enter(&dp->dlock);
3299		dp->last_ack_recv = end;
3300		ldcp->ldc_stats.dring_data_acks++;
3301		mutex_exit(&dp->dlock);
3302
3303		(void) vsw_reclaim_dring(dp, start);
3304
3305		/*
3306		 * If our peer is stopping processing descriptors then
3307		 * we check to make sure it has processed all the descriptors
3308		 * we have updated. If not then we send it a new message
3309		 * to prompt it to restart.
3310		 */
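		/*
		 * Illustrative sequence: the peer ACKs [10,20] with state
		 * STOPPED; if descriptor 21 is already VIO_DESC_READY we
		 * send a DRING_DATA INFO with start_idx = 21 and
		 * end_idx = -1 to restart it, otherwise we set
		 * restart_reqd so the next transmit sends that prompt.
		 */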
3311		if (dring_pkt->dring_process_state == VIO_DP_STOPPED) {
3312			DTRACE_PROBE(stop_process_recv);
3313			D2(vswp, "%s(%lld): got stopping msg : %d : %d",
3314			    __func__, ldcp->ldc_id, dring_pkt->start_idx,
3315			    dring_pkt->end_idx);
3316
3317			/*
3318			 * Check next descriptor in public section of ring.
3319			 * If its marked as READY then we need to prompt our
3320			 * peer to start processing the ring again.
3321			 */
3322			i = (end + 1) % len;
3323			pub_addr = (vnet_public_desc_t *)dp->pub_addr + i;
3324			priv_addr = (vsw_private_desc_t *)dp->priv_addr + i;
3325
3326			/*
3327			 * Hold the restart lock across all of this to
3328			 * make sure that its not possible for us to
3329			 * decide that a msg needs to be sent in the future
3330			 * but the sending code having already checked is
3331			 * about to exit.
3332			 */
3333			mutex_enter(&dp->restart_lock);
3334			ldcp->ldc_stats.dring_stopped_acks++;
3335			mutex_enter(&priv_addr->dstate_lock);
3336			if (pub_addr->hdr.dstate == VIO_DESC_READY) {
3337
3338				mutex_exit(&priv_addr->dstate_lock);
3339
3340				dring_pkt->tag.vio_subtype = VIO_SUBTYPE_INFO;
3341				dring_pkt->tag.vio_sid = ldcp->local_session;
3342
3343				dring_pkt->seq_num =
3344				    atomic_inc_64_nv(&ldcp->lane_out.seq_num);
3345
3346				dring_pkt->start_idx = (end + 1) % len;
3347				dring_pkt->end_idx = -1;
3348
3349				D2(vswp, "%s(%lld) : sending restart msg:"
3350				    " %d : %d", __func__, ldcp->ldc_id,
3351				    dring_pkt->start_idx, dring_pkt->end_idx);
3352
3353				msg_rv = vsw_send_msg(ldcp, (void *)dring_pkt,
3354				    sizeof (vio_dring_msg_t), B_FALSE);
3355				ldcp->ldc_stats.dring_data_msgs++;
3356
3357			} else {
3358				mutex_exit(&priv_addr->dstate_lock);
3359				dp->restart_reqd = B_TRUE;
3360			}
3361			mutex_exit(&dp->restart_lock);
3362		}
3363		RW_EXIT(&ldcp->lane_out.dlistrw);
3364
3365		/* only do channel reset after dropping dlistrw lock */
3366		if (msg_rv == ECONNRESET)
3367			vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
3368
3369		break;
3370
3371	case VIO_SUBTYPE_NACK:
3372		DWARN(vswp, "%s(%lld): VIO_SUBTYPE_NACK",
3373		    __func__, ldcp->ldc_id);
3374		/*
3375		 * Something is badly wrong if we are getting NACK's
3376		 * for our data pkts. So reset the channel.
3377		 */
3378		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
3379
3380		break;
3381
3382	default:
3383		DERR(vswp, "%s(%lld): Unknown vio_subtype %x\n", __func__,
3384		    ldcp->ldc_id, dring_pkt->tag.vio_subtype);
3385	}
3386
3387	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
3388}
3389
3390/*
3391 * VIO_PKT_DATA (a.k.a. raw data mode)
3392 *
3393 * Note - currently not supported. Do nothing.
3394 */
3395static void
3396vsw_process_data_raw_pkt(vsw_ldc_t *ldcp, void *dpkt)
3397{
3398	_NOTE(ARGUNUSED(dpkt))
3399
3400	D1(NULL, "%s (%lld): enter\n", __func__, ldcp->ldc_id);
3401	DERR(NULL, "%s (%lld): currently unsupported", __func__, ldcp->ldc_id);
3402	D1(NULL, "%s (%lld): exit\n", __func__, ldcp->ldc_id);
3403}
3404
3405/*
3406 * Process an in-band descriptor message (most likely from
3407 * OBP).
3408 */
3409static void
3410vsw_process_data_ibnd_pkt(vsw_ldc_t *ldcp, void *pkt)
3411{
3412	vnet_ibnd_desc_t	*ibnd_desc;
3413	dring_info_t		*dp = NULL;
3414	vsw_private_desc_t	*priv_addr = NULL;
3415	vsw_t			*vswp = ldcp->ldc_vswp;
3416	mblk_t			*mp = NULL;
3417	size_t			nbytes = 0;
3418	size_t			off = 0;
3419	uint64_t		idx = 0;
3420	uint32_t		num = 1, len, datalen = 0;
3421	uint64_t		ncookies = 0;
3422	int			i, rv;
3423	int			j = 0;
3424
3425	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
3426
3427	ibnd_desc = (vnet_ibnd_desc_t *)pkt;
3428
3429	switch (ibnd_desc->hdr.tag.vio_subtype) {
3430	case VIO_SUBTYPE_INFO:
3431		D1(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
3432
3433		if (vsw_check_flag(ldcp, INBOUND, VSW_DRING_INFO_RECV))
3434			return;
3435
3436		/*
3437		 * Data is padded to align on a 8 byte boundary,
3438		 * nbytes is actual data length, i.e. minus that
3439		 * padding.
3440		 */
3441		datalen = ibnd_desc->nbytes;
3442
3443		D2(vswp, "%s(%lld): processing inband desc : "
3444		    ": datalen 0x%lx", __func__, ldcp->ldc_id, datalen);
3445
3446		ncookies = ibnd_desc->ncookies;
3447
3448		/*
3449		 * allocb(9F) returns an aligned data block. We
3450		 * need to ensure that we ask ldc for an aligned
3451		 * number of bytes also.
3452		 */
3453		nbytes = datalen;
3454		if (nbytes & 0x7) {
3455			off = 8 - (nbytes & 0x7);
3456			nbytes += off;
3457		}
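		/*
		 * The adjustment above is equivalent to
		 * nbytes = (nbytes + 7) & ~7, i.e. round up to the next
		 * multiple of 8 (e.g. 61 -> 64, while 64 is unchanged).
		 */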
3458
		/* allocate the aligned size so ldc_mem_copy cannot overrun */
3459		mp = allocb(nbytes, BPRI_MED);
3460		if (mp == NULL) {
3461			DERR(vswp, "%s(%lld): allocb failed",
3462			    __func__, ldcp->ldc_id);
3463			ldcp->ldc_stats.rx_allocb_fail++;
3464			return;
3465		}
3466
3467		rv = ldc_mem_copy(ldcp->ldc_handle, (caddr_t)mp->b_rptr,
3468		    0, &nbytes, ibnd_desc->memcookie, (uint64_t)ncookies,
3469		    LDC_COPY_IN);
3470
3471		if (rv != 0) {
3472			DERR(vswp, "%s(%d): unable to copy in data from "
3473			    "%d cookie(s)", __func__, ldcp->ldc_id, ncookies);
3474			freemsg(mp);
3475			ldcp->ldc_stats.ierrors++;
3476			return;
3477		}
3478
3479		D2(vswp, "%s(%d): copied in %ld bytes using %d cookies",
3480		    __func__, ldcp->ldc_id, nbytes, ncookies);
3481
3482		/* point to the actual end of data */
3483		mp->b_wptr = mp->b_rptr + datalen;
3484		ldcp->ldc_stats.ipackets++;
3485		ldcp->ldc_stats.rbytes += datalen;
3486
3487		/*
3488		 * We ACK back every in-band descriptor message we process
3489		 */
3490		ibnd_desc->hdr.tag.vio_subtype = VIO_SUBTYPE_ACK;
3491		ibnd_desc->hdr.tag.vio_sid = ldcp->local_session;
3492		(void) vsw_send_msg(ldcp, (void *)ibnd_desc,
3493		    sizeof (vnet_ibnd_desc_t), B_TRUE);
3494
3495		/* send the packet to be switched */
3496		vswp->vsw_switch_frame(vswp, mp, VSW_VNETPORT,
3497		    ldcp->ldc_port, NULL);
3498
3499		break;
3500
3501	case VIO_SUBTYPE_ACK:
3502		D1(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
3503
3504		/* Verify the ACK is valid */
3505		idx = ibnd_desc->hdr.desc_handle;
3506
3507		if (idx >= VSW_RING_NUM_EL) {
3508			cmn_err(CE_WARN, "!vsw%d: corrupted ACK received "
3509			    "(idx %ld)", vswp->instance, idx);
3510			return;
3511		}
3512
3513		if ((dp = ldcp->lane_out.dringp) == NULL) {
3514			DERR(vswp, "%s: no dring found", __func__);
3515			return;
3516		}
3517
3518		len = dp->num_descriptors;
3519		/*
3520		 * If the descriptor we are being ACK'ed for is not the
3521		 * one we expected, then pkts were lost somewhere, either
3522		 * when we tried to send a msg, or a previous ACK msg from
3523		 * our peer. In either case we now reclaim the descriptors
3524		 * in the range from the last ACK we received up to the
3525		 * current ACK.
3526		 */
3527		if (idx != dp->last_ack_recv) {
3528			DWARN(vswp, "%s: dropped pkts detected, (%ld, %ld)",
3529			    __func__, dp->last_ack_recv, idx);
3530			num = idx >= dp->last_ack_recv ?
3531			    idx - dp->last_ack_recv + 1:
3532			    (len - dp->last_ack_recv + 1) + idx;
3533		}
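		/*
		 * Worked example (illustrative): with len = 512,
		 * last_ack_recv = 510 and idx = 1, the wrap case gives
		 * num = (512 - 510 + 1) + 1 = 4, so descriptors
		 * 510, 511, 0 and 1 are reclaimed below.
		 */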
3534
3535		/*
3536		 * When we sent the in-band message to our peer we
3537		 * marked the copy in our private ring as READY. We now
3538		 * check that the descriptor we are being ACK'ed for is in
3539		 * fact READY, i.e. it is one we have shared with our peer.
3540		 *
3541		 * If it's not, we flag an error but still reset the descriptor
3542		 * back to FREE.
3543		 */
3544		for (i = dp->last_ack_recv; j < num; i = (i + 1) % len, j++) {
3545			priv_addr = (vsw_private_desc_t *)dp->priv_addr + i;
3546			mutex_enter(&priv_addr->dstate_lock);
3547			if (priv_addr->dstate != VIO_DESC_READY) {
3548				DERR(vswp, "%s: (%ld) desc at index %ld not "
3549				    "READY (0x%lx)", __func__,
3550				    ldcp->ldc_id, idx, priv_addr->dstate);
3551				DERR(vswp, "%s: bound %d: ncookies %ld : "
3552				    "datalen %ld", __func__,
3553				    priv_addr->bound, priv_addr->ncookies,
3554				    priv_addr->datalen);
3555			}
3556			D2(vswp, "%s: (%lld) freeing descp at %lld", __func__,
3557			    ldcp->ldc_id, idx);
3558			/* release resources associated with sent msg */
3559			priv_addr->datalen = 0;
3560			priv_addr->dstate = VIO_DESC_FREE;
3561			mutex_exit(&priv_addr->dstate_lock);
3562		}
3563		/* update to next expected value */
3564		dp->last_ack_recv = (idx + 1) % dp->num_descriptors;
3565
3566		break;
3567
3568	case VIO_SUBTYPE_NACK:
3569		DERR(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
3570
3571		/*
3572		 * We should only get a NACK if our peer doesn't like
3573		 * something about a message we have sent it. If this
3574		 * happens we just release the resources associated with
3575		 * the message. (We are relying on higher layers to decide
3576		 * whether or not to resend.)
3577		 */
3578
3579		/* limit check */
3580		idx = ibnd_desc->hdr.desc_handle;
3581
3582		if (idx >= VSW_RING_NUM_EL) {
3583			DERR(vswp, "%s: corrupted NACK received (idx %lld)",
3584			    __func__, idx);
3585			return;
3586		}
3587
3588		if ((dp = ldcp->lane_out.dringp) == NULL) {
3589			DERR(vswp, "%s: no dring found", __func__);
3590			return;
3591		}
3592
3593		priv_addr = (vsw_private_desc_t *)dp->priv_addr;
3594
3595		/* move to correct location in ring */
3596		priv_addr += idx;
3597
3598		/* release resources associated with sent msg */
3599		mutex_enter(&priv_addr->dstate_lock);
3600		priv_addr->datalen = 0;
3601		priv_addr->dstate = VIO_DESC_FREE;
3602		mutex_exit(&priv_addr->dstate_lock);
3603
3604		break;
3605
3606	default:
3607		DERR(vswp, "%s(%lld): Unknown vio_subtype %x\n", __func__,
3608		    ldcp->ldc_id, ibnd_desc->hdr.tag.vio_subtype);
3609	}
3610
3611	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
3612}
3613
3614static void
3615vsw_process_err_pkt(vsw_ldc_t *ldcp, void *epkt, vio_msg_tag_t tag)
3616{
3617	_NOTE(ARGUNUSED(epkt))
3618
3619	vsw_t		*vswp = ldcp->ldc_vswp;
3620	uint16_t	env = tag.vio_subtype_env;
3621
3622	D1(vswp, "%s (%lld): enter\n", __func__, ldcp->ldc_id);
3623
3624	/*
3625	 * Error vio_subtypes have yet to be defined. So for
3626	 * the moment we can't do anything.
3627	 */
3628	D2(vswp, "%s: (%x) vio_subtype env", __func__, env);
3629
3630	D1(vswp, "%s (%lld): exit\n", __func__, ldcp->ldc_id);
3631}
3632
3633/* transmit the packet over the given port */
3634int
3635vsw_portsend(vsw_port_t *port, mblk_t *mp, mblk_t *mpt)
3636{
3637	vsw_ldc_list_t 	*ldcl = &port->p_ldclist;
3638	vsw_ldc_t 	*ldcp;
3639	mblk_t		*tmp;
3640	int		status = 0;
3641
3642	READ_ENTER(&ldcl->lockrw);
3643	/*
3644	 * Note: for now, we have a single channel.
3645	 */
3646	ldcp = ldcl->head;
3647	if (ldcp == NULL) {
3648		DERR(port->p_vswp, "vsw_portsend: no ldc: dropping packet\n");
3649		freemsg(mp);
3650		RW_EXIT(&ldcl->lockrw);
3651		return (1);
3652	}
3653
3654	/*
3655	 * If the TX thread is enabled, then queue the packets
3656	 * and signal the tx thread.
3657	 */
3658	if (ldcp->tx_thread != NULL) {
3659		mutex_enter(&ldcp->tx_thr_lock);
3660		if (ldcp->tx_mhead == NULL) {
3661			ldcp->tx_mhead = mp;
3662			ldcp->tx_mtail = mpt;
3663			cv_signal(&ldcp->tx_thr_cv);
3664		} else {
3665			ldcp->tx_mtail->b_next = mp;
3666			ldcp->tx_mtail = mpt;
3667		}
3668		mutex_exit(&ldcp->tx_thr_lock);
3669	} else {
3670		while (mp != NULL) {
3671			tmp = mp->b_next;
3672			mp->b_next = mp->b_prev = NULL;
3673			(void) vsw_ldcsend(ldcp, mp, 1);
3674			mp = tmp;
3675		}
3676	}
3677
3678	RW_EXIT(&ldcl->lockrw);
3679
3680	return (status);
3681}
3682
3683/*
3684 * Transmit the packet over the given LDC channel.
3685 *
3686 * The 'retries' argument indicates how many times a packet
3687 * is retried before it is dropped. Note that the retry is done
3688 * only for resource-related failures; for all other failures
3689 * the packet is dropped immediately.
3690 *
3691 * The 'tx_failures' counter is used as a mechanism to track
3692 * continuous failures. Once these failures exceed the
3693 * 'vsw_ldc_tx_max_failures' tunable, the packets are tried only
3694 * once and then they are dropped. This is done to avoid
3695 * buffering too many packets.
3696 */
3697static int
3698vsw_ldcsend(vsw_ldc_t *ldcp, mblk_t *mp, int retries)
3699{
3700	int i;
3701	int rc;
3702	int status = 0;
3703	vsw_port_t *port = ldcp->ldc_port;
3704	dring_info_t *dp = NULL;
3705
3706
3707	for (i = 0; i < retries; ) {
3708		/*
3709		 * Send the message out using the appropriate
3710		 * transmit function which will free mblock when it
3711		 * is finished with it.
3712		 */
3713		mutex_enter(&port->tx_lock);
3714		if (port->transmit != NULL) {
3715			status = (*port->transmit)(ldcp, mp);
3716		}
3717		if (status == LDC_TX_SUCCESS) {
3718			ldcp->tx_failures = 0;
3719			mutex_exit(&port->tx_lock);
3720			break;
3721		} else if (ldcp->tx_failures > vsw_ldc_tx_max_failures) {
3722			/*
3723			 * If the failures crossed the threshold then
3724			 * break here.
3725			 */
3726			ldcp->ldc_stats.oerrors++;
3727			mutex_exit(&port->tx_lock);
3728			break;
3729		} else {
3730			ldcp->tx_failures++;
3731		}
3732		i++;	/* increment the counter here */
3733
3734		/* If its the last retry, then update the oerror */
3735		if ((i == retries) && (status == LDC_TX_NORESOURCES)) {
3736			ldcp->ldc_stats.oerrors++;
3737		}
3738		mutex_exit(&port->tx_lock);
3739
3740		if (status != LDC_TX_NORESOURCES) {
3741			/*
3742			 * No retrying required for errors un-related
3743			 * to resources.
3744			 */
3745			break;
3746		}
3747		READ_ENTER(&ldcp->lane_out.dlistrw);
3748		if (((dp = ldcp->lane_out.dringp) != NULL) &&
3749		    (ldcp->lane_out.xfer_mode == VIO_DRING_MODE)) {
3750			rc = vsw_reclaim_dring(dp, dp->end_idx);
3751		} else {
			/*
			 * If there is no dring, or the xfer_mode is
			 * set to DESC_MODE (i.e., OBP), simply break here.
			 */
3756			RW_EXIT(&ldcp->lane_out.dlistrw);
3757			break;
3758		}
3759		RW_EXIT(&ldcp->lane_out.dlistrw);
3760
		/*
		 * Delay only if no descriptors were reclaimed
		 * and this is not the last retry.
		 */
3765		if ((rc == 0) && (i < retries)) {
3766			delay(drv_usectohz(vsw_ldc_tx_delay));
3767		}
3768	}
3769	freemsg(mp);
3770	return (status);
3771}
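
/*
 * Note: vsw_portsend() invokes vsw_ldcsend() directly with a retry
 * count of 1 (i.e. a single attempt), while the dedicated TX worker
 * thread (vsw_ldc_tx_worker) uses the vsw_ldc_tx_retries tunable.
 */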
3772
3773/*
3774 * Send packet out via descriptor ring to a logical device.
3775 */
3776static int
3777vsw_dringsend(vsw_ldc_t *ldcp, mblk_t *mp)
3778{
3779	vio_dring_msg_t		dring_pkt;
3780	dring_info_t		*dp = NULL;
3781	vsw_private_desc_t	*priv_desc = NULL;
3782	vnet_public_desc_t	*pub = NULL;
3783	vsw_t			*vswp = ldcp->ldc_vswp;
3784	mblk_t			*bp;
3785	size_t			n, size;
3786	caddr_t			bufp;
3787	int			idx;
3788	int			status = LDC_TX_SUCCESS;
3789	struct ether_header	*ehp = (struct ether_header *)mp->b_rptr;
3790
3791	D1(vswp, "%s(%lld): enter\n", __func__, ldcp->ldc_id);
3792
3793	/* TODO: make test a macro */
3794	if ((!(ldcp->lane_out.lstate & VSW_LANE_ACTIVE)) ||
3795	    (ldcp->ldc_status != LDC_UP) || (ldcp->ldc_handle == NULL)) {
3796		DWARN(vswp, "%s(%lld) status(%d) lstate(0x%llx), dropping "
3797		    "packet\n", __func__, ldcp->ldc_id, ldcp->ldc_status,
3798		    ldcp->lane_out.lstate);
3799		ldcp->ldc_stats.oerrors++;
3800		return (LDC_TX_FAILURE);
3801	}
3802
3803	/*
3804	 * Note - using first ring only, this may change
3805	 * in the future.
3806	 */
3807	READ_ENTER(&ldcp->lane_out.dlistrw);
3808	if ((dp = ldcp->lane_out.dringp) == NULL) {
3809		RW_EXIT(&ldcp->lane_out.dlistrw);
		DERR(vswp, "%s(%lld): no dring for outbound lane",
		    __func__, ldcp->ldc_id);
3812		ldcp->ldc_stats.oerrors++;
3813		return (LDC_TX_FAILURE);
3814	}
3815
3816	size = msgsize(mp);
3817	if (size > (size_t)ETHERMAX) {
3818		RW_EXIT(&ldcp->lane_out.dlistrw);
3819		DERR(vswp, "%s(%lld) invalid size (%ld)\n", __func__,
3820		    ldcp->ldc_id, size);
3821		ldcp->ldc_stats.oerrors++;
3822		return (LDC_TX_FAILURE);
3823	}
3824
3825	/*
3826	 * Find a free descriptor
3827	 *
3828	 * Note: for the moment we are assuming that we will only
3829	 * have one dring going from the switch to each of its
3830	 * peers. This may change in the future.
3831	 */
3832	if (vsw_dring_find_free_desc(dp, &priv_desc, &idx) != 0) {
3833		D2(vswp, "%s(%lld): no descriptor available for ring "
3834		    "at 0x%llx", __func__, ldcp->ldc_id, dp);
3835
3836		/* nothing more we can do */
3837		status = LDC_TX_NORESOURCES;
3838		ldcp->ldc_stats.tx_no_desc++;
3839		goto vsw_dringsend_free_exit;
3840	} else {
3841		D2(vswp, "%s(%lld): free private descriptor found at pos %ld "
3842		    "addr 0x%llx\n", __func__, ldcp->ldc_id, idx, priv_desc);
3843	}
3844
3845	/* copy data into the descriptor */
3846	bufp = priv_desc->datap;
3847	bufp += VNET_IPALIGN;
3848	for (bp = mp, n = 0; bp != NULL; bp = bp->b_cont) {
3849		n = MBLKL(bp);
3850		bcopy(bp->b_rptr, bufp, n);
3851		bufp += n;
3852	}
3853
3854	priv_desc->datalen = (size < (size_t)ETHERMIN) ? ETHERMIN : size;
3855
3856	pub = priv_desc->descp;
3857	pub->nbytes = priv_desc->datalen;
3858
3859	/* update statistics */
3860	if (IS_BROADCAST(ehp))
3861		ldcp->ldc_stats.brdcstxmt++;
3862	else if (IS_MULTICAST(ehp))
3863		ldcp->ldc_stats.multixmt++;
3864	ldcp->ldc_stats.opackets++;
3865	ldcp->ldc_stats.obytes += priv_desc->datalen;
3866
3867	mutex_enter(&priv_desc->dstate_lock);
3868	pub->hdr.dstate = VIO_DESC_READY;
3869	mutex_exit(&priv_desc->dstate_lock);
3870
3871	/*
3872	 * Determine whether or not we need to send a message to our
3873	 * peer prompting them to read our newly updated descriptor(s).
3874	 */
3875	mutex_enter(&dp->restart_lock);
3876	if (dp->restart_reqd) {
3877		dp->restart_reqd = B_FALSE;
3878		ldcp->ldc_stats.dring_data_msgs++;
3879		mutex_exit(&dp->restart_lock);
3880
3881		/*
3882		 * Send a vio_dring_msg to peer to prompt them to read
3883		 * the updated descriptor ring.
3884		 */
3885		dring_pkt.tag.vio_msgtype = VIO_TYPE_DATA;
3886		dring_pkt.tag.vio_subtype = VIO_SUBTYPE_INFO;
3887		dring_pkt.tag.vio_subtype_env = VIO_DRING_DATA;
3888		dring_pkt.tag.vio_sid = ldcp->local_session;
3889
3890		/* Note - for now using first ring */
3891		dring_pkt.dring_ident = dp->ident;
3892		dring_pkt.seq_num = atomic_inc_64_nv(&ldcp->lane_out.seq_num);
3893
3894		/*
		 * If last_ack_recv is -1 then we know we've not
		 * received any acks yet, so this must be the first
		 * msg sent; set the start to the beginning of the ring.
3898		 */
3899		mutex_enter(&dp->dlock);
3900		if (dp->last_ack_recv == -1) {
3901			dring_pkt.start_idx = 0;
3902		} else {
3903			dring_pkt.start_idx =
3904			    (dp->last_ack_recv + 1) % dp->num_descriptors;
3905		}
3906		dring_pkt.end_idx = -1;
3907		mutex_exit(&dp->dlock);
3908
3909		D3(vswp, "%s(%lld): dring 0x%llx : ident 0x%llx\n", __func__,
3910		    ldcp->ldc_id, dp, dring_pkt.dring_ident);
3911		D3(vswp, "%s(%lld): start %lld : end %lld : seq %lld\n",
3912		    __func__, ldcp->ldc_id, dring_pkt.start_idx,
3913		    dring_pkt.end_idx, dring_pkt.seq_num);
3914
3915		RW_EXIT(&ldcp->lane_out.dlistrw);
3916
3917		(void) vsw_send_msg(ldcp, (void *)&dring_pkt,
3918		    sizeof (vio_dring_msg_t), B_TRUE);
3919
3920		return (status);
3921
3922	} else {
3923		mutex_exit(&dp->restart_lock);
3924		D2(vswp, "%s(%lld): updating descp %d", __func__,
3925		    ldcp->ldc_id, idx);
3926	}
3927
3928vsw_dringsend_free_exit:
3929
3930	RW_EXIT(&ldcp->lane_out.dlistrw);
3931
3932	D1(vswp, "%s(%lld): exit\n", __func__, ldcp->ldc_id);
3933	return (status);
3934}
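
/*
 * For reference, the descriptor state transitions driven by the code
 * above and by vsw_reclaim_dring() are:
 *
 *	VIO_DESC_FREE  -> VIO_DESC_READY  (vsw_dring_find_free_desc)
 *	VIO_DESC_READY -> VIO_DESC_DONE   (set by the peer once it has
 *					   consumed the data)
 *	VIO_DESC_DONE  -> VIO_DESC_FREE   (vsw_reclaim_dring)
 *
 * The middle transition is inferred from the reclaim logic rather
 * than performed in this file.
 */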
3935
3936/*
3937 * Send an in-band descriptor message over ldc.
3938 */
3939static int
3940vsw_descrsend(vsw_ldc_t *ldcp, mblk_t *mp)
3941{
3942	vsw_t			*vswp = ldcp->ldc_vswp;
3943	vnet_ibnd_desc_t	ibnd_msg;
3944	vsw_private_desc_t	*priv_desc = NULL;
3945	dring_info_t		*dp = NULL;
3946	size_t			n, size = 0;
3947	caddr_t			bufp;
3948	mblk_t			*bp;
3949	int			idx, i;
3950	int			status = LDC_TX_SUCCESS;
3951	static int		warn_msg = 1;
3952
3953	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
3954
3955	ASSERT(mp != NULL);
3956
3957	if ((!(ldcp->lane_out.lstate & VSW_LANE_ACTIVE)) ||
3958	    (ldcp->ldc_status != LDC_UP) || (ldcp->ldc_handle == NULL)) {
3959		DERR(vswp, "%s(%lld) status(%d) state (0x%llx), dropping pkt",
3960		    __func__, ldcp->ldc_id, ldcp->ldc_status,
3961		    ldcp->lane_out.lstate);
3962		ldcp->ldc_stats.oerrors++;
3963		return (LDC_TX_FAILURE);
3964	}
3965
	/*
	 * Only a single dring is expected to exist; we use it
	 * as an internal buffer rather than as a transfer channel.
	 */
3970	READ_ENTER(&ldcp->lane_out.dlistrw);
3971	if ((dp = ldcp->lane_out.dringp) == NULL) {
3972		DERR(vswp, "%s(%lld): no dring for outbound lane",
3973		    __func__, ldcp->ldc_id);
3974		DERR(vswp, "%s(%lld) status(%d) state (0x%llx)", __func__,
3975		    ldcp->ldc_id, ldcp->ldc_status, ldcp->lane_out.lstate);
3976		RW_EXIT(&ldcp->lane_out.dlistrw);
3977		ldcp->ldc_stats.oerrors++;
3978		return (LDC_TX_FAILURE);
3979	}
3980
3981	size = msgsize(mp);
3982	if (size > (size_t)ETHERMAX) {
3983		RW_EXIT(&ldcp->lane_out.dlistrw);
3984		DERR(vswp, "%s(%lld) invalid size (%ld)\n", __func__,
3985		    ldcp->ldc_id, size);
3986		ldcp->ldc_stats.oerrors++;
3987		return (LDC_TX_FAILURE);
3988	}
3989
3990	/*
3991	 * Find a free descriptor in our buffer ring
3992	 */
3993	if (vsw_dring_find_free_desc(dp, &priv_desc, &idx) != 0) {
3994		RW_EXIT(&ldcp->lane_out.dlistrw);
3995		if (warn_msg) {
3996			DERR(vswp, "%s(%lld): no descriptor available for ring "
3997			    "at 0x%llx", __func__, ldcp->ldc_id, dp);
3998			warn_msg = 0;
3999		}
4000
4001		/* nothing more we can do */
4002		status = LDC_TX_NORESOURCES;
4003		goto vsw_descrsend_free_exit;
4004	} else {
4005		D2(vswp, "%s(%lld): free private descriptor found at pos "
		    "%ld addr 0x%llx\n", __func__, ldcp->ldc_id, idx, priv_desc);
4007		warn_msg = 1;
4008	}
4009
4010	/* copy data into the descriptor */
4011	bufp = priv_desc->datap;
4012	for (bp = mp, n = 0; bp != NULL; bp = bp->b_cont) {
4013		n = MBLKL(bp);
4014		bcopy(bp->b_rptr, bufp, n);
4015		bufp += n;
4016	}
4017
4018	priv_desc->datalen = (size < (size_t)ETHERMIN) ? ETHERMIN : size;
4019
4020	/* create and send the in-band descp msg */
4021	ibnd_msg.hdr.tag.vio_msgtype = VIO_TYPE_DATA;
4022	ibnd_msg.hdr.tag.vio_subtype = VIO_SUBTYPE_INFO;
4023	ibnd_msg.hdr.tag.vio_subtype_env = VIO_DESC_DATA;
4024	ibnd_msg.hdr.tag.vio_sid = ldcp->local_session;
4025
4026	ibnd_msg.hdr.seq_num = atomic_inc_64_nv(&ldcp->lane_out.seq_num);
4027
4028	/*
4029	 * Copy the mem cookies describing the data from the
4030	 * private region of the descriptor ring into the inband
4031	 * descriptor.
4032	 */
4033	for (i = 0; i < priv_desc->ncookies; i++) {
4034		bcopy(&priv_desc->memcookie[i], &ibnd_msg.memcookie[i],
4035		    sizeof (ldc_mem_cookie_t));
4036	}
4037
4038	ibnd_msg.hdr.desc_handle = idx;
4039	ibnd_msg.ncookies = priv_desc->ncookies;
4040	ibnd_msg.nbytes = size;
4041
4042	ldcp->ldc_stats.opackets++;
4043	ldcp->ldc_stats.obytes += size;
4044
4045	RW_EXIT(&ldcp->lane_out.dlistrw);
4046
4047	(void) vsw_send_msg(ldcp, (void *)&ibnd_msg,
4048	    sizeof (vnet_ibnd_desc_t), B_TRUE);
4049
4050vsw_descrsend_free_exit:
4051
4052	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
4053	return (status);
4054}
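
/*
 * Unlike vsw_dringsend() above, no public ring is exported to the
 * peer in this mode: the dring acts purely as a local buffer pool,
 * and the mem cookies describing each buffer travel to the peer
 * inside every in-band descriptor message.
 */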
4055
4056static void
4057vsw_send_ver(void *arg)
4058{
4059	vsw_ldc_t	*ldcp = (vsw_ldc_t *)arg;
4060	vsw_t		*vswp = ldcp->ldc_vswp;
4061	lane_t		*lp = &ldcp->lane_out;
4062	vio_ver_msg_t	ver_msg;
4063
4064	D1(vswp, "%s enter", __func__);
4065
4066	ver_msg.tag.vio_msgtype = VIO_TYPE_CTRL;
4067	ver_msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
4068	ver_msg.tag.vio_subtype_env = VIO_VER_INFO;
4069	ver_msg.tag.vio_sid = ldcp->local_session;
4070
4071	ver_msg.ver_major = vsw_versions[0].ver_major;
4072	ver_msg.ver_minor = vsw_versions[0].ver_minor;
4073	ver_msg.dev_class = VDEV_NETWORK_SWITCH;
4074
4075	lp->lstate |= VSW_VER_INFO_SENT;
4076	lp->ver_major = ver_msg.ver_major;
4077	lp->ver_minor = ver_msg.ver_minor;
4078
4079	DUMP_TAG(ver_msg.tag);
4080
4081	(void) vsw_send_msg(ldcp, &ver_msg, sizeof (vio_ver_msg_t), B_TRUE);
4082
4083	D1(vswp, "%s (%d): exit", __func__, ldcp->ldc_id);
4084}
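
/*
 * vsw_send_ver() above is the opening step of the VIO handshake;
 * the attribute, dring registration and RDX messages sent by the
 * routines below follow it, with the progression between phases
 * driven by the handshake state machine elsewhere in this file.
 */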
4085
4086static void
4087vsw_send_attr(vsw_ldc_t *ldcp)
4088{
4089	vsw_t			*vswp = ldcp->ldc_vswp;
4090	lane_t			*lp = &ldcp->lane_out;
4091	vnet_attr_msg_t		attr_msg;
4092
4093	D1(vswp, "%s (%ld) enter", __func__, ldcp->ldc_id);
4094
4095	/*
4096	 * Subtype is set to INFO by default
4097	 */
4098	attr_msg.tag.vio_msgtype = VIO_TYPE_CTRL;
4099	attr_msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
4100	attr_msg.tag.vio_subtype_env = VIO_ATTR_INFO;
4101	attr_msg.tag.vio_sid = ldcp->local_session;
4102
4103	/* payload copied from default settings for lane */
4104	attr_msg.mtu = lp->mtu;
4105	attr_msg.addr_type = lp->addr_type;
4106	attr_msg.xfer_mode = lp->xfer_mode;
	attr_msg.ack_freq = lp->ack_freq;
4108
4109	READ_ENTER(&vswp->if_lockrw);
4110	attr_msg.addr = vnet_macaddr_strtoul((vswp->if_addr).ether_addr_octet);
4111	RW_EXIT(&vswp->if_lockrw);
4112
4113	ldcp->lane_out.lstate |= VSW_ATTR_INFO_SENT;
4114
4115	DUMP_TAG(attr_msg.tag);
4116
4117	(void) vsw_send_msg(ldcp, &attr_msg, sizeof (vnet_attr_msg_t), B_TRUE);
4118
4119	D1(vswp, "%s (%ld) exit", __func__, ldcp->ldc_id);
4120}
4121
4122/*
4123 * Create dring info msg (which also results in the creation of
4124 * a dring).
4125 */
4126static vio_dring_reg_msg_t *
4127vsw_create_dring_info_pkt(vsw_ldc_t *ldcp)
4128{
4129	vio_dring_reg_msg_t	*mp;
4130	dring_info_t		*dp;
4131	vsw_t			*vswp = ldcp->ldc_vswp;
4132
4133	D1(vswp, "vsw_create_dring_info_pkt enter\n");
4134
4135	/*
4136	 * If we can't create a dring, obviously no point sending
4137	 * a message.
4138	 */
4139	if ((dp = vsw_create_dring(ldcp)) == NULL)
4140		return (NULL);
4141
4142	mp = kmem_zalloc(sizeof (vio_dring_reg_msg_t), KM_SLEEP);
4143
4144	mp->tag.vio_msgtype = VIO_TYPE_CTRL;
4145	mp->tag.vio_subtype = VIO_SUBTYPE_INFO;
4146	mp->tag.vio_subtype_env = VIO_DRING_REG;
4147	mp->tag.vio_sid = ldcp->local_session;
4148
4149	/* payload */
4150	mp->num_descriptors = dp->num_descriptors;
4151	mp->descriptor_size = dp->descriptor_size;
4152	mp->options = dp->options;
4153	mp->ncookies = dp->ncookies;
4154	bcopy(&dp->cookie[0], &mp->cookie[0], sizeof (ldc_mem_cookie_t));
4155
4156	mp->dring_ident = 0;
4157
4158	D1(vswp, "vsw_create_dring_info_pkt exit\n");
4159
4160	return (mp);
4161}
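
/*
 * The message returned by vsw_create_dring_info_pkt() is allocated
 * with KM_SLEEP and is owned by the caller; vsw_send_dring_info()
 * below frees it once it has been sent.
 */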
4162
4163static void
4164vsw_send_dring_info(vsw_ldc_t *ldcp)
4165{
4166	vio_dring_reg_msg_t	*dring_msg;
4167	vsw_t			*vswp = ldcp->ldc_vswp;
4168
4169	D1(vswp, "%s: (%ld) enter", __func__, ldcp->ldc_id);
4170
4171	dring_msg = vsw_create_dring_info_pkt(ldcp);
4172	if (dring_msg == NULL) {
4173		cmn_err(CE_WARN, "!vsw%d: %s: error creating msg",
4174		    vswp->instance, __func__);
4175		return;
4176	}
4177
4178	ldcp->lane_out.lstate |= VSW_DRING_INFO_SENT;
4179
4180	DUMP_TAG_PTR((vio_msg_tag_t *)dring_msg);
4181
4182	(void) vsw_send_msg(ldcp, dring_msg,
4183	    sizeof (vio_dring_reg_msg_t), B_TRUE);
4184
4185	kmem_free(dring_msg, sizeof (vio_dring_reg_msg_t));
4186
4187	D1(vswp, "%s: (%ld) exit", __func__, ldcp->ldc_id);
4188}
4189
4190static void
4191vsw_send_rdx(vsw_ldc_t *ldcp)
4192{
4193	vsw_t		*vswp = ldcp->ldc_vswp;
4194	vio_rdx_msg_t	rdx_msg;
4195
4196	D1(vswp, "%s (%ld) enter", __func__, ldcp->ldc_id);
4197
4198	rdx_msg.tag.vio_msgtype = VIO_TYPE_CTRL;
4199	rdx_msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
4200	rdx_msg.tag.vio_subtype_env = VIO_RDX;
4201	rdx_msg.tag.vio_sid = ldcp->local_session;
4202
4203	ldcp->lane_in.lstate |= VSW_RDX_INFO_SENT;
4204
4205	DUMP_TAG(rdx_msg.tag);
4206
4207	(void) vsw_send_msg(ldcp, &rdx_msg, sizeof (vio_rdx_msg_t), B_TRUE);
4208
4209	D1(vswp, "%s (%ld) exit", __func__, ldcp->ldc_id);
4210}
4211
4212/*
4213 * Generic routine to send message out over ldc channel.
4214 *
4215 * It is possible that when we attempt to write over the ldc channel
4216 * that we get notified that it has been reset. Depending on the value
4217 * of the handle_reset flag we either handle that event here or simply
4218 * notify the caller that the channel was reset.
4219 */
4220static int
4221vsw_send_msg(vsw_ldc_t *ldcp, void *msgp, int size, boolean_t handle_reset)
4222{
	int		rv;
	int		retries = vsw_wretries; /* never decrement the tunable */
	size_t		msglen = size;
4225	vio_msg_tag_t	*tag = (vio_msg_tag_t *)msgp;
4226	vsw_t		*vswp = ldcp->ldc_vswp;
4227
4228	D1(vswp, "vsw_send_msg (%lld) enter : sending %d bytes",
4229	    ldcp->ldc_id, size);
4230
4231	D2(vswp, "send_msg: type 0x%llx", tag->vio_msgtype);
4232	D2(vswp, "send_msg: stype 0x%llx", tag->vio_subtype);
4233	D2(vswp, "send_msg: senv 0x%llx", tag->vio_subtype_env);
4234
4235	mutex_enter(&ldcp->ldc_txlock);
4236	do {
4237		msglen = size;
4238		rv = ldc_write(ldcp->ldc_handle, (caddr_t)msgp, &msglen);
	} while (rv == EWOULDBLOCK && --retries > 0);
4240
4241	if ((rv != 0) || (msglen != size)) {
4242		DERR(vswp, "vsw_send_msg:ldc_write failed: chan(%lld) rv(%d) "
4243		    "size (%d) msglen(%d)\n", ldcp->ldc_id, rv, size, msglen);
4244		ldcp->ldc_stats.oerrors++;
4245	}
4246	mutex_exit(&ldcp->ldc_txlock);
4247
4248	/*
4249	 * If channel has been reset we either handle it here or
4250	 * simply report back that it has been reset and let caller
4251	 * decide what to do.
4252	 */
4253	if (rv == ECONNRESET) {
4254		DWARN(vswp, "%s (%lld) channel reset", __func__, ldcp->ldc_id);
4255
4256		/*
4257		 * N.B - must never be holding the dlistrw lock when
4258		 * we do a reset of the channel.
4259		 */
4260		if (handle_reset) {
4261			vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
4262		}
4263	}
4264
4265	return (rv);
4266}
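
/*
 * Typical invocation (illustrative, mirroring the call sites above):
 *
 *	(void) vsw_send_msg(ldcp, &msg, sizeof (msg), B_TRUE);
 *
 * A caller that needs to defer reset processing passes B_FALSE and
 * must then check the return value for ECONNRESET itself.
 */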
4267
/*
 * Remove the specified address from the list of addresses maintained
 * in this port node. The entry is only unlinked here, not freed, so
 * ownership passes to the caller. Returns the unlinked entry, or
 * NULL if no match was found.
 */
4272mcst_addr_t *
4273vsw_del_addr(uint8_t devtype, void *arg, uint64_t addr)
4274{
4275	vsw_t		*vswp = NULL;
4276	vsw_port_t	*port = NULL;
4277	mcst_addr_t	*prev_p = NULL;
4278	mcst_addr_t	*curr_p = NULL;
4279
4280	D1(NULL, "%s: enter : devtype %d : addr 0x%llx",
4281	    __func__, devtype, addr);
4282
4283	if (devtype == VSW_VNETPORT) {
4284		port = (vsw_port_t *)arg;
4285		mutex_enter(&port->mca_lock);
4286		prev_p = curr_p = port->mcap;
4287	} else {
4288		vswp = (vsw_t *)arg;
4289		mutex_enter(&vswp->mca_lock);
4290		prev_p = curr_p = vswp->mcap;
4291	}
4292
4293	while (curr_p != NULL) {
4294		if (curr_p->addr == addr) {
4295			D2(NULL, "%s: address found", __func__);
4296			/* match found */
4297			if (prev_p == curr_p) {
4298				/* list head */
4299				if (devtype == VSW_VNETPORT)
4300					port->mcap = curr_p->nextp;
4301				else
4302					vswp->mcap = curr_p->nextp;
4303			} else {
4304				prev_p->nextp = curr_p->nextp;
4305			}
4306			break;
4307		} else {
4308			prev_p = curr_p;
4309			curr_p = curr_p->nextp;
4310		}
4311	}
4312
4313	if (devtype == VSW_VNETPORT)
4314		mutex_exit(&port->mca_lock);
4315	else
4316		mutex_exit(&vswp->mca_lock);
4317
4318	D1(NULL, "%s: exit", __func__);
4319
4320	return (curr_p);
4321}
4322
/*
 * Creates a descriptor ring (dring) and links it onto the
 * end of the list of outbound drings for this channel.
 *
 * Returns NULL if creation failed.
 */
4329static dring_info_t *
4330vsw_create_dring(vsw_ldc_t *ldcp)
4331{
4332	vsw_private_desc_t	*priv_addr = NULL;
4333	vsw_t			*vswp = ldcp->ldc_vswp;
4334	ldc_mem_info_t		minfo;
4335	dring_info_t		*dp, *tp;
4336	int			i;
4337
4338	dp = (dring_info_t *)kmem_zalloc(sizeof (dring_info_t), KM_SLEEP);
4339
4340	mutex_init(&dp->dlock, NULL, MUTEX_DRIVER, NULL);
4341
4342	/* create public section of ring */
4343	if ((ldc_mem_dring_create(VSW_RING_NUM_EL,
4344	    VSW_PUB_SIZE, &dp->handle)) != 0) {
4345
4346		DERR(vswp, "vsw_create_dring(%lld): ldc dring create "
4347		    "failed", ldcp->ldc_id);
4348		goto create_fail_exit;
4349	}
4350
4351	ASSERT(dp->handle != NULL);
4352
4353	/*
4354	 * Get the base address of the public section of the ring.
4355	 */
4356	if ((ldc_mem_dring_info(dp->handle, &minfo)) != 0) {
4357		DERR(vswp, "vsw_create_dring(%lld): dring info failed\n",
4358		    ldcp->ldc_id);
4359		goto dring_fail_exit;
4360	} else {
4361		ASSERT(minfo.vaddr != 0);
4362		dp->pub_addr = minfo.vaddr;
4363	}
4364
4365	dp->num_descriptors = VSW_RING_NUM_EL;
4366	dp->descriptor_size = VSW_PUB_SIZE;
4367	dp->options = VIO_TX_DRING;
4368	dp->ncookies = 1;	/* guaranteed by ldc */
4369
4370	/*
4371	 * create private portion of ring
4372	 */
4373	dp->priv_addr = (vsw_private_desc_t *)kmem_zalloc(
4374	    (sizeof (vsw_private_desc_t) * VSW_RING_NUM_EL), KM_SLEEP);
4375
4376	if (vsw_setup_ring(ldcp, dp)) {
4377		DERR(vswp, "%s: unable to setup ring", __func__);
4378		goto dring_fail_exit;
4379	}
4380
4381	/* haven't used any descriptors yet */
4382	dp->end_idx = 0;
4383	dp->last_ack_recv = -1;
4384
4385	/* bind dring to the channel */
4386	if ((ldc_mem_dring_bind(ldcp->ldc_handle, dp->handle,
4387	    LDC_SHADOW_MAP, LDC_MEM_RW,
4388	    &dp->cookie[0], &dp->ncookies)) != 0) {
4389		DERR(vswp, "vsw_create_dring: unable to bind to channel "
4390		    "%lld", ldcp->ldc_id);
4391		goto dring_fail_exit;
4392	}
4393
4394	mutex_init(&dp->restart_lock, NULL, MUTEX_DRIVER, NULL);
4395	dp->restart_reqd = B_TRUE;
4396
4397	/*
4398	 * Only ever create rings for outgoing lane. Link it onto
4399	 * end of list.
4400	 */
4401	WRITE_ENTER(&ldcp->lane_out.dlistrw);
4402	if (ldcp->lane_out.dringp == NULL) {
4403		D2(vswp, "vsw_create_dring: adding first outbound ring");
4404		ldcp->lane_out.dringp = dp;
4405	} else {
4406		tp = ldcp->lane_out.dringp;
4407		while (tp->next != NULL)
4408			tp = tp->next;
4409
4410		tp->next = dp;
4411	}
4412	RW_EXIT(&ldcp->lane_out.dlistrw);
4413
4414	return (dp);
4415
4416dring_fail_exit:
4417	(void) ldc_mem_dring_destroy(dp->handle);
4418
4419create_fail_exit:
4420	if (dp->priv_addr != NULL) {
4421		priv_addr = dp->priv_addr;
4422		for (i = 0; i < VSW_RING_NUM_EL; i++) {
4423			if (priv_addr->memhandle != NULL)
4424				(void) ldc_mem_free_handle(
4425				    priv_addr->memhandle);
4426			priv_addr++;
4427		}
4428		kmem_free(dp->priv_addr,
4429		    (sizeof (vsw_private_desc_t) * VSW_RING_NUM_EL));
4430	}
4431	mutex_destroy(&dp->dlock);
4432
4433	kmem_free(dp, sizeof (dring_info_t));
4434	return (NULL);
4435}
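
/*
 * To summarize the layout built above: each dring comprises a public
 * section of VSW_RING_NUM_EL descriptors (bound to the channel with
 * ldc_mem_dring_bind() so the peer can map it), a parallel private
 * descriptor array used only by this driver, and a single contiguous
 * data area (allocated in vsw_setup_ring()) carved into one
 * VSW_RING_EL_DATA_SZ byte buffer per descriptor.
 */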
4436
/*
 * Create a ring consisting of just a private portion and link
 * it into the list of rings for the outbound lane.
 *
 * Rings of this type are used primarily for temporary data
 * storage (i.e. as data buffers).
 */
4444void
4445vsw_create_privring(vsw_ldc_t *ldcp)
4446{
4447	dring_info_t		*dp, *tp;
4448	vsw_t			*vswp = ldcp->ldc_vswp;
4449
4450	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
4451
4452	dp = kmem_zalloc(sizeof (dring_info_t), KM_SLEEP);
4453
4454	mutex_init(&dp->dlock, NULL, MUTEX_DRIVER, NULL);
4455
4456	/* no public section */
4457	dp->pub_addr = NULL;
4458
4459	dp->priv_addr = kmem_zalloc(
4460	    (sizeof (vsw_private_desc_t) * VSW_RING_NUM_EL), KM_SLEEP);
4461
4462	dp->num_descriptors = VSW_RING_NUM_EL;
4463
4464	if (vsw_setup_ring(ldcp, dp)) {
4465		DERR(vswp, "%s: setup of ring failed", __func__);
4466		kmem_free(dp->priv_addr,
4467		    (sizeof (vsw_private_desc_t) * VSW_RING_NUM_EL));
4468		mutex_destroy(&dp->dlock);
4469		kmem_free(dp, sizeof (dring_info_t));
4470		return;
4471	}
4472
4473	/* haven't used any descriptors yet */
4474	dp->end_idx = 0;
4475
4476	mutex_init(&dp->restart_lock, NULL, MUTEX_DRIVER, NULL);
4477	dp->restart_reqd = B_TRUE;
4478
4479	/*
4480	 * Only ever create rings for outgoing lane. Link it onto
4481	 * end of list.
4482	 */
4483	WRITE_ENTER(&ldcp->lane_out.dlistrw);
4484	if (ldcp->lane_out.dringp == NULL) {
4485		D2(vswp, "%s: adding first outbound privring", __func__);
4486		ldcp->lane_out.dringp = dp;
4487	} else {
4488		tp = ldcp->lane_out.dringp;
4489		while (tp->next != NULL)
4490			tp = tp->next;
4491
4492		tp->next = dp;
4493	}
4494	RW_EXIT(&ldcp->lane_out.dlistrw);
4495
4496	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
4497}
4498
4499/*
4500 * Setup the descriptors in the dring. Returns 0 on success, 1 on
4501 * failure.
4502 */
4503int
4504vsw_setup_ring(vsw_ldc_t *ldcp, dring_info_t *dp)
4505{
4506	vnet_public_desc_t	*pub_addr = NULL;
4507	vsw_private_desc_t	*priv_addr = NULL;
4508	vsw_t			*vswp = ldcp->ldc_vswp;
4509	uint64_t		*tmpp;
4510	uint64_t		offset = 0;
4511	uint32_t		ncookies = 0;
4512	static char		*name = "vsw_setup_ring";
4513	int			i, j, nc, rv;
4514
4515	priv_addr = dp->priv_addr;
4516	pub_addr = dp->pub_addr;
4517
4518	/* public section may be null but private should never be */
4519	ASSERT(priv_addr != NULL);
4520
4521	/*
4522	 * Allocate the region of memory which will be used to hold
4523	 * the data the descriptors will refer to.
4524	 */
4525	dp->data_sz = (VSW_RING_NUM_EL * VSW_RING_EL_DATA_SZ);
4526	dp->data_addr = kmem_alloc(dp->data_sz, KM_SLEEP);
4527
4528	D2(vswp, "%s: allocated %lld bytes at 0x%llx\n", name,
4529	    dp->data_sz, dp->data_addr);
4530
4531	tmpp = (uint64_t *)dp->data_addr;
	offset = VSW_RING_EL_DATA_SZ / sizeof (uint64_t);
4533
4534	/*
4535	 * Initialise some of the private and public (if they exist)
4536	 * descriptor fields.
4537	 */
4538	for (i = 0; i < VSW_RING_NUM_EL; i++) {
4539		mutex_init(&priv_addr->dstate_lock, NULL, MUTEX_DRIVER, NULL);
4540
4541		if ((ldc_mem_alloc_handle(ldcp->ldc_handle,
4542		    &priv_addr->memhandle)) != 0) {
4543			DERR(vswp, "%s: alloc mem handle failed", name);
4544			goto setup_ring_cleanup;
4545		}
4546
4547		priv_addr->datap = (void *)tmpp;
4548
4549		rv = ldc_mem_bind_handle(priv_addr->memhandle,
4550		    (caddr_t)priv_addr->datap, VSW_RING_EL_DATA_SZ,
4551		    LDC_SHADOW_MAP, LDC_MEM_R|LDC_MEM_W,
4552		    &(priv_addr->memcookie[0]), &ncookies);
4553		if (rv != 0) {
4554			DERR(vswp, "%s(%lld): ldc_mem_bind_handle failed "
4555			    "(rv %d)", name, ldcp->ldc_id, rv);
4556			goto setup_ring_cleanup;
4557		}
4558		priv_addr->bound = 1;
4559
4560		D2(vswp, "%s: %d: memcookie 0 : addr 0x%llx : size 0x%llx",
4561		    name, i, priv_addr->memcookie[0].addr,
4562		    priv_addr->memcookie[0].size);
4563
		if (ncookies > (uint32_t)VSW_MAX_COOKIES) {
4565			DERR(vswp, "%s(%lld) ldc_mem_bind_handle returned "
4566			    "invalid num of cookies (%d) for size 0x%llx",
4567			    name, ldcp->ldc_id, ncookies, VSW_RING_EL_DATA_SZ);
4568
4569			goto setup_ring_cleanup;
4570		} else {
4571			for (j = 1; j < ncookies; j++) {
4572				rv = ldc_mem_nextcookie(priv_addr->memhandle,
4573				    &(priv_addr->memcookie[j]));
4574				if (rv != 0) {
4575					DERR(vswp, "%s: ldc_mem_nextcookie "
4576					    "failed rv (%d)", name, rv);
4577					goto setup_ring_cleanup;
4578				}
4579				D3(vswp, "%s: memcookie %d : addr 0x%llx : "
4580				    "size 0x%llx", name, j,
4581				    priv_addr->memcookie[j].addr,
4582				    priv_addr->memcookie[j].size);
4583			}
4584
4585		}
4586		priv_addr->ncookies = ncookies;
4587		priv_addr->dstate = VIO_DESC_FREE;
4588
4589		if (pub_addr != NULL) {
4590
4591			/* link pub and private sides */
4592			priv_addr->descp = pub_addr;
4593
4594			pub_addr->ncookies = priv_addr->ncookies;
4595
4596			for (nc = 0; nc < pub_addr->ncookies; nc++) {
4597				bcopy(&priv_addr->memcookie[nc],
4598				    &pub_addr->memcookie[nc],
4599				    sizeof (ldc_mem_cookie_t));
4600			}
4601
4602			pub_addr->hdr.dstate = VIO_DESC_FREE;
4603			pub_addr++;
4604		}
4605
4606		/*
4607		 * move to next element in the dring and the next
4608		 * position in the data buffer.
4609		 */
4610		priv_addr++;
4611		tmpp += offset;
4612	}
4613
4614	return (0);
4615
4616setup_ring_cleanup:
4617	priv_addr = dp->priv_addr;
4618
4619	for (j = 0; j < i; j++) {
4620		(void) ldc_mem_unbind_handle(priv_addr->memhandle);
4621		(void) ldc_mem_free_handle(priv_addr->memhandle);
4622
4623		mutex_destroy(&priv_addr->dstate_lock);
4624
4625		priv_addr++;
4626	}
4627	kmem_free(dp->data_addr, dp->data_sz);
4628
4629	return (1);
4630}
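
/*
 * Each private descriptor set up above ends up with at most
 * VSW_MAX_COOKIES mem cookies describing its data buffer; when a
 * public section exists, those cookies are mirrored into the
 * exported descriptor so that the peer can map the buffer directly.
 */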
4631
4632/*
4633 * Searches the private section of a ring for a free descriptor,
4634 * starting at the location of the last free descriptor found
4635 * previously.
4636 *
 * Returns 0 if a free descriptor is available, and updates the state
 * of the private descriptor to VIO_DESC_READY; otherwise returns 1.
4639 *
4640 * FUTURE: might need to return contiguous range of descriptors
4641 * as dring info msg assumes all will be contiguous.
4642 */
4643static int
4644vsw_dring_find_free_desc(dring_info_t *dringp,
4645		vsw_private_desc_t **priv_p, int *idx)
4646{
4647	vsw_private_desc_t	*addr = NULL;
4648	int			num = VSW_RING_NUM_EL;
4649	int			ret = 1;
4650
4651	D1(NULL, "%s enter\n", __func__);
4652
4653	ASSERT(dringp->priv_addr != NULL);
4654
4655	D2(NULL, "%s: searching ring, dringp 0x%llx : start pos %lld",
4656	    __func__, dringp, dringp->end_idx);
4657
4658	addr = (vsw_private_desc_t *)dringp->priv_addr + dringp->end_idx;
4659
4660	mutex_enter(&addr->dstate_lock);
4661	if (addr->dstate == VIO_DESC_FREE) {
4662		addr->dstate = VIO_DESC_READY;
4663		*priv_p = addr;
4664		*idx = dringp->end_idx;
4665		dringp->end_idx = (dringp->end_idx + 1) % num;
4666		ret = 0;
4667
4668	}
4669	mutex_exit(&addr->dstate_lock);
4670
4671	/* ring full */
4672	if (ret == 1) {
4673		D2(NULL, "%s: no desp free: started at %d", __func__,
4674		    dringp->end_idx);
4675	}
4676
4677	D1(NULL, "%s: exit\n", __func__);
4678
4679	return (ret);
4680}
4681
4682/*
4683 * Map from a dring identifier to the ring itself. Returns
4684 * pointer to ring or NULL if no match found.
4685 *
4686 * Should be called with dlistrw rwlock held as reader.
4687 */
4688static dring_info_t *
4689vsw_ident2dring(lane_t *lane, uint64_t ident)
4690{
4691	dring_info_t	*dp = NULL;
4692
	for (dp = lane->dringp; dp != NULL; dp = dp->next) {
		if (dp->ident == ident)
			break;
	}

	return (dp);
4707}
4708
4709/*
4710 * Set the default lane attributes. These are copied into
4711 * the attr msg we send to our peer. If they are not acceptable
4712 * then (currently) the handshake ends.
4713 */
4714static void
4715vsw_set_lane_attr(vsw_t *vswp, lane_t *lp)
4716{
4717	bzero(lp, sizeof (lane_t));
4718
4719	READ_ENTER(&vswp->if_lockrw);
4720	ether_copy(&(vswp->if_addr), &(lp->addr));
4721	RW_EXIT(&vswp->if_lockrw);
4722
4723	lp->mtu = VSW_MTU;
4724	lp->addr_type = ADDR_TYPE_MAC;
4725	lp->xfer_mode = VIO_DRING_MODE;
4726	lp->ack_freq = 0;	/* for shared mode */
4727
4728	/*
4729	 * As the seq_num is incremented before sending,
4730	 * initialize it with VNET_ISS - 1.
4731	 */
4732	atomic_swap_64(&lp->seq_num, (VNET_ISS - 1));
4733}
4734
4735/*
4736 * Verify that the attributes are acceptable.
4737 *
 * FUTURE: If some attributes are not acceptable, change them
 * to our desired values.
4740 */
4741static int
4742vsw_check_attr(vnet_attr_msg_t *pkt, vsw_port_t *port)
4743{
4744	int			ret = 0;
4745	struct ether_addr	ea;
4746
4747	D1(NULL, "vsw_check_attr enter\n");
4748
4749	/*
4750	 * Note we currently only support in-band descriptors
4751	 * and descriptor rings, not packet based transfer (VIO_PKT_MODE)
4752	 */
4753	if ((pkt->xfer_mode != VIO_DESC_MODE) &&
4754	    (pkt->xfer_mode != VIO_DRING_MODE)) {
4755		D2(NULL, "vsw_check_attr: unknown mode %x\n", pkt->xfer_mode);
4756		ret = 1;
4757	}
4758
4759	/* Only support MAC addresses at moment. */
4760	if ((pkt->addr_type != ADDR_TYPE_MAC) || (pkt->addr == 0)) {
4761		D2(NULL, "vsw_check_attr: invalid addr_type %x, "
4762		    "or address 0x%llx\n", pkt->addr_type, pkt->addr);
4763		ret = 1;
4764	}
4765
4766	/*
4767	 * MAC address supplied by device should match that stored
4768	 * in the vsw-port OBP node. Need to decide what to do if they
4769	 * don't match, for the moment just warn but don't fail.
4770	 */
4771	vnet_macaddr_ultostr(pkt->addr, ea.ether_addr_octet);
4772	if (ether_cmp(&ea, &port->p_macaddr) != 0) {
4773		DERR(NULL, "vsw_check_attr: device supplied address "
4774		    "0x%llx doesn't match node address 0x%llx\n",
4775		    pkt->addr, port->p_macaddr);
4776	}
4777
4778	/*
4779	 * Ack freq only makes sense in pkt mode, in shared
4780	 * mode the ring descriptors say whether or not to
4781	 * send back an ACK.
4782	 */
4783	if ((pkt->xfer_mode == VIO_DRING_MODE) &&
4784	    (pkt->ack_freq > 0)) {
		D2(NULL, "vsw_check_attr: non-zero ack freq "
		    "in SHM mode\n");
4787		ret = 1;
4788	}
4789
4790	/*
4791	 * Note: for the moment we only support ETHER
4792	 * frames. This may change in the future.
4793	 */
4794	if ((pkt->mtu > VSW_MTU) || (pkt->mtu <= 0)) {
4795		D2(NULL, "vsw_check_attr: invalid MTU (0x%llx)\n",
4796		    pkt->mtu);
4797		ret = 1;
4798	}
4799
4800	D1(NULL, "vsw_check_attr exit\n");
4801
4802	return (ret);
4803}
4804
4805/*
4806 * Returns 1 if there is a problem, 0 otherwise.
4807 */
4808static int
4809vsw_check_dring_info(vio_dring_reg_msg_t *pkt)
4810{
4813	int	ret = 0;
4814
4815	D1(NULL, "vsw_check_dring_info enter\n");
4816
4817	if ((pkt->num_descriptors == 0) ||
4818	    (pkt->descriptor_size == 0) ||
4819	    (pkt->ncookies != 1)) {
4820		DERR(NULL, "vsw_check_dring_info: invalid dring msg");
4821		ret = 1;
4822	}
4823
4824	D1(NULL, "vsw_check_dring_info exit\n");
4825
4826	return (ret);
4827}
4828
4829/*
4830 * Returns 1 if two memory cookies match. Otherwise returns 0.
4831 */
4832static int
4833vsw_mem_cookie_match(ldc_mem_cookie_t *m1, ldc_mem_cookie_t *m2)
4834{
4835	if ((m1->addr != m2->addr) ||
	    (m1->size != m2->size)) {
4837		return (0);
4838	} else {
4839		return (1);
4840	}
4841}
4842
4843/*
4844 * Returns 1 if ring described in reg message matches that
4845 * described by dring_info structure. Otherwise returns 0.
4846 */
4847static int
4848vsw_dring_match(dring_info_t *dp, vio_dring_reg_msg_t *msg)
4849{
4850	if ((msg->descriptor_size != dp->descriptor_size) ||
4851	    (msg->num_descriptors != dp->num_descriptors) ||
4852	    (msg->ncookies != dp->ncookies) ||
4853	    !(vsw_mem_cookie_match(&msg->cookie[0], &dp->cookie[0]))) {
4854		return (0);
4855	} else {
4856		return (1);
4857	}
4859}
4860
4861static caddr_t
4862vsw_print_ethaddr(uint8_t *a, char *ebuf)
4863{
4864	(void) sprintf(ebuf, "%x:%x:%x:%x:%x:%x",
4865	    a[0], a[1], a[2], a[3], a[4], a[5]);
4866	return (ebuf);
4867}
4868
/*
 * Free all the resources associated with the INBOUND or OUTBOUND
 * lane of the channel, as selected by 'dir'.
 */
4873static void
4874vsw_free_lane_resources(vsw_ldc_t *ldcp, uint64_t dir)
4875{
4876	dring_info_t		*dp, *dpp;
4877	lane_t			*lp = NULL;
4878	int			rv = 0;
4879
4880	ASSERT(ldcp != NULL);
4881
4882	D1(ldcp->ldc_vswp, "%s (%lld): enter", __func__, ldcp->ldc_id);
4883
4884	if (dir == INBOUND) {
4885		D2(ldcp->ldc_vswp, "%s: freeing INBOUND lane"
4886		    " of channel %lld", __func__, ldcp->ldc_id);
4887		lp = &ldcp->lane_in;
4888	} else {
4889		D2(ldcp->ldc_vswp, "%s: freeing OUTBOUND lane"
4890		    " of channel %lld", __func__, ldcp->ldc_id);
4891		lp = &ldcp->lane_out;
4892	}
4893
4894	lp->lstate = VSW_LANE_INACTIV;
4895
4896	/*
4897	 * As the seq_num is incremented before sending,
4898	 * initialize it with VNET_ISS - 1.
4899	 */
4900	atomic_swap_64(&lp->seq_num, (VNET_ISS - 1));
4901
4902	if (lp->dringp) {
4903		if (dir == INBOUND) {
4904			WRITE_ENTER(&lp->dlistrw);
4905			dp = lp->dringp;
4906			while (dp != NULL) {
4907				dpp = dp->next;
4908				if (dp->handle != NULL)
4909					(void) ldc_mem_dring_unmap(dp->handle);
4910				kmem_free(dp, sizeof (dring_info_t));
4911				dp = dpp;
4912			}
4913			RW_EXIT(&lp->dlistrw);
4914		} else {
4915			/*
4916			 * unbind, destroy exported dring, free dring struct
4917			 */
4918			WRITE_ENTER(&lp->dlistrw);
4919			dp = lp->dringp;
4920			rv = vsw_free_ring(dp);
4921			RW_EXIT(&lp->dlistrw);
4922		}
4923		if (rv == 0) {
4924			lp->dringp = NULL;
4925		}
4926	}
4927
4928	D1(ldcp->ldc_vswp, "%s (%lld): exit", __func__, ldcp->ldc_id);
4929}
4930
4931/*
4932 * Free ring and all associated resources.
4933 *
4934 * Should be called with dlistrw rwlock held as writer.
4935 */
4936static int
4937vsw_free_ring(dring_info_t *dp)
4938{
4939	vsw_private_desc_t	*paddr = NULL;
4940	dring_info_t		*dpp;
4941	int			i, rv = 1;
4942
4943	while (dp != NULL) {
4944		mutex_enter(&dp->dlock);
4945		dpp = dp->next;
4946		if (dp->priv_addr != NULL) {
4947			/*
4948			 * First unbind and free the memory handles
4949			 * stored in each descriptor within the ring.
4950			 */
4951			for (i = 0; i < VSW_RING_NUM_EL; i++) {
4952				paddr = (vsw_private_desc_t *)
4953				    dp->priv_addr + i;
4954				if (paddr->memhandle != NULL) {
4955					if (paddr->bound == 1) {
4956						rv = ldc_mem_unbind_handle(
4957						    paddr->memhandle);
4958
4959						if (rv != 0) {
4960							DERR(NULL, "error "
4961							"unbinding handle for "
4962							"ring 0x%llx at pos %d",
4963							    dp, i);
4964							mutex_exit(&dp->dlock);
4965							return (rv);
4966						}
4967						paddr->bound = 0;
4968					}
4969
4970					rv = ldc_mem_free_handle(
4971					    paddr->memhandle);
4972					if (rv != 0) {
4973						DERR(NULL, "error freeing "
4974						    "handle for ring 0x%llx "
4975						    "at pos %d", dp, i);
4976						mutex_exit(&dp->dlock);
4977						return (rv);
4978					}
4979					paddr->memhandle = NULL;
4980				}
4981				mutex_destroy(&paddr->dstate_lock);
4982			}
4983			kmem_free(dp->priv_addr,
4984			    (sizeof (vsw_private_desc_t) * VSW_RING_NUM_EL));
4985		}
4986
4987		/*
4988		 * Now unbind and destroy the ring itself.
4989		 */
4990		if (dp->handle != NULL) {
4991			(void) ldc_mem_dring_unbind(dp->handle);
4992			(void) ldc_mem_dring_destroy(dp->handle);
4993		}
4994
4995		if (dp->data_addr != NULL) {
4996			kmem_free(dp->data_addr, dp->data_sz);
4997		}
4998
4999		mutex_exit(&dp->dlock);
5000		mutex_destroy(&dp->dlock);
5001		mutex_destroy(&dp->restart_lock);
5002		kmem_free(dp, sizeof (dring_info_t));
5003
5004		dp = dpp;
5005	}
5006	return (0);
5007}
5008
5009/*
5010 * vsw_ldc_rx_worker -- A per LDC worker thread to receive data.
5011 * This thread is woken up by the LDC interrupt handler to process
5012 * LDC packets and receive data.
5013 */
5014static void
5015vsw_ldc_rx_worker(void *arg)
5016{
5017	callb_cpr_t	cprinfo;
5018	vsw_ldc_t *ldcp = (vsw_ldc_t *)arg;
5019	vsw_t *vswp = ldcp->ldc_vswp;
5020
5021	D1(vswp, "%s(%lld):enter\n", __func__, ldcp->ldc_id);
5022	CALLB_CPR_INIT(&cprinfo, &ldcp->rx_thr_lock, callb_generic_cpr,
5023	    "vsw_rx_thread");
5024	mutex_enter(&ldcp->rx_thr_lock);
5025	ldcp->rx_thr_flags |= VSW_WTHR_RUNNING;
5026	while (!(ldcp->rx_thr_flags & VSW_WTHR_STOP)) {
5027
5028		CALLB_CPR_SAFE_BEGIN(&cprinfo);
5029		/*
5030		 * Wait until the data is received or a stop
5031		 * request is received.
5032		 */
5033		while (!(ldcp->rx_thr_flags &
5034		    (VSW_WTHR_DATARCVD | VSW_WTHR_STOP))) {
5035			cv_wait(&ldcp->rx_thr_cv, &ldcp->rx_thr_lock);
5036		}
5037		CALLB_CPR_SAFE_END(&cprinfo, &ldcp->rx_thr_lock)
5038
5039		/*
5040		 * First process the stop request.
5041		 */
5042		if (ldcp->rx_thr_flags & VSW_WTHR_STOP) {
5043			D2(vswp, "%s(%lld):Rx thread stopped\n",
5044			    __func__, ldcp->ldc_id);
5045			break;
5046		}
5047		ldcp->rx_thr_flags &= ~VSW_WTHR_DATARCVD;
5048		mutex_exit(&ldcp->rx_thr_lock);
5049		D1(vswp, "%s(%lld):calling vsw_process_pkt\n",
5050		    __func__, ldcp->ldc_id);
5051		mutex_enter(&ldcp->ldc_cblock);
5052		vsw_process_pkt(ldcp);
5053		mutex_exit(&ldcp->ldc_cblock);
5054		mutex_enter(&ldcp->rx_thr_lock);
5055	}
5056
5057	/*
5058	 * Update the run status and wakeup the thread that
5059	 * has sent the stop request.
5060	 */
5061	ldcp->rx_thr_flags &= ~VSW_WTHR_RUNNING;
5062	cv_signal(&ldcp->rx_thr_cv);
5063	CALLB_CPR_EXIT(&cprinfo);
5064	D1(vswp, "%s(%lld):exit\n", __func__, ldcp->ldc_id);
5065	thread_exit();
5066}
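
/*
 * Sketch of the expected producer side of the above loop
 * (illustrative only - the actual wakeup lives in the LDC
 * interrupt path):
 *
 *	mutex_enter(&ldcp->rx_thr_lock);
 *	ldcp->rx_thr_flags |= VSW_WTHR_DATARCVD;
 *	cv_signal(&ldcp->rx_thr_cv);
 *	mutex_exit(&ldcp->rx_thr_lock);
 */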
5067
5068/* vsw_stop_rx_thread -- Co-ordinate with receive thread to stop it */
5069static void
5070vsw_stop_rx_thread(vsw_ldc_t *ldcp)
5071{
5072	vsw_t *vswp = ldcp->ldc_vswp;
5073
5074	D1(vswp, "%s(%lld):enter\n", __func__, ldcp->ldc_id);
5075	/*
5076	 * Send a stop request by setting the stop flag and
5077	 * wait until the receive thread stops.
5078	 */
5079	mutex_enter(&ldcp->rx_thr_lock);
5080	if (ldcp->rx_thr_flags & VSW_WTHR_RUNNING) {
5081		ldcp->rx_thr_flags |= VSW_WTHR_STOP;
5082		cv_signal(&ldcp->rx_thr_cv);
5083		while (ldcp->rx_thr_flags & VSW_WTHR_RUNNING) {
5084			cv_wait(&ldcp->rx_thr_cv, &ldcp->rx_thr_lock);
5085		}
5086	}
5087	mutex_exit(&ldcp->rx_thr_lock);
5088	ldcp->rx_thread = NULL;
5089	D1(vswp, "%s(%lld):exit\n", __func__, ldcp->ldc_id);
5090}
5091
/*
 * vsw_ldc_tx_worker -- A per-LDC worker thread to transmit data.
 * This thread is woken up by vsw_portsend() when there are packets
 * to transmit.
 */
5097static void
5098vsw_ldc_tx_worker(void *arg)
5099{
5100	callb_cpr_t	cprinfo;
5101	vsw_ldc_t *ldcp = (vsw_ldc_t *)arg;
5102	vsw_t *vswp = ldcp->ldc_vswp;
5103	mblk_t *mp;
5104	mblk_t *tmp;
5105
5106	D1(vswp, "%s(%lld):enter\n", __func__, ldcp->ldc_id);
5107	CALLB_CPR_INIT(&cprinfo, &ldcp->tx_thr_lock, callb_generic_cpr,
	    "vsw_tx_thread");
5109	mutex_enter(&ldcp->tx_thr_lock);
5110	ldcp->tx_thr_flags |= VSW_WTHR_RUNNING;
5111	while (!(ldcp->tx_thr_flags & VSW_WTHR_STOP)) {
5112
5113		CALLB_CPR_SAFE_BEGIN(&cprinfo);
		/*
		 * Wait until there is data to transmit or a stop
		 * request is received.
		 */
5118		while (!(ldcp->tx_thr_flags & VSW_WTHR_STOP) &&
5119		    (ldcp->tx_mhead == NULL)) {
5120			cv_wait(&ldcp->tx_thr_cv, &ldcp->tx_thr_lock);
5121		}
5122		CALLB_CPR_SAFE_END(&cprinfo, &ldcp->tx_thr_lock)
5123
5124		/*
5125		 * First process the stop request.
5126		 */
5127		if (ldcp->tx_thr_flags & VSW_WTHR_STOP) {
5128			D2(vswp, "%s(%lld):tx thread stopped\n",
5129			    __func__, ldcp->ldc_id);
5130			break;
5131		}
5132		mp = ldcp->tx_mhead;
5133		ldcp->tx_mhead = ldcp->tx_mtail = NULL;
5134		mutex_exit(&ldcp->tx_thr_lock);
5135		D2(vswp, "%s(%lld):calling vsw_ldcsend\n",
5136		    __func__, ldcp->ldc_id);
5137		while (mp != NULL) {
5138			tmp = mp->b_next;
5139			mp->b_next = mp->b_prev = NULL;
5140			(void) vsw_ldcsend(ldcp, mp, vsw_ldc_tx_retries);
5141			mp = tmp;
5142		}
5143		mutex_enter(&ldcp->tx_thr_lock);
5144	}
5145
5146	/*
5147	 * Update the run status and wakeup the thread that
5148	 * has sent the stop request.
5149	 */
5150	ldcp->tx_thr_flags &= ~VSW_WTHR_RUNNING;
5151	cv_signal(&ldcp->tx_thr_cv);
5152	CALLB_CPR_EXIT(&cprinfo);
5153	D1(vswp, "%s(%lld):exit\n", __func__, ldcp->ldc_id);
5154	thread_exit();
5155}
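
/*
 * The producer side of this queue is vsw_portsend(), which appends
 * packets to tx_mhead/tx_mtail under tx_thr_lock and signals
 * tx_thr_cv; the loop above simply drains that queue.
 */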
5156
/* vsw_stop_tx_thread -- Co-ordinate with the transmit thread to stop it */
5158static void
5159vsw_stop_tx_thread(vsw_ldc_t *ldcp)
5160{
5161	vsw_t *vswp = ldcp->ldc_vswp;
5162
5163	D1(vswp, "%s(%lld):enter\n", __func__, ldcp->ldc_id);
5164	/*
5165	 * Send a stop request by setting the stop flag and
	 * wait until the transmit thread stops.
5167	 */
5168	mutex_enter(&ldcp->tx_thr_lock);
5169	if (ldcp->tx_thr_flags & VSW_WTHR_RUNNING) {
5170		ldcp->tx_thr_flags |= VSW_WTHR_STOP;
5171		cv_signal(&ldcp->tx_thr_cv);
5172		while (ldcp->tx_thr_flags & VSW_WTHR_RUNNING) {
5173			cv_wait(&ldcp->tx_thr_cv, &ldcp->tx_thr_lock);
5174		}
5175	}
5176	mutex_exit(&ldcp->tx_thr_lock);
5177	ldcp->tx_thread = NULL;
5178	D1(vswp, "%s(%lld):exit\n", __func__, ldcp->ldc_id);
5179}
5180
5181/* vsw_reclaim_dring -- reclaim descriptors */
5182static int
5183vsw_reclaim_dring(dring_info_t *dp, int start)
5184{
5185	int i, j, len;
5186	vsw_private_desc_t *priv_addr;
5187	vnet_public_desc_t *pub_addr;
5188
5189	pub_addr = (vnet_public_desc_t *)dp->pub_addr;
5190	priv_addr = (vsw_private_desc_t *)dp->priv_addr;
5191	len = dp->num_descriptors;
5192
	D2(NULL, "%s: start index %d\n", __func__, start);
5194
5195	j = 0;
5196	for (i = start; j < len; i = (i + 1) % len, j++) {
5197		pub_addr = (vnet_public_desc_t *)dp->pub_addr + i;
5198		priv_addr = (vsw_private_desc_t *)dp->priv_addr + i;
5199
5200		mutex_enter(&priv_addr->dstate_lock);
5201		if (pub_addr->hdr.dstate != VIO_DESC_DONE) {
5202			mutex_exit(&priv_addr->dstate_lock);
5203			break;
5204		}
5205		pub_addr->hdr.dstate = VIO_DESC_FREE;
5206		priv_addr->dstate = VIO_DESC_FREE;
5207		/* clear all the fields */
5208		priv_addr->datalen = 0;
5209		pub_addr->hdr.ack = 0;
5210		mutex_exit(&priv_addr->dstate_lock);
5211
5212		D3(NULL, "claiming descp:%d pub state:0x%llx priv state 0x%llx",
5213		    i, pub_addr->hdr.dstate, priv_addr->dstate);
5214	}
5215	return (j);
5216}
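
/*
 * The count returned above feeds the retry logic in vsw_ldcsend():
 * a return value of zero (nothing reclaimed) is what triggers the
 * vsw_ldc_tx_delay pause before the next transmit attempt.
 */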
5217
5218/*
5219 * Debugging routines
5220 */
5221static void
5222display_state(void)
5223{
5224	vsw_t		*vswp;
5225	vsw_port_list_t	*plist;
5226	vsw_port_t 	*port;
5227	vsw_ldc_list_t	*ldcl;
5228	vsw_ldc_t 	*ldcp;
5229	extern vsw_t 	*vsw_head;
5230
5231	cmn_err(CE_NOTE, "***** system state *****");
5232
5233	for (vswp = vsw_head; vswp; vswp = vswp->next) {
5234		plist = &vswp->plist;
5235		READ_ENTER(&plist->lockrw);
5236		cmn_err(CE_CONT, "vsw instance %d has %d ports attached\n",
5237		    vswp->instance, plist->num_ports);
5238
5239		for (port = plist->head; port != NULL; port = port->p_next) {
5240			ldcl = &port->p_ldclist;
5241			cmn_err(CE_CONT, "port %d : %d ldcs attached\n",
5242			    port->p_instance, ldcl->num_ldcs);
5243			READ_ENTER(&ldcl->lockrw);
5244			ldcp = ldcl->head;
5245			for (; ldcp != NULL; ldcp = ldcp->ldc_next) {
5246				cmn_err(CE_CONT, "chan %lu : dev %d : "
5247				    "status %d : phase %u\n",
5248				    ldcp->ldc_id, ldcp->dev_class,
5249				    ldcp->ldc_status, ldcp->hphase);
5250				cmn_err(CE_CONT, "chan %lu : lsession %lu : "
5251				    "psession %lu\n", ldcp->ldc_id,
5252				    ldcp->local_session, ldcp->peer_session);
5253
5254				cmn_err(CE_CONT, "Inbound lane:\n");
5255				display_lane(&ldcp->lane_in);
5256				cmn_err(CE_CONT, "Outbound lane:\n");
5257				display_lane(&ldcp->lane_out);
5258			}
5259			RW_EXIT(&ldcl->lockrw);
5260		}
5261		RW_EXIT(&plist->lockrw);
5262	}
5263	cmn_err(CE_NOTE, "***** system state *****");
5264}
5265
5266static void
5267display_lane(lane_t *lp)
5268{
5269	dring_info_t	*drp;
5270
5271	cmn_err(CE_CONT, "ver 0x%x:0x%x : state %lx : mtu 0x%lx\n",
5272	    lp->ver_major, lp->ver_minor, lp->lstate, lp->mtu);
5273	cmn_err(CE_CONT, "addr_type %d : addr 0x%lx : xmode %d\n",
5274	    lp->addr_type, lp->addr, lp->xfer_mode);
5275	cmn_err(CE_CONT, "dringp 0x%lx\n", (uint64_t)lp->dringp);
5276
5277	cmn_err(CE_CONT, "Dring info:\n");
5278	for (drp = lp->dringp; drp != NULL; drp = drp->next) {
5279		cmn_err(CE_CONT, "\tnum_desc %u : dsize %u\n",
5280		    drp->num_descriptors, drp->descriptor_size);
5281		cmn_err(CE_CONT, "\thandle 0x%lx\n", drp->handle);
5282		cmn_err(CE_CONT, "\tpub_addr 0x%lx : priv_addr 0x%lx\n",
5283		    (uint64_t)drp->pub_addr, (uint64_t)drp->priv_addr);
5284		cmn_err(CE_CONT, "\tident 0x%lx : end_idx %lu\n",
5285		    drp->ident, drp->end_idx);
5286		display_ring(drp);
5287	}
5288}
5289
5290static void
5291display_ring(dring_info_t *dringp)
5292{
5293	uint64_t		i;
5294	uint64_t		priv_count = 0;
5295	uint64_t		pub_count = 0;
5296	vnet_public_desc_t	*pub_addr = NULL;
5297	vsw_private_desc_t	*priv_addr = NULL;
5298
5299	for (i = 0; i < VSW_RING_NUM_EL; i++) {
5300		if (dringp->pub_addr != NULL) {
5301			pub_addr = (vnet_public_desc_t *)dringp->pub_addr + i;
5302
5303			if (pub_addr->hdr.dstate == VIO_DESC_FREE)
5304				pub_count++;
5305		}
5306
5307		if (dringp->priv_addr != NULL) {
5308			priv_addr = (vsw_private_desc_t *)dringp->priv_addr + i;
5309
5310			if (priv_addr->dstate == VIO_DESC_FREE)
5311				priv_count++;
5312		}
5313	}
5314	cmn_err(CE_CONT, "\t%lu elements: %lu priv free: %lu pub free\n",
5315	    i, priv_count, pub_count);
5316}
5317
5318static void
5319dump_flags(uint64_t state)
5320{
5321	int	i;
5322
5323	typedef struct flag_name {
5324		int	flag_val;
5325		char	*flag_name;
5326	} flag_name_t;
5327
5328	flag_name_t	flags[] = {
5329		VSW_VER_INFO_SENT, "VSW_VER_INFO_SENT",
5330		VSW_VER_INFO_RECV, "VSW_VER_INFO_RECV",
5331		VSW_VER_ACK_RECV, "VSW_VER_ACK_RECV",
5332		VSW_VER_ACK_SENT, "VSW_VER_ACK_SENT",
5333		VSW_VER_NACK_RECV, "VSW_VER_NACK_RECV",
5334		VSW_VER_NACK_SENT, "VSW_VER_NACK_SENT",
5335		VSW_ATTR_INFO_SENT, "VSW_ATTR_INFO_SENT",
5336		VSW_ATTR_INFO_RECV, "VSW_ATTR_INFO_RECV",
5337		VSW_ATTR_ACK_SENT, "VSW_ATTR_ACK_SENT",
5338		VSW_ATTR_ACK_RECV, "VSW_ATTR_ACK_RECV",
5339		VSW_ATTR_NACK_SENT, "VSW_ATTR_NACK_SENT",
5340		VSW_ATTR_NACK_RECV, "VSW_ATTR_NACK_RECV",
5341		VSW_DRING_INFO_SENT, "VSW_DRING_INFO_SENT",
5342		VSW_DRING_INFO_RECV, "VSW_DRING_INFO_RECV",
5343		VSW_DRING_ACK_SENT, "VSW_DRING_ACK_SENT",
5344		VSW_DRING_ACK_RECV, "VSW_DRING_ACK_RECV",
5345		VSW_DRING_NACK_SENT, "VSW_DRING_NACK_SENT",
5346		VSW_DRING_NACK_RECV, "VSW_DRING_NACK_RECV",
5347		VSW_RDX_INFO_SENT, "VSW_RDX_INFO_SENT",
5348		VSW_RDX_INFO_RECV, "VSW_RDX_INFO_RECV",
5349		VSW_RDX_ACK_SENT, "VSW_RDX_ACK_SENT",
5350		VSW_RDX_ACK_RECV, "VSW_RDX_ACK_RECV",
5351		VSW_RDX_NACK_SENT, "VSW_RDX_NACK_SENT",
5352		VSW_RDX_NACK_RECV, "VSW_RDX_NACK_RECV",
5353		VSW_MCST_INFO_SENT, "VSW_MCST_INFO_SENT",
5354		VSW_MCST_INFO_RECV, "VSW_MCST_INFO_RECV",
5355		VSW_MCST_ACK_SENT, "VSW_MCST_ACK_SENT",
5356		VSW_MCST_ACK_RECV, "VSW_MCST_ACK_RECV",
5357		VSW_MCST_NACK_SENT, "VSW_MCST_NACK_SENT",
5358		VSW_MCST_NACK_RECV, "VSW_MCST_NACK_RECV",
5359		VSW_LANE_ACTIVE, "VSW_LANE_ACTIVE"};
5360
5361	DERR(NULL, "DUMP_FLAGS: %llx\n", state);
5362	for (i = 0; i < sizeof (flags)/sizeof (flag_name_t); i++) {
5363		if (state & flags[i].flag_val)
5364			DERR(NULL, "DUMP_FLAGS %s", flags[i].flag_name);
5365	}
5366}
5367