vsw_ldc.c revision 6197:e02c1f4ecce1
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/types.h>
#include <sys/errno.h>
#include <sys/debug.h>
#include <sys/time.h>
#include <sys/sysmacros.h>
#include <sys/systm.h>
#include <sys/user.h>
#include <sys/stropts.h>
#include <sys/stream.h>
#include <sys/strlog.h>
#include <sys/strsubr.h>
#include <sys/cmn_err.h>
#include <sys/cpu.h>
#include <sys/kmem.h>
#include <sys/conf.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/ksynch.h>
#include <sys/stat.h>
#include <sys/kstat.h>
#include <sys/vtrace.h>
#include <sys/strsun.h>
#include <sys/dlpi.h>
#include <sys/ethernet.h>
#include <net/if.h>
#include <sys/varargs.h>
#include <sys/machsystm.h>
#include <sys/modctl.h>
#include <sys/modhash.h>
#include <sys/mac.h>
#include <sys/mac_ether.h>
#include <sys/taskq.h>
#include <sys/note.h>
#include <sys/mach_descrip.h>
#include <sys/mdeg.h>
#include <sys/ldc.h>
#include <sys/vsw_fdb.h>
#include <sys/vsw.h>
#include <sys/vio_mailbox.h>
#include <sys/vnet_mailbox.h>
#include <sys/vnet_common.h>
#include <sys/vio_util.h>
#include <sys/sdt.h>
#include <sys/atomic.h>
#include <sys/callb.h>

/* Port addition/deletion/etc. routines */
static	int vsw_port_delete(vsw_port_t *port);
static	int vsw_ldc_attach(vsw_port_t *port, uint64_t ldc_id);
static	int vsw_ldc_detach(vsw_port_t *port, uint64_t ldc_id);
static	int vsw_init_ldcs(vsw_port_t *port);
static	int vsw_uninit_ldcs(vsw_port_t *port);
static	int vsw_ldc_init(vsw_ldc_t *ldcp);
static	int vsw_ldc_uninit(vsw_ldc_t *ldcp);
static	int vsw_drain_ldcs(vsw_port_t *port);
static	int vsw_drain_port_taskq(vsw_port_t *port);
static	void vsw_marker_task(void *);
static	int vsw_plist_del_node(vsw_t *, vsw_port_t *port);
int vsw_detach_ports(vsw_t *vswp);
int vsw_port_add(vsw_t *vswp, md_t *mdp, mde_cookie_t *node);
mcst_addr_t *vsw_del_addr(uint8_t devtype, void *arg, uint64_t addr);
int vsw_port_detach(vsw_t *vswp, int p_instance);
int vsw_portsend(vsw_port_t *port, mblk_t *mp, mblk_t *mpt, uint32_t count);
int vsw_port_attach(vsw_t *vswp, int p_instance,
	uint64_t *ldcids, int nids, struct ether_addr *macaddr);
vsw_port_t *vsw_lookup_port(vsw_t *vswp, int p_instance);

/* Interrupt routines */
static	uint_t vsw_ldc_cb(uint64_t event, caddr_t arg);

/* Handshake routines */
static	void vsw_ldc_reinit(vsw_ldc_t *);
static	void vsw_process_conn_evt(vsw_ldc_t *, uint16_t);
static	void vsw_conn_task(void *);
static	int vsw_check_flag(vsw_ldc_t *, int, uint64_t);
static	void vsw_next_milestone(vsw_ldc_t *);
static	int vsw_supported_version(vio_ver_msg_t *);
static	void vsw_set_vnet_proto_ops(vsw_ldc_t *ldcp);
static	void vsw_reset_vnet_proto_ops(vsw_ldc_t *ldcp);

/* Data processing routines */
static void vsw_process_pkt(void *);
static void vsw_dispatch_ctrl_task(vsw_ldc_t *, void *, vio_msg_tag_t *);
static void vsw_process_ctrl_pkt(void *);
static void vsw_process_ctrl_ver_pkt(vsw_ldc_t *, void *);
static void vsw_process_ctrl_attr_pkt(vsw_ldc_t *, void *);
static void vsw_process_ctrl_mcst_pkt(vsw_ldc_t *, void *);
static void vsw_process_ctrl_dring_reg_pkt(vsw_ldc_t *, void *);
static void vsw_process_ctrl_dring_unreg_pkt(vsw_ldc_t *, void *);
static void vsw_process_ctrl_rdx_pkt(vsw_ldc_t *, void *);
static void vsw_process_data_pkt(vsw_ldc_t *, void *, vio_msg_tag_t *,
	uint32_t);
static void vsw_process_data_dring_pkt(vsw_ldc_t *, void *);
static void vsw_process_pkt_data_nop(void *, void *, uint32_t);
static void vsw_process_pkt_data(void *, void *, uint32_t);
static void vsw_process_data_ibnd_pkt(vsw_ldc_t *, void *);
static void vsw_process_err_pkt(vsw_ldc_t *, void *, vio_msg_tag_t *);

/* Switching/data transmit routines */
static	int vsw_dringsend(vsw_ldc_t *, mblk_t *);
static	int vsw_descrsend(vsw_ldc_t *, mblk_t *);
static void vsw_ldcsend_pkt(vsw_ldc_t *ldcp, mblk_t *mp);
static int vsw_ldcsend(vsw_ldc_t *ldcp, mblk_t *mp, uint32_t retries);
static int vsw_ldctx_pri(void *arg, mblk_t *mp, mblk_t *mpt, uint32_t count);
static int vsw_ldctx(void *arg, mblk_t *mp, mblk_t *mpt, uint32_t count);

/* Packet creation routines */
static void vsw_send_ver(void *);
static void vsw_send_attr(vsw_ldc_t *);
static vio_dring_reg_msg_t *vsw_create_dring_info_pkt(vsw_ldc_t *);
static void vsw_send_dring_info(vsw_ldc_t *);
static void vsw_send_rdx(vsw_ldc_t *);
static int vsw_send_msg(vsw_ldc_t *, void *, int, boolean_t);

/* Dring routines */
static dring_info_t *vsw_create_dring(vsw_ldc_t *);
static void vsw_create_privring(vsw_ldc_t *);
static int vsw_setup_ring(vsw_ldc_t *ldcp, dring_info_t *dp);
static int vsw_dring_find_free_desc(dring_info_t *, vsw_private_desc_t **,
    int *);
static dring_info_t *vsw_ident2dring(lane_t *, uint64_t);
static int vsw_reclaim_dring(dring_info_t *dp, int start);

static void vsw_set_lane_attr(vsw_t *, lane_t *);
static int vsw_check_attr(vnet_attr_msg_t *, vsw_ldc_t *);
static int vsw_dring_match(dring_info_t *dp, vio_dring_reg_msg_t *msg);
static int vsw_mem_cookie_match(ldc_mem_cookie_t *, ldc_mem_cookie_t *);
static int vsw_check_dring_info(vio_dring_reg_msg_t *);

/* Rcv/Tx thread routines */
static void vsw_stop_tx_thread(vsw_ldc_t *ldcp);
static void vsw_ldc_tx_worker(void *arg);
static void vsw_stop_rx_thread(vsw_ldc_t *ldcp);
static void vsw_ldc_rx_worker(void *arg);

/* Misc support routines */
static	caddr_t vsw_print_ethaddr(uint8_t *addr, char *ebuf);
static void vsw_free_lane_resources(vsw_ldc_t *, uint64_t);
static int vsw_free_ring(dring_info_t *);
static void vsw_save_lmacaddr(vsw_t *vswp, uint64_t macaddr);
static int vsw_get_same_dest_list(struct ether_header *ehp,
    mblk_t **rhead, mblk_t **rtail, mblk_t **mpp);
static mblk_t *vsw_dupmsgchain(mblk_t *mp);
static void vsw_mac_rx(vsw_t *vswp, int caller, mac_resource_handle_t mrh,
    mblk_t *mp, mblk_t *mpt, vsw_macrx_flags_t flags);

/* Debugging routines */
static void dump_flags(uint64_t);
static void display_state(void);
static void display_lane(lane_t *);
static void display_ring(dring_info_t *);

/*
 * Functions imported from other files.
 */
extern int vsw_set_hw(vsw_t *, vsw_port_t *, int);
extern int vsw_unset_hw(vsw_t *, vsw_port_t *, int);
extern void vsw_reconfig_hw(vsw_t *);
extern int vsw_add_fdb(vsw_t *vswp, vsw_port_t *port);
extern int vsw_del_fdb(vsw_t *vswp, vsw_port_t *port);
extern int vsw_add_rem_mcst(vnet_mcast_msg_t *mcst_pkt, vsw_port_t *port);
extern void vsw_del_mcst_port(vsw_port_t *port);
extern int vsw_add_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg);
extern int vsw_del_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg);

#define	VSW_NUM_VMPOOLS		3	/* number of vio mblk pools */
#define	VSW_PORT_REF_DELAY	30	/* delay (usec) for port ref_cnt to become 0 */

/*
 * Tunables used in this file.
 */
extern int vsw_num_handshakes;
extern int vsw_wretries;
extern int vsw_desc_delay;
extern int vsw_read_attempts;
extern int vsw_ldc_tx_delay;
extern int vsw_ldc_tx_retries;
extern boolean_t vsw_ldc_rxthr_enabled;
extern boolean_t vsw_ldc_txthr_enabled;
extern uint32_t vsw_ntxds;
extern uint32_t vsw_max_tx_qcount;
extern uint32_t vsw_chain_len;
extern uint32_t vsw_mblk_size1;
extern uint32_t vsw_mblk_size2;
extern uint32_t vsw_mblk_size3;
extern uint32_t vsw_num_mblks1;
extern uint32_t vsw_num_mblks2;
extern uint32_t vsw_num_mblks3;
extern boolean_t vsw_obp_ver_proto_workaround;

#define	LDC_ENTER_LOCK(ldcp)	\
				mutex_enter(&((ldcp)->ldc_cblock));\
				mutex_enter(&((ldcp)->ldc_rxlock));\
				mutex_enter(&((ldcp)->ldc_txlock));
#define	LDC_EXIT_LOCK(ldcp)	\
				mutex_exit(&((ldcp)->ldc_txlock));\
				mutex_exit(&((ldcp)->ldc_rxlock));\
				mutex_exit(&((ldcp)->ldc_cblock));
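/*
 * LDC_ENTER_LOCK acquires the per-channel locks in the fixed order
 * cblock -> rxlock -> txlock, and LDC_EXIT_LOCK releases them in the
 * reverse order. Any path taking more than one of these locks must
 * follow the same ordering to avoid deadlock.
 */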

#define	VSW_VER_EQ(ldcp, major, minor)	\
	((ldcp)->lane_out.ver_major == (major) &&	\
	    (ldcp)->lane_out.ver_minor == (minor))

#define	VSW_VER_LT(ldcp, major, minor)	\
	(((ldcp)->lane_out.ver_major < (major)) ||	\
	    ((ldcp)->lane_out.ver_major == (major) &&	\
	    (ldcp)->lane_out.ver_minor < (minor)))

/* supported versions */
static	ver_sup_t	vsw_versions[] = { {1, 2} };
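/*
 * Note: vsw_supported_version() walks this array in order and assumes
 * entries are sorted in descending order of major version, so any new
 * entries must preserve that ordering.
 */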

/*
 * For the moment the state dump routines have their own
 * private flag.
 */
#define	DUMP_STATE	0

#if DUMP_STATE

#define	DUMP_TAG(tag) \
{			\
	D1(NULL, "DUMP_TAG: type 0x%llx", (tag).vio_msgtype); \
	D1(NULL, "DUMP_TAG: stype 0x%llx", (tag).vio_subtype);	\
	D1(NULL, "DUMP_TAG: senv 0x%llx", (tag).vio_subtype_env);	\
}

#define	DUMP_TAG_PTR(tag) \
{			\
	D1(NULL, "DUMP_TAG: type 0x%llx", (tag)->vio_msgtype); \
	D1(NULL, "DUMP_TAG: stype 0x%llx", (tag)->vio_subtype);	\
	D1(NULL, "DUMP_TAG: senv 0x%llx", (tag)->vio_subtype_env);	\
}

#define	DUMP_FLAGS(flags) dump_flags(flags);
#define	DISPLAY_STATE()	display_state()

#else

#define	DUMP_TAG(tag)
#define	DUMP_TAG_PTR(tag)
#define	DUMP_FLAGS(state)
#define	DISPLAY_STATE()

#endif	/* DUMP_STATE */

/*
 * Attach the specified port.
 *
 * Returns 0 on success, 1 on failure.
 */
int
vsw_port_attach(vsw_t *vswp, int p_instance, uint64_t *ldcids, int nids,
struct ether_addr *macaddr)
{
	vsw_port_list_t		*plist = &vswp->plist;
	vsw_port_t		*port, **prev_port;
	int			i;

	D1(vswp, "%s: enter : port %d", __func__, p_instance);

	/* port already exists? */
	READ_ENTER(&plist->lockrw);
	for (port = plist->head; port != NULL; port = port->p_next) {
		if (port->p_instance == p_instance) {
			DWARN(vswp, "%s: port instance %d already attached",
			    __func__, p_instance);
			RW_EXIT(&plist->lockrw);
			return (1);
		}
	}
	RW_EXIT(&plist->lockrw);

	port = kmem_zalloc(sizeof (vsw_port_t), KM_SLEEP);
	port->p_vswp = vswp;
	port->p_instance = p_instance;
	port->p_ldclist.num_ldcs = 0;
	port->p_ldclist.head = NULL;
	port->addr_set = VSW_ADDR_UNSET;

	rw_init(&port->p_ldclist.lockrw, NULL, RW_DRIVER, NULL);

	mutex_init(&port->tx_lock, NULL, MUTEX_DRIVER, NULL);
	mutex_init(&port->mca_lock, NULL, MUTEX_DRIVER, NULL);

	mutex_init(&port->state_lock, NULL, MUTEX_DRIVER, NULL);
	cv_init(&port->state_cv, NULL, CV_DRIVER, NULL);
	port->state = VSW_PORT_INIT;

	if (nids > VSW_PORT_MAX_LDCS) {
		D2(vswp, "%s: using first %d of %d ldc ids",
		    __func__, VSW_PORT_MAX_LDCS, nids);
		nids = VSW_PORT_MAX_LDCS;
	}

	D2(vswp, "%s: %d nids", __func__, nids);
	for (i = 0; i < nids; i++) {
		D2(vswp, "%s: ldcid (%llx)", __func__, (uint64_t)ldcids[i]);
		if (vsw_ldc_attach(port, (uint64_t)ldcids[i]) != 0) {
			DERR(vswp, "%s: ldc_attach failed", __func__);

			rw_destroy(&port->p_ldclist.lockrw);

			cv_destroy(&port->state_cv);
			mutex_destroy(&port->state_lock);

			mutex_destroy(&port->tx_lock);
			mutex_destroy(&port->mca_lock);
			kmem_free(port, sizeof (vsw_port_t));
			return (1);
		}
	}

	ether_copy(macaddr, &port->p_macaddr);

	if (vswp->switching_setup_done == B_TRUE) {
		/*
		 * If the underlying physical device has been setup,
		 * program the mac address of this port in it.
		 * Otherwise, port macaddr will be set after the physical
		 * device is successfully setup by the timeout handler.
		 */
		mutex_enter(&vswp->hw_lock);
		(void) vsw_set_hw(vswp, port, VSW_VNETPORT);
		mutex_exit(&vswp->hw_lock);
	}

	WRITE_ENTER(&plist->lockrw);

	/* create the fdb entry for this port/mac address */
	(void) vsw_add_fdb(vswp, port);

	/* link it into the list of ports for this vsw instance */
	prev_port = (vsw_port_t **)(&plist->head);
	port->p_next = *prev_port;
	*prev_port = port;
	plist->num_ports++;

	RW_EXIT(&plist->lockrw);

	/*
	 * Initialise the port and any ldc's under it.
	 */
	(void) vsw_init_ldcs(port);

	D1(vswp, "%s: exit", __func__);
	return (0);
}

/*
 * Detach the specified port.
 *
 * Returns 0 on success, 1 on failure.
 */
int
vsw_port_detach(vsw_t *vswp, int p_instance)
{
	vsw_port_t	*port = NULL;
	vsw_port_list_t	*plist = &vswp->plist;

	D1(vswp, "%s: enter: port id %d", __func__, p_instance);

	WRITE_ENTER(&plist->lockrw);

	if ((port = vsw_lookup_port(vswp, p_instance)) == NULL) {
		RW_EXIT(&plist->lockrw);
		return (1);
	}

	if (vsw_plist_del_node(vswp, port)) {
		RW_EXIT(&plist->lockrw);
		return (1);
	}

	/* Remove the fdb entry for this port/mac address */
	(void) vsw_del_fdb(vswp, port);

	/* Remove any multicast addresses.. */
	vsw_del_mcst_port(port);

	/*
	 * No longer need to hold writer lock on port list now
	 * that we have unlinked the target port from the list.
	 */
	RW_EXIT(&plist->lockrw);

	/* Remove address if it was programmed into HW. */
	mutex_enter(&vswp->hw_lock);

	/*
	 * Port's address may not have been set in hardware. This could
	 * happen if the underlying physical device is not yet available
	 * and vsw_setup_switching_timeout() is still in progress.
	 * We remove its addr from hardware only if it has been set before.
	 */
	if (port->addr_set != VSW_ADDR_UNSET)
		(void) vsw_unset_hw(vswp, port, VSW_VNETPORT);

	if (vswp->recfg_reqd)
		vsw_reconfig_hw(vswp);

	mutex_exit(&vswp->hw_lock);

	if (vsw_port_delete(port)) {
		return (1);
	}

	D1(vswp, "%s: exit: p_instance(%d)", __func__, p_instance);
	return (0);
}

/*
 * Detach all active ports.
 *
 * Returns 0 on success, 1 on failure.
 */
int
vsw_detach_ports(vsw_t *vswp)
{
	vsw_port_list_t 	*plist = &vswp->plist;
	vsw_port_t		*port = NULL;

	D1(vswp, "%s: enter", __func__);

	WRITE_ENTER(&plist->lockrw);

	while ((port = plist->head) != NULL) {
		if (vsw_plist_del_node(vswp, port)) {
			DERR(vswp, "%s: Error deleting port %d"
			    " from port list", __func__, port->p_instance);
			RW_EXIT(&plist->lockrw);
			return (1);
		}

		/* Remove address if it was programmed into HW. */
		mutex_enter(&vswp->hw_lock);
		(void) vsw_unset_hw(vswp, port, VSW_VNETPORT);
		mutex_exit(&vswp->hw_lock);

		/* Remove the fdb entry for this port/mac address */
		(void) vsw_del_fdb(vswp, port);

		/* Remove any multicast addresses.. */
		vsw_del_mcst_port(port);

		/*
		 * No longer need to hold the lock on the port list
		 * now that we have unlinked the target port from the
		 * list.
		 */
		RW_EXIT(&plist->lockrw);
		if (vsw_port_delete(port)) {
			DERR(vswp, "%s: Error deleting port %d",
			    __func__, port->p_instance);
			return (1);
		}
		WRITE_ENTER(&plist->lockrw);
	}
	RW_EXIT(&plist->lockrw);

	D1(vswp, "%s: exit", __func__);

	return (0);
}

/*
 * Delete the specified port.
 *
 * Returns 0 on success, 1 on failure.
 */
static int
vsw_port_delete(vsw_port_t *port)
{
	vsw_ldc_list_t 		*ldcl;
	vsw_t			*vswp = port->p_vswp;

	D1(vswp, "%s: enter : port id %d", __func__, port->p_instance);

	(void) vsw_uninit_ldcs(port);

	/*
	 * Wait for any pending ctrl msg tasks which reference this
	 * port to finish.
	 */
	if (vsw_drain_port_taskq(port))
		return (1);

	/*
	 * Wait for port reference count to hit zero.
	 */
	while (port->ref_cnt != 0) {
		delay(drv_usectohz(VSW_PORT_REF_DELAY));
	}

	/*
	 * Wait for any active callbacks to finish
	 */
	if (vsw_drain_ldcs(port))
		return (1);

	ldcl = &port->p_ldclist;
	WRITE_ENTER(&ldcl->lockrw);
	while (ldcl->num_ldcs > 0) {
		if (vsw_ldc_detach(port, ldcl->head->ldc_id) != 0) {
			cmn_err(CE_WARN, "!vsw%d: unable to detach ldc %ld",
			    vswp->instance, ldcl->head->ldc_id);
			RW_EXIT(&ldcl->lockrw);
			return (1);
		}
	}
	RW_EXIT(&ldcl->lockrw);

	rw_destroy(&port->p_ldclist.lockrw);

	mutex_destroy(&port->mca_lock);
	mutex_destroy(&port->tx_lock);
	cv_destroy(&port->state_cv);
	mutex_destroy(&port->state_lock);

	kmem_free(port, sizeof (vsw_port_t));

	D1(vswp, "%s: exit", __func__);

	return (0);
}

/*
 * Attach a logical domain channel (ldc) under a specified port.
 *
 * Returns 0 on success, 1 on failure.
 */
static int
vsw_ldc_attach(vsw_port_t *port, uint64_t ldc_id)
{
	vsw_t 		*vswp = port->p_vswp;
	vsw_ldc_list_t *ldcl = &port->p_ldclist;
	vsw_ldc_t 	*ldcp = NULL;
	ldc_attr_t 	attr;
	ldc_status_t	istatus;
	int 		status = DDI_FAILURE;
	int		rv;
	char		kname[MAXNAMELEN];
	enum		{ PROG_init = 0x0, PROG_mblks = 0x1,
			    PROG_callback = 0x2, PROG_rx_thread = 0x4,
			    PROG_tx_thread = 0x8}
			progress;

	progress = PROG_init;

	D1(vswp, "%s: enter", __func__);

	ldcp = kmem_zalloc(sizeof (vsw_ldc_t), KM_NOSLEEP);
	if (ldcp == NULL) {
		DERR(vswp, "%s: kmem_zalloc failed", __func__);
		return (1);
	}
	ldcp->ldc_id = ldc_id;

	/* Allocate pools of receive mblks */
	rv = vio_init_multipools(&ldcp->vmp, VSW_NUM_VMPOOLS,
	    vsw_mblk_size1, vsw_mblk_size2, vsw_mblk_size3,
	    vsw_num_mblks1, vsw_num_mblks2, vsw_num_mblks3);
	if (rv) {
		DWARN(vswp, "%s: unable to create free mblk pools for"
		    " channel %ld (rv %d)", __func__, ldc_id, rv);
		kmem_free(ldcp, sizeof (vsw_ldc_t));
		return (1);
	}

	progress |= PROG_mblks;

	mutex_init(&ldcp->ldc_txlock, NULL, MUTEX_DRIVER, NULL);
	mutex_init(&ldcp->ldc_rxlock, NULL, MUTEX_DRIVER, NULL);
	mutex_init(&ldcp->ldc_cblock, NULL, MUTEX_DRIVER, NULL);
	mutex_init(&ldcp->drain_cv_lock, NULL, MUTEX_DRIVER, NULL);
	cv_init(&ldcp->drain_cv, NULL, CV_DRIVER, NULL);
	rw_init(&ldcp->lane_in.dlistrw, NULL, RW_DRIVER, NULL);
	rw_init(&ldcp->lane_out.dlistrw, NULL, RW_DRIVER, NULL);

	/* required for handshake with peer */
	ldcp->local_session = (uint64_t)ddi_get_lbolt();
	ldcp->peer_session = 0;
	ldcp->session_status = 0;
	ldcp->hss_id = 1;	/* Initial handshake session id */
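	/*
	 * The handshake session id is incremented atomically in
	 * vsw_process_conn_evt() each time the handshake restarts, so
	 * stale tasks queued against an old session can be identified
	 * and discarded.
	 */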

	/* only set for outbound lane, inbound set by peer */
	vsw_set_lane_attr(vswp, &ldcp->lane_out);

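	/*
	 * The channel is opened in unreliable mode, presumably because
	 * Ethernet traffic tolerates loss and the VIO handshake/dring
	 * protocol layered above provides its own recovery (rationale
	 * assumed; not stated in the original code).
	 */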
	attr.devclass = LDC_DEV_NT_SVC;
	attr.instance = ddi_get_instance(vswp->dip);
	attr.mode = LDC_MODE_UNRELIABLE;
	attr.mtu = VSW_LDC_MTU;
	status = ldc_init(ldc_id, &attr, &ldcp->ldc_handle);
	if (status != 0) {
		DERR(vswp, "%s(%lld): ldc_init failed, rv (%d)",
		    __func__, ldc_id, status);
		goto ldc_attach_fail;
	}

	if (vsw_ldc_rxthr_enabled) {
		ldcp->rx_thr_flags = 0;

		mutex_init(&ldcp->rx_thr_lock, NULL, MUTEX_DRIVER, NULL);
		cv_init(&ldcp->rx_thr_cv, NULL, CV_DRIVER, NULL);
		ldcp->rx_thread = thread_create(NULL, 2 * DEFAULTSTKSZ,
		    vsw_ldc_rx_worker, ldcp, 0, &p0, TS_RUN, maxclsyspri);

		progress |= PROG_rx_thread;
		if (ldcp->rx_thread == NULL) {
			DWARN(vswp, "%s(%lld): Failed to create worker thread",
			    __func__, ldc_id);
			goto ldc_attach_fail;
		}
	}

	if (vsw_ldc_txthr_enabled) {
		ldcp->tx_thr_flags = 0;
		ldcp->tx_mhead = ldcp->tx_mtail = NULL;

		mutex_init(&ldcp->tx_thr_lock, NULL, MUTEX_DRIVER, NULL);
		cv_init(&ldcp->tx_thr_cv, NULL, CV_DRIVER, NULL);
		ldcp->tx_thread = thread_create(NULL, 2 * DEFAULTSTKSZ,
		    vsw_ldc_tx_worker, ldcp, 0, &p0, TS_RUN, maxclsyspri);

		progress |= PROG_tx_thread;
		if (ldcp->tx_thread == NULL) {
			DWARN(vswp, "%s(%lld): Failed to create worker thread",
			    __func__, ldc_id);
			goto ldc_attach_fail;
		}
	}

	status = ldc_reg_callback(ldcp->ldc_handle, vsw_ldc_cb, (caddr_t)ldcp);
	if (status != 0) {
		DERR(vswp, "%s(%lld): ldc_reg_callback failed, rv (%d)",
		    __func__, ldc_id, status);
		(void) ldc_fini(ldcp->ldc_handle);
		goto ldc_attach_fail;
	}
	/*
	 * allocate a message for ldc_read()s, big enough to hold ctrl and
	 * data msgs, including raw data msgs used to recv priority frames.
	 */
	ldcp->msglen = VIO_PKT_DATA_HDRSIZE + ETHERMAX;
	ldcp->ldcmsg = kmem_alloc(ldcp->msglen, KM_SLEEP);

	progress |= PROG_callback;

	mutex_init(&ldcp->status_lock, NULL, MUTEX_DRIVER, NULL);

	if (ldc_status(ldcp->ldc_handle, &istatus) != 0) {
		DERR(vswp, "%s: ldc_status failed", __func__);
		mutex_destroy(&ldcp->status_lock);
		goto ldc_attach_fail;
	}

	ldcp->ldc_status = istatus;
	ldcp->ldc_port = port;
	ldcp->ldc_vswp = vswp;

	vsw_reset_vnet_proto_ops(ldcp);

	(void) sprintf(kname, "%sldc0x%lx", DRV_NAME, ldcp->ldc_id);
	ldcp->ksp = vgen_setup_kstats(DRV_NAME, vswp->instance,
	    kname, &ldcp->ldc_stats);
	if (ldcp->ksp == NULL) {
		DERR(vswp, "%s: kstats setup failed", __func__);
		goto ldc_attach_fail;
	}

	/* link it into the list of channels for this port */
	WRITE_ENTER(&ldcl->lockrw);
	ldcp->ldc_next = ldcl->head;
	ldcl->head = ldcp;
	ldcl->num_ldcs++;
	RW_EXIT(&ldcl->lockrw);

	D1(vswp, "%s: exit", __func__);
	return (0);

ldc_attach_fail:

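	/*
	 * Tear down only what was actually set up, in reverse order;
	 * the 'progress' bitmask records how far attach got before
	 * failing.
	 */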
	if (progress & PROG_callback) {
		(void) ldc_unreg_callback(ldcp->ldc_handle);
		kmem_free(ldcp->ldcmsg, ldcp->msglen);
	}

	if (progress & PROG_rx_thread) {
		if (ldcp->rx_thread != NULL) {
			vsw_stop_rx_thread(ldcp);
		}
		mutex_destroy(&ldcp->rx_thr_lock);
		cv_destroy(&ldcp->rx_thr_cv);
	}

	if (progress & PROG_tx_thread) {
		if (ldcp->tx_thread != NULL) {
			vsw_stop_tx_thread(ldcp);
		}
		mutex_destroy(&ldcp->tx_thr_lock);
		cv_destroy(&ldcp->tx_thr_cv);
	}
	if (ldcp->ksp != NULL) {
		vgen_destroy_kstats(ldcp->ksp);
	}
	mutex_destroy(&ldcp->ldc_txlock);
	mutex_destroy(&ldcp->ldc_rxlock);
	mutex_destroy(&ldcp->ldc_cblock);
	mutex_destroy(&ldcp->drain_cv_lock);

	cv_destroy(&ldcp->drain_cv);

	rw_destroy(&ldcp->lane_in.dlistrw);
	rw_destroy(&ldcp->lane_out.dlistrw);

	if (progress & PROG_mblks) {
		vio_destroy_multipools(&ldcp->vmp, &vswp->rxh);
	}
	kmem_free(ldcp, sizeof (vsw_ldc_t));

	return (1);
}

/*
 * Detach a logical domain channel (ldc) belonging to a
 * particular port.
 *
 * Returns 0 on success, 1 on failure.
 */
static int
vsw_ldc_detach(vsw_port_t *port, uint64_t ldc_id)
{
	vsw_t 		*vswp = port->p_vswp;
	vsw_ldc_t 	*ldcp, *prev_ldcp;
	vsw_ldc_list_t	*ldcl = &port->p_ldclist;
	int 		rv;

	prev_ldcp = NULL;
	for (ldcp = ldcl->head; ldcp != NULL; ldcp = ldcp->ldc_next) {
		if (ldcp->ldc_id == ldc_id) {
			break;
		}
		prev_ldcp = ldcp;
	}

	/* specified ldc id not found */
	if (ldcp == NULL) {
		DERR(vswp, "%s: ldcp = NULL", __func__);
		return (1);
	}

	D2(vswp, "%s: detaching channel %lld", __func__, ldcp->ldc_id);

	/* Stop the receive thread */
	if (ldcp->rx_thread != NULL) {
		vsw_stop_rx_thread(ldcp);
		mutex_destroy(&ldcp->rx_thr_lock);
		cv_destroy(&ldcp->rx_thr_cv);
	}
	kmem_free(ldcp->ldcmsg, ldcp->msglen);

	/* Stop the tx thread */
	if (ldcp->tx_thread != NULL) {
		vsw_stop_tx_thread(ldcp);
		mutex_destroy(&ldcp->tx_thr_lock);
		cv_destroy(&ldcp->tx_thr_cv);
		if (ldcp->tx_mhead != NULL) {
			freemsgchain(ldcp->tx_mhead);
			ldcp->tx_mhead = ldcp->tx_mtail = NULL;
			ldcp->tx_cnt = 0;
		}
	}

	/* Destroy kstats */
	vgen_destroy_kstats(ldcp->ksp);

	/*
	 * Before we can close the channel we must release any mapped
	 * resources (e.g. drings).
	 */
	vsw_free_lane_resources(ldcp, INBOUND);
	vsw_free_lane_resources(ldcp, OUTBOUND);

	/*
	 * If the close fails we are in serious trouble, as we won't
	 * be able to delete the parent port.
	 */
	if ((rv = ldc_close(ldcp->ldc_handle)) != 0) {
		DERR(vswp, "%s: error %d closing channel %lld",
		    __func__, rv, ldcp->ldc_id);
		return (1);
	}

	(void) ldc_fini(ldcp->ldc_handle);

	ldcp->ldc_status = LDC_INIT;
	ldcp->ldc_handle = NULL;
	ldcp->ldc_vswp = NULL;

	/*
	 * Most likely some mblks are still in use and
	 * have not been returned to the pool. These mblks are
	 * added to the pool that is maintained in the device instance.
	 * Another attempt will be made to destroy the pool
	 * when the device detaches.
	 */
	vio_destroy_multipools(&ldcp->vmp, &vswp->rxh);

	/* unlink it from the list */
	if (prev_ldcp == NULL)
		ldcl->head = ldcp->ldc_next;
	else
		prev_ldcp->ldc_next = ldcp->ldc_next;
	ldcl->num_ldcs--;

	mutex_destroy(&ldcp->ldc_txlock);
	mutex_destroy(&ldcp->ldc_rxlock);
	mutex_destroy(&ldcp->ldc_cblock);
	cv_destroy(&ldcp->drain_cv);
	mutex_destroy(&ldcp->drain_cv_lock);
	mutex_destroy(&ldcp->status_lock);
	rw_destroy(&ldcp->lane_in.dlistrw);
	rw_destroy(&ldcp->lane_out.dlistrw);

	kmem_free(ldcp, sizeof (vsw_ldc_t));

	return (0);
}

/*
 * Open and attempt to bring up the channel. Note that the channel
 * can only be brought up if the peer has also opened the channel.
 *
 * Returns 0 if it can open and bring up the channel, otherwise
 * returns 1.
 */
static int
vsw_ldc_init(vsw_ldc_t *ldcp)
{
	vsw_t 		*vswp = ldcp->ldc_vswp;
	ldc_status_t	istatus = 0;
	int		rv;

	D1(vswp, "%s: enter", __func__);

	LDC_ENTER_LOCK(ldcp);

	/* don't start at 0 in case clients don't like that */
	ldcp->next_ident = 1;

	rv = ldc_open(ldcp->ldc_handle);
	if (rv != 0) {
		DERR(vswp, "%s: ldc_open failed: id(%lld) rv(%d)",
		    __func__, ldcp->ldc_id, rv);
		LDC_EXIT_LOCK(ldcp);
		return (1);
	}

	if (ldc_status(ldcp->ldc_handle, &istatus) != 0) {
		DERR(vswp, "%s: unable to get status", __func__);
		LDC_EXIT_LOCK(ldcp);
		return (1);

	} else if (istatus != LDC_OPEN && istatus != LDC_READY) {
		DERR(vswp, "%s: id (%lld) status(%d) is not OPEN/READY",
		    __func__, ldcp->ldc_id, istatus);
		LDC_EXIT_LOCK(ldcp);
		return (1);
	}

	mutex_enter(&ldcp->status_lock);
	ldcp->ldc_status = istatus;
	mutex_exit(&ldcp->status_lock);

	rv = ldc_up(ldcp->ldc_handle);
	if (rv != 0) {
		/*
		 * Not a fatal error for ldc_up() to fail, as peer
		 * end point may simply not be ready yet.
		 */
		D2(vswp, "%s: ldc_up err id(%lld) rv(%d)", __func__,
		    ldcp->ldc_id, rv);
		LDC_EXIT_LOCK(ldcp);
		return (1);
	}

	/*
	 * The ldc_up() call is non-blocking, so we need to explicitly
	 * check the channel status to see if the channel is in fact UP.
	 */
	mutex_enter(&ldcp->status_lock);
	if (ldc_status(ldcp->ldc_handle, &ldcp->ldc_status) != 0) {
		DERR(vswp, "%s: unable to get status", __func__);
		mutex_exit(&ldcp->status_lock);
		LDC_EXIT_LOCK(ldcp);
		return (1);
	}

	if (ldcp->ldc_status == LDC_UP) {
		D2(vswp, "%s: channel %ld now UP (%ld)", __func__,
		    ldcp->ldc_id, ldcp->ldc_status);
		mutex_exit(&ldcp->status_lock);
		LDC_EXIT_LOCK(ldcp);

		vsw_process_conn_evt(ldcp, VSW_CONN_UP);
		return (0);
	}

	mutex_exit(&ldcp->status_lock);
	LDC_EXIT_LOCK(ldcp);

	D1(vswp, "%s: exit", __func__);
	return (0);
}

/* disable callbacks on the channel */
static int
vsw_ldc_uninit(vsw_ldc_t *ldcp)
{
	vsw_t	*vswp = ldcp->ldc_vswp;
	int	rv;

	D1(vswp, "vsw_ldc_uninit: enter: id(%lx)\n", ldcp->ldc_id);

	LDC_ENTER_LOCK(ldcp);

	rv = ldc_set_cb_mode(ldcp->ldc_handle, LDC_CB_DISABLE);
	if (rv != 0) {
		DERR(vswp, "vsw_ldc_uninit(%lld): error disabling "
		    "interrupts (rv = %d)\n", ldcp->ldc_id, rv);
		LDC_EXIT_LOCK(ldcp);
		return (1);
	}

	mutex_enter(&ldcp->status_lock);
	ldcp->ldc_status = LDC_INIT;
	mutex_exit(&ldcp->status_lock);

	LDC_EXIT_LOCK(ldcp);

	D1(vswp, "vsw_ldc_uninit: exit: id(%lx)", ldcp->ldc_id);

	return (0);
}

static int
vsw_init_ldcs(vsw_port_t *port)
{
	vsw_ldc_list_t	*ldcl = &port->p_ldclist;
	vsw_ldc_t	*ldcp;

	READ_ENTER(&ldcl->lockrw);
	ldcp = ldcl->head;
	for (; ldcp != NULL; ldcp = ldcp->ldc_next) {
		(void) vsw_ldc_init(ldcp);
	}
	RW_EXIT(&ldcl->lockrw);

	return (0);
}

static int
vsw_uninit_ldcs(vsw_port_t *port)
{
	vsw_ldc_list_t	*ldcl = &port->p_ldclist;
	vsw_ldc_t	*ldcp;

	D1(NULL, "vsw_uninit_ldcs: enter\n");

	READ_ENTER(&ldcl->lockrw);
	ldcp = ldcl->head;
	for (; ldcp != NULL; ldcp = ldcp->ldc_next) {
		(void) vsw_ldc_uninit(ldcp);
	}
	RW_EXIT(&ldcl->lockrw);

	D1(NULL, "vsw_uninit_ldcs: exit\n");

	return (0);
}

/*
 * Wait until the callback(s) associated with the ldcs under the specified
 * port have completed.
 *
 * Prior to this function being invoked each channel under this port
 * should have been quiesced via ldc_set_cb_mode(DISABLE).
 *
 * A short explanation of what we are doing below:
 *
 * The simplest approach would be to have a reference counter in
 * the ldc structure which is incremented/decremented by the callbacks as
 * they use the channel. The drain function could then simply disable any
 * further callbacks and do a cv_wait for the ref to hit zero. Unfortunately
 * there is a tiny window here: before the callback is able to get the lock
 * on the channel it is interrupted and this function gets to execute. It
 * sees that the ref count is zero and believes it is free to delete the
 * associated data structures.
 *
 * We get around this by taking advantage of the fact that before the ldc
 * framework invokes a callback it sets a flag to indicate that there is a
 * callback active (or about to become active). If we attempt to
 * unregister a callback while this active flag is set then the unregister
 * will fail with EWOULDBLOCK.
 *
 * If the unregister fails we do a cv_timedwait. We will either be signaled
 * by the callback as it is exiting (note we have to wait a short period to
 * allow the callback to return fully to the ldc framework and it to clear
 * the active flag), or by the timer expiring. In either case we again attempt
 * the unregister. We repeat this until we can successfully unregister the
 * callback.
 *
 * The reason we use a cv_timedwait rather than a simple cv_wait is to catch
 * the case where the callback has finished but the ldc framework has not yet
 * cleared the active flag. In this case we would never get a cv_signal.
 */
static int
vsw_drain_ldcs(vsw_port_t *port)
{
	vsw_ldc_list_t	*ldcl = &port->p_ldclist;
	vsw_ldc_t	*ldcp;
	vsw_t		*vswp = port->p_vswp;

	D1(vswp, "%s: enter", __func__);

	READ_ENTER(&ldcl->lockrw);

	ldcp = ldcl->head;

	for (; ldcp != NULL; ldcp = ldcp->ldc_next) {
		/*
		 * If we can unregister the channel callback then we
		 * know that there is no callback either running or
		 * scheduled to run for this channel so move on to next
		 * channel in the list.
		 */
		mutex_enter(&ldcp->drain_cv_lock);

		/* prompt active callbacks to quit */
		ldcp->drain_state = VSW_LDC_DRAINING;

		if ((ldc_unreg_callback(ldcp->ldc_handle)) == 0) {
			D2(vswp, "%s: unreg callback for chan %ld", __func__,
			    ldcp->ldc_id);
			mutex_exit(&ldcp->drain_cv_lock);
			continue;
		} else {
			/*
			 * If we end up here we know that either 1) a callback
			 * is currently executing, 2) one is about to start
			 * (i.e. the ldc framework has set the active flag but
			 * has not actually invoked the callback yet), or 3)
			 * one has finished and has returned to the ldc
			 * framework but the ldc framework has not yet cleared
			 * the active bit.
			 *
			 * Wait for it to finish.
			 */
			while (ldc_unreg_callback(ldcp->ldc_handle)
			    == EWOULDBLOCK)
				(void) cv_timedwait(&ldcp->drain_cv,
				    &ldcp->drain_cv_lock, lbolt + hz);

			mutex_exit(&ldcp->drain_cv_lock);
			D2(vswp, "%s: unreg callback for chan %ld after "
			    "timeout", __func__, ldcp->ldc_id);
		}
	}
	RW_EXIT(&ldcl->lockrw);

	D1(vswp, "%s: exit", __func__);
	return (0);
}

/*
 * Wait until all tasks which reference this port have completed.
 *
 * Prior to this function being invoked each channel under this port
 * should have been quiesced via ldc_set_cb_mode(DISABLE).
 */
static int
vsw_drain_port_taskq(vsw_port_t *port)
{
	vsw_t		*vswp = port->p_vswp;

	D1(vswp, "%s: enter", __func__);

	/*
	 * Mark the port as in the process of being detached, and
	 * dispatch a marker task to the queue so we know when all
	 * relevant tasks have completed.
	 */
	mutex_enter(&port->state_lock);
	port->state = VSW_PORT_DETACHING;

	if ((vswp->taskq_p == NULL) ||
	    (ddi_taskq_dispatch(vswp->taskq_p, vsw_marker_task,
	    port, DDI_NOSLEEP) != DDI_SUCCESS)) {
		DERR(vswp, "%s: unable to dispatch marker task",
		    __func__);
		mutex_exit(&port->state_lock);
		return (1);
	}

	/*
	 * Wait for the marker task to finish.
	 */
	while (port->state != VSW_PORT_DETACHABLE)
		cv_wait(&port->state_cv, &port->state_lock);

	mutex_exit(&port->state_lock);

	D1(vswp, "%s: exit", __func__);

	return (0);
}

static void
vsw_marker_task(void *arg)
{
	vsw_port_t	*port = arg;
	vsw_t		*vswp = port->p_vswp;

	D1(vswp, "%s: enter", __func__);

	mutex_enter(&port->state_lock);

	/*
	 * No further tasks should be dispatched which reference
	 * this port, so it is ok to mark it as safe to detach.
	 */
	port->state = VSW_PORT_DETACHABLE;

	cv_signal(&port->state_cv);

	mutex_exit(&port->state_lock);

	D1(vswp, "%s: exit", __func__);
}

vsw_port_t *
vsw_lookup_port(vsw_t *vswp, int p_instance)
{
	vsw_port_list_t *plist = &vswp->plist;
	vsw_port_t	*port;

	for (port = plist->head; port != NULL; port = port->p_next) {
		if (port->p_instance == p_instance) {
			D2(vswp, "vsw_lookup_port: found p_instance\n");
			return (port);
		}
	}

	return (NULL);
}

/*
 * Search for and remove the specified port from the port
 * list. Returns 0 if able to locate and remove port, otherwise
 * returns 1.
 */
static int
vsw_plist_del_node(vsw_t *vswp, vsw_port_t *port)
{
	vsw_port_list_t *plist = &vswp->plist;
	vsw_port_t	*curr_p, *prev_p;

	if (plist->head == NULL)
		return (1);

	curr_p = prev_p = plist->head;

	while (curr_p != NULL) {
		if (curr_p == port) {
			if (prev_p == curr_p) {
				plist->head = curr_p->p_next;
			} else {
				prev_p->p_next = curr_p->p_next;
			}
			plist->num_ports--;
			break;
		} else {
			prev_p = curr_p;
			curr_p = curr_p->p_next;
		}
	}

	/* return 1 if the port was not found, as documented above */
	return (curr_p == NULL ? 1 : 0);
}

/*
 * Interrupt handler for ldc messages.
 */
static uint_t
vsw_ldc_cb(uint64_t event, caddr_t arg)
{
	vsw_ldc_t	*ldcp = (vsw_ldc_t *)arg;
	vsw_t 		*vswp = ldcp->ldc_vswp;

	D1(vswp, "%s: enter: ldcid (%lld)\n", __func__, ldcp->ldc_id);

	mutex_enter(&ldcp->ldc_cblock);
	ldcp->ldc_stats.callbacks++;

	mutex_enter(&ldcp->status_lock);
	if ((ldcp->ldc_status == LDC_INIT) || (ldcp->ldc_handle == NULL)) {
		mutex_exit(&ldcp->status_lock);
		mutex_exit(&ldcp->ldc_cblock);
		return (LDC_SUCCESS);
	}
	mutex_exit(&ldcp->status_lock);

	if (event & LDC_EVT_UP) {
		/*
		 * Channel has come up.
		 */
		D2(vswp, "%s: id(%ld) event(%llx) UP: status(%ld)",
		    __func__, ldcp->ldc_id, event, ldcp->ldc_status);

		vsw_process_conn_evt(ldcp, VSW_CONN_UP);

		ASSERT((event & (LDC_EVT_RESET | LDC_EVT_DOWN)) == 0);
	}

	if (event & LDC_EVT_READ) {
		/*
		 * Data available for reading.
		 */
		D2(vswp, "%s: id(%ld) event(%llx) data READ",
		    __func__, ldcp->ldc_id, event);

		if (ldcp->rx_thread != NULL) {
			/*
			 * If the receive thread is enabled, then
			 * wakeup the receive thread to process the
			 * LDC messages.
			 */
			mutex_exit(&ldcp->ldc_cblock);
			mutex_enter(&ldcp->rx_thr_lock);
			if (!(ldcp->rx_thr_flags & VSW_WTHR_DATARCVD)) {
				ldcp->rx_thr_flags |= VSW_WTHR_DATARCVD;
				cv_signal(&ldcp->rx_thr_cv);
			}
			mutex_exit(&ldcp->rx_thr_lock);
			mutex_enter(&ldcp->ldc_cblock);
		} else {
			vsw_process_pkt(ldcp);
		}

		ASSERT((event & (LDC_EVT_RESET | LDC_EVT_DOWN)) == 0);

		goto vsw_cb_exit;
	}

	if (event & (LDC_EVT_DOWN | LDC_EVT_RESET)) {
		D2(vswp, "%s: id(%ld) event (%lx) DOWN/RESET: status(%ld)",
		    __func__, ldcp->ldc_id, event, ldcp->ldc_status);

		vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
	}

	/*
	 * Catch either LDC_EVT_WRITE which we don't support or any
	 * unknown event.
	 */
	if (event &
	    ~(LDC_EVT_UP | LDC_EVT_RESET | LDC_EVT_DOWN | LDC_EVT_READ)) {
		DERR(vswp, "%s: id(%ld) Unexpected event=(%llx) status(%ld)",
		    __func__, ldcp->ldc_id, event, ldcp->ldc_status);
	}

vsw_cb_exit:
	mutex_exit(&ldcp->ldc_cblock);

	/*
	 * Let the drain function know we are finishing if it
	 * is waiting.
	 */
	mutex_enter(&ldcp->drain_cv_lock);
	if (ldcp->drain_state == VSW_LDC_DRAINING)
		cv_signal(&ldcp->drain_cv);
	mutex_exit(&ldcp->drain_cv_lock);

	return (LDC_SUCCESS);
}

/*
 * Reinitialise data structures associated with the channel.
 */
static void
vsw_ldc_reinit(vsw_ldc_t *ldcp)
{
	vsw_t		*vswp = ldcp->ldc_vswp;
	vsw_port_t	*port;
	vsw_ldc_list_t	*ldcl;

	D1(vswp, "%s: enter", __func__);

	port = ldcp->ldc_port;
	ldcl = &port->p_ldclist;

	READ_ENTER(&ldcl->lockrw);

	D2(vswp, "%s: in 0x%llx : out 0x%llx", __func__,
	    ldcp->lane_in.lstate, ldcp->lane_out.lstate);

	vsw_free_lane_resources(ldcp, INBOUND);
	vsw_free_lane_resources(ldcp, OUTBOUND);
	RW_EXIT(&ldcl->lockrw);

	ldcp->lane_in.lstate = 0;
	ldcp->lane_out.lstate = 0;

	/*
	 * Remove parent port from any multicast groups
	 * it may have registered with. Client must resend
	 * multicast add command after handshake completes.
	 */
	(void) vsw_del_fdb(vswp, port);

	vsw_del_mcst_port(port);

	ldcp->peer_session = 0;
	ldcp->session_status = 0;
	ldcp->hcnt = 0;
	ldcp->hphase = VSW_MILESTONE0;

	vsw_reset_vnet_proto_ops(ldcp);

	D1(vswp, "%s: exit", __func__);
}

/*
 * Process a connection event.
 *
 * Note - care must be taken to ensure that this function is
 * not called with the dlistrw lock held.
 */
static void
vsw_process_conn_evt(vsw_ldc_t *ldcp, uint16_t evt)
{
	vsw_t		*vswp = ldcp->ldc_vswp;
	vsw_conn_evt_t	*conn = NULL;

	D1(vswp, "%s: enter", __func__);

	/*
	 * Check if either a reset or restart event is pending
	 * or in progress. If so just return.
	 *
	 * A VSW_CONN_RESET event originates either with an LDC_RESET_EVT
	 * being received by the callback handler, or an ECONNRESET error
	 * code being returned from a ldc_read() or ldc_write() call.
	 *
	 * A VSW_CONN_RESTART event occurs when some error checking code
	 * decides that there is a problem with data from the channel,
	 * and that the handshake should be restarted.
	 */
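	/*
	 * ldstub() atomically sets reset_active and returns its previous
	 * value, so of any concurrent reset/restart requests only the
	 * first proceeds; the others simply return here.
	 */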
	if (((evt == VSW_CONN_RESET) || (evt == VSW_CONN_RESTART)) &&
	    (ldstub((uint8_t *)&ldcp->reset_active)))
		return;

	/*
	 * If it is an LDC_UP event we first check the recorded
	 * state of the channel. If this is UP then we know that
	 * the channel moving to the UP state has already been dealt
	 * with and we don't need to dispatch a new task.
	 *
	 * The reason for this check is that when we do an ldc_up(),
	 * depending on the state of the peer, we may or may not get
	 * an LDC_UP event. As we can't depend on getting an LDC_UP event
	 * every time we do ldc_up() we explicitly check the channel
	 * status to see whether it has come up (ldc_up() is asynchronous
	 * and will complete at some undefined time), and take the
	 * appropriate action.
	 *
	 * The flip side of this is that we may get an LDC_UP event
	 * when we have already seen that the channel is up and have
	 * dealt with that.
	 */
	mutex_enter(&ldcp->status_lock);
	if (evt == VSW_CONN_UP) {
		if ((ldcp->ldc_status == LDC_UP) || (ldcp->reset_active != 0)) {
			mutex_exit(&ldcp->status_lock);
			return;
		}
	}
	mutex_exit(&ldcp->status_lock);

	/*
	 * The transaction group id allows us to identify and discard
	 * any tasks which are still pending on the taskq and refer
	 * to the handshake session we are about to restart or reset.
	 * These stale messages no longer have any real meaning.
	 */
	(void) atomic_inc_32(&ldcp->hss_id);

	ASSERT(vswp->taskq_p != NULL);

	if ((conn = kmem_zalloc(sizeof (vsw_conn_evt_t), KM_NOSLEEP)) == NULL) {
		cmn_err(CE_WARN, "!vsw%d: unable to allocate memory for"
		    " connection event", vswp->instance);
		goto err_exit;
	}

	conn->evt = evt;
	conn->ldcp = ldcp;

	if (ddi_taskq_dispatch(vswp->taskq_p, vsw_conn_task, conn,
	    DDI_NOSLEEP) != DDI_SUCCESS) {
		cmn_err(CE_WARN, "!vsw%d: Can't dispatch connection task",
		    vswp->instance);

		kmem_free(conn, sizeof (vsw_conn_evt_t));
		goto err_exit;
	}

	D1(vswp, "%s: exit", __func__);
	return;

err_exit:
	/*
	 * We have most likely failed due to a memory shortage. Clear the
	 * flag so that future requests will at least be attempted and
	 * will hopefully succeed.
	 */
	if ((evt == VSW_CONN_RESET) || (evt == VSW_CONN_RESTART))
		ldcp->reset_active = 0;
}

/*
 * Deal with events relating to a connection. Invoked from a taskq.
 */
static void
vsw_conn_task(void *arg)
{
	vsw_conn_evt_t	*conn = (vsw_conn_evt_t *)arg;
	vsw_ldc_t	*ldcp = NULL;
	vsw_t		*vswp = NULL;
	uint16_t	evt;
	ldc_status_t	curr_status;

	ldcp = conn->ldcp;
	evt = conn->evt;
	vswp = ldcp->ldc_vswp;

	D1(vswp, "%s: enter", __func__);

	/* can safely free now that we have copied out the data */
	kmem_free(conn, sizeof (vsw_conn_evt_t));

	mutex_enter(&ldcp->status_lock);
	if (ldc_status(ldcp->ldc_handle, &curr_status) != 0) {
		cmn_err(CE_WARN, "!vsw%d: Unable to read status of "
		    "channel %ld", vswp->instance, ldcp->ldc_id);
		mutex_exit(&ldcp->status_lock);
		return;
	}

	/*
	 * If we wish to restart the handshake on this channel, then if
	 * the channel is UP we bring it DOWN to flush the underlying
	 * ldc queue.
	 */
	if ((evt == VSW_CONN_RESTART) && (curr_status == LDC_UP))
		(void) ldc_down(ldcp->ldc_handle);

	/*
	 * re-init all the associated data structures.
	 */
	vsw_ldc_reinit(ldcp);

	/*
	 * Bring the channel back up (note it does no harm to
	 * do this even if the channel is already UP; it just
	 * becomes effectively a no-op).
	 */
	(void) ldc_up(ldcp->ldc_handle);

	/*
	 * Check if channel is now UP. This will only happen if
	 * peer has also done a ldc_up().
	 */
	if (ldc_status(ldcp->ldc_handle, &curr_status) != 0) {
		cmn_err(CE_WARN, "!vsw%d: Unable to read status of "
		    "channel %ld", vswp->instance, ldcp->ldc_id);
		mutex_exit(&ldcp->status_lock);
		return;
	}

	ldcp->ldc_status = curr_status;

	/* channel UP so restart handshake by sending version info */
	if (curr_status == LDC_UP) {
		if (ldcp->hcnt++ > vsw_num_handshakes) {
			cmn_err(CE_WARN, "!vsw%d: exceeded number of permitted"
			    " handshake attempts (%d) on channel %ld",
			    vswp->instance, ldcp->hcnt, ldcp->ldc_id);
			mutex_exit(&ldcp->status_lock);
			return;
		}

		if (vsw_obp_ver_proto_workaround == B_FALSE &&
		    (ddi_taskq_dispatch(vswp->taskq_p, vsw_send_ver, ldcp,
		    DDI_NOSLEEP) != DDI_SUCCESS)) {
			cmn_err(CE_WARN, "!vsw%d: Can't dispatch version task",
			    vswp->instance);

			/*
			 * Don't count as valid restart attempt if couldn't
			 * send version msg.
			 */
			if (ldcp->hcnt > 0)
				ldcp->hcnt--;
		}
	}

	/*
	 * Mark that the process is complete by clearing the flag.
	 *
	 * Note it is possible that the taskq dispatch above failed,
	 * most likely due to a memory shortage. We still clear the flag
	 * so that future requests will at least be attempted and will
	 * hopefully succeed.
	 */
	if ((evt == VSW_CONN_RESET) || (evt == VSW_CONN_RESTART))
		ldcp->reset_active = 0;

	mutex_exit(&ldcp->status_lock);

	D1(vswp, "%s: exit", __func__);
}

/*
 * Returns 0 if it was legal for the event signified by the flag to have
 * occurred at the time it did. Otherwise returns 1.
 */
int
vsw_check_flag(vsw_ldc_t *ldcp, int dir, uint64_t flag)
{
	vsw_t		*vswp = ldcp->ldc_vswp;
	uint64_t	state;
	uint64_t	phase;

	if (dir == INBOUND)
		state = ldcp->lane_in.lstate;
	else
		state = ldcp->lane_out.lstate;

	phase = ldcp->hphase;

	switch (flag) {
	case VSW_VER_INFO_RECV:
		if (phase > VSW_MILESTONE0) {
			DERR(vswp, "vsw_check_flag (%d): VER_INFO_RECV"
			    " when in state %d\n", ldcp->ldc_id, phase);
			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
			return (1);
		}
		break;

	case VSW_VER_ACK_RECV:
	case VSW_VER_NACK_RECV:
		if (!(state & VSW_VER_INFO_SENT)) {
			DERR(vswp, "vsw_check_flag (%d): spurious VER_ACK or "
			    "VER_NACK when in state %d\n", ldcp->ldc_id, phase);
			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
			return (1);
		} else
			state &= ~VSW_VER_INFO_SENT;
		break;

	case VSW_ATTR_INFO_RECV:
		if ((phase < VSW_MILESTONE1) || (phase >= VSW_MILESTONE2)) {
			DERR(vswp, "vsw_check_flag (%d): ATTR_INFO_RECV"
			    " when in state %d\n", ldcp->ldc_id, phase);
			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
			return (1);
		}
		break;

	case VSW_ATTR_ACK_RECV:
	case VSW_ATTR_NACK_RECV:
		if (!(state & VSW_ATTR_INFO_SENT)) {
			DERR(vswp, "vsw_check_flag (%d): spurious ATTR_ACK"
			    " or ATTR_NACK when in state %d\n",
			    ldcp->ldc_id, phase);
			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
			return (1);
		} else
			state &= ~VSW_ATTR_INFO_SENT;
		break;

	case VSW_DRING_INFO_RECV:
		if (phase < VSW_MILESTONE1) {
			DERR(vswp, "vsw_check_flag (%d): DRING_INFO_RECV"
			    " when in state %d\n", ldcp->ldc_id, phase);
			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
			return (1);
		}
		break;

	case VSW_DRING_ACK_RECV:
	case VSW_DRING_NACK_RECV:
		if (!(state & VSW_DRING_INFO_SENT)) {
			DERR(vswp, "vsw_check_flag (%d): spurious DRING_ACK"
			    " or DRING_NACK when in state %d\n",
			    ldcp->ldc_id, phase);
			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
			return (1);
		} else
			state &= ~VSW_DRING_INFO_SENT;
		break;

	case VSW_RDX_INFO_RECV:
		if (phase < VSW_MILESTONE3) {
			DERR(vswp, "vsw_check_flag (%d): RDX_INFO_RECV"
			    " when in state %d\n", ldcp->ldc_id, phase);
			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
			return (1);
		}
		break;

	case VSW_RDX_ACK_RECV:
	case VSW_RDX_NACK_RECV:
		if (!(state & VSW_RDX_INFO_SENT)) {
			DERR(vswp, "vsw_check_flag (%d): spurious RDX_ACK or "
			    "RDX_NACK when in state %d\n", ldcp->ldc_id, phase);
			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
			return (1);
		} else
			state &= ~VSW_RDX_INFO_SENT;
		break;

	case VSW_MCST_INFO_RECV:
		if (phase < VSW_MILESTONE3) {
			DERR(vswp, "vsw_check_flag (%d): VSW_MCST_INFO_RECV"
			    " when in state %d\n", ldcp->ldc_id, phase);
			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
			return (1);
		}
		break;

	default:
		DERR(vswp, "vsw_check_flag (%lld): unknown flag (%llx)",
		    ldcp->ldc_id, flag);
		return (1);
	}

	if (dir == INBOUND)
		ldcp->lane_in.lstate = state;
	else
		ldcp->lane_out.lstate = state;

	D1(vswp, "vsw_check_flag (chan %lld): exit", ldcp->ldc_id);

	return (0);
}

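/*
 * Advance the handshake state machine to its next milestone once the
 * current one has been satisfied. The milestones are, in order:
 * version exchange (0), attribute exchange (1), dring registration (2),
 * RDX exchange (3) and, finally, steady-state data transfer (4).
 */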
1687void
1688vsw_next_milestone(vsw_ldc_t *ldcp)
1689{
1690	vsw_t		*vswp = ldcp->ldc_vswp;
1691
1692	D1(vswp, "%s (chan %lld): enter (phase %ld)", __func__,
1693	    ldcp->ldc_id, ldcp->hphase);
1694
1695	DUMP_FLAGS(ldcp->lane_in.lstate);
1696	DUMP_FLAGS(ldcp->lane_out.lstate);
1697
1698	switch (ldcp->hphase) {
1699
1700	case VSW_MILESTONE0:
1701		/*
1702		 * If we haven't started to handshake with our peer,
1703		 * start to do so now.
1704		 */
1705		if (ldcp->lane_out.lstate == 0) {
1706			D2(vswp, "%s: (chan %lld) starting handshake "
1707			    "with peer", __func__, ldcp->ldc_id);
1708			vsw_process_conn_evt(ldcp, VSW_CONN_UP);
1709		}
1710
1711		/*
1712		 * Only way to pass this milestone is to have successfully
1713		 * negotiated version info.
1714		 */
1715		if ((ldcp->lane_in.lstate & VSW_VER_ACK_SENT) &&
1716		    (ldcp->lane_out.lstate & VSW_VER_ACK_RECV)) {
1717
1718			D2(vswp, "%s: (chan %lld) leaving milestone 0",
1719			    __func__, ldcp->ldc_id);
1720
1721			vsw_set_vnet_proto_ops(ldcp);
1722
1723			/*
1724			 * Next milestone is passed when attribute
1725			 * information has been successfully exchanged.
1726			 */
1727			ldcp->hphase = VSW_MILESTONE1;
1728			vsw_send_attr(ldcp);
1729
1730		}
1731		break;
1732
1733	case VSW_MILESTONE1:
1734		/*
1735		 * Only way to pass this milestone is to have successfully
1736		 * negotiated attribute information.
1737		 */
1738		if (ldcp->lane_in.lstate & VSW_ATTR_ACK_SENT) {
1739
1740			ldcp->hphase = VSW_MILESTONE2;
1741
1742			/*
1743			 * If the peer device has said it wishes to
1744			 * use descriptor rings then we send it our ring
1745			 * info, otherwise we just set up a private ring
1746			 * which we use an internal buffer
1747			 */
1748			if ((VSW_VER_EQ(ldcp, 1, 2) &&
1749			    (ldcp->lane_in.xfer_mode & VIO_DRING_MODE_V1_2)) ||
1750			    (VSW_VER_LT(ldcp, 1, 2) &&
1751			    (ldcp->lane_in.xfer_mode ==
1752			    VIO_DRING_MODE_V1_0))) {
1753				vsw_send_dring_info(ldcp);
1754			}
1755		}
1756		break;
1757
1758	case VSW_MILESTONE2:
1759		/*
1760		 * If peer has indicated in its attribute message that
1761		 * it wishes to use descriptor rings then the only way
1762		 * to pass this milestone is for us to have received
1763		 * valid dring info.
1764		 *
1765		 * If peer is not using descriptor rings then just fall
1766		 * through.
1767		 */
1768		if ((VSW_VER_EQ(ldcp, 1, 2) &&
1769		    (ldcp->lane_in.xfer_mode & VIO_DRING_MODE_V1_2)) ||
1770		    (VSW_VER_LT(ldcp, 1, 2) &&
1771		    (ldcp->lane_in.xfer_mode ==
1772		    VIO_DRING_MODE_V1_0))) {
1773			if (!(ldcp->lane_in.lstate & VSW_DRING_ACK_SENT))
1774				break;
1775		}
1776
1777		D2(vswp, "%s: (chan %lld) leaving milestone 2",
1778		    __func__, ldcp->ldc_id);
1779
1780		ldcp->hphase = VSW_MILESTONE3;
1781		vsw_send_rdx(ldcp);
1782		break;
1783
1784	case VSW_MILESTONE3:
1785		/*
1786		 * Pass this milestone when all paramaters have been
1787		 * successfully exchanged and RDX sent in both directions.
1788		 *
1789		 * Mark outbound lane as available to transmit data.
1790		 */
1791		if ((ldcp->lane_out.lstate & VSW_RDX_ACK_SENT) &&
1792		    (ldcp->lane_in.lstate & VSW_RDX_ACK_RECV)) {
1793
1794			D2(vswp, "%s: (chan %lld) leaving milestone 3",
1795			    __func__, ldcp->ldc_id);
1796			D2(vswp, "%s: ** handshake complete (0x%llx : "
1797			    "0x%llx) **", __func__, ldcp->lane_in.lstate,
1798			    ldcp->lane_out.lstate);
1799			ldcp->lane_out.lstate |= VSW_LANE_ACTIVE;
1800			ldcp->hphase = VSW_MILESTONE4;
1801			ldcp->hcnt = 0;
1802			DISPLAY_STATE();
1803		} else {
1804			D2(vswp, "%s: still in milestone 3 (0x%llx : 0x%llx)",
1805			    __func__, ldcp->lane_in.lstate,
1806			    ldcp->lane_out.lstate);
1807		}
1808		break;
1809
1810	case VSW_MILESTONE4:
1811		D2(vswp, "%s: (chan %lld) in milestone 4", __func__,
1812		    ldcp->ldc_id);
1813		break;
1814
1815	default:
1816		DERR(vswp, "%s: (chan %lld) Unknown Phase %x", __func__,
1817		    ldcp->ldc_id, ldcp->hphase);
1818	}
1819
1820	D1(vswp, "%s (chan %lld): exit (phase %ld)", __func__, ldcp->ldc_id,
1821	    ldcp->hphase);
1822}
1823
1824/*
1825 * Check if major version is supported.
1826 *
1827 * Returns 0 if finds supported major number, and if necessary
1828 * adjusts the minor field.
1829 *
1830 * Returns 1 if can't match major number exactly. Sets mjor/minor
1831 * to next lowest support values, or to zero if no other values possible.
1832 */
1833static int
1834vsw_supported_version(vio_ver_msg_t *vp)
1835{
1836	int	i;
1837
1838	D1(NULL, "vsw_supported_version: enter");
1839
1840	for (i = 0; i < VSW_NUM_VER; i++) {
1841		if (vsw_versions[i].ver_major == vp->ver_major) {
1842			/*
1843			 * Matching or lower major version found. Update
1844			 * minor number if necessary.
1845			 */
1846			if (vp->ver_minor > vsw_versions[i].ver_minor) {
1847				D2(NULL, "%s: adjusting minor value from %d "
1848				    "to %d", __func__, vp->ver_minor,
1849				    vsw_versions[i].ver_minor);
1850				vp->ver_minor = vsw_versions[i].ver_minor;
1851			}
1852
1853			return (0);
1854		}
1855
1856		/*
1857		 * If the message contains a higher major version number, set
1858		 * the message's major/minor versions to the current values
1859		 * and return false, so this message will get resent with
1860		 * these values.
1861		 */
1862		if (vsw_versions[i].ver_major < vp->ver_major) {
1863			D2(NULL, "%s: adjusting major and minor "
1864			    "values to %d, %d\n",
1865			    __func__, vsw_versions[i].ver_major,
1866			    vsw_versions[i].ver_minor);
1867			vp->ver_major = vsw_versions[i].ver_major;
1868			vp->ver_minor = vsw_versions[i].ver_minor;
1869			return (1);
1870		}
1871	}
1872
1873	/* No match was possible, zero out fields */
1874	vp->ver_major = 0;
1875	vp->ver_minor = 0;
1876
1877	D1(NULL, "vsw_supported_version: exit");
1878
1879	return (1);
1880}
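
/*
 * A worked example of the above (table contents purely illustrative, not
 * the actual vsw_versions[] values): suppose the table holds { 1.2, 1.0 }
 * in descending order. A peer proposing 1.3 matches major 1, has its
 * minor clamped to 2 and we return 0 (accept). A peer proposing 2.0 is
 * higher than anything in the table, so the message is rewritten to 1.2
 * and we return 1 (it will be resent with those values). A peer proposing
 * 0.5 matches nothing and is below all entries, so both fields are zeroed
 * and we return 1.
 */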
1881
1882/*
1883 * Set vnet-protocol-version dependent functions based on version.
1884 */
1885static void
1886vsw_set_vnet_proto_ops(vsw_ldc_t *ldcp)
1887{
1888	vsw_t	*vswp = ldcp->ldc_vswp;
1889	lane_t	*lp = &ldcp->lane_out;
1890
1891	if (VSW_VER_EQ(ldcp, 1, 2)) {
1892		/* Version 1.2 */
1893
1894		if (VSW_PRI_ETH_DEFINED(vswp)) {
1895			/*
1896			 * enable priority routines and pkt mode only if
1897			 * at least one pri-eth-type is specified in MD.
1898			 */
1899			ldcp->tx = vsw_ldctx_pri;
1900			ldcp->rx_pktdata = vsw_process_pkt_data;
1901
1902			/* set xfer mode for vsw_send_attr() */
1903			lp->xfer_mode = VIO_PKT_MODE | VIO_DRING_MODE_V1_2;
1904		} else {
1905			/* no priority eth types defined in MD */
1906
1907			ldcp->tx = vsw_ldctx;
1908			ldcp->rx_pktdata = vsw_process_pkt_data_nop;
1909
1910			/* set xfer mode for vsw_send_attr() */
1911			lp->xfer_mode = VIO_DRING_MODE_V1_2;
1912
1913		}
1914	} else {
1915		/* Versions prior to 1.2  */
1916
1917		vsw_reset_vnet_proto_ops(ldcp);
1918	}
1919}
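
/*
 * Note on the two xfer_mode encodings used in this file: prior to v1.2
 * the attribute's xfer_mode is a single enumerated value, so it is
 * compared with an exact match (== VIO_DRING_MODE_V1_0). From v1.2
 * onwards it is treated as a bitmap, which allows VIO_PKT_MODE (for
 * priority frames) to be combined with VIO_DRING_MODE_V1_2; hence the
 * bitwise tests (& VIO_DRING_MODE_V1_2) used elsewhere.
 */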
1920
1921/*
1922 * Reset vnet-protocol-version dependent functions to v1.0.
1923 */
1924static void
1925vsw_reset_vnet_proto_ops(vsw_ldc_t *ldcp)
1926{
1927	lane_t	*lp = &ldcp->lane_out;
1928
1929	ldcp->tx = vsw_ldctx;
1930	ldcp->rx_pktdata = vsw_process_pkt_data_nop;
1931
1932	/* set xfer mode for vsw_send_attr() */
1933	lp->xfer_mode = VIO_DRING_MODE_V1_0;
1934}
1935
1936/*
1937 * Main routine for processing messages received over LDC.
1938 */
1939static void
1940vsw_process_pkt(void *arg)
1941{
1942	vsw_ldc_t	*ldcp = (vsw_ldc_t  *)arg;
1943	vsw_t 		*vswp = ldcp->ldc_vswp;
1944	size_t		msglen;
1945	vio_msg_tag_t	*tagp;
1946	uint64_t	*ldcmsg;
1947	int 		rv = 0;
1948
1949
1950	D1(vswp, "%s enter: ldcid (%lld)\n", __func__, ldcp->ldc_id);
1951
1952	ASSERT(MUTEX_HELD(&ldcp->ldc_cblock));
1953
1954	ldcmsg = ldcp->ldcmsg;
1955	/*
1956	 * If channel is up read messages until channel is empty.
1957	 */
1958	do {
1959		msglen = ldcp->msglen;
1960		rv = ldc_read(ldcp->ldc_handle, (caddr_t)ldcmsg, &msglen);
1961
1962		if (rv != 0) {
1963			DERR(vswp, "%s :ldc_read err id(%lld) rv(%d) len(%d)\n",
1964			    __func__, ldcp->ldc_id, rv, msglen);
1965		}
1966
1967		/* channel has been reset */
1968		if (rv == ECONNRESET) {
1969			vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
1970			break;
1971		}
1972
1973		if (msglen == 0) {
1974			D2(vswp, "%s: ldc_read id(%lld) NODATA", __func__,
1975			    ldcp->ldc_id);
1976			break;
1977		}
1978
1979		D2(vswp, "%s: ldc_read id(%lld): msglen(%d)", __func__,
1980		    ldcp->ldc_id, msglen);
1981
1982		/*
1983		 * Figure out what sort of packet we have gotten by
1984		 * examining the msg tag, and then switch it appropriately.
1985		 */
1986		tagp = (vio_msg_tag_t *)ldcmsg;
1987
1988		switch (tagp->vio_msgtype) {
1989		case VIO_TYPE_CTRL:
1990			vsw_dispatch_ctrl_task(ldcp, ldcmsg, tagp);
1991			break;
1992		case VIO_TYPE_DATA:
1993			vsw_process_data_pkt(ldcp, ldcmsg, tagp, msglen);
1994			break;
1995		case VIO_TYPE_ERR:
1996			vsw_process_err_pkt(ldcp, ldcmsg, tagp);
1997			break;
1998		default:
			DERR(vswp, "%s: Unknown tag(%lx) id(%lx)\n",
			    __func__, tagp->vio_msgtype, ldcp->ldc_id);
2001			break;
2002		}
2003	} while (msglen);
2004
2005	D1(vswp, "%s exit: ldcid (%lld)\n", __func__, ldcp->ldc_id);
2006}
2007
2008/*
2009 * Dispatch a task to process a VIO control message.
2010 */
2011static void
2012vsw_dispatch_ctrl_task(vsw_ldc_t *ldcp, void *cpkt, vio_msg_tag_t *tagp)
2013{
2014	vsw_ctrl_task_t		*ctaskp = NULL;
2015	vsw_port_t		*port = ldcp->ldc_port;
2016	vsw_t			*vswp = port->p_vswp;
2017
2018	D1(vswp, "%s: enter", __func__);
2019
2020	/*
2021	 * We need to handle RDX ACK messages in-band as once they
2022	 * are exchanged it is possible that we will get an
2023	 * immediate (legitimate) data packet.
2024	 */
2025	if ((tagp->vio_subtype_env == VIO_RDX) &&
2026	    (tagp->vio_subtype == VIO_SUBTYPE_ACK)) {
2027
2028		if (vsw_check_flag(ldcp, INBOUND, VSW_RDX_ACK_RECV))
2029			return;
2030
2031		ldcp->lane_in.lstate |= VSW_RDX_ACK_RECV;
2032		D2(vswp, "%s (%ld) handling RDX_ACK in place "
2033		    "(ostate 0x%llx : hphase %d)", __func__,
2034		    ldcp->ldc_id, ldcp->lane_in.lstate, ldcp->hphase);
2035		vsw_next_milestone(ldcp);
2036		return;
2037	}
2038
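	/*
	 * Note: KM_NOSLEEP is used here because we are called from the
	 * LDC callback path with ldc_cblock held and so must not block
	 * waiting for memory; on failure we simply restart the connection.
	 */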
2039	ctaskp = kmem_alloc(sizeof (vsw_ctrl_task_t), KM_NOSLEEP);
2040
2041	if (ctaskp == NULL) {
2042		DERR(vswp, "%s: unable to alloc space for ctrl msg", __func__);
2043		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
2044		return;
2045	}
2046
2047	ctaskp->ldcp = ldcp;
2048	bcopy((def_msg_t *)cpkt, &ctaskp->pktp, sizeof (def_msg_t));
2049	ctaskp->hss_id = ldcp->hss_id;
2050
2051	/*
2052	 * Dispatch task to processing taskq if port is not in
2053	 * the process of being detached.
2054	 */
2055	mutex_enter(&port->state_lock);
2056	if (port->state == VSW_PORT_INIT) {
2057		if ((vswp->taskq_p == NULL) ||
2058		    (ddi_taskq_dispatch(vswp->taskq_p, vsw_process_ctrl_pkt,
2059		    ctaskp, DDI_NOSLEEP) != DDI_SUCCESS)) {
2060			DERR(vswp, "%s: unable to dispatch task to taskq",
2061			    __func__);
2062			kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
2063			mutex_exit(&port->state_lock);
2064			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
2065			return;
2066		}
2067	} else {
2068		DWARN(vswp, "%s: port %d detaching, not dispatching "
2069		    "task", __func__, port->p_instance);
2070	}
2071
2072	mutex_exit(&port->state_lock);
2073
2074	D2(vswp, "%s: dispatched task to taskq for chan %d", __func__,
2075	    ldcp->ldc_id);
2076	D1(vswp, "%s: exit", __func__);
2077}
2078
2079/*
2080 * Process a VIO ctrl message. Invoked from taskq.
2081 */
2082static void
2083vsw_process_ctrl_pkt(void *arg)
2084{
2085	vsw_ctrl_task_t	*ctaskp = (vsw_ctrl_task_t *)arg;
2086	vsw_ldc_t	*ldcp = ctaskp->ldcp;
2087	vsw_t 		*vswp = ldcp->ldc_vswp;
2088	vio_msg_tag_t	tag;
2089	uint16_t	env;
2090
2091	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
2092
2093	bcopy(&ctaskp->pktp, &tag, sizeof (vio_msg_tag_t));
2094	env = tag.vio_subtype_env;
2095
	/*
	 * Stale pkt check: a task dispatched before a channel reset
	 * carries the handshake session id that was current at dispatch
	 * time, so discard it if a new session has since begun.
	 */
	if (ctaskp->hss_id < ldcp->hss_id) {
		DWARN(vswp, "%s: discarding stale packet belonging to earlier"
		    " (%ld) handshake session", __func__, ctaskp->hss_id);
		kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
		return;
	}
2102
2103	/* session id check */
2104	if (ldcp->session_status & VSW_PEER_SESSION) {
2105		if (ldcp->peer_session != tag.vio_sid) {
2106			DERR(vswp, "%s (chan %d): invalid session id (%llx)",
2107			    __func__, ldcp->ldc_id, tag.vio_sid);
2108			kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
2109			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
2110			return;
2111		}
2112	}
2113
2114	/*
2115	 * Switch on vio_subtype envelope, then let lower routines
	 * decide if it is an INFO, ACK or NACK packet.
2117	 */
2118	switch (env) {
2119	case VIO_VER_INFO:
2120		vsw_process_ctrl_ver_pkt(ldcp, &ctaskp->pktp);
2121		break;
2122	case VIO_DRING_REG:
2123		vsw_process_ctrl_dring_reg_pkt(ldcp, &ctaskp->pktp);
2124		break;
2125	case VIO_DRING_UNREG:
2126		vsw_process_ctrl_dring_unreg_pkt(ldcp, &ctaskp->pktp);
2127		break;
2128	case VIO_ATTR_INFO:
2129		vsw_process_ctrl_attr_pkt(ldcp, &ctaskp->pktp);
2130		break;
2131	case VNET_MCAST_INFO:
2132		vsw_process_ctrl_mcst_pkt(ldcp, &ctaskp->pktp);
2133		break;
2134	case VIO_RDX:
2135		vsw_process_ctrl_rdx_pkt(ldcp, &ctaskp->pktp);
2136		break;
2137	default:
2138		DERR(vswp, "%s: unknown vio_subtype_env (%x)\n", __func__, env);
2139	}
2140
2141	kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
2142	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
2143}
2144
2145/*
2146 * Version negotiation. We can end up here either because our peer
2147 * has responded to a handshake message we have sent it, or our peer
 * has initiated a handshake with us. If it is the former then the message
 * can only be an ACK or NACK; if it is the latter it can only be an INFO.
 *
 * If it is an ACK we move to the next stage of the handshake, namely
 * attribute exchange. If it is a NACK we see if we can specify another
 * version; if we can't, we stop.
2154 *
2155 * If it is an INFO we reset all params associated with communication
2156 * in that direction over this channel (remember connection is
2157 * essentially 2 independent simplex channels).
2158 */
2159void
2160vsw_process_ctrl_ver_pkt(vsw_ldc_t *ldcp, void *pkt)
2161{
2162	vio_ver_msg_t	*ver_pkt;
2163	vsw_t 		*vswp = ldcp->ldc_vswp;
2164
2165	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
2166
2167	/*
2168	 * We know this is a ctrl/version packet so
2169	 * cast it into the correct structure.
2170	 */
2171	ver_pkt = (vio_ver_msg_t *)pkt;
2172
2173	switch (ver_pkt->tag.vio_subtype) {
2174	case VIO_SUBTYPE_INFO:
2175		D2(vswp, "vsw_process_ctrl_ver_pkt: VIO_SUBTYPE_INFO\n");
2176
2177		/*
2178		 * Record the session id, which we will use from now
2179		 * until we see another VER_INFO msg. Even then the
		 * session id in most cases will be unchanged, except
		 * if the channel was reset.
2182		 */
2183		if ((ldcp->session_status & VSW_PEER_SESSION) &&
2184		    (ldcp->peer_session != ver_pkt->tag.vio_sid)) {
2185			DERR(vswp, "%s: updating session id for chan %lld "
2186			    "from %llx to %llx", __func__, ldcp->ldc_id,
2187			    ldcp->peer_session, ver_pkt->tag.vio_sid);
2188		}
2189
2190		ldcp->peer_session = ver_pkt->tag.vio_sid;
2191		ldcp->session_status |= VSW_PEER_SESSION;
2192
2193		/* Legal message at this time ? */
2194		if (vsw_check_flag(ldcp, INBOUND, VSW_VER_INFO_RECV))
2195			return;
2196
2197		/*
		 * First check the device class. Currently we only expect
		 * to be talking to a network device; in the future we may
		 * also talk to another switch.
2201		 */
2202		if (ver_pkt->dev_class != VDEV_NETWORK) {
2203			DERR(vswp, "%s: illegal device class %d", __func__,
2204			    ver_pkt->dev_class);
2205
2206			ver_pkt->tag.vio_sid = ldcp->local_session;
2207			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
2208
2209			DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt);
2210
2211			(void) vsw_send_msg(ldcp, (void *)ver_pkt,
2212			    sizeof (vio_ver_msg_t), B_TRUE);
2213
2214			ldcp->lane_in.lstate |= VSW_VER_NACK_SENT;
2215			vsw_next_milestone(ldcp);
2216			return;
2217		} else {
2218			ldcp->dev_class = ver_pkt->dev_class;
2219		}
2220
2221		/*
2222		 * Now check the version.
2223		 */
2224		if (vsw_supported_version(ver_pkt) == 0) {
2225			/*
			 * We support this major version, possibly with an
			 * adjusted minor version.
2228			 */
2229
2230			D2(vswp, "%s: accepted ver %d:%d", __func__,
2231			    ver_pkt->ver_major, ver_pkt->ver_minor);
2232
2233			/* Store accepted values */
2234			ldcp->lane_in.ver_major = ver_pkt->ver_major;
2235			ldcp->lane_in.ver_minor = ver_pkt->ver_minor;
2236
2237			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
2238
2239			ldcp->lane_in.lstate |= VSW_VER_ACK_SENT;
2240
2241			if (vsw_obp_ver_proto_workaround == B_TRUE) {
2242				/*
2243				 * Send a version info message
2244				 * using the accepted version that
2245				 * we are about to ack. Also note that
2246				 * we send our ver info before we ack.
				 * Otherwise, as soon as it receives the
				 * ack, OBP sends an attr info msg, which
2249				 * breaks vsw_check_flag() invoked
2250				 * from vsw_process_ctrl_attr_pkt();
2251				 * as we also need VSW_VER_ACK_RECV to
2252				 * be set in lane_out.lstate, before
2253				 * we can receive attr info.
2254				 */
2255				vsw_send_ver(ldcp);
2256			}
2257		} else {
2258			/*
			 * NACK back with the next lower major/minor
			 * pairing we support (if we don't support any more
			 * versions then they will be set to zero).
2262			 */
2263
2264			D2(vswp, "%s: replying with ver %d:%d", __func__,
2265			    ver_pkt->ver_major, ver_pkt->ver_minor);
2266
2267			/* Store updated values */
2268			ldcp->lane_in.ver_major = ver_pkt->ver_major;
2269			ldcp->lane_in.ver_minor = ver_pkt->ver_minor;
2270
2271			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
2272
2273			ldcp->lane_in.lstate |= VSW_VER_NACK_SENT;
2274		}
2275
2276		DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt);
2277		ver_pkt->tag.vio_sid = ldcp->local_session;
2278		(void) vsw_send_msg(ldcp, (void *)ver_pkt,
2279		    sizeof (vio_ver_msg_t), B_TRUE);
2280
2281		vsw_next_milestone(ldcp);
2282		break;
2283
2284	case VIO_SUBTYPE_ACK:
2285		D2(vswp, "%s: VIO_SUBTYPE_ACK\n", __func__);
2286
2287		if (vsw_check_flag(ldcp, OUTBOUND, VSW_VER_ACK_RECV))
2288			return;
2289
2290		/* Store updated values */
2291		ldcp->lane_out.ver_major = ver_pkt->ver_major;
2292		ldcp->lane_out.ver_minor = ver_pkt->ver_minor;
2293
2294		ldcp->lane_out.lstate |= VSW_VER_ACK_RECV;
2295		vsw_next_milestone(ldcp);
2296
2297		break;
2298
2299	case VIO_SUBTYPE_NACK:
2300		D2(vswp, "%s: VIO_SUBTYPE_NACK\n", __func__);
2301
2302		if (vsw_check_flag(ldcp, OUTBOUND, VSW_VER_NACK_RECV))
2303			return;
2304
2305		/*
2306		 * If our peer sent us a NACK with the ver fields set to
2307		 * zero then there is nothing more we can do. Otherwise see
2308		 * if we support either the version suggested, or a lesser
2309		 * one.
2310		 */
2311		if ((ver_pkt->ver_major == 0) && (ver_pkt->ver_minor == 0)) {
2312			DERR(vswp, "%s: peer unable to negotiate any "
2313			    "further.", __func__);
2314			ldcp->lane_out.lstate |= VSW_VER_NACK_RECV;
2315			vsw_next_milestone(ldcp);
2316			return;
2317		}
2318
2319		/*
2320		 * Check to see if we support this major version or
2321		 * a lower one. If we don't then maj/min will be set
2322		 * to zero.
2323		 */
2324		(void) vsw_supported_version(ver_pkt);
2325		if ((ver_pkt->ver_major == 0) && (ver_pkt->ver_minor == 0)) {
2326			/* Nothing more we can do */
2327			DERR(vswp, "%s: version negotiation failed.\n",
2328			    __func__);
2329			ldcp->lane_out.lstate |= VSW_VER_NACK_RECV;
2330			vsw_next_milestone(ldcp);
2331		} else {
2332			/* found a supported major version */
2333			ldcp->lane_out.ver_major = ver_pkt->ver_major;
2334			ldcp->lane_out.ver_minor = ver_pkt->ver_minor;
2335
2336			D2(vswp, "%s: resending with updated values (%x, %x)",
2337			    __func__, ver_pkt->ver_major, ver_pkt->ver_minor);
2338
2339			ldcp->lane_out.lstate |= VSW_VER_INFO_SENT;
2340			ver_pkt->tag.vio_sid = ldcp->local_session;
2341			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_INFO;
2342
2343			DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt);
2344
2345			(void) vsw_send_msg(ldcp, (void *)ver_pkt,
2346			    sizeof (vio_ver_msg_t), B_TRUE);
2347
2348			vsw_next_milestone(ldcp);
2349
2350		}
2351		break;
2352
2353	default:
2354		DERR(vswp, "%s: unknown vio_subtype %x\n", __func__,
2355		    ver_pkt->tag.vio_subtype);
2356	}
2357
2358	D1(vswp, "%s(%lld): exit\n", __func__, ldcp->ldc_id);
2359}
2360
2361/*
2362 * Process an attribute packet. We can end up here either because our peer
2363 * has ACK/NACK'ed back to an earlier ATTR msg we had sent it, or our
 * peer has sent us an attribute INFO message.
 *
 * If it is an ACK we then move to the next stage of the handshake, which
 * is to send our descriptor ring info to our peer. If it is a NACK then
 * there is nothing more we can (currently) do.
 *
 * If we get a valid/acceptable INFO packet (and we have already negotiated
 * a version) we ACK back and set channel state to ATTR_RECV, otherwise we
 * NACK back and reset channel state to INACTIVE.
2373 *
2374 * FUTURE: in time we will probably negotiate over attributes, but for
2375 * the moment unacceptable attributes are regarded as a fatal error.
2376 *
2377 */
2378void
2379vsw_process_ctrl_attr_pkt(vsw_ldc_t *ldcp, void *pkt)
2380{
2381	vnet_attr_msg_t		*attr_pkt;
2382	vsw_t			*vswp = ldcp->ldc_vswp;
2383	vsw_port_t		*port = ldcp->ldc_port;
2384	uint64_t		macaddr = 0;
2385	int			i;
2386
2387	D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);
2388
2389	/*
2390	 * We know this is a ctrl/attr packet so
2391	 * cast it into the correct structure.
2392	 */
2393	attr_pkt = (vnet_attr_msg_t *)pkt;
2394
2395	switch (attr_pkt->tag.vio_subtype) {
2396	case VIO_SUBTYPE_INFO:
2397		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
2398
2399		if (vsw_check_flag(ldcp, INBOUND, VSW_ATTR_INFO_RECV))
2400			return;
2401
2402		/*
2403		 * If the attributes are unacceptable then we NACK back.
2404		 */
2405		if (vsw_check_attr(attr_pkt, ldcp)) {
2406
2407			DERR(vswp, "%s (chan %d): invalid attributes",
2408			    __func__, ldcp->ldc_id);
2409
2410			vsw_free_lane_resources(ldcp, INBOUND);
2411
2412			attr_pkt->tag.vio_sid = ldcp->local_session;
2413			attr_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
2414
2415			DUMP_TAG_PTR((vio_msg_tag_t *)attr_pkt);
2416			ldcp->lane_in.lstate |= VSW_ATTR_NACK_SENT;
2417			(void) vsw_send_msg(ldcp, (void *)attr_pkt,
2418			    sizeof (vnet_attr_msg_t), B_TRUE);
2419
2420			vsw_next_milestone(ldcp);
2421			return;
2422		}
2423
2424		/*
2425		 * Otherwise store attributes for this lane and update
2426		 * lane state.
2427		 */
2428		ldcp->lane_in.mtu = attr_pkt->mtu;
2429		ldcp->lane_in.addr = attr_pkt->addr;
2430		ldcp->lane_in.addr_type = attr_pkt->addr_type;
2431		ldcp->lane_in.xfer_mode = attr_pkt->xfer_mode;
2432		ldcp->lane_in.ack_freq = attr_pkt->ack_freq;
2433
2434		macaddr = ldcp->lane_in.addr;
2435		for (i = ETHERADDRL - 1; i >= 0; i--) {
2436			port->p_macaddr.ether_addr_octet[i] = macaddr & 0xFF;
2437			macaddr >>= 8;
2438		}
2439
2440		/* create the fdb entry for this port/mac address */
2441		(void) vsw_add_fdb(vswp, port);
2442
		/* set up device specific xmit routines */
2444		mutex_enter(&port->tx_lock);
2445		if ((VSW_VER_EQ(ldcp, 1, 2) &&
2446		    (ldcp->lane_in.xfer_mode & VIO_DRING_MODE_V1_2)) ||
2447		    (VSW_VER_LT(ldcp, 1, 2) &&
2448		    (ldcp->lane_in.xfer_mode == VIO_DRING_MODE_V1_0))) {
2449			D2(vswp, "%s: mode = VIO_DRING_MODE", __func__);
2450			port->transmit = vsw_dringsend;
2451		} else if (ldcp->lane_in.xfer_mode == VIO_DESC_MODE) {
2452			D2(vswp, "%s: mode = VIO_DESC_MODE", __func__);
2453			vsw_create_privring(ldcp);
2454			port->transmit = vsw_descrsend;
2455			ldcp->lane_out.xfer_mode = VIO_DESC_MODE;
2456		}
2457		mutex_exit(&port->tx_lock);
2458
2459		attr_pkt->tag.vio_sid = ldcp->local_session;
2460		attr_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
2461
2462		DUMP_TAG_PTR((vio_msg_tag_t *)attr_pkt);
2463
2464		ldcp->lane_in.lstate |= VSW_ATTR_ACK_SENT;
2465
2466		(void) vsw_send_msg(ldcp, (void *)attr_pkt,
2467		    sizeof (vnet_attr_msg_t), B_TRUE);
2468
2469		vsw_next_milestone(ldcp);
2470		break;
2471
2472	case VIO_SUBTYPE_ACK:
2473		D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
2474
2475		if (vsw_check_flag(ldcp, OUTBOUND, VSW_ATTR_ACK_RECV))
2476			return;
2477
2478		ldcp->lane_out.lstate |= VSW_ATTR_ACK_RECV;
2479		vsw_next_milestone(ldcp);
2480		break;
2481
2482	case VIO_SUBTYPE_NACK:
2483		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
2484
2485		if (vsw_check_flag(ldcp, OUTBOUND, VSW_ATTR_NACK_RECV))
2486			return;
2487
2488		ldcp->lane_out.lstate |= VSW_ATTR_NACK_RECV;
2489		vsw_next_milestone(ldcp);
2490		break;
2491
2492	default:
2493		DERR(vswp, "%s: unknown vio_subtype %x\n", __func__,
2494		    attr_pkt->tag.vio_subtype);
2495	}
2496
2497	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
2498}
2499
2500/*
2501 * Process a dring info packet. We can end up here either because our peer
2502 * has ACK/NACK'ed back to an earlier DRING msg we had sent it, or our
2503 * peer has sent us a dring INFO message.
2504 *
2505 * If we get a valid/acceptable INFO packet (and we have already negotiated
2506 * a version) we ACK back and update the lane state, otherwise we NACK back.
2507 *
 * FUTURE: nothing stops the client from sending us info on multiple drings,
 * but for the moment we will just use the first one we are given.
2510 *
2511 */
2512void
2513vsw_process_ctrl_dring_reg_pkt(vsw_ldc_t *ldcp, void *pkt)
2514{
2515	vio_dring_reg_msg_t	*dring_pkt;
2516	vsw_t			*vswp = ldcp->ldc_vswp;
2517	ldc_mem_info_t		minfo;
2518	dring_info_t		*dp, *dbp;
2519	int			dring_found = 0;
2520
2521	/*
2522	 * We know this is a ctrl/dring packet so
2523	 * cast it into the correct structure.
2524	 */
2525	dring_pkt = (vio_dring_reg_msg_t *)pkt;
2526
2527	D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);
2528
2529	switch (dring_pkt->tag.vio_subtype) {
2530	case VIO_SUBTYPE_INFO:
2531		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
2532
2533		if (vsw_check_flag(ldcp, INBOUND, VSW_DRING_INFO_RECV))
2534			return;
2535
2536		/*
2537		 * If the dring params are unacceptable then we NACK back.
2538		 */
2539		if (vsw_check_dring_info(dring_pkt)) {
2540
2541			DERR(vswp, "%s (%lld): invalid dring info",
2542			    __func__, ldcp->ldc_id);
2543
2544			vsw_free_lane_resources(ldcp, INBOUND);
2545
2546			dring_pkt->tag.vio_sid = ldcp->local_session;
2547			dring_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
2548
2549			DUMP_TAG_PTR((vio_msg_tag_t *)dring_pkt);
2550
2551			ldcp->lane_in.lstate |= VSW_DRING_NACK_SENT;
2552
2553			(void) vsw_send_msg(ldcp, (void *)dring_pkt,
2554			    sizeof (vio_dring_reg_msg_t), B_TRUE);
2555
2556			vsw_next_milestone(ldcp);
2557			return;
2558		}
2559
2560		/*
2561		 * Otherwise, attempt to map in the dring using the
2562		 * cookie. If that succeeds we send back a unique dring
2563		 * identifier that the sending side will use in future
2564		 * to refer to this descriptor ring.
2565		 */
2566		dp = kmem_zalloc(sizeof (dring_info_t), KM_SLEEP);
2567
2568		dp->num_descriptors = dring_pkt->num_descriptors;
2569		dp->descriptor_size = dring_pkt->descriptor_size;
2570		dp->options = dring_pkt->options;
2571		dp->ncookies = dring_pkt->ncookies;
2572
2573		/*
2574		 * Note: should only get one cookie. Enforced in
2575		 * the ldc layer.
2576		 */
2577		bcopy(&dring_pkt->cookie[0], &dp->cookie[0],
2578		    sizeof (ldc_mem_cookie_t));
2579
2580		D2(vswp, "%s: num_desc %ld : desc_size %ld", __func__,
2581		    dp->num_descriptors, dp->descriptor_size);
2582		D2(vswp, "%s: options 0x%lx: ncookies %ld", __func__,
2583		    dp->options, dp->ncookies);
2584
2585		if ((ldc_mem_dring_map(ldcp->ldc_handle, &dp->cookie[0],
2586		    dp->ncookies, dp->num_descriptors, dp->descriptor_size,
2587		    LDC_SHADOW_MAP, &(dp->handle))) != 0) {
2588
2589			DERR(vswp, "%s: dring_map failed\n", __func__);
2590
2591			kmem_free(dp, sizeof (dring_info_t));
2592			vsw_free_lane_resources(ldcp, INBOUND);
2593
2594			dring_pkt->tag.vio_sid = ldcp->local_session;
2595			dring_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
2596
2597			DUMP_TAG_PTR((vio_msg_tag_t *)dring_pkt);
2598
2599			ldcp->lane_in.lstate |= VSW_DRING_NACK_SENT;
2600			(void) vsw_send_msg(ldcp, (void *)dring_pkt,
2601			    sizeof (vio_dring_reg_msg_t), B_TRUE);
2602
2603			vsw_next_milestone(ldcp);
2604			return;
2605		}
2606
2607		if ((ldc_mem_dring_info(dp->handle, &minfo)) != 0) {
2608
2609			DERR(vswp, "%s: dring_addr failed\n", __func__);
2610
2611			kmem_free(dp, sizeof (dring_info_t));
2612			vsw_free_lane_resources(ldcp, INBOUND);
2613
2614			dring_pkt->tag.vio_sid = ldcp->local_session;
2615			dring_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
2616
2617			DUMP_TAG_PTR((vio_msg_tag_t *)dring_pkt);
2618
2619			ldcp->lane_in.lstate |= VSW_DRING_NACK_SENT;
2620			(void) vsw_send_msg(ldcp, (void *)dring_pkt,
2621			    sizeof (vio_dring_reg_msg_t), B_TRUE);
2622
2623			vsw_next_milestone(ldcp);
2624			return;
2625		} else {
2626			/* store the address of the pub part of ring */
2627			dp->pub_addr = minfo.vaddr;
2628		}
2629
2630		/* no private section as we are importing */
2631		dp->priv_addr = NULL;
2632
2633		/*
		 * Using a simple monotonically increasing integer for
		 * the ident at the moment.
2636		 */
2637		dp->ident = ldcp->next_ident;
2638		ldcp->next_ident++;
2639
2640		dp->end_idx = 0;
2641		dp->next = NULL;
2642
2643		/*
2644		 * Link it onto the end of the list of drings
2645		 * for this lane.
2646		 */
2647		if (ldcp->lane_in.dringp == NULL) {
2648			D2(vswp, "%s: adding first INBOUND dring", __func__);
2649			ldcp->lane_in.dringp = dp;
2650		} else {
2651			dbp = ldcp->lane_in.dringp;
2652
2653			while (dbp->next != NULL)
2654				dbp = dbp->next;
2655
2656			dbp->next = dp;
2657		}
2658
2659		/* acknowledge it */
2660		dring_pkt->tag.vio_sid = ldcp->local_session;
2661		dring_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
2662		dring_pkt->dring_ident = dp->ident;
2663
2664		(void) vsw_send_msg(ldcp, (void *)dring_pkt,
2665		    sizeof (vio_dring_reg_msg_t), B_TRUE);
2666
2667		ldcp->lane_in.lstate |= VSW_DRING_ACK_SENT;
2668		vsw_next_milestone(ldcp);
2669		break;
2670
2671	case VIO_SUBTYPE_ACK:
2672		D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
2673
2674		if (vsw_check_flag(ldcp, OUTBOUND, VSW_DRING_ACK_RECV))
2675			return;
2676
2677		/*
2678		 * Peer is acknowledging our dring info and will have
2679		 * sent us a dring identifier which we will use to
2680		 * refer to this ring w.r.t. our peer.
2681		 */
2682		dp = ldcp->lane_out.dringp;
2683		if (dp != NULL) {
2684			/*
2685			 * Find the ring this ident should be associated
2686			 * with.
2687			 */
			while (dp != NULL) {
				if (vsw_dring_match(dp, dring_pkt)) {
					dring_found = 1;
					break;
				}
				dp = dp->next;
			}
2698
2699			if (dring_found == 0) {
2700				DERR(NULL, "%s: unrecognised ring cookie",
2701				    __func__);
2702				vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
2703				return;
2704			}
2705
2706		} else {
2707			DERR(vswp, "%s: DRING ACK received but no drings "
2708			    "allocated", __func__);
2709			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
2710			return;
2711		}
2712
2713		/* store ident */
2714		dp->ident = dring_pkt->dring_ident;
2715		ldcp->lane_out.lstate |= VSW_DRING_ACK_RECV;
2716		vsw_next_milestone(ldcp);
2717		break;
2718
2719	case VIO_SUBTYPE_NACK:
2720		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
2721
2722		if (vsw_check_flag(ldcp, OUTBOUND, VSW_DRING_NACK_RECV))
2723			return;
2724
2725		ldcp->lane_out.lstate |= VSW_DRING_NACK_RECV;
2726		vsw_next_milestone(ldcp);
2727		break;
2728
2729	default:
2730		DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
2731		    dring_pkt->tag.vio_subtype);
2732	}
2733
2734	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
2735}
2736
2737/*
2738 * Process a request from peer to unregister a dring.
2739 *
2740 * For the moment we just restart the handshake if our
2741 * peer endpoint attempts to unregister a dring.
2742 */
2743void
2744vsw_process_ctrl_dring_unreg_pkt(vsw_ldc_t *ldcp, void *pkt)
2745{
2746	vsw_t			*vswp = ldcp->ldc_vswp;
2747	vio_dring_unreg_msg_t	*dring_pkt;
2748
2749	/*
2750	 * We know this is a ctrl/dring packet so
2751	 * cast it into the correct structure.
2752	 */
2753	dring_pkt = (vio_dring_unreg_msg_t *)pkt;
2754
2755	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
2756
2757	switch (dring_pkt->tag.vio_subtype) {
2758	case VIO_SUBTYPE_INFO:
2759		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
2760
2761		DWARN(vswp, "%s: restarting handshake..", __func__);
2762		break;
2763
2764	case VIO_SUBTYPE_ACK:
2765		D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
2766
2767		DWARN(vswp, "%s: restarting handshake..", __func__);
2768		break;
2769
2770	case VIO_SUBTYPE_NACK:
2771		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
2772
2773		DWARN(vswp, "%s: restarting handshake..", __func__);
2774		break;
2775
2776	default:
2777		DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
2778		    dring_pkt->tag.vio_subtype);
2779	}
2780
2781	vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
2782
2783	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
2784}
2785
#define	SND_MCST_NACK(ldcp, pkt) \
	do { \
		(pkt)->tag.vio_subtype = VIO_SUBTYPE_NACK; \
		(pkt)->tag.vio_sid = (ldcp)->local_session; \
		(void) vsw_send_msg((ldcp), (void *)(pkt), \
		    sizeof (vnet_mcast_msg_t), B_TRUE); \
	} while (0)
2791
2792/*
2793 * Process a multicast request from a vnet.
2794 *
 * Vnets specify a multicast address that they are interested in. This
2796 * address is used as a key into the hash table which forms the multicast
2797 * forwarding database (mFDB).
2798 *
2799 * The table keys are the multicast addresses, while the table entries
2800 * are pointers to lists of ports which wish to receive packets for the
2801 * specified multicast address.
2802 *
2803 * When a multicast packet is being switched we use the address as a key
2804 * into the hash table, and then walk the appropriate port list forwarding
2805 * the pkt to each port in turn.
2806 *
2807 * If a vnet is no longer interested in a particular multicast grouping
2808 * we simply find the correct location in the hash table and then delete
2809 * the relevant port from the port list.
2810 *
2811 * To deal with the case whereby a port is being deleted without first
2812 * removing itself from the lists in the hash table, we maintain a list
2813 * of multicast addresses the port has registered an interest in, within
2814 * the port structure itself. We then simply walk that list of addresses
2815 * using them as keys into the hash table and remove the port from the
2816 * appropriate lists.
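 *
 * As a sketch of the resulting structure (addresses and port numbers
 * purely illustrative):
 *
 *	mFDB key			entry (list of interested ports)
 *	01:00:5e:00:00:01	->	port1 -> port3
 *	01:00:5e:00:00:02	->	port2
 *
 * so a pkt destined for 01:00:5e:00:00:01 is forwarded to port1 and
 * then port3.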
2817 */
2818static void
2819vsw_process_ctrl_mcst_pkt(vsw_ldc_t *ldcp, void *pkt)
2820{
2821	vnet_mcast_msg_t	*mcst_pkt;
2822	vsw_port_t		*port = ldcp->ldc_port;
2823	vsw_t			*vswp = ldcp->ldc_vswp;
2824	int			i;
2825
2826	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
2827
2828	/*
2829	 * We know this is a ctrl/mcast packet so
2830	 * cast it into the correct structure.
2831	 */
2832	mcst_pkt = (vnet_mcast_msg_t *)pkt;
2833
2834	switch (mcst_pkt->tag.vio_subtype) {
2835	case VIO_SUBTYPE_INFO:
2836		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
2837
2838		/*
		 * Check if we are in the correct state to receive a
		 * multicast message (i.e. handshake complete). If not,
		 * reset the handshake.
2842		 */
2843		if (vsw_check_flag(ldcp, INBOUND, VSW_MCST_INFO_RECV))
2844			return;
2845
2846		/*
		 * Before attempting to add or remove the addresses, check
		 * that they are valid multicast addresses.
2849		 * If not, then NACK back.
2850		 */
2851		for (i = 0; i < mcst_pkt->count; i++) {
2852			if ((mcst_pkt->mca[i].ether_addr_octet[0] & 01) != 1) {
2853				DERR(vswp, "%s: invalid multicast address",
2854				    __func__);
2855				SND_MCST_NACK(ldcp, mcst_pkt);
2856				return;
2857			}
2858		}
2859
2860		/*
2861		 * Now add/remove the addresses. If this fails we
2862		 * NACK back.
2863		 */
2864		if (vsw_add_rem_mcst(mcst_pkt, port) != 0) {
2865			SND_MCST_NACK(ldcp, mcst_pkt);
2866			return;
2867		}
2868
2869		mcst_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
2870		mcst_pkt->tag.vio_sid = ldcp->local_session;
2871
2872		DUMP_TAG_PTR((vio_msg_tag_t *)mcst_pkt);
2873
2874		(void) vsw_send_msg(ldcp, (void *)mcst_pkt,
2875		    sizeof (vnet_mcast_msg_t), B_TRUE);
2876		break;
2877
2878	case VIO_SUBTYPE_ACK:
2879		DWARN(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
2880
2881		/*
2882		 * We shouldn't ever get a multicast ACK message as
2883		 * at the moment we never request multicast addresses
2884		 * to be set on some other device. This may change in
2885		 * the future if we have cascading switches.
2886		 */
2887		if (vsw_check_flag(ldcp, OUTBOUND, VSW_MCST_ACK_RECV))
2888			return;
2889
		/* Do nothing */
2891		break;
2892
2893	case VIO_SUBTYPE_NACK:
2894		DWARN(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
2895
2896		/*
2897		 * We shouldn't get a multicast NACK packet for the
		 * same reasons as we shouldn't get an ACK packet.
2899		 */
2900		if (vsw_check_flag(ldcp, OUTBOUND, VSW_MCST_NACK_RECV))
2901			return;
2902
		/* Do nothing */
2904		break;
2905
2906	default:
2907		DERR(vswp, "%s: unknown vio_subtype %x\n", __func__,
2908		    mcst_pkt->tag.vio_subtype);
2909	}
2910
2911	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
2912}
2913
2914static void
2915vsw_process_ctrl_rdx_pkt(vsw_ldc_t *ldcp, void *pkt)
2916{
2917	vio_rdx_msg_t	*rdx_pkt;
2918	vsw_t		*vswp = ldcp->ldc_vswp;
2919
2920	/*
2921	 * We know this is a ctrl/rdx packet so
2922	 * cast it into the correct structure.
2923	 */
2924	rdx_pkt = (vio_rdx_msg_t *)pkt;
2925
2926	D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);
2927
2928	switch (rdx_pkt->tag.vio_subtype) {
2929	case VIO_SUBTYPE_INFO:
2930		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
2931
2932		if (vsw_check_flag(ldcp, OUTBOUND, VSW_RDX_INFO_RECV))
2933			return;
2934
2935		rdx_pkt->tag.vio_sid = ldcp->local_session;
2936		rdx_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
2937
2938		DUMP_TAG_PTR((vio_msg_tag_t *)rdx_pkt);
2939
2940		ldcp->lane_out.lstate |= VSW_RDX_ACK_SENT;
2941
2942		(void) vsw_send_msg(ldcp, (void *)rdx_pkt,
2943		    sizeof (vio_rdx_msg_t), B_TRUE);
2944
2945		vsw_next_milestone(ldcp);
2946		break;
2947
2948	case VIO_SUBTYPE_ACK:
2949		/*
2950		 * Should be handled in-band by callback handler.
2951		 */
2952		DERR(vswp, "%s: Unexpected VIO_SUBTYPE_ACK", __func__);
2953		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
2954		break;
2955
2956	case VIO_SUBTYPE_NACK:
2957		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
2958
2959		if (vsw_check_flag(ldcp, INBOUND, VSW_RDX_NACK_RECV))
2960			return;
2961
2962		ldcp->lane_in.lstate |= VSW_RDX_NACK_RECV;
2963		vsw_next_milestone(ldcp);
2964		break;
2965
2966	default:
2967		DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
2968		    rdx_pkt->tag.vio_subtype);
2969	}
2970
2971	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
2972}
2973
2974static void
2975vsw_process_data_pkt(vsw_ldc_t *ldcp, void *dpkt, vio_msg_tag_t *tagp,
2976	uint32_t msglen)
2977{
2978	uint16_t	env = tagp->vio_subtype_env;
2979	vsw_t		*vswp = ldcp->ldc_vswp;
2980
2981	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
2982
2983	/* session id check */
2984	if (ldcp->session_status & VSW_PEER_SESSION) {
2985		if (ldcp->peer_session != tagp->vio_sid) {
2986			DERR(vswp, "%s (chan %d): invalid session id (%llx)",
2987			    __func__, ldcp->ldc_id, tagp->vio_sid);
2988			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
2989			return;
2990		}
2991	}
2992
2993	/*
2994	 * It is an error for us to be getting data packets
2995	 * before the handshake has completed.
2996	 */
2997	if (ldcp->hphase != VSW_MILESTONE4) {
2998		DERR(vswp, "%s: got data packet before handshake complete "
2999		    "hphase %d (%x: %x)", __func__, ldcp->hphase,
3000		    ldcp->lane_in.lstate, ldcp->lane_out.lstate);
3001		DUMP_FLAGS(ldcp->lane_in.lstate);
3002		DUMP_FLAGS(ldcp->lane_out.lstate);
3003		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
3004		return;
3005	}
3006
3007	/*
3008	 * To reduce the locking contention, release the
3009	 * ldc_cblock here and re-acquire it once we are done
3010	 * receiving packets.
3011	 */
3012	mutex_exit(&ldcp->ldc_cblock);
3013	mutex_enter(&ldcp->ldc_rxlock);
3014
3015	/*
3016	 * Switch on vio_subtype envelope, then let lower routines
	 * decide if it is an INFO, ACK or NACK packet.
3018	 */
3019	if (env == VIO_DRING_DATA) {
3020		vsw_process_data_dring_pkt(ldcp, dpkt);
3021	} else if (env == VIO_PKT_DATA) {
3022		ldcp->rx_pktdata(ldcp, dpkt, msglen);
3023	} else if (env == VIO_DESC_DATA) {
3024		vsw_process_data_ibnd_pkt(ldcp, dpkt);
3025	} else {
3026		DERR(vswp, "%s: unknown vio_subtype_env (%x)\n", __func__, env);
3027	}
3028
3029	mutex_exit(&ldcp->ldc_rxlock);
3030	mutex_enter(&ldcp->ldc_cblock);
3031
3032	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
3033}
3034
#define	SND_DRING_NACK(ldcp, pkt) \
	do { \
		(pkt)->tag.vio_subtype = VIO_SUBTYPE_NACK; \
		(pkt)->tag.vio_sid = (ldcp)->local_session; \
		(void) vsw_send_msg((ldcp), (void *)(pkt), \
		    sizeof (vio_dring_msg_t), B_TRUE); \
	} while (0)
3040
3041static void
3042vsw_process_data_dring_pkt(vsw_ldc_t *ldcp, void *dpkt)
3043{
3044	vio_dring_msg_t		*dring_pkt;
3045	vnet_public_desc_t	*pub_addr = NULL;
3046	vsw_private_desc_t	*priv_addr = NULL;
3047	dring_info_t		*dp = NULL;
3048	vsw_t			*vswp = ldcp->ldc_vswp;
3049	mblk_t			*mp = NULL;
3050	mblk_t			*bp = NULL;
3051	mblk_t			*bpt = NULL;
3052	size_t			nbytes = 0;
3053	uint64_t		ncookies = 0;
3054	uint64_t		chain = 0;
3055	uint64_t		len;
3056	uint32_t		pos, start, datalen;
3057	uint32_t		range_start, range_end;
3058	int32_t			end, num, cnt = 0;
3059	int			i, rv, msg_rv = 0;
3060	boolean_t		ack_needed = B_FALSE;
3061	boolean_t		prev_desc_ack = B_FALSE;
3062	int			read_attempts = 0;
3063	struct ether_header	*ehp;
3064
3065	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
3066
3067	/*
3068	 * We know this is a data/dring packet so
3069	 * cast it into the correct structure.
3070	 */
3071	dring_pkt = (vio_dring_msg_t *)dpkt;
3072
3073	/*
	 * Switch on the vio_subtype. If it is an INFO then we need to
	 * process the data. If it is an ACK we need to make sure it
	 * makes sense (i.e. did we send an earlier data/info msg), and
	 * if it is a NACK then we may attempt a retry.
3078	 */
3079	switch (dring_pkt->tag.vio_subtype) {
3080	case VIO_SUBTYPE_INFO:
3081		D2(vswp, "%s(%lld): VIO_SUBTYPE_INFO", __func__, ldcp->ldc_id);
3082
3083		READ_ENTER(&ldcp->lane_in.dlistrw);
3084		if ((dp = vsw_ident2dring(&ldcp->lane_in,
3085		    dring_pkt->dring_ident)) == NULL) {
3086			RW_EXIT(&ldcp->lane_in.dlistrw);
3087
3088			DERR(vswp, "%s(%lld): unable to find dring from "
3089			    "ident 0x%llx", __func__, ldcp->ldc_id,
3090			    dring_pkt->dring_ident);
3091
3092			SND_DRING_NACK(ldcp, dring_pkt);
3093			return;
3094		}
3095
3096		start = pos = dring_pkt->start_idx;
3097		end = dring_pkt->end_idx;
3098		len = dp->num_descriptors;
3099
3100		range_start = range_end = pos;
3101
3102		D2(vswp, "%s(%lld): start index %ld : end %ld\n",
3103		    __func__, ldcp->ldc_id, start, end);
3104
3105		if (end == -1) {
3106			num = -1;
3107		} else if (end >= 0) {
3108			num = end >= pos ? end - pos + 1: (len - pos + 1) + end;
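			/*
			 * Example of the wrap-around case: with len 8,
			 * pos 6 and end 1 the peer has flagged
			 * descriptors 6, 7, 0 and 1, so
			 * num = (8 - 6 + 1) + 1 = 4.
			 */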
3109
3110			/* basic sanity check */
3111			if (end > len) {
3112				RW_EXIT(&ldcp->lane_in.dlistrw);
3113				DERR(vswp, "%s(%lld): endpoint %lld outside "
3114				    "ring length %lld", __func__,
3115				    ldcp->ldc_id, end, len);
3116
3117				SND_DRING_NACK(ldcp, dring_pkt);
3118				return;
3119			}
3120		} else {
3121			RW_EXIT(&ldcp->lane_in.dlistrw);
3122			DERR(vswp, "%s(%lld): invalid endpoint %lld",
3123			    __func__, ldcp->ldc_id, end);
3124			SND_DRING_NACK(ldcp, dring_pkt);
3125			return;
3126		}
3127
3128		while (cnt != num) {
3129vsw_recheck_desc:
3130			if ((rv = ldc_mem_dring_acquire(dp->handle,
3131			    pos, pos)) != 0) {
3132				RW_EXIT(&ldcp->lane_in.dlistrw);
3133				DERR(vswp, "%s(%lld): unable to acquire "
3134				    "descriptor at pos %d: err %d",
				    __func__, ldcp->ldc_id, pos, rv);
3136				SND_DRING_NACK(ldcp, dring_pkt);
3137				ldcp->ldc_stats.ierrors++;
3138				return;
3139			}
3140
3141			pub_addr = (vnet_public_desc_t *)dp->pub_addr + pos;
3142
3143			/*
3144			 * When given a bounded range of descriptors
			 * to process, it is an error to hit a descriptor
3146			 * which is not ready. In the non-bounded case
3147			 * (end_idx == -1) this simply indicates we have
3148			 * reached the end of the current active range.
3149			 */
3150			if (pub_addr->hdr.dstate != VIO_DESC_READY) {
3151				/* unbound - no error */
3152				if (end == -1) {
3153					if (read_attempts == vsw_read_attempts)
3154						break;
3155
3156					delay(drv_usectohz(vsw_desc_delay));
3157					read_attempts++;
3158					goto vsw_recheck_desc;
3159				}
3160
3161				/* bounded - error - so NACK back */
3162				RW_EXIT(&ldcp->lane_in.dlistrw);
3163				DERR(vswp, "%s(%lld): descriptor not READY "
3164				    "(%d)", __func__, ldcp->ldc_id,
3165				    pub_addr->hdr.dstate);
3166				SND_DRING_NACK(ldcp, dring_pkt);
3167				return;
3168			}
3169
3170			DTRACE_PROBE1(read_attempts, int, read_attempts);
3171
3172			range_end = pos;
3173
3174			/*
3175			 * If we ACK'd the previous descriptor then now
3176			 * record the new range start position for later
3177			 * ACK's.
3178			 */
3179			if (prev_desc_ack) {
3180				range_start = pos;
3181
3182				D2(vswp, "%s(%lld): updating range start to be "
3183				    "%d", __func__, ldcp->ldc_id, range_start);
3184
3185				prev_desc_ack = B_FALSE;
3186			}
3187
3188			/*
			 * Data is padded to align on an 8 byte boundary;
3190			 * datalen is actual data length, i.e. minus that
3191			 * padding.
3192			 */
3193			datalen = pub_addr->nbytes;
3194
3195			/*
			 * Does the peer wish us to ACK when we have
			 * finished with this descriptor?
3198			 */
3199			if (pub_addr->hdr.ack)
3200				ack_needed = B_TRUE;
3201
3202			D2(vswp, "%s(%lld): processing desc %lld at pos"
3203			    " 0x%llx : dstate 0x%lx : datalen 0x%lx",
3204			    __func__, ldcp->ldc_id, pos, pub_addr,
3205			    pub_addr->hdr.dstate, datalen);
3206
3207			/*
3208			 * Mark that we are starting to process descriptor.
3209			 */
3210			pub_addr->hdr.dstate = VIO_DESC_ACCEPTED;
3211
3212			/*
3213			 * Ensure that we ask ldc for an aligned
3214			 * number of bytes.
3215			 */
3216			nbytes = (datalen + VNET_IPALIGN + 7) & ~7;
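			/*
			 * For example, assuming VNET_IPALIGN is 6: a
			 * 60 byte frame gives 60 + 6 + 7 = 73, which
			 * masks down to 72, i.e. 66 rounded up to the
			 * next multiple of 8.
			 */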
3217
3218			mp = vio_multipool_allocb(&ldcp->vmp, nbytes);
3219			if (mp == NULL) {
3220				ldcp->ldc_stats.rx_vio_allocb_fail++;
3221				/*
3222				 * No free receive buffers available, so
				 * fall back on allocb(9F). Make sure that
3224				 * we get a data buffer which is a multiple
3225				 * of 8 as this is required by ldc_mem_copy.
3226				 */
3227				DTRACE_PROBE(allocb);
3228				if ((mp = allocb(datalen + VNET_IPALIGN + 8,
3229				    BPRI_MED)) == NULL) {
3230					DERR(vswp, "%s(%ld): allocb failed",
3231					    __func__, ldcp->ldc_id);
3232					pub_addr->hdr.dstate = VIO_DESC_DONE;
3233					(void) ldc_mem_dring_release(dp->handle,
3234					    pos, pos);
3235					ldcp->ldc_stats.ierrors++;
3236					ldcp->ldc_stats.rx_allocb_fail++;
3237					break;
3238				}
3239			}
3240
3241			ncookies = pub_addr->ncookies;
3242			rv = ldc_mem_copy(ldcp->ldc_handle,
3243			    (caddr_t)mp->b_rptr, 0, &nbytes,
3244			    pub_addr->memcookie, ncookies, LDC_COPY_IN);
3245
3246			if (rv != 0) {
3247				DERR(vswp, "%s(%d): unable to copy in data "
3248				    "from %d cookies in desc %d (rv %d)",
3249				    __func__, ldcp->ldc_id, ncookies, pos, rv);
3250				freemsg(mp);
3251
3252				pub_addr->hdr.dstate = VIO_DESC_DONE;
3253				(void) ldc_mem_dring_release(dp->handle,
3254				    pos, pos);
3255				ldcp->ldc_stats.ierrors++;
3256				break;
3257			} else {
3258				D2(vswp, "%s(%d): copied in %ld bytes"
3259				    " using %d cookies", __func__,
3260				    ldcp->ldc_id, nbytes, ncookies);
3261			}
3262
3263			/* adjust the read pointer to skip over the padding */
3264			mp->b_rptr += VNET_IPALIGN;
3265
3266			/* point to the actual end of data */
3267			mp->b_wptr = mp->b_rptr + datalen;
3268
3269			/* update statistics */
3270			ehp = (struct ether_header *)mp->b_rptr;
3271			if (IS_BROADCAST(ehp))
3272				ldcp->ldc_stats.brdcstrcv++;
3273			else if (IS_MULTICAST(ehp))
3274				ldcp->ldc_stats.multircv++;
3275
3276			ldcp->ldc_stats.ipackets++;
3277			ldcp->ldc_stats.rbytes += datalen;
3278
3279			/* build a chain of received packets */
3280			if (bp == NULL) {
3281				/* first pkt */
3282				bp = mp;
3283				bp->b_next = bp->b_prev = NULL;
3284				bpt = bp;
3285				chain = 1;
3286			} else {
3287				mp->b_next = mp->b_prev = NULL;
3288				bpt->b_next = mp;
3289				bpt = mp;
3290				chain++;
3291			}
3292
3293			/* mark we are finished with this descriptor */
3294			pub_addr->hdr.dstate = VIO_DESC_DONE;
3295
3296			(void) ldc_mem_dring_release(dp->handle, pos, pos);
3297
3298			/*
3299			 * Send an ACK back to peer if requested.
3300			 */
3301			if (ack_needed) {
3302				ack_needed = B_FALSE;
3303
3304				dring_pkt->start_idx = range_start;
3305				dring_pkt->end_idx = range_end;
3306
3307				DERR(vswp, "%s(%lld): processed %d %d, ACK"
3308				    " requested", __func__, ldcp->ldc_id,
3309				    dring_pkt->start_idx, dring_pkt->end_idx);
3310
3311				dring_pkt->dring_process_state = VIO_DP_ACTIVE;
3312				dring_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
3313				dring_pkt->tag.vio_sid = ldcp->local_session;
3314
3315				msg_rv = vsw_send_msg(ldcp, (void *)dring_pkt,
3316				    sizeof (vio_dring_msg_t), B_FALSE);
3317
3318				/*
3319				 * Check if ACK was successfully sent. If not
3320				 * we break and deal with that below.
3321				 */
3322				if (msg_rv != 0)
3323					break;
3324
3325				prev_desc_ack = B_TRUE;
3326				range_start = pos;
3327			}
3328
3329			/* next descriptor */
3330			pos = (pos + 1) % len;
3331			cnt++;
3332
3333			/*
3334			 * Break out of loop here and stop processing to
3335			 * allow some other network device (or disk) to
3336			 * get access to the cpu.
3337			 */
3338			if (chain > vsw_chain_len) {
3339				D3(vswp, "%s(%lld): switching chain of %d "
3340				    "msgs", __func__, ldcp->ldc_id, chain);
3341				break;
3342			}
3343		}
3344		RW_EXIT(&ldcp->lane_in.dlistrw);
3345
3346		/*
3347		 * If when we attempted to send the ACK we found that the
3348		 * channel had been reset then now handle this. We deal with
3349		 * it here as we cannot reset the channel while holding the
3350		 * dlistrw lock, and we don't want to acquire/release it
3351		 * continuously in the above loop, as a channel reset should
3352		 * be a rare event.
3353		 */
3354		if (msg_rv == ECONNRESET) {
3355			vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
3356			break;
3357		}
3358
3359		/* send the chain of packets to be switched */
3360		if (bp != NULL) {
3361			DTRACE_PROBE1(vsw_rcv_msgs, int, chain);
3362			D3(vswp, "%s(%lld): switching chain of %d msgs",
3363			    __func__, ldcp->ldc_id, chain);
3364			vswp->vsw_switch_frame(vswp, bp, VSW_VNETPORT,
3365			    ldcp->ldc_port, NULL);
3366		}
3367
3368		DTRACE_PROBE1(msg_cnt, int, cnt);
3369
3370		/*
		 * We are now finished, so ACK back with the state
		 * set to STOPPED so our peer knows we are finished.
3373		 */
3374		dring_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
3375		dring_pkt->tag.vio_sid = ldcp->local_session;
3376
3377		dring_pkt->dring_process_state = VIO_DP_STOPPED;
3378
3379		DTRACE_PROBE(stop_process_sent);
3380
3381		/*
3382		 * We have not processed any more descriptors beyond
3383		 * the last one we ACK'd.
3384		 */
3385		if (prev_desc_ack)
3386			range_start = range_end;
3387
3388		dring_pkt->start_idx = range_start;
3389		dring_pkt->end_idx = range_end;
3390
3391		D2(vswp, "%s(%lld) processed : %d : %d, now stopping",
3392		    __func__, ldcp->ldc_id, dring_pkt->start_idx,
3393		    dring_pkt->end_idx);
3394
3395		(void) vsw_send_msg(ldcp, (void *)dring_pkt,
3396		    sizeof (vio_dring_msg_t), B_TRUE);
3397		break;
3398
3399	case VIO_SUBTYPE_ACK:
3400		D2(vswp, "%s(%lld): VIO_SUBTYPE_ACK", __func__, ldcp->ldc_id);
3401		/*
3402		 * Verify that the relevant descriptors are all
		 * marked as DONE.
3404		 */
3405		READ_ENTER(&ldcp->lane_out.dlistrw);
3406		if ((dp = vsw_ident2dring(&ldcp->lane_out,
3407		    dring_pkt->dring_ident)) == NULL) {
3408			RW_EXIT(&ldcp->lane_out.dlistrw);
3409			DERR(vswp, "%s: unknown ident in ACK", __func__);
3410			return;
3411		}
3412
		start = dring_pkt->start_idx;
		end = dring_pkt->end_idx;
		len = dp->num_descriptors;
3418
3419		mutex_enter(&dp->dlock);
3420		dp->last_ack_recv = end;
3421		ldcp->ldc_stats.dring_data_acks++;
3422		mutex_exit(&dp->dlock);
3423
3424		(void) vsw_reclaim_dring(dp, start);
3425
3426		/*
3427		 * If our peer is stopping processing descriptors then
3428		 * we check to make sure it has processed all the descriptors
3429		 * we have updated. If not then we send it a new message
3430		 * to prompt it to restart.
3431		 */
3432		if (dring_pkt->dring_process_state == VIO_DP_STOPPED) {
3433			DTRACE_PROBE(stop_process_recv);
3434			D2(vswp, "%s(%lld): got stopping msg : %d : %d",
3435			    __func__, ldcp->ldc_id, dring_pkt->start_idx,
3436			    dring_pkt->end_idx);
3437
3438			/*
3439			 * Check next descriptor in public section of ring.
3440			 * If its marked as READY then we need to prompt our
3441			 * peer to start processing the ring again.
3442			 */
3443			i = (end + 1) % len;
3444			pub_addr = (vnet_public_desc_t *)dp->pub_addr + i;
3445			priv_addr = (vsw_private_desc_t *)dp->priv_addr + i;
3446
3447			/*
3448			 * Hold the restart lock across all of this to
			 * make sure that it is not possible for us to
			 * decide that a msg needs to be sent in the
			 * future while the sending code, having already
			 * checked, is about to exit.
3453			 */
3454			mutex_enter(&dp->restart_lock);
3455			ldcp->ldc_stats.dring_stopped_acks++;
3456			mutex_enter(&priv_addr->dstate_lock);
3457			if (pub_addr->hdr.dstate == VIO_DESC_READY) {
3458
3459				mutex_exit(&priv_addr->dstate_lock);
3460
3461				dring_pkt->tag.vio_subtype = VIO_SUBTYPE_INFO;
3462				dring_pkt->tag.vio_sid = ldcp->local_session;
3463
3464				dring_pkt->start_idx = (end + 1) % len;
3465				dring_pkt->end_idx = -1;
3466
3467				D2(vswp, "%s(%lld) : sending restart msg:"
3468				    " %d : %d", __func__, ldcp->ldc_id,
3469				    dring_pkt->start_idx, dring_pkt->end_idx);
3470
3471				msg_rv = vsw_send_msg(ldcp, (void *)dring_pkt,
3472				    sizeof (vio_dring_msg_t), B_FALSE);
3473				ldcp->ldc_stats.dring_data_msgs++;
3474
3475			} else {
3476				mutex_exit(&priv_addr->dstate_lock);
3477				dp->restart_reqd = B_TRUE;
3478			}
3479			mutex_exit(&dp->restart_lock);
3480		}
3481		RW_EXIT(&ldcp->lane_out.dlistrw);
3482
3483		/* only do channel reset after dropping dlistrw lock */
3484		if (msg_rv == ECONNRESET)
3485			vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
3486
3487		break;
3488
3489	case VIO_SUBTYPE_NACK:
3490		DWARN(vswp, "%s(%lld): VIO_SUBTYPE_NACK",
3491		    __func__, ldcp->ldc_id);
3492		/*
		 * Something is badly wrong if we are getting NACKs
3494		 * for our data pkts. So reset the channel.
3495		 */
3496		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
3497
3498		break;
3499
3500	default:
3501		DERR(vswp, "%s(%lld): Unknown vio_subtype %x\n", __func__,
3502		    ldcp->ldc_id, dring_pkt->tag.vio_subtype);
3503	}
3504
3505	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
3506}
3507
3508/*
 * Dummy pkt data handler function for vnet protocol version 1.0. Peers
 * running versions prior to 1.2 never send raw pkt data messages, so this
 * no-op simply keeps the rx_pktdata hook valid for them.
3510 */
3511static void
3512vsw_process_pkt_data_nop(void *arg1, void *arg2, uint32_t msglen)
3513{
3514	_NOTE(ARGUNUSED(arg1, arg2, msglen))
3515}
3516
3517/*
3518 * This function handles raw pkt data messages received over the channel.
 * Currently, only priority-eth-type frames are received through this
 * mechanism. In this case, the frame (data) is present within the message
 * itself and is copied into an mblk before being switched.
3522 */
3523static void
3524vsw_process_pkt_data(void *arg1, void *arg2, uint32_t msglen)
3525{
3526	vsw_ldc_t		*ldcp = (vsw_ldc_t *)arg1;
3527	vio_raw_data_msg_t	*dpkt = (vio_raw_data_msg_t *)arg2;
3528	uint32_t		size;
3529	mblk_t			*mp;
3530	vsw_t			*vswp = ldcp->ldc_vswp;
3531	vgen_stats_t		*statsp = &ldcp->ldc_stats;
3532
3533	size = msglen - VIO_PKT_DATA_HDRSIZE;
3534	if (size < ETHERMIN || size > ETHERMAX) {
3535		(void) atomic_inc_32(&statsp->rx_pri_fail);
3536		DWARN(vswp, "%s(%lld) invalid size(%d)\n", __func__,
3537		    ldcp->ldc_id, size);
3538		return;
3539	}
3540
3541	mp = vio_multipool_allocb(&ldcp->vmp, size);
3542	if (mp == NULL) {
3543		mp = allocb(size, BPRI_MED);
3544		if (mp == NULL) {
3545			(void) atomic_inc_32(&statsp->rx_pri_fail);
3546			DWARN(vswp, "%s(%lld) allocb failure, "
3547			    "unable to process priority frame\n", __func__,
3548			    ldcp->ldc_id);
3549			return;
3550		}
3551	}
3552
3553	/* copy the frame from the payload of raw data msg into the mblk */
3554	bcopy(dpkt->data, mp->b_rptr, size);
3555	mp->b_wptr = mp->b_rptr + size;
3556
3557	/* update stats */
3558	(void) atomic_inc_64(&statsp->rx_pri_packets);
3559	(void) atomic_add_64(&statsp->rx_pri_bytes, size);
3560
3561	/* switch the frame to destination */
3562	vswp->vsw_switch_frame(vswp, mp, VSW_VNETPORT, ldcp->ldc_port, NULL);
3563}
3564
3565/*
3566 * Process an in-band descriptor message (most likely from
3567 * OBP).
3568 */
3569static void
3570vsw_process_data_ibnd_pkt(vsw_ldc_t *ldcp, void *pkt)
3571{
3572	vnet_ibnd_desc_t	*ibnd_desc;
3573	dring_info_t		*dp = NULL;
3574	vsw_private_desc_t	*priv_addr = NULL;
3575	vsw_t			*vswp = ldcp->ldc_vswp;
3576	mblk_t			*mp = NULL;
3577	size_t			nbytes = 0;
3578	size_t			off = 0;
3579	uint64_t		idx = 0;
3580	uint32_t		num = 1, len, datalen = 0;
3581	uint64_t		ncookies = 0;
3582	int			i, rv;
3583	int			j = 0;
3584
3585	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
3586
3587	ibnd_desc = (vnet_ibnd_desc_t *)pkt;
3588
3589	switch (ibnd_desc->hdr.tag.vio_subtype) {
3590	case VIO_SUBTYPE_INFO:
3591		D1(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
3592
3593		if (vsw_check_flag(ldcp, INBOUND, VSW_DRING_INFO_RECV))
3594			return;
3595
3596		/*
		 * Data is padded to align on an 8 byte boundary;
3598		 * nbytes is actual data length, i.e. minus that
3599		 * padding.
3600		 */
3601		datalen = ibnd_desc->nbytes;
3602
3603		D2(vswp, "%s(%lld): processing inband desc : "
3604		    ": datalen 0x%lx", __func__, ldcp->ldc_id, datalen);
3605
3606		ncookies = ibnd_desc->ncookies;
3607
3608		/*
3609		 * allocb(9F) returns an aligned data block. We
3610		 * need to ensure that we ask ldc for an aligned
3611		 * number of bytes also.
3612		 */
3613		nbytes = datalen;
3614		if (nbytes & 0x7) {
3615			off = 8 - (nbytes & 0x7);
3616			nbytes += off;
3617		}
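		/*
		 * E.g. a datalen of 60 has (60 & 0x7) == 4, so off
		 * becomes 4 and nbytes is rounded up to 64, the next
		 * multiple of 8.
		 */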
3618
		/* allocate the rounded-up size; ldc_mem_copy fills nbytes */
		mp = allocb(nbytes, BPRI_MED);
3620		if (mp == NULL) {
3621			DERR(vswp, "%s(%lld): allocb failed",
3622			    __func__, ldcp->ldc_id);
3623			ldcp->ldc_stats.rx_allocb_fail++;
3624			return;
3625		}
3626
3627		rv = ldc_mem_copy(ldcp->ldc_handle, (caddr_t)mp->b_rptr,
3628		    0, &nbytes, ibnd_desc->memcookie, (uint64_t)ncookies,
3629		    LDC_COPY_IN);
3630
3631		if (rv != 0) {
3632			DERR(vswp, "%s(%d): unable to copy in data from "
3633			    "%d cookie(s)", __func__, ldcp->ldc_id, ncookies);
3634			freemsg(mp);
3635			ldcp->ldc_stats.ierrors++;
3636			return;
3637		}
3638
3639		D2(vswp, "%s(%d): copied in %ld bytes using %d cookies",
3640		    __func__, ldcp->ldc_id, nbytes, ncookies);
3641
3642		/* point to the actual end of data */
3643		mp->b_wptr = mp->b_rptr + datalen;
3644		ldcp->ldc_stats.ipackets++;
3645		ldcp->ldc_stats.rbytes += datalen;
3646
3647		/*
3648		 * We ACK back every in-band descriptor message we process
3649		 */
3650		ibnd_desc->hdr.tag.vio_subtype = VIO_SUBTYPE_ACK;
3651		ibnd_desc->hdr.tag.vio_sid = ldcp->local_session;
3652		(void) vsw_send_msg(ldcp, (void *)ibnd_desc,
3653		    sizeof (vnet_ibnd_desc_t), B_TRUE);
3654
3655		/* send the packet to be switched */
3656		vswp->vsw_switch_frame(vswp, mp, VSW_VNETPORT,
3657		    ldcp->ldc_port, NULL);
3658
3659		break;
3660
3661	case VIO_SUBTYPE_ACK:
3662		D1(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
3663
3664		/* Verify the ACK is valid */
3665		idx = ibnd_desc->hdr.desc_handle;
3666
3667		if (idx >= vsw_ntxds) {
3668			cmn_err(CE_WARN, "!vsw%d: corrupted ACK received "
3669			    "(idx %ld)", vswp->instance, idx);
3670			return;
3671		}
3672
3673		if ((dp = ldcp->lane_out.dringp) == NULL) {
3674			DERR(vswp, "%s: no dring found", __func__);
3675			return;
3676		}
3677
3678		len = dp->num_descriptors;
3679		/*
3680		 * If the descriptor we are being ACK'ed for is not the
		 * one we expected, then pkts were lost somewhere, either
		 * when we tried to send a msg, or in a previous ACK msg from
3683		 * our peer. In either case we now reclaim the descriptors
3684		 * in the range from the last ACK we received up to the
3685		 * current ACK.
3686		 */
3687		if (idx != dp->last_ack_recv) {
3688			DWARN(vswp, "%s: dropped pkts detected, (%ld, %ld)",
3689			    __func__, dp->last_ack_recv, idx);
3690			num = idx >= dp->last_ack_recv ?
3691			    idx - dp->last_ack_recv + 1:
3692			    (len - dp->last_ack_recv + 1) + idx;
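			/*
			 * E.g. with len 8, last_ack_recv 6 and idx 1 we
			 * reclaim descriptors 6, 7, 0 and 1:
			 * num = (8 - 6 + 1) + 1 = 4.
			 */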
3693		}
3694
3695		/*
3696		 * When we sent the in-band message to our peer we
3697		 * marked the copy in our private ring as READY. We now
3698		 * check that the descriptor we are being ACK'ed for is in
3699		 * fact READY, i.e. it is one we have shared with our peer.
3700		 *
		 * If it is not, we flag an error but still reset the
		 * descriptor back to FREE.
3703		 */
3704		for (i = dp->last_ack_recv; j < num; i = (i + 1) % len, j++) {
3705			priv_addr = (vsw_private_desc_t *)dp->priv_addr + i;
3706			mutex_enter(&priv_addr->dstate_lock);
3707			if (priv_addr->dstate != VIO_DESC_READY) {
3708				DERR(vswp, "%s: (%ld) desc at index %ld not "
3709				    "READY (0x%lx)", __func__,
3710				    ldcp->ldc_id, idx, priv_addr->dstate);
3711				DERR(vswp, "%s: bound %d: ncookies %ld : "
3712				    "datalen %ld", __func__,
3713				    priv_addr->bound, priv_addr->ncookies,
3714				    priv_addr->datalen);
3715			}
3716			D2(vswp, "%s: (%lld) freeing descp at %lld", __func__,
3717			    ldcp->ldc_id, idx);
3718			/* release resources associated with sent msg */
3719			priv_addr->datalen = 0;
3720			priv_addr->dstate = VIO_DESC_FREE;
3721			mutex_exit(&priv_addr->dstate_lock);
3722		}
3723		/* update to next expected value */
3724		dp->last_ack_recv = (idx + 1) % dp->num_descriptors;
3725
3726		break;
3727
3728	case VIO_SUBTYPE_NACK:
3729		DERR(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
3730
3731		/*
3732		 * We should only get a NACK if our peer doesn't like
3733		 * something about a message we have sent it. If this
3734		 * happens we just release the resources associated with
		 * the message. (We are relying on higher layers to decide
		 * whether or not to resend.)
3737		 */
3738
3739		/* limit check */
3740		idx = ibnd_desc->hdr.desc_handle;
3741
3742		if (idx >= vsw_ntxds) {
3743			DERR(vswp, "%s: corrupted NACK received (idx %lld)",
3744			    __func__, idx);
3745			return;
3746		}
3747
3748		if ((dp = ldcp->lane_out.dringp) == NULL) {
3749			DERR(vswp, "%s: no dring found", __func__);
3750			return;
3751		}
3752
3753		priv_addr = (vsw_private_desc_t *)dp->priv_addr;
3754
3755		/* move to correct location in ring */
3756		priv_addr += idx;
3757
3758		/* release resources associated with sent msg */
3759		mutex_enter(&priv_addr->dstate_lock);
3760		priv_addr->datalen = 0;
3761		priv_addr->dstate = VIO_DESC_FREE;
3762		mutex_exit(&priv_addr->dstate_lock);
3763
3764		break;
3765
3766	default:
3767		DERR(vswp, "%s(%lld): Unknown vio_subtype %x\n", __func__,
3768		    ldcp->ldc_id, ibnd_desc->hdr.tag.vio_subtype);
3769	}
3770
3771	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
3772}
3773
3774static void
3775vsw_process_err_pkt(vsw_ldc_t *ldcp, void *epkt, vio_msg_tag_t *tagp)
3776{
3777	_NOTE(ARGUNUSED(epkt))
3778
3779	vsw_t		*vswp = ldcp->ldc_vswp;
3780	uint16_t	env = tagp->vio_subtype_env;
3781
3782	D1(vswp, "%s (%lld): enter\n", __func__, ldcp->ldc_id);
3783
3784	/*
3785	 * Error vio_subtypes have yet to be defined. So for
3786	 * the moment we can't do anything.
3787	 */
3788	D2(vswp, "%s: (%x) vio_subtype env", __func__, env);
3789
3790	D1(vswp, "%s (%lld): exit\n", __func__, ldcp->ldc_id);
3791}
3792
3793/* transmit the packet over the given port */
3794int
3795vsw_portsend(vsw_port_t *port, mblk_t *mp, mblk_t *mpt, uint32_t count)
3796{
3797	vsw_ldc_list_t 	*ldcl = &port->p_ldclist;
3798	vsw_ldc_t 	*ldcp;
3799	int		status = 0;
3800
3801	READ_ENTER(&ldcl->lockrw);
3802	/*
	 * Note: for now, we have a single channel.
3804	 */
3805	ldcp = ldcl->head;
3806	if (ldcp == NULL) {
3807		DERR(port->p_vswp, "vsw_portsend: no ldc: dropping packet\n");
3808		freemsgchain(mp);
3809		RW_EXIT(&ldcl->lockrw);
3810		return (1);
3811	}
3812
3813	status = ldcp->tx(ldcp, mp, mpt, count);
3814
3815	RW_EXIT(&ldcl->lockrw);
3816
3817	return (status);
3818}
3819
3820/*
 * Break up frames into 2 separate chains: normal and
3822 * priority, based on the frame type. The number of
3823 * priority frames is also counted and returned.
3824 *
3825 * Params:
3826 * 	vswp:	pointer to the instance of vsw
3827 *	np:	head of packet chain to be broken
3828 *	npt:	tail of packet chain to be broken
3829 *
3830 * Returns:
3831 *	np:	head of normal data packets
3832 *	npt:	tail of normal data packets
3833 *	hp:	head of high priority packets
3834 *	hpt:	tail of high priority packets
3835 */
3836static uint32_t
3837vsw_get_pri_packets(vsw_t *vswp, mblk_t **np, mblk_t **npt,
3838	mblk_t **hp, mblk_t **hpt)
3839{
3840	mblk_t			*tmp = NULL;
3841	mblk_t			*smp = NULL;
3842	mblk_t			*hmp = NULL;	/* high prio pkts head */
3843	mblk_t			*hmpt = NULL;	/* high prio pkts tail */
3844	mblk_t			*nmp = NULL;	/* normal pkts head */
3845	mblk_t			*nmpt = NULL;	/* normal pkts tail */
3846	uint32_t		count = 0;
3847	int			i;
3848	struct ether_header	*ehp;
3849	uint32_t		num_types;
3850	uint16_t		*types;
3851
3852	tmp = *np;
3853	while (tmp != NULL) {
3854
3855		smp = tmp;
3856		tmp = tmp->b_next;
3857		smp->b_next = NULL;
3858		smp->b_prev = NULL;
3859
3860		ehp = (struct ether_header *)smp->b_rptr;
3861		num_types = vswp->pri_num_types;
3862		types = vswp->pri_types;
3863		for (i = 0; i < num_types; i++) {
3864			if (ehp->ether_type == types[i]) {
3865				/* high priority frame */
3866
3867				if (hmp != NULL) {
3868					hmpt->b_next = smp;
3869					hmpt = smp;
3870				} else {
3871					hmp = hmpt = smp;
3872				}
3873				count++;
3874				break;
3875			}
3876		}
3877		if (i == num_types) {
3878			/* normal data frame */
3879
3880			if (nmp != NULL) {
3881				nmpt->b_next = smp;
3882				nmpt = smp;
3883			} else {
3884				nmp = nmpt = smp;
3885			}
3886		}
3887	}
3888
3889	*hp = hmp;
3890	*hpt = hmpt;
3891	*np = nmp;
3892	*npt = nmpt;
3893
3894	return (count);
3895}
3896
3897/*
3898 * Wrapper function to transmit normal and/or priority frames over the channel.
3899 */
3900static int
3901vsw_ldctx_pri(void *arg, mblk_t *mp, mblk_t *mpt, uint32_t count)
3902{
3903	vsw_ldc_t 		*ldcp = (vsw_ldc_t *)arg;
3904	mblk_t			*tmp;
3905	mblk_t			*smp;
3906	mblk_t			*hmp;	/* high prio pkts head */
3907	mblk_t			*hmpt;	/* high prio pkts tail */
3908	mblk_t			*nmp;	/* normal pkts head */
3909	mblk_t			*nmpt;	/* normal pkts tail */
3910	uint32_t		n = 0;
3911	vsw_t			*vswp = ldcp->ldc_vswp;
3912
3913	ASSERT(VSW_PRI_ETH_DEFINED(vswp));
3914	ASSERT(count != 0);
3915
3916	nmp = mp;
3917	nmpt = mpt;
3918
3919	/* gather any priority frames from the chain of packets */
3920	n = vsw_get_pri_packets(vswp, &nmp, &nmpt, &hmp, &hmpt);
3921
3922	/* transmit priority frames */
3923	tmp = hmp;
3924	while (tmp != NULL) {
3925		smp = tmp;
3926		tmp = tmp->b_next;
3927		smp->b_next = NULL;
3928		vsw_ldcsend_pkt(ldcp, smp);
3929	}
3930
3931	count -= n;
3932
3933	if (count == 0) {
3934		/* no normal data frames to process */
3935		return (0);
3936	}
3937
3938	return (vsw_ldctx(ldcp, nmp, nmpt, count));
3939}
3940
3941/*
3942 * Wrapper function to transmit normal frames over the channel.
3943 */
3944static int
3945vsw_ldctx(void *arg, mblk_t *mp, mblk_t *mpt, uint32_t count)
3946{
3947	vsw_ldc_t 	*ldcp = (vsw_ldc_t *)arg;
3948	mblk_t		*tmp = NULL;
3949
3950	ASSERT(count != 0);
3951	/*
3952	 * If the TX thread is enabled, then queue the
3953	 * ordinary frames and signal the tx thread.
3954	 */
3955	if (ldcp->tx_thread != NULL) {
3956
3957		mutex_enter(&ldcp->tx_thr_lock);
3958
3959		if ((ldcp->tx_cnt + count) >= vsw_max_tx_qcount) {
3960			/*
3961			 * If we reached queue limit,
3962			 * do not queue new packets,
3963			 * drop them.
3964			 */
3965			ldcp->ldc_stats.tx_qfull += count;
3966			mutex_exit(&ldcp->tx_thr_lock);
3967			freemsgchain(mp);
3968			goto exit;
3969		}
3970		if (ldcp->tx_mhead == NULL) {
3971			ldcp->tx_mhead = mp;
3972			ldcp->tx_mtail = mpt;
3973			cv_signal(&ldcp->tx_thr_cv);
3974		} else {
3975			ldcp->tx_mtail->b_next = mp;
3976			ldcp->tx_mtail = mpt;
3977		}
3978		ldcp->tx_cnt += count;
3979		mutex_exit(&ldcp->tx_thr_lock);
3980	} else {
3981		while (mp != NULL) {
3982			tmp = mp->b_next;
3983			mp->b_next = mp->b_prev = NULL;
3984			(void) vsw_ldcsend(ldcp, mp, 1);
3985			mp = tmp;
3986		}
3987	}
3988
3989exit:
3990	return (0);
3991}
3992
3993/*
3994 * This function transmits the frame in the payload of a raw data
3995 * (VIO_PKT_DATA) message. Thus, it provides an Out-Of-Band path to
3996 * send special frames with high priorities, without going through
3997 * the normal data path which uses descriptor ring mechanism.
3998 */
3999static void
4000vsw_ldcsend_pkt(vsw_ldc_t *ldcp, mblk_t *mp)
4001{
4002	vio_raw_data_msg_t	*pkt;
4003	mblk_t			*bp;
4004	mblk_t			*nmp = NULL;
4005	caddr_t			dst;
4006	uint32_t		mblksz;
4007	uint32_t		size;
4008	uint32_t		nbytes;
4009	int			rv;
4010	vsw_t			*vswp = ldcp->ldc_vswp;
4011	vgen_stats_t		*statsp = &ldcp->ldc_stats;
4012
4013	if ((!(ldcp->lane_out.lstate & VSW_LANE_ACTIVE)) ||
4014	    (ldcp->ldc_status != LDC_UP) || (ldcp->ldc_handle == NULL)) {
4015		(void) atomic_inc_32(&statsp->tx_pri_fail);
4016		DWARN(vswp, "%s(%lld) status(%d) lstate(0x%llx), dropping "
4017		    "packet\n", __func__, ldcp->ldc_id, ldcp->ldc_status,
4018		    ldcp->lane_out.lstate);
4019		goto send_pkt_exit;
4020	}
4021
4022	size = msgsize(mp);
4023
4024	/* frame size bigger than available payload len of raw data msg ? */
4025	if (size > (size_t)(ldcp->msglen - VIO_PKT_DATA_HDRSIZE)) {
4026		(void) atomic_inc_32(&statsp->tx_pri_fail);
4027		DWARN(vswp, "%s(%lld) invalid size(%d)\n", __func__,
4028		    ldcp->ldc_id, size);
4029		goto send_pkt_exit;
4030	}
4031
4032	if (size < ETHERMIN)
4033		size = ETHERMIN;
4034
4035	/* alloc space for a raw data message */
4036	nmp = vio_allocb(vswp->pri_tx_vmp);
4037	if (nmp == NULL) {
4038		(void) atomic_inc_32(&statsp->tx_pri_fail);
4039		DWARN(vswp, "vio_allocb failed\n");
4040		goto send_pkt_exit;
4041	}
4042	pkt = (vio_raw_data_msg_t *)nmp->b_rptr;
4043
4044	/* copy frame into the payload of raw data message */
4045	dst = (caddr_t)pkt->data;
4046	for (bp = mp; bp != NULL; bp = bp->b_cont) {
4047		mblksz = MBLKL(bp);
4048		bcopy(bp->b_rptr, dst, mblksz);
4049		dst += mblksz;
4050	}
4051
4052	/* setup the raw data msg */
4053	pkt->tag.vio_msgtype = VIO_TYPE_DATA;
4054	pkt->tag.vio_subtype = VIO_SUBTYPE_INFO;
4055	pkt->tag.vio_subtype_env = VIO_PKT_DATA;
4056	pkt->tag.vio_sid = ldcp->local_session;
4057	nbytes = VIO_PKT_DATA_HDRSIZE + size;
4058
4059	/* send the msg over ldc */
4060	rv = vsw_send_msg(ldcp, (void *)pkt, nbytes, B_TRUE);
4061	if (rv != 0) {
4062		(void) atomic_inc_32(&statsp->tx_pri_fail);
4063		DWARN(vswp, "%s(%lld) Error sending priority frame\n", __func__,
4064		    ldcp->ldc_id);
4065		goto send_pkt_exit;
4066	}
4067
4068	/* update stats */
4069	(void) atomic_inc_64(&statsp->tx_pri_packets);
	(void) atomic_add_64(&statsp->tx_pri_bytes, size);
4071
4072send_pkt_exit:
4073	if (nmp != NULL)
4074		freemsg(nmp);
4075	freemsg(mp);
4076}
4077
4078/*
4079 * Transmit the packet over the given LDC channel.
4080 *
4081 * The 'retries' argument indicates how many times a packet
 * is retried before it is dropped. Note that the retry is done
 * only for a resource-related failure; for all other failures
 * the packet is dropped immediately.
4085 */
4086static int
4087vsw_ldcsend(vsw_ldc_t *ldcp, mblk_t *mp, uint32_t retries)
4088{
4089	int i;
4090	int rc;
4091	int status = 0;
4092	vsw_port_t *port = ldcp->ldc_port;
4093	dring_info_t *dp = NULL;
4094
4095
4096	for (i = 0; i < retries; ) {
4097		/*
4098		 * Send the message out using the appropriate
4099		 * transmit function which will free mblock when it
4100		 * is finished with it.
4101		 */
4102		mutex_enter(&port->tx_lock);
4103		if (port->transmit != NULL) {
4104			status = (*port->transmit)(ldcp, mp);
4105		}
4106		if (status == LDC_TX_SUCCESS) {
4107			mutex_exit(&port->tx_lock);
4108			break;
4109		}
4110		i++;	/* increment the counter here */
4111
		/* If it's the last retry, update the oerrors count */
4113		if ((i == retries) && (status == LDC_TX_NORESOURCES)) {
4114			ldcp->ldc_stats.oerrors++;
4115		}
4116		mutex_exit(&port->tx_lock);
4117
4118		if (status != LDC_TX_NORESOURCES) {
4119			/*
			 * No retrying required for errors unrelated
4121			 * to resources.
4122			 */
4123			break;
4124		}
4125		READ_ENTER(&ldcp->lane_out.dlistrw);
4126		if (((dp = ldcp->lane_out.dringp) != NULL) &&
4127		    ((VSW_VER_EQ(ldcp, 1, 2) &&
4128		    (ldcp->lane_out.xfer_mode & VIO_DRING_MODE_V1_2)) ||
4129		    ((VSW_VER_LT(ldcp, 1, 2) &&
4130		    (ldcp->lane_out.xfer_mode == VIO_DRING_MODE_V1_0))))) {
4131			rc = vsw_reclaim_dring(dp, dp->end_idx);
4132		} else {
4133			/*
4134			 * If there is no dring or the xfer_mode is
			 * set to DESC_MODE (i.e., OBP), then simply break here.
4136			 */
4137			RW_EXIT(&ldcp->lane_out.dlistrw);
4138			break;
4139		}
4140		RW_EXIT(&ldcp->lane_out.dlistrw);
4141
4142		/*
4143		 * Delay only if none were reclaimed
		 * and it's not the last retry.
4145		 */
4146		if ((rc == 0) && (i < retries)) {
4147			delay(drv_usectohz(vsw_ldc_tx_delay));
4148		}
4149	}
4150	freemsg(mp);
4151	return (status);
4152}
4153
4154/*
4155 * Send packet out via descriptor ring to a logical device.
4156 */
4157static int
4158vsw_dringsend(vsw_ldc_t *ldcp, mblk_t *mp)
4159{
4160	vio_dring_msg_t		dring_pkt;
4161	dring_info_t		*dp = NULL;
4162	vsw_private_desc_t	*priv_desc = NULL;
4163	vnet_public_desc_t	*pub = NULL;
4164	vsw_t			*vswp = ldcp->ldc_vswp;
4165	mblk_t			*bp;
4166	size_t			n, size;
4167	caddr_t			bufp;
4168	int			idx;
4169	int			status = LDC_TX_SUCCESS;
4170	struct ether_header	*ehp = (struct ether_header *)mp->b_rptr;
4171
4172	D1(vswp, "%s(%lld): enter\n", __func__, ldcp->ldc_id);
4173
4174	/* TODO: make test a macro */
4175	if ((!(ldcp->lane_out.lstate & VSW_LANE_ACTIVE)) ||
4176	    (ldcp->ldc_status != LDC_UP) || (ldcp->ldc_handle == NULL)) {
4177		DWARN(vswp, "%s(%lld) status(%d) lstate(0x%llx), dropping "
4178		    "packet\n", __func__, ldcp->ldc_id, ldcp->ldc_status,
4179		    ldcp->lane_out.lstate);
4180		ldcp->ldc_stats.oerrors++;
4181		return (LDC_TX_FAILURE);
4182	}
4183
4184	/*
4185	 * Note - using first ring only, this may change
4186	 * in the future.
4187	 */
4188	READ_ENTER(&ldcp->lane_out.dlistrw);
4189	if ((dp = ldcp->lane_out.dringp) == NULL) {
4190		RW_EXIT(&ldcp->lane_out.dlistrw);
4191		DERR(vswp, "%s(%lld): no dring for outbound lane on"
4192		    " channel %d", __func__, ldcp->ldc_id, ldcp->ldc_id);
4193		ldcp->ldc_stats.oerrors++;
4194		return (LDC_TX_FAILURE);
4195	}
4196
4197	size = msgsize(mp);
4198	if (size > (size_t)ETHERMAX) {
4199		RW_EXIT(&ldcp->lane_out.dlistrw);
4200		DERR(vswp, "%s(%lld) invalid size (%ld)\n", __func__,
4201		    ldcp->ldc_id, size);
4202		ldcp->ldc_stats.oerrors++;
4203		return (LDC_TX_FAILURE);
4204	}
4205
4206	/*
4207	 * Find a free descriptor
4208	 *
4209	 * Note: for the moment we are assuming that we will only
4210	 * have one dring going from the switch to each of its
4211	 * peers. This may change in the future.
4212	 */
4213	if (vsw_dring_find_free_desc(dp, &priv_desc, &idx) != 0) {
4214		D2(vswp, "%s(%lld): no descriptor available for ring "
4215		    "at 0x%llx", __func__, ldcp->ldc_id, dp);
4216
4217		/* nothing more we can do */
4218		status = LDC_TX_NORESOURCES;
4219		ldcp->ldc_stats.tx_no_desc++;
4220		goto vsw_dringsend_free_exit;
4221	} else {
4222		D2(vswp, "%s(%lld): free private descriptor found at pos %ld "
4223		    "addr 0x%llx\n", __func__, ldcp->ldc_id, idx, priv_desc);
4224	}
4225
4226	/* copy data into the descriptor */
4227	bufp = priv_desc->datap;
4228	bufp += VNET_IPALIGN;
4229	for (bp = mp, n = 0; bp != NULL; bp = bp->b_cont) {
4230		n = MBLKL(bp);
4231		bcopy(bp->b_rptr, bufp, n);
4232		bufp += n;
4233	}
4234
4235	priv_desc->datalen = (size < (size_t)ETHERMIN) ? ETHERMIN : size;
4236
4237	pub = priv_desc->descp;
4238	pub->nbytes = priv_desc->datalen;
4239
4240	/* update statistics */
4241	if (IS_BROADCAST(ehp))
4242		ldcp->ldc_stats.brdcstxmt++;
4243	else if (IS_MULTICAST(ehp))
4244		ldcp->ldc_stats.multixmt++;
4245	ldcp->ldc_stats.opackets++;
4246	ldcp->ldc_stats.obytes += priv_desc->datalen;
4247
4248	mutex_enter(&priv_desc->dstate_lock);
4249	pub->hdr.dstate = VIO_DESC_READY;
4250	mutex_exit(&priv_desc->dstate_lock);
4251
4252	/*
4253	 * Determine whether or not we need to send a message to our
4254	 * peer prompting them to read our newly updated descriptor(s).
4255	 */
4256	mutex_enter(&dp->restart_lock);
4257	if (dp->restart_reqd) {
4258		dp->restart_reqd = B_FALSE;
4259		ldcp->ldc_stats.dring_data_msgs++;
4260		mutex_exit(&dp->restart_lock);
4261
4262		/*
4263		 * Send a vio_dring_msg to peer to prompt them to read
4264		 * the updated descriptor ring.
4265		 */
4266		dring_pkt.tag.vio_msgtype = VIO_TYPE_DATA;
4267		dring_pkt.tag.vio_subtype = VIO_SUBTYPE_INFO;
4268		dring_pkt.tag.vio_subtype_env = VIO_DRING_DATA;
4269		dring_pkt.tag.vio_sid = ldcp->local_session;
4270
4271		/* Note - for now using first ring */
4272		dring_pkt.dring_ident = dp->ident;
4273
4274		/*
		 * If last_ack_recv is -1 then we know we've not
		 * received any ACKs yet, so this must be the first
		 * msg sent; set the start to the beginning of the ring.
4278		 */
4279		mutex_enter(&dp->dlock);
4280		if (dp->last_ack_recv == -1) {
4281			dring_pkt.start_idx = 0;
4282		} else {
4283			dring_pkt.start_idx =
4284			    (dp->last_ack_recv + 1) % dp->num_descriptors;
4285		}
4286		dring_pkt.end_idx = -1;
4287		mutex_exit(&dp->dlock);
4288
4289		D3(vswp, "%s(%lld): dring 0x%llx : ident 0x%llx\n", __func__,
4290		    ldcp->ldc_id, dp, dring_pkt.dring_ident);
4291		D3(vswp, "%s(%lld): start %lld : end %lld :\n",
4292		    __func__, ldcp->ldc_id, dring_pkt.start_idx,
4293		    dring_pkt.end_idx);
4294
4295		RW_EXIT(&ldcp->lane_out.dlistrw);
4296
4297		(void) vsw_send_msg(ldcp, (void *)&dring_pkt,
4298		    sizeof (vio_dring_msg_t), B_TRUE);
4299
4300		return (status);
4301
4302	} else {
4303		mutex_exit(&dp->restart_lock);
4304		D2(vswp, "%s(%lld): updating descp %d", __func__,
4305		    ldcp->ldc_id, idx);
4306	}
4307
4308vsw_dringsend_free_exit:
4309
4310	RW_EXIT(&ldcp->lane_out.dlistrw);
4311
4312	D1(vswp, "%s(%lld): exit\n", __func__, ldcp->ldc_id);
4313	return (status);
4314}
4315
4316/*
4317 * Send an in-band descriptor message over ldc.
4318 */
4319static int
4320vsw_descrsend(vsw_ldc_t *ldcp, mblk_t *mp)
4321{
4322	vsw_t			*vswp = ldcp->ldc_vswp;
4323	vnet_ibnd_desc_t	ibnd_msg;
4324	vsw_private_desc_t	*priv_desc = NULL;
4325	dring_info_t		*dp = NULL;
4326	size_t			n, size = 0;
4327	caddr_t			bufp;
4328	mblk_t			*bp;
4329	int			idx, i;
4330	int			status = LDC_TX_SUCCESS;
4331	static int		warn_msg = 1;
4332
4333	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
4334
4335	ASSERT(mp != NULL);
4336
4337	if ((!(ldcp->lane_out.lstate & VSW_LANE_ACTIVE)) ||
4338	    (ldcp->ldc_status != LDC_UP) || (ldcp->ldc_handle == NULL)) {
4339		DERR(vswp, "%s(%lld) status(%d) state (0x%llx), dropping pkt",
4340		    __func__, ldcp->ldc_id, ldcp->ldc_status,
4341		    ldcp->lane_out.lstate);
4342		ldcp->ldc_stats.oerrors++;
4343		return (LDC_TX_FAILURE);
4344	}
4345
4346	/*
	 * We only expect a single dring to exist, which we use
	 * as an internal buffer rather than a transfer channel.
4349	 */
4350	READ_ENTER(&ldcp->lane_out.dlistrw);
4351	if ((dp = ldcp->lane_out.dringp) == NULL) {
4352		DERR(vswp, "%s(%lld): no dring for outbound lane",
4353		    __func__, ldcp->ldc_id);
4354		DERR(vswp, "%s(%lld) status(%d) state (0x%llx)", __func__,
4355		    ldcp->ldc_id, ldcp->ldc_status, ldcp->lane_out.lstate);
4356		RW_EXIT(&ldcp->lane_out.dlistrw);
4357		ldcp->ldc_stats.oerrors++;
4358		return (LDC_TX_FAILURE);
4359	}
4360
4361	size = msgsize(mp);
4362	if (size > (size_t)ETHERMAX) {
4363		RW_EXIT(&ldcp->lane_out.dlistrw);
4364		DERR(vswp, "%s(%lld) invalid size (%ld)\n", __func__,
4365		    ldcp->ldc_id, size);
4366		ldcp->ldc_stats.oerrors++;
4367		return (LDC_TX_FAILURE);
4368	}
4369
4370	/*
4371	 * Find a free descriptor in our buffer ring
4372	 */
4373	if (vsw_dring_find_free_desc(dp, &priv_desc, &idx) != 0) {
4374		RW_EXIT(&ldcp->lane_out.dlistrw);
4375		if (warn_msg) {
4376			DERR(vswp, "%s(%lld): no descriptor available for ring "
4377			    "at 0x%llx", __func__, ldcp->ldc_id, dp);
4378			warn_msg = 0;
4379		}
4380
4381		/* nothing more we can do */
4382		status = LDC_TX_NORESOURCES;
4383		goto vsw_descrsend_free_exit;
4384	} else {
4385		D2(vswp, "%s(%lld): free private descriptor found at pos "
4386		    "%ld addr 0x%x\n", __func__, ldcp->ldc_id, idx, priv_desc);
4387		warn_msg = 1;
4388	}
4389
4390	/* copy data into the descriptor */
4391	bufp = priv_desc->datap;
4392	for (bp = mp, n = 0; bp != NULL; bp = bp->b_cont) {
4393		n = MBLKL(bp);
4394		bcopy(bp->b_rptr, bufp, n);
4395		bufp += n;
4396	}
4397
4398	priv_desc->datalen = (size < (size_t)ETHERMIN) ? ETHERMIN : size;
4399
4400	/* create and send the in-band descp msg */
4401	ibnd_msg.hdr.tag.vio_msgtype = VIO_TYPE_DATA;
4402	ibnd_msg.hdr.tag.vio_subtype = VIO_SUBTYPE_INFO;
4403	ibnd_msg.hdr.tag.vio_subtype_env = VIO_DESC_DATA;
4404	ibnd_msg.hdr.tag.vio_sid = ldcp->local_session;
4405
4406	/*
4407	 * Copy the mem cookies describing the data from the
4408	 * private region of the descriptor ring into the inband
4409	 * descriptor.
4410	 */
4411	for (i = 0; i < priv_desc->ncookies; i++) {
4412		bcopy(&priv_desc->memcookie[i], &ibnd_msg.memcookie[i],
4413		    sizeof (ldc_mem_cookie_t));
4414	}
4415
4416	ibnd_msg.hdr.desc_handle = idx;
4417	ibnd_msg.ncookies = priv_desc->ncookies;
4418	ibnd_msg.nbytes = size;
4419
4420	ldcp->ldc_stats.opackets++;
4421	ldcp->ldc_stats.obytes += size;
4422
4423	RW_EXIT(&ldcp->lane_out.dlistrw);
4424
4425	(void) vsw_send_msg(ldcp, (void *)&ibnd_msg,
4426	    sizeof (vnet_ibnd_desc_t), B_TRUE);
4427
4428vsw_descrsend_free_exit:
4429
4430	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
4431	return (status);
4432}
4433
4434static void
4435vsw_send_ver(void *arg)
4436{
4437	vsw_ldc_t	*ldcp = (vsw_ldc_t *)arg;
4438	vsw_t		*vswp = ldcp->ldc_vswp;
4439	lane_t		*lp = &ldcp->lane_out;
4440	vio_ver_msg_t	ver_msg;
4441
4442	D1(vswp, "%s enter", __func__);
4443
4444	ver_msg.tag.vio_msgtype = VIO_TYPE_CTRL;
4445	ver_msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
4446	ver_msg.tag.vio_subtype_env = VIO_VER_INFO;
4447	ver_msg.tag.vio_sid = ldcp->local_session;
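	/*
	 * vsw_versions[] is assumed to be ordered with the highest
	 * supported version first, so entry 0 is our opening offer.
	 */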
4448
4449	if (vsw_obp_ver_proto_workaround == B_FALSE) {
4450		ver_msg.ver_major = vsw_versions[0].ver_major;
4451		ver_msg.ver_minor = vsw_versions[0].ver_minor;
4452	} else {
4453		/* use the major,minor that we've ack'd */
4454		lane_t	*lpi = &ldcp->lane_in;
4455		ver_msg.ver_major = lpi->ver_major;
4456		ver_msg.ver_minor = lpi->ver_minor;
4457	}
4458	ver_msg.dev_class = VDEV_NETWORK_SWITCH;
4459
4460	lp->lstate |= VSW_VER_INFO_SENT;
4461	lp->ver_major = ver_msg.ver_major;
4462	lp->ver_minor = ver_msg.ver_minor;
4463
4464	DUMP_TAG(ver_msg.tag);
4465
4466	(void) vsw_send_msg(ldcp, &ver_msg, sizeof (vio_ver_msg_t), B_TRUE);
4467
4468	D1(vswp, "%s (%d): exit", __func__, ldcp->ldc_id);
4469}
4470
4471static void
4472vsw_send_attr(vsw_ldc_t *ldcp)
4473{
4474	vsw_t			*vswp = ldcp->ldc_vswp;
4475	lane_t			*lp = &ldcp->lane_out;
4476	vnet_attr_msg_t		attr_msg;
4477
4478	D1(vswp, "%s (%ld) enter", __func__, ldcp->ldc_id);
4479
4480	/*
4481	 * Subtype is set to INFO by default
4482	 */
4483	attr_msg.tag.vio_msgtype = VIO_TYPE_CTRL;
4484	attr_msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
4485	attr_msg.tag.vio_subtype_env = VIO_ATTR_INFO;
4486	attr_msg.tag.vio_sid = ldcp->local_session;
4487
4488	/* payload copied from default settings for lane */
4489	attr_msg.mtu = lp->mtu;
4490	attr_msg.addr_type = lp->addr_type;
4491	attr_msg.xfer_mode = lp->xfer_mode;
	attr_msg.ack_freq = lp->ack_freq;
4493
4494	READ_ENTER(&vswp->if_lockrw);
4495	attr_msg.addr = vnet_macaddr_strtoul((vswp->if_addr).ether_addr_octet);
4496	RW_EXIT(&vswp->if_lockrw);
4497
4498	ldcp->lane_out.lstate |= VSW_ATTR_INFO_SENT;
4499
4500	DUMP_TAG(attr_msg.tag);
4501
4502	(void) vsw_send_msg(ldcp, &attr_msg, sizeof (vnet_attr_msg_t), B_TRUE);
4503
4504	D1(vswp, "%s (%ld) exit", __func__, ldcp->ldc_id);
4505}
4506
4507/*
4508 * Create dring info msg (which also results in the creation of
4509 * a dring).
4510 */
4511static vio_dring_reg_msg_t *
4512vsw_create_dring_info_pkt(vsw_ldc_t *ldcp)
4513{
4514	vio_dring_reg_msg_t	*mp;
4515	dring_info_t		*dp;
4516	vsw_t			*vswp = ldcp->ldc_vswp;
4517
4518	D1(vswp, "vsw_create_dring_info_pkt enter\n");
4519
4520	/*
4521	 * If we can't create a dring, obviously no point sending
4522	 * a message.
4523	 */
4524	if ((dp = vsw_create_dring(ldcp)) == NULL)
4525		return (NULL);
4526
4527	mp = kmem_zalloc(sizeof (vio_dring_reg_msg_t), KM_SLEEP);
4528
4529	mp->tag.vio_msgtype = VIO_TYPE_CTRL;
4530	mp->tag.vio_subtype = VIO_SUBTYPE_INFO;
4531	mp->tag.vio_subtype_env = VIO_DRING_REG;
4532	mp->tag.vio_sid = ldcp->local_session;
4533
4534	/* payload */
4535	mp->num_descriptors = dp->num_descriptors;
4536	mp->descriptor_size = dp->descriptor_size;
4537	mp->options = dp->options;
4538	mp->ncookies = dp->ncookies;
4539	bcopy(&dp->cookie[0], &mp->cookie[0], sizeof (ldc_mem_cookie_t));
4540
4541	mp->dring_ident = 0;
4542
4543	D1(vswp, "vsw_create_dring_info_pkt exit\n");
4544
4545	return (mp);
4546}
4547
4548static void
4549vsw_send_dring_info(vsw_ldc_t *ldcp)
4550{
4551	vio_dring_reg_msg_t	*dring_msg;
4552	vsw_t			*vswp = ldcp->ldc_vswp;
4553
4554	D1(vswp, "%s: (%ld) enter", __func__, ldcp->ldc_id);
4555
4556	dring_msg = vsw_create_dring_info_pkt(ldcp);
4557	if (dring_msg == NULL) {
4558		cmn_err(CE_WARN, "!vsw%d: %s: error creating msg",
4559		    vswp->instance, __func__);
4560		return;
4561	}
4562
4563	ldcp->lane_out.lstate |= VSW_DRING_INFO_SENT;
4564
4565	DUMP_TAG_PTR((vio_msg_tag_t *)dring_msg);
4566
4567	(void) vsw_send_msg(ldcp, dring_msg,
4568	    sizeof (vio_dring_reg_msg_t), B_TRUE);
4569
4570	kmem_free(dring_msg, sizeof (vio_dring_reg_msg_t));
4571
4572	D1(vswp, "%s: (%ld) exit", __func__, ldcp->ldc_id);
4573}
4574
4575static void
4576vsw_send_rdx(vsw_ldc_t *ldcp)
4577{
4578	vsw_t		*vswp = ldcp->ldc_vswp;
4579	vio_rdx_msg_t	rdx_msg;
4580
4581	D1(vswp, "%s (%ld) enter", __func__, ldcp->ldc_id);
4582
4583	rdx_msg.tag.vio_msgtype = VIO_TYPE_CTRL;
4584	rdx_msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
4585	rdx_msg.tag.vio_subtype_env = VIO_RDX;
4586	rdx_msg.tag.vio_sid = ldcp->local_session;
4587
4588	ldcp->lane_in.lstate |= VSW_RDX_INFO_SENT;
4589
4590	DUMP_TAG(rdx_msg.tag);
4591
4592	(void) vsw_send_msg(ldcp, &rdx_msg, sizeof (vio_rdx_msg_t), B_TRUE);
4593
4594	D1(vswp, "%s (%ld) exit", __func__, ldcp->ldc_id);
4595}
4596
4597/*
4598 * Generic routine to send message out over ldc channel.
4599 *
 * It is possible that when we attempt to write over the ldc channel
 * we are notified that it has been reset. Depending on the value
4602 * of the handle_reset flag we either handle that event here or simply
4603 * notify the caller that the channel was reset.
4604 */
4605static int
4606vsw_send_msg(vsw_ldc_t *ldcp, void *msgp, int size, boolean_t handle_reset)
4607{
4608	int			rv;
4609	size_t			msglen = size;
4610	vio_msg_tag_t		*tag = (vio_msg_tag_t *)msgp;
4611	vsw_t			*vswp = ldcp->ldc_vswp;
4612	vio_dring_msg_t		*dmsg;
4613	vio_raw_data_msg_t	*rmsg;
4614	vnet_ibnd_desc_t	*imsg;
4615	boolean_t		data_msg = B_FALSE;
4616
4617	D1(vswp, "vsw_send_msg (%lld) enter : sending %d bytes",
4618	    ldcp->ldc_id, size);
4619
4620	D2(vswp, "send_msg: type 0x%llx", tag->vio_msgtype);
4621	D2(vswp, "send_msg: stype 0x%llx", tag->vio_subtype);
4622	D2(vswp, "send_msg: senv 0x%llx", tag->vio_subtype_env);
4623
4624	mutex_enter(&ldcp->ldc_txlock);
4625
4626	if (tag->vio_subtype == VIO_SUBTYPE_INFO) {
4627		if (tag->vio_subtype_env == VIO_DRING_DATA) {
4628			dmsg = (vio_dring_msg_t *)tag;
4629			dmsg->seq_num = ldcp->lane_out.seq_num;
4630			data_msg = B_TRUE;
4631		} else if (tag->vio_subtype_env == VIO_PKT_DATA) {
4632			rmsg = (vio_raw_data_msg_t *)tag;
4633			rmsg->seq_num = ldcp->lane_out.seq_num;
4634			data_msg = B_TRUE;
4635		} else if (tag->vio_subtype_env == VIO_DESC_DATA) {
4636			imsg = (vnet_ibnd_desc_t *)tag;
4637			imsg->hdr.seq_num = ldcp->lane_out.seq_num;
4638			data_msg = B_TRUE;
4639		}
4640	}
4641
4642	do {
4643		msglen = size;
4644		rv = ldc_write(ldcp->ldc_handle, (caddr_t)msgp, &msglen);
4645	} while (rv == EWOULDBLOCK && --vsw_wretries > 0);
4646
4647	if (rv == 0 && data_msg == B_TRUE) {
4648		ldcp->lane_out.seq_num++;
4649	}
4650
4651	if ((rv != 0) || (msglen != size)) {
4652		DERR(vswp, "vsw_send_msg:ldc_write failed: chan(%lld) rv(%d) "
4653		    "size (%d) msglen(%d)\n", ldcp->ldc_id, rv, size, msglen);
4654		ldcp->ldc_stats.oerrors++;
4655	}
4656
4657	mutex_exit(&ldcp->ldc_txlock);
4658
4659	/*
4660	 * If channel has been reset we either handle it here or
4661	 * simply report back that it has been reset and let caller
4662	 * decide what to do.
4663	 */
4664	if (rv == ECONNRESET) {
4665		DWARN(vswp, "%s (%lld) channel reset", __func__, ldcp->ldc_id);
4666
4667		/*
4668		 * N.B - must never be holding the dlistrw lock when
4669		 * we do a reset of the channel.
4670		 */
4671		if (handle_reset) {
4672			vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
4673		}
4674	}
4675
4676	return (rv);
4677}
4678
4679/*
 * Remove the specified address from the list of addresses maintained
 * in this port or vsw instance.
4682 */
4683mcst_addr_t *
4684vsw_del_addr(uint8_t devtype, void *arg, uint64_t addr)
4685{
4686	vsw_t		*vswp = NULL;
4687	vsw_port_t	*port = NULL;
4688	mcst_addr_t	*prev_p = NULL;
4689	mcst_addr_t	*curr_p = NULL;
4690
4691	D1(NULL, "%s: enter : devtype %d : addr 0x%llx",
4692	    __func__, devtype, addr);
4693
4694	if (devtype == VSW_VNETPORT) {
4695		port = (vsw_port_t *)arg;
4696		mutex_enter(&port->mca_lock);
4697		prev_p = curr_p = port->mcap;
4698	} else {
4699		vswp = (vsw_t *)arg;
4700		mutex_enter(&vswp->mca_lock);
4701		prev_p = curr_p = vswp->mcap;
4702	}
4703
4704	while (curr_p != NULL) {
4705		if (curr_p->addr == addr) {
4706			D2(NULL, "%s: address found", __func__);
4707			/* match found */
4708			if (prev_p == curr_p) {
4709				/* list head */
4710				if (devtype == VSW_VNETPORT)
4711					port->mcap = curr_p->nextp;
4712				else
4713					vswp->mcap = curr_p->nextp;
4714			} else {
4715				prev_p->nextp = curr_p->nextp;
4716			}
4717			break;
4718		} else {
4719			prev_p = curr_p;
4720			curr_p = curr_p->nextp;
4721		}
4722	}
4723
4724	if (devtype == VSW_VNETPORT)
4725		mutex_exit(&port->mca_lock);
4726	else
4727		mutex_exit(&vswp->mca_lock);
4728
4729	D1(NULL, "%s: exit", __func__);
4730
4731	return (curr_p);
4732}
4733
4734/*
4735 * Creates a descriptor ring (dring) and links it into the
 * list of outbound drings for this channel.
4737 *
4738 * Returns NULL if creation failed.
4739 */
4740static dring_info_t *
4741vsw_create_dring(vsw_ldc_t *ldcp)
4742{
4743	vsw_private_desc_t	*priv_addr = NULL;
4744	vsw_t			*vswp = ldcp->ldc_vswp;
4745	ldc_mem_info_t		minfo;
4746	dring_info_t		*dp, *tp;
4747	int			i;
4748
4749	dp = (dring_info_t *)kmem_zalloc(sizeof (dring_info_t), KM_SLEEP);
4750
4751	mutex_init(&dp->dlock, NULL, MUTEX_DRIVER, NULL);
4752
4753	/* create public section of ring */
4754	if ((ldc_mem_dring_create(vsw_ntxds,
4755	    VSW_PUB_SIZE, &dp->handle)) != 0) {
4756
4757		DERR(vswp, "vsw_create_dring(%lld): ldc dring create "
4758		    "failed", ldcp->ldc_id);
4759		goto create_fail_exit;
4760	}
4761
4762	ASSERT(dp->handle != NULL);
4763
4764	/*
4765	 * Get the base address of the public section of the ring.
4766	 */
4767	if ((ldc_mem_dring_info(dp->handle, &minfo)) != 0) {
4768		DERR(vswp, "vsw_create_dring(%lld): dring info failed\n",
4769		    ldcp->ldc_id);
4770		goto dring_fail_exit;
4771	} else {
4772		ASSERT(minfo.vaddr != 0);
4773		dp->pub_addr = minfo.vaddr;
4774	}
4775
4776	dp->num_descriptors = vsw_ntxds;
4777	dp->descriptor_size = VSW_PUB_SIZE;
4778	dp->options = VIO_TX_DRING;
4779	dp->ncookies = 1;	/* guaranteed by ldc */
4780
4781	/*
4782	 * create private portion of ring
4783	 */
4784	dp->priv_addr = (vsw_private_desc_t *)kmem_zalloc(
4785	    (sizeof (vsw_private_desc_t) * vsw_ntxds), KM_SLEEP);
4786
4787	if (vsw_setup_ring(ldcp, dp)) {
4788		DERR(vswp, "%s: unable to setup ring", __func__);
4789		goto dring_fail_exit;
4790	}
4791
4792	/* haven't used any descriptors yet */
4793	dp->end_idx = 0;
4794	dp->last_ack_recv = -1;
4795
4796	/* bind dring to the channel */
4797	if ((ldc_mem_dring_bind(ldcp->ldc_handle, dp->handle,
4798	    LDC_SHADOW_MAP, LDC_MEM_RW,
4799	    &dp->cookie[0], &dp->ncookies)) != 0) {
4800		DERR(vswp, "vsw_create_dring: unable to bind to channel "
4801		    "%lld", ldcp->ldc_id);
4802		goto dring_fail_exit;
4803	}
4804
4805	mutex_init(&dp->restart_lock, NULL, MUTEX_DRIVER, NULL);
4806	dp->restart_reqd = B_TRUE;
4807
4808	/*
4809	 * Only ever create rings for outgoing lane. Link it onto
4810	 * end of list.
4811	 */
4812	WRITE_ENTER(&ldcp->lane_out.dlistrw);
4813	if (ldcp->lane_out.dringp == NULL) {
4814		D2(vswp, "vsw_create_dring: adding first outbound ring");
4815		ldcp->lane_out.dringp = dp;
4816	} else {
4817		tp = ldcp->lane_out.dringp;
4818		while (tp->next != NULL)
4819			tp = tp->next;
4820
4821		tp->next = dp;
4822	}
4823	RW_EXIT(&ldcp->lane_out.dlistrw);
4824
4825	return (dp);
4826
4827dring_fail_exit:
4828	(void) ldc_mem_dring_destroy(dp->handle);
4829
4830create_fail_exit:
4831	if (dp->priv_addr != NULL) {
4832		priv_addr = dp->priv_addr;
4833		for (i = 0; i < vsw_ntxds; i++) {
4834			if (priv_addr->memhandle != NULL)
4835				(void) ldc_mem_free_handle(
4836				    priv_addr->memhandle);
4837			priv_addr++;
4838		}
4839		kmem_free(dp->priv_addr,
4840		    (sizeof (vsw_private_desc_t) * vsw_ntxds));
4841	}
4842	mutex_destroy(&dp->dlock);
4843
4844	kmem_free(dp, sizeof (dring_info_t));
4845	return (NULL);
4846}
4847
4848/*
4849 * Create a ring consisting of just a private portion and link
4850 * it into the list of rings for the outbound lane.
4851 *
 * These types of rings are used primarily for temporary data
4853 * storage (i.e. as data buffers).
4854 */
4855void
4856vsw_create_privring(vsw_ldc_t *ldcp)
4857{
4858	dring_info_t		*dp, *tp;
4859	vsw_t			*vswp = ldcp->ldc_vswp;
4860
4861	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
4862
4863	dp = kmem_zalloc(sizeof (dring_info_t), KM_SLEEP);
4864
4865	mutex_init(&dp->dlock, NULL, MUTEX_DRIVER, NULL);
4866
4867	/* no public section */
4868	dp->pub_addr = NULL;
4869
4870	dp->priv_addr = kmem_zalloc(
4871	    (sizeof (vsw_private_desc_t) * vsw_ntxds), KM_SLEEP);
4872
4873	dp->num_descriptors = vsw_ntxds;
4874
4875	if (vsw_setup_ring(ldcp, dp)) {
4876		DERR(vswp, "%s: setup of ring failed", __func__);
4877		kmem_free(dp->priv_addr,
4878		    (sizeof (vsw_private_desc_t) * vsw_ntxds));
4879		mutex_destroy(&dp->dlock);
4880		kmem_free(dp, sizeof (dring_info_t));
4881		return;
4882	}
4883
4884	/* haven't used any descriptors yet */
4885	dp->end_idx = 0;
4886
4887	mutex_init(&dp->restart_lock, NULL, MUTEX_DRIVER, NULL);
4888	dp->restart_reqd = B_TRUE;
4889
4890	/*
4891	 * Only ever create rings for outgoing lane. Link it onto
4892	 * end of list.
4893	 */
4894	WRITE_ENTER(&ldcp->lane_out.dlistrw);
4895	if (ldcp->lane_out.dringp == NULL) {
4896		D2(vswp, "%s: adding first outbound privring", __func__);
4897		ldcp->lane_out.dringp = dp;
4898	} else {
4899		tp = ldcp->lane_out.dringp;
4900		while (tp->next != NULL)
4901			tp = tp->next;
4902
4903		tp->next = dp;
4904	}
4905	RW_EXIT(&ldcp->lane_out.dlistrw);
4906
4907	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
4908}
4909
4910/*
4911 * Setup the descriptors in the dring. Returns 0 on success, 1 on
4912 * failure.
4913 */
4914int
4915vsw_setup_ring(vsw_ldc_t *ldcp, dring_info_t *dp)
4916{
4917	vnet_public_desc_t	*pub_addr = NULL;
4918	vsw_private_desc_t	*priv_addr = NULL;
4919	vsw_t			*vswp = ldcp->ldc_vswp;
4920	uint64_t		*tmpp;
4921	uint64_t		offset = 0;
4922	uint32_t		ncookies = 0;
4923	static char		*name = "vsw_setup_ring";
4924	int			i, j, nc, rv;
4925
4926	priv_addr = dp->priv_addr;
4927	pub_addr = dp->pub_addr;
4928
4929	/* public section may be null but private should never be */
4930	ASSERT(priv_addr != NULL);
4931
4932	/*
4933	 * Allocate the region of memory which will be used to hold
4934	 * the data the descriptors will refer to.
4935	 */
4936	dp->data_sz = (vsw_ntxds * VSW_RING_EL_DATA_SZ);
4937	dp->data_addr = kmem_alloc(dp->data_sz, KM_SLEEP);
4938
4939	D2(vswp, "%s: allocated %lld bytes at 0x%llx\n", name,
4940	    dp->data_sz, dp->data_addr);
4941
4942	tmpp = (uint64_t *)dp->data_addr;
4943	offset = VSW_RING_EL_DATA_SZ / sizeof (tmpp);
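	/*
	 * Note: sizeof (tmpp) is the pointer size, which on this LP64
	 * kernel equals sizeof (uint64_t), so advancing tmpp by offset
	 * words moves it exactly VSW_RING_EL_DATA_SZ bytes, i.e. one
	 * descriptor's data buffer.
	 */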
4944
4945	/*
4946	 * Initialise some of the private and public (if they exist)
4947	 * descriptor fields.
4948	 */
4949	for (i = 0; i < vsw_ntxds; i++) {
4950		mutex_init(&priv_addr->dstate_lock, NULL, MUTEX_DRIVER, NULL);
4951
4952		if ((ldc_mem_alloc_handle(ldcp->ldc_handle,
4953		    &priv_addr->memhandle)) != 0) {
4954			DERR(vswp, "%s: alloc mem handle failed", name);
4955			goto setup_ring_cleanup;
4956		}
4957
4958		priv_addr->datap = (void *)tmpp;
4959
4960		rv = ldc_mem_bind_handle(priv_addr->memhandle,
4961		    (caddr_t)priv_addr->datap, VSW_RING_EL_DATA_SZ,
4962		    LDC_SHADOW_MAP, LDC_MEM_R|LDC_MEM_W,
4963		    &(priv_addr->memcookie[0]), &ncookies);
4964		if (rv != 0) {
4965			DERR(vswp, "%s(%lld): ldc_mem_bind_handle failed "
4966			    "(rv %d)", name, ldcp->ldc_id, rv);
4967			goto setup_ring_cleanup;
4968		}
4969		priv_addr->bound = 1;
4970
4971		D2(vswp, "%s: %d: memcookie 0 : addr 0x%llx : size 0x%llx",
4972		    name, i, priv_addr->memcookie[0].addr,
4973		    priv_addr->memcookie[0].size);
4974
4975		if (ncookies >= (uint32_t)(VSW_MAX_COOKIES + 1)) {
4976			DERR(vswp, "%s(%lld) ldc_mem_bind_handle returned "
4977			    "invalid num of cookies (%d) for size 0x%llx",
4978			    name, ldcp->ldc_id, ncookies, VSW_RING_EL_DATA_SZ);
4979
4980			goto setup_ring_cleanup;
4981		} else {
4982			for (j = 1; j < ncookies; j++) {
4983				rv = ldc_mem_nextcookie(priv_addr->memhandle,
4984				    &(priv_addr->memcookie[j]));
4985				if (rv != 0) {
4986					DERR(vswp, "%s: ldc_mem_nextcookie "
4987					    "failed rv (%d)", name, rv);
4988					goto setup_ring_cleanup;
4989				}
4990				D3(vswp, "%s: memcookie %d : addr 0x%llx : "
4991				    "size 0x%llx", name, j,
4992				    priv_addr->memcookie[j].addr,
4993				    priv_addr->memcookie[j].size);
4994			}
4995
4996		}
4997		priv_addr->ncookies = ncookies;
4998		priv_addr->dstate = VIO_DESC_FREE;
4999
5000		if (pub_addr != NULL) {
5001
5002			/* link pub and private sides */
5003			priv_addr->descp = pub_addr;
5004
5005			pub_addr->ncookies = priv_addr->ncookies;
5006
5007			for (nc = 0; nc < pub_addr->ncookies; nc++) {
5008				bcopy(&priv_addr->memcookie[nc],
5009				    &pub_addr->memcookie[nc],
5010				    sizeof (ldc_mem_cookie_t));
5011			}
5012
5013			pub_addr->hdr.dstate = VIO_DESC_FREE;
5014			pub_addr++;
5015		}
5016
5017		/*
5018		 * move to next element in the dring and the next
5019		 * position in the data buffer.
5020		 */
5021		priv_addr++;
5022		tmpp += offset;
5023	}
5024
5025	return (0);
5026
5027setup_ring_cleanup:
5028	priv_addr = dp->priv_addr;
5029
5030	for (j = 0; j < i; j++) {
5031		(void) ldc_mem_unbind_handle(priv_addr->memhandle);
5032		(void) ldc_mem_free_handle(priv_addr->memhandle);
5033
5034		mutex_destroy(&priv_addr->dstate_lock);
5035
5036		priv_addr++;
5037	}
5038	kmem_free(dp->data_addr, dp->data_sz);
5039
5040	return (1);
5041}
5042
5043/*
5044 * Searches the private section of a ring for a free descriptor,
5045 * starting at the location of the last free descriptor found
5046 * previously.
5047 *
 * Returns 0 if a free descriptor is available, and updates the state
 * of the private descriptor to VIO_DESC_READY; otherwise returns 1.
5050 *
5051 * FUTURE: might need to return contiguous range of descriptors
5052 * as dring info msg assumes all will be contiguous.
5053 */
5054static int
5055vsw_dring_find_free_desc(dring_info_t *dringp,
5056		vsw_private_desc_t **priv_p, int *idx)
5057{
5058	vsw_private_desc_t	*addr = NULL;
5059	int			num = vsw_ntxds;
5060	int			ret = 1;
5061
5062	D1(NULL, "%s enter\n", __func__);
5063
5064	ASSERT(dringp->priv_addr != NULL);
5065
5066	D2(NULL, "%s: searching ring, dringp 0x%llx : start pos %lld",
5067	    __func__, dringp, dringp->end_idx);
5068
5069	addr = (vsw_private_desc_t *)dringp->priv_addr + dringp->end_idx;
5070
5071	mutex_enter(&addr->dstate_lock);
5072	if (addr->dstate == VIO_DESC_FREE) {
5073		addr->dstate = VIO_DESC_READY;
5074		*priv_p = addr;
5075		*idx = dringp->end_idx;
5076		dringp->end_idx = (dringp->end_idx + 1) % num;
5077		ret = 0;
5078
5079	}
5080	mutex_exit(&addr->dstate_lock);
5081
5082	/* ring full */
5083	if (ret == 1) {
5084		D2(NULL, "%s: no desp free: started at %d", __func__,
5085		    dringp->end_idx);
5086	}
5087
5088	D1(NULL, "%s: exit\n", __func__);
5089
5090	return (ret);
5091}
5092
5093/*
5094 * Map from a dring identifier to the ring itself. Returns
5095 * pointer to ring or NULL if no match found.
5096 *
5097 * Should be called with dlistrw rwlock held as reader.
5098 */
5099static dring_info_t *
5100vsw_ident2dring(lane_t *lane, uint64_t ident)
5101{
5102	dring_info_t	*dp = NULL;
5103
	for (dp = lane->dringp; dp != NULL; dp = dp->next) {
		if (dp->ident == ident)
			break;
	}
5116
5117	return (dp);
5118}
5119
5120/*
5121 * Set the default lane attributes. These are copied into
5122 * the attr msg we send to our peer. If they are not acceptable
5123 * then (currently) the handshake ends.
5124 */
5125static void
5126vsw_set_lane_attr(vsw_t *vswp, lane_t *lp)
5127{
5128	bzero(lp, sizeof (lane_t));
5129
5130	READ_ENTER(&vswp->if_lockrw);
5131	ether_copy(&(vswp->if_addr), &(lp->addr));
5132	RW_EXIT(&vswp->if_lockrw);
5133
5134	lp->mtu = VSW_MTU;
5135	lp->addr_type = ADDR_TYPE_MAC;
5136	lp->xfer_mode = VIO_DRING_MODE_V1_0;
5137	lp->ack_freq = 0;	/* for shared mode */
5138	lp->seq_num = VNET_ISS;
5139}
5140
5141/*
5142 * Verify that the attributes are acceptable.
5143 *
 * FUTURE: If some attributes are not acceptable, change them
 * to our desired values.
5146 */
5147static int
5148vsw_check_attr(vnet_attr_msg_t *pkt, vsw_ldc_t *ldcp)
5149{
5150	int			ret = 0;
5151	struct ether_addr	ea;
5152	vsw_port_t		*port = ldcp->ldc_port;
5153	lane_t			*lp = &ldcp->lane_out;
5154
5155
5156	D1(NULL, "vsw_check_attr enter\n");
5157
5158	if ((pkt->xfer_mode != VIO_DESC_MODE) &&
5159	    (pkt->xfer_mode != lp->xfer_mode)) {
5160		D2(NULL, "vsw_check_attr: unknown mode %x\n", pkt->xfer_mode);
5161		ret = 1;
5162	}
5163
5164	/* Only support MAC addresses at moment. */
5165	if ((pkt->addr_type != ADDR_TYPE_MAC) || (pkt->addr == 0)) {
5166		D2(NULL, "vsw_check_attr: invalid addr_type %x, "
5167		    "or address 0x%llx\n", pkt->addr_type, pkt->addr);
5168		ret = 1;
5169	}
5170
5171	/*
5172	 * MAC address supplied by device should match that stored
5173	 * in the vsw-port OBP node. Need to decide what to do if they
5174	 * don't match, for the moment just warn but don't fail.
5175	 */
5176	vnet_macaddr_ultostr(pkt->addr, ea.ether_addr_octet);
5177	if (ether_cmp(&ea, &port->p_macaddr) != 0) {
5178		DERR(NULL, "vsw_check_attr: device supplied address "
5179		    "0x%llx doesn't match node address 0x%llx\n",
5180		    pkt->addr, port->p_macaddr);
5181	}
5182
5183	/*
	 * Ack freq only makes sense in pkt mode; in shared
	 * mode the ring descriptors say whether or not to
5186	 * send back an ACK.
5187	 */
5188	if ((VSW_VER_EQ(ldcp, 1, 2) &&
5189	    (pkt->xfer_mode & VIO_DRING_MODE_V1_2)) ||
5190	    (VSW_VER_LT(ldcp, 1, 2) &&
5191	    (pkt->xfer_mode == VIO_DRING_MODE_V1_0))) {
5192		if (pkt->ack_freq > 0) {
5193			D2(NULL, "vsw_check_attr: non zero ack freq "
5194			    " in SHM mode\n");
5195			ret = 1;
5196		}
5197	}
5198
5199	/*
5200	 * Note: for the moment we only support ETHER
5201	 * frames. This may change in the future.
5202	 */
5203	if ((pkt->mtu > VSW_MTU) || (pkt->mtu <= 0)) {
5204		D2(NULL, "vsw_check_attr: invalid MTU (0x%llx)\n",
5205		    pkt->mtu);
5206		ret = 1;
5207	}
5208
5209	D1(NULL, "vsw_check_attr exit\n");
5210
5211	return (ret);
5212}
5213
5214/*
5215 * Returns 1 if there is a problem, 0 otherwise.
5216 */
5217static int
5218vsw_check_dring_info(vio_dring_reg_msg_t *pkt)
5219{
5222	int	ret = 0;
5223
5224	D1(NULL, "vsw_check_dring_info enter\n");
5225
5226	if ((pkt->num_descriptors == 0) ||
5227	    (pkt->descriptor_size == 0) ||
5228	    (pkt->ncookies != 1)) {
5229		DERR(NULL, "vsw_check_dring_info: invalid dring msg");
5230		ret = 1;
5231	}
5232
5233	D1(NULL, "vsw_check_dring_info exit\n");
5234
5235	return (ret);
5236}
5237
5238/*
5239 * Returns 1 if two memory cookies match. Otherwise returns 0.
5240 */
5241static int
5242vsw_mem_cookie_match(ldc_mem_cookie_t *m1, ldc_mem_cookie_t *m2)
5243{
5244	if ((m1->addr != m2->addr) ||
	    (m1->size != m2->size)) {
5246		return (0);
5247	} else {
5248		return (1);
5249	}
5250}
5251
5252/*
5253 * Returns 1 if ring described in reg message matches that
5254 * described by dring_info structure. Otherwise returns 0.
5255 */
5256static int
5257vsw_dring_match(dring_info_t *dp, vio_dring_reg_msg_t *msg)
5258{
5259	if ((msg->descriptor_size != dp->descriptor_size) ||
5260	    (msg->num_descriptors != dp->num_descriptors) ||
5261	    (msg->ncookies != dp->ncookies) ||
5262	    !(vsw_mem_cookie_match(&msg->cookie[0], &dp->cookie[0]))) {
5263		return (0);
5264	} else {
5265		return (1);
5266	}
5267
5268}
5269
5270static caddr_t
5271vsw_print_ethaddr(uint8_t *a, char *ebuf)
5272{
5273	(void) sprintf(ebuf, "%x:%x:%x:%x:%x:%x",
5274	    a[0], a[1], a[2], a[3], a[4], a[5]);
5275	return (ebuf);
5276}
5277
5278/*
 * Reset and free all the resources associated with the
 * specified lane of the channel.
5281 */
5282static void
5283vsw_free_lane_resources(vsw_ldc_t *ldcp, uint64_t dir)
5284{
5285	dring_info_t		*dp, *dpp;
5286	lane_t			*lp = NULL;
5287	int			rv = 0;
5288
5289	ASSERT(ldcp != NULL);
5290
5291	D1(ldcp->ldc_vswp, "%s (%lld): enter", __func__, ldcp->ldc_id);
5292
5293	if (dir == INBOUND) {
5294		D2(ldcp->ldc_vswp, "%s: freeing INBOUND lane"
5295		    " of channel %lld", __func__, ldcp->ldc_id);
5296		lp = &ldcp->lane_in;
5297	} else {
5298		D2(ldcp->ldc_vswp, "%s: freeing OUTBOUND lane"
5299		    " of channel %lld", __func__, ldcp->ldc_id);
5300		lp = &ldcp->lane_out;
5301	}
5302
5303	lp->lstate = VSW_LANE_INACTIV;
5304	lp->seq_num = VNET_ISS;
5305
5306	if (lp->dringp) {
5307		if (dir == INBOUND) {
5308			WRITE_ENTER(&lp->dlistrw);
5309			dp = lp->dringp;
5310			while (dp != NULL) {
5311				dpp = dp->next;
5312				if (dp->handle != NULL)
5313					(void) ldc_mem_dring_unmap(dp->handle);
5314				kmem_free(dp, sizeof (dring_info_t));
5315				dp = dpp;
5316			}
5317			RW_EXIT(&lp->dlistrw);
5318		} else {
5319			/*
5320			 * unbind, destroy exported dring, free dring struct
5321			 */
5322			WRITE_ENTER(&lp->dlistrw);
5323			dp = lp->dringp;
5324			rv = vsw_free_ring(dp);
5325			RW_EXIT(&lp->dlistrw);
5326		}
5327		if (rv == 0) {
5328			lp->dringp = NULL;
5329		}
5330	}
5331
5332	D1(ldcp->ldc_vswp, "%s (%lld): exit", __func__, ldcp->ldc_id);
5333}
5334
5335/*
5336 * Free ring and all associated resources.
5337 *
5338 * Should be called with dlistrw rwlock held as writer.
5339 */
5340static int
5341vsw_free_ring(dring_info_t *dp)
5342{
5343	vsw_private_desc_t	*paddr = NULL;
5344	dring_info_t		*dpp;
5345	int			i, rv = 1;
5346
5347	while (dp != NULL) {
5348		mutex_enter(&dp->dlock);
5349		dpp = dp->next;
5350		if (dp->priv_addr != NULL) {
5351			/*
5352			 * First unbind and free the memory handles
5353			 * stored in each descriptor within the ring.
5354			 */
5355			for (i = 0; i < vsw_ntxds; i++) {
5356				paddr = (vsw_private_desc_t *)
5357				    dp->priv_addr + i;
5358				if (paddr->memhandle != NULL) {
5359					if (paddr->bound == 1) {
5360						rv = ldc_mem_unbind_handle(
5361						    paddr->memhandle);
5362
5363						if (rv != 0) {
5364							DERR(NULL, "error "
5365							"unbinding handle for "
5366							"ring 0x%llx at pos %d",
5367							    dp, i);
5368							mutex_exit(&dp->dlock);
5369							return (rv);
5370						}
5371						paddr->bound = 0;
5372					}
5373
5374					rv = ldc_mem_free_handle(
5375					    paddr->memhandle);
5376					if (rv != 0) {
5377						DERR(NULL, "error freeing "
5378						    "handle for ring 0x%llx "
5379						    "at pos %d", dp, i);
5380						mutex_exit(&dp->dlock);
5381						return (rv);
5382					}
5383					paddr->memhandle = NULL;
5384				}
5385				mutex_destroy(&paddr->dstate_lock);
5386			}
5387			kmem_free(dp->priv_addr,
5388			    (sizeof (vsw_private_desc_t) * vsw_ntxds));
5389		}
5390
5391		/*
5392		 * Now unbind and destroy the ring itself.
5393		 */
5394		if (dp->handle != NULL) {
5395			(void) ldc_mem_dring_unbind(dp->handle);
5396			(void) ldc_mem_dring_destroy(dp->handle);
5397		}
5398
5399		if (dp->data_addr != NULL) {
5400			kmem_free(dp->data_addr, dp->data_sz);
5401		}
5402
5403		mutex_exit(&dp->dlock);
5404		mutex_destroy(&dp->dlock);
5405		mutex_destroy(&dp->restart_lock);
5406		kmem_free(dp, sizeof (dring_info_t));
5407
5408		dp = dpp;
5409	}
5410	return (0);
5411}
5412
5413/*
 * vsw_ldc_rx_worker -- A per-LDC worker thread to receive data.
5415 * This thread is woken up by the LDC interrupt handler to process
5416 * LDC packets and receive data.
5417 */
5418static void
5419vsw_ldc_rx_worker(void *arg)
5420{
5421	callb_cpr_t	cprinfo;
5422	vsw_ldc_t *ldcp = (vsw_ldc_t *)arg;
5423	vsw_t *vswp = ldcp->ldc_vswp;
5424
5425	D1(vswp, "%s(%lld):enter\n", __func__, ldcp->ldc_id);
5426	CALLB_CPR_INIT(&cprinfo, &ldcp->rx_thr_lock, callb_generic_cpr,
5427	    "vsw_rx_thread");
5428	mutex_enter(&ldcp->rx_thr_lock);
5429	ldcp->rx_thr_flags |= VSW_WTHR_RUNNING;
5430	while (!(ldcp->rx_thr_flags & VSW_WTHR_STOP)) {
5431
5432		CALLB_CPR_SAFE_BEGIN(&cprinfo);
5433		/*
5434		 * Wait until the data is received or a stop
5435		 * request is received.
5436		 */
5437		while (!(ldcp->rx_thr_flags &
5438		    (VSW_WTHR_DATARCVD | VSW_WTHR_STOP))) {
5439			cv_wait(&ldcp->rx_thr_cv, &ldcp->rx_thr_lock);
5440		}
5441		CALLB_CPR_SAFE_END(&cprinfo, &ldcp->rx_thr_lock)
5442
5443		/*
5444		 * First process the stop request.
5445		 */
5446		if (ldcp->rx_thr_flags & VSW_WTHR_STOP) {
5447			D2(vswp, "%s(%lld):Rx thread stopped\n",
5448			    __func__, ldcp->ldc_id);
5449			break;
5450		}
5451		ldcp->rx_thr_flags &= ~VSW_WTHR_DATARCVD;
5452		mutex_exit(&ldcp->rx_thr_lock);
5453		D1(vswp, "%s(%lld):calling vsw_process_pkt\n",
5454		    __func__, ldcp->ldc_id);
5455		mutex_enter(&ldcp->ldc_cblock);
5456		vsw_process_pkt(ldcp);
5457		mutex_exit(&ldcp->ldc_cblock);
5458		mutex_enter(&ldcp->rx_thr_lock);
5459	}
5460
5461	/*
5462	 * Update the run status and wakeup the thread that
5463	 * has sent the stop request.
5464	 */
5465	ldcp->rx_thr_flags &= ~VSW_WTHR_RUNNING;
5466	cv_signal(&ldcp->rx_thr_cv);
5467	CALLB_CPR_EXIT(&cprinfo);
5468	D1(vswp, "%s(%lld):exit\n", __func__, ldcp->ldc_id);
5469	thread_exit();
5470}
5471
/* vsw_stop_rx_thread -- Co-ordinate with the receive thread to stop it */
5473static void
5474vsw_stop_rx_thread(vsw_ldc_t *ldcp)
5475{
5476	vsw_t *vswp = ldcp->ldc_vswp;
5477
5478	D1(vswp, "%s(%lld):enter\n", __func__, ldcp->ldc_id);
5479	/*
5480	 * Send a stop request by setting the stop flag and
5481	 * wait until the receive thread stops.
5482	 */
5483	mutex_enter(&ldcp->rx_thr_lock);
5484	if (ldcp->rx_thr_flags & VSW_WTHR_RUNNING) {
5485		ldcp->rx_thr_flags |= VSW_WTHR_STOP;
5486		cv_signal(&ldcp->rx_thr_cv);
5487		while (ldcp->rx_thr_flags & VSW_WTHR_RUNNING) {
5488			cv_wait(&ldcp->rx_thr_cv, &ldcp->rx_thr_lock);
5489		}
5490	}
5491	mutex_exit(&ldcp->rx_thr_lock);
5492	ldcp->rx_thread = NULL;
5493	D1(vswp, "%s(%lld):exit\n", __func__, ldcp->ldc_id);
5494}
5495
5496/*
 * vsw_ldc_tx_worker -- A per-LDC worker thread to transmit data.
 * This thread is woken up by vsw_portsend() to transmit
5499 * packets.
5500 */
5501static void
5502vsw_ldc_tx_worker(void *arg)
5503{
5504	callb_cpr_t	cprinfo;
5505	vsw_ldc_t *ldcp = (vsw_ldc_t *)arg;
5506	vsw_t *vswp = ldcp->ldc_vswp;
5507	mblk_t *mp;
5508	mblk_t *tmp;
5509
5510	D1(vswp, "%s(%lld):enter\n", __func__, ldcp->ldc_id);
5511	CALLB_CPR_INIT(&cprinfo, &ldcp->tx_thr_lock, callb_generic_cpr,
5512	    "vnet_tx_thread");
5513	mutex_enter(&ldcp->tx_thr_lock);
5514	ldcp->tx_thr_flags |= VSW_WTHR_RUNNING;
5515	while (!(ldcp->tx_thr_flags & VSW_WTHR_STOP)) {
5516
5517		CALLB_CPR_SAFE_BEGIN(&cprinfo);
5518		/*
		 * Wait until there is data to transmit or a stop
		 * request is received.
5521		 */
5522		while (!(ldcp->tx_thr_flags & VSW_WTHR_STOP) &&
5523		    (ldcp->tx_mhead == NULL)) {
5524			cv_wait(&ldcp->tx_thr_cv, &ldcp->tx_thr_lock);
5525		}
5526		CALLB_CPR_SAFE_END(&cprinfo, &ldcp->tx_thr_lock)
5527
5528		/*
5529		 * First process the stop request.
5530		 */
5531		if (ldcp->tx_thr_flags & VSW_WTHR_STOP) {
5532			D2(vswp, "%s(%lld):tx thread stopped\n",
5533			    __func__, ldcp->ldc_id);
5534			break;
5535		}
5536		mp = ldcp->tx_mhead;
5537		ldcp->tx_mhead = ldcp->tx_mtail = NULL;
5538		ldcp->tx_cnt = 0;
5539		mutex_exit(&ldcp->tx_thr_lock);
5540		D2(vswp, "%s(%lld):calling vsw_ldcsend\n",
5541		    __func__, ldcp->ldc_id);
5542		while (mp != NULL) {
5543			tmp = mp->b_next;
5544			mp->b_next = mp->b_prev = NULL;
5545			(void) vsw_ldcsend(ldcp, mp, vsw_ldc_tx_retries);
5546			mp = tmp;
5547		}
5548		mutex_enter(&ldcp->tx_thr_lock);
5549	}
5550
5551	/*
5552	 * Update the run status and wakeup the thread that
5553	 * has sent the stop request.
5554	 */
5555	ldcp->tx_thr_flags &= ~VSW_WTHR_RUNNING;
5556	cv_signal(&ldcp->tx_thr_cv);
5557	CALLB_CPR_EXIT(&cprinfo);
5558	D1(vswp, "%s(%lld):exit\n", __func__, ldcp->ldc_id);
5559	thread_exit();
5560}
5561
/* vsw_stop_tx_thread -- Co-ordinate with the transmit thread to stop it */
5563static void
5564vsw_stop_tx_thread(vsw_ldc_t *ldcp)
5565{
5566	vsw_t *vswp = ldcp->ldc_vswp;
5567
5568	D1(vswp, "%s(%lld):enter\n", __func__, ldcp->ldc_id);
5569	/*
5570	 * Send a stop request by setting the stop flag and
5571	 * wait until the receive thread stops.
5572	 */
5573	mutex_enter(&ldcp->tx_thr_lock);
5574	if (ldcp->tx_thr_flags & VSW_WTHR_RUNNING) {
5575		ldcp->tx_thr_flags |= VSW_WTHR_STOP;
5576		cv_signal(&ldcp->tx_thr_cv);
5577		while (ldcp->tx_thr_flags & VSW_WTHR_RUNNING) {
5578			cv_wait(&ldcp->tx_thr_cv, &ldcp->tx_thr_lock);
5579		}
5580	}
5581	mutex_exit(&ldcp->tx_thr_lock);
5582	ldcp->tx_thread = NULL;
5583	D1(vswp, "%s(%lld):exit\n", __func__, ldcp->ldc_id);
5584}
5585
5586/* vsw_reclaim_dring -- reclaim descriptors */
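/*
 * Starting at 'start', walk the ring in order, freeing each descriptor
 * the peer has marked DONE and stopping at the first one that is not.
 * Returns the number reclaimed; e.g. a return of 0 means the peer has
 * consumed nothing since 'start'.
 */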
5587static int
5588vsw_reclaim_dring(dring_info_t *dp, int start)
5589{
5590	int i, j, len;
5591	vsw_private_desc_t *priv_addr;
5592	vnet_public_desc_t *pub_addr;
5593
5594	pub_addr = (vnet_public_desc_t *)dp->pub_addr;
5595	priv_addr = (vsw_private_desc_t *)dp->priv_addr;
5596	len = dp->num_descriptors;
5597
5598	D2(NULL, "%s: start index %ld\n", __func__, start);
5599
5600	j = 0;
5601	for (i = start; j < len; i = (i + 1) % len, j++) {
5602		pub_addr = (vnet_public_desc_t *)dp->pub_addr + i;
5603		priv_addr = (vsw_private_desc_t *)dp->priv_addr + i;
5604
5605		mutex_enter(&priv_addr->dstate_lock);
5606		if (pub_addr->hdr.dstate != VIO_DESC_DONE) {
5607			mutex_exit(&priv_addr->dstate_lock);
5608			break;
5609		}
5610		pub_addr->hdr.dstate = VIO_DESC_FREE;
5611		priv_addr->dstate = VIO_DESC_FREE;
5612		/* clear all the fields */
5613		priv_addr->datalen = 0;
5614		pub_addr->hdr.ack = 0;
5615		mutex_exit(&priv_addr->dstate_lock);
5616
5617		D3(NULL, "claiming descp:%d pub state:0x%llx priv state 0x%llx",
5618		    i, pub_addr->hdr.dstate, priv_addr->dstate);
5619	}
5620	return (j);
5621}
5622
5623/*
5624 * Debugging routines
5625 */
5626static void
5627display_state(void)
5628{
5629	vsw_t		*vswp;
5630	vsw_port_list_t	*plist;
5631	vsw_port_t 	*port;
5632	vsw_ldc_list_t	*ldcl;
5633	vsw_ldc_t 	*ldcp;
5634	extern vsw_t 	*vsw_head;
5635
5636	cmn_err(CE_NOTE, "***** system state *****");
5637
5638	for (vswp = vsw_head; vswp; vswp = vswp->next) {
5639		plist = &vswp->plist;
5640		READ_ENTER(&plist->lockrw);
5641		cmn_err(CE_CONT, "vsw instance %d has %d ports attached\n",
5642		    vswp->instance, plist->num_ports);
5643
5644		for (port = plist->head; port != NULL; port = port->p_next) {
5645			ldcl = &port->p_ldclist;
5646			cmn_err(CE_CONT, "port %d : %d ldcs attached\n",
5647			    port->p_instance, ldcl->num_ldcs);
5648			READ_ENTER(&ldcl->lockrw);
5649			ldcp = ldcl->head;
5650			for (; ldcp != NULL; ldcp = ldcp->ldc_next) {
5651				cmn_err(CE_CONT, "chan %lu : dev %d : "
5652				    "status %d : phase %u\n",
5653				    ldcp->ldc_id, ldcp->dev_class,
5654				    ldcp->ldc_status, ldcp->hphase);
5655				cmn_err(CE_CONT, "chan %lu : lsession %lu : "
5656				    "psession %lu\n", ldcp->ldc_id,
5657				    ldcp->local_session, ldcp->peer_session);
5658
5659				cmn_err(CE_CONT, "Inbound lane:\n");
5660				display_lane(&ldcp->lane_in);
5661				cmn_err(CE_CONT, "Outbound lane:\n");
5662				display_lane(&ldcp->lane_out);
5663			}
5664			RW_EXIT(&ldcl->lockrw);
5665		}
5666		RW_EXIT(&plist->lockrw);
5667	}
5668	cmn_err(CE_NOTE, "***** system state *****");
5669}
5670
5671static void
5672display_lane(lane_t *lp)
5673{
5674	dring_info_t	*drp;
5675
5676	cmn_err(CE_CONT, "ver 0x%x:0x%x : state %lx : mtu 0x%lx\n",
5677	    lp->ver_major, lp->ver_minor, lp->lstate, lp->mtu);
5678	cmn_err(CE_CONT, "addr_type %d : addr 0x%lx : xmode %d\n",
5679	    lp->addr_type, lp->addr, lp->xfer_mode);
5680	cmn_err(CE_CONT, "dringp 0x%lx\n", (uint64_t)lp->dringp);
5681
5682	cmn_err(CE_CONT, "Dring info:\n");
5683	for (drp = lp->dringp; drp != NULL; drp = drp->next) {
5684		cmn_err(CE_CONT, "\tnum_desc %u : dsize %u\n",
5685		    drp->num_descriptors, drp->descriptor_size);
5686		cmn_err(CE_CONT, "\thandle 0x%lx\n", drp->handle);
5687		cmn_err(CE_CONT, "\tpub_addr 0x%lx : priv_addr 0x%lx\n",
5688		    (uint64_t)drp->pub_addr, (uint64_t)drp->priv_addr);
5689		cmn_err(CE_CONT, "\tident 0x%lx : end_idx %lu\n",
5690		    drp->ident, drp->end_idx);
5691		display_ring(drp);
5692	}
5693}
5694
5695static void
5696display_ring(dring_info_t *dringp)
5697{
5698	uint64_t		i;
5699	uint64_t		priv_count = 0;
5700	uint64_t		pub_count = 0;
5701	vnet_public_desc_t	*pub_addr = NULL;
5702	vsw_private_desc_t	*priv_addr = NULL;
5703
5704	for (i = 0; i < vsw_ntxds; i++) {
5705		if (dringp->pub_addr != NULL) {
5706			pub_addr = (vnet_public_desc_t *)dringp->pub_addr + i;
5707
5708			if (pub_addr->hdr.dstate == VIO_DESC_FREE)
5709				pub_count++;
5710		}
5711
5712		if (dringp->priv_addr != NULL) {
5713			priv_addr = (vsw_private_desc_t *)dringp->priv_addr + i;
5714
5715			if (priv_addr->dstate == VIO_DESC_FREE)
5716				priv_count++;
5717		}
5718	}
5719	cmn_err(CE_CONT, "\t%lu elements: %lu priv free: %lu pub free\n",
5720	    i, priv_count, pub_count);
5721}
5722
5723static void
5724dump_flags(uint64_t state)
5725{
5726	int	i;
5727
5728	typedef struct flag_name {
5729		int	flag_val;
5730		char	*flag_name;
5731	} flag_name_t;
5732
5733	flag_name_t	flags[] = {
5734		VSW_VER_INFO_SENT, "VSW_VER_INFO_SENT",
5735		VSW_VER_INFO_RECV, "VSW_VER_INFO_RECV",
5736		VSW_VER_ACK_RECV, "VSW_VER_ACK_RECV",
5737		VSW_VER_ACK_SENT, "VSW_VER_ACK_SENT",
5738		VSW_VER_NACK_RECV, "VSW_VER_NACK_RECV",
5739		VSW_VER_NACK_SENT, "VSW_VER_NACK_SENT",
5740		VSW_ATTR_INFO_SENT, "VSW_ATTR_INFO_SENT",
5741		VSW_ATTR_INFO_RECV, "VSW_ATTR_INFO_RECV",
5742		VSW_ATTR_ACK_SENT, "VSW_ATTR_ACK_SENT",
5743		VSW_ATTR_ACK_RECV, "VSW_ATTR_ACK_RECV",
5744		VSW_ATTR_NACK_SENT, "VSW_ATTR_NACK_SENT",
5745		VSW_ATTR_NACK_RECV, "VSW_ATTR_NACK_RECV",
5746		VSW_DRING_INFO_SENT, "VSW_DRING_INFO_SENT",
5747		VSW_DRING_INFO_RECV, "VSW_DRING_INFO_RECV",
5748		VSW_DRING_ACK_SENT, "VSW_DRING_ACK_SENT",
5749		VSW_DRING_ACK_RECV, "VSW_DRING_ACK_RECV",
5750		VSW_DRING_NACK_SENT, "VSW_DRING_NACK_SENT",
5751		VSW_DRING_NACK_RECV, "VSW_DRING_NACK_RECV",
5752		VSW_RDX_INFO_SENT, "VSW_RDX_INFO_SENT",
5753		VSW_RDX_INFO_RECV, "VSW_RDX_INFO_RECV",
5754		VSW_RDX_ACK_SENT, "VSW_RDX_ACK_SENT",
5755		VSW_RDX_ACK_RECV, "VSW_RDX_ACK_RECV",
5756		VSW_RDX_NACK_SENT, "VSW_RDX_NACK_SENT",
5757		VSW_RDX_NACK_RECV, "VSW_RDX_NACK_RECV",
5758		VSW_MCST_INFO_SENT, "VSW_MCST_INFO_SENT",
5759		VSW_MCST_INFO_RECV, "VSW_MCST_INFO_RECV",
5760		VSW_MCST_ACK_SENT, "VSW_MCST_ACK_SENT",
5761		VSW_MCST_ACK_RECV, "VSW_MCST_ACK_RECV",
5762		VSW_MCST_NACK_SENT, "VSW_MCST_NACK_SENT",
5763		VSW_MCST_NACK_RECV, "VSW_MCST_NACK_RECV",
5764		VSW_LANE_ACTIVE, "VSW_LANE_ACTIVE"};
5765
5766	DERR(NULL, "DUMP_FLAGS: %llx\n", state);
5767	for (i = 0; i < sizeof (flags)/sizeof (flag_name_t); i++) {
5768		if (state & flags[i].flag_val)
5769			DERR(NULL, "DUMP_FLAGS %s", flags[i].flag_name);
5770	}
5771}
5772