1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
24 */
25
26#include <sys/types.h>
27#include <sys/errno.h>
28#include <sys/debug.h>
29#include <sys/time.h>
30#include <sys/sysmacros.h>
31#include <sys/systm.h>
32#include <sys/user.h>
33#include <sys/stropts.h>
34#include <sys/stream.h>
35#include <sys/strlog.h>
36#include <sys/strsubr.h>
37#include <sys/cmn_err.h>
38#include <sys/cpu.h>
39#include <sys/kmem.h>
40#include <sys/conf.h>
41#include <sys/ddi.h>
42#include <sys/sunddi.h>
43#include <sys/ksynch.h>
44#include <sys/stat.h>
45#include <sys/kstat.h>
46#include <sys/vtrace.h>
47#include <sys/strsun.h>
48#include <sys/dlpi.h>
49#include <sys/ethernet.h>
50#include <net/if.h>
51#include <sys/varargs.h>
52#include <sys/machsystm.h>
53#include <sys/modctl.h>
54#include <sys/modhash.h>
55#include <sys/mac.h>
56#include <sys/mac_ether.h>
57#include <sys/taskq.h>
58#include <sys/note.h>
59#include <sys/mach_descrip.h>
60#include <sys/mdeg.h>
61#include <sys/ldc.h>
62#include <sys/vsw_fdb.h>
63#include <sys/vsw.h>
64#include <sys/vio_mailbox.h>
65#include <sys/vnet_mailbox.h>
66#include <sys/vnet_common.h>
67#include <sys/vio_util.h>
68#include <sys/sdt.h>
69#include <sys/atomic.h>
70#include <sys/callb.h>
71#include <sys/vlan.h>
72
73/* Port add/deletion/etc routines */
74static	void vsw_port_delete(vsw_port_t *port);
75static	int vsw_ldc_attach(vsw_port_t *port, uint64_t ldc_id);
76static	void vsw_ldc_detach(vsw_ldc_t *ldcp);
77static	int vsw_ldc_init(vsw_ldc_t *ldcp);
78static	void vsw_ldc_uninit(vsw_ldc_t *ldcp);
79static	void vsw_ldc_drain(vsw_ldc_t *ldcp);
80static	void vsw_drain_port_taskq(vsw_port_t *port);
81static	void vsw_marker_task(void *);
82static	int vsw_plist_del_node(vsw_t *, vsw_port_t *port);
83void vsw_detach_ports(vsw_t *vswp);
84int vsw_port_add(vsw_t *vswp, md_t *mdp, mde_cookie_t *node);
85mcst_addr_t *vsw_del_addr(uint8_t devtype, void *arg, uint64_t addr);
86int vsw_port_detach(vsw_t *vswp, int p_instance);
87int vsw_portsend(vsw_port_t *port, mblk_t *mp);
88int vsw_port_attach(vsw_port_t *portp);
89vsw_port_t *vsw_lookup_port(vsw_t *vswp, int p_instance);
90void vsw_vlan_unaware_port_reset(vsw_port_t *portp);
91void vsw_hio_port_reset(vsw_port_t *portp, boolean_t immediate);
92void vsw_reset_ports(vsw_t *vswp);
93void vsw_port_reset(vsw_port_t *portp);
94void vsw_physlink_update_ports(vsw_t *vswp);
95static	void vsw_port_physlink_update(vsw_port_t *portp);
96
97/* Interrupt routines */
98static	uint_t vsw_ldc_cb(uint64_t cb, caddr_t arg);
99
100/* Handshake routines */
101static	void vsw_ldc_reinit(vsw_ldc_t *);
102static	void vsw_conn_task(void *);
103static	int vsw_check_flag(vsw_ldc_t *, int, uint64_t);
104static	void vsw_next_milestone(vsw_ldc_t *);
105static	int vsw_supported_version(vio_ver_msg_t *);
106static	void vsw_set_vnet_proto_ops(vsw_ldc_t *ldcp);
107static	void vsw_reset_vnet_proto_ops(vsw_ldc_t *ldcp);
108void vsw_process_conn_evt(vsw_ldc_t *, uint16_t);
109
110/* Data processing routines */
111void vsw_process_pkt(void *);
112static void vsw_dispatch_ctrl_task(vsw_ldc_t *, void *, vio_msg_tag_t *, int);
113static void vsw_process_ctrl_pkt(void *);
114static void vsw_process_ctrl_ver_pkt(vsw_ldc_t *, void *);
115static void vsw_process_ctrl_attr_pkt(vsw_ldc_t *, void *);
116static void vsw_process_ctrl_mcst_pkt(vsw_ldc_t *, void *);
117static void vsw_process_ctrl_dring_reg_pkt(vsw_ldc_t *, void *);
118static void vsw_process_ctrl_dring_unreg_pkt(vsw_ldc_t *, void *);
119static void vsw_process_ctrl_rdx_pkt(vsw_ldc_t *, void *);
120static void vsw_process_physlink_msg(vsw_ldc_t *, void *);
121static void vsw_process_data_pkt(vsw_ldc_t *, void *, vio_msg_tag_t *,
122	uint32_t);
123static void vsw_process_pkt_data_nop(void *, void *, uint32_t);
124static void vsw_process_pkt_data(void *, void *, uint32_t);
125static void vsw_process_data_ibnd_pkt(vsw_ldc_t *, void *);
126static void vsw_process_err_pkt(vsw_ldc_t *, void *, vio_msg_tag_t *);
127static void vsw_process_evt_read(vsw_ldc_t *ldcp);
128static void vsw_ldc_rcv(vsw_ldc_t *ldcp);
129
130/* Switching/data transmit routines */
131static	int vsw_descrsend(vsw_ldc_t *, mblk_t *);
132static void vsw_ldcsend_pkt(vsw_ldc_t *ldcp, mblk_t *mp);
133static int vsw_ldcsend(vsw_ldc_t *ldcp, mblk_t *mp, uint32_t retries);
134static int vsw_ldctx_pri(void *arg, mblk_t *mp, mblk_t *mpt, uint32_t count);
135static int vsw_ldctx(void *arg, mblk_t *mp, mblk_t *mpt, uint32_t count);
136
137/* Packet creation routines */
138static void vsw_send_ver(void *);
139static void vsw_send_attr(vsw_ldc_t *);
140static void vsw_send_dring_info(vsw_ldc_t *);
141static void vsw_send_rdx(vsw_ldc_t *);
142static void vsw_send_physlink_msg(vsw_ldc_t *ldcp, link_state_t plink_state);
143
144/* Dring routines */
145static void vsw_create_privring(vsw_ldc_t *);
146static dring_info_t *vsw_map_dring(vsw_ldc_t *ldcp, void *pkt);
147static void vsw_unmap_dring(vsw_ldc_t *ldcp);
148static void vsw_destroy_dring(vsw_ldc_t *ldcp);
149static void vsw_free_lane_resources(vsw_ldc_t *, uint64_t);
150static int vsw_map_data(vsw_ldc_t *ldcp, dring_info_t *dp, void *pkt);
151static void vsw_set_lane_attr(vsw_t *, lane_t *);
152dring_info_t *vsw_map_dring_cmn(vsw_ldc_t *ldcp,
153    vio_dring_reg_msg_t *dring_pkt);
154static int vsw_mapin_avail(vsw_ldc_t *ldcp);
155
156/* tx/msg/rcv thread routines */
157static void vsw_stop_tx_thread(vsw_ldc_t *ldcp);
158static void vsw_ldc_tx_worker(void *arg);
159
160/* Misc support routines */
161static void vsw_save_lmacaddr(vsw_t *vswp, uint64_t macaddr);
162static int vsw_get_same_dest_list(struct ether_header *ehp,
163    mblk_t **rhead, mblk_t **rtail, mblk_t **mpp);
164static mblk_t *vsw_dupmsgchain(mblk_t *mp);
165
166/* Debugging routines */
167static void dump_flags(uint64_t);
168static void display_state(void);
169static void display_lane(lane_t *);
170static void display_ring(dring_info_t *);
171
172/*
173 * Functions imported from other files.
174 */
175extern int vsw_set_hw(vsw_t *, vsw_port_t *, int);
176extern void vsw_unset_hw(vsw_t *, vsw_port_t *, int);
177extern int vsw_add_rem_mcst(vnet_mcast_msg_t *mcst_pkt, vsw_port_t *port);
178extern void vsw_del_mcst_port(vsw_port_t *port);
179extern int vsw_add_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg);
180extern int vsw_del_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg);
181extern void vsw_fdbe_add(vsw_t *vswp, void *port);
182extern void vsw_fdbe_del(vsw_t *vswp, struct ether_addr *eaddr);
183extern void vsw_create_vlans(void *arg, int type);
184extern void vsw_destroy_vlans(void *arg, int type);
185extern void vsw_vlan_add_ids(void *arg, int type);
186extern void vsw_vlan_remove_ids(void *arg, int type);
187extern boolean_t vsw_frame_lookup_vid(void *arg, int caller,
188	struct ether_header *ehp, uint16_t *vidp);
189extern mblk_t *vsw_vlan_frame_pretag(void *arg, int type, mblk_t *mp);
190extern uint32_t vsw_vlan_frame_untag(void *arg, int type, mblk_t **np,
191	mblk_t **npt);
192extern boolean_t vsw_vlan_lookup(mod_hash_t *vlan_hashp, uint16_t vid);
193extern void vsw_hio_start(vsw_t *vswp, vsw_ldc_t *ldcp);
194extern void vsw_hio_stop(vsw_t *vswp, vsw_ldc_t *ldcp);
195extern void vsw_process_dds_msg(vsw_t *vswp, vsw_ldc_t *ldcp, void *msg);
196extern void vsw_hio_stop_port(vsw_port_t *portp);
197extern void vsw_publish_macaddr(vsw_t *vswp, vsw_port_t *portp);
198extern int vsw_mac_client_init(vsw_t *vswp, vsw_port_t *port, int type);
199extern void vsw_mac_client_cleanup(vsw_t *vswp, vsw_port_t *port, int type);
200extern void vsw_destroy_rxpools(void *arg);
201extern void vsw_stop_msg_thread(vsw_ldc_t *ldcp);
202extern int vsw_send_msg(vsw_ldc_t *, void *, int, boolean_t);
203extern int vsw_dringsend(vsw_ldc_t *, mblk_t *);
204extern int vsw_reclaim_dring(dring_info_t *dp, int start);
205extern int vsw_dring_find_free_desc(dring_info_t *, vsw_private_desc_t **,
206    int *);
207extern vio_dring_reg_msg_t *vsw_create_tx_dring_info(vsw_ldc_t *);
208extern int vsw_setup_tx_dring(vsw_ldc_t *ldcp, dring_info_t *dp);
209extern void vsw_destroy_tx_dring(vsw_ldc_t *ldcp);
210extern dring_info_t *vsw_map_rx_dring(vsw_ldc_t *ldcp, void *pkt);
211extern void vsw_unmap_rx_dring(vsw_ldc_t *ldcp);
212extern void vsw_ldc_msg_worker(void *arg);
213extern void vsw_process_dringdata(void *, void *);
214extern vio_dring_reg_msg_t *vsw_create_rx_dring_info(vsw_ldc_t *);
215extern void vsw_destroy_rx_dring(vsw_ldc_t *ldcp);
216extern dring_info_t *vsw_map_tx_dring(vsw_ldc_t *ldcp, void *pkt);
217extern void vsw_unmap_tx_dring(vsw_ldc_t *ldcp);
218extern void vsw_ldc_rcv_worker(void *arg);
219extern void vsw_stop_rcv_thread(vsw_ldc_t *ldcp);
220extern int vsw_dringsend_shm(vsw_ldc_t *, mblk_t *);
221extern void vsw_process_dringdata_shm(void *, void *);
222
223/*
224 * Tunables used in this file.
225 */
226extern int vsw_num_handshakes;
227extern int vsw_ldc_tx_delay;
228extern int vsw_ldc_tx_retries;
229extern int vsw_ldc_retries;
230extern int vsw_ldc_delay;
231extern boolean_t vsw_ldc_rxthr_enabled;
232extern boolean_t vsw_ldc_txthr_enabled;
233extern uint32_t vsw_num_descriptors;
234extern uint8_t  vsw_dring_mode;
235extern uint32_t vsw_max_tx_qcount;
236extern boolean_t vsw_obp_ver_proto_workaround;
237extern uint32_t vsw_publish_macaddr_count;
238extern uint32_t vsw_nrbufs_factor;
239
240#define	LDC_ENTER_LOCK(ldcp)	\
241				mutex_enter(&((ldcp)->ldc_cblock));\
242				mutex_enter(&((ldcp)->ldc_rxlock));\
243				mutex_enter(&((ldcp)->ldc_txlock));
244#define	LDC_EXIT_LOCK(ldcp)	\
245				mutex_exit(&((ldcp)->ldc_txlock));\
246				mutex_exit(&((ldcp)->ldc_rxlock));\
247				mutex_exit(&((ldcp)->ldc_cblock));
248
249#define	VSW_VER_EQ(ldcp, major, minor)	\
250	((ldcp)->lane_out.ver_major == (major) &&	\
251	    (ldcp)->lane_out.ver_minor == (minor))
252
253#define	VSW_VER_LT(ldcp, major, minor)	\
254	(((ldcp)->lane_out.ver_major < (major)) ||	\
255	    ((ldcp)->lane_out.ver_major == (major) &&	\
256	    (ldcp)->lane_out.ver_minor < (minor)))
257
258#define	VSW_VER_GTEQ(ldcp, major, minor)	\
259	(((ldcp)->lane_out.ver_major > (major)) ||	\
260	    ((ldcp)->lane_out.ver_major == (major) &&	\
261	    (ldcp)->lane_out.ver_minor >= (minor)))
262
263#define	VSW_VER_LTEQ(ldcp, major, minor)	\
264	(((ldcp)->lane_out.ver_major < (major)) ||	\
265	    ((ldcp)->lane_out.ver_major == (major) &&	\
266	    (ldcp)->lane_out.ver_minor <= (minor)))
267
268/*
269 * VIO Protocol Version Info:
270 *
271 * The version specified below represents the version of protocol currently
272 * supported in the driver. It means the driver can negotiate with peers with
273 * versions <= this version. Here is a summary of the feature(s) that are
274 * supported at each version of the protocol:
275 *
276 * 1.0			Basic VIO protocol.
277 * 1.1			vDisk protocol update (no virtual network update).
278 * 1.2			Support for priority frames (priority-ether-types).
279 * 1.3			VLAN and HybridIO support.
280 * 1.4			Jumbo Frame support.
281 * 1.5			Link State Notification support with optional support
282 * 			for Physical Link information.
283 * 1.6			Support for RxDringData mode.
284 */
285static	ver_sup_t	vsw_versions[] = { {1, 6} };
286
287/*
288 * For the moment the state dump routines have their own
289 * private flag.
290 */
291#define	DUMP_STATE	0
292
293#if DUMP_STATE
294
295#define	DUMP_TAG(tag) \
296{			\
297	D1(NULL, "DUMP_TAG: type 0x%llx", (tag).vio_msgtype); \
298	D1(NULL, "DUMP_TAG: stype 0x%llx", (tag).vio_subtype);	\
299	D1(NULL, "DUMP_TAG: senv 0x%llx", (tag).vio_subtype_env);	\
300}
301
302#define	DUMP_TAG_PTR(tag) \
303{			\
304	D1(NULL, "DUMP_TAG: type 0x%llx", (tag)->vio_msgtype); \
305	D1(NULL, "DUMP_TAG: stype 0x%llx", (tag)->vio_subtype);	\
306	D1(NULL, "DUMP_TAG: senv 0x%llx", (tag)->vio_subtype_env);	\
307}
308
309#define	DUMP_FLAGS(flags) dump_flags(flags);
310#define	DISPLAY_STATE()	display_state()
311
312#else
313
314#define	DUMP_TAG(tag)
315#define	DUMP_TAG_PTR(tag)
316#define	DUMP_FLAGS(state)
317#define	DISPLAY_STATE()
318
319#endif	/* DUMP_STATE */
320
321/*
322 * Attach the specified port.
323 *
324 * Returns 0 on success, 1 on failure.
325 */
326int
327vsw_port_attach(vsw_port_t *port)
328{
329	vsw_t			*vswp = port->p_vswp;
330	vsw_port_list_t		*plist = &vswp->plist;
331	vsw_port_t		*p, **pp;
332	int			nids = port->num_ldcs;
333	uint64_t		*ldcids;
334	int			rv;
335
336	D1(vswp, "%s: enter : port %d", __func__, port->p_instance);
337
338	/* port already exists? */
339	READ_ENTER(&plist->lockrw);
340	for (p = plist->head; p != NULL; p = p->p_next) {
341		if (p->p_instance == port->p_instance) {
342			DWARN(vswp, "%s: port instance %d already attached",
343			    __func__, p->p_instance);
344			RW_EXIT(&plist->lockrw);
345			return (1);
346		}
347	}
348	RW_EXIT(&plist->lockrw);
349
350	mutex_init(&port->tx_lock, NULL, MUTEX_DRIVER, NULL);
351	mutex_init(&port->mca_lock, NULL, MUTEX_DRIVER, NULL);
352	rw_init(&port->maccl_rwlock, NULL, RW_DRIVER, NULL);
353
354	mutex_init(&port->state_lock, NULL, MUTEX_DRIVER, NULL);
355	cv_init(&port->state_cv, NULL, CV_DRIVER, NULL);
356	port->state = VSW_PORT_INIT;
357
358	D2(vswp, "%s: %d nids", __func__, nids);
359	ldcids = port->ldc_ids;
360	D2(vswp, "%s: ldcid (%llx)", __func__, (uint64_t)ldcids[0]);
361	if (vsw_ldc_attach(port, (uint64_t)ldcids[0]) != 0) {
362		DERR(vswp, "%s: ldc_attach failed", __func__);
363		goto exit_error;
364	}
365
366	if (vswp->switching_setup_done == B_TRUE) {
367		/*
368		 * If the underlying network device has been setup,
369		 * then open a mac client and porgram the mac address
370		 * for this port.
371		 */
372		rv = vsw_mac_client_init(vswp, port, VSW_VNETPORT);
373		if (rv != 0) {
374			goto exit_error;
375		}
376	}
377
378	/* create the fdb entry for this port/mac address */
379	vsw_fdbe_add(vswp, port);
380
381	vsw_create_vlans(port, VSW_VNETPORT);
382
383	WRITE_ENTER(&plist->lockrw);
384
385	/* link it into the list of ports for this vsw instance */
386	pp = (vsw_port_t **)(&plist->head);
387	port->p_next = *pp;
388	*pp = port;
389	plist->num_ports++;
390
391	RW_EXIT(&plist->lockrw);
392
393	/*
394	 * Initialise the port and any ldc's under it.
395	 */
396	(void) vsw_ldc_init(port->ldcp);
397
398	/* announce macaddr of vnet to the physical switch */
399	if (vsw_publish_macaddr_count != 0) {	/* enabled */
400		vsw_publish_macaddr(vswp, port);
401	}
402
403	D1(vswp, "%s: exit", __func__);
404	return (0);
405
406exit_error:
407
408	cv_destroy(&port->state_cv);
409	mutex_destroy(&port->state_lock);
410
411	rw_destroy(&port->maccl_rwlock);
412	mutex_destroy(&port->tx_lock);
413	mutex_destroy(&port->mca_lock);
414	kmem_free(port, sizeof (vsw_port_t));
415	return (1);
416}
417
418/*
419 * Detach the specified port.
420 *
421 * Returns 0 on success, 1 on failure.
422 */
423int
424vsw_port_detach(vsw_t *vswp, int p_instance)
425{
426	vsw_port_t	*port = NULL;
427	vsw_port_list_t	*plist = &vswp->plist;
428
429	D1(vswp, "%s: enter: port id %d", __func__, p_instance);
430
431	WRITE_ENTER(&plist->lockrw);
432
433	if ((port = vsw_lookup_port(vswp, p_instance)) == NULL) {
434		RW_EXIT(&plist->lockrw);
435		return (1);
436	}
437
438	if (vsw_plist_del_node(vswp, port)) {
439		RW_EXIT(&plist->lockrw);
440		return (1);
441	}
442
443	/* cleanup any HybridIO for this port */
444	vsw_hio_stop_port(port);
445
446	/*
447	 * No longer need to hold writer lock on port list now
448	 * that we have unlinked the target port from the list.
449	 */
450	RW_EXIT(&plist->lockrw);
451
452	/* Cleanup and close the mac client */
453	vsw_mac_client_cleanup(vswp, port, VSW_VNETPORT);
454
455	/* Remove the fdb entry for this port/mac address */
456	vsw_fdbe_del(vswp, &(port->p_macaddr));
457	vsw_destroy_vlans(port, VSW_VNETPORT);
458
459	/* Remove any multicast addresses.. */
460	vsw_del_mcst_port(port);
461
462	vsw_port_delete(port);
463
464	D1(vswp, "%s: exit: p_instance(%d)", __func__, p_instance);
465	return (0);
466}
467
468/*
469 * Detach all active ports.
470 */
471void
472vsw_detach_ports(vsw_t *vswp)
473{
474	vsw_port_list_t 	*plist = &vswp->plist;
475	vsw_port_t		*port = NULL;
476
477	D1(vswp, "%s: enter", __func__);
478
479	WRITE_ENTER(&plist->lockrw);
480
481	while ((port = plist->head) != NULL) {
482		(void) vsw_plist_del_node(vswp, port);
483
484		/* cleanup any HybridIO for this port */
485		vsw_hio_stop_port(port);
486
487		/* Cleanup and close the mac client */
488		vsw_mac_client_cleanup(vswp, port, VSW_VNETPORT);
489
490		/* Remove the fdb entry for this port/mac address */
491		vsw_fdbe_del(vswp, &(port->p_macaddr));
492		vsw_destroy_vlans(port, VSW_VNETPORT);
493
494		/* Remove any multicast addresses.. */
495		vsw_del_mcst_port(port);
496
497		/*
498		 * No longer need to hold the lock on the port list
499		 * now that we have unlinked the target port from the
500		 * list.
501		 */
502		RW_EXIT(&plist->lockrw);
503		vsw_port_delete(port);
504		WRITE_ENTER(&plist->lockrw);
505	}
506	RW_EXIT(&plist->lockrw);
507
508	D1(vswp, "%s: exit", __func__);
509}
510
511/*
512 * Delete the specified port.
513 */
514static void
515vsw_port_delete(vsw_port_t *port)
516{
517	vsw_t			*vswp = port->p_vswp;
518
519	D1(vswp, "%s: enter : port id %d", __func__, port->p_instance);
520
521	vsw_ldc_uninit(port->ldcp);
522
523	/*
524	 * Wait for any pending ctrl msg tasks which reference this
525	 * port to finish.
526	 */
527	vsw_drain_port_taskq(port);
528
529	/*
530	 * Wait for any active callbacks to finish
531	 */
532	vsw_ldc_drain(port->ldcp);
533
534	vsw_ldc_detach(port->ldcp);
535
536	rw_destroy(&port->maccl_rwlock);
537	mutex_destroy(&port->mca_lock);
538	mutex_destroy(&port->tx_lock);
539
540	cv_destroy(&port->state_cv);
541	mutex_destroy(&port->state_lock);
542
543	if (port->num_ldcs != 0) {
544		kmem_free(port->ldc_ids, port->num_ldcs * sizeof (uint64_t));
545		port->num_ldcs = 0;
546	}
547
548	if (port->nvids != 0) {
549		kmem_free(port->vids, sizeof (vsw_vlanid_t) * port->nvids);
550	}
551
552	kmem_free(port, sizeof (vsw_port_t));
553
554	D1(vswp, "%s: exit", __func__);
555}
556
557/*
558 * Attach a logical domain channel (ldc) under a specified port.
559 *
560 * Returns 0 on success, 1 on failure.
561 */
562static int
563vsw_ldc_attach(vsw_port_t *port, uint64_t ldc_id)
564{
565	vsw_t 		*vswp = port->p_vswp;
566	vsw_ldc_t 	*ldcp = NULL;
567	ldc_attr_t 	attr;
568	ldc_status_t	istatus;
569	int 		status = DDI_FAILURE;
570	char		kname[MAXNAMELEN];
571	enum		{ PROG_init = 0x0,
572			    PROG_callback = 0x1,
573			    PROG_tx_thread = 0x2}
574			progress;
575
576	progress = PROG_init;
577
578	D1(vswp, "%s: enter", __func__);
579
580	ldcp = kmem_zalloc(sizeof (vsw_ldc_t), KM_NOSLEEP);
581	if (ldcp == NULL) {
582		DERR(vswp, "%s: kmem_zalloc failed", __func__);
583		return (1);
584	}
585	ldcp->ldc_id = ldc_id;
586
587	mutex_init(&ldcp->ldc_txlock, NULL, MUTEX_DRIVER, NULL);
588	mutex_init(&ldcp->ldc_rxlock, NULL, MUTEX_DRIVER, NULL);
589	mutex_init(&ldcp->ldc_cblock, NULL, MUTEX_DRIVER, NULL);
590	ldcp->msg_thr_flags = 0;
591	mutex_init(&ldcp->msg_thr_lock, NULL, MUTEX_DRIVER, NULL);
592	cv_init(&ldcp->msg_thr_cv, NULL, CV_DRIVER, NULL);
593	ldcp->rcv_thr_flags = 0;
594	mutex_init(&ldcp->rcv_thr_lock, NULL, MUTEX_DRIVER, NULL);
595	cv_init(&ldcp->rcv_thr_cv, NULL, CV_DRIVER, NULL);
596	mutex_init(&ldcp->drain_cv_lock, NULL, MUTEX_DRIVER, NULL);
597	cv_init(&ldcp->drain_cv, NULL, CV_DRIVER, NULL);
598
599	/* required for handshake with peer */
600	ldcp->local_session = (uint64_t)ddi_get_lbolt();
601	ldcp->peer_session = 0;
602	ldcp->session_status = 0;
603	ldcp->hss_id = 1;	/* Initial handshake session id */
604	ldcp->hphase = VSW_MILESTONE0;
605
606	(void) atomic_swap_32(&port->p_hio_capable, B_FALSE);
607
608	/* only set for outbound lane, inbound set by peer */
609	vsw_set_lane_attr(vswp, &ldcp->lane_out);
610
611	attr.devclass = LDC_DEV_NT_SVC;
612	attr.instance = ddi_get_instance(vswp->dip);
613	attr.mode = LDC_MODE_UNRELIABLE;
614	attr.mtu = VSW_LDC_MTU;
615	status = ldc_init(ldc_id, &attr, &ldcp->ldc_handle);
616	if (status != 0) {
617		DERR(vswp, "%s(%lld): ldc_init failed, rv (%d)",
618		    __func__, ldc_id, status);
619		goto ldc_attach_fail;
620	}
621
622	if (vsw_ldc_txthr_enabled) {
623		ldcp->tx_thr_flags = 0;
624		ldcp->tx_mhead = ldcp->tx_mtail = NULL;
625
626		mutex_init(&ldcp->tx_thr_lock, NULL, MUTEX_DRIVER, NULL);
627		cv_init(&ldcp->tx_thr_cv, NULL, CV_DRIVER, NULL);
628		ldcp->tx_thread = thread_create(NULL, 2 * DEFAULTSTKSZ,
629		    vsw_ldc_tx_worker, ldcp, 0, &p0, TS_RUN, maxclsyspri);
630
631		progress |= PROG_tx_thread;
632		if (ldcp->tx_thread == NULL) {
633			DWARN(vswp, "%s(%lld): Failed to create worker thread",
634			    __func__, ldc_id);
635			goto ldc_attach_fail;
636		}
637	}
638
639	status = ldc_reg_callback(ldcp->ldc_handle, vsw_ldc_cb, (caddr_t)ldcp);
640	if (status != 0) {
641		DERR(vswp, "%s(%lld): ldc_reg_callback failed, rv (%d)",
642		    __func__, ldc_id, status);
643		(void) ldc_fini(ldcp->ldc_handle);
644		goto ldc_attach_fail;
645	}
646	/*
647	 * allocate a message for ldc_read()s, big enough to hold ctrl and
648	 * data msgs, including raw data msgs used to recv priority frames.
649	 */
650	ldcp->msglen = VIO_PKT_DATA_HDRSIZE + vswp->max_frame_size;
651	ldcp->ldcmsg = kmem_alloc(ldcp->msglen, KM_SLEEP);
652
653	progress |= PROG_callback;
654
655	mutex_init(&ldcp->status_lock, NULL, MUTEX_DRIVER, NULL);
656
657	if (ldc_status(ldcp->ldc_handle, &istatus) != 0) {
658		DERR(vswp, "%s: ldc_status failed", __func__);
659		mutex_destroy(&ldcp->status_lock);
660		goto ldc_attach_fail;
661	}
662
663	ldcp->ldc_status = istatus;
664	ldcp->ldc_port = port;
665	ldcp->ldc_vswp = vswp;
666
667	vsw_reset_vnet_proto_ops(ldcp);
668
669	(void) sprintf(kname, "%sldc0x%lx", DRV_NAME, ldcp->ldc_id);
670	ldcp->ksp = vgen_setup_kstats(DRV_NAME, vswp->instance,
671	    kname, &ldcp->ldc_stats);
672	if (ldcp->ksp == NULL) {
673		DERR(vswp, "%s: kstats setup failed", __func__);
674		goto ldc_attach_fail;
675	}
676
677	/* link it into this port */
678	port->ldcp = ldcp;
679
680	D1(vswp, "%s: exit", __func__);
681	return (0);
682
683ldc_attach_fail:
684
685	if (progress & PROG_callback) {
686		(void) ldc_unreg_callback(ldcp->ldc_handle);
687		kmem_free(ldcp->ldcmsg, ldcp->msglen);
688	}
689
690	if (progress & PROG_tx_thread) {
691		if (ldcp->tx_thread != NULL) {
692			vsw_stop_tx_thread(ldcp);
693		}
694		mutex_destroy(&ldcp->tx_thr_lock);
695		cv_destroy(&ldcp->tx_thr_cv);
696	}
697	if (ldcp->ksp != NULL) {
698		vgen_destroy_kstats(ldcp->ksp);
699	}
700	mutex_destroy(&ldcp->msg_thr_lock);
701	mutex_destroy(&ldcp->rcv_thr_lock);
702	mutex_destroy(&ldcp->ldc_txlock);
703	mutex_destroy(&ldcp->ldc_rxlock);
704	mutex_destroy(&ldcp->ldc_cblock);
705	mutex_destroy(&ldcp->drain_cv_lock);
706	cv_destroy(&ldcp->msg_thr_cv);
707	cv_destroy(&ldcp->rcv_thr_cv);
708	cv_destroy(&ldcp->drain_cv);
709
710	kmem_free(ldcp, sizeof (vsw_ldc_t));
711
712	return (1);
713}
714
715/*
716 * Detach a logical domain channel (ldc) belonging to a
717 * particular port.
718 */
719static void
720vsw_ldc_detach(vsw_ldc_t *ldcp)
721{
722	int 		rv;
723	vsw_t 		*vswp = ldcp->ldc_port->p_vswp;
724	int		retries = 0;
725
726	D2(vswp, "%s: detaching channel %lld", __func__, ldcp->ldc_id);
727
728	/* Stop msg/rcv thread */
729	if (ldcp->rcv_thread != NULL) {
730		vsw_stop_rcv_thread(ldcp);
731	} else if (ldcp->msg_thread != NULL) {
732		vsw_stop_msg_thread(ldcp);
733	}
734	kmem_free(ldcp->ldcmsg, ldcp->msglen);
735
736	/* Stop the tx thread */
737	if (ldcp->tx_thread != NULL) {
738		vsw_stop_tx_thread(ldcp);
739		mutex_destroy(&ldcp->tx_thr_lock);
740		cv_destroy(&ldcp->tx_thr_cv);
741		if (ldcp->tx_mhead != NULL) {
742			freemsgchain(ldcp->tx_mhead);
743			ldcp->tx_mhead = ldcp->tx_mtail = NULL;
744			ldcp->tx_cnt = 0;
745		}
746	}
747
748	/* Destory kstats */
749	vgen_destroy_kstats(ldcp->ksp);
750
751	/*
752	 * Before we can close the channel we must release any mapped
753	 * resources (e.g. drings).
754	 */
755	vsw_free_lane_resources(ldcp, INBOUND);
756	vsw_free_lane_resources(ldcp, OUTBOUND);
757
758	/*
759	 * Close the channel, retry on EAAGIN.
760	 */
761	while ((rv = ldc_close(ldcp->ldc_handle)) == EAGAIN) {
762		if (++retries > vsw_ldc_retries) {
763			break;
764		}
765		drv_usecwait(vsw_ldc_delay);
766	}
767	if (rv != 0) {
768		cmn_err(CE_NOTE,
769		    "!vsw%d: Error(%d) closing the channel(0x%lx)\n",
770		    vswp->instance, rv, ldcp->ldc_id);
771	}
772
773	(void) ldc_fini(ldcp->ldc_handle);
774
775	ldcp->ldc_status = LDC_INIT;
776	ldcp->ldc_handle = NULL;
777	ldcp->ldc_vswp = NULL;
778
779	mutex_destroy(&ldcp->msg_thr_lock);
780	mutex_destroy(&ldcp->rcv_thr_lock);
781	mutex_destroy(&ldcp->ldc_txlock);
782	mutex_destroy(&ldcp->ldc_rxlock);
783	mutex_destroy(&ldcp->ldc_cblock);
784	mutex_destroy(&ldcp->drain_cv_lock);
785	mutex_destroy(&ldcp->status_lock);
786	cv_destroy(&ldcp->msg_thr_cv);
787	cv_destroy(&ldcp->rcv_thr_cv);
788	cv_destroy(&ldcp->drain_cv);
789
790	kmem_free(ldcp, sizeof (vsw_ldc_t));
791}
792
793/*
794 * Open and attempt to bring up the channel. Note that channel
795 * can only be brought up if peer has also opened channel.
796 *
797 * Returns 0 if can open and bring up channel, otherwise
798 * returns 1.
799 */
800static int
801vsw_ldc_init(vsw_ldc_t *ldcp)
802{
803	vsw_t 		*vswp = ldcp->ldc_vswp;
804	ldc_status_t	istatus = 0;
805	int		rv;
806
807	D1(vswp, "%s: enter", __func__);
808
809	LDC_ENTER_LOCK(ldcp);
810
811	/* don't start at 0 in case clients don't like that */
812	ldcp->next_ident = 1;
813
814	rv = ldc_open(ldcp->ldc_handle);
815	if (rv != 0) {
816		DERR(vswp, "%s: ldc_open failed: id(%lld) rv(%d)",
817		    __func__, ldcp->ldc_id, rv);
818		LDC_EXIT_LOCK(ldcp);
819		return (1);
820	}
821
822	if (ldc_status(ldcp->ldc_handle, &istatus) != 0) {
823		DERR(vswp, "%s: unable to get status", __func__);
824		LDC_EXIT_LOCK(ldcp);
825		return (1);
826
827	} else if (istatus != LDC_OPEN && istatus != LDC_READY) {
828		DERR(vswp, "%s: id (%lld) status(%d) is not OPEN/READY",
829		    __func__, ldcp->ldc_id, istatus);
830		LDC_EXIT_LOCK(ldcp);
831		return (1);
832	}
833
834	mutex_enter(&ldcp->status_lock);
835	ldcp->ldc_status = istatus;
836	mutex_exit(&ldcp->status_lock);
837
838	rv = ldc_up(ldcp->ldc_handle);
839	if (rv != 0) {
840		/*
841		 * Not a fatal error for ldc_up() to fail, as peer
842		 * end point may simply not be ready yet.
843		 */
844		D2(vswp, "%s: ldc_up err id(%lld) rv(%d)", __func__,
845		    ldcp->ldc_id, rv);
846		LDC_EXIT_LOCK(ldcp);
847		return (1);
848	}
849
850	/*
851	 * ldc_up() call is non-blocking so need to explicitly
852	 * check channel status to see if in fact the channel
853	 * is UP.
854	 */
855	mutex_enter(&ldcp->status_lock);
856	if (ldc_status(ldcp->ldc_handle, &ldcp->ldc_status) != 0) {
857		DERR(vswp, "%s: unable to get status", __func__);
858		mutex_exit(&ldcp->status_lock);
859		LDC_EXIT_LOCK(ldcp);
860		return (1);
861
862	}
863
864	if (ldcp->ldc_status == LDC_UP) {
865		D2(vswp, "%s: channel %ld now UP (%ld)", __func__,
866		    ldcp->ldc_id, istatus);
867		mutex_exit(&ldcp->status_lock);
868		LDC_EXIT_LOCK(ldcp);
869
870		vsw_process_conn_evt(ldcp, VSW_CONN_UP);
871		return (0);
872	}
873
874	mutex_exit(&ldcp->status_lock);
875	LDC_EXIT_LOCK(ldcp);
876
877	D1(vswp, "%s: exit", __func__);
878	return (0);
879}
880
881/* disable callbacks on the channel */
882static void
883vsw_ldc_uninit(vsw_ldc_t *ldcp)
884{
885	vsw_t	*vswp = ldcp->ldc_vswp;
886	int	rv;
887
888	D1(vswp, "vsw_ldc_uninit: enter: id(%lx)\n", ldcp->ldc_id);
889
890	LDC_ENTER_LOCK(ldcp);
891
892	rv = ldc_set_cb_mode(ldcp->ldc_handle, LDC_CB_DISABLE);
893	if (rv != 0) {
894		cmn_err(CE_NOTE, "!vsw_ldc_uninit(%ld): error disabling "
895		    "interrupts (rv = %d)\n", ldcp->ldc_id, rv);
896	}
897
898	mutex_enter(&ldcp->status_lock);
899	ldcp->ldc_status = LDC_INIT;
900	mutex_exit(&ldcp->status_lock);
901
902	LDC_EXIT_LOCK(ldcp);
903
904	D1(vswp, "vsw_ldc_uninit: exit: id(%lx)", ldcp->ldc_id);
905}
906
907/*
908 * Wait until the callback(s) associated with the ldcs under the specified
909 * port have completed.
910 *
911 * Prior to this function being invoked each channel under this port
912 * should have been quiesced via ldc_set_cb_mode(DISABLE).
913 *
914 * A short explaination of what we are doing below..
915 *
916 * The simplest approach would be to have a reference counter in
917 * the ldc structure which is increment/decremented by the callbacks as
918 * they use the channel. The drain function could then simply disable any
919 * further callbacks and do a cv_wait for the ref to hit zero. Unfortunately
920 * there is a tiny window here - before the callback is able to get the lock
921 * on the channel it is interrupted and this function gets to execute. It
922 * sees that the ref count is zero and believes its free to delete the
923 * associated data structures.
924 *
925 * We get around this by taking advantage of the fact that before the ldc
926 * framework invokes a callback it sets a flag to indicate that there is a
927 * callback active (or about to become active). If when we attempt to
928 * unregister a callback when this active flag is set then the unregister
929 * will fail with EWOULDBLOCK.
930 *
931 * If the unregister fails we do a cv_timedwait. We will either be signaled
932 * by the callback as it is exiting (note we have to wait a short period to
933 * allow the callback to return fully to the ldc framework and it to clear
934 * the active flag), or by the timer expiring. In either case we again attempt
935 * the unregister. We repeat this until we can succesfully unregister the
936 * callback.
937 *
938 * The reason we use a cv_timedwait rather than a simple cv_wait is to catch
939 * the case where the callback has finished but the ldc framework has not yet
940 * cleared the active flag. In this case we would never get a cv_signal.
941 */
942static void
943vsw_ldc_drain(vsw_ldc_t *ldcp)
944{
945	vsw_t	*vswp = ldcp->ldc_port->p_vswp;
946
947	D1(vswp, "%s: enter", __func__);
948
949	/*
950	 * If we can unregister the channel callback then we
951	 * know that there is no callback either running or
952	 * scheduled to run for this channel so move on to next
953	 * channel in the list.
954	 */
955	mutex_enter(&ldcp->drain_cv_lock);
956
957	/* prompt active callbacks to quit */
958	ldcp->drain_state = VSW_LDC_DRAINING;
959
960	if ((ldc_unreg_callback(ldcp->ldc_handle)) == 0) {
961		D2(vswp, "%s: unreg callback for chan %ld", __func__,
962		    ldcp->ldc_id);
963		mutex_exit(&ldcp->drain_cv_lock);
964	} else {
965		/*
966		 * If we end up here we know that either 1) a callback
967		 * is currently executing, 2) is about to start (i.e.
968		 * the ldc framework has set the active flag but
969		 * has not actually invoked the callback yet, or 3)
970		 * has finished and has returned to the ldc framework
971		 * but the ldc framework has not yet cleared the
972		 * active bit.
973		 *
974		 * Wait for it to finish.
975		 */
976		while (ldc_unreg_callback(ldcp->ldc_handle) == EWOULDBLOCK) {
977			(void) cv_timedwait(&ldcp->drain_cv,
978			    &ldcp->drain_cv_lock, ddi_get_lbolt() + hz);
979		}
980
981		mutex_exit(&ldcp->drain_cv_lock);
982		D2(vswp, "%s: unreg callback for chan %ld after "
983		    "timeout", __func__, ldcp->ldc_id);
984	}
985
986	D1(vswp, "%s: exit", __func__);
987}
988
989/*
990 * Wait until all tasks which reference this port have completed.
991 *
992 * Prior to this function being invoked each channel under this port
993 * should have been quiesced via ldc_set_cb_mode(DISABLE).
994 */
995static void
996vsw_drain_port_taskq(vsw_port_t *port)
997{
998	vsw_t		*vswp = port->p_vswp;
999
1000	D1(vswp, "%s: enter", __func__);
1001
1002	/*
1003	 * Mark the port as in the process of being detached, and
1004	 * dispatch a marker task to the queue so we know when all
1005	 * relevant tasks have completed.
1006	 */
1007	mutex_enter(&port->state_lock);
1008	port->state = VSW_PORT_DETACHING;
1009
1010	if ((vswp->taskq_p == NULL) ||
1011	    (ddi_taskq_dispatch(vswp->taskq_p, vsw_marker_task,
1012	    port, DDI_NOSLEEP) != DDI_SUCCESS)) {
1013		cmn_err(CE_NOTE, "!vsw%d: unable to dispatch marker task",
1014		    vswp->instance);
1015		mutex_exit(&port->state_lock);
1016		return;
1017	}
1018
1019	/*
1020	 * Wait for the marker task to finish.
1021	 */
1022	while (port->state != VSW_PORT_DETACHABLE)
1023		cv_wait(&port->state_cv, &port->state_lock);
1024
1025	mutex_exit(&port->state_lock);
1026
1027	D1(vswp, "%s: exit", __func__);
1028}
1029
1030static void
1031vsw_marker_task(void *arg)
1032{
1033	vsw_port_t	*port = arg;
1034	vsw_t		*vswp = port->p_vswp;
1035
1036	D1(vswp, "%s: enter", __func__);
1037
1038	mutex_enter(&port->state_lock);
1039
1040	/*
1041	 * No further tasks should be dispatched which reference
1042	 * this port so ok to mark it as safe to detach.
1043	 */
1044	port->state = VSW_PORT_DETACHABLE;
1045
1046	cv_signal(&port->state_cv);
1047
1048	mutex_exit(&port->state_lock);
1049
1050	D1(vswp, "%s: exit", __func__);
1051}
1052
1053vsw_port_t *
1054vsw_lookup_port(vsw_t *vswp, int p_instance)
1055{
1056	vsw_port_list_t *plist = &vswp->plist;
1057	vsw_port_t	*port;
1058
1059	for (port = plist->head; port != NULL; port = port->p_next) {
1060		if (port->p_instance == p_instance) {
1061			D2(vswp, "vsw_lookup_port: found p_instance\n");
1062			return (port);
1063		}
1064	}
1065
1066	return (NULL);
1067}
1068
1069void
1070vsw_vlan_unaware_port_reset(vsw_port_t *portp)
1071{
1072	vsw_ldc_t	*ldcp = portp->ldcp;
1073
1074	mutex_enter(&ldcp->ldc_cblock);
1075
1076	/*
1077	 * If the peer is vlan_unaware(ver < 1.3), reset channel and terminate
1078	 * the connection. See comments in vsw_set_vnet_proto_ops().
1079	 */
1080	if (ldcp->hphase == VSW_MILESTONE4 && VSW_VER_LT(ldcp, 1, 3) &&
1081	    portp->nvids != 0) {
1082		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1083	}
1084
1085	mutex_exit(&ldcp->ldc_cblock);
1086}
1087
1088void
1089vsw_hio_port_reset(vsw_port_t *portp, boolean_t immediate)
1090{
1091	vsw_ldc_t	*ldcp = portp->ldcp;
1092
1093	mutex_enter(&ldcp->ldc_cblock);
1094
1095	/*
1096	 * If the peer is HybridIO capable (ver >= 1.3), reset channel
1097	 * to trigger re-negotiation, which inturn trigger HybridIO
1098	 * setup/cleanup.
1099	 */
1100	if ((ldcp->hphase == VSW_MILESTONE4) &&
1101	    (portp->p_hio_capable == B_TRUE)) {
1102		if (immediate == B_TRUE) {
1103			(void) ldc_down(ldcp->ldc_handle);
1104		} else {
1105			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1106		}
1107	}
1108
1109	mutex_exit(&ldcp->ldc_cblock);
1110}
1111
1112void
1113vsw_port_reset(vsw_port_t *portp)
1114{
1115	vsw_ldc_t	*ldcp = portp->ldcp;
1116
1117	mutex_enter(&ldcp->ldc_cblock);
1118
1119	/*
1120	 * reset channel and terminate the connection.
1121	 */
1122	vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1123
1124	mutex_exit(&ldcp->ldc_cblock);
1125}
1126
1127void
1128vsw_reset_ports(vsw_t *vswp)
1129{
1130	vsw_port_list_t	*plist = &vswp->plist;
1131	vsw_port_t	*portp;
1132
1133	READ_ENTER(&plist->lockrw);
1134	for (portp = plist->head; portp != NULL; portp = portp->p_next) {
1135		if ((portp->p_hio_capable) && (portp->p_hio_enabled)) {
1136			vsw_hio_stop_port(portp);
1137		}
1138		vsw_port_reset(portp);
1139	}
1140	RW_EXIT(&plist->lockrw);
1141}
1142
1143static void
1144vsw_send_physlink_msg(vsw_ldc_t *ldcp, link_state_t plink_state)
1145{
1146	vnet_physlink_msg_t	msg;
1147	vnet_physlink_msg_t	*msgp = &msg;
1148	uint32_t		physlink_info = 0;
1149
1150	if (plink_state == LINK_STATE_UP) {
1151		physlink_info |= VNET_PHYSLINK_STATE_UP;
1152	} else {
1153		physlink_info |= VNET_PHYSLINK_STATE_DOWN;
1154	}
1155
1156	msgp->tag.vio_msgtype = VIO_TYPE_CTRL;
1157	msgp->tag.vio_subtype = VIO_SUBTYPE_INFO;
1158	msgp->tag.vio_subtype_env = VNET_PHYSLINK_INFO;
1159	msgp->tag.vio_sid = ldcp->local_session;
1160	msgp->physlink_info = physlink_info;
1161
1162	(void) vsw_send_msg(ldcp, msgp, sizeof (msg), B_TRUE);
1163}
1164
1165static void
1166vsw_port_physlink_update(vsw_port_t *portp)
1167{
1168	vsw_ldc_t	*ldcp;
1169	vsw_t		*vswp;
1170
1171	vswp = portp->p_vswp;
1172	ldcp = portp->ldcp;
1173
1174	mutex_enter(&ldcp->ldc_cblock);
1175
1176	/*
1177	 * If handshake has completed successfully and if the vnet device
1178	 * has negotiated to get physical link state updates, send a message
1179	 * with the current state.
1180	 */
1181	if (ldcp->hphase == VSW_MILESTONE4 && ldcp->pls_negotiated == B_TRUE) {
1182		vsw_send_physlink_msg(ldcp, vswp->phys_link_state);
1183	}
1184
1185	mutex_exit(&ldcp->ldc_cblock);
1186}
1187
1188void
1189vsw_physlink_update_ports(vsw_t *vswp)
1190{
1191	vsw_port_list_t	*plist = &vswp->plist;
1192	vsw_port_t	*portp;
1193
1194	READ_ENTER(&plist->lockrw);
1195	for (portp = plist->head; portp != NULL; portp = portp->p_next) {
1196		vsw_port_physlink_update(portp);
1197	}
1198	RW_EXIT(&plist->lockrw);
1199}
1200
1201/*
1202 * Search for and remove the specified port from the port
1203 * list. Returns 0 if able to locate and remove port, otherwise
1204 * returns 1.
1205 */
1206static int
1207vsw_plist_del_node(vsw_t *vswp, vsw_port_t *port)
1208{
1209	vsw_port_list_t *plist = &vswp->plist;
1210	vsw_port_t	*curr_p, *prev_p;
1211
1212	if (plist->head == NULL)
1213		return (1);
1214
1215	curr_p = prev_p = plist->head;
1216
1217	while (curr_p != NULL) {
1218		if (curr_p == port) {
1219			if (prev_p == curr_p) {
1220				plist->head = curr_p->p_next;
1221			} else {
1222				prev_p->p_next = curr_p->p_next;
1223			}
1224			plist->num_ports--;
1225			break;
1226		} else {
1227			prev_p = curr_p;
1228			curr_p = curr_p->p_next;
1229		}
1230	}
1231	return (0);
1232}
1233
1234/*
1235 * Interrupt handler for ldc messages.
1236 */
1237static uint_t
1238vsw_ldc_cb(uint64_t event, caddr_t arg)
1239{
1240	vsw_ldc_t	*ldcp = (vsw_ldc_t  *)arg;
1241	vsw_t 		*vswp = ldcp->ldc_vswp;
1242
1243	D1(vswp, "%s: enter: ldcid (%lld)\n", __func__, ldcp->ldc_id);
1244
1245	mutex_enter(&ldcp->ldc_cblock);
1246	ldcp->ldc_stats.callbacks++;
1247
1248	mutex_enter(&ldcp->status_lock);
1249	if ((ldcp->ldc_status == LDC_INIT) || (ldcp->ldc_handle == NULL)) {
1250		mutex_exit(&ldcp->status_lock);
1251		mutex_exit(&ldcp->ldc_cblock);
1252		return (LDC_SUCCESS);
1253	}
1254	mutex_exit(&ldcp->status_lock);
1255
1256	if (event & LDC_EVT_UP) {
1257		/*
1258		 * Channel has come up.
1259		 */
1260		D2(vswp, "%s: id(%ld) event(%llx) UP: status(%ld)",
1261		    __func__, ldcp->ldc_id, event, ldcp->ldc_status);
1262
1263		vsw_process_conn_evt(ldcp, VSW_CONN_UP);
1264
1265		ASSERT((event & (LDC_EVT_RESET | LDC_EVT_DOWN)) == 0);
1266	}
1267
1268	if (event & LDC_EVT_READ) {
1269		/*
1270		 * Data available for reading.
1271		 */
1272		D2(vswp, "%s: id(ld) event(%llx) data READ",
1273		    __func__, ldcp->ldc_id, event);
1274
1275		vsw_process_evt_read(ldcp);
1276
1277		ASSERT((event & (LDC_EVT_RESET | LDC_EVT_DOWN)) == 0);
1278
1279		goto vsw_cb_exit;
1280	}
1281
1282	if (event & (LDC_EVT_DOWN | LDC_EVT_RESET)) {
1283		D2(vswp, "%s: id(%ld) event (%lx) DOWN/RESET: status(%ld)",
1284		    __func__, ldcp->ldc_id, event, ldcp->ldc_status);
1285
1286		vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
1287	}
1288
1289	/*
1290	 * Catch either LDC_EVT_WRITE which we don't support or any
1291	 * unknown event.
1292	 */
1293	if (event &
1294	    ~(LDC_EVT_UP | LDC_EVT_RESET | LDC_EVT_DOWN | LDC_EVT_READ)) {
1295		DERR(vswp, "%s: id(%ld) Unexpected event=(%llx) status(%ld)",
1296		    __func__, ldcp->ldc_id, event, ldcp->ldc_status);
1297	}
1298
1299vsw_cb_exit:
1300	mutex_exit(&ldcp->ldc_cblock);
1301
1302	/*
1303	 * Let the drain function know we are finishing if it
1304	 * is waiting.
1305	 */
1306	mutex_enter(&ldcp->drain_cv_lock);
1307	if (ldcp->drain_state == VSW_LDC_DRAINING)
1308		cv_signal(&ldcp->drain_cv);
1309	mutex_exit(&ldcp->drain_cv_lock);
1310
1311	return (LDC_SUCCESS);
1312}
1313
1314/*
1315 * Reinitialise data structures associated with the channel.
1316 */
1317static void
1318vsw_ldc_reinit(vsw_ldc_t *ldcp)
1319{
1320	vsw_t		*vswp = ldcp->ldc_vswp;
1321	vsw_port_t	*port;
1322
1323	D1(vswp, "%s: enter", __func__);
1324
1325	port = ldcp->ldc_port;
1326
1327	D2(vswp, "%s: in 0x%llx : out 0x%llx", __func__,
1328	    ldcp->lane_in.lstate, ldcp->lane_out.lstate);
1329
1330	vsw_free_lane_resources(ldcp, INBOUND);
1331	vsw_free_lane_resources(ldcp, OUTBOUND);
1332
1333	ldcp->lane_in.lstate = 0;
1334	ldcp->lane_out.lstate = 0;
1335
1336	/*
1337	 * Remove parent port from any multicast groups
1338	 * it may have registered with. Client must resend
1339	 * multicast add command after handshake completes.
1340	 */
1341	vsw_del_mcst_port(port);
1342
1343	ldcp->peer_session = 0;
1344	ldcp->session_status = 0;
1345	ldcp->hcnt = 0;
1346	ldcp->hphase = VSW_MILESTONE0;
1347
1348	vsw_reset_vnet_proto_ops(ldcp);
1349
1350	D1(vswp, "%s: exit", __func__);
1351}
1352
1353/*
1354 * Process a connection event.
1355 */
1356void
1357vsw_process_conn_evt(vsw_ldc_t *ldcp, uint16_t evt)
1358{
1359	vsw_t		*vswp = ldcp->ldc_vswp;
1360	vsw_conn_evt_t	*conn = NULL;
1361
1362	D1(vswp, "%s: enter", __func__);
1363
1364	/*
1365	 * Check if either a reset or restart event is pending
1366	 * or in progress. If so just return.
1367	 *
1368	 * A VSW_CONN_RESET event originates either with a LDC_RESET_EVT
1369	 * being received by the callback handler, or a ECONNRESET error
1370	 * code being returned from a ldc_read() or ldc_write() call.
1371	 *
1372	 * A VSW_CONN_RESTART event occurs when some error checking code
1373	 * decides that there is a problem with data from the channel,
1374	 * and that the handshake should be restarted.
1375	 */
1376	if (((evt == VSW_CONN_RESET) || (evt == VSW_CONN_RESTART)) &&
1377	    (ldstub((uint8_t *)&ldcp->reset_active)))
1378		return;
1379
1380	/*
1381	 * If it is an LDC_UP event we first check the recorded
1382	 * state of the channel. If this is UP then we know that
1383	 * the channel moving to the UP state has already been dealt
1384	 * with and don't need to dispatch a  new task.
1385	 *
1386	 * The reason for this check is that when we do a ldc_up(),
1387	 * depending on the state of the peer, we may or may not get
1388	 * a LDC_UP event. As we can't depend on getting a LDC_UP evt
1389	 * every time we do ldc_up() we explicitly check the channel
1390	 * status to see has it come up (ldc_up() is asynch and will
1391	 * complete at some undefined time), and take the appropriate
1392	 * action.
1393	 *
1394	 * The flip side of this is that we may get a LDC_UP event
1395	 * when we have already seen that the channel is up and have
1396	 * dealt with that.
1397	 */
1398	mutex_enter(&ldcp->status_lock);
1399	if (evt == VSW_CONN_UP) {
1400		if ((ldcp->ldc_status == LDC_UP) || (ldcp->reset_active != 0)) {
1401			mutex_exit(&ldcp->status_lock);
1402			return;
1403		}
1404	}
1405	mutex_exit(&ldcp->status_lock);
1406
1407	/*
1408	 * The transaction group id allows us to identify and discard
1409	 * any tasks which are still pending on the taskq and refer
1410	 * to the handshake session we are about to restart or reset.
1411	 * These stale messages no longer have any real meaning.
1412	 */
1413	(void) atomic_inc_32(&ldcp->hss_id);
1414
1415	ASSERT(vswp->taskq_p != NULL);
1416
1417	if ((conn = kmem_zalloc(sizeof (vsw_conn_evt_t), KM_NOSLEEP)) == NULL) {
1418		cmn_err(CE_WARN, "!vsw%d: unable to allocate memory for"
1419		    " connection event", vswp->instance);
1420		goto err_exit;
1421	}
1422
1423	conn->evt = evt;
1424	conn->ldcp = ldcp;
1425
1426	if (ddi_taskq_dispatch(vswp->taskq_p, vsw_conn_task, conn,
1427	    DDI_NOSLEEP) != DDI_SUCCESS) {
1428		cmn_err(CE_WARN, "!vsw%d: Can't dispatch connection task",
1429		    vswp->instance);
1430
1431		kmem_free(conn, sizeof (vsw_conn_evt_t));
1432		goto err_exit;
1433	}
1434
1435	D1(vswp, "%s: exit", __func__);
1436	return;
1437
1438err_exit:
1439	/*
1440	 * Have mostly likely failed due to memory shortage. Clear the flag so
1441	 * that future requests will at least be attempted and will hopefully
1442	 * succeed.
1443	 */
1444	if ((evt == VSW_CONN_RESET) || (evt == VSW_CONN_RESTART))
1445		ldcp->reset_active = 0;
1446}
1447
1448/*
1449 * Deal with events relating to a connection. Invoked from a taskq.
1450 */
1451static void
1452vsw_conn_task(void *arg)
1453{
1454	vsw_conn_evt_t	*conn = (vsw_conn_evt_t *)arg;
1455	vsw_ldc_t	*ldcp = NULL;
1456	vsw_port_t	*portp;
1457	vsw_t		*vswp = NULL;
1458	uint16_t	evt;
1459	ldc_status_t	curr_status;
1460
1461	ldcp = conn->ldcp;
1462	evt = conn->evt;
1463	vswp = ldcp->ldc_vswp;
1464	portp = ldcp->ldc_port;
1465
1466	D1(vswp, "%s: enter", __func__);
1467
1468	/* can safely free now have copied out data */
1469	kmem_free(conn, sizeof (vsw_conn_evt_t));
1470
1471	if (ldcp->rcv_thread != NULL) {
1472		vsw_stop_rcv_thread(ldcp);
1473	} else if (ldcp->msg_thread != NULL) {
1474		vsw_stop_msg_thread(ldcp);
1475	}
1476
1477	mutex_enter(&ldcp->status_lock);
1478	if (ldc_status(ldcp->ldc_handle, &curr_status) != 0) {
1479		cmn_err(CE_WARN, "!vsw%d: Unable to read status of "
1480		    "channel %ld", vswp->instance, ldcp->ldc_id);
1481		mutex_exit(&ldcp->status_lock);
1482		return;
1483	}
1484
1485	/*
1486	 * If we wish to restart the handshake on this channel, then if
1487	 * the channel is UP we bring it DOWN to flush the underlying
1488	 * ldc queue.
1489	 */
1490	if ((evt == VSW_CONN_RESTART) && (curr_status == LDC_UP))
1491		(void) ldc_down(ldcp->ldc_handle);
1492
1493	if ((portp->p_hio_capable) && (portp->p_hio_enabled)) {
1494		vsw_hio_stop(vswp, ldcp);
1495	}
1496
1497	/*
1498	 * re-init all the associated data structures.
1499	 */
1500	vsw_ldc_reinit(ldcp);
1501
1502	/*
1503	 * Bring the channel back up (note it does no harm to
1504	 * do this even if the channel is already UP, Just
1505	 * becomes effectively a no-op).
1506	 */
1507	(void) ldc_up(ldcp->ldc_handle);
1508
1509	/*
1510	 * Check if channel is now UP. This will only happen if
1511	 * peer has also done a ldc_up().
1512	 */
1513	if (ldc_status(ldcp->ldc_handle, &curr_status) != 0) {
1514		cmn_err(CE_WARN, "!vsw%d: Unable to read status of "
1515		    "channel %ld", vswp->instance, ldcp->ldc_id);
1516		mutex_exit(&ldcp->status_lock);
1517		return;
1518	}
1519
1520	ldcp->ldc_status = curr_status;
1521
1522	/* channel UP so restart handshake by sending version info */
1523	if (curr_status == LDC_UP) {
1524		if (ldcp->hcnt++ > vsw_num_handshakes) {
1525			cmn_err(CE_WARN, "!vsw%d: exceeded number of permitted"
1526			    " handshake attempts (%d) on channel %ld",
1527			    vswp->instance, ldcp->hcnt, ldcp->ldc_id);
1528			mutex_exit(&ldcp->status_lock);
1529			return;
1530		}
1531
1532		if (vsw_obp_ver_proto_workaround == B_FALSE &&
1533		    (ddi_taskq_dispatch(vswp->taskq_p, vsw_send_ver, ldcp,
1534		    DDI_NOSLEEP) != DDI_SUCCESS)) {
1535			cmn_err(CE_WARN, "!vsw%d: Can't dispatch version task",
1536			    vswp->instance);
1537
1538			/*
1539			 * Don't count as valid restart attempt if couldn't
1540			 * send version msg.
1541			 */
1542			if (ldcp->hcnt > 0)
1543				ldcp->hcnt--;
1544		}
1545	}
1546
1547	/*
1548	 * Mark that the process is complete by clearing the flag.
1549	 *
1550	 * Note is it possible that the taskq dispatch above may have failed,
1551	 * most likely due to memory shortage. We still clear the flag so
1552	 * future attempts will at least be attempted and will hopefully
1553	 * succeed.
1554	 */
1555	if ((evt == VSW_CONN_RESET) || (evt == VSW_CONN_RESTART))
1556		ldcp->reset_active = 0;
1557
1558	mutex_exit(&ldcp->status_lock);
1559
1560	D1(vswp, "%s: exit", __func__);
1561}
1562
1563/*
1564 * returns 0 if legal for event signified by flag to have
1565 * occured at the time it did. Otherwise returns 1.
1566 */
1567int
1568vsw_check_flag(vsw_ldc_t *ldcp, int dir, uint64_t flag)
1569{
1570	vsw_t		*vswp = ldcp->ldc_vswp;
1571	uint64_t	state;
1572	uint64_t	phase;
1573
1574	if (dir == INBOUND)
1575		state = ldcp->lane_in.lstate;
1576	else
1577		state = ldcp->lane_out.lstate;
1578
1579	phase = ldcp->hphase;
1580
1581	switch (flag) {
1582	case VSW_VER_INFO_RECV:
1583		if (phase > VSW_MILESTONE0) {
1584			DERR(vswp, "vsw_check_flag (%d): VER_INFO_RECV"
1585			    " when in state %d\n", ldcp->ldc_id, phase);
1586			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1587			return (1);
1588		}
1589		break;
1590
1591	case VSW_VER_ACK_RECV:
1592	case VSW_VER_NACK_RECV:
1593		if (!(state & VSW_VER_INFO_SENT)) {
1594			DERR(vswp, "vsw_check_flag (%d): spurious VER_ACK or "
1595			    "VER_NACK when in state %d\n", ldcp->ldc_id, phase);
1596			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1597			return (1);
1598		} else
1599			state &= ~VSW_VER_INFO_SENT;
1600		break;
1601
1602	case VSW_ATTR_INFO_RECV:
1603		if ((phase < VSW_MILESTONE1) || (phase >= VSW_MILESTONE2)) {
1604			DERR(vswp, "vsw_check_flag (%d): ATTR_INFO_RECV"
1605			    " when in state %d\n", ldcp->ldc_id, phase);
1606			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1607			return (1);
1608		}
1609		break;
1610
1611	case VSW_ATTR_ACK_RECV:
1612	case VSW_ATTR_NACK_RECV:
1613		if (!(state & VSW_ATTR_INFO_SENT)) {
1614			DERR(vswp, "vsw_check_flag (%d): spurious ATTR_ACK"
1615			    " or ATTR_NACK when in state %d\n",
1616			    ldcp->ldc_id, phase);
1617			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1618			return (1);
1619		} else
1620			state &= ~VSW_ATTR_INFO_SENT;
1621		break;
1622
1623	case VSW_DRING_INFO_RECV:
1624		if (phase < VSW_MILESTONE1) {
1625			DERR(vswp, "vsw_check_flag (%d): DRING_INFO_RECV"
1626			    " when in state %d\n", ldcp->ldc_id, phase);
1627			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1628			return (1);
1629		}
1630		break;
1631
1632	case VSW_DRING_ACK_RECV:
1633	case VSW_DRING_NACK_RECV:
1634		if (!(state & VSW_DRING_INFO_SENT)) {
1635			DERR(vswp, "vsw_check_flag (%d): spurious DRING_ACK "
1636			    " or DRING_NACK when in state %d\n",
1637			    ldcp->ldc_id, phase);
1638			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1639			return (1);
1640		} else
1641			state &= ~VSW_DRING_INFO_SENT;
1642		break;
1643
1644	case VSW_RDX_INFO_RECV:
1645		if (phase < VSW_MILESTONE3) {
1646			DERR(vswp, "vsw_check_flag (%d): RDX_INFO_RECV"
1647			    " when in state %d\n", ldcp->ldc_id, phase);
1648			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1649			return (1);
1650		}
1651		break;
1652
1653	case VSW_RDX_ACK_RECV:
1654	case VSW_RDX_NACK_RECV:
1655		if (!(state & VSW_RDX_INFO_SENT)) {
1656			DERR(vswp, "vsw_check_flag (%d): spurious RDX_ACK or "
1657			    "RDX_NACK when in state %d\n", ldcp->ldc_id, phase);
1658			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1659			return (1);
1660		} else
1661			state &= ~VSW_RDX_INFO_SENT;
1662		break;
1663
1664	case VSW_MCST_INFO_RECV:
1665		if (phase < VSW_MILESTONE3) {
1666			DERR(vswp, "vsw_check_flag (%d): VSW_MCST_INFO_RECV"
1667			    " when in state %d\n", ldcp->ldc_id, phase);
1668			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1669			return (1);
1670		}
1671		break;
1672
1673	default:
1674		DERR(vswp, "vsw_check_flag (%lld): unknown flag (%llx)",
1675		    ldcp->ldc_id, flag);
1676		return (1);
1677	}
1678
1679	if (dir == INBOUND)
1680		ldcp->lane_in.lstate = state;
1681	else
1682		ldcp->lane_out.lstate = state;
1683
1684	D1(vswp, "vsw_check_flag (chan %lld): exit", ldcp->ldc_id);
1685
1686	return (0);
1687}
1688
1689void
1690vsw_next_milestone(vsw_ldc_t *ldcp)
1691{
1692	vsw_t		*vswp = ldcp->ldc_vswp;
1693	vsw_port_t	*portp = ldcp->ldc_port;
1694	lane_t		*lane_out = &ldcp->lane_out;
1695	lane_t		*lane_in = &ldcp->lane_in;
1696
1697	D1(vswp, "%s (chan %lld): enter (phase %ld)", __func__,
1698	    ldcp->ldc_id, ldcp->hphase);
1699
1700	DUMP_FLAGS(lane_in->lstate);
1701	DUMP_FLAGS(lane_out->lstate);
1702
1703	switch (ldcp->hphase) {
1704
1705	case VSW_MILESTONE0:
1706		/*
1707		 * If we haven't started to handshake with our peer,
1708		 * start to do so now.
1709		 */
1710		if (lane_out->lstate == 0) {
1711			D2(vswp, "%s: (chan %lld) starting handshake "
1712			    "with peer", __func__, ldcp->ldc_id);
1713			vsw_process_conn_evt(ldcp, VSW_CONN_UP);
1714		}
1715
1716		/*
1717		 * Only way to pass this milestone is to have successfully
1718		 * negotiated version info.
1719		 */
1720		if ((lane_in->lstate & VSW_VER_ACK_SENT) &&
1721		    (lane_out->lstate & VSW_VER_ACK_RECV)) {
1722
1723			D2(vswp, "%s: (chan %lld) leaving milestone 0",
1724			    __func__, ldcp->ldc_id);
1725
1726			vsw_set_vnet_proto_ops(ldcp);
1727
1728			/*
1729			 * Next milestone is passed when attribute
1730			 * information has been successfully exchanged.
1731			 */
1732			ldcp->hphase = VSW_MILESTONE1;
1733			vsw_send_attr(ldcp);
1734
1735		}
1736		break;
1737
1738	case VSW_MILESTONE1:
1739		/*
1740		 * Only way to pass this milestone is to have successfully
1741		 * negotiated attribute information, in both directions.
1742		 */
1743		if (!((lane_in->lstate & VSW_ATTR_ACK_SENT) &&
1744		    (lane_out->lstate & VSW_ATTR_ACK_RECV))) {
1745			break;
1746		}
1747
1748		ldcp->hphase = VSW_MILESTONE2;
1749
1750		/*
1751		 * If the peer device has said it wishes to
1752		 * use descriptor rings then we send it our ring
1753		 * info, otherwise we just set up a private ring
1754		 * which we use an internal buffer
1755		 */
1756		if ((VSW_VER_GTEQ(ldcp, 1, 2) &&
1757		    (lane_in->xfer_mode & VIO_DRING_MODE_V1_2)) ||
1758		    (VSW_VER_LT(ldcp, 1, 2) &&
1759		    (lane_in->xfer_mode == VIO_DRING_MODE_V1_0))) {
1760			vsw_send_dring_info(ldcp);
1761			break;
1762		}
1763
1764		/*
1765		 * The peer doesn't operate in dring mode; we
1766		 * can simply fallthru to the RDX phase from
1767		 * here.
1768		 */
1769		/*FALLTHRU*/
1770
1771	case VSW_MILESTONE2:
1772		/*
1773		 * If peer has indicated in its attribute message that
1774		 * it wishes to use descriptor rings then the only way
1775		 * to pass this milestone is for us to have received
1776		 * valid dring info.
1777		 *
1778		 * If peer is not using descriptor rings then just fall
1779		 * through.
1780		 */
1781		if ((VSW_VER_GTEQ(ldcp, 1, 2) &&
1782		    (lane_in->xfer_mode & VIO_DRING_MODE_V1_2)) ||
1783		    (VSW_VER_LT(ldcp, 1, 2) &&
1784		    (lane_in->xfer_mode ==
1785		    VIO_DRING_MODE_V1_0))) {
1786			if (!(lane_in->lstate & VSW_DRING_ACK_SENT))
1787				break;
1788		}
1789
1790		D2(vswp, "%s: (chan %lld) leaving milestone 2",
1791		    __func__, ldcp->ldc_id);
1792
1793		ldcp->hphase = VSW_MILESTONE3;
1794		vsw_send_rdx(ldcp);
1795		break;
1796
1797	case VSW_MILESTONE3:
1798		/*
1799		 * Pass this milestone when all paramaters have been
1800		 * successfully exchanged and RDX sent in both directions.
1801		 *
1802		 * Mark the relevant lane as available to transmit data. In
1803		 * RxDringData mode, lane_in is associated with transmit and
1804		 * lane_out is associated with receive. It is the reverse in
1805		 * TxDring mode.
1806		 */
1807		if ((lane_out->lstate & VSW_RDX_ACK_SENT) &&
1808		    (lane_in->lstate & VSW_RDX_ACK_RECV)) {
1809
1810			D2(vswp, "%s: (chan %lld) leaving milestone 3",
1811			    __func__, ldcp->ldc_id);
1812			D2(vswp, "%s: ** handshake complete (0x%llx : "
1813			    "0x%llx) **", __func__, lane_in->lstate,
1814			    lane_out->lstate);
1815			if (lane_out->dring_mode == VIO_RX_DRING_DATA) {
1816				lane_in->lstate |= VSW_LANE_ACTIVE;
1817			} else {
1818				lane_out->lstate |= VSW_LANE_ACTIVE;
1819			}
1820			ldcp->hphase = VSW_MILESTONE4;
1821			ldcp->hcnt = 0;
1822			DISPLAY_STATE();
1823			/* Start HIO if enabled and capable */
1824			if ((portp->p_hio_enabled) && (portp->p_hio_capable)) {
1825				D2(vswp, "%s: start HybridIO setup", __func__);
1826				vsw_hio_start(vswp, ldcp);
1827			}
1828
1829			if (ldcp->pls_negotiated == B_TRUE) {
1830				/*
1831				 * The vnet device has negotiated to get phys
1832				 * link updates. Now that the handshake with
1833				 * the vnet device is complete, send an initial
1834				 * update with the current physical link state.
1835				 */
1836				vsw_send_physlink_msg(ldcp,
1837				    vswp->phys_link_state);
1838			}
1839
1840		} else {
1841			D2(vswp, "%s: still in milestone 3 (0x%llx : 0x%llx)",
1842			    __func__, lane_in->lstate,
1843			    lane_out->lstate);
1844		}
1845		break;
1846
1847	case VSW_MILESTONE4:
1848		D2(vswp, "%s: (chan %lld) in milestone 4", __func__,
1849		    ldcp->ldc_id);
1850		break;
1851
1852	default:
1853		DERR(vswp, "%s: (chan %lld) Unknown Phase %x", __func__,
1854		    ldcp->ldc_id, ldcp->hphase);
1855	}
1856
1857	D1(vswp, "%s (chan %lld): exit (phase %ld)", __func__, ldcp->ldc_id,
1858	    ldcp->hphase);
1859}
1860
1861/*
1862 * Check if major version is supported.
1863 *
1864 * Returns 0 if finds supported major number, and if necessary
1865 * adjusts the minor field.
1866 *
1867 * Returns 1 if can't match major number exactly. Sets mjor/minor
1868 * to next lowest support values, or to zero if no other values possible.
1869 */
1870static int
1871vsw_supported_version(vio_ver_msg_t *vp)
1872{
1873	int	i;
1874
1875	D1(NULL, "vsw_supported_version: enter");
1876
1877	for (i = 0; i < VSW_NUM_VER; i++) {
1878		if (vsw_versions[i].ver_major == vp->ver_major) {
1879			/*
1880			 * Matching or lower major version found. Update
1881			 * minor number if necessary.
1882			 */
1883			if (vp->ver_minor > vsw_versions[i].ver_minor) {
1884				D2(NULL, "%s: adjusting minor value from %d "
1885				    "to %d", __func__, vp->ver_minor,
1886				    vsw_versions[i].ver_minor);
1887				vp->ver_minor = vsw_versions[i].ver_minor;
1888			}
1889
1890			return (0);
1891		}
1892
1893		/*
1894		 * If the message contains a higher major version number, set
1895		 * the message's major/minor versions to the current values
1896		 * and return false, so this message will get resent with
1897		 * these values.
1898		 */
1899		if (vsw_versions[i].ver_major < vp->ver_major) {
1900			D2(NULL, "%s: adjusting major and minor "
1901			    "values to %d, %d\n",
1902			    __func__, vsw_versions[i].ver_major,
1903			    vsw_versions[i].ver_minor);
1904			vp->ver_major = vsw_versions[i].ver_major;
1905			vp->ver_minor = vsw_versions[i].ver_minor;
1906			return (1);
1907		}
1908	}
1909
1910	/* No match was possible, zero out fields */
1911	vp->ver_major = 0;
1912	vp->ver_minor = 0;
1913
1914	D1(NULL, "vsw_supported_version: exit");
1915
1916	return (1);
1917}
1918
1919/*
1920 * Set vnet-protocol-version dependent functions based on version.
1921 */
1922static void
1923vsw_set_vnet_proto_ops(vsw_ldc_t *ldcp)
1924{
1925	vsw_t	*vswp = ldcp->ldc_vswp;
1926	lane_t	*lp = &ldcp->lane_out;
1927
1928	/*
1929	 * Setup the appropriate dring data processing routine and any
1930	 * associated thread based on the version.
1931	 *
1932	 * In versions < 1.6, we support only TxDring mode. In this mode, the
1933	 * msg worker thread processes all types of VIO msgs (ctrl and data).
1934	 *
1935	 * In versions >= 1.6, we also support RxDringData mode. In this mode,
1936	 * the rcv worker thread processes dring data messages (msgtype:
1937	 * VIO_TYPE_DATA, subtype: VIO_SUBTYPE_INFO, env: VIO_DRING_DATA). The
1938	 * rest of the data messages (including acks) and ctrl messages are
1939	 * handled directly by the callback (intr) thread.
1940	 *
1941	 * However, for versions >= 1.6, we could still fallback to TxDring
1942	 * mode. This could happen if RxDringData mode has been disabled (see
1943	 * below) on this guest or on the peer guest. This info is determined
1944	 * as part of attr exchange phase of handshake. Hence, we setup these
1945	 * pointers for v1.6 after attr msg phase completes during handshake.
1946	 */
1947	if (VSW_VER_GTEQ(ldcp, 1, 6)) {
1948		/*
1949		 * Set data dring mode for vsw_send_attr(). We setup msg worker
1950		 * thread in TxDring mode or rcv worker thread in RxDringData
1951		 * mode when attr phase of handshake completes.
1952		 */
1953		if (vsw_mapin_avail(ldcp) == B_TRUE) {
1954			lp->dring_mode = (VIO_RX_DRING_DATA | VIO_TX_DRING);
1955		} else {
1956			lp->dring_mode = VIO_TX_DRING;
1957		}
1958	} else {
1959		lp->dring_mode = VIO_TX_DRING;
1960	}
1961
1962	/*
1963	 * Setup the MTU for attribute negotiation based on the version.
1964	 */
1965	if (VSW_VER_GTEQ(ldcp, 1, 4)) {
1966		/*
1967		 * If the version negotiated with peer is >= 1.4(Jumbo Frame
1968		 * Support), set the mtu in our attributes to max_frame_size.
1969		 */
1970		lp->mtu = vswp->max_frame_size;
1971	} else if (VSW_VER_EQ(ldcp, 1, 3)) {
1972		/*
1973		 * If the version negotiated with peer is == 1.3 (Vlan Tag
1974		 * Support) set the attr.mtu to ETHERMAX + VLAN_TAGSZ.
1975		 */
1976		lp->mtu = ETHERMAX + VLAN_TAGSZ;
1977	} else {
1978		vsw_port_t	*portp = ldcp->ldc_port;
1979		/*
1980		 * Pre-1.3 peers expect max frame size of ETHERMAX.
1981		 * We can negotiate that size with those peers provided only
1982		 * pvid is defined for our peer and there are no vids. Then we
1983		 * can send/recv only untagged frames of max size ETHERMAX.
1984		 * Note that pvid of the peer can be different, as vsw has to
1985		 * serve the vnet in that vlan even if itself is not assigned
1986		 * to that vlan.
1987		 */
1988		if (portp->nvids == 0) {
1989			lp->mtu = ETHERMAX;
1990		}
1991	}
1992
1993	/*
1994	 * Setup version dependent data processing functions.
1995	 */
1996	if (VSW_VER_GTEQ(ldcp, 1, 2)) {
1997		/* Versions >= 1.2 */
1998
1999		if (VSW_PRI_ETH_DEFINED(vswp)) {
2000			/*
2001			 * enable priority routines and pkt mode only if
2002			 * at least one pri-eth-type is specified in MD.
2003			 */
2004			ldcp->tx = vsw_ldctx_pri;
2005			ldcp->rx_pktdata = vsw_process_pkt_data;
2006
2007			/* set xfer mode for vsw_send_attr() */
2008			lp->xfer_mode = VIO_PKT_MODE | VIO_DRING_MODE_V1_2;
2009		} else {
2010			/* no priority eth types defined in MD */
2011
2012			ldcp->tx = vsw_ldctx;
2013			ldcp->rx_pktdata = vsw_process_pkt_data_nop;
2014
2015			/* set xfer mode for vsw_send_attr() */
2016			lp->xfer_mode = VIO_DRING_MODE_V1_2;
2017		}
2018
2019	} else {
2020		/* Versions prior to 1.2  */
2021
2022		vsw_reset_vnet_proto_ops(ldcp);
2023	}
2024}
2025
2026/*
2027 * Reset vnet-protocol-version dependent functions to v1.0.
2028 */
2029static void
2030vsw_reset_vnet_proto_ops(vsw_ldc_t *ldcp)
2031{
2032	lane_t	*lp = &ldcp->lane_out;
2033
2034	ldcp->tx = vsw_ldctx;
2035	ldcp->rx_pktdata = vsw_process_pkt_data_nop;
2036
2037	/* set xfer mode for vsw_send_attr() */
2038	lp->xfer_mode = VIO_DRING_MODE_V1_0;
2039}
2040
2041static void
2042vsw_process_evt_read(vsw_ldc_t *ldcp)
2043{
2044	if (ldcp->msg_thread != NULL) {
2045		/*
2046		 * TxDring mode; wakeup message worker
2047		 * thread to process the VIO messages.
2048		 */
2049		mutex_exit(&ldcp->ldc_cblock);
2050		mutex_enter(&ldcp->msg_thr_lock);
2051		if (!(ldcp->msg_thr_flags & VSW_WTHR_DATARCVD)) {
2052			ldcp->msg_thr_flags |= VSW_WTHR_DATARCVD;
2053			cv_signal(&ldcp->msg_thr_cv);
2054		}
2055		mutex_exit(&ldcp->msg_thr_lock);
2056		mutex_enter(&ldcp->ldc_cblock);
2057	} else {
2058		/*
2059		 * We invoke vsw_process_pkt() in the context of the LDC
2060		 * callback (vsw_ldc_cb()) during handshake, until the dring
2061		 * mode is negotiated. After the dring mode is negotiated, the
2062		 * msgs are processed by the msg worker thread (above case) if
2063		 * the dring mode is TxDring. Otherwise (in RxDringData mode)
2064		 * we continue to process the msgs directly in the callback
2065		 * context.
2066		 */
2067		vsw_process_pkt(ldcp);
2068	}
2069}
2070
2071/*
2072 * Main routine for processing messages received over LDC.
2073 */
2074void
2075vsw_process_pkt(void *arg)
2076{
2077	vsw_ldc_t	*ldcp = (vsw_ldc_t  *)arg;
2078	vsw_t 		*vswp = ldcp->ldc_vswp;
2079	size_t		msglen;
2080	vio_msg_tag_t	*tagp;
2081	uint64_t	*ldcmsg;
2082	int 		rv = 0;
2083
2084
2085	D1(vswp, "%s enter: ldcid (%lld)\n", __func__, ldcp->ldc_id);
2086
2087	ASSERT(MUTEX_HELD(&ldcp->ldc_cblock));
2088
2089	ldcmsg = ldcp->ldcmsg;
2090	/*
2091	 * If channel is up read messages until channel is empty.
2092	 */
2093	do {
2094		msglen = ldcp->msglen;
2095		rv = ldc_read(ldcp->ldc_handle, (caddr_t)ldcmsg, &msglen);
2096
2097		if (rv != 0) {
2098			DERR(vswp, "%s :ldc_read err id(%lld) rv(%d) len(%d)\n",
2099			    __func__, ldcp->ldc_id, rv, msglen);
2100		}
2101
2102		/* channel has been reset */
2103		if (rv == ECONNRESET) {
2104			vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
2105			break;
2106		}
2107
2108		if (msglen == 0) {
2109			D2(vswp, "%s: ldc_read id(%lld) NODATA", __func__,
2110			    ldcp->ldc_id);
2111			break;
2112		}
2113
2114		D2(vswp, "%s: ldc_read id(%lld): msglen(%d)", __func__,
2115		    ldcp->ldc_id, msglen);
2116
2117		/*
2118		 * Figure out what sort of packet we have gotten by
2119		 * examining the msg tag, and then switch it appropriately.
2120		 */
2121		tagp = (vio_msg_tag_t *)ldcmsg;
2122
2123		switch (tagp->vio_msgtype) {
2124		case VIO_TYPE_CTRL:
2125			vsw_dispatch_ctrl_task(ldcp, ldcmsg, tagp, msglen);
2126			break;
2127		case VIO_TYPE_DATA:
2128			vsw_process_data_pkt(ldcp, ldcmsg, tagp, msglen);
2129			break;
2130		case VIO_TYPE_ERR:
2131			vsw_process_err_pkt(ldcp, ldcmsg, tagp);
2132			break;
2133		default:
2134			DERR(vswp, "%s: Unknown tag(%lx) ", __func__,
2135			    "id(%lx)\n", tagp->vio_msgtype, ldcp->ldc_id);
2136			break;
2137		}
2138	} while (msglen);
2139
2140	D1(vswp, "%s exit: ldcid (%lld)\n", __func__, ldcp->ldc_id);
2141}
2142
2143/*
2144 * Dispatch a task to process a VIO control message.
2145 */
2146static void
2147vsw_dispatch_ctrl_task(vsw_ldc_t *ldcp, void *cpkt, vio_msg_tag_t *tagp,
2148	int msglen)
2149{
2150	vsw_ctrl_task_t		*ctaskp = NULL;
2151	vsw_port_t		*port = ldcp->ldc_port;
2152	vsw_t			*vswp = port->p_vswp;
2153
2154	D1(vswp, "%s: enter", __func__);
2155
2156	/*
2157	 * We need to handle RDX ACK messages in-band as once they
2158	 * are exchanged it is possible that we will get an
2159	 * immediate (legitimate) data packet.
2160	 */
2161	if ((tagp->vio_subtype_env == VIO_RDX) &&
2162	    (tagp->vio_subtype == VIO_SUBTYPE_ACK)) {
2163
2164		if (vsw_check_flag(ldcp, INBOUND, VSW_RDX_ACK_RECV))
2165			return;
2166
2167		ldcp->lane_in.lstate |= VSW_RDX_ACK_RECV;
2168		D2(vswp, "%s (%ld) handling RDX_ACK in place "
2169		    "(ostate 0x%llx : hphase %d)", __func__,
2170		    ldcp->ldc_id, ldcp->lane_in.lstate, ldcp->hphase);
2171		vsw_next_milestone(ldcp);
2172		return;
2173	}
2174
2175	ctaskp = kmem_alloc(sizeof (vsw_ctrl_task_t), KM_NOSLEEP);
2176
2177	if (ctaskp == NULL) {
2178		DERR(vswp, "%s: unable to alloc space for ctrl msg", __func__);
2179		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
2180		return;
2181	}
2182
2183	ctaskp->ldcp = ldcp;
2184	bcopy((def_msg_t *)cpkt, &ctaskp->pktp, msglen);
2185	ctaskp->hss_id = ldcp->hss_id;
2186
2187	/*
2188	 * Dispatch task to processing taskq if port is not in
2189	 * the process of being detached.
2190	 */
2191	mutex_enter(&port->state_lock);
2192	if (port->state == VSW_PORT_INIT) {
2193		if ((vswp->taskq_p == NULL) ||
2194		    (ddi_taskq_dispatch(vswp->taskq_p, vsw_process_ctrl_pkt,
2195		    ctaskp, DDI_NOSLEEP) != DDI_SUCCESS)) {
2196			mutex_exit(&port->state_lock);
2197			DERR(vswp, "%s: unable to dispatch task to taskq",
2198			    __func__);
2199			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
2200			kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
2201			return;
2202		}
2203	} else {
2204		kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
2205		DWARN(vswp, "%s: port %d detaching, not dispatching "
2206		    "task", __func__, port->p_instance);
2207	}
2208
2209	mutex_exit(&port->state_lock);
2210
2211	D2(vswp, "%s: dispatched task to taskq for chan %d", __func__,
2212	    ldcp->ldc_id);
2213	D1(vswp, "%s: exit", __func__);
2214}
2215
2216/*
2217 * Process a VIO ctrl message. Invoked from taskq.
2218 */
2219static void
2220vsw_process_ctrl_pkt(void *arg)
2221{
2222	vsw_ctrl_task_t	*ctaskp = (vsw_ctrl_task_t *)arg;
2223	vsw_ldc_t	*ldcp = ctaskp->ldcp;
2224	vsw_t 		*vswp = ldcp->ldc_vswp;
2225	vio_msg_tag_t	tag;
2226	uint16_t	env;
2227
2228	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
2229
2230	bcopy(&ctaskp->pktp, &tag, sizeof (vio_msg_tag_t));
2231	env = tag.vio_subtype_env;
2232
2233	/* stale pkt check */
2234	if (ctaskp->hss_id < ldcp->hss_id) {
2235		DWARN(vswp, "%s: discarding stale packet belonging to earlier"
2236		    " (%ld) handshake session", __func__, ctaskp->hss_id);
2237		kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
2238		return;
2239	}
2240
2241	/* session id check */
2242	if (ldcp->session_status & VSW_PEER_SESSION) {
2243		if (ldcp->peer_session != tag.vio_sid) {
2244			DERR(vswp, "%s (chan %d): invalid session id (%llx)",
2245			    __func__, ldcp->ldc_id, tag.vio_sid);
2246			kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
2247			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
2248			return;
2249		}
2250	}
2251
2252	/*
2253	 * Switch on vio_subtype envelope, then let lower routines
2254	 * decide if its an INFO, ACK or NACK packet.
2255	 */
2256	switch (env) {
2257	case VIO_VER_INFO:
2258		vsw_process_ctrl_ver_pkt(ldcp, &ctaskp->pktp);
2259		break;
2260	case VIO_DRING_REG:
2261		vsw_process_ctrl_dring_reg_pkt(ldcp, &ctaskp->pktp);
2262		break;
2263	case VIO_DRING_UNREG:
2264		vsw_process_ctrl_dring_unreg_pkt(ldcp, &ctaskp->pktp);
2265		break;
2266	case VIO_ATTR_INFO:
2267		vsw_process_ctrl_attr_pkt(ldcp, &ctaskp->pktp);
2268		break;
2269	case VNET_MCAST_INFO:
2270		vsw_process_ctrl_mcst_pkt(ldcp, &ctaskp->pktp);
2271		break;
2272	case VIO_RDX:
2273		vsw_process_ctrl_rdx_pkt(ldcp, &ctaskp->pktp);
2274		break;
2275	case VIO_DDS_INFO:
2276		vsw_process_dds_msg(vswp, ldcp, &ctaskp->pktp);
2277		break;
2278
2279	case VNET_PHYSLINK_INFO:
2280		vsw_process_physlink_msg(ldcp, &ctaskp->pktp);
2281		break;
2282	default:
2283		DERR(vswp, "%s: unknown vio_subtype_env (%x)\n", __func__, env);
2284	}
2285
2286	kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
2287	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
2288}
2289
2290/*
2291 * Version negotiation. We can end up here either because our peer
2292 * has responded to a handshake message we have sent it, or our peer
2293 * has initiated a handshake with us. If its the former then can only
2294 * be ACK or NACK, if its the later can only be INFO.
2295 *
2296 * If its an ACK we move to the next stage of the handshake, namely
2297 * attribute exchange. If its a NACK we see if we can specify another
2298 * version, if we can't we stop.
2299 *
2300 * If it is an INFO we reset all params associated with communication
2301 * in that direction over this channel (remember connection is
2302 * essentially 2 independent simplex channels).
2303 */
2304void
2305vsw_process_ctrl_ver_pkt(vsw_ldc_t *ldcp, void *pkt)
2306{
2307	vio_ver_msg_t	*ver_pkt;
2308	vsw_t 		*vswp = ldcp->ldc_vswp;
2309
2310	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
2311
2312	/*
2313	 * We know this is a ctrl/version packet so
2314	 * cast it into the correct structure.
2315	 */
2316	ver_pkt = (vio_ver_msg_t *)pkt;
2317
2318	switch (ver_pkt->tag.vio_subtype) {
2319	case VIO_SUBTYPE_INFO:
2320		D2(vswp, "vsw_process_ctrl_ver_pkt: VIO_SUBTYPE_INFO\n");
2321
2322		/*
2323		 * Record the session id, which we will use from now
2324		 * until we see another VER_INFO msg. Even then the
2325		 * session id in most cases will be unchanged, execpt
2326		 * if channel was reset.
2327		 */
2328		if ((ldcp->session_status & VSW_PEER_SESSION) &&
2329		    (ldcp->peer_session != ver_pkt->tag.vio_sid)) {
2330			DERR(vswp, "%s: updating session id for chan %lld "
2331			    "from %llx to %llx", __func__, ldcp->ldc_id,
2332			    ldcp->peer_session, ver_pkt->tag.vio_sid);
2333		}
2334
2335		ldcp->peer_session = ver_pkt->tag.vio_sid;
2336		ldcp->session_status |= VSW_PEER_SESSION;
2337
2338		/* Legal message at this time ? */
2339		if (vsw_check_flag(ldcp, INBOUND, VSW_VER_INFO_RECV))
2340			return;
2341
2342		/*
2343		 * First check the device class. Currently only expect
2344		 * to be talking to a network device. In the future may
2345		 * also talk to another switch.
2346		 */
2347		if (ver_pkt->dev_class != VDEV_NETWORK) {
2348			DERR(vswp, "%s: illegal device class %d", __func__,
2349			    ver_pkt->dev_class);
2350
2351			ver_pkt->tag.vio_sid = ldcp->local_session;
2352			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
2353
2354			DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt);
2355
2356			(void) vsw_send_msg(ldcp, (void *)ver_pkt,
2357			    sizeof (vio_ver_msg_t), B_TRUE);
2358
2359			ldcp->lane_in.lstate |= VSW_VER_NACK_SENT;
2360			vsw_next_milestone(ldcp);
2361			return;
2362		} else {
2363			ldcp->dev_class = ver_pkt->dev_class;
2364		}
2365
2366		/*
2367		 * Now check the version.
2368		 */
2369		if (vsw_supported_version(ver_pkt) == 0) {
2370			/*
2371			 * Support this major version and possibly
2372			 * adjusted minor version.
2373			 */
2374
2375			D2(vswp, "%s: accepted ver %d:%d", __func__,
2376			    ver_pkt->ver_major, ver_pkt->ver_minor);
2377
2378			/* Store accepted values */
2379			ldcp->lane_in.ver_major = ver_pkt->ver_major;
2380			ldcp->lane_in.ver_minor = ver_pkt->ver_minor;
2381
2382			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
2383
2384			ldcp->lane_in.lstate |= VSW_VER_ACK_SENT;
2385
2386			if (vsw_obp_ver_proto_workaround == B_TRUE) {
2387				/*
2388				 * Send a version info message
2389				 * using the accepted version that
2390				 * we are about to ack. Also note that
2391				 * we send our ver info before we ack.
2392				 * Otherwise, as soon as receiving the
2393				 * ack, obp sends attr info msg, which
2394				 * breaks vsw_check_flag() invoked
2395				 * from vsw_process_ctrl_attr_pkt();
2396				 * as we also need VSW_VER_ACK_RECV to
2397				 * be set in lane_out.lstate, before
2398				 * we can receive attr info.
2399				 */
2400				vsw_send_ver(ldcp);
2401			}
2402		} else {
2403			/*
2404			 * NACK back with the next lower major/minor
2405			 * pairing we support (if don't suuport any more
2406			 * versions then they will be set to zero.
2407			 */
2408
2409			D2(vswp, "%s: replying with ver %d:%d", __func__,
2410			    ver_pkt->ver_major, ver_pkt->ver_minor);
2411
2412			/* Store updated values */
2413			ldcp->lane_in.ver_major = ver_pkt->ver_major;
2414			ldcp->lane_in.ver_minor = ver_pkt->ver_minor;
2415
2416			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
2417
2418			ldcp->lane_in.lstate |= VSW_VER_NACK_SENT;
2419		}
2420
2421		DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt);
2422		ver_pkt->tag.vio_sid = ldcp->local_session;
2423		(void) vsw_send_msg(ldcp, (void *)ver_pkt,
2424		    sizeof (vio_ver_msg_t), B_TRUE);
2425
2426		vsw_next_milestone(ldcp);
2427		break;
2428
2429	case VIO_SUBTYPE_ACK:
2430		D2(vswp, "%s: VIO_SUBTYPE_ACK\n", __func__);
2431
2432		if (vsw_check_flag(ldcp, OUTBOUND, VSW_VER_ACK_RECV))
2433			return;
2434
2435		/* Store updated values */
2436		ldcp->lane_out.ver_major = ver_pkt->ver_major;
2437		ldcp->lane_out.ver_minor = ver_pkt->ver_minor;
2438
2439		ldcp->lane_out.lstate |= VSW_VER_ACK_RECV;
2440		vsw_next_milestone(ldcp);
2441
2442		break;
2443
2444	case VIO_SUBTYPE_NACK:
2445		D2(vswp, "%s: VIO_SUBTYPE_NACK\n", __func__);
2446
2447		if (vsw_check_flag(ldcp, OUTBOUND, VSW_VER_NACK_RECV))
2448			return;
2449
2450		/*
2451		 * If our peer sent us a NACK with the ver fields set to
2452		 * zero then there is nothing more we can do. Otherwise see
2453		 * if we support either the version suggested, or a lesser
2454		 * one.
2455		 */
2456		if ((ver_pkt->ver_major == 0) && (ver_pkt->ver_minor == 0)) {
2457			DERR(vswp, "%s: peer unable to negotiate any "
2458			    "further.", __func__);
2459			ldcp->lane_out.lstate |= VSW_VER_NACK_RECV;
2460			vsw_next_milestone(ldcp);
2461			return;
2462		}
2463
2464		/*
2465		 * Check to see if we support this major version or
2466		 * a lower one. If we don't then maj/min will be set
2467		 * to zero.
2468		 */
2469		(void) vsw_supported_version(ver_pkt);
2470		if ((ver_pkt->ver_major == 0) && (ver_pkt->ver_minor == 0)) {
2471			/* Nothing more we can do */
2472			DERR(vswp, "%s: version negotiation failed.\n",
2473			    __func__);
2474			ldcp->lane_out.lstate |= VSW_VER_NACK_RECV;
2475			vsw_next_milestone(ldcp);
2476		} else {
2477			/* found a supported major version */
2478			ldcp->lane_out.ver_major = ver_pkt->ver_major;
2479			ldcp->lane_out.ver_minor = ver_pkt->ver_minor;
2480
2481			D2(vswp, "%s: resending with updated values (%x, %x)",
2482			    __func__, ver_pkt->ver_major, ver_pkt->ver_minor);
2483
2484			ldcp->lane_out.lstate |= VSW_VER_INFO_SENT;
2485			ver_pkt->tag.vio_sid = ldcp->local_session;
2486			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_INFO;
2487
2488			DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt);
2489
2490			(void) vsw_send_msg(ldcp, (void *)ver_pkt,
2491			    sizeof (vio_ver_msg_t), B_TRUE);
2492
2493			vsw_next_milestone(ldcp);
2494
2495		}
2496		break;
2497
2498	default:
2499		DERR(vswp, "%s: unknown vio_subtype %x\n", __func__,
2500		    ver_pkt->tag.vio_subtype);
2501	}
2502
2503	D1(vswp, "%s(%lld): exit\n", __func__, ldcp->ldc_id);
2504}
2505
2506static int
2507vsw_process_attr_info(vsw_ldc_t *ldcp, vnet_attr_msg_t *msg)
2508{
2509	vsw_t			*vswp = ldcp->ldc_vswp;
2510	vsw_port_t		*port = ldcp->ldc_port;
2511	struct ether_addr	ea;
2512	uint64_t		macaddr = 0;
2513	lane_t			*lane_out = &ldcp->lane_out;
2514	lane_t			*lane_in = &ldcp->lane_in;
2515	uint32_t		mtu;
2516	int			i;
2517	uint8_t			dring_mode;
2518
2519	D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
2520
2521	if (vsw_check_flag(ldcp, INBOUND, VSW_ATTR_INFO_RECV)) {
2522		return (1);
2523	}
2524
2525	if ((msg->xfer_mode != VIO_DESC_MODE) &&
2526	    (msg->xfer_mode != lane_out->xfer_mode)) {
2527		D2(NULL, "%s: unknown mode %x\n", __func__, msg->xfer_mode);
2528		return (1);
2529	}
2530
2531	/* Only support MAC addresses at moment. */
2532	if ((msg->addr_type != ADDR_TYPE_MAC) || (msg->addr == 0)) {
2533		D2(NULL, "%s: invalid addr_type %x, or address 0x%llx\n",
2534		    __func__, msg->addr_type, msg->addr);
2535		return (1);
2536	}
2537
2538	/*
2539	 * MAC address supplied by device should match that stored
2540	 * in the vsw-port OBP node. Need to decide what to do if they
2541	 * don't match, for the moment just warn but don't fail.
2542	 */
2543	vnet_macaddr_ultostr(msg->addr, ea.ether_addr_octet);
2544	if (ether_cmp(&ea, &port->p_macaddr) != 0) {
2545		DERR(NULL, "%s: device supplied address "
2546		    "0x%llx doesn't match node address 0x%llx\n",
2547		    __func__, msg->addr, port->p_macaddr);
2548	}
2549
2550	/*
2551	 * Ack freq only makes sense in pkt mode, in shared
2552	 * mode the ring descriptors say whether or not to
2553	 * send back an ACK.
2554	 */
2555	if ((VSW_VER_GTEQ(ldcp, 1, 2) &&
2556	    (msg->xfer_mode & VIO_DRING_MODE_V1_2)) ||
2557	    (VSW_VER_LT(ldcp, 1, 2) &&
2558	    (msg->xfer_mode == VIO_DRING_MODE_V1_0))) {
2559		if (msg->ack_freq > 0) {
2560			D2(NULL, "%s: non zero ack freq in SHM mode\n",
2561			    __func__);
2562			return (1);
2563		}
2564	}
2565
2566	/*
2567	 * Process dring mode attribute.
2568	 */
2569	if (VSW_VER_GTEQ(ldcp, 1, 6)) {
2570		/*
2571		 * Versions >= 1.6:
2572		 * Though we are operating in v1.6 mode, it is possible that
2573		 * RxDringData mode has been disabled either on this guest or
2574		 * on the peer guest. If so, we revert to pre v1.6 behavior of
2575		 * TxDring mode. But this must be agreed upon in both
2576		 * directions of attr exchange. We first determine the mode
2577		 * that can be negotiated.
2578		 */
2579		if ((msg->options & VIO_RX_DRING_DATA) != 0 &&
2580		    vsw_mapin_avail(ldcp) == B_TRUE) {
2581			/*
2582			 * The peer is capable of handling RxDringData AND we
2583			 * are also capable of it; we enable RxDringData mode
2584			 * on this channel.
2585			 */
2586			dring_mode = VIO_RX_DRING_DATA;
2587		} else if ((msg->options & VIO_TX_DRING) != 0) {
2588			/*
2589			 * If the peer is capable of TxDring mode, we
2590			 * negotiate TxDring mode on this channel.
2591			 */
2592			dring_mode = VIO_TX_DRING;
2593		} else {
2594			/*
2595			 * We support only VIO_TX_DRING and VIO_RX_DRING_DATA
2596			 * modes. We don't support VIO_RX_DRING mode.
2597			 */
2598			return (1);
2599		}
2600
2601		/*
2602		 * If we have received an ack for the attr info that we sent,
2603		 * then check if the dring mode matches what the peer had ack'd
2604		 * (saved in lane_out). If they don't match, we fail the
2605		 * handshake.
2606		 */
2607		if (lane_out->lstate & VSW_ATTR_ACK_RECV) {
2608			if (msg->options != lane_out->dring_mode) {
2609				/* send NACK */
2610				return (1);
2611			}
2612		} else {
2613			/*
2614			 * Save the negotiated dring mode in our attr
2615			 * parameters, so it gets sent in the attr info from us
2616			 * to the peer.
2617			 */
2618			lane_out->dring_mode = dring_mode;
2619		}
2620
2621		/* save the negotiated dring mode in the msg to be replied */
2622		msg->options = dring_mode;
2623	}
2624
2625	/*
2626	 * Process MTU attribute.
2627	 */
2628	if (VSW_VER_GTEQ(ldcp, 1, 4)) {
2629		/*
2630		 * Versions >= 1.4:
2631		 * Validate mtu of the peer is at least ETHERMAX. Then, the mtu
2632		 * is negotiated down to the minimum of our mtu and peer's mtu.
2633		 */
2634		if (msg->mtu < ETHERMAX) {
2635			return (1);
2636		}
2637
2638		mtu = MIN(msg->mtu, vswp->max_frame_size);
2639
2640		/*
2641		 * If we have received an ack for the attr info
2642		 * that we sent, then check if the mtu computed
2643		 * above matches the mtu that the peer had ack'd
2644		 * (saved in local hparams). If they don't
2645		 * match, we fail the handshake.
2646		 */
2647		if (lane_out->lstate & VSW_ATTR_ACK_RECV) {
2648			if (mtu != lane_out->mtu) {
2649				/* send NACK */
2650				return (1);
2651			}
2652		} else {
2653			/*
2654			 * Save the mtu computed above in our
2655			 * attr parameters, so it gets sent in
2656			 * the attr info from us to the peer.
2657			 */
2658			lane_out->mtu = mtu;
2659		}
2660
2661		/* save the MIN mtu in the msg to be replied */
2662		msg->mtu = mtu;
2663	} else {
2664		/* Versions < 1.4, mtu must match */
2665		if (msg->mtu != lane_out->mtu) {
2666			D2(NULL, "%s: invalid MTU (0x%llx)\n",
2667			    __func__, msg->mtu);
2668			return (1);
2669		}
2670	}
2671
2672	/*
2673	 * Otherwise store attributes for this lane and update
2674	 * lane state.
2675	 */
2676	lane_in->mtu = msg->mtu;
2677	lane_in->addr = msg->addr;
2678	lane_in->addr_type = msg->addr_type;
2679	lane_in->xfer_mode = msg->xfer_mode;
2680	lane_in->ack_freq = msg->ack_freq;
2681	lane_in->physlink_update = msg->physlink_update;
2682	lane_in->dring_mode = msg->options;
2683
2684	/*
2685	 * Check if the client has requested physlink state updates.
2686	 * If there is a physical device bound to this vswitch (L2
2687	 * mode), set the ack bits to indicate it is supported.
2688	 * Otherwise, set the nack bits.
2689	 */
2690	if (VSW_VER_GTEQ(ldcp, 1, 5)) {	/* Protocol ver >= 1.5 */
2691
2692		/* Does the vnet need phys link state updates ? */
2693		if ((lane_in->physlink_update &
2694		    PHYSLINK_UPDATE_STATE_MASK) ==
2695		    PHYSLINK_UPDATE_STATE) {
2696
2697			if (vswp->smode & VSW_LAYER2) {
2698				/* is a net-dev assigned to us ? */
2699				msg->physlink_update =
2700				    PHYSLINK_UPDATE_STATE_ACK;
2701				ldcp->pls_negotiated = B_TRUE;
2702			} else {
2703				/* not in L2 mode */
2704				msg->physlink_update =
2705				    PHYSLINK_UPDATE_STATE_NACK;
2706				ldcp->pls_negotiated = B_FALSE;
2707			}
2708
2709		} else {
2710			msg->physlink_update =
2711			    PHYSLINK_UPDATE_NONE;
2712			ldcp->pls_negotiated = B_FALSE;
2713		}
2714
2715	} else {
2716		/*
2717		 * physlink_update bits are ignored
2718		 * if set by clients < v1.5 protocol.
2719		 */
2720		msg->physlink_update = PHYSLINK_UPDATE_NONE;
2721		ldcp->pls_negotiated = B_FALSE;
2722	}
2723
2724	macaddr = lane_in->addr;
2725	for (i = ETHERADDRL - 1; i >= 0; i--) {
2726		port->p_macaddr.ether_addr_octet[i] = macaddr & 0xFF;
2727		macaddr >>= 8;
2728	}
2729
2730	/*
2731	 * Setup device specific xmit routines. Note this could be changed
2732	 * further in vsw_send_dring_info() for versions >= 1.6 if operating in
2733	 * RxDringData mode.
2734	 */
2735	mutex_enter(&port->tx_lock);
2736
2737	if ((VSW_VER_GTEQ(ldcp, 1, 2) &&
2738	    (lane_in->xfer_mode & VIO_DRING_MODE_V1_2)) ||
2739	    (VSW_VER_LT(ldcp, 1, 2) &&
2740	    (lane_in->xfer_mode == VIO_DRING_MODE_V1_0))) {
2741		D2(vswp, "%s: mode = VIO_DRING_MODE", __func__);
2742		port->transmit = vsw_dringsend;
2743	} else if (lane_in->xfer_mode == VIO_DESC_MODE) {
2744		D2(vswp, "%s: mode = VIO_DESC_MODE", __func__);
2745		vsw_create_privring(ldcp);
2746		port->transmit = vsw_descrsend;
2747		lane_out->xfer_mode = VIO_DESC_MODE;
2748	}
2749
2750	/*
2751	 * HybridIO is supported only vnet, not by OBP.
2752	 * So, set hio_capable to true only when in DRING mode.
2753	 */
2754	if (VSW_VER_GTEQ(ldcp, 1, 3) &&
2755	    (lane_in->xfer_mode != VIO_DESC_MODE)) {
2756		(void) atomic_swap_32(&port->p_hio_capable, B_TRUE);
2757	} else {
2758		(void) atomic_swap_32(&port->p_hio_capable, B_FALSE);
2759	}
2760
2761	mutex_exit(&port->tx_lock);
2762
2763	return (0);
2764}
2765
2766static int
2767vsw_process_attr_ack(vsw_ldc_t *ldcp, vnet_attr_msg_t *msg)
2768{
2769	vsw_t	*vswp = ldcp->ldc_vswp;
2770	lane_t	*lane_out = &ldcp->lane_out;
2771	lane_t	*lane_in = &ldcp->lane_in;
2772
2773	D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
2774
2775	if (vsw_check_flag(ldcp, OUTBOUND, VSW_ATTR_ACK_RECV)) {
2776		return (1);
2777	}
2778
2779	/*
2780	 * Process dring mode attribute.
2781	 */
2782	if (VSW_VER_GTEQ(ldcp, 1, 6)) {
2783		/*
2784		 * Versions >= 1.6:
2785		 * The ack msg sent by the peer contains the negotiated dring
2786		 * mode between our capability (that we had sent in our attr
2787		 * info) and the peer's capability.
2788		 */
2789		if (lane_in->lstate & VSW_ATTR_ACK_SENT) {
2790			/*
2791			 * If we have sent an ack for the attr info msg from
2792			 * the peer, check if the dring mode that was
2793			 * negotiated then (saved in lane_out) matches the
2794			 * mode that the peer has ack'd. If they don't match,
2795			 * we fail the handshake.
2796			 */
2797			if (lane_out->dring_mode != msg->options) {
2798				return (1);
2799			}
2800		} else {
2801			if ((msg->options & lane_out->dring_mode) == 0) {
2802				/*
2803				 * Peer ack'd with a mode that we don't
2804				 * support; we fail the handshake.
2805				 */
2806				return (1);
2807			}
2808			if ((msg->options & (VIO_TX_DRING|VIO_RX_DRING_DATA))
2809			    == (VIO_TX_DRING|VIO_RX_DRING_DATA)) {
2810				/*
2811				 * Peer must ack with only one negotiated mode.
2812				 * Otherwise fail handshake.
2813				 */
2814				return (1);
2815			}
2816
2817			/*
2818			 * Save the negotiated mode, so we can validate it when
2819			 * we receive attr info from the peer.
2820			 */
2821			lane_out->dring_mode = msg->options;
2822		}
2823	}
2824
2825	/*
2826	 * Process MTU attribute.
2827	 */
2828	if (VSW_VER_GTEQ(ldcp, 1, 4)) {
2829		/*
2830		 * Versions >= 1.4:
2831		 * The ack msg sent by the peer contains the minimum of
2832		 * our mtu (that we had sent in our attr info) and the
2833		 * peer's mtu.
2834		 *
2835		 * If we have sent an ack for the attr info msg from
2836		 * the peer, check if the mtu that was computed then
2837		 * (saved in lane_out params) matches the mtu that the
2838		 * peer has ack'd. If they don't match, we fail the
2839		 * handshake.
2840		 */
2841		if (lane_in->lstate & VSW_ATTR_ACK_SENT) {
2842			if (lane_out->mtu != msg->mtu) {
2843				return (1);
2844			}
2845		} else {
2846			/*
2847			 * If the mtu ack'd by the peer is > our mtu
2848			 * fail handshake. Otherwise, save the mtu, so
2849			 * we can validate it when we receive attr info
2850			 * from our peer.
2851			 */
2852			if (msg->mtu <= lane_out->mtu) {
2853				lane_out->mtu = msg->mtu;
2854			} else {
2855				return (1);
2856			}
2857		}
2858	}
2859
2860	return (0);
2861}
2862
2863/*
2864 * Process an attribute packet. We can end up here either because our peer
2865 * has ACK/NACK'ed back to an earlier ATTR msg we had sent it, or our
2866 * peer has sent us an attribute INFO message
2867 *
2868 * If its an ACK we then move to the next stage of the handshake which
2869 * is to send our descriptor ring info to our peer. If its a NACK then
2870 * there is nothing more we can (currently) do.
2871 *
2872 * If we get a valid/acceptable INFO packet (and we have already negotiated
2873 * a version) we ACK back and set channel state to ATTR_RECV, otherwise we
2874 * NACK back and reset channel state to INACTIV.
2875 *
2876 * FUTURE: in time we will probably negotiate over attributes, but for
2877 * the moment unacceptable attributes are regarded as a fatal error.
2878 *
2879 */
2880void
2881vsw_process_ctrl_attr_pkt(vsw_ldc_t *ldcp, void *pkt)
2882{
2883	vnet_attr_msg_t	*attr_pkt;
2884	vsw_t		*vswp = ldcp->ldc_vswp;
2885	lane_t		*lane_out = &ldcp->lane_out;
2886	lane_t		*lane_in = &ldcp->lane_in;
2887	int		rv;
2888
2889	D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);
2890
2891	/*
2892	 * We know this is a ctrl/attr packet so
2893	 * cast it into the correct structure.
2894	 */
2895	attr_pkt = (vnet_attr_msg_t *)pkt;
2896
2897	switch (attr_pkt->tag.vio_subtype) {
2898	case VIO_SUBTYPE_INFO:
2899
2900		rv = vsw_process_attr_info(ldcp, attr_pkt);
2901		if (rv != 0) {
2902			vsw_free_lane_resources(ldcp, INBOUND);
2903			attr_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
2904			ldcp->lane_in.lstate |= VSW_ATTR_NACK_SENT;
2905		} else {
2906			attr_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
2907			lane_in->lstate |= VSW_ATTR_ACK_SENT;
2908		}
2909		attr_pkt->tag.vio_sid = ldcp->local_session;
2910		DUMP_TAG_PTR((vio_msg_tag_t *)attr_pkt);
2911		(void) vsw_send_msg(ldcp, (void *)attr_pkt,
2912		    sizeof (vnet_attr_msg_t), B_TRUE);
2913		vsw_next_milestone(ldcp);
2914		break;
2915
2916	case VIO_SUBTYPE_ACK:
2917
2918		rv = vsw_process_attr_ack(ldcp, attr_pkt);
2919		if (rv != 0) {
2920			return;
2921		}
2922		lane_out->lstate |= VSW_ATTR_ACK_RECV;
2923		vsw_next_milestone(ldcp);
2924		break;
2925
2926	case VIO_SUBTYPE_NACK:
2927		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
2928
2929		if (vsw_check_flag(ldcp, OUTBOUND, VSW_ATTR_NACK_RECV))
2930			return;
2931
2932		lane_out->lstate |= VSW_ATTR_NACK_RECV;
2933		vsw_next_milestone(ldcp);
2934		break;
2935
2936	default:
2937		DERR(vswp, "%s: unknown vio_subtype %x\n", __func__,
2938		    attr_pkt->tag.vio_subtype);
2939	}
2940
2941	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
2942}
2943
2944static int
2945vsw_process_dring_reg_info(vsw_ldc_t *ldcp, vio_msg_tag_t *tagp)
2946{
2947	int		rv;
2948	vsw_t		*vswp = ldcp->ldc_vswp;
2949	lane_t		*lp = &ldcp->lane_out;
2950	dring_info_t	*dp = NULL;
2951
2952	D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
2953
2954	rv = vsw_check_flag(ldcp, INBOUND, VSW_DRING_INFO_RECV);
2955	if (rv != 0) {
2956		return (1);
2957	}
2958
2959	if (VSW_VER_GTEQ(ldcp, 1, 6) &&
2960	    (lp->dring_mode != ((vio_dring_reg_msg_t *)tagp)->options)) {
2961		/*
2962		 * The earlier version of Solaris vnet driver doesn't set the
2963		 * option (VIO_TX_DRING in its case) correctly in its dring reg
2964		 * message. We workaround that here by doing the check only
2965		 * for versions >= v1.6.
2966		 */
2967		DWARN(vswp, "%s(%lld): Rcvd dring reg option (%d), "
2968		    "negotiated mode (%d)\n", __func__, ldcp->ldc_id,
2969		    ((vio_dring_reg_msg_t *)tagp)->options, lp->dring_mode);
2970		return (1);
2971	}
2972
2973	/*
2974	 * Map dring exported by the peer.
2975	 */
2976	dp = vsw_map_dring(ldcp, (void *)tagp);
2977	if (dp == NULL) {
2978		return (1);
2979	}
2980
2981	/*
2982	 * Map data buffers exported by the peer if we are in RxDringData mode.
2983	 */
2984	if (lp->dring_mode == VIO_RX_DRING_DATA) {
2985		rv = vsw_map_data(ldcp, dp, (void *)tagp);
2986		if (rv != 0) {
2987			vsw_unmap_dring(ldcp);
2988			return (1);
2989		}
2990	}
2991
2992	return (0);
2993}
2994
2995static int
2996vsw_process_dring_reg_ack(vsw_ldc_t *ldcp, vio_msg_tag_t *tagp)
2997{
2998	vsw_t		*vswp = ldcp->ldc_vswp;
2999	dring_info_t	*dp;
3000
3001	D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
3002
3003	if (vsw_check_flag(ldcp, OUTBOUND, VSW_DRING_ACK_RECV)) {
3004		return (1);
3005	}
3006
3007	dp = ldcp->lane_out.dringp;
3008
3009	/* save dring_ident acked by peer */
3010	dp->ident = ((vio_dring_reg_msg_t *)tagp)->dring_ident;
3011
3012	return (0);
3013}
3014
3015/*
3016 * Process a dring info packet. We can end up here either because our peer
3017 * has ACK/NACK'ed back to an earlier DRING msg we had sent it, or our
3018 * peer has sent us a dring INFO message.
3019 *
3020 * If we get a valid/acceptable INFO packet (and we have already negotiated
3021 * a version) we ACK back and update the lane state, otherwise we NACK back.
3022 *
3023 * FUTURE: nothing to stop client from sending us info on multiple dring's
3024 * but for the moment we will just use the first one we are given.
3025 *
3026 */
3027void
3028vsw_process_ctrl_dring_reg_pkt(vsw_ldc_t *ldcp, void *pkt)
3029{
3030	int		rv;
3031	int		msgsize;
3032	dring_info_t	*dp;
3033	vio_msg_tag_t	*tagp = (vio_msg_tag_t *)pkt;
3034	vsw_t		*vswp = ldcp->ldc_vswp;
3035	lane_t		*lane_out = &ldcp->lane_out;
3036	lane_t		*lane_in = &ldcp->lane_in;
3037
3038	D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);
3039
3040	switch (tagp->vio_subtype) {
3041	case VIO_SUBTYPE_INFO:
3042		rv = vsw_process_dring_reg_info(ldcp, tagp);
3043		if (rv != 0) {
3044			vsw_free_lane_resources(ldcp, INBOUND);
3045			tagp->vio_subtype = VIO_SUBTYPE_NACK;
3046			lane_in->lstate |= VSW_DRING_NACK_SENT;
3047		} else {
3048			tagp->vio_subtype = VIO_SUBTYPE_ACK;
3049			lane_in->lstate |= VSW_DRING_ACK_SENT;
3050		}
3051		tagp->vio_sid = ldcp->local_session;
3052		DUMP_TAG_PTR(tagp);
3053		if (lane_out->dring_mode == VIO_RX_DRING_DATA) {
3054			dp = lane_in->dringp;
3055			msgsize =
3056			    VNET_DRING_REG_EXT_MSG_SIZE(dp->data_ncookies);
3057		} else {
3058			msgsize = sizeof (vio_dring_reg_msg_t);
3059		}
3060		(void) vsw_send_msg(ldcp, (void *)tagp, msgsize, B_TRUE);
3061		vsw_next_milestone(ldcp);
3062		break;
3063
3064	case VIO_SUBTYPE_ACK:
3065		rv = vsw_process_dring_reg_ack(ldcp, tagp);
3066		if (rv != 0) {
3067			return;
3068		}
3069		lane_out->lstate |= VSW_DRING_ACK_RECV;
3070		vsw_next_milestone(ldcp);
3071		break;
3072
3073	case VIO_SUBTYPE_NACK:
3074		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
3075
3076		if (vsw_check_flag(ldcp, OUTBOUND, VSW_DRING_NACK_RECV))
3077			return;
3078
3079		lane_out->lstate |= VSW_DRING_NACK_RECV;
3080		vsw_next_milestone(ldcp);
3081		break;
3082
3083	default:
3084		DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
3085		    tagp->vio_subtype);
3086	}
3087
3088	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
3089}
3090
3091/*
3092 * Process a request from peer to unregister a dring.
3093 *
3094 * For the moment we just restart the handshake if our
3095 * peer endpoint attempts to unregister a dring.
3096 */
3097void
3098vsw_process_ctrl_dring_unreg_pkt(vsw_ldc_t *ldcp, void *pkt)
3099{
3100	vsw_t			*vswp = ldcp->ldc_vswp;
3101	vio_dring_unreg_msg_t	*dring_pkt;
3102
3103	/*
3104	 * We know this is a ctrl/dring packet so
3105	 * cast it into the correct structure.
3106	 */
3107	dring_pkt = (vio_dring_unreg_msg_t *)pkt;
3108
3109	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
3110
3111	switch (dring_pkt->tag.vio_subtype) {
3112	case VIO_SUBTYPE_INFO:
3113		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
3114
3115		DWARN(vswp, "%s: restarting handshake..", __func__);
3116		break;
3117
3118	case VIO_SUBTYPE_ACK:
3119		D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
3120
3121		DWARN(vswp, "%s: restarting handshake..", __func__);
3122		break;
3123
3124	case VIO_SUBTYPE_NACK:
3125		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
3126
3127		DWARN(vswp, "%s: restarting handshake..", __func__);
3128		break;
3129
3130	default:
3131		DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
3132		    dring_pkt->tag.vio_subtype);
3133	}
3134
3135	vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
3136
3137	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
3138}
3139
3140#define	SND_MCST_NACK(ldcp, pkt) \
3141	pkt->tag.vio_subtype = VIO_SUBTYPE_NACK; \
3142	pkt->tag.vio_sid = ldcp->local_session; \
3143	(void) vsw_send_msg(ldcp, (void *)pkt, \
3144			sizeof (vnet_mcast_msg_t), B_TRUE);
3145
3146/*
3147 * Process a multicast request from a vnet.
3148 *
3149 * Vnet's specify a multicast address that they are interested in. This
3150 * address is used as a key into the hash table which forms the multicast
3151 * forwarding database (mFDB).
3152 *
3153 * The table keys are the multicast addresses, while the table entries
3154 * are pointers to lists of ports which wish to receive packets for the
3155 * specified multicast address.
3156 *
3157 * When a multicast packet is being switched we use the address as a key
3158 * into the hash table, and then walk the appropriate port list forwarding
3159 * the pkt to each port in turn.
3160 *
3161 * If a vnet is no longer interested in a particular multicast grouping
3162 * we simply find the correct location in the hash table and then delete
3163 * the relevant port from the port list.
3164 *
3165 * To deal with the case whereby a port is being deleted without first
3166 * removing itself from the lists in the hash table, we maintain a list
3167 * of multicast addresses the port has registered an interest in, within
3168 * the port structure itself. We then simply walk that list of addresses
3169 * using them as keys into the hash table and remove the port from the
3170 * appropriate lists.
3171 */
3172static void
3173vsw_process_ctrl_mcst_pkt(vsw_ldc_t *ldcp, void *pkt)
3174{
3175	vnet_mcast_msg_t	*mcst_pkt;
3176	vsw_port_t		*port = ldcp->ldc_port;
3177	vsw_t			*vswp = ldcp->ldc_vswp;
3178	int			i;
3179
3180	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
3181
3182	/*
3183	 * We know this is a ctrl/mcast packet so
3184	 * cast it into the correct structure.
3185	 */
3186	mcst_pkt = (vnet_mcast_msg_t *)pkt;
3187
3188	switch (mcst_pkt->tag.vio_subtype) {
3189	case VIO_SUBTYPE_INFO:
3190		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
3191
3192		/*
3193		 * Check if in correct state to receive a multicast
3194		 * message (i.e. handshake complete). If not reset
3195		 * the handshake.
3196		 */
3197		if (vsw_check_flag(ldcp, INBOUND, VSW_MCST_INFO_RECV))
3198			return;
3199
3200		/*
3201		 * Before attempting to add or remove address check
3202		 * that they are valid multicast addresses.
3203		 * If not, then NACK back.
3204		 */
3205		for (i = 0; i < mcst_pkt->count; i++) {
3206			if ((mcst_pkt->mca[i].ether_addr_octet[0] & 01) != 1) {
3207				DERR(vswp, "%s: invalid multicast address",
3208				    __func__);
3209				SND_MCST_NACK(ldcp, mcst_pkt);
3210				return;
3211			}
3212		}
3213
3214		/*
3215		 * Now add/remove the addresses. If this fails we
3216		 * NACK back.
3217		 */
3218		if (vsw_add_rem_mcst(mcst_pkt, port) != 0) {
3219			SND_MCST_NACK(ldcp, mcst_pkt);
3220			return;
3221		}
3222
3223		mcst_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
3224		mcst_pkt->tag.vio_sid = ldcp->local_session;
3225
3226		DUMP_TAG_PTR((vio_msg_tag_t *)mcst_pkt);
3227
3228		(void) vsw_send_msg(ldcp, (void *)mcst_pkt,
3229		    sizeof (vnet_mcast_msg_t), B_TRUE);
3230		break;
3231
3232	case VIO_SUBTYPE_ACK:
3233		DWARN(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
3234
3235		/*
3236		 * We shouldn't ever get a multicast ACK message as
3237		 * at the moment we never request multicast addresses
3238		 * to be set on some other device. This may change in
3239		 * the future if we have cascading switches.
3240		 */
3241		if (vsw_check_flag(ldcp, OUTBOUND, VSW_MCST_ACK_RECV))
3242			return;
3243
3244				/* Do nothing */
3245		break;
3246
3247	case VIO_SUBTYPE_NACK:
3248		DWARN(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
3249
3250		/*
3251		 * We shouldn't get a multicast NACK packet for the
3252		 * same reasons as we shouldn't get a ACK packet.
3253		 */
3254		if (vsw_check_flag(ldcp, OUTBOUND, VSW_MCST_NACK_RECV))
3255			return;
3256
3257				/* Do nothing */
3258		break;
3259
3260	default:
3261		DERR(vswp, "%s: unknown vio_subtype %x\n", __func__,
3262		    mcst_pkt->tag.vio_subtype);
3263	}
3264
3265	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
3266}
3267
3268static void
3269vsw_process_ctrl_rdx_pkt(vsw_ldc_t *ldcp, void *pkt)
3270{
3271	vio_rdx_msg_t	*rdx_pkt;
3272	vsw_t		*vswp = ldcp->ldc_vswp;
3273
3274	/*
3275	 * We know this is a ctrl/rdx packet so
3276	 * cast it into the correct structure.
3277	 */
3278	rdx_pkt = (vio_rdx_msg_t *)pkt;
3279
3280	D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);
3281
3282	switch (rdx_pkt->tag.vio_subtype) {
3283	case VIO_SUBTYPE_INFO:
3284		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
3285
3286		if (vsw_check_flag(ldcp, OUTBOUND, VSW_RDX_INFO_RECV))
3287			return;
3288
3289		rdx_pkt->tag.vio_sid = ldcp->local_session;
3290		rdx_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
3291
3292		DUMP_TAG_PTR((vio_msg_tag_t *)rdx_pkt);
3293
3294		ldcp->lane_out.lstate |= VSW_RDX_ACK_SENT;
3295
3296		(void) vsw_send_msg(ldcp, (void *)rdx_pkt,
3297		    sizeof (vio_rdx_msg_t), B_TRUE);
3298
3299		vsw_next_milestone(ldcp);
3300		break;
3301
3302	case VIO_SUBTYPE_ACK:
3303		/*
3304		 * Should be handled in-band by callback handler.
3305		 */
3306		DERR(vswp, "%s: Unexpected VIO_SUBTYPE_ACK", __func__);
3307		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
3308		break;
3309
3310	case VIO_SUBTYPE_NACK:
3311		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
3312
3313		if (vsw_check_flag(ldcp, INBOUND, VSW_RDX_NACK_RECV))
3314			return;
3315
3316		ldcp->lane_in.lstate |= VSW_RDX_NACK_RECV;
3317		vsw_next_milestone(ldcp);
3318		break;
3319
3320	default:
3321		DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
3322		    rdx_pkt->tag.vio_subtype);
3323	}
3324
3325	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
3326}
3327
3328static void
3329vsw_process_physlink_msg(vsw_ldc_t *ldcp, void *pkt)
3330{
3331	vnet_physlink_msg_t	*msgp;
3332	vsw_t			*vswp = ldcp->ldc_vswp;
3333
3334	msgp = (vnet_physlink_msg_t *)pkt;
3335
3336	D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);
3337
3338	switch (msgp->tag.vio_subtype) {
3339	case VIO_SUBTYPE_INFO:
3340
3341		/* vsw shouldn't recv physlink info */
3342		DWARN(vswp, "%s: Unexpected VIO_SUBTYPE_INFO", __func__);
3343		break;
3344
3345	case VIO_SUBTYPE_ACK:
3346
3347		D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
3348		break;
3349
3350	case VIO_SUBTYPE_NACK:
3351
3352		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
3353		break;
3354
3355	default:
3356		DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
3357		    msgp->tag.vio_subtype);
3358	}
3359
3360	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
3361}
3362
3363static void
3364vsw_process_data_pkt(vsw_ldc_t *ldcp, void *dpkt, vio_msg_tag_t *tagp,
3365	uint32_t msglen)
3366{
3367	uint16_t	env = tagp->vio_subtype_env;
3368	vsw_t		*vswp = ldcp->ldc_vswp;
3369	lane_t		*lp = &ldcp->lane_out;
3370	uint8_t		dring_mode = lp->dring_mode;
3371
3372	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
3373
3374	/* session id check */
3375	if (ldcp->session_status & VSW_PEER_SESSION) {
3376		if (ldcp->peer_session != tagp->vio_sid) {
3377			DERR(vswp, "%s (chan %d): invalid session id (%llx)",
3378			    __func__, ldcp->ldc_id, tagp->vio_sid);
3379			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
3380			return;
3381		}
3382	}
3383
3384	/*
3385	 * It is an error for us to be getting data packets
3386	 * before the handshake has completed.
3387	 */
3388	if (ldcp->hphase != VSW_MILESTONE4) {
3389		DERR(vswp, "%s: got data packet before handshake complete "
3390		    "hphase %d (%x: %x)", __func__, ldcp->hphase,
3391		    ldcp->lane_in.lstate, ldcp->lane_out.lstate);
3392		DUMP_FLAGS(ldcp->lane_in.lstate);
3393		DUMP_FLAGS(ldcp->lane_out.lstate);
3394		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
3395		return;
3396	}
3397	if (dring_mode == VIO_TX_DRING) {
3398		/*
3399		 * To reduce the locking contention, release the ldc_cblock
3400		 * here and re-acquire it once we are done receiving packets.
3401		 * We do this only in TxDring mode to allow further callbaks to
3402		 * continue while the msg worker thread processes the messages.
3403		 * In RxDringData mode, we process the messages in the callback
3404		 * itself and wake up rcv worker thread to process only data
3405		 * info messages.
3406		 */
3407		mutex_exit(&ldcp->ldc_cblock);
3408		mutex_enter(&ldcp->ldc_rxlock);
3409	}
3410
3411	/*
3412	 * Switch on vio_subtype envelope, then let lower routines
3413	 * decide if its an INFO, ACK or NACK packet.
3414	 */
3415	if (env == VIO_DRING_DATA) {
3416		ldcp->rx_dringdata(ldcp, dpkt);
3417	} else if (env == VIO_PKT_DATA) {
3418		ldcp->rx_pktdata(ldcp, dpkt, msglen);
3419	} else if (env == VIO_DESC_DATA) {
3420		vsw_process_data_ibnd_pkt(ldcp, dpkt);
3421	} else {
3422		DERR(vswp, "%s: unknown vio_subtype_env (%x)\n",
3423		    __func__, env);
3424	}
3425
3426	if (dring_mode == VIO_TX_DRING) {
3427		mutex_exit(&ldcp->ldc_rxlock);
3428		mutex_enter(&ldcp->ldc_cblock);
3429	}
3430
3431	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
3432}
3433
3434/*
3435 * dummy pkt data handler function for vnet protocol version 1.0
3436 */
3437static void
3438vsw_process_pkt_data_nop(void *arg1, void *arg2, uint32_t msglen)
3439{
3440	_NOTE(ARGUNUSED(arg1, arg2, msglen))
3441}
3442
3443/*
3444 * This function handles raw pkt data messages received over the channel.
3445 * Currently, only priority-eth-type frames are received through this mechanism.
3446 * In this case, the frame(data) is present within the message itself which
3447 * is copied into an mblk before switching it.
3448 */
3449static void
3450vsw_process_pkt_data(void *arg1, void *arg2, uint32_t msglen)
3451{
3452	vsw_ldc_t		*ldcp = (vsw_ldc_t *)arg1;
3453	vio_raw_data_msg_t	*dpkt = (vio_raw_data_msg_t *)arg2;
3454	uint32_t		size;
3455	mblk_t			*mp;
3456	vio_mblk_t		*vmp;
3457	vsw_t			*vswp = ldcp->ldc_vswp;
3458	vgen_stats_t		*statsp = &ldcp->ldc_stats;
3459	lane_t			*lp = &ldcp->lane_out;
3460
3461	size = msglen - VIO_PKT_DATA_HDRSIZE;
3462	if (size < ETHERMIN || size > lp->mtu) {
3463		(void) atomic_inc_32(&statsp->rx_pri_fail);
3464		DWARN(vswp, "%s(%lld) invalid size(%d)\n", __func__,
3465		    ldcp->ldc_id, size);
3466		return;
3467	}
3468
3469	vmp = vio_multipool_allocb(&ldcp->vmp, size + VLAN_TAGSZ);
3470	if (vmp == NULL) {
3471		mp = allocb(size + VLAN_TAGSZ, BPRI_MED);
3472		if (mp == NULL) {
3473			(void) atomic_inc_32(&statsp->rx_pri_fail);
3474			DWARN(vswp, "%s(%lld) allocb failure, "
3475			    "unable to process priority frame\n", __func__,
3476			    ldcp->ldc_id);
3477			return;
3478		}
3479	} else {
3480		mp = vmp->mp;
3481	}
3482
3483	/* skip over the extra space for vlan tag */
3484	mp->b_rptr += VLAN_TAGSZ;
3485
3486	/* copy the frame from the payload of raw data msg into the mblk */
3487	bcopy(dpkt->data, mp->b_rptr, size);
3488	mp->b_wptr = mp->b_rptr + size;
3489
3490	if (vmp != NULL) {
3491		vmp->state = VIO_MBLK_HAS_DATA;
3492	}
3493
3494	/* update stats */
3495	(void) atomic_inc_64(&statsp->rx_pri_packets);
3496	(void) atomic_add_64(&statsp->rx_pri_bytes, size);
3497
3498	/*
3499	 * VLAN_TAGSZ of extra space has been pre-alloc'd if tag is needed.
3500	 */
3501	(void) vsw_vlan_frame_pretag(ldcp->ldc_port, VSW_VNETPORT, mp);
3502
3503	/* switch the frame to destination */
3504	vswp->vsw_switch_frame(vswp, mp, VSW_VNETPORT, ldcp->ldc_port, NULL);
3505}
3506
3507/*
3508 * Process an in-band descriptor message (most likely from
3509 * OBP).
3510 */
3511static void
3512vsw_process_data_ibnd_pkt(vsw_ldc_t *ldcp, void *pkt)
3513{
3514	vnet_ibnd_desc_t	*ibnd_desc;
3515	dring_info_t		*dp = NULL;
3516	vsw_private_desc_t	*priv_addr = NULL;
3517	vsw_t			*vswp = ldcp->ldc_vswp;
3518	mblk_t			*mp = NULL;
3519	size_t			nbytes = 0;
3520	size_t			off = 0;
3521	uint64_t		idx = 0;
3522	uint32_t		num = 1, len, datalen = 0;
3523	uint64_t		ncookies = 0;
3524	int			i, rv;
3525	int			j = 0;
3526
3527	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
3528
3529	ibnd_desc = (vnet_ibnd_desc_t *)pkt;
3530
3531	switch (ibnd_desc->hdr.tag.vio_subtype) {
3532	case VIO_SUBTYPE_INFO:
3533		D1(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
3534
3535		if (vsw_check_flag(ldcp, INBOUND, VSW_DRING_INFO_RECV))
3536			return;
3537
3538		/*
3539		 * Data is padded to align on a 8 byte boundary,
3540		 * nbytes is actual data length, i.e. minus that
3541		 * padding.
3542		 */
3543		datalen = ibnd_desc->nbytes;
3544
3545		D2(vswp, "%s(%lld): processing inband desc : "
3546		    ": datalen 0x%lx", __func__, ldcp->ldc_id, datalen);
3547
3548		ncookies = ibnd_desc->ncookies;
3549
3550		/*
3551		 * allocb(9F) returns an aligned data block. We
3552		 * need to ensure that we ask ldc for an aligned
3553		 * number of bytes also.
3554		 */
3555		nbytes = datalen;
3556		if (nbytes & 0x7) {
3557			off = 8 - (nbytes & 0x7);
3558			nbytes += off;
3559		}
3560
3561		/* alloc extra space for VLAN_TAG */
3562		mp = allocb(datalen + 8, BPRI_MED);
3563		if (mp == NULL) {
3564			DERR(vswp, "%s(%lld): allocb failed",
3565			    __func__, ldcp->ldc_id);
3566			ldcp->ldc_stats.rx_allocb_fail++;
3567			return;
3568		}
3569
3570		/* skip over the extra space for VLAN_TAG */
3571		mp->b_rptr += 8;
3572
3573		rv = ldc_mem_copy(ldcp->ldc_handle, (caddr_t)mp->b_rptr,
3574		    0, &nbytes, ibnd_desc->memcookie, (uint64_t)ncookies,
3575		    LDC_COPY_IN);
3576
3577		if (rv != 0) {
3578			DERR(vswp, "%s(%d): unable to copy in data from "
3579			    "%d cookie(s)", __func__, ldcp->ldc_id, ncookies);
3580			freemsg(mp);
3581			ldcp->ldc_stats.ierrors++;
3582			return;
3583		}
3584
3585		D2(vswp, "%s(%d): copied in %ld bytes using %d cookies",
3586		    __func__, ldcp->ldc_id, nbytes, ncookies);
3587
3588		/* point to the actual end of data */
3589		mp->b_wptr = mp->b_rptr + datalen;
3590		ldcp->ldc_stats.ipackets++;
3591		ldcp->ldc_stats.rbytes += datalen;
3592
3593		/*
3594		 * We ACK back every in-band descriptor message we process
3595		 */
3596		ibnd_desc->hdr.tag.vio_subtype = VIO_SUBTYPE_ACK;
3597		ibnd_desc->hdr.tag.vio_sid = ldcp->local_session;
3598		(void) vsw_send_msg(ldcp, (void *)ibnd_desc,
3599		    sizeof (vnet_ibnd_desc_t), B_TRUE);
3600
3601		/*
3602		 * there is extra space alloc'd for VLAN_TAG
3603		 */
3604		(void) vsw_vlan_frame_pretag(ldcp->ldc_port, VSW_VNETPORT, mp);
3605
3606		/* send the packet to be switched */
3607		vswp->vsw_switch_frame(vswp, mp, VSW_VNETPORT,
3608		    ldcp->ldc_port, NULL);
3609
3610		break;
3611
3612	case VIO_SUBTYPE_ACK:
3613		D1(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
3614
3615		/* Verify the ACK is valid */
3616		idx = ibnd_desc->hdr.desc_handle;
3617
3618		if (idx >= vsw_num_descriptors) {
3619			cmn_err(CE_WARN, "!vsw%d: corrupted ACK received "
3620			    "(idx %ld)", vswp->instance, idx);
3621			return;
3622		}
3623
3624		if ((dp = ldcp->lane_out.dringp) == NULL) {
3625			DERR(vswp, "%s: no dring found", __func__);
3626			return;
3627		}
3628
3629		len = dp->num_descriptors;
3630		/*
3631		 * If the descriptor we are being ACK'ed for is not the
3632		 * one we expected, then pkts were lost somwhere, either
3633		 * when we tried to send a msg, or a previous ACK msg from
3634		 * our peer. In either case we now reclaim the descriptors
3635		 * in the range from the last ACK we received up to the
3636		 * current ACK.
3637		 */
3638		if (idx != dp->last_ack_recv) {
3639			DWARN(vswp, "%s: dropped pkts detected, (%ld, %ld)",
3640			    __func__, dp->last_ack_recv, idx);
3641			num = idx >= dp->last_ack_recv ?
3642			    idx - dp->last_ack_recv + 1:
3643			    (len - dp->last_ack_recv + 1) + idx;
3644		}
3645
3646		/*
3647		 * When we sent the in-band message to our peer we
3648		 * marked the copy in our private ring as READY. We now
3649		 * check that the descriptor we are being ACK'ed for is in
3650		 * fact READY, i.e. it is one we have shared with our peer.
3651		 *
3652		 * If its not we flag an error, but still reset the descr
3653		 * back to FREE.
3654		 */
3655		for (i = dp->last_ack_recv; j < num; i = (i + 1) % len, j++) {
3656			priv_addr = (vsw_private_desc_t *)dp->priv_addr + i;
3657			mutex_enter(&priv_addr->dstate_lock);
3658			if (priv_addr->dstate != VIO_DESC_READY) {
3659				DERR(vswp, "%s: (%ld) desc at index %ld not "
3660				    "READY (0x%lx)", __func__,
3661				    ldcp->ldc_id, idx, priv_addr->dstate);
3662				DERR(vswp, "%s: bound %d: ncookies %ld : "
3663				    "datalen %ld", __func__,
3664				    priv_addr->bound, priv_addr->ncookies,
3665				    priv_addr->datalen);
3666			}
3667			D2(vswp, "%s: (%lld) freeing descp at %lld", __func__,
3668			    ldcp->ldc_id, idx);
3669			/* release resources associated with sent msg */
3670			priv_addr->datalen = 0;
3671			priv_addr->dstate = VIO_DESC_FREE;
3672			mutex_exit(&priv_addr->dstate_lock);
3673		}
3674		/* update to next expected value */
3675		dp->last_ack_recv = (idx + 1) % dp->num_descriptors;
3676
3677		break;
3678
3679	case VIO_SUBTYPE_NACK:
3680		DERR(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
3681
3682		/*
3683		 * We should only get a NACK if our peer doesn't like
3684		 * something about a message we have sent it. If this
3685		 * happens we just release the resources associated with
3686		 * the message. (We are relying on higher layers to decide
3687		 * whether or not to resend.
3688		 */
3689
3690		/* limit check */
3691		idx = ibnd_desc->hdr.desc_handle;
3692
3693		if (idx >= vsw_num_descriptors) {
3694			DERR(vswp, "%s: corrupted NACK received (idx %lld)",
3695			    __func__, idx);
3696			return;
3697		}
3698
3699		if ((dp = ldcp->lane_out.dringp) == NULL) {
3700			DERR(vswp, "%s: no dring found", __func__);
3701			return;
3702		}
3703
3704		priv_addr = (vsw_private_desc_t *)dp->priv_addr;
3705
3706		/* move to correct location in ring */
3707		priv_addr += idx;
3708
3709		/* release resources associated with sent msg */
3710		mutex_enter(&priv_addr->dstate_lock);
3711		priv_addr->datalen = 0;
3712		priv_addr->dstate = VIO_DESC_FREE;
3713		mutex_exit(&priv_addr->dstate_lock);
3714
3715		break;
3716
3717	default:
3718		DERR(vswp, "%s(%lld): Unknown vio_subtype %x\n", __func__,
3719		    ldcp->ldc_id, ibnd_desc->hdr.tag.vio_subtype);
3720	}
3721
3722	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
3723}
3724
3725static void
3726vsw_process_err_pkt(vsw_ldc_t *ldcp, void *epkt, vio_msg_tag_t *tagp)
3727{
3728	_NOTE(ARGUNUSED(epkt))
3729
3730	vsw_t		*vswp = ldcp->ldc_vswp;
3731	uint16_t	env = tagp->vio_subtype_env;
3732
3733	D1(vswp, "%s (%lld): enter\n", __func__, ldcp->ldc_id);
3734
3735	/*
3736	 * Error vio_subtypes have yet to be defined. So for
3737	 * the moment we can't do anything.
3738	 */
3739	D2(vswp, "%s: (%x) vio_subtype env", __func__, env);
3740
3741	D1(vswp, "%s (%lld): exit\n", __func__, ldcp->ldc_id);
3742}
3743
3744/* transmit the packet over the given port */
3745int
3746vsw_portsend(vsw_port_t *port, mblk_t *mp)
3747{
3748	mblk_t		*mpt;
3749	int		count;
3750	vsw_ldc_t 	*ldcp = port->ldcp;
3751	int		status = 0;
3752
3753	count = vsw_vlan_frame_untag(port, VSW_VNETPORT, &mp, &mpt);
3754	if (count != 0) {
3755		status = ldcp->tx(ldcp, mp, mpt, count);
3756	}
3757	return (status);
3758}
3759
3760/*
3761 * Break up frames into 2 seperate chains: normal and
3762 * priority, based on the frame type. The number of
3763 * priority frames is also counted and returned.
3764 *
3765 * Params:
3766 * 	vswp:	pointer to the instance of vsw
3767 *	np:	head of packet chain to be broken
3768 *	npt:	tail of packet chain to be broken
3769 *
3770 * Returns:
3771 *	np:	head of normal data packets
3772 *	npt:	tail of normal data packets
3773 *	hp:	head of high priority packets
3774 *	hpt:	tail of high priority packets
3775 */
3776static uint32_t
3777vsw_get_pri_packets(vsw_t *vswp, mblk_t **np, mblk_t **npt,
3778	mblk_t **hp, mblk_t **hpt)
3779{
3780	mblk_t			*tmp = NULL;
3781	mblk_t			*smp = NULL;
3782	mblk_t			*hmp = NULL;	/* high prio pkts head */
3783	mblk_t			*hmpt = NULL;	/* high prio pkts tail */
3784	mblk_t			*nmp = NULL;	/* normal pkts head */
3785	mblk_t			*nmpt = NULL;	/* normal pkts tail */
3786	uint32_t		count = 0;
3787	int			i;
3788	struct ether_header	*ehp;
3789	uint32_t		num_types;
3790	uint16_t		*types;
3791
3792	tmp = *np;
3793	while (tmp != NULL) {
3794
3795		smp = tmp;
3796		tmp = tmp->b_next;
3797		smp->b_next = NULL;
3798		smp->b_prev = NULL;
3799
3800		ehp = (struct ether_header *)smp->b_rptr;
3801		num_types = vswp->pri_num_types;
3802		types = vswp->pri_types;
3803		for (i = 0; i < num_types; i++) {
3804			if (ehp->ether_type == types[i]) {
3805				/* high priority frame */
3806
3807				if (hmp != NULL) {
3808					hmpt->b_next = smp;
3809					hmpt = smp;
3810				} else {
3811					hmp = hmpt = smp;
3812				}
3813				count++;
3814				break;
3815			}
3816		}
3817		if (i == num_types) {
3818			/* normal data frame */
3819
3820			if (nmp != NULL) {
3821				nmpt->b_next = smp;
3822				nmpt = smp;
3823			} else {
3824				nmp = nmpt = smp;
3825			}
3826		}
3827	}
3828
3829	*hp = hmp;
3830	*hpt = hmpt;
3831	*np = nmp;
3832	*npt = nmpt;
3833
3834	return (count);
3835}
3836
3837/*
3838 * Wrapper function to transmit normal and/or priority frames over the channel.
3839 */
3840static int
3841vsw_ldctx_pri(void *arg, mblk_t *mp, mblk_t *mpt, uint32_t count)
3842{
3843	vsw_ldc_t 		*ldcp = (vsw_ldc_t *)arg;
3844	mblk_t			*tmp;
3845	mblk_t			*smp;
3846	mblk_t			*hmp;	/* high prio pkts head */
3847	mblk_t			*hmpt;	/* high prio pkts tail */
3848	mblk_t			*nmp;	/* normal pkts head */
3849	mblk_t			*nmpt;	/* normal pkts tail */
3850	uint32_t		n = 0;
3851	vsw_t			*vswp = ldcp->ldc_vswp;
3852
3853	ASSERT(VSW_PRI_ETH_DEFINED(vswp));
3854	ASSERT(count != 0);
3855
3856	nmp = mp;
3857	nmpt = mpt;
3858
3859	/* gather any priority frames from the chain of packets */
3860	n = vsw_get_pri_packets(vswp, &nmp, &nmpt, &hmp, &hmpt);
3861
3862	/* transmit priority frames */
3863	tmp = hmp;
3864	while (tmp != NULL) {
3865		smp = tmp;
3866		tmp = tmp->b_next;
3867		smp->b_next = NULL;
3868		vsw_ldcsend_pkt(ldcp, smp);
3869	}
3870
3871	count -= n;
3872
3873	if (count == 0) {
3874		/* no normal data frames to process */
3875		return (0);
3876	}
3877
3878	return (vsw_ldctx(ldcp, nmp, nmpt, count));
3879}
3880
3881/*
3882 * Wrapper function to transmit normal frames over the channel.
3883 */
3884static int
3885vsw_ldctx(void *arg, mblk_t *mp, mblk_t *mpt, uint32_t count)
3886{
3887	vsw_ldc_t 	*ldcp = (vsw_ldc_t *)arg;
3888	mblk_t		*tmp = NULL;
3889
3890	ASSERT(count != 0);
3891	/*
3892	 * If the TX thread is enabled, then queue the
3893	 * ordinary frames and signal the tx thread.
3894	 */
3895	if (ldcp->tx_thread != NULL) {
3896
3897		mutex_enter(&ldcp->tx_thr_lock);
3898
3899		if ((ldcp->tx_cnt + count) >= vsw_max_tx_qcount) {
3900			/*
3901			 * If we reached queue limit,
3902			 * do not queue new packets,
3903			 * drop them.
3904			 */
3905			ldcp->ldc_stats.tx_qfull += count;
3906			mutex_exit(&ldcp->tx_thr_lock);
3907			freemsgchain(mp);
3908			goto exit;
3909		}
3910		if (ldcp->tx_mhead == NULL) {
3911			ldcp->tx_mhead = mp;
3912			ldcp->tx_mtail = mpt;
3913			cv_signal(&ldcp->tx_thr_cv);
3914		} else {
3915			ldcp->tx_mtail->b_next = mp;
3916			ldcp->tx_mtail = mpt;
3917		}
3918		ldcp->tx_cnt += count;
3919		mutex_exit(&ldcp->tx_thr_lock);
3920	} else {
3921		while (mp != NULL) {
3922			tmp = mp->b_next;
3923			mp->b_next = mp->b_prev = NULL;
3924			(void) vsw_ldcsend(ldcp, mp, 1);
3925			mp = tmp;
3926		}
3927	}
3928
3929exit:
3930	return (0);
3931}
3932
3933/*
3934 * This function transmits the frame in the payload of a raw data
3935 * (VIO_PKT_DATA) message. Thus, it provides an Out-Of-Band path to
3936 * send special frames with high priorities, without going through
3937 * the normal data path which uses descriptor ring mechanism.
3938 */
3939static void
3940vsw_ldcsend_pkt(vsw_ldc_t *ldcp, mblk_t *mp)
3941{
3942	vio_raw_data_msg_t	*pkt;
3943	mblk_t			*bp;
3944	mblk_t			*nmp = NULL;
3945	vio_mblk_t		*vmp;
3946	caddr_t			dst;
3947	uint32_t		mblksz;
3948	uint32_t		size;
3949	uint32_t		nbytes;
3950	int			rv;
3951	vsw_t			*vswp = ldcp->ldc_vswp;
3952	vgen_stats_t		*statsp = &ldcp->ldc_stats;
3953
3954	if ((!(ldcp->lane_out.lstate & VSW_LANE_ACTIVE)) ||
3955	    (ldcp->ldc_status != LDC_UP) || (ldcp->ldc_handle == NULL)) {
3956		(void) atomic_inc_32(&statsp->tx_pri_fail);
3957		DWARN(vswp, "%s(%lld) status(%d) lstate(0x%llx), dropping "
3958		    "packet\n", __func__, ldcp->ldc_id, ldcp->ldc_status,
3959		    ldcp->lane_out.lstate);
3960		goto send_pkt_exit;
3961	}
3962
3963	size = msgsize(mp);
3964
3965	/* frame size bigger than available payload len of raw data msg ? */
3966	if (size > (size_t)(ldcp->msglen - VIO_PKT_DATA_HDRSIZE)) {
3967		(void) atomic_inc_32(&statsp->tx_pri_fail);
3968		DWARN(vswp, "%s(%lld) invalid size(%d)\n", __func__,
3969		    ldcp->ldc_id, size);
3970		goto send_pkt_exit;
3971	}
3972
3973	if (size < ETHERMIN)
3974		size = ETHERMIN;
3975
3976	/* alloc space for a raw data message */
3977	vmp = vio_allocb(vswp->pri_tx_vmp);
3978	if (vmp == NULL) {
3979		(void) atomic_inc_32(&statsp->tx_pri_fail);
3980		DWARN(vswp, "vio_allocb failed\n");
3981		goto send_pkt_exit;
3982	} else {
3983		nmp = vmp->mp;
3984	}
3985	pkt = (vio_raw_data_msg_t *)nmp->b_rptr;
3986
3987	/* copy frame into the payload of raw data message */
3988	dst = (caddr_t)pkt->data;
3989	for (bp = mp; bp != NULL; bp = bp->b_cont) {
3990		mblksz = MBLKL(bp);
3991		bcopy(bp->b_rptr, dst, mblksz);
3992		dst += mblksz;
3993	}
3994
3995	vmp->state = VIO_MBLK_HAS_DATA;
3996
3997	/* setup the raw data msg */
3998	pkt->tag.vio_msgtype = VIO_TYPE_DATA;
3999	pkt->tag.vio_subtype = VIO_SUBTYPE_INFO;
4000	pkt->tag.vio_subtype_env = VIO_PKT_DATA;
4001	pkt->tag.vio_sid = ldcp->local_session;
4002	nbytes = VIO_PKT_DATA_HDRSIZE + size;
4003
4004	/* send the msg over ldc */
4005	rv = vsw_send_msg(ldcp, (void *)pkt, nbytes, B_TRUE);
4006	if (rv != 0) {
4007		(void) atomic_inc_32(&statsp->tx_pri_fail);
4008		DWARN(vswp, "%s(%lld) Error sending priority frame\n", __func__,
4009		    ldcp->ldc_id);
4010		goto send_pkt_exit;
4011	}
4012
4013	/* update stats */
4014	(void) atomic_inc_64(&statsp->tx_pri_packets);
4015	(void) atomic_add_64(&statsp->tx_pri_packets, size);
4016
4017send_pkt_exit:
4018	if (nmp != NULL)
4019		freemsg(nmp);
4020	freemsg(mp);
4021}
4022
4023/*
4024 * Transmit the packet over the given LDC channel.
4025 *
4026 * The 'retries' argument indicates how many times a packet
4027 * is retried before it is dropped. Note, the retry is done
4028 * only for a resource related failure, for all other failures
4029 * the packet is dropped immediately.
4030 */
4031static int
4032vsw_ldcsend(vsw_ldc_t *ldcp, mblk_t *mp, uint32_t retries)
4033{
4034	int		i;
4035	int		rc;
4036	int		status = 0;
4037	vsw_port_t	*port = ldcp->ldc_port;
4038	dring_info_t	*dp = NULL;
4039	lane_t		*lp = &ldcp->lane_out;
4040
4041	for (i = 0; i < retries; ) {
4042		/*
4043		 * Send the message out using the appropriate
4044		 * transmit function which will free mblock when it
4045		 * is finished with it.
4046		 */
4047		mutex_enter(&port->tx_lock);
4048		if (port->transmit != NULL) {
4049			status = (*port->transmit)(ldcp, mp);
4050		}
4051		if (status == LDC_TX_SUCCESS) {
4052			mutex_exit(&port->tx_lock);
4053			break;
4054		}
4055		i++;	/* increment the counter here */
4056
4057		/* If its the last retry, then update the oerror */
4058		if ((i == retries) && (status == LDC_TX_NORESOURCES)) {
4059			ldcp->ldc_stats.oerrors++;
4060		}
4061		mutex_exit(&port->tx_lock);
4062
4063		if (status != LDC_TX_NORESOURCES) {
4064			/*
4065			 * No retrying required for errors un-related
4066			 * to resources.
4067			 */
4068			break;
4069		}
4070		if (((dp = ldcp->lane_out.dringp) != NULL) &&
4071		    ((VSW_VER_GTEQ(ldcp, 1, 2) &&
4072		    (ldcp->lane_out.xfer_mode & VIO_DRING_MODE_V1_2)) ||
4073		    ((VSW_VER_LT(ldcp, 1, 2) &&
4074		    (ldcp->lane_out.xfer_mode == VIO_DRING_MODE_V1_0))))) {
4075
4076			/* Need to reclaim in TxDring mode. */
4077			if (lp->dring_mode == VIO_TX_DRING) {
4078				rc = vsw_reclaim_dring(dp, dp->end_idx);
4079			}
4080
4081		} else {
4082			/*
4083			 * If there is no dring or the xfer_mode is
4084			 * set to DESC_MODE(ie., OBP), then simply break here.
4085			 */
4086			break;
4087		}
4088
4089		/*
4090		 * Delay only if none were reclaimed
4091		 * and its not the last retry.
4092		 */
4093		if ((rc == 0) && (i < retries)) {
4094			delay(drv_usectohz(vsw_ldc_tx_delay));
4095		}
4096	}
4097	freemsg(mp);
4098	return (status);
4099}
4100
4101/*
4102 * Send an in-band descriptor message over ldc.
4103 */
4104static int
4105vsw_descrsend(vsw_ldc_t *ldcp, mblk_t *mp)
4106{
4107	vsw_t			*vswp = ldcp->ldc_vswp;
4108	vnet_ibnd_desc_t	ibnd_msg;
4109	vsw_private_desc_t	*priv_desc = NULL;
4110	dring_info_t		*dp = NULL;
4111	size_t			n, size = 0;
4112	caddr_t			bufp;
4113	mblk_t			*bp;
4114	int			idx, i;
4115	int			status = LDC_TX_SUCCESS;
4116	static int		warn_msg = 1;
4117	lane_t			*lp = &ldcp->lane_out;
4118
4119	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
4120
4121	ASSERT(mp != NULL);
4122
4123	if ((!(ldcp->lane_out.lstate & VSW_LANE_ACTIVE)) ||
4124	    (ldcp->ldc_status != LDC_UP) || (ldcp->ldc_handle == NULL)) {
4125		DERR(vswp, "%s(%lld) status(%d) state (0x%llx), dropping pkt",
4126		    __func__, ldcp->ldc_id, ldcp->ldc_status,
4127		    ldcp->lane_out.lstate);
4128		ldcp->ldc_stats.oerrors++;
4129		return (LDC_TX_FAILURE);
4130	}
4131
4132	/*
4133	 * The dring here is as an internal buffer,
4134	 * rather than a transfer channel.
4135	 */
4136	if ((dp = ldcp->lane_out.dringp) == NULL) {
4137		DERR(vswp, "%s(%lld): no dring for outbound lane",
4138		    __func__, ldcp->ldc_id);
4139		DERR(vswp, "%s(%lld) status(%d) state (0x%llx)", __func__,
4140		    ldcp->ldc_id, ldcp->ldc_status, ldcp->lane_out.lstate);
4141		ldcp->ldc_stats.oerrors++;
4142		return (LDC_TX_FAILURE);
4143	}
4144
4145	size = msgsize(mp);
4146	if (size > (size_t)lp->mtu) {
4147		DERR(vswp, "%s(%lld) invalid size (%ld)\n", __func__,
4148		    ldcp->ldc_id, size);
4149		ldcp->ldc_stats.oerrors++;
4150		return (LDC_TX_FAILURE);
4151	}
4152
4153	/*
4154	 * Find a free descriptor in our buffer ring
4155	 */
4156	if (vsw_dring_find_free_desc(dp, &priv_desc, &idx) != 0) {
4157		if (warn_msg) {
4158			DERR(vswp, "%s(%lld): no descriptor available for ring "
4159			    "at 0x%llx", __func__, ldcp->ldc_id, dp);
4160			warn_msg = 0;
4161		}
4162
4163		/* nothing more we can do */
4164		status = LDC_TX_NORESOURCES;
4165		goto vsw_descrsend_free_exit;
4166	} else {
4167		D2(vswp, "%s(%lld): free private descriptor found at pos "
4168		    "%ld addr 0x%x\n", __func__, ldcp->ldc_id, idx, priv_desc);
4169		warn_msg = 1;
4170	}
4171
4172	/* copy data into the descriptor */
4173	bufp = priv_desc->datap;
4174	for (bp = mp, n = 0; bp != NULL; bp = bp->b_cont) {
4175		n = MBLKL(bp);
4176		bcopy(bp->b_rptr, bufp, n);
4177		bufp += n;
4178	}
4179
4180	priv_desc->datalen = (size < (size_t)ETHERMIN) ? ETHERMIN : size;
4181
4182	/* create and send the in-band descp msg */
4183	ibnd_msg.hdr.tag.vio_msgtype = VIO_TYPE_DATA;
4184	ibnd_msg.hdr.tag.vio_subtype = VIO_SUBTYPE_INFO;
4185	ibnd_msg.hdr.tag.vio_subtype_env = VIO_DESC_DATA;
4186	ibnd_msg.hdr.tag.vio_sid = ldcp->local_session;
4187
4188	/*
4189	 * Copy the mem cookies describing the data from the
4190	 * private region of the descriptor ring into the inband
4191	 * descriptor.
4192	 */
4193	for (i = 0; i < priv_desc->ncookies; i++) {
4194		bcopy(&priv_desc->memcookie[i], &ibnd_msg.memcookie[i],
4195		    sizeof (ldc_mem_cookie_t));
4196	}
4197
4198	ibnd_msg.hdr.desc_handle = idx;
4199	ibnd_msg.ncookies = priv_desc->ncookies;
4200	ibnd_msg.nbytes = size;
4201
4202	ldcp->ldc_stats.opackets++;
4203	ldcp->ldc_stats.obytes += size;
4204
4205	(void) vsw_send_msg(ldcp, (void *)&ibnd_msg,
4206	    sizeof (vnet_ibnd_desc_t), B_TRUE);
4207
4208vsw_descrsend_free_exit:
4209
4210	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
4211	return (status);
4212}
4213
4214static void
4215vsw_send_ver(void *arg)
4216{
4217	vsw_ldc_t	*ldcp = (vsw_ldc_t *)arg;
4218	vsw_t		*vswp = ldcp->ldc_vswp;
4219	lane_t		*lp = &ldcp->lane_out;
4220	vio_ver_msg_t	ver_msg;
4221
4222	D1(vswp, "%s enter", __func__);
4223
4224	ver_msg.tag.vio_msgtype = VIO_TYPE_CTRL;
4225	ver_msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
4226	ver_msg.tag.vio_subtype_env = VIO_VER_INFO;
4227	ver_msg.tag.vio_sid = ldcp->local_session;
4228
4229	if (vsw_obp_ver_proto_workaround == B_FALSE) {
4230		ver_msg.ver_major = vsw_versions[0].ver_major;
4231		ver_msg.ver_minor = vsw_versions[0].ver_minor;
4232	} else {
4233		/* use the major,minor that we've ack'd */
4234		lane_t	*lpi = &ldcp->lane_in;
4235		ver_msg.ver_major = lpi->ver_major;
4236		ver_msg.ver_minor = lpi->ver_minor;
4237	}
4238	ver_msg.dev_class = VDEV_NETWORK_SWITCH;
4239
4240	lp->lstate |= VSW_VER_INFO_SENT;
4241	lp->ver_major = ver_msg.ver_major;
4242	lp->ver_minor = ver_msg.ver_minor;
4243
4244	DUMP_TAG(ver_msg.tag);
4245
4246	(void) vsw_send_msg(ldcp, &ver_msg, sizeof (vio_ver_msg_t), B_TRUE);
4247
4248	D1(vswp, "%s (%d): exit", __func__, ldcp->ldc_id);
4249}
4250
4251static void
4252vsw_send_attr(vsw_ldc_t *ldcp)
4253{
4254	vsw_t			*vswp = ldcp->ldc_vswp;
4255	lane_t			*lp = &ldcp->lane_out;
4256	vnet_attr_msg_t		attr_msg;
4257
4258	D1(vswp, "%s (%ld) enter", __func__, ldcp->ldc_id);
4259
4260	/*
4261	 * Subtype is set to INFO by default
4262	 */
4263	attr_msg.tag.vio_msgtype = VIO_TYPE_CTRL;
4264	attr_msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
4265	attr_msg.tag.vio_subtype_env = VIO_ATTR_INFO;
4266	attr_msg.tag.vio_sid = ldcp->local_session;
4267
4268	/* payload copied from default settings for lane */
4269	attr_msg.mtu = lp->mtu;
4270	attr_msg.addr_type = lp->addr_type;
4271	attr_msg.xfer_mode = lp->xfer_mode;
4272	attr_msg.ack_freq = lp->xfer_mode;
4273	attr_msg.options = lp->dring_mode;
4274
4275	READ_ENTER(&vswp->if_lockrw);
4276	attr_msg.addr = vnet_macaddr_strtoul((vswp->if_addr).ether_addr_octet);
4277	RW_EXIT(&vswp->if_lockrw);
4278
4279	ldcp->lane_out.lstate |= VSW_ATTR_INFO_SENT;
4280
4281	DUMP_TAG(attr_msg.tag);
4282
4283	(void) vsw_send_msg(ldcp, &attr_msg, sizeof (vnet_attr_msg_t), B_TRUE);
4284
4285	D1(vswp, "%s (%ld) exit", __func__, ldcp->ldc_id);
4286}
4287
4288static void
4289vsw_send_dring_info(vsw_ldc_t *ldcp)
4290{
4291	int		msgsize;
4292	void		*msg;
4293	vsw_t		*vswp = ldcp->ldc_vswp;
4294	vsw_port_t	*port = ldcp->ldc_port;
4295	lane_t		*lp = &ldcp->lane_out;
4296	vgen_stats_t	*statsp = &ldcp->ldc_stats;
4297
4298	D1(vswp, "%s: (%ld) enter", __func__, ldcp->ldc_id);
4299
4300	/* dring mode has been negotiated in attr phase; save in stats */
4301	statsp->dring_mode = lp->dring_mode;
4302
4303	if (lp->dring_mode == VIO_RX_DRING_DATA) {
4304		/*
4305		 * Change the transmit routine for RxDringData mode.
4306		 */
4307		port->transmit = vsw_dringsend_shm;
4308		msg = (void *) vsw_create_rx_dring_info(ldcp);
4309		if (msg == NULL) {
4310			return;
4311		}
4312		msgsize =
4313		    VNET_DRING_REG_EXT_MSG_SIZE(lp->dringp->data_ncookies);
4314		ldcp->rcv_thread = thread_create(NULL, 2 * DEFAULTSTKSZ,
4315		    vsw_ldc_rcv_worker, ldcp, 0, &p0, TS_RUN, maxclsyspri);
4316		ldcp->rx_dringdata = vsw_process_dringdata_shm;
4317	} else {
4318		msg = (void *) vsw_create_tx_dring_info(ldcp);
4319		if (msg == NULL) {
4320			return;
4321		}
4322		msgsize = sizeof (vio_dring_reg_msg_t);
4323		ldcp->msg_thread = thread_create(NULL, 2 * DEFAULTSTKSZ,
4324		    vsw_ldc_msg_worker, ldcp, 0, &p0, TS_RUN, maxclsyspri);
4325		ldcp->rx_dringdata = vsw_process_dringdata;
4326	}
4327
4328	lp->lstate |= VSW_DRING_INFO_SENT;
4329	DUMP_TAG_PTR((vio_msg_tag_t *)msg);
4330	(void) vsw_send_msg(ldcp, msg, msgsize, B_TRUE);
4331	kmem_free(msg, msgsize);
4332
4333	D1(vswp, "%s: (%ld) exit", __func__, ldcp->ldc_id);
4334}
4335
4336static void
4337vsw_send_rdx(vsw_ldc_t *ldcp)
4338{
4339	vsw_t		*vswp = ldcp->ldc_vswp;
4340	vio_rdx_msg_t	rdx_msg;
4341
4342	D1(vswp, "%s (%ld) enter", __func__, ldcp->ldc_id);
4343
4344	rdx_msg.tag.vio_msgtype = VIO_TYPE_CTRL;
4345	rdx_msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
4346	rdx_msg.tag.vio_subtype_env = VIO_RDX;
4347	rdx_msg.tag.vio_sid = ldcp->local_session;
4348
4349	ldcp->lane_in.lstate |= VSW_RDX_INFO_SENT;
4350
4351	DUMP_TAG(rdx_msg.tag);
4352
4353	(void) vsw_send_msg(ldcp, &rdx_msg, sizeof (vio_rdx_msg_t), B_TRUE);
4354
4355	D1(vswp, "%s (%ld) exit", __func__, ldcp->ldc_id);
4356}
4357
4358/*
4359 * Remove the specified address from the list of address maintained
4360 * in this port node.
4361 */
4362mcst_addr_t *
4363vsw_del_addr(uint8_t devtype, void *arg, uint64_t addr)
4364{
4365	vsw_t		*vswp = NULL;
4366	vsw_port_t	*port = NULL;
4367	mcst_addr_t	*prev_p = NULL;
4368	mcst_addr_t	*curr_p = NULL;
4369
4370	D1(NULL, "%s: enter : devtype %d : addr 0x%llx",
4371	    __func__, devtype, addr);
4372
4373	if (devtype == VSW_VNETPORT) {
4374		port = (vsw_port_t *)arg;
4375		mutex_enter(&port->mca_lock);
4376		prev_p = curr_p = port->mcap;
4377	} else {
4378		vswp = (vsw_t *)arg;
4379		mutex_enter(&vswp->mca_lock);
4380		prev_p = curr_p = vswp->mcap;
4381	}
4382
4383	while (curr_p != NULL) {
4384		if (curr_p->addr == addr) {
4385			D2(NULL, "%s: address found", __func__);
4386			/* match found */
4387			if (prev_p == curr_p) {
4388				/* list head */
4389				if (devtype == VSW_VNETPORT)
4390					port->mcap = curr_p->nextp;
4391				else
4392					vswp->mcap = curr_p->nextp;
4393			} else {
4394				prev_p->nextp = curr_p->nextp;
4395			}
4396			break;
4397		} else {
4398			prev_p = curr_p;
4399			curr_p = curr_p->nextp;
4400		}
4401	}
4402
4403	if (devtype == VSW_VNETPORT)
4404		mutex_exit(&port->mca_lock);
4405	else
4406		mutex_exit(&vswp->mca_lock);
4407
4408	D1(NULL, "%s: exit", __func__);
4409
4410	return (curr_p);
4411}
4412
4413/*
4414 * Create a ring consisting of just a private portion and link
4415 * it into the list of rings for the outbound lane.
4416 *
4417 * These type of rings are used primarily for temporary data
4418 * storage (i.e. as data buffers).
4419 */
4420void
4421vsw_create_privring(vsw_ldc_t *ldcp)
4422{
4423	dring_info_t		*dp;
4424	vsw_t			*vswp = ldcp->ldc_vswp;
4425
4426	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
4427
4428	dp = kmem_zalloc(sizeof (dring_info_t), KM_SLEEP);
4429	mutex_init(&dp->dlock, NULL, MUTEX_DRIVER, NULL);
4430	mutex_init(&dp->restart_lock, NULL, MUTEX_DRIVER, NULL);
4431	ldcp->lane_out.dringp = dp;
4432
4433	/* no public section */
4434	dp->pub_addr = NULL;
4435	dp->priv_addr = kmem_zalloc(
4436	    (sizeof (vsw_private_desc_t) * vsw_num_descriptors), KM_SLEEP);
4437	dp->num_descriptors = vsw_num_descriptors;
4438
4439	if (vsw_setup_tx_dring(ldcp, dp)) {
4440		DERR(vswp, "%s: setup of ring failed", __func__);
4441		vsw_destroy_tx_dring(ldcp);
4442		return;
4443	}
4444
4445	/* haven't used any descriptors yet */
4446	dp->end_idx = 0;
4447	dp->restart_reqd = B_TRUE;
4448
4449	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
4450}
4451
4452/*
4453 * Set the default lane attributes. These are copied into
4454 * the attr msg we send to our peer. If they are not acceptable
4455 * then (currently) the handshake ends.
4456 */
4457static void
4458vsw_set_lane_attr(vsw_t *vswp, lane_t *lp)
4459{
4460	bzero(lp, sizeof (lane_t));
4461
4462	READ_ENTER(&vswp->if_lockrw);
4463	ether_copy(&(vswp->if_addr), &(lp->addr));
4464	RW_EXIT(&vswp->if_lockrw);
4465
4466	lp->mtu = vswp->max_frame_size;
4467	lp->addr_type = ADDR_TYPE_MAC;
4468	lp->xfer_mode = VIO_DRING_MODE_V1_0;
4469	lp->ack_freq = 0;	/* for shared mode */
4470	lp->seq_num = VNET_ISS;
4471}
4472
4473/*
4474 * Map the descriptor ring exported by the peer.
4475 */
4476static dring_info_t *
4477vsw_map_dring(vsw_ldc_t *ldcp, void *pkt)
4478{
4479	dring_info_t	*dp = NULL;
4480	lane_t		*lp = &ldcp->lane_out;
4481
4482	if (lp->dring_mode == VIO_RX_DRING_DATA) {
4483		/*
4484		 * In RxDringData mode, dring that we map in
4485		 * becomes our transmit descriptor ring.
4486		 */
4487		dp =  vsw_map_tx_dring(ldcp, pkt);
4488	} else {
4489		/*
4490		 * In TxDring mode, dring that we map in
4491		 * becomes our receive descriptor ring.
4492		 */
4493		dp =  vsw_map_rx_dring(ldcp, pkt);
4494	}
4495	return (dp);
4496}
4497
4498/*
4499 * Common dring mapping function used in both TxDring and RxDringData modes.
4500 */
4501dring_info_t *
4502vsw_map_dring_cmn(vsw_ldc_t *ldcp, vio_dring_reg_msg_t *dring_pkt)
4503{
4504	int		rv;
4505	dring_info_t	*dp;
4506	ldc_mem_info_t	minfo;
4507	vsw_t		*vswp = ldcp->ldc_vswp;
4508
4509	/*
4510	 * If the dring params are unacceptable then we NACK back.
4511	 */
4512	if ((dring_pkt->num_descriptors == 0) ||
4513	    (dring_pkt->descriptor_size == 0) ||
4514	    (dring_pkt->ncookies != 1)) {
4515		DERR(vswp, "%s (%lld): invalid dring info",
4516		    __func__, ldcp->ldc_id);
4517		return (NULL);
4518	}
4519
4520	dp = kmem_zalloc(sizeof (dring_info_t), KM_SLEEP);
4521
4522	dp->num_descriptors = dring_pkt->num_descriptors;
4523	dp->descriptor_size = dring_pkt->descriptor_size;
4524	dp->options = dring_pkt->options;
4525	dp->dring_ncookies = dring_pkt->ncookies;
4526
4527	/*
4528	 * Note: should only get one cookie. Enforced in
4529	 * the ldc layer.
4530	 */
4531	bcopy(&dring_pkt->cookie[0], &dp->dring_cookie[0],
4532	    sizeof (ldc_mem_cookie_t));
4533
4534	rv = ldc_mem_dring_map(ldcp->ldc_handle, &dp->dring_cookie[0],
4535	    dp->dring_ncookies, dp->num_descriptors, dp->descriptor_size,
4536	    LDC_DIRECT_MAP, &(dp->dring_handle));
4537	if (rv != 0) {
4538		goto fail;
4539	}
4540
4541	rv = ldc_mem_dring_info(dp->dring_handle, &minfo);
4542	if (rv != 0) {
4543		goto fail;
4544	}
4545	/* store the address of the ring */
4546	dp->pub_addr = minfo.vaddr;
4547
4548	/* cache the dring mtype */
4549	dp->dring_mtype = minfo.mtype;
4550
4551	/* no private section as we are importing */
4552	dp->priv_addr = NULL;
4553
4554	/*
4555	 * Using simple mono increasing int for ident at the moment.
4556	 */
4557	dp->ident = ldcp->next_ident;
4558	ldcp->next_ident++;
4559
4560	/*
4561	 * Acknowledge it; we send back a unique dring identifier that
4562	 * the sending side will use in future to refer to this
4563	 * descriptor ring.
4564	 */
4565	dring_pkt->dring_ident = dp->ident;
4566
4567	return (dp);
4568fail:
4569	if (dp->dring_handle != NULL) {
4570		(void) ldc_mem_dring_unmap(dp->dring_handle);
4571	}
4572	kmem_free(dp, sizeof (*dp));
4573	return (NULL);
4574}
4575
4576/*
4577 * Unmap the descriptor ring exported by the peer.
4578 */
4579static void
4580vsw_unmap_dring(vsw_ldc_t *ldcp)
4581{
4582	lane_t	*lane_out = &ldcp->lane_out;
4583
4584	if (lane_out->dring_mode == VIO_RX_DRING_DATA) {
4585		vsw_unmap_tx_dring(ldcp);
4586	} else {
4587		vsw_unmap_rx_dring(ldcp);
4588	}
4589}
4590
4591/*
4592 * Map the shared memory data buffer area exported by the peer.
4593 * Used in RxDringData mode only.
4594 */
4595static int
4596vsw_map_data(vsw_ldc_t *ldcp, dring_info_t *dp, void *pkt)
4597{
4598	int			rv;
4599	vio_dring_reg_ext_msg_t	*emsg;
4600	vio_dring_reg_msg_t	*msg = pkt;
4601	uint8_t			*buf = (uint8_t *)msg->cookie;
4602	vsw_t			*vswp = ldcp->ldc_vswp;
4603	ldc_mem_info_t		minfo;
4604
4605	/* skip over dring cookies */
4606	ASSERT(msg->ncookies == 1);
4607	buf += (msg->ncookies * sizeof (ldc_mem_cookie_t));
4608
4609	emsg = (vio_dring_reg_ext_msg_t *)buf;
4610	if (emsg->data_ncookies > VNET_DATA_AREA_COOKIES) {
4611		return (1);
4612	}
4613
4614	/* save # of data area cookies */
4615	dp->data_ncookies = emsg->data_ncookies;
4616
4617	/* save data area size */
4618	dp->data_sz = emsg->data_area_size;
4619
4620	/* allocate ldc mem handle for data area */
4621	rv = ldc_mem_alloc_handle(ldcp->ldc_handle, &dp->data_handle);
4622	if (rv != 0) {
4623		cmn_err(CE_WARN, "ldc_mem_alloc_handle failed\n");
4624		DWARN(vswp, "%s (%lld) ldc_mem_alloc_handle() failed: %d\n",
4625		    __func__, ldcp->ldc_id, rv);
4626		return (1);
4627	}
4628
4629	/* map the data area */
4630	rv = ldc_mem_map(dp->data_handle, emsg->data_cookie,
4631	    emsg->data_ncookies, LDC_DIRECT_MAP, LDC_MEM_R,
4632	    (caddr_t *)&dp->data_addr, NULL);
4633	if (rv != 0) {
4634		cmn_err(CE_WARN, "ldc_mem_map failed\n");
4635		DWARN(vswp, "%s (%lld) ldc_mem_map() failed: %d\n",
4636		    __func__, ldcp->ldc_id, rv);
4637		return (1);
4638	}
4639
4640	/* get the map info */
4641	rv = ldc_mem_info(dp->data_handle, &minfo);
4642	if (rv != 0) {
4643		cmn_err(CE_WARN, "ldc_mem_info failed\n");
4644		DWARN(vswp, "%s (%lld) ldc_mem_info() failed: %d\n",
4645		    __func__, ldcp->ldc_id, rv);
4646		return (1);
4647	}
4648
4649	if (minfo.mtype != LDC_DIRECT_MAP) {
4650		DWARN(vswp, "%s (%lld) mtype(%d) is not direct map\n",
4651		    __func__, ldcp->ldc_id, minfo.mtype);
4652		return (1);
4653	}
4654
4655	/* allocate memory for data area cookies */
4656	dp->data_cookie = kmem_zalloc(emsg->data_ncookies *
4657	    sizeof (ldc_mem_cookie_t), KM_SLEEP);
4658
4659	/* save data area cookies */
4660	bcopy(emsg->data_cookie, dp->data_cookie,
4661	    emsg->data_ncookies * sizeof (ldc_mem_cookie_t));
4662
4663	return (0);
4664}
4665
4666/*
4667 * Reset and free all the resources associated with the channel.
4668 */
4669static void
4670vsw_free_lane_resources(vsw_ldc_t *ldcp, uint64_t dir)
4671{
4672	lane_t	*lp;
4673
4674	D1(ldcp->ldc_vswp, "%s (%lld): enter", __func__, ldcp->ldc_id);
4675
4676	if (dir == INBOUND) {
4677		D2(ldcp->ldc_vswp, "%s: freeing INBOUND lane"
4678		    " of channel %lld", __func__, ldcp->ldc_id);
4679		lp = &ldcp->lane_in;
4680	} else {
4681		D2(ldcp->ldc_vswp, "%s: freeing OUTBOUND lane"
4682		    " of channel %lld", __func__, ldcp->ldc_id);
4683		lp = &ldcp->lane_out;
4684	}
4685
4686	lp->lstate = VSW_LANE_INACTIV;
4687	lp->seq_num = VNET_ISS;
4688
4689	if (dir == INBOUND) {
4690		/* Unmap the remote dring which is imported from the peer */
4691		vsw_unmap_dring(ldcp);
4692	} else {
4693		/* Destroy the local dring which is exported to the peer */
4694		vsw_destroy_dring(ldcp);
4695	}
4696
4697	D1(ldcp->ldc_vswp, "%s (%lld): exit", __func__, ldcp->ldc_id);
4698}
4699
4700/*
4701 * Destroy the descriptor ring.
4702 */
4703static void
4704vsw_destroy_dring(vsw_ldc_t *ldcp)
4705{
4706	lane_t	*lp = &ldcp->lane_out;
4707
4708	if (lp->dring_mode == VIO_RX_DRING_DATA) {
4709		vsw_destroy_rx_dring(ldcp);
4710	} else {
4711		vsw_destroy_tx_dring(ldcp);
4712	}
4713}
4714
4715/*
4716 * vsw_ldc_tx_worker -- A per LDC worker thread to transmit data.
4717 * This thread is woken up by the vsw_portsend to transmit
4718 * packets.
4719 */
4720static void
4721vsw_ldc_tx_worker(void *arg)
4722{
4723	callb_cpr_t	cprinfo;
4724	vsw_ldc_t *ldcp = (vsw_ldc_t *)arg;
4725	vsw_t *vswp = ldcp->ldc_vswp;
4726	mblk_t *mp;
4727	mblk_t *tmp;
4728
4729	D1(vswp, "%s(%lld):enter\n", __func__, ldcp->ldc_id);
4730	CALLB_CPR_INIT(&cprinfo, &ldcp->tx_thr_lock, callb_generic_cpr,
4731	    "vnet_tx_thread");
4732	mutex_enter(&ldcp->tx_thr_lock);
4733	while (!(ldcp->tx_thr_flags & VSW_WTHR_STOP)) {
4734
4735		CALLB_CPR_SAFE_BEGIN(&cprinfo);
4736		/*
4737		 * Wait until the data is received or a stop
4738		 * request is received.
4739		 */
4740		while (!(ldcp->tx_thr_flags & VSW_WTHR_STOP) &&
4741		    (ldcp->tx_mhead == NULL)) {
4742			cv_wait(&ldcp->tx_thr_cv, &ldcp->tx_thr_lock);
4743		}
4744		CALLB_CPR_SAFE_END(&cprinfo, &ldcp->tx_thr_lock)
4745
4746		/*
4747		 * First process the stop request.
4748		 */
4749		if (ldcp->tx_thr_flags & VSW_WTHR_STOP) {
4750			D2(vswp, "%s(%lld):tx thread stopped\n",
4751			    __func__, ldcp->ldc_id);
4752			break;
4753		}
4754		mp = ldcp->tx_mhead;
4755		ldcp->tx_mhead = ldcp->tx_mtail = NULL;
4756		ldcp->tx_cnt = 0;
4757		mutex_exit(&ldcp->tx_thr_lock);
4758		D2(vswp, "%s(%lld):calling vsw_ldcsend\n",
4759		    __func__, ldcp->ldc_id);
4760		while (mp != NULL) {
4761			tmp = mp->b_next;
4762			mp->b_next = mp->b_prev = NULL;
4763			(void) vsw_ldcsend(ldcp, mp, vsw_ldc_tx_retries);
4764			mp = tmp;
4765		}
4766		mutex_enter(&ldcp->tx_thr_lock);
4767	}
4768
4769	/*
4770	 * Update the run status and wakeup the thread that
4771	 * has sent the stop request.
4772	 */
4773	ldcp->tx_thr_flags &= ~VSW_WTHR_STOP;
4774	ldcp->tx_thread = NULL;
4775	CALLB_CPR_EXIT(&cprinfo);
4776	D1(vswp, "%s(%lld):exit\n", __func__, ldcp->ldc_id);
4777	thread_exit();
4778}
4779
4780/* vsw_stop_tx_thread -- Co-ordinate with receive thread to stop it */
4781static void
4782vsw_stop_tx_thread(vsw_ldc_t *ldcp)
4783{
4784	kt_did_t	tid = 0;
4785	vsw_t		*vswp = ldcp->ldc_vswp;
4786
4787	D1(vswp, "%s(%lld):enter\n", __func__, ldcp->ldc_id);
4788	/*
4789	 * Send a stop request by setting the stop flag and
4790	 * wait until the receive thread stops.
4791	 */
4792	mutex_enter(&ldcp->tx_thr_lock);
4793	if (ldcp->tx_thread != NULL) {
4794		tid = ldcp->tx_thread->t_did;
4795		ldcp->tx_thr_flags |= VSW_WTHR_STOP;
4796		cv_signal(&ldcp->tx_thr_cv);
4797	}
4798	mutex_exit(&ldcp->tx_thr_lock);
4799
4800	if (tid != 0) {
4801		thread_join(tid);
4802	}
4803
4804	D1(vswp, "%s(%lld):exit\n", __func__, ldcp->ldc_id);
4805}
4806
4807static int
4808vsw_mapin_avail(vsw_ldc_t *ldcp)
4809{
4810	int		rv;
4811	ldc_info_t	info;
4812	uint64_t	mapin_sz_req;
4813	uint64_t	dblk_sz;
4814	vsw_t		*vswp = ldcp->ldc_vswp;
4815
4816	rv = ldc_info(ldcp->ldc_handle, &info);
4817	if (rv != 0) {
4818		return (B_FALSE);
4819	}
4820
4821	dblk_sz = RXDRING_DBLK_SZ(vswp->max_frame_size);
4822	mapin_sz_req = (VSW_RXDRING_NRBUFS * dblk_sz);
4823
4824	if (info.direct_map_size_max >= mapin_sz_req) {
4825		return (B_TRUE);
4826	}
4827
4828	return (B_FALSE);
4829}
4830
4831/*
4832 * Debugging routines
4833 */
4834static void
4835display_state(void)
4836{
4837	vsw_t		*vswp;
4838	vsw_port_list_t	*plist;
4839	vsw_port_t 	*port;
4840	vsw_ldc_t 	*ldcp;
4841	extern vsw_t 	*vsw_head;
4842
4843	cmn_err(CE_NOTE, "***** system state *****");
4844
4845	for (vswp = vsw_head; vswp; vswp = vswp->next) {
4846		plist = &vswp->plist;
4847		READ_ENTER(&plist->lockrw);
4848		cmn_err(CE_CONT, "vsw instance %d has %d ports attached\n",
4849		    vswp->instance, plist->num_ports);
4850
4851		for (port = plist->head; port != NULL; port = port->p_next) {
4852			cmn_err(CE_CONT, "port %d : %d ldcs attached\n",
4853			    port->p_instance, port->num_ldcs);
4854			ldcp = port->ldcp;
4855			cmn_err(CE_CONT, "chan %lu : dev %d : "
4856			    "status %d : phase %u\n",
4857			    ldcp->ldc_id, ldcp->dev_class,
4858			    ldcp->ldc_status, ldcp->hphase);
4859			cmn_err(CE_CONT, "chan %lu : lsession %lu : "
4860			    "psession %lu\n", ldcp->ldc_id,
4861			    ldcp->local_session, ldcp->peer_session);
4862
4863			cmn_err(CE_CONT, "Inbound lane:\n");
4864			display_lane(&ldcp->lane_in);
4865			cmn_err(CE_CONT, "Outbound lane:\n");
4866			display_lane(&ldcp->lane_out);
4867		}
4868		RW_EXIT(&plist->lockrw);
4869	}
4870	cmn_err(CE_NOTE, "***** system state *****");
4871}
4872
4873static void
4874display_lane(lane_t *lp)
4875{
4876	dring_info_t	*drp = lp->dringp;
4877
4878	cmn_err(CE_CONT, "ver 0x%x:0x%x : state %lx : mtu 0x%lx\n",
4879	    lp->ver_major, lp->ver_minor, lp->lstate, lp->mtu);
4880	cmn_err(CE_CONT, "addr_type %d : addr 0x%lx : xmode %d\n",
4881	    lp->addr_type, lp->addr, lp->xfer_mode);
4882	cmn_err(CE_CONT, "dringp 0x%lx\n", (uint64_t)lp->dringp);
4883
4884	cmn_err(CE_CONT, "Dring info:\n");
4885	cmn_err(CE_CONT, "\tnum_desc %u : dsize %u\n",
4886	    drp->num_descriptors, drp->descriptor_size);
4887	cmn_err(CE_CONT, "\thandle 0x%lx\n", drp->dring_handle);
4888	cmn_err(CE_CONT, "\tpub_addr 0x%lx : priv_addr 0x%lx\n",
4889	    (uint64_t)drp->pub_addr, (uint64_t)drp->priv_addr);
4890	cmn_err(CE_CONT, "\tident 0x%lx : end_idx %lu\n",
4891	    drp->ident, drp->end_idx);
4892	display_ring(drp);
4893}
4894
4895static void
4896display_ring(dring_info_t *dringp)
4897{
4898	uint64_t		i;
4899	uint64_t		priv_count = 0;
4900	uint64_t		pub_count = 0;
4901	vnet_public_desc_t	*pub_addr = NULL;
4902	vsw_private_desc_t	*priv_addr = NULL;
4903
4904	for (i = 0; i < vsw_num_descriptors; i++) {
4905		if (dringp->pub_addr != NULL) {
4906			pub_addr = (vnet_public_desc_t *)dringp->pub_addr + i;
4907
4908			if (pub_addr->hdr.dstate == VIO_DESC_FREE)
4909				pub_count++;
4910		}
4911
4912		if (dringp->priv_addr != NULL) {
4913			priv_addr = (vsw_private_desc_t *)dringp->priv_addr + i;
4914
4915			if (priv_addr->dstate == VIO_DESC_FREE)
4916				priv_count++;
4917		}
4918	}
4919	cmn_err(CE_CONT, "\t%lu elements: %lu priv free: %lu pub free\n",
4920	    i, priv_count, pub_count);
4921}
4922
4923static void
4924dump_flags(uint64_t state)
4925{
4926	int	i;
4927
4928	typedef struct flag_name {
4929		int	flag_val;
4930		char	*flag_name;
4931	} flag_name_t;
4932
4933	flag_name_t	flags[] = {
4934		VSW_VER_INFO_SENT, "VSW_VER_INFO_SENT",
4935		VSW_VER_INFO_RECV, "VSW_VER_INFO_RECV",
4936		VSW_VER_ACK_RECV, "VSW_VER_ACK_RECV",
4937		VSW_VER_ACK_SENT, "VSW_VER_ACK_SENT",
4938		VSW_VER_NACK_RECV, "VSW_VER_NACK_RECV",
4939		VSW_VER_NACK_SENT, "VSW_VER_NACK_SENT",
4940		VSW_ATTR_INFO_SENT, "VSW_ATTR_INFO_SENT",
4941		VSW_ATTR_INFO_RECV, "VSW_ATTR_INFO_RECV",
4942		VSW_ATTR_ACK_SENT, "VSW_ATTR_ACK_SENT",
4943		VSW_ATTR_ACK_RECV, "VSW_ATTR_ACK_RECV",
4944		VSW_ATTR_NACK_SENT, "VSW_ATTR_NACK_SENT",
4945		VSW_ATTR_NACK_RECV, "VSW_ATTR_NACK_RECV",
4946		VSW_DRING_INFO_SENT, "VSW_DRING_INFO_SENT",
4947		VSW_DRING_INFO_RECV, "VSW_DRING_INFO_RECV",
4948		VSW_DRING_ACK_SENT, "VSW_DRING_ACK_SENT",
4949		VSW_DRING_ACK_RECV, "VSW_DRING_ACK_RECV",
4950		VSW_DRING_NACK_SENT, "VSW_DRING_NACK_SENT",
4951		VSW_DRING_NACK_RECV, "VSW_DRING_NACK_RECV",
4952		VSW_RDX_INFO_SENT, "VSW_RDX_INFO_SENT",
4953		VSW_RDX_INFO_RECV, "VSW_RDX_INFO_RECV",
4954		VSW_RDX_ACK_SENT, "VSW_RDX_ACK_SENT",
4955		VSW_RDX_ACK_RECV, "VSW_RDX_ACK_RECV",
4956		VSW_RDX_NACK_SENT, "VSW_RDX_NACK_SENT",
4957		VSW_RDX_NACK_RECV, "VSW_RDX_NACK_RECV",
4958		VSW_MCST_INFO_SENT, "VSW_MCST_INFO_SENT",
4959		VSW_MCST_INFO_RECV, "VSW_MCST_INFO_RECV",
4960		VSW_MCST_ACK_SENT, "VSW_MCST_ACK_SENT",
4961		VSW_MCST_ACK_RECV, "VSW_MCST_ACK_RECV",
4962		VSW_MCST_NACK_SENT, "VSW_MCST_NACK_SENT",
4963		VSW_MCST_NACK_RECV, "VSW_MCST_NACK_RECV",
4964		VSW_LANE_ACTIVE, "VSW_LANE_ACTIVE"};
4965
4966	DERR(NULL, "DUMP_FLAGS: %llx\n", state);
4967	for (i = 0; i < sizeof (flags)/sizeof (flag_name_t); i++) {
4968		if (state & flags[i].flag_val)
4969			DERR(NULL, "DUMP_FLAGS %s", flags[i].flag_name);
4970	}
4971}
4972