vsw_ldc.c revision 6495:1a95fa8c7c94
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24 * Use is subject to license terms.
25 */
26
27#pragma ident	"%Z%%M%	%I%	%E% SMI"
28
29#include <sys/types.h>
30#include <sys/errno.h>
31#include <sys/debug.h>
32#include <sys/time.h>
33#include <sys/sysmacros.h>
34#include <sys/systm.h>
35#include <sys/user.h>
36#include <sys/stropts.h>
37#include <sys/stream.h>
38#include <sys/strlog.h>
39#include <sys/strsubr.h>
40#include <sys/cmn_err.h>
41#include <sys/cpu.h>
42#include <sys/kmem.h>
43#include <sys/conf.h>
44#include <sys/ddi.h>
45#include <sys/sunddi.h>
46#include <sys/ksynch.h>
47#include <sys/stat.h>
48#include <sys/kstat.h>
49#include <sys/vtrace.h>
50#include <sys/strsun.h>
51#include <sys/dlpi.h>
52#include <sys/ethernet.h>
53#include <net/if.h>
54#include <sys/varargs.h>
55#include <sys/machsystm.h>
56#include <sys/modctl.h>
57#include <sys/modhash.h>
58#include <sys/mac.h>
59#include <sys/mac_ether.h>
60#include <sys/taskq.h>
61#include <sys/note.h>
62#include <sys/mach_descrip.h>
64#include <sys/mdeg.h>
65#include <sys/ldc.h>
66#include <sys/vsw_fdb.h>
67#include <sys/vsw.h>
68#include <sys/vio_mailbox.h>
69#include <sys/vnet_mailbox.h>
70#include <sys/vnet_common.h>
71#include <sys/vio_util.h>
72#include <sys/sdt.h>
73#include <sys/atomic.h>
74#include <sys/callb.h>
75#include <sys/vlan.h>
76
77/* Port add/deletion/etc routines */
78static	int vsw_port_delete(vsw_port_t *port);
79static	int vsw_ldc_attach(vsw_port_t *port, uint64_t ldc_id);
80static	int vsw_ldc_detach(vsw_port_t *port, uint64_t ldc_id);
81static	int vsw_init_ldcs(vsw_port_t *port);
82static	int vsw_uninit_ldcs(vsw_port_t *port);
83static	int vsw_ldc_init(vsw_ldc_t *ldcp);
84static	int vsw_ldc_uninit(vsw_ldc_t *ldcp);
85static	int vsw_drain_ldcs(vsw_port_t *port);
86static	int vsw_drain_port_taskq(vsw_port_t *port);
87static	void vsw_marker_task(void *);
88static	int vsw_plist_del_node(vsw_t *, vsw_port_t *port);
89int vsw_detach_ports(vsw_t *vswp);
90int vsw_port_add(vsw_t *vswp, md_t *mdp, mde_cookie_t *node);
91mcst_addr_t *vsw_del_addr(uint8_t devtype, void *arg, uint64_t addr);
92int vsw_port_detach(vsw_t *vswp, int p_instance);
93int vsw_portsend(vsw_port_t *port, mblk_t *mp, mblk_t *mpt, uint32_t count);
94int vsw_port_attach(vsw_port_t *portp);
95vsw_port_t *vsw_lookup_port(vsw_t *vswp, int p_instance);
96void vsw_vlan_unaware_port_reset(vsw_port_t *portp);
97int vsw_send_msg(vsw_ldc_t *, void *, int, boolean_t);
98void vsw_hio_port_reset(vsw_port_t *portp);
99
100/* Interrupt routines */
101static	uint_t vsw_ldc_cb(uint64_t cb, caddr_t arg);
102
103/* Handshake routines */
104static	void vsw_ldc_reinit(vsw_ldc_t *);
105static	void vsw_process_conn_evt(vsw_ldc_t *, uint16_t);
106static	void vsw_conn_task(void *);
107static	int vsw_check_flag(vsw_ldc_t *, int, uint64_t);
108static	void vsw_next_milestone(vsw_ldc_t *);
109static	int vsw_supported_version(vio_ver_msg_t *);
110static	void vsw_set_vnet_proto_ops(vsw_ldc_t *ldcp);
111static	void vsw_reset_vnet_proto_ops(vsw_ldc_t *ldcp);
112
113/* Data processing routines */
114static void vsw_process_pkt(void *);
115static void vsw_dispatch_ctrl_task(vsw_ldc_t *, void *, vio_msg_tag_t *);
116static void vsw_process_ctrl_pkt(void *);
117static void vsw_process_ctrl_ver_pkt(vsw_ldc_t *, void *);
118static void vsw_process_ctrl_attr_pkt(vsw_ldc_t *, void *);
119static void vsw_process_ctrl_mcst_pkt(vsw_ldc_t *, void *);
120static void vsw_process_ctrl_dring_reg_pkt(vsw_ldc_t *, void *);
121static void vsw_process_ctrl_dring_unreg_pkt(vsw_ldc_t *, void *);
122static void vsw_process_ctrl_rdx_pkt(vsw_ldc_t *, void *);
123static void vsw_process_data_pkt(vsw_ldc_t *, void *, vio_msg_tag_t *,
124	uint32_t);
125static void vsw_process_data_dring_pkt(vsw_ldc_t *, void *);
126static void vsw_process_pkt_data_nop(void *, void *, uint32_t);
127static void vsw_process_pkt_data(void *, void *, uint32_t);
128static void vsw_process_data_ibnd_pkt(vsw_ldc_t *, void *);
129static void vsw_process_err_pkt(vsw_ldc_t *, void *, vio_msg_tag_t *);
130
131/* Switching/data transmit routines */
132static	int vsw_dringsend(vsw_ldc_t *, mblk_t *);
133static	int vsw_descrsend(vsw_ldc_t *, mblk_t *);
134static void vsw_ldcsend_pkt(vsw_ldc_t *ldcp, mblk_t *mp);
135static int vsw_ldcsend(vsw_ldc_t *ldcp, mblk_t *mp, uint32_t retries);
136static int vsw_ldctx_pri(void *arg, mblk_t *mp, mblk_t *mpt, uint32_t count);
137static int vsw_ldctx(void *arg, mblk_t *mp, mblk_t *mpt, uint32_t count);
138
139/* Packet creation routines */
140static void vsw_send_ver(void *);
141static void vsw_send_attr(vsw_ldc_t *);
142static vio_dring_reg_msg_t *vsw_create_dring_info_pkt(vsw_ldc_t *);
143static void vsw_send_dring_info(vsw_ldc_t *);
144static void vsw_send_rdx(vsw_ldc_t *);
145
146/* Dring routines */
147static dring_info_t *vsw_create_dring(vsw_ldc_t *);
148static void vsw_create_privring(vsw_ldc_t *);
149static int vsw_setup_ring(vsw_ldc_t *ldcp, dring_info_t *dp);
150static int vsw_dring_find_free_desc(dring_info_t *, vsw_private_desc_t **,
151    int *);
152static dring_info_t *vsw_ident2dring(lane_t *, uint64_t);
153static int vsw_reclaim_dring(dring_info_t *dp, int start);
154
155static void vsw_set_lane_attr(vsw_t *, lane_t *);
156static int vsw_check_attr(vnet_attr_msg_t *, vsw_ldc_t *);
157static int vsw_dring_match(dring_info_t *dp, vio_dring_reg_msg_t *msg);
158static int vsw_mem_cookie_match(ldc_mem_cookie_t *, ldc_mem_cookie_t *);
159static int vsw_check_dring_info(vio_dring_reg_msg_t *);
160
161/* Rcv/Tx thread routines */
162static void vsw_stop_tx_thread(vsw_ldc_t *ldcp);
163static void vsw_ldc_tx_worker(void *arg);
164static void vsw_stop_rx_thread(vsw_ldc_t *ldcp);
165static void vsw_ldc_rx_worker(void *arg);
166
167/* Misc support routines */
168static	caddr_t vsw_print_ethaddr(uint8_t *addr, char *ebuf);
169static void vsw_free_lane_resources(vsw_ldc_t *, uint64_t);
170static int vsw_free_ring(dring_info_t *);
171static void vsw_save_lmacaddr(vsw_t *vswp, uint64_t macaddr);
172static int vsw_get_same_dest_list(struct ether_header *ehp,
173    mblk_t **rhead, mblk_t **rtail, mblk_t **mpp);
174static mblk_t *vsw_dupmsgchain(mblk_t *mp);
175
176/* Debugging routines */
177static void dump_flags(uint64_t);
178static void display_state(void);
179static void display_lane(lane_t *);
180static void display_ring(dring_info_t *);
181
182/*
183 * Functions imported from other files.
184 */
185extern int vsw_set_hw(vsw_t *, vsw_port_t *, int);
186extern int vsw_unset_hw(vsw_t *, vsw_port_t *, int);
187extern void vsw_reconfig_hw(vsw_t *);
188extern int vsw_add_rem_mcst(vnet_mcast_msg_t *mcst_pkt, vsw_port_t *port);
189extern void vsw_del_mcst_port(vsw_port_t *port);
190extern int vsw_add_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg);
191extern int vsw_del_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg);
192extern void vsw_fdbe_add(vsw_t *vswp, void *port);
193extern void vsw_fdbe_del(vsw_t *vswp, struct ether_addr *eaddr);
194extern void vsw_create_vlans(void *arg, int type);
195extern void vsw_destroy_vlans(void *arg, int type);
196extern void vsw_vlan_add_ids(void *arg, int type);
197extern void vsw_vlan_remove_ids(void *arg, int type);
198extern boolean_t vsw_frame_lookup_vid(void *arg, int caller,
199	struct ether_header *ehp, uint16_t *vidp);
200extern mblk_t *vsw_vlan_frame_pretag(void *arg, int type, mblk_t *mp);
201extern uint32_t vsw_vlan_frame_untag(void *arg, int type, mblk_t **np,
202	mblk_t **npt);
203extern boolean_t vsw_vlan_lookup(mod_hash_t *vlan_hashp, uint16_t vid);
204extern void vsw_hio_start(vsw_t *vswp, vsw_ldc_t *ldcp);
205extern void vsw_hio_stop(vsw_t *vswp, vsw_ldc_t *ldcp);
206extern void vsw_process_dds_msg(vsw_t *vswp, vsw_ldc_t *ldcp, void *msg);
207extern void vsw_hio_stop_port(vsw_port_t *portp);
208
209#define	VSW_NUM_VMPOOLS		3	/* number of vio mblk pools */
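/*
 * The three pools correspond to the three receive mblk size/count
 * tunable pairs declared below (vsw_mblk_size1/vsw_num_mblks1 through
 * vsw_mblk_size3/vsw_num_mblks3); see the vio_init_multipools() call
 * in vsw_ldc_attach().
 */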
210
211/*
212 * Tunables used in this file.
213 */
214extern int vsw_num_handshakes;
215extern int vsw_wretries;
216extern int vsw_desc_delay;
217extern int vsw_read_attempts;
218extern int vsw_ldc_tx_delay;
219extern int vsw_ldc_tx_retries;
220extern boolean_t vsw_ldc_rxthr_enabled;
221extern boolean_t vsw_ldc_txthr_enabled;
222extern uint32_t vsw_ntxds;
223extern uint32_t vsw_max_tx_qcount;
224extern uint32_t vsw_chain_len;
225extern uint32_t vsw_mblk_size1;
226extern uint32_t vsw_mblk_size2;
227extern uint32_t vsw_mblk_size3;
228extern uint32_t vsw_num_mblks1;
229extern uint32_t vsw_num_mblks2;
230extern uint32_t vsw_num_mblks3;
231extern boolean_t vsw_obp_ver_proto_workaround;
232
233#define	LDC_ENTER_LOCK(ldcp)	\
234				mutex_enter(&((ldcp)->ldc_cblock));\
235				mutex_enter(&((ldcp)->ldc_rxlock));\
236				mutex_enter(&((ldcp)->ldc_txlock));
237#define	LDC_EXIT_LOCK(ldcp)	\
238				mutex_exit(&((ldcp)->ldc_txlock));\
239				mutex_exit(&((ldcp)->ldc_rxlock));\
240				mutex_exit(&((ldcp)->ldc_cblock));
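/*
 * The macros above encode the channel lock ordering: acquire in the
 * order cblock -> rxlock -> txlock and release in the reverse order.
 * Code paths that take only a subset of these locks should honour the
 * same ordering to avoid deadlock.
 */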
241
242#define	VSW_VER_EQ(ldcp, major, minor)	\
243	((ldcp)->lane_out.ver_major == (major) &&	\
244	    (ldcp)->lane_out.ver_minor == (minor))
245
246#define	VSW_VER_LT(ldcp, major, minor)	\
247	(((ldcp)->lane_out.ver_major < (major)) ||	\
248	    ((ldcp)->lane_out.ver_major == (major) &&	\
249	    (ldcp)->lane_out.ver_minor < (minor)))
250
251#define	VSW_VER_GTEQ(ldcp, major, minor)	\
252	(((ldcp)->lane_out.ver_major > (major)) ||	\
253	    ((ldcp)->lane_out.ver_major == (major) &&	\
254	    (ldcp)->lane_out.ver_minor >= (minor)))
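/*
 * The version macros above test the version negotiated on the outbound
 * lane; for example, VSW_VER_GTEQ(ldcp, 1, 3) is true once version 1.3
 * or later has been agreed with the peer.
 */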
255
256/* supported versions */
257static	ver_sup_t	vsw_versions[] = { {1, 3} };
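/*
 * Note that vsw_supported_version() below assumes this table is sorted
 * in decreasing order of major version, so any new entries should be
 * added accordingly.
 */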
258
259/*
260 * For the moment the state dump routines have their own
261 * private flag.
262 */
263#define	DUMP_STATE	0
264
265#if DUMP_STATE
266
267#define	DUMP_TAG(tag) \
268{			\
269	D1(NULL, "DUMP_TAG: type 0x%llx", (tag).vio_msgtype); \
270	D1(NULL, "DUMP_TAG: stype 0x%llx", (tag).vio_subtype);	\
271	D1(NULL, "DUMP_TAG: senv 0x%llx", (tag).vio_subtype_env);	\
272}
273
274#define	DUMP_TAG_PTR(tag) \
275{			\
276	D1(NULL, "DUMP_TAG: type 0x%llx", (tag)->vio_msgtype); \
277	D1(NULL, "DUMP_TAG: stype 0x%llx", (tag)->vio_subtype);	\
278	D1(NULL, "DUMP_TAG: senv 0x%llx", (tag)->vio_subtype_env);	\
279}
280
281#define	DUMP_FLAGS(flags) dump_flags(flags);
282#define	DISPLAY_STATE()	display_state()
283
284#else
285
286#define	DUMP_TAG(tag)
287#define	DUMP_TAG_PTR(tag)
288#define	DUMP_FLAGS(state)
289#define	DISPLAY_STATE()
290
291#endif	/* DUMP_STATE */
292
293/*
294 * Attach the specified port.
295 *
296 * Returns 0 on success, 1 on failure.
297 */
298int
299vsw_port_attach(vsw_port_t *port)
300{
301	vsw_t			*vswp = port->p_vswp;
302	vsw_port_list_t		*plist = &vswp->plist;
303	vsw_port_t		*p, **pp;
304	int			i;
305	int			nids = port->num_ldcs;
306	uint64_t		*ldcids;
307
308	D1(vswp, "%s: enter : port %d", __func__, port->p_instance);
309
310	/* port already exists? */
311	READ_ENTER(&plist->lockrw);
312	for (p = plist->head; p != NULL; p = p->p_next) {
313		if (p->p_instance == port->p_instance) {
314			DWARN(vswp, "%s: port instance %d already attached",
315			    __func__, p->p_instance);
316			RW_EXIT(&plist->lockrw);
317			return (1);
318		}
319	}
320	RW_EXIT(&plist->lockrw);
321
322	rw_init(&port->p_ldclist.lockrw, NULL, RW_DRIVER, NULL);
323
324	mutex_init(&port->tx_lock, NULL, MUTEX_DRIVER, NULL);
325	mutex_init(&port->mca_lock, NULL, MUTEX_DRIVER, NULL);
326
327	mutex_init(&port->state_lock, NULL, MUTEX_DRIVER, NULL);
328	cv_init(&port->state_cv, NULL, CV_DRIVER, NULL);
329	port->state = VSW_PORT_INIT;
330
331	D2(vswp, "%s: %d nids", __func__, nids);
332	ldcids = port->ldc_ids;
333	for (i = 0; i < nids; i++) {
334		D2(vswp, "%s: ldcid (%llx)", __func__, (uint64_t)ldcids[i]);
335		if (vsw_ldc_attach(port, (uint64_t)ldcids[i]) != 0) {
336			DERR(vswp, "%s: ldc_attach failed", __func__);
337
338			rw_destroy(&port->p_ldclist.lockrw);
339
340			cv_destroy(&port->state_cv);
341			mutex_destroy(&port->state_lock);
342
343			mutex_destroy(&port->tx_lock);
344			mutex_destroy(&port->mca_lock);
345			kmem_free(port, sizeof (vsw_port_t));
346			return (1);
347		}
348	}
349
350	if (vswp->switching_setup_done == B_TRUE) {
351		/*
352		 * If the underlying physical device has been set up,
353		 * program the mac address of this port in it.
354		 * Otherwise, port macaddr will be set after the physical
355		 * device is successfully set up by the timeout handler.
356		 */
357		mutex_enter(&vswp->hw_lock);
358		(void) vsw_set_hw(vswp, port, VSW_VNETPORT);
359		mutex_exit(&vswp->hw_lock);
360	}
361
362	/* create the fdb entry for this port/mac address */
363	vsw_fdbe_add(vswp, port);
364
365	vsw_create_vlans(port, VSW_VNETPORT);
366
367	WRITE_ENTER(&plist->lockrw);
368
369	/* link it into the list of ports for this vsw instance */
370	pp = (vsw_port_t **)(&plist->head);
371	port->p_next = *pp;
372	*pp = port;
373	plist->num_ports++;
374
375	RW_EXIT(&plist->lockrw);
376
377	/*
378	 * Initialise the port and any ldc's under it.
379	 */
380	(void) vsw_init_ldcs(port);
381
382	D1(vswp, "%s: exit", __func__);
383	return (0);
384}
385
386/*
387 * Detach the specified port.
388 *
389 * Returns 0 on success, 1 on failure.
390 */
391int
392vsw_port_detach(vsw_t *vswp, int p_instance)
393{
394	vsw_port_t	*port = NULL;
395	vsw_port_list_t	*plist = &vswp->plist;
396
397	D1(vswp, "%s: enter: port id %d", __func__, p_instance);
398
399	WRITE_ENTER(&plist->lockrw);
400
401	if ((port = vsw_lookup_port(vswp, p_instance)) == NULL) {
402		RW_EXIT(&plist->lockrw);
403		return (1);
404	}
405
406	if (vsw_plist_del_node(vswp, port)) {
407		RW_EXIT(&plist->lockrw);
408		return (1);
409	}
410
411	/* cleanup any HybridIO for this port */
412	vsw_hio_stop_port(port);
413
414	/*
415	 * No longer need to hold writer lock on port list now
416	 * that we have unlinked the target port from the list.
417	 */
418	RW_EXIT(&plist->lockrw);
419
420	/* Remove the fdb entry for this port/mac address */
421	vsw_fdbe_del(vswp, &(port->p_macaddr));
422	vsw_destroy_vlans(port, VSW_VNETPORT);
423
424	/* Remove any multicast addresses.. */
425	vsw_del_mcst_port(port);
426
427	/* Remove address if was programmed into HW. */
428	mutex_enter(&vswp->hw_lock);
429
430	/*
431	 * Port's address may not have been set in hardware. This could
432	 * happen if the underlying physical device is not yet available and
433	 * vsw_setup_switching_timeout() is still in progress.
434	 * We remove its addr from hardware only if it has been set before.
435	 */
436	if (port->addr_set != VSW_ADDR_UNSET)
437		(void) vsw_unset_hw(vswp, port, VSW_VNETPORT);
438
439	if (vswp->recfg_reqd)
440		vsw_reconfig_hw(vswp);
441
442	mutex_exit(&vswp->hw_lock);
443
444	if (vsw_port_delete(port)) {
445		return (1);
446	}
447
448	D1(vswp, "%s: exit: p_instance(%d)", __func__, p_instance);
449	return (0);
450}
451
452/*
453 * Detach all active ports.
454 *
455 * Returns 0 on success, 1 on failure.
456 */
457int
458vsw_detach_ports(vsw_t *vswp)
459{
460	vsw_port_list_t 	*plist = &vswp->plist;
461	vsw_port_t		*port = NULL;
462
463	D1(vswp, "%s: enter", __func__);
464
465	WRITE_ENTER(&plist->lockrw);
466
467	while ((port = plist->head) != NULL) {
468		if (vsw_plist_del_node(vswp, port)) {
469			DERR(vswp, "%s: Error deleting port %d"
470			    " from port list", __func__, port->p_instance);
471			RW_EXIT(&plist->lockrw);
472			return (1);
473		}
474
475		/* Remove address if was programmed into HW. */
476		mutex_enter(&vswp->hw_lock);
477		(void) vsw_unset_hw(vswp, port, VSW_VNETPORT);
478		mutex_exit(&vswp->hw_lock);
479
480		/* Remove the fdb entry for this port/mac address */
481		vsw_fdbe_del(vswp, &(port->p_macaddr));
482		vsw_destroy_vlans(port, VSW_VNETPORT);
483
484		/* Remove any multicast addresses.. */
485		vsw_del_mcst_port(port);
486
487		/*
488		 * No longer need to hold the lock on the port list
489		 * now that we have unlinked the target port from the
490		 * list.
491		 */
492		RW_EXIT(&plist->lockrw);
493		if (vsw_port_delete(port)) {
494			DERR(vswp, "%s: Error deleting port %d",
495			    __func__, port->p_instance);
496			return (1);
497		}
498		WRITE_ENTER(&plist->lockrw);
499	}
500	RW_EXIT(&plist->lockrw);
501
502	D1(vswp, "%s: exit", __func__);
503
504	return (0);
505}
506
507/*
508 * Delete the specified port.
509 *
510 * Returns 0 on success, 1 on failure.
511 */
512static int
513vsw_port_delete(vsw_port_t *port)
514{
515	vsw_ldc_list_t 		*ldcl;
516	vsw_t			*vswp = port->p_vswp;
517	int			num_ldcs;
518
519	D1(vswp, "%s: enter : port id %d", __func__, port->p_instance);
520
521	(void) vsw_uninit_ldcs(port);
522
523	/*
524	 * Wait for any pending ctrl msg tasks which reference this
525	 * port to finish.
526	 */
527	if (vsw_drain_port_taskq(port))
528		return (1);
529
530	/*
531	 * Wait for any active callbacks to finish
532	 */
533	if (vsw_drain_ldcs(port))
534		return (1);
535
536	ldcl = &port->p_ldclist;
537	num_ldcs = port->num_ldcs;
538	WRITE_ENTER(&ldcl->lockrw);
539	while (num_ldcs > 0) {
540		if (vsw_ldc_detach(port, ldcl->head->ldc_id) != 0) {
541			cmn_err(CE_WARN, "!vsw%d: unable to detach ldc %ld",
542			    vswp->instance, ldcl->head->ldc_id);
543			RW_EXIT(&ldcl->lockrw);
544			port->num_ldcs = num_ldcs;
545			return (1);
546		}
547		num_ldcs--;
548	}
549	RW_EXIT(&ldcl->lockrw);
550
551	rw_destroy(&port->p_ldclist.lockrw);
552
553	mutex_destroy(&port->mca_lock);
554	mutex_destroy(&port->tx_lock);
555
556	cv_destroy(&port->state_cv);
557	mutex_destroy(&port->state_lock);
558
559	if (port->num_ldcs != 0) {
560		kmem_free(port->ldc_ids, port->num_ldcs * sizeof (uint64_t));
561		port->num_ldcs = 0;
562	}
563	kmem_free(port, sizeof (vsw_port_t));
564
565	D1(vswp, "%s: exit", __func__);
566
567	return (0);
568}
569
570/*
571 * Attach a logical domain channel (ldc) under a specified port.
572 *
573 * Returns 0 on success, 1 on failure.
574 */
575static int
576vsw_ldc_attach(vsw_port_t *port, uint64_t ldc_id)
577{
578	vsw_t 		*vswp = port->p_vswp;
579	vsw_ldc_list_t *ldcl = &port->p_ldclist;
580	vsw_ldc_t 	*ldcp = NULL;
581	ldc_attr_t 	attr;
582	ldc_status_t	istatus;
583	int 		status = DDI_FAILURE;
584	int		rv;
585	char		kname[MAXNAMELEN];
586	enum		{ PROG_init = 0x0, PROG_mblks = 0x1,
587			    PROG_callback = 0x2, PROG_rx_thread = 0x4,
588			    PROG_tx_thread = 0x8}
589			progress;
590
591	progress = PROG_init;
592
593	D1(vswp, "%s: enter", __func__);
594
595	ldcp = kmem_zalloc(sizeof (vsw_ldc_t), KM_NOSLEEP);
596	if (ldcp == NULL) {
597		DERR(vswp, "%s: kmem_zalloc failed", __func__);
598		return (1);
599	}
600	ldcp->ldc_id = ldc_id;
601
602	/* Allocate pools of receive mblks */
603	rv = vio_init_multipools(&ldcp->vmp, VSW_NUM_VMPOOLS,
604	    vsw_mblk_size1, vsw_mblk_size2, vsw_mblk_size3,
605	    vsw_num_mblks1, vsw_num_mblks2, vsw_num_mblks3);
606	if (rv) {
607		DWARN(vswp, "%s: unable to create free mblk pools for"
608		    " channel %ld (rv %d)", __func__, ldc_id, rv);
609		kmem_free(ldcp, sizeof (vsw_ldc_t));
610		return (1);
611	}
612
613	progress |= PROG_mblks;
614
615	mutex_init(&ldcp->ldc_txlock, NULL, MUTEX_DRIVER, NULL);
616	mutex_init(&ldcp->ldc_rxlock, NULL, MUTEX_DRIVER, NULL);
617	mutex_init(&ldcp->ldc_cblock, NULL, MUTEX_DRIVER, NULL);
618	mutex_init(&ldcp->drain_cv_lock, NULL, MUTEX_DRIVER, NULL);
619	cv_init(&ldcp->drain_cv, NULL, CV_DRIVER, NULL);
620	rw_init(&ldcp->lane_in.dlistrw, NULL, RW_DRIVER, NULL);
621	rw_init(&ldcp->lane_out.dlistrw, NULL, RW_DRIVER, NULL);
622
623	/* required for handshake with peer */
624	ldcp->local_session = (uint64_t)ddi_get_lbolt();
625	ldcp->peer_session = 0;
626	ldcp->session_status = 0;
627	ldcp->hss_id = 1;	/* Initial handshake session id */
628
629	(void) atomic_swap_32(&port->p_hio_capable, B_FALSE);
630
631	/* only set for outbound lane, inbound set by peer */
632	vsw_set_lane_attr(vswp, &ldcp->lane_out);
633
634	attr.devclass = LDC_DEV_NT_SVC;
635	attr.instance = ddi_get_instance(vswp->dip);
636	attr.mode = LDC_MODE_UNRELIABLE;
637	attr.mtu = VSW_LDC_MTU;
638	status = ldc_init(ldc_id, &attr, &ldcp->ldc_handle);
639	if (status != 0) {
640		DERR(vswp, "%s(%lld): ldc_init failed, rv (%d)",
641		    __func__, ldc_id, status);
642		goto ldc_attach_fail;
643	}
644
645	if (vsw_ldc_rxthr_enabled) {
646		ldcp->rx_thr_flags = 0;
647
648		mutex_init(&ldcp->rx_thr_lock, NULL, MUTEX_DRIVER, NULL);
649		cv_init(&ldcp->rx_thr_cv, NULL, CV_DRIVER, NULL);
650		ldcp->rx_thread = thread_create(NULL, 2 * DEFAULTSTKSZ,
651		    vsw_ldc_rx_worker, ldcp, 0, &p0, TS_RUN, maxclsyspri);
652
653		progress |= PROG_rx_thread;
654		if (ldcp->rx_thread == NULL) {
655			DWARN(vswp, "%s(%lld): Failed to create worker thread",
656			    __func__, ldc_id);
657			goto ldc_attach_fail;
658		}
659	}
660
661	if (vsw_ldc_txthr_enabled) {
662		ldcp->tx_thr_flags = 0;
663		ldcp->tx_mhead = ldcp->tx_mtail = NULL;
664
665		mutex_init(&ldcp->tx_thr_lock, NULL, MUTEX_DRIVER, NULL);
666		cv_init(&ldcp->tx_thr_cv, NULL, CV_DRIVER, NULL);
667		ldcp->tx_thread = thread_create(NULL, 2 * DEFAULTSTKSZ,
668		    vsw_ldc_tx_worker, ldcp, 0, &p0, TS_RUN, maxclsyspri);
669
670		progress |= PROG_tx_thread;
671		if (ldcp->tx_thread == NULL) {
672			DWARN(vswp, "%s(%lld): Failed to create worker thread",
673			    __func__, ldc_id);
674			goto ldc_attach_fail;
675		}
676	}
677
678	status = ldc_reg_callback(ldcp->ldc_handle, vsw_ldc_cb, (caddr_t)ldcp);
679	if (status != 0) {
680		DERR(vswp, "%s(%lld): ldc_reg_callback failed, rv (%d)",
681		    __func__, ldc_id, status);
682		(void) ldc_fini(ldcp->ldc_handle);
683		goto ldc_attach_fail;
684	}
685	/*
686	 * allocate a message for ldc_read()s, big enough to hold ctrl and
687	 * data msgs, including raw data msgs used to recv priority frames.
688	 */
689	ldcp->msglen = VIO_PKT_DATA_HDRSIZE + vswp->max_frame_size;
690	ldcp->ldcmsg = kmem_alloc(ldcp->msglen, KM_SLEEP);
691
692	progress |= PROG_callback;
693
694	mutex_init(&ldcp->status_lock, NULL, MUTEX_DRIVER, NULL);
695
696	if (ldc_status(ldcp->ldc_handle, &istatus) != 0) {
697		DERR(vswp, "%s: ldc_status failed", __func__);
698		mutex_destroy(&ldcp->status_lock);
699		goto ldc_attach_fail;
700	}
701
702	ldcp->ldc_status = istatus;
703	ldcp->ldc_port = port;
704	ldcp->ldc_vswp = vswp;
705
706	vsw_reset_vnet_proto_ops(ldcp);
707
708	(void) sprintf(kname, "%sldc0x%lx", DRV_NAME, ldcp->ldc_id);
709	ldcp->ksp = vgen_setup_kstats(DRV_NAME, vswp->instance,
710	    kname, &ldcp->ldc_stats);
711	if (ldcp->ksp == NULL) {
712		DERR(vswp, "%s: kstats setup failed", __func__);
713		goto ldc_attach_fail;
714	}
715
716	/* link it into the list of channels for this port */
717	WRITE_ENTER(&ldcl->lockrw);
718	ldcp->ldc_next = ldcl->head;
719	ldcl->head = ldcp;
720	RW_EXIT(&ldcl->lockrw);
721
722	D1(vswp, "%s: exit", __func__);
723	return (0);
724
725ldc_attach_fail:
726
727	if (progress & PROG_callback) {
728		(void) ldc_unreg_callback(ldcp->ldc_handle);
729		kmem_free(ldcp->ldcmsg, ldcp->msglen);
730	}
731
732	if (progress & PROG_rx_thread) {
733		if (ldcp->rx_thread != NULL) {
734			vsw_stop_rx_thread(ldcp);
735		}
736		mutex_destroy(&ldcp->rx_thr_lock);
737		cv_destroy(&ldcp->rx_thr_cv);
738	}
739
740	if (progress & PROG_tx_thread) {
741		if (ldcp->tx_thread != NULL) {
742			vsw_stop_tx_thread(ldcp);
743		}
744		mutex_destroy(&ldcp->tx_thr_lock);
745		cv_destroy(&ldcp->tx_thr_cv);
746	}
747	if (ldcp->ksp != NULL) {
748		vgen_destroy_kstats(ldcp->ksp);
749	}
750	mutex_destroy(&ldcp->ldc_txlock);
751	mutex_destroy(&ldcp->ldc_rxlock);
752	mutex_destroy(&ldcp->ldc_cblock);
753	mutex_destroy(&ldcp->drain_cv_lock);
754
755	cv_destroy(&ldcp->drain_cv);
756
757	rw_destroy(&ldcp->lane_in.dlistrw);
758	rw_destroy(&ldcp->lane_out.dlistrw);
759
760	if (progress & PROG_mblks) {
761		vio_destroy_multipools(&ldcp->vmp, &vswp->rxh);
762	}
763	kmem_free(ldcp, sizeof (vsw_ldc_t));
764
765	return (1);
766}
767
768/*
769 * Detach a logical domain channel (ldc) belonging to a
770 * particular port.
771 *
772 * Returns 0 on success, 1 on failure.
773 */
774static int
775vsw_ldc_detach(vsw_port_t *port, uint64_t ldc_id)
776{
777	vsw_t 		*vswp = port->p_vswp;
778	vsw_ldc_t 	*ldcp, *prev_ldcp;
779	vsw_ldc_list_t	*ldcl = &port->p_ldclist;
780	int 		rv;
781
782	prev_ldcp = ldcl->head;
783	for (; (ldcp = prev_ldcp) != NULL; prev_ldcp = ldcp->ldc_next) {
784		if (ldcp->ldc_id == ldc_id) {
785			break;
786		}
787	}
788
789	/* specified ldc id not found */
790	if (ldcp == NULL) {
791		DERR(vswp, "%s: ldcp = NULL", __func__);
792		return (1);
793	}
794
795	D2(vswp, "%s: detaching channel %lld", __func__, ldcp->ldc_id);
796
797	/* Stop the receive thread */
798	if (ldcp->rx_thread != NULL) {
799		vsw_stop_rx_thread(ldcp);
800		mutex_destroy(&ldcp->rx_thr_lock);
801		cv_destroy(&ldcp->rx_thr_cv);
802	}
803	kmem_free(ldcp->ldcmsg, ldcp->msglen);
804
805	/* Stop the tx thread */
806	if (ldcp->tx_thread != NULL) {
807		vsw_stop_tx_thread(ldcp);
808		mutex_destroy(&ldcp->tx_thr_lock);
809		cv_destroy(&ldcp->tx_thr_cv);
810		if (ldcp->tx_mhead != NULL) {
811			freemsgchain(ldcp->tx_mhead);
812			ldcp->tx_mhead = ldcp->tx_mtail = NULL;
813			ldcp->tx_cnt = 0;
814		}
815	}
816
817	/* Destroy kstats */
818	vgen_destroy_kstats(ldcp->ksp);
819
820	/*
821	 * Before we can close the channel we must release any mapped
822	 * resources (e.g. drings).
823	 */
824	vsw_free_lane_resources(ldcp, INBOUND);
825	vsw_free_lane_resources(ldcp, OUTBOUND);
826
827	/*
828	 * If the close fails we are in serious trouble, as we won't
829	 * be able to delete the parent port.
830	 */
831	if ((rv = ldc_close(ldcp->ldc_handle)) != 0) {
832		DERR(vswp, "%s: error %d closing channel %lld",
833		    __func__, rv, ldcp->ldc_id);
834		return (1);
835	}
836
837	(void) ldc_fini(ldcp->ldc_handle);
838
839	ldcp->ldc_status = LDC_INIT;
840	ldcp->ldc_handle = NULL;
841	ldcp->ldc_vswp = NULL;
842
843
844	/*
845	 * Most likely some mblks are still in use and
846	 * have not been returned to the pool. These mblks are
847	 * added to the pool that is maintained in the device instance.
848	 * Another attempt will be made to destroy the pool
849	 * when the device detaches.
850	 */
851	vio_destroy_multipools(&ldcp->vmp, &vswp->rxh);
852
853	/* unlink it from the list */
854	prev_ldcp = ldcp->ldc_next;
855
856	mutex_destroy(&ldcp->ldc_txlock);
857	mutex_destroy(&ldcp->ldc_rxlock);
858	mutex_destroy(&ldcp->ldc_cblock);
859	cv_destroy(&ldcp->drain_cv);
860	mutex_destroy(&ldcp->drain_cv_lock);
861	mutex_destroy(&ldcp->status_lock);
862	rw_destroy(&ldcp->lane_in.dlistrw);
863	rw_destroy(&ldcp->lane_out.dlistrw);
864
865	kmem_free(ldcp, sizeof (vsw_ldc_t));
866
867	return (0);
868}
869
870/*
871 * Open and attempt to bring up the channel. Note that the channel
872 * can only be brought up if the peer has also opened it.
873 *
874 * Returns 0 if it can open and bring up the channel, otherwise
875 * returns 1.
876 */
877static int
878vsw_ldc_init(vsw_ldc_t *ldcp)
879{
880	vsw_t 		*vswp = ldcp->ldc_vswp;
881	ldc_status_t	istatus = 0;
882	int		rv;
883
884	D1(vswp, "%s: enter", __func__);
885
886	LDC_ENTER_LOCK(ldcp);
887
888	/* don't start at 0 in case clients don't like that */
889	ldcp->next_ident = 1;
890
891	rv = ldc_open(ldcp->ldc_handle);
892	if (rv != 0) {
893		DERR(vswp, "%s: ldc_open failed: id(%lld) rv(%d)",
894		    __func__, ldcp->ldc_id, rv);
895		LDC_EXIT_LOCK(ldcp);
896		return (1);
897	}
898
899	if (ldc_status(ldcp->ldc_handle, &istatus) != 0) {
900		DERR(vswp, "%s: unable to get status", __func__);
901		LDC_EXIT_LOCK(ldcp);
902		return (1);
903
904	} else if (istatus != LDC_OPEN && istatus != LDC_READY) {
905		DERR(vswp, "%s: id (%lld) status(%d) is not OPEN/READY",
906		    __func__, ldcp->ldc_id, istatus);
907		LDC_EXIT_LOCK(ldcp);
908		return (1);
909	}
910
911	mutex_enter(&ldcp->status_lock);
912	ldcp->ldc_status = istatus;
913	mutex_exit(&ldcp->status_lock);
914
915	rv = ldc_up(ldcp->ldc_handle);
916	if (rv != 0) {
917		/*
918		 * Not a fatal error for ldc_up() to fail, as peer
919		 * end point may simply not be ready yet.
920		 */
921		D2(vswp, "%s: ldc_up err id(%lld) rv(%d)", __func__,
922		    ldcp->ldc_id, rv);
923		LDC_EXIT_LOCK(ldcp);
924		return (1);
925	}
926
927	/*
928	 * ldc_up() call is non-blocking so need to explicitly
929	 * check channel status to see if in fact the channel
930	 * is UP.
931	 */
932	mutex_enter(&ldcp->status_lock);
933	if (ldc_status(ldcp->ldc_handle, &ldcp->ldc_status) != 0) {
934		DERR(vswp, "%s: unable to get status", __func__);
935		mutex_exit(&ldcp->status_lock);
936		LDC_EXIT_LOCK(ldcp);
937		return (1);
938
939	}
940
941	if (ldcp->ldc_status == LDC_UP) {
942		D2(vswp, "%s: channel %ld now UP (%ld)", __func__,
943		    ldcp->ldc_id, istatus);
944		mutex_exit(&ldcp->status_lock);
945		LDC_EXIT_LOCK(ldcp);
946
947		vsw_process_conn_evt(ldcp, VSW_CONN_UP);
948		return (0);
949	}
950
951	mutex_exit(&ldcp->status_lock);
952	LDC_EXIT_LOCK(ldcp);
953
954	D1(vswp, "%s: exit", __func__);
955	return (0);
956}
957
958/* disable callbacks on the channel */
959static int
960vsw_ldc_uninit(vsw_ldc_t *ldcp)
961{
962	vsw_t	*vswp = ldcp->ldc_vswp;
963	int	rv;
964
965	D1(vswp, "vsw_ldc_uninit: enter: id(%lx)\n", ldcp->ldc_id);
966
967	LDC_ENTER_LOCK(ldcp);
968
969	rv = ldc_set_cb_mode(ldcp->ldc_handle, LDC_CB_DISABLE);
970	if (rv != 0) {
971		DERR(vswp, "vsw_ldc_uninit(%lld): error disabling "
972		    "interrupts (rv = %d)\n", ldcp->ldc_id, rv);
973		LDC_EXIT_LOCK(ldcp);
974		return (1);
975	}
976
977	mutex_enter(&ldcp->status_lock);
978	ldcp->ldc_status = LDC_INIT;
979	mutex_exit(&ldcp->status_lock);
980
981	LDC_EXIT_LOCK(ldcp);
982
983	D1(vswp, "vsw_ldc_uninit: exit: id(%lx)", ldcp->ldc_id);
984
985	return (0);
986}
987
988static int
989vsw_init_ldcs(vsw_port_t *port)
990{
991	vsw_ldc_list_t	*ldcl = &port->p_ldclist;
992	vsw_ldc_t	*ldcp;
993
994	READ_ENTER(&ldcl->lockrw);
995	ldcp =  ldcl->head;
996	for (; ldcp  != NULL; ldcp = ldcp->ldc_next) {
997		(void) vsw_ldc_init(ldcp);
998	}
999	RW_EXIT(&ldcl->lockrw);
1000
1001	return (0);
1002}
1003
1004static int
1005vsw_uninit_ldcs(vsw_port_t *port)
1006{
1007	vsw_ldc_list_t	*ldcl = &port->p_ldclist;
1008	vsw_ldc_t	*ldcp;
1009
1010	D1(NULL, "vsw_uninit_ldcs: enter\n");
1011
1012	READ_ENTER(&ldcl->lockrw);
1013	ldcp =  ldcl->head;
1014	for (; ldcp  != NULL; ldcp = ldcp->ldc_next) {
1015		(void) vsw_ldc_uninit(ldcp);
1016	}
1017	RW_EXIT(&ldcl->lockrw);
1018
1019	D1(NULL, "vsw_uninit_ldcs: exit\n");
1020
1021	return (0);
1022}
1023
1024/*
1025 * Wait until the callback(s) associated with the ldcs under the specified
1026 * port have completed.
1027 *
1028 * Prior to this function being invoked each channel under this port
1029 * should have been quiesced via ldc_set_cb_mode(DISABLE).
1030 *
1031 * A short explanation of what we are doing below.
1032 *
1033 * The simplest approach would be to have a reference counter in
1034 * the ldc structure which is incremented/decremented by the callbacks as
1035 * they use the channel. The drain function could then simply disable any
1036 * further callbacks and do a cv_wait for the ref to hit zero. Unfortunately
1037 * there is a tiny window here - before the callback is able to get the lock
1038 * on the channel it is interrupted and this function gets to execute. It
1039 * sees that the ref count is zero and believes it is free to delete the
1040 * associated data structures.
1041 *
1042 * We get around this by taking advantage of the fact that before the ldc
1043 * framework invokes a callback it sets a flag to indicate that there is a
1044 * callback active (or about to become active). If we attempt to
1045 * unregister a callback while this active flag is set, the unregister
1046 * will fail with EWOULDBLOCK.
1047 *
1048 * If the unregister fails we do a cv_timedwait. We will either be signaled
1049 * by the callback as it is exiting (note we have to wait a short period to
1050 * allow the callback to return fully to the ldc framework and for it to clear
1051 * the active flag), or by the timer expiring. In either case we again attempt
1052 * the unregister. We repeat this until we can successfully unregister the
1053 * callback.
1054 *
1055 * The reason we use a cv_timedwait rather than a simple cv_wait is to catch
1056 * the case where the callback has finished but the ldc framework has not yet
1057 * cleared the active flag. In this case we would never get a cv_signal.
1058 */
1059static int
1060vsw_drain_ldcs(vsw_port_t *port)
1061{
1062	vsw_ldc_list_t	*ldcl = &port->p_ldclist;
1063	vsw_ldc_t	*ldcp;
1064	vsw_t		*vswp = port->p_vswp;
1065
1066	D1(vswp, "%s: enter", __func__);
1067
1068	READ_ENTER(&ldcl->lockrw);
1069
1070	ldcp = ldcl->head;
1071
1072	for (; ldcp  != NULL; ldcp = ldcp->ldc_next) {
1073		/*
1074		 * If we can unregister the channel callback then we
1075		 * know that there is no callback either running or
1076		 * scheduled to run for this channel so move on to next
1077		 * channel in the list.
1078		 */
1079		mutex_enter(&ldcp->drain_cv_lock);
1080
1081		/* prompt active callbacks to quit */
1082		ldcp->drain_state = VSW_LDC_DRAINING;
1083
1084		if ((ldc_unreg_callback(ldcp->ldc_handle)) == 0) {
1085			D2(vswp, "%s: unreg callback for chan %ld", __func__,
1086			    ldcp->ldc_id);
1087			mutex_exit(&ldcp->drain_cv_lock);
1088			continue;
1089		} else {
1090			/*
1091			 * If we end up here we know that either 1) a callback
1092			 * is currently executing, 2) is about to start (i.e.
1093			 * the ldc framework has set the active flag but
1094			 * has not actually invoked the callback yet, or 3)
1095			 * has finished and has returned to the ldc framework
1096			 * but the ldc framework has not yet cleared the
1097			 * active bit.
1098			 *
1099			 * Wait for it to finish.
1100			 */
1101			while (ldc_unreg_callback(ldcp->ldc_handle)
1102			    == EWOULDBLOCK)
1103				(void) cv_timedwait(&ldcp->drain_cv,
1104				    &ldcp->drain_cv_lock, lbolt + hz);
1105
1106			mutex_exit(&ldcp->drain_cv_lock);
1107			D2(vswp, "%s: unreg callback for chan %ld after "
1108			    "timeout", __func__, ldcp->ldc_id);
1109		}
1110	}
1111	RW_EXIT(&ldcl->lockrw);
1112
1113	D1(vswp, "%s: exit", __func__);
1114	return (0);
1115}
1116
1117/*
1118 * Wait until all tasks which reference this port have completed.
1119 *
1120 * Prior to this function being invoked each channel under this port
1121 * should have been quiesced via ldc_set_cb_mode(DISABLE).
1122 */
1123static int
1124vsw_drain_port_taskq(vsw_port_t *port)
1125{
1126	vsw_t		*vswp = port->p_vswp;
1127
1128	D1(vswp, "%s: enter", __func__);
1129
1130	/*
1131	 * Mark the port as in the process of being detached, and
1132	 * dispatch a marker task to the queue so we know when all
1133	 * relevant tasks have completed.
1134	 */
1135	mutex_enter(&port->state_lock);
1136	port->state = VSW_PORT_DETACHING;
1137
1138	if ((vswp->taskq_p == NULL) ||
1139	    (ddi_taskq_dispatch(vswp->taskq_p, vsw_marker_task,
1140	    port, DDI_NOSLEEP) != DDI_SUCCESS)) {
1141		DERR(vswp, "%s: unable to dispatch marker task",
1142		    __func__);
1143		mutex_exit(&port->state_lock);
1144		return (1);
1145	}
1146
1147	/*
1148	 * Wait for the marker task to finish.
1149	 */
1150	while (port->state != VSW_PORT_DETACHABLE)
1151		cv_wait(&port->state_cv, &port->state_lock);
1152
1153	mutex_exit(&port->state_lock);
1154
1155	D1(vswp, "%s: exit", __func__);
1156
1157	return (0);
1158}
1159
1160static void
1161vsw_marker_task(void *arg)
1162{
1163	vsw_port_t	*port = arg;
1164	vsw_t		*vswp = port->p_vswp;
1165
1166	D1(vswp, "%s: enter", __func__);
1167
1168	mutex_enter(&port->state_lock);
1169
1170	/*
1171	 * No further tasks should be dispatched which reference
1172	 * this port so ok to mark it as safe to detach.
1173	 */
1174	port->state = VSW_PORT_DETACHABLE;
1175
1176	cv_signal(&port->state_cv);
1177
1178	mutex_exit(&port->state_lock);
1179
1180	D1(vswp, "%s: exit", __func__);
1181}
1182
1183vsw_port_t *
1184vsw_lookup_port(vsw_t *vswp, int p_instance)
1185{
1186	vsw_port_list_t *plist = &vswp->plist;
1187	vsw_port_t	*port;
1188
1189	for (port = plist->head; port != NULL; port = port->p_next) {
1190		if (port->p_instance == p_instance) {
1191			D2(vswp, "vsw_lookup_port: found p_instance\n");
1192			return (port);
1193		}
1194	}
1195
1196	return (NULL);
1197}
1198
1199void
1200vsw_vlan_unaware_port_reset(vsw_port_t *portp)
1201{
1202	vsw_ldc_list_t 	*ldclp;
1203	vsw_ldc_t	*ldcp;
1204
1205	ldclp = &portp->p_ldclist;
1206
1207	READ_ENTER(&ldclp->lockrw);
1208
1209	/*
1210	 * NOTE: for now, we will assume we have a single channel.
1211	 */
1212	if (ldclp->head == NULL) {
1213		RW_EXIT(&ldclp->lockrw);
1214		return;
1215	}
1216	ldcp = ldclp->head;
1217
1218	mutex_enter(&ldcp->ldc_cblock);
1219
1220	/*
1221	 * If the peer is vlan unaware (ver < 1.3), reset channel and terminate
1222	 * the connection. See comments in vsw_set_vnet_proto_ops().
1223	 */
1224	if (ldcp->hphase == VSW_MILESTONE4 && VSW_VER_LT(ldcp, 1, 3) &&
1225	    portp->nvids != 0) {
1226		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1227	}
1228
1229	mutex_exit(&ldcp->ldc_cblock);
1230
1231	RW_EXIT(&ldclp->lockrw);
1232}
1233
1234void
1235vsw_hio_port_reset(vsw_port_t *portp)
1236{
1237	vsw_ldc_list_t	*ldclp;
1238	vsw_ldc_t	*ldcp;
1239
1240	ldclp = &portp->p_ldclist;
1241
1242	READ_ENTER(&ldclp->lockrw);
1243
1244	/*
1245	 * NOTE: for now, we will assume we have a single channel.
1246	 */
1247	if (ldclp->head == NULL) {
1248		RW_EXIT(&ldclp->lockrw);
1249		return;
1250	}
1251	ldcp = ldclp->head;
1252
1253	mutex_enter(&ldcp->ldc_cblock);
1254
1255	/*
1256	 * If the peer is HybridIO capable (ver >= 1.3), reset channel
1257	 * to trigger re-negotiation, which in turn triggers HybridIO
1258	 * setup/cleanup.
1259	 */
1260	if ((ldcp->hphase == VSW_MILESTONE4) &&
1261	    (portp->p_hio_capable == B_TRUE)) {
1262		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1263	}
1264
1265	mutex_exit(&ldcp->ldc_cblock);
1266
1267	RW_EXIT(&ldclp->lockrw);
1268}
1269
1270/*
1271 * Search for and remove the specified port from the port
1272 * list. Returns 0 if able to locate and remove port, otherwise
1273 * returns 1.
1274 */
1275static int
1276vsw_plist_del_node(vsw_t *vswp, vsw_port_t *port)
1277{
1278	vsw_port_list_t *plist = &vswp->plist;
1279	vsw_port_t	*curr_p, *prev_p;
1280
1281	if (plist->head == NULL)
1282		return (1);
1283
1284	curr_p = prev_p = plist->head;
1285
1286	while (curr_p != NULL) {
1287		if (curr_p == port) {
1288			if (prev_p == curr_p) {
1289				plist->head = curr_p->p_next;
1290			} else {
1291				prev_p->p_next = curr_p->p_next;
1292			}
1293			plist->num_ports--;
1294			break;
1295		} else {
1296			prev_p = curr_p;
1297			curr_p = curr_p->p_next;
1298		}
1299	}
1300	return (0);
1301}
1302
1303/*
1304 * Interrupt handler for ldc messages.
1305 */
1306static uint_t
1307vsw_ldc_cb(uint64_t event, caddr_t arg)
1308{
1309	vsw_ldc_t	*ldcp = (vsw_ldc_t  *)arg;
1310	vsw_t 		*vswp = ldcp->ldc_vswp;
1311
1312	D1(vswp, "%s: enter: ldcid (%lld)\n", __func__, ldcp->ldc_id);
1313
1314	mutex_enter(&ldcp->ldc_cblock);
1315	ldcp->ldc_stats.callbacks++;
1316
1317	mutex_enter(&ldcp->status_lock);
1318	if ((ldcp->ldc_status == LDC_INIT) || (ldcp->ldc_handle == NULL)) {
1319		mutex_exit(&ldcp->status_lock);
1320		mutex_exit(&ldcp->ldc_cblock);
1321		return (LDC_SUCCESS);
1322	}
1323	mutex_exit(&ldcp->status_lock);
1324
1325	if (event & LDC_EVT_UP) {
1326		/*
1327		 * Channel has come up.
1328		 */
1329		D2(vswp, "%s: id(%ld) event(%llx) UP: status(%ld)",
1330		    __func__, ldcp->ldc_id, event, ldcp->ldc_status);
1331
1332		vsw_process_conn_evt(ldcp, VSW_CONN_UP);
1333
1334		ASSERT((event & (LDC_EVT_RESET | LDC_EVT_DOWN)) == 0);
1335	}
1336
1337	if (event & LDC_EVT_READ) {
1338		/*
1339		 * Data available for reading.
1340		 */
1341		D2(vswp, "%s: id(%ld) event(%llx) data READ",
1342		    __func__, ldcp->ldc_id, event);
1343
1344		if (ldcp->rx_thread != NULL) {
1345			/*
1346			 * If the receive thread is enabled, then
1347			 * wakeup the receive thread to process the
1348			 * LDC messages.
1349			 */
1350			mutex_exit(&ldcp->ldc_cblock);
1351			mutex_enter(&ldcp->rx_thr_lock);
1352			if (!(ldcp->rx_thr_flags & VSW_WTHR_DATARCVD)) {
1353				ldcp->rx_thr_flags |= VSW_WTHR_DATARCVD;
1354				cv_signal(&ldcp->rx_thr_cv);
1355			}
1356			mutex_exit(&ldcp->rx_thr_lock);
1357			mutex_enter(&ldcp->ldc_cblock);
1358		} else {
1359			vsw_process_pkt(ldcp);
1360		}
1361
1362		ASSERT((event & (LDC_EVT_RESET | LDC_EVT_DOWN)) == 0);
1363
1364		goto vsw_cb_exit;
1365	}
1366
1367	if (event & (LDC_EVT_DOWN | LDC_EVT_RESET)) {
1368		D2(vswp, "%s: id(%ld) event (%lx) DOWN/RESET: status(%ld)",
1369		    __func__, ldcp->ldc_id, event, ldcp->ldc_status);
1370
1371		vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
1372	}
1373
1374	/*
1375	 * Catch either LDC_EVT_WRITE which we don't support or any
1376	 * unknown event.
1377	 */
1378	if (event &
1379	    ~(LDC_EVT_UP | LDC_EVT_RESET | LDC_EVT_DOWN | LDC_EVT_READ)) {
1380		DERR(vswp, "%s: id(%ld) Unexpected event=(%llx) status(%ld)",
1381		    __func__, ldcp->ldc_id, event, ldcp->ldc_status);
1382	}
1383
1384vsw_cb_exit:
1385	mutex_exit(&ldcp->ldc_cblock);
1386
1387	/*
1388	 * Let the drain function know we are finishing if it
1389	 * is waiting.
1390	 */
1391	mutex_enter(&ldcp->drain_cv_lock);
1392	if (ldcp->drain_state == VSW_LDC_DRAINING)
1393		cv_signal(&ldcp->drain_cv);
1394	mutex_exit(&ldcp->drain_cv_lock);
1395
1396	return (LDC_SUCCESS);
1397}
1398
1399/*
1400 * Reinitialise data structures associated with the channel.
1401 */
1402static void
1403vsw_ldc_reinit(vsw_ldc_t *ldcp)
1404{
1405	vsw_t		*vswp = ldcp->ldc_vswp;
1406	vsw_port_t	*port;
1407	vsw_ldc_list_t	*ldcl;
1408
1409	D1(vswp, "%s: enter", __func__);
1410
1411	port = ldcp->ldc_port;
1412	ldcl = &port->p_ldclist;
1413
1414	READ_ENTER(&ldcl->lockrw);
1415
1416	D2(vswp, "%s: in 0x%llx : out 0x%llx", __func__,
1417	    ldcp->lane_in.lstate, ldcp->lane_out.lstate);
1418
1419	vsw_free_lane_resources(ldcp, INBOUND);
1420	vsw_free_lane_resources(ldcp, OUTBOUND);
1421	RW_EXIT(&ldcl->lockrw);
1422
1423	ldcp->lane_in.lstate = 0;
1424	ldcp->lane_out.lstate = 0;
1425
1426	/* Remove the fdb entry for this port/mac address */
1427	vsw_fdbe_del(vswp, &(port->p_macaddr));
1428
1429	/* remove the port from vlans it has been assigned to */
1430	vsw_vlan_remove_ids(port, VSW_VNETPORT);
1431
1432	/*
1433	 * Remove parent port from any multicast groups
1434	 * it may have registered with. Client must resend
1435	 * multicast add command after handshake completes.
1436	 */
1437	vsw_del_mcst_port(port);
1438
1439	ldcp->peer_session = 0;
1440	ldcp->session_status = 0;
1441	ldcp->hcnt = 0;
1442	ldcp->hphase = VSW_MILESTONE0;
1443
1444	vsw_reset_vnet_proto_ops(ldcp);
1445
1446	D1(vswp, "%s: exit", __func__);
1447}
1448
1449/*
1450 * Process a connection event.
1451 *
1452 * Note - care must be taken to ensure that this function is
1453 * not called with the dlistrw lock held.
1454 */
1455static void
1456vsw_process_conn_evt(vsw_ldc_t *ldcp, uint16_t evt)
1457{
1458	vsw_t		*vswp = ldcp->ldc_vswp;
1459	vsw_conn_evt_t	*conn = NULL;
1460
1461	D1(vswp, "%s: enter", __func__);
1462
1463	/*
1464	 * Check if either a reset or restart event is pending
1465	 * or in progress. If so just return.
1466	 *
1467	 * A VSW_CONN_RESET event originates either with a LDC_RESET_EVT
1468	 * being received by the callback handler, or an ECONNRESET error
1469	 * code being returned from an ldc_read() or ldc_write() call.
1470	 *
1471	 * A VSW_CONN_RESTART event occurs when some error checking code
1472	 * decides that there is a problem with data from the channel,
1473	 * and that the handshake should be restarted.
1474	 */
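	/*
	 * ldstub() atomically sets reset_active and returns its previous
	 * value, so only the first of any concurrent reset/restart
	 * requests gets past this check; the flag is cleared again once
	 * vsw_conn_task() completes (or if the dispatch below fails).
	 */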
1475	if (((evt == VSW_CONN_RESET) || (evt == VSW_CONN_RESTART)) &&
1476	    (ldstub((uint8_t *)&ldcp->reset_active)))
1477		return;
1478
1479	/*
1480	 * If it is an LDC_UP event we first check the recorded
1481	 * state of the channel. If this is UP then we know that
1482	 * the channel moving to the UP state has already been dealt
1483	 * with and don't need to dispatch a new task.
1484	 *
1485	 * The reason for this check is that when we do a ldc_up(),
1486	 * depending on the state of the peer, we may or may not get
1487	 * a LDC_UP event. As we can't depend on getting a LDC_UP evt
1488	 * every time we do ldc_up() we explicitly check the channel
1489	 * status to see if it has come up (ldc_up() is asynch and will
1490	 * complete at some undefined time), and take the appropriate
1491	 * action.
1492	 *
1493	 * The flip side of this is that we may get a LDC_UP event
1494	 * when we have already seen that the channel is up and have
1495	 * dealt with that.
1496	 */
1497	mutex_enter(&ldcp->status_lock);
1498	if (evt == VSW_CONN_UP) {
1499		if ((ldcp->ldc_status == LDC_UP) || (ldcp->reset_active != 0)) {
1500			mutex_exit(&ldcp->status_lock);
1501			return;
1502		}
1503	}
1504	mutex_exit(&ldcp->status_lock);
1505
1506	/*
1507	 * The transaction group id allows us to identify and discard
1508	 * any tasks which are still pending on the taskq and refer
1509	 * to the handshake session we are about to restart or reset.
1510	 * These stale messages no longer have any real meaning.
1511	 */
1512	(void) atomic_inc_32(&ldcp->hss_id);
1513
1514	ASSERT(vswp->taskq_p != NULL);
1515
1516	if ((conn = kmem_zalloc(sizeof (vsw_conn_evt_t), KM_NOSLEEP)) == NULL) {
1517		cmn_err(CE_WARN, "!vsw%d: unable to allocate memory for"
1518		    " connection event", vswp->instance);
1519		goto err_exit;
1520	}
1521
1522	conn->evt = evt;
1523	conn->ldcp = ldcp;
1524
1525	if (ddi_taskq_dispatch(vswp->taskq_p, vsw_conn_task, conn,
1526	    DDI_NOSLEEP) != DDI_SUCCESS) {
1527		cmn_err(CE_WARN, "!vsw%d: Can't dispatch connection task",
1528		    vswp->instance);
1529
1530		kmem_free(conn, sizeof (vsw_conn_evt_t));
1531		goto err_exit;
1532	}
1533
1534	D1(vswp, "%s: exit", __func__);
1535	return;
1536
1537err_exit:
1538	/*
1539	 * We have most likely failed due to memory shortage. Clear the flag so
1540	 * that future requests will at least be attempted and will hopefully
1541	 * succeed.
1542	 */
1543	if ((evt == VSW_CONN_RESET) || (evt == VSW_CONN_RESTART))
1544		ldcp->reset_active = 0;
1545}
1546
1547/*
1548 * Deal with events relating to a connection. Invoked from a taskq.
1549 */
1550static void
1551vsw_conn_task(void *arg)
1552{
1553	vsw_conn_evt_t	*conn = (vsw_conn_evt_t *)arg;
1554	vsw_ldc_t	*ldcp = NULL;
1555	vsw_port_t	*portp;
1556	vsw_t		*vswp = NULL;
1557	uint16_t	evt;
1558	ldc_status_t	curr_status;
1559
1560	ldcp = conn->ldcp;
1561	evt = conn->evt;
1562	vswp = ldcp->ldc_vswp;
1563	portp = ldcp->ldc_port;
1564
1565	D1(vswp, "%s: enter", __func__);
1566
1567	/* can safely free now have copied out data */
1568	kmem_free(conn, sizeof (vsw_conn_evt_t));
1569
1570	mutex_enter(&ldcp->status_lock);
1571	if (ldc_status(ldcp->ldc_handle, &curr_status) != 0) {
1572		cmn_err(CE_WARN, "!vsw%d: Unable to read status of "
1573		    "channel %ld", vswp->instance, ldcp->ldc_id);
1574		mutex_exit(&ldcp->status_lock);
1575		return;
1576	}
1577
1578	/*
1579	 * If we wish to restart the handshake on this channel, then if
1580	 * the channel is UP we bring it DOWN to flush the underlying
1581	 * ldc queue.
1582	 */
1583	if ((evt == VSW_CONN_RESTART) && (curr_status == LDC_UP))
1584		(void) ldc_down(ldcp->ldc_handle);
1585
1586	if ((vswp->hio_capable) && (portp->p_hio_enabled)) {
1587		vsw_hio_stop(vswp, ldcp);
1588	}
1589
1590	/*
1591	 * re-init all the associated data structures.
1592	 */
1593	vsw_ldc_reinit(ldcp);
1594
1595	/*
1596	 * Bring the channel back up (note it does no harm to
1597	 * do this even if the channel is already UP; it just
1598	 * becomes effectively a no-op).
1599	 */
1600	(void) ldc_up(ldcp->ldc_handle);
1601
1602	/*
1603	 * Check if channel is now UP. This will only happen if
1604	 * peer has also done a ldc_up().
1605	 */
1606	if (ldc_status(ldcp->ldc_handle, &curr_status) != 0) {
1607		cmn_err(CE_WARN, "!vsw%d: Unable to read status of "
1608		    "channel %ld", vswp->instance, ldcp->ldc_id);
1609		mutex_exit(&ldcp->status_lock);
1610		return;
1611	}
1612
1613	ldcp->ldc_status = curr_status;
1614
1615	/* channel UP so restart handshake by sending version info */
1616	if (curr_status == LDC_UP) {
1617		if (ldcp->hcnt++ > vsw_num_handshakes) {
1618			cmn_err(CE_WARN, "!vsw%d: exceeded number of permitted"
1619			    " handshake attempts (%d) on channel %ld",
1620			    vswp->instance, ldcp->hcnt, ldcp->ldc_id);
1621			mutex_exit(&ldcp->status_lock);
1622			return;
1623		}
1624
1625		if (vsw_obp_ver_proto_workaround == B_FALSE &&
1626		    (ddi_taskq_dispatch(vswp->taskq_p, vsw_send_ver, ldcp,
1627		    DDI_NOSLEEP) != DDI_SUCCESS)) {
1628			cmn_err(CE_WARN, "!vsw%d: Can't dispatch version task",
1629			    vswp->instance);
1630
1631			/*
1632			 * Don't count as valid restart attempt if couldn't
1633			 * send version msg.
1634			 */
1635			if (ldcp->hcnt > 0)
1636				ldcp->hcnt--;
1637		}
1638	}
1639
1640	/*
1641	 * Mark that the process is complete by clearing the flag.
1642	 *
1643	 * Note it is possible that the taskq dispatch above may have failed,
1644	 * most likely due to memory shortage. We still clear the flag so
1645	 * future attempts will at least be made and will hopefully
1646	 * succeed.
1647	 */
1648	if ((evt == VSW_CONN_RESET) || (evt == VSW_CONN_RESTART))
1649		ldcp->reset_active = 0;
1650
1651	mutex_exit(&ldcp->status_lock);
1652
1653	D1(vswp, "%s: exit", __func__);
1654}
1655
1656/*
1657 * Returns 0 if it is legal for the event signified by flag to have
1658 * occurred at the time it did. Otherwise returns 1.
1659 */
1660int
1661vsw_check_flag(vsw_ldc_t *ldcp, int dir, uint64_t flag)
1662{
1663	vsw_t		*vswp = ldcp->ldc_vswp;
1664	uint64_t	state;
1665	uint64_t	phase;
1666
1667	if (dir == INBOUND)
1668		state = ldcp->lane_in.lstate;
1669	else
1670		state = ldcp->lane_out.lstate;
1671
1672	phase = ldcp->hphase;
1673
1674	switch (flag) {
1675	case VSW_VER_INFO_RECV:
1676		if (phase > VSW_MILESTONE0) {
1677			DERR(vswp, "vsw_check_flag (%d): VER_INFO_RECV"
1678			    " when in state %d\n", ldcp->ldc_id, phase);
1679			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1680			return (1);
1681		}
1682		break;
1683
1684	case VSW_VER_ACK_RECV:
1685	case VSW_VER_NACK_RECV:
1686		if (!(state & VSW_VER_INFO_SENT)) {
1687			DERR(vswp, "vsw_check_flag (%d): spurious VER_ACK or "
1688			    "VER_NACK when in state %d\n", ldcp->ldc_id, phase);
1689			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1690			return (1);
1691		} else
1692			state &= ~VSW_VER_INFO_SENT;
1693		break;
1694
1695	case VSW_ATTR_INFO_RECV:
1696		if ((phase < VSW_MILESTONE1) || (phase >= VSW_MILESTONE2)) {
1697			DERR(vswp, "vsw_check_flag (%d): ATTR_INFO_RECV"
1698			    " when in state %d\n", ldcp->ldc_id, phase);
1699			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1700			return (1);
1701		}
1702		break;
1703
1704	case VSW_ATTR_ACK_RECV:
1705	case VSW_ATTR_NACK_RECV:
1706		if (!(state & VSW_ATTR_INFO_SENT)) {
1707			DERR(vswp, "vsw_check_flag (%d): spurious ATTR_ACK"
1708			    " or ATTR_NACK when in state %d\n",
1709			    ldcp->ldc_id, phase);
1710			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1711			return (1);
1712		} else
1713			state &= ~VSW_ATTR_INFO_SENT;
1714		break;
1715
1716	case VSW_DRING_INFO_RECV:
1717		if (phase < VSW_MILESTONE1) {
1718			DERR(vswp, "vsw_check_flag (%d): DRING_INFO_RECV"
1719			    " when in state %d\n", ldcp->ldc_id, phase);
1720			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1721			return (1);
1722		}
1723		break;
1724
1725	case VSW_DRING_ACK_RECV:
1726	case VSW_DRING_NACK_RECV:
1727		if (!(state & VSW_DRING_INFO_SENT)) {
1728			DERR(vswp, "vsw_check_flag (%d): spurious DRING_ACK "
1729			    " or DRING_NACK when in state %d\n",
1730			    ldcp->ldc_id, phase);
1731			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1732			return (1);
1733		} else
1734			state &= ~VSW_DRING_INFO_SENT;
1735		break;
1736
1737	case VSW_RDX_INFO_RECV:
1738		if (phase < VSW_MILESTONE3) {
1739			DERR(vswp, "vsw_check_flag (%d): RDX_INFO_RECV"
1740			    " when in state %d\n", ldcp->ldc_id, phase);
1741			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1742			return (1);
1743		}
1744		break;
1745
1746	case VSW_RDX_ACK_RECV:
1747	case VSW_RDX_NACK_RECV:
1748		if (!(state & VSW_RDX_INFO_SENT)) {
1749			DERR(vswp, "vsw_check_flag (%d): spurious RDX_ACK or "
1750			    "RDX_NACK when in state %d\n", ldcp->ldc_id, phase);
1751			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1752			return (1);
1753		} else
1754			state &= ~VSW_RDX_INFO_SENT;
1755		break;
1756
1757	case VSW_MCST_INFO_RECV:
1758		if (phase < VSW_MILESTONE3) {
1759			DERR(vswp, "vsw_check_flag (%d): VSW_MCST_INFO_RECV"
1760			    " when in state %d\n", ldcp->ldc_id, phase);
1761			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1762			return (1);
1763		}
1764		break;
1765
1766	default:
1767		DERR(vswp, "vsw_check_flag (%lld): unknown flag (%llx)",
1768		    ldcp->ldc_id, flag);
1769		return (1);
1770	}
1771
1772	if (dir == INBOUND)
1773		ldcp->lane_in.lstate = state;
1774	else
1775		ldcp->lane_out.lstate = state;
1776
1777	D1(vswp, "vsw_check_flag (chan %lld): exit", ldcp->ldc_id);
1778
1779	return (0);
1780}
1781
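/*
 * Advance the handshake state machine. The milestones below are, in
 * order: version negotiation (MILESTONE0), attribute exchange
 * (MILESTONE1), dring registration where the peer uses descriptor
 * rings (MILESTONE2), RDX exchange in both directions (MILESTONE3),
 * and finally MILESTONE4, at which point the handshake is complete,
 * the outbound lane is marked active and HybridIO is started if both
 * sides are capable.
 */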
1782void
1783vsw_next_milestone(vsw_ldc_t *ldcp)
1784{
1785	vsw_t		*vswp = ldcp->ldc_vswp;
1786	vsw_port_t	*portp = ldcp->ldc_port;
1787
1788	D1(vswp, "%s (chan %lld): enter (phase %ld)", __func__,
1789	    ldcp->ldc_id, ldcp->hphase);
1790
1791	DUMP_FLAGS(ldcp->lane_in.lstate);
1792	DUMP_FLAGS(ldcp->lane_out.lstate);
1793
1794	switch (ldcp->hphase) {
1795
1796	case VSW_MILESTONE0:
1797		/*
1798		 * If we haven't started to handshake with our peer,
1799		 * start to do so now.
1800		 */
1801		if (ldcp->lane_out.lstate == 0) {
1802			D2(vswp, "%s: (chan %lld) starting handshake "
1803			    "with peer", __func__, ldcp->ldc_id);
1804			vsw_process_conn_evt(ldcp, VSW_CONN_UP);
1805		}
1806
1807		/*
1808		 * Only way to pass this milestone is to have successfully
1809		 * negotiated version info.
1810		 */
1811		if ((ldcp->lane_in.lstate & VSW_VER_ACK_SENT) &&
1812		    (ldcp->lane_out.lstate & VSW_VER_ACK_RECV)) {
1813
1814			D2(vswp, "%s: (chan %lld) leaving milestone 0",
1815			    __func__, ldcp->ldc_id);
1816
1817			vsw_set_vnet_proto_ops(ldcp);
1818
1819			/*
1820			 * Next milestone is passed when attribute
1821			 * information has been successfully exchanged.
1822			 */
1823			ldcp->hphase = VSW_MILESTONE1;
1824			vsw_send_attr(ldcp);
1825
1826		}
1827		break;
1828
1829	case VSW_MILESTONE1:
1830		/*
1831		 * Only way to pass this milestone is to have successfully
1832		 * negotiated attribute information.
1833		 */
1834		if (ldcp->lane_in.lstate & VSW_ATTR_ACK_SENT) {
1835
1836			ldcp->hphase = VSW_MILESTONE2;
1837
1838			/*
1839			 * If the peer device has said it wishes to
1840			 * use descriptor rings then we send it our ring
1841			 * info, otherwise we just set up a private ring
1842			 * backed by an internal buffer.
1843			 */
1844			if ((VSW_VER_GTEQ(ldcp, 1, 2) &&
1845			    (ldcp->lane_in.xfer_mode & VIO_DRING_MODE_V1_2)) ||
1846			    (VSW_VER_LT(ldcp, 1, 2) &&
1847			    (ldcp->lane_in.xfer_mode ==
1848			    VIO_DRING_MODE_V1_0))) {
1849				vsw_send_dring_info(ldcp);
1850			}
1851		}
1852		break;
1853
1854	case VSW_MILESTONE2:
1855		/*
1856		 * If peer has indicated in its attribute message that
1857		 * it wishes to use descriptor rings then the only way
1858		 * to pass this milestone is for us to have received
1859		 * valid dring info.
1860		 *
1861		 * If peer is not using descriptor rings then just fall
1862		 * through.
1863		 */
1864		if ((VSW_VER_GTEQ(ldcp, 1, 2) &&
1865		    (ldcp->lane_in.xfer_mode & VIO_DRING_MODE_V1_2)) ||
1866		    (VSW_VER_LT(ldcp, 1, 2) &&
1867		    (ldcp->lane_in.xfer_mode ==
1868		    VIO_DRING_MODE_V1_0))) {
1869			if (!(ldcp->lane_in.lstate & VSW_DRING_ACK_SENT))
1870				break;
1871		}
1872
1873		D2(vswp, "%s: (chan %lld) leaving milestone 2",
1874		    __func__, ldcp->ldc_id);
1875
1876		ldcp->hphase = VSW_MILESTONE3;
1877		vsw_send_rdx(ldcp);
1878		break;
1879
1880	case VSW_MILESTONE3:
1881		/*
1882		 * Pass this milestone when all parameters have been
1883		 * successfully exchanged and RDX sent in both directions.
1884		 *
1885		 * Mark outbound lane as available to transmit data.
1886		 */
1887		if ((ldcp->lane_out.lstate & VSW_RDX_ACK_SENT) &&
1888		    (ldcp->lane_in.lstate & VSW_RDX_ACK_RECV)) {
1889
1890			D2(vswp, "%s: (chan %lld) leaving milestone 3",
1891			    __func__, ldcp->ldc_id);
1892			D2(vswp, "%s: ** handshake complete (0x%llx : "
1893			    "0x%llx) **", __func__, ldcp->lane_in.lstate,
1894			    ldcp->lane_out.lstate);
1895			ldcp->lane_out.lstate |= VSW_LANE_ACTIVE;
1896			ldcp->hphase = VSW_MILESTONE4;
1897			ldcp->hcnt = 0;
1898			DISPLAY_STATE();
1899			/* Start HIO if enabled and capable */
1900			if ((portp->p_hio_enabled) && (portp->p_hio_capable)) {
1901				D2(vswp, "%s: start HybridIO setup", __func__);
1902				vsw_hio_start(vswp, ldcp);
1903			}
1904		} else {
1905			D2(vswp, "%s: still in milestone 3 (0x%llx : 0x%llx)",
1906			    __func__, ldcp->lane_in.lstate,
1907			    ldcp->lane_out.lstate);
1908		}
1909		break;
1910
1911	case VSW_MILESTONE4:
1912		D2(vswp, "%s: (chan %lld) in milestone 4", __func__,
1913		    ldcp->ldc_id);
1914		break;
1915
1916	default:
1917		DERR(vswp, "%s: (chan %lld) Unknown Phase %x", __func__,
1918		    ldcp->ldc_id, ldcp->hphase);
1919	}
1920
1921	D1(vswp, "%s (chan %lld): exit (phase %ld)", __func__, ldcp->ldc_id,
1922	    ldcp->hphase);
1923}
1924
1925/*
1926 * Check if major version is supported.
1927 *
1928 * Returns 0 if it finds a supported major number, and if necessary
1929 * adjusts the minor field.
1930 *
1931 * Returns 1 if it can't match the major number exactly. Sets major/minor
1932 * to the next lowest supported values, or to zero if no others are possible.
1933 */
1934static int
1935vsw_supported_version(vio_ver_msg_t *vp)
1936{
1937	int	i;
1938
1939	D1(NULL, "vsw_supported_version: enter");
1940
1941	for (i = 0; i < VSW_NUM_VER; i++) {
1942		if (vsw_versions[i].ver_major == vp->ver_major) {
1943			/*
1944			 * Matching or lower major version found. Update
1945			 * minor number if necessary.
1946			 */
1947			if (vp->ver_minor > vsw_versions[i].ver_minor) {
1948				D2(NULL, "%s: adjusting minor value from %d "
1949				    "to %d", __func__, vp->ver_minor,
1950				    vsw_versions[i].ver_minor);
1951				vp->ver_minor = vsw_versions[i].ver_minor;
1952			}
1953
1954			return (0);
1955		}
1956
1957		/*
1958		 * If the message contains a higher major version number, set
1959		 * the message's major/minor versions to the current values
1960		 * and return 1, so this message will get resent with
1961		 * these values.
1962		 */
1963		if (vsw_versions[i].ver_major < vp->ver_major) {
1964			D2(NULL, "%s: adjusting major and minor "
1965			    "values to %d, %d\n",
1966			    __func__, vsw_versions[i].ver_major,
1967			    vsw_versions[i].ver_minor);
1968			vp->ver_major = vsw_versions[i].ver_major;
1969			vp->ver_minor = vsw_versions[i].ver_minor;
1970			return (1);
1971		}
1972	}
1973
1974	/* No match was possible, zero out fields */
1975	vp->ver_major = 0;
1976	vp->ver_minor = 0;
1977
1978	D1(NULL, "vsw_supported_version: exit");
1979
1980	return (1);
1981}
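
/*
 * Worked example (version numbers here are illustrative; the actual
 * entries live in the vsw_versions[] table, highest first):
 *
 *	assume vsw_versions[] = { {1, 4}, {1, 0} }
 *
 *	peer proposes 1.6 -> major matches, minor clipped to 1.4,
 *	    returns 0 (accepted, will be ACK'd)
 *	peer proposes 2.0 -> no exact major, message rewritten to 1.4,
 *	    returns 1 (NACK'd so the peer can retry with 1.4)
 *	peer proposes 0.9 -> no lower major available, fields zeroed,
 *	    returns 1 (negotiation fails)
 */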
1982
1983/*
1984 * Set vnet-protocol-version dependent functions based on version.
1985 */
1986static void
1987vsw_set_vnet_proto_ops(vsw_ldc_t *ldcp)
1988{
1989	vsw_t	*vswp = ldcp->ldc_vswp;
1990	lane_t	*lp = &ldcp->lane_out;
1991
1992	if (VSW_VER_GTEQ(ldcp, 1, 3)) {
1993		/*
1994		 * If the version negotiated with peer is >= 1.3,
1995		 * set the mtu in our attributes to max_frame_size.
1996		 */
1997		lp->mtu = vswp->max_frame_size;
1998	} else {
1999		vsw_port_t	*portp = ldcp->ldc_port;
2000		/*
2001		 * Pre-1.3 peers expect max frame size of ETHERMAX.
2002		 * We can negotiate that size with those peers provided the
2003		 * following conditions are true:
2004		 * - Our max_frame_size is greater only by VLAN_TAGSZ (4).
2005		 * - Only pvid is defined for our peer and there are no vids.
2006		 * If the above conditions are true, then we can send/recv only
2007		 * untagged frames of max size ETHERMAX. Note that pvid of the
2008		 * peer can be different, as vsw has to serve the vnet in that
2009		 * vlan even if it is not itself assigned to that vlan.
2010		 */
2011		if ((vswp->max_frame_size == ETHERMAX + VLAN_TAGSZ) &&
2012		    portp->nvids == 0) {
2013			lp->mtu = ETHERMAX;
2014		}
2015	}
2016
2017	if (VSW_VER_GTEQ(ldcp, 1, 2)) {
2018		/* Versions >= 1.2 */
2019
2020		if (VSW_PRI_ETH_DEFINED(vswp)) {
2021			/*
2022			 * Enable priority routines and pkt mode only if
2023			 * at least one pri-eth-type is specified in the MD.
2024			 */
2025			ldcp->tx = vsw_ldctx_pri;
2026			ldcp->rx_pktdata = vsw_process_pkt_data;
2027
2028			/* set xfer mode for vsw_send_attr() */
2029			lp->xfer_mode = VIO_PKT_MODE | VIO_DRING_MODE_V1_2;
2030		} else {
2031			/* no priority eth types defined in MD */
2032
2033			ldcp->tx = vsw_ldctx;
2034			ldcp->rx_pktdata = vsw_process_pkt_data_nop;
2035
2036			/* set xfer mode for vsw_send_attr() */
2037			lp->xfer_mode = VIO_DRING_MODE_V1_2;
2038		}
2039
2040	} else {
2041		/* Versions prior to 1.2  */
2042
2043		vsw_reset_vnet_proto_ops(ldcp);
2044	}
2045}
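
/*
 * Summary of the version-dependent settings chosen above:
 *
 *	ver >= 1.3: lane_out.mtu = vswp->max_frame_size
 *	ver >= 1.2: dring mode VIO_DRING_MODE_V1_2; priority tx/rx
 *		    routines and VIO_PKT_MODE as well, if the MD
 *		    defines at least one pri-eth-type
 *	ver <  1.2: v1.0 defaults via vsw_reset_vnet_proto_ops()
 */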
2046
2047/*
2048 * Reset vnet-protocol-version dependent functions to v1.0.
2049 */
2050static void
2051vsw_reset_vnet_proto_ops(vsw_ldc_t *ldcp)
2052{
2053	lane_t	*lp = &ldcp->lane_out;
2054
2055	ldcp->tx = vsw_ldctx;
2056	ldcp->rx_pktdata = vsw_process_pkt_data_nop;
2057
2058	/* set xfer mode for vsw_send_attr() */
2059	lp->xfer_mode = VIO_DRING_MODE_V1_0;
2060}
2061
2062/*
2063 * Main routine for processing messages received over LDC.
2064 */
2065static void
2066vsw_process_pkt(void *arg)
2067{
2068	vsw_ldc_t	*ldcp = (vsw_ldc_t  *)arg;
2069	vsw_t 		*vswp = ldcp->ldc_vswp;
2070	size_t		msglen;
2071	vio_msg_tag_t	*tagp;
2072	uint64_t	*ldcmsg;
2073	int 		rv = 0;
2074
2076	D1(vswp, "%s enter: ldcid (%lld)\n", __func__, ldcp->ldc_id);
2077
2078	ASSERT(MUTEX_HELD(&ldcp->ldc_cblock));
2079
2080	ldcmsg = ldcp->ldcmsg;
2081	/*
2082	 * If the channel is up, read messages until the channel is empty.
2083	 */
2084	do {
2085		msglen = ldcp->msglen;
2086		rv = ldc_read(ldcp->ldc_handle, (caddr_t)ldcmsg, &msglen);
2087
2088		if (rv != 0) {
2089			DERR(vswp, "%s :ldc_read err id(%lld) rv(%d) len(%d)\n",
2090			    __func__, ldcp->ldc_id, rv, msglen);
2091		}
2092
2093		/* channel has been reset */
2094		if (rv == ECONNRESET) {
2095			vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
2096			break;
2097		}
2098
2099		if (msglen == 0) {
2100			D2(vswp, "%s: ldc_read id(%lld) NODATA", __func__,
2101			    ldcp->ldc_id);
2102			break;
2103		}
2104
2105		D2(vswp, "%s: ldc_read id(%lld): msglen(%d)", __func__,
2106		    ldcp->ldc_id, msglen);
2107
2108		/*
2109		 * Figure out what sort of packet we have gotten by
2110		 * examining the msg tag, and then switch it appropriately.
2111		 */
2112		tagp = (vio_msg_tag_t *)ldcmsg;
2113
2114		switch (tagp->vio_msgtype) {
2115		case VIO_TYPE_CTRL:
2116			vsw_dispatch_ctrl_task(ldcp, ldcmsg, tagp);
2117			break;
2118		case VIO_TYPE_DATA:
2119			vsw_process_data_pkt(ldcp, ldcmsg, tagp, msglen);
2120			break;
2121		case VIO_TYPE_ERR:
2122			vsw_process_err_pkt(ldcp, ldcmsg, tagp);
2123			break;
2124		default:
2125			DERR(vswp, "%s: unknown tag(%lx) id(%lx)\n",
2126			    __func__, tagp->vio_msgtype, ldcp->ldc_id);
2127			break;
2128		}
2129	} while (msglen);
2130
2131	D1(vswp, "%s exit: ldcid (%lld)\n", __func__, ldcp->ldc_id);
2132}
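
/*
 * Note on the read loop above: ldc_read() returning msglen == 0 is the
 * normal "channel drained" exit, while ECONNRESET means the channel has
 * been reset underneath us and is handled via
 * vsw_process_conn_evt(ldcp, VSW_CONN_RESET).
 */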
2133
2134/*
2135 * Dispatch a task to process a VIO control message.
2136 */
2137static void
2138vsw_dispatch_ctrl_task(vsw_ldc_t *ldcp, void *cpkt, vio_msg_tag_t *tagp)
2139{
2140	vsw_ctrl_task_t		*ctaskp = NULL;
2141	vsw_port_t		*port = ldcp->ldc_port;
2142	vsw_t			*vswp = port->p_vswp;
2143
2144	D1(vswp, "%s: enter", __func__);
2145
2146	/*
2147	 * We need to handle RDX ACK messages in-band as once they
2148	 * are exchanged it is possible that we will get an
2149	 * immediate (legitimate) data packet.
2150	 */
2151	if ((tagp->vio_subtype_env == VIO_RDX) &&
2152	    (tagp->vio_subtype == VIO_SUBTYPE_ACK)) {
2153
2154		if (vsw_check_flag(ldcp, INBOUND, VSW_RDX_ACK_RECV))
2155			return;
2156
2157		ldcp->lane_in.lstate |= VSW_RDX_ACK_RECV;
2158		D2(vswp, "%s (%ld) handling RDX_ACK in place "
2159		    "(ostate 0x%llx : hphase %d)", __func__,
2160		    ldcp->ldc_id, ldcp->lane_in.lstate, ldcp->hphase);
2161		vsw_next_milestone(ldcp);
2162		return;
2163	}
2164
2165	ctaskp = kmem_alloc(sizeof (vsw_ctrl_task_t), KM_NOSLEEP);
2166
2167	if (ctaskp == NULL) {
2168		DERR(vswp, "%s: unable to alloc space for ctrl msg", __func__);
2169		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
2170		return;
2171	}
2172
2173	ctaskp->ldcp = ldcp;
2174	bcopy((def_msg_t *)cpkt, &ctaskp->pktp, sizeof (def_msg_t));
2175	ctaskp->hss_id = ldcp->hss_id;
2176
2177	/*
2178	 * Dispatch task to processing taskq if port is not in
2179	 * the process of being detached.
2180	 */
2181	mutex_enter(&port->state_lock);
2182	if (port->state == VSW_PORT_INIT) {
2183		if ((vswp->taskq_p == NULL) ||
2184		    (ddi_taskq_dispatch(vswp->taskq_p, vsw_process_ctrl_pkt,
2185		    ctaskp, DDI_NOSLEEP) != DDI_SUCCESS)) {
2186			DERR(vswp, "%s: unable to dispatch task to taskq",
2187			    __func__);
2188			kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
2189			mutex_exit(&port->state_lock);
2190			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
2191			return;
2192		}
2193	} else {
2194		DWARN(vswp, "%s: port %d detaching, not dispatching "
2195		    "task", __func__, port->p_instance);
		kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
2196	}
2197
2198	mutex_exit(&port->state_lock);
2199
2200	D2(vswp, "%s: ctrl pkt handling done for chan %lld", __func__,
2201	    ldcp->ldc_id);
2202	D1(vswp, "%s: exit", __func__);
2203}
2204
2205/*
2206 * Process a VIO ctrl message. Invoked from taskq.
2207 */
2208static void
2209vsw_process_ctrl_pkt(void *arg)
2210{
2211	vsw_ctrl_task_t	*ctaskp = (vsw_ctrl_task_t *)arg;
2212	vsw_ldc_t	*ldcp = ctaskp->ldcp;
2213	vsw_t 		*vswp = ldcp->ldc_vswp;
2214	vio_msg_tag_t	tag;
2215	uint16_t	env;
2216
2217	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
2218
2219	bcopy(&ctaskp->pktp, &tag, sizeof (vio_msg_tag_t));
2220	env = tag.vio_subtype_env;
2221
2222	/* stale pkt check */
2223	if (ctaskp->hss_id < ldcp->hss_id) {
2224		DWARN(vswp, "%s: discarding stale packet belonging to earlier"
2225		    " (%ld) handshake session", __func__, ctaskp->hss_id);
		kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
2226		return;
2227	}
2228
2229	/* session id check */
2230	if (ldcp->session_status & VSW_PEER_SESSION) {
2231		if (ldcp->peer_session != tag.vio_sid) {
2232			DERR(vswp, "%s (chan %d): invalid session id (%llx)",
2233			    __func__, ldcp->ldc_id, tag.vio_sid);
2234			kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
2235			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
2236			return;
2237		}
2238	}
2239
2240	/*
2241	 * Switch on vio_subtype envelope, then let lower routines
2242	 * decide if it's an INFO, ACK or NACK packet.
2243	 */
2244	switch (env) {
2245	case VIO_VER_INFO:
2246		vsw_process_ctrl_ver_pkt(ldcp, &ctaskp->pktp);
2247		break;
2248	case VIO_DRING_REG:
2249		vsw_process_ctrl_dring_reg_pkt(ldcp, &ctaskp->pktp);
2250		break;
2251	case VIO_DRING_UNREG:
2252		vsw_process_ctrl_dring_unreg_pkt(ldcp, &ctaskp->pktp);
2253		break;
2254	case VIO_ATTR_INFO:
2255		vsw_process_ctrl_attr_pkt(ldcp, &ctaskp->pktp);
2256		break;
2257	case VNET_MCAST_INFO:
2258		vsw_process_ctrl_mcst_pkt(ldcp, &ctaskp->pktp);
2259		break;
2260	case VIO_RDX:
2261		vsw_process_ctrl_rdx_pkt(ldcp, &ctaskp->pktp);
2262		break;
2263	case VIO_DDS_INFO:
2264		vsw_process_dds_msg(vswp, ldcp, &ctaskp->pktp);
2265		break;
2266	default:
2267		DERR(vswp, "%s: unknown vio_subtype_env (%x)\n", __func__, env);
2268	}
2269
2270	kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
2271	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
2272}
2273
2274/*
2275 * Version negotiation. We can end up here either because our peer
2276 * has responded to a handshake message we have sent it, or our peer
2277 * has initiated a handshake with us. If it's the former then it can only
2278 * be ACK or NACK; if it's the latter it can only be INFO.
2279 *
2280 * If its an ACK we move to the next stage of the handshake, namely
2281 * attribute exchange. If its a NACK we see if we can specify another
2282 * version, if we can't we stop.
2283 *
2284 * If it is an INFO we reset all params associated with communication
2285 * in that direction over this channel (remember the connection is
2286 * essentially 2 independent simplex channels).
2287 */
2288void
2289vsw_process_ctrl_ver_pkt(vsw_ldc_t *ldcp, void *pkt)
2290{
2291	vio_ver_msg_t	*ver_pkt;
2292	vsw_t 		*vswp = ldcp->ldc_vswp;
2293
2294	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
2295
2296	/*
2297	 * We know this is a ctrl/version packet so
2298	 * cast it into the correct structure.
2299	 */
2300	ver_pkt = (vio_ver_msg_t *)pkt;
2301
2302	switch (ver_pkt->tag.vio_subtype) {
2303	case VIO_SUBTYPE_INFO:
2304		D2(vswp, "vsw_process_ctrl_ver_pkt: VIO_SUBTYPE_INFO\n");
2305
2306		/*
2307		 * Record the session id, which we will use from now
2308		 * until we see another VER_INFO msg. Even then the
2309		 * session id in most cases will be unchanged, execpt
2310		 * session id in most cases will be unchanged, except
2311		 * if the channel was reset.
2312		if ((ldcp->session_status & VSW_PEER_SESSION) &&
2313		    (ldcp->peer_session != ver_pkt->tag.vio_sid)) {
2314			DERR(vswp, "%s: updating session id for chan %lld "
2315			    "from %llx to %llx", __func__, ldcp->ldc_id,
2316			    ldcp->peer_session, ver_pkt->tag.vio_sid);
2317		}
2318
2319		ldcp->peer_session = ver_pkt->tag.vio_sid;
2320		ldcp->session_status |= VSW_PEER_SESSION;
2321
2322		/* Legal message at this time ? */
2323		if (vsw_check_flag(ldcp, INBOUND, VSW_VER_INFO_RECV))
2324			return;
2325
2326		/*
2327		 * First check the device class. Currently only expect
2328		 * to be talking to a network device. In the future may
2329		 * also talk to another switch.
2330		 */
2331		if (ver_pkt->dev_class != VDEV_NETWORK) {
2332			DERR(vswp, "%s: illegal device class %d", __func__,
2333			    ver_pkt->dev_class);
2334
2335			ver_pkt->tag.vio_sid = ldcp->local_session;
2336			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
2337
2338			DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt);
2339
2340			(void) vsw_send_msg(ldcp, (void *)ver_pkt,
2341			    sizeof (vio_ver_msg_t), B_TRUE);
2342
2343			ldcp->lane_in.lstate |= VSW_VER_NACK_SENT;
2344			vsw_next_milestone(ldcp);
2345			return;
2346		} else {
2347			ldcp->dev_class = ver_pkt->dev_class;
2348		}
2349
2350		/*
2351		 * Now check the version.
2352		 */
2353		if (vsw_supported_version(ver_pkt) == 0) {
2354			/*
2355			 * Support this major version and possibly
2356			 * adjusted minor version.
2357			 */
2358
2359			D2(vswp, "%s: accepted ver %d:%d", __func__,
2360			    ver_pkt->ver_major, ver_pkt->ver_minor);
2361
2362			/* Store accepted values */
2363			ldcp->lane_in.ver_major = ver_pkt->ver_major;
2364			ldcp->lane_in.ver_minor = ver_pkt->ver_minor;
2365
2366			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
2367
2368			ldcp->lane_in.lstate |= VSW_VER_ACK_SENT;
2369
2370			if (vsw_obp_ver_proto_workaround == B_TRUE) {
2371				/*
2372				 * Send a version info message
2373				 * using the accepted version that
2374				 * we are about to ack. Also note that
2375				 * we send our ver info before we ack.
2376				 * Otherwise, as soon as receiving the
2377				 * ack, obp sends attr info msg, which
2378				 * breaks vsw_check_flag() invoked
2379				 * from vsw_process_ctrl_attr_pkt();
2380				 * as we also need VSW_VER_ACK_RECV to
2381				 * be set in lane_out.lstate, before
2382				 * we can receive attr info.
2383				 */
2384				vsw_send_ver(ldcp);
2385			}
2386		} else {
2387			/*
2388			 * NACK back with the next lower major/minor
2389			 * pairing we support (if we don't support any more
2390			 * versions then they will be set to zero).
2391			 */
2392
2393			D2(vswp, "%s: replying with ver %d:%d", __func__,
2394			    ver_pkt->ver_major, ver_pkt->ver_minor);
2395
2396			/* Store updated values */
2397			ldcp->lane_in.ver_major = ver_pkt->ver_major;
2398			ldcp->lane_in.ver_minor = ver_pkt->ver_minor;
2399
2400			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
2401
2402			ldcp->lane_in.lstate |= VSW_VER_NACK_SENT;
2403		}
2404
2405		DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt);
2406		ver_pkt->tag.vio_sid = ldcp->local_session;
2407		(void) vsw_send_msg(ldcp, (void *)ver_pkt,
2408		    sizeof (vio_ver_msg_t), B_TRUE);
2409
2410		vsw_next_milestone(ldcp);
2411		break;
2412
2413	case VIO_SUBTYPE_ACK:
2414		D2(vswp, "%s: VIO_SUBTYPE_ACK\n", __func__);
2415
2416		if (vsw_check_flag(ldcp, OUTBOUND, VSW_VER_ACK_RECV))
2417			return;
2418
2419		/* Store updated values */
2420		ldcp->lane_out.ver_major = ver_pkt->ver_major;
2421		ldcp->lane_out.ver_minor = ver_pkt->ver_minor;
2422
2423		ldcp->lane_out.lstate |= VSW_VER_ACK_RECV;
2424		vsw_next_milestone(ldcp);
2425
2426		break;
2427
2428	case VIO_SUBTYPE_NACK:
2429		D2(vswp, "%s: VIO_SUBTYPE_NACK\n", __func__);
2430
2431		if (vsw_check_flag(ldcp, OUTBOUND, VSW_VER_NACK_RECV))
2432			return;
2433
2434		/*
2435		 * If our peer sent us a NACK with the ver fields set to
2436		 * zero then there is nothing more we can do. Otherwise see
2437		 * if we support either the version suggested, or a lesser
2438		 * one.
2439		 */
2440		if ((ver_pkt->ver_major == 0) && (ver_pkt->ver_minor == 0)) {
2441			DERR(vswp, "%s: peer unable to negotiate any "
2442			    "further.", __func__);
2443			ldcp->lane_out.lstate |= VSW_VER_NACK_RECV;
2444			vsw_next_milestone(ldcp);
2445			return;
2446		}
2447
2448		/*
2449		 * Check to see if we support this major version or
2450		 * a lower one. If we don't then maj/min will be set
2451		 * to zero.
2452		 */
2453		(void) vsw_supported_version(ver_pkt);
2454		if ((ver_pkt->ver_major == 0) && (ver_pkt->ver_minor == 0)) {
2455			/* Nothing more we can do */
2456			DERR(vswp, "%s: version negotiation failed.\n",
2457			    __func__);
2458			ldcp->lane_out.lstate |= VSW_VER_NACK_RECV;
2459			vsw_next_milestone(ldcp);
2460		} else {
2461			/* found a supported major version */
2462			ldcp->lane_out.ver_major = ver_pkt->ver_major;
2463			ldcp->lane_out.ver_minor = ver_pkt->ver_minor;
2464
2465			D2(vswp, "%s: resending with updated values (%x, %x)",
2466			    __func__, ver_pkt->ver_major, ver_pkt->ver_minor);
2467
2468			ldcp->lane_out.lstate |= VSW_VER_INFO_SENT;
2469			ver_pkt->tag.vio_sid = ldcp->local_session;
2470			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_INFO;
2471
2472			DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt);
2473
2474			(void) vsw_send_msg(ldcp, (void *)ver_pkt,
2475			    sizeof (vio_ver_msg_t), B_TRUE);
2476
2477			vsw_next_milestone(ldcp);
2478
2479		}
2480		break;
2481
2482	default:
2483		DERR(vswp, "%s: unknown vio_subtype %x\n", __func__,
2484		    ver_pkt->tag.vio_subtype);
2485	}
2486
2487	D1(vswp, "%s(%lld): exit\n", __func__, ldcp->ldc_id);
2488}
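
/*
 * Illustrative exchange for one direction of the handshake (version
 * numbers assumed, with 1.4 as our highest supported version):
 *
 *	peer				vsw
 *	 | --- VER INFO 2.0 ------->  |	vsw_supported_version() != 0,
 *	 | <------- VER NACK 1.4 ---  |	message rewritten to 1.4
 *	 | --- VER INFO 1.4 ------->  |	accepted
 *	 | <-------- VER ACK 1.4 ---  |	VSW_VER_ACK_SENT set on lane_in
 *
 * The same negotiation runs independently for our outbound lane.
 */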
2489
2490/*
2491 * Process an attribute packet. We can end up here either because our peer
2492 * has ACK/NACK'ed back to an earlier ATTR msg we had sent it, or our
2493 * peer has sent us an attribute INFO message
2494 *
2495 * If its an ACK we then move to the next stage of the handshake which
2496 * is to send our descriptor ring info to our peer. If its a NACK then
2497 * there is nothing more we can (currently) do.
2498 *
2499 * If we get a valid/acceptable INFO packet (and we have already negotiated
2500 * a version) we ACK back and set channel state to ATTR_RECV, otherwise we
2501 * NACK back and reset channel state to INACTIV.
2502 *
2503 * FUTURE: in time we will probably negotiate over attributes, but for
2504 * the moment unacceptable attributes are regarded as a fatal error.
2505 *
2506 */
2507void
2508vsw_process_ctrl_attr_pkt(vsw_ldc_t *ldcp, void *pkt)
2509{
2510	vnet_attr_msg_t		*attr_pkt;
2511	vsw_t			*vswp = ldcp->ldc_vswp;
2512	vsw_port_t		*port = ldcp->ldc_port;
2513	uint64_t		macaddr = 0;
2514	int			i;
2515
2516	D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);
2517
2518	/*
2519	 * We know this is a ctrl/attr packet so
2520	 * cast it into the correct structure.
2521	 */
2522	attr_pkt = (vnet_attr_msg_t *)pkt;
2523
2524	switch (attr_pkt->tag.vio_subtype) {
2525	case VIO_SUBTYPE_INFO:
2526		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
2527
2528		if (vsw_check_flag(ldcp, INBOUND, VSW_ATTR_INFO_RECV))
2529			return;
2530
2531		/*
2532		 * If the attributes are unacceptable then we NACK back.
2533		 */
2534		if (vsw_check_attr(attr_pkt, ldcp)) {
2535
2536			DERR(vswp, "%s (chan %d): invalid attributes",
2537			    __func__, ldcp->ldc_id);
2538
2539			vsw_free_lane_resources(ldcp, INBOUND);
2540
2541			attr_pkt->tag.vio_sid = ldcp->local_session;
2542			attr_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
2543
2544			DUMP_TAG_PTR((vio_msg_tag_t *)attr_pkt);
2545			ldcp->lane_in.lstate |= VSW_ATTR_NACK_SENT;
2546			(void) vsw_send_msg(ldcp, (void *)attr_pkt,
2547			    sizeof (vnet_attr_msg_t), B_TRUE);
2548
2549			vsw_next_milestone(ldcp);
2550			return;
2551		}
2552
2553		/*
2554		 * Otherwise store attributes for this lane and update
2555		 * lane state.
2556		 */
2557		ldcp->lane_in.mtu = attr_pkt->mtu;
2558		ldcp->lane_in.addr = attr_pkt->addr;
2559		ldcp->lane_in.addr_type = attr_pkt->addr_type;
2560		ldcp->lane_in.xfer_mode = attr_pkt->xfer_mode;
2561		ldcp->lane_in.ack_freq = attr_pkt->ack_freq;
2562
2563		macaddr = ldcp->lane_in.addr;
2564		for (i = ETHERADDRL - 1; i >= 0; i--) {
2565			port->p_macaddr.ether_addr_octet[i] = macaddr & 0xFF;
2566			macaddr >>= 8;
2567		}
2568
2569		/* create the fdb entry for this port/mac address */
2570		vsw_fdbe_add(vswp, port);
2571
2572		/* add the port to the specified vlans */
2573		vsw_vlan_add_ids(port, VSW_VNETPORT);
2574
2575		/* set up device-specific xmit routines */
2576		mutex_enter(&port->tx_lock);
2577		if ((VSW_VER_GTEQ(ldcp, 1, 2) &&
2578		    (ldcp->lane_in.xfer_mode & VIO_DRING_MODE_V1_2)) ||
2579		    (VSW_VER_LT(ldcp, 1, 2) &&
2580		    (ldcp->lane_in.xfer_mode == VIO_DRING_MODE_V1_0))) {
2581			D2(vswp, "%s: mode = VIO_DRING_MODE", __func__);
2582			port->transmit = vsw_dringsend;
2583		} else if (ldcp->lane_in.xfer_mode == VIO_DESC_MODE) {
2584			D2(vswp, "%s: mode = VIO_DESC_MODE", __func__);
2585			vsw_create_privring(ldcp);
2586			port->transmit = vsw_descrsend;
2587			ldcp->lane_out.xfer_mode = VIO_DESC_MODE;
2588		}
2589
2590		/*
2591		 * HybridIO is supported only by vnet, not by OBP.
2592		 * So, set hio_capable to true only when in DRING mode.
2593		 */
2594		if (VSW_VER_GTEQ(ldcp, 1, 3) &&
2595		    (ldcp->lane_in.xfer_mode != VIO_DESC_MODE)) {
2596			(void) atomic_swap_32(&port->p_hio_capable, B_TRUE);
2597		} else {
2598			(void) atomic_swap_32(&port->p_hio_capable, B_FALSE);
2599		}
2600
2601		mutex_exit(&port->tx_lock);
2602
2603		attr_pkt->tag.vio_sid = ldcp->local_session;
2604		attr_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
2605
2606		DUMP_TAG_PTR((vio_msg_tag_t *)attr_pkt);
2607
2608		ldcp->lane_in.lstate |= VSW_ATTR_ACK_SENT;
2609
2610		(void) vsw_send_msg(ldcp, (void *)attr_pkt,
2611		    sizeof (vnet_attr_msg_t), B_TRUE);
2612
2613		vsw_next_milestone(ldcp);
2614		break;
2615
2616	case VIO_SUBTYPE_ACK:
2617		D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
2618
2619		if (vsw_check_flag(ldcp, OUTBOUND, VSW_ATTR_ACK_RECV))
2620			return;
2621
2622		ldcp->lane_out.lstate |= VSW_ATTR_ACK_RECV;
2623		vsw_next_milestone(ldcp);
2624		break;
2625
2626	case VIO_SUBTYPE_NACK:
2627		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
2628
2629		if (vsw_check_flag(ldcp, OUTBOUND, VSW_ATTR_NACK_RECV))
2630			return;
2631
2632		ldcp->lane_out.lstate |= VSW_ATTR_NACK_RECV;
2633		vsw_next_milestone(ldcp);
2634		break;
2635
2636	default:
2637		DERR(vswp, "%s: unknown vio_subtype %x\n", __func__,
2638		    attr_pkt->tag.vio_subtype);
2639	}
2640
2641	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
2642}
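
/*
 * Transmit mode selection performed in the INFO case above:
 *
 *	ver >= 1.2 and (xfer_mode & VIO_DRING_MODE_V1_2)  -> vsw_dringsend
 *	ver <  1.2 and xfer_mode == VIO_DRING_MODE_V1_0   -> vsw_dringsend
 *	xfer_mode == VIO_DESC_MODE (in-band, e.g. OBP)    -> vsw_descrsend
 *							     (private ring)
 */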
2643
2644/*
2645 * Process a dring info packet. We can end up here either because our peer
2646 * has ACK/NACK'ed back to an earlier DRING msg we had sent it, or our
2647 * peer has sent us a dring INFO message.
2648 *
2649 * If we get a valid/acceptable INFO packet (and we have already negotiated
2650 * a version) we ACK back and update the lane state, otherwise we NACK back.
2651 *
2652 * FUTURE: nothing stops the client from sending us info on multiple drings,
2653 * but for the moment we will just use the first one we are given.
2654 *
2655 */
2656void
2657vsw_process_ctrl_dring_reg_pkt(vsw_ldc_t *ldcp, void *pkt)
2658{
2659	vio_dring_reg_msg_t	*dring_pkt;
2660	vsw_t			*vswp = ldcp->ldc_vswp;
2661	ldc_mem_info_t		minfo;
2662	dring_info_t		*dp, *dbp;
2663	int			dring_found = 0;
2664
2665	/*
2666	 * We know this is a ctrl/dring packet so
2667	 * cast it into the correct structure.
2668	 */
2669	dring_pkt = (vio_dring_reg_msg_t *)pkt;
2670
2671	D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);
2672
2673	switch (dring_pkt->tag.vio_subtype) {
2674	case VIO_SUBTYPE_INFO:
2675		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
2676
2677		if (vsw_check_flag(ldcp, INBOUND, VSW_DRING_INFO_RECV))
2678			return;
2679
2680		/*
2681		 * If the dring params are unacceptable then we NACK back.
2682		 */
2683		if (vsw_check_dring_info(dring_pkt)) {
2684
2685			DERR(vswp, "%s (%lld): invalid dring info",
2686			    __func__, ldcp->ldc_id);
2687
2688			vsw_free_lane_resources(ldcp, INBOUND);
2689
2690			dring_pkt->tag.vio_sid = ldcp->local_session;
2691			dring_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
2692
2693			DUMP_TAG_PTR((vio_msg_tag_t *)dring_pkt);
2694
2695			ldcp->lane_in.lstate |= VSW_DRING_NACK_SENT;
2696
2697			(void) vsw_send_msg(ldcp, (void *)dring_pkt,
2698			    sizeof (vio_dring_reg_msg_t), B_TRUE);
2699
2700			vsw_next_milestone(ldcp);
2701			return;
2702		}
2703
2704		/*
2705		 * Otherwise, attempt to map in the dring using the
2706		 * cookie. If that succeeds we send back a unique dring
2707		 * identifier that the sending side will use in future
2708		 * to refer to this descriptor ring.
2709		 */
2710		dp = kmem_zalloc(sizeof (dring_info_t), KM_SLEEP);
2711
2712		dp->num_descriptors = dring_pkt->num_descriptors;
2713		dp->descriptor_size = dring_pkt->descriptor_size;
2714		dp->options = dring_pkt->options;
2715		dp->ncookies = dring_pkt->ncookies;
2716
2717		/*
2718		 * Note: should only get one cookie. Enforced in
2719		 * the ldc layer.
2720		 */
2721		bcopy(&dring_pkt->cookie[0], &dp->cookie[0],
2722		    sizeof (ldc_mem_cookie_t));
2723
2724		D2(vswp, "%s: num_desc %ld : desc_size %ld", __func__,
2725		    dp->num_descriptors, dp->descriptor_size);
2726		D2(vswp, "%s: options 0x%lx: ncookies %ld", __func__,
2727		    dp->options, dp->ncookies);
2728
2729		if ((ldc_mem_dring_map(ldcp->ldc_handle, &dp->cookie[0],
2730		    dp->ncookies, dp->num_descriptors, dp->descriptor_size,
2731		    LDC_SHADOW_MAP, &(dp->handle))) != 0) {
2732
2733			DERR(vswp, "%s: dring_map failed\n", __func__);
2734
2735			kmem_free(dp, sizeof (dring_info_t));
2736			vsw_free_lane_resources(ldcp, INBOUND);
2737
2738			dring_pkt->tag.vio_sid = ldcp->local_session;
2739			dring_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
2740
2741			DUMP_TAG_PTR((vio_msg_tag_t *)dring_pkt);
2742
2743			ldcp->lane_in.lstate |= VSW_DRING_NACK_SENT;
2744			(void) vsw_send_msg(ldcp, (void *)dring_pkt,
2745			    sizeof (vio_dring_reg_msg_t), B_TRUE);
2746
2747			vsw_next_milestone(ldcp);
2748			return;
2749		}
2750
2751		if ((ldc_mem_dring_info(dp->handle, &minfo)) != 0) {
2752
2753			DERR(vswp, "%s: dring_addr failed\n", __func__);
2754
2755			kmem_free(dp, sizeof (dring_info_t));
2756			vsw_free_lane_resources(ldcp, INBOUND);
2757
2758			dring_pkt->tag.vio_sid = ldcp->local_session;
2759			dring_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
2760
2761			DUMP_TAG_PTR((vio_msg_tag_t *)dring_pkt);
2762
2763			ldcp->lane_in.lstate |= VSW_DRING_NACK_SENT;
2764			(void) vsw_send_msg(ldcp, (void *)dring_pkt,
2765			    sizeof (vio_dring_reg_msg_t), B_TRUE);
2766
2767			vsw_next_milestone(ldcp);
2768			return;
2769		} else {
2770			/* store the address of the pub part of ring */
2771			dp->pub_addr = minfo.vaddr;
2772		}
2773
2774		/* no private section as we are importing */
2775		dp->priv_addr = NULL;
2776
2777		/*
2778		 * Using simple mono increasing int for ident at
2779		 * Using a simple monotonically increasing integer for
2780		 * the ident at the moment.
2781		dp->ident = ldcp->next_ident;
2782		ldcp->next_ident++;
2783
2784		dp->end_idx = 0;
2785		dp->next = NULL;
2786
2787		/*
2788		 * Link it onto the end of the list of drings
2789		 * for this lane.
2790		 */
2791		if (ldcp->lane_in.dringp == NULL) {
2792			D2(vswp, "%s: adding first INBOUND dring", __func__);
2793			ldcp->lane_in.dringp = dp;
2794		} else {
2795			dbp = ldcp->lane_in.dringp;
2796
2797			while (dbp->next != NULL)
2798				dbp = dbp->next;
2799
2800			dbp->next = dp;
2801		}
2802
2803		/* acknowledge it */
2804		dring_pkt->tag.vio_sid = ldcp->local_session;
2805		dring_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
2806		dring_pkt->dring_ident = dp->ident;
2807
2808		(void) vsw_send_msg(ldcp, (void *)dring_pkt,
2809		    sizeof (vio_dring_reg_msg_t), B_TRUE);
2810
2811		ldcp->lane_in.lstate |= VSW_DRING_ACK_SENT;
2812		vsw_next_milestone(ldcp);
2813		break;
2814
2815	case VIO_SUBTYPE_ACK:
2816		D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
2817
2818		if (vsw_check_flag(ldcp, OUTBOUND, VSW_DRING_ACK_RECV))
2819			return;
2820
2821		/*
2822		 * Peer is acknowledging our dring info and will have
2823		 * sent us a dring identifier which we will use to
2824		 * refer to this ring w.r.t. our peer.
2825		 */
2826		dp = ldcp->lane_out.dringp;
2827		if (dp != NULL) {
2828			/*
2829			 * Walk the list to find the ring this ident
2830			 * should be associated with.
2831			 */
2832			while (dp != NULL) {
2833				if (vsw_dring_match(dp, dring_pkt)) {
2834					dring_found = 1;
2835					break;
2836				}
2837				dp = dp->next;
2838			}
2842
2843			if (dring_found == 0) {
2844				DERR(vswp, "%s: unrecognised ring cookie",
2845				    __func__);
2846				vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
2847				return;
2848			}
2849
2850		} else {
2851			DERR(vswp, "%s: DRING ACK received but no drings "
2852			    "allocated", __func__);
2853			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
2854			return;
2855		}
2856
2857		/* store ident */
2858		dp->ident = dring_pkt->dring_ident;
2859		ldcp->lane_out.lstate |= VSW_DRING_ACK_RECV;
2860		vsw_next_milestone(ldcp);
2861		break;
2862
2863	case VIO_SUBTYPE_NACK:
2864		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
2865
2866		if (vsw_check_flag(ldcp, OUTBOUND, VSW_DRING_NACK_RECV))
2867			return;
2868
2869		ldcp->lane_out.lstate |= VSW_DRING_NACK_RECV;
2870		vsw_next_milestone(ldcp);
2871		break;
2872
2873	default:
2874		DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
2875		    dring_pkt->tag.vio_subtype);
2876	}
2877
2878	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
2879}
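
/*
 * Registration flow for the INFO case above, in brief: the peer's
 * DRING_REG message supplies (num_descriptors, descriptor_size,
 * options, ncookies, cookie[0]); we map the ring with
 * ldc_mem_dring_map(), record minfo.vaddr as dp->pub_addr, assign a
 * monotonically increasing dp->ident and ACK that ident back so the
 * peer can quote it in subsequent VIO_DRING_DATA messages.
 */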
2880
2881/*
2882 * Process a request from peer to unregister a dring.
2883 *
2884 * For the moment we just restart the handshake if our
2885 * peer endpoint attempts to unregister a dring.
2886 */
2887void
2888vsw_process_ctrl_dring_unreg_pkt(vsw_ldc_t *ldcp, void *pkt)
2889{
2890	vsw_t			*vswp = ldcp->ldc_vswp;
2891	vio_dring_unreg_msg_t	*dring_pkt;
2892
2893	/*
2894	 * We know this is a ctrl/dring packet so
2895	 * cast it into the correct structure.
2896	 */
2897	dring_pkt = (vio_dring_unreg_msg_t *)pkt;
2898
2899	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
2900
2901	switch (dring_pkt->tag.vio_subtype) {
2902	case VIO_SUBTYPE_INFO:
2903		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
2904
2905		DWARN(vswp, "%s: restarting handshake..", __func__);
2906		break;
2907
2908	case VIO_SUBTYPE_ACK:
2909		D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
2910
2911		DWARN(vswp, "%s: restarting handshake..", __func__);
2912		break;
2913
2914	case VIO_SUBTYPE_NACK:
2915		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
2916
2917		DWARN(vswp, "%s: restarting handshake..", __func__);
2918		break;
2919
2920	default:
2921		DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
2922		    dring_pkt->tag.vio_subtype);
2923	}
2924
2925	vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
2926
2927	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
2928}
2929
2930#define	SND_MCST_NACK(ldcp, pkt) \
2931	do { pkt->tag.vio_subtype = VIO_SUBTYPE_NACK; \
2932	pkt->tag.vio_sid = ldcp->local_session; \
2933	(void) vsw_send_msg(ldcp, (void *)pkt, \
2934	    sizeof (vnet_mcast_msg_t), B_TRUE); } while (0)
2935
2936/*
2937 * Process a multicast request from a vnet.
2938 *
2939 * Vnets specify a multicast address that they are interested in. This
2940 * address is used as a key into the hash table which forms the multicast
2941 * forwarding database (mFDB).
2942 *
2943 * The table keys are the multicast addresses, while the table entries
2944 * are pointers to lists of ports which wish to receive packets for the
2945 * specified multicast address.
2946 *
2947 * When a multicast packet is being switched we use the address as a key
2948 * into the hash table, and then walk the appropriate port list forwarding
2949 * the pkt to each port in turn.
2950 *
2951 * If a vnet is no longer interested in a particular multicast grouping
2952 * we simply find the correct location in the hash table and then delete
2953 * the relevant port from the port list.
2954 *
2955 * To deal with the case whereby a port is being deleted without first
2956 * removing itself from the lists in the hash table, we maintain a list
2957 * of multicast addresses the port has registered an interest in, within
2958 * the port structure itself. We then simply walk that list of addresses
2959 * using them as keys into the hash table and remove the port from the
2960 * appropriate lists.
2961 */
2962static void
2963vsw_process_ctrl_mcst_pkt(vsw_ldc_t *ldcp, void *pkt)
2964{
2965	vnet_mcast_msg_t	*mcst_pkt;
2966	vsw_port_t		*port = ldcp->ldc_port;
2967	vsw_t			*vswp = ldcp->ldc_vswp;
2968	int			i;
2969
2970	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
2971
2972	/*
2973	 * We know this is a ctrl/mcast packet so
2974	 * cast it into the correct structure.
2975	 */
2976	mcst_pkt = (vnet_mcast_msg_t *)pkt;
2977
2978	switch (mcst_pkt->tag.vio_subtype) {
2979	case VIO_SUBTYPE_INFO:
2980		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
2981
2982		/*
2983		 * Check if in correct state to receive a multicast
2984		 * message (i.e. handshake complete). If not reset
2985		 * the handshake.
2986		 */
2987		if (vsw_check_flag(ldcp, INBOUND, VSW_MCST_INFO_RECV))
2988			return;
2989
2990		/*
2991		 * Before attempting to add or remove addresses, check
2992		 * that they are valid multicast addresses.
2993		 * If not, then NACK back.
2994		 */
2995		for (i = 0; i < mcst_pkt->count; i++) {
2996			if ((mcst_pkt->mca[i].ether_addr_octet[0] & 01) != 1) {
2997				DERR(vswp, "%s: invalid multicast address",
2998				    __func__);
2999				SND_MCST_NACK(ldcp, mcst_pkt);
3000				return;
3001			}
3002		}
3003
3004		/*
3005		 * Now add/remove the addresses. If this fails we
3006		 * NACK back.
3007		 */
3008		if (vsw_add_rem_mcst(mcst_pkt, port) != 0) {
3009			SND_MCST_NACK(ldcp, mcst_pkt);
3010			return;
3011		}
3012
3013		mcst_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
3014		mcst_pkt->tag.vio_sid = ldcp->local_session;
3015
3016		DUMP_TAG_PTR((vio_msg_tag_t *)mcst_pkt);
3017
3018		(void) vsw_send_msg(ldcp, (void *)mcst_pkt,
3019		    sizeof (vnet_mcast_msg_t), B_TRUE);
3020		break;
3021
3022	case VIO_SUBTYPE_ACK:
3023		DWARN(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
3024
3025		/*
3026		 * We shouldn't ever get a multicast ACK message as
3027		 * at the moment we never request multicast addresses
3028		 * to be set on some other device. This may change in
3029		 * the future if we have cascading switches.
3030		 */
3031		if (vsw_check_flag(ldcp, OUTBOUND, VSW_MCST_ACK_RECV))
3032			return;
3033
3034		/* Do nothing */
3035		break;
3036
3037	case VIO_SUBTYPE_NACK:
3038		DWARN(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
3039
3040		/*
3041		 * We shouldn't get a multicast NACK packet for the
3042		 * same reasons as we shouldn't get a ACK packet.
3043		 */
3044		if (vsw_check_flag(ldcp, OUTBOUND, VSW_MCST_NACK_RECV))
3045			return;
3046
3047		/* Do nothing */
3048		break;
3049
3050	default:
3051		DERR(vswp, "%s: unknown vio_subtype %x\n", __func__,
3052		    mcst_pkt->tag.vio_subtype);
3053	}
3054
3055	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
3056}
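
/*
 * mFDB sketch (addresses and port names illustrative; the real table
 * is the hash table described in the block comment above):
 *
 *	multicast addr (key)	->	list of interested ports
 *	01:00:5e:00:00:01	->	{ port1, port3 }
 *	01:00:5e:00:01:81	->	{ port2 }
 *
 * vsw_add_rem_mcst() performs the add/delete on behalf of the INFO
 * handling above.
 */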
3057
3058static void
3059vsw_process_ctrl_rdx_pkt(vsw_ldc_t *ldcp, void *pkt)
3060{
3061	vio_rdx_msg_t	*rdx_pkt;
3062	vsw_t		*vswp = ldcp->ldc_vswp;
3063
3064	/*
3065	 * We know this is a ctrl/rdx packet so
3066	 * cast it into the correct structure.
3067	 */
3068	rdx_pkt = (vio_rdx_msg_t *)pkt;
3069
3070	D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);
3071
3072	switch (rdx_pkt->tag.vio_subtype) {
3073	case VIO_SUBTYPE_INFO:
3074		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
3075
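		/*
		 * Note: unlike other INFO messages, an RDX INFO is
		 * validated against the OUTBOUND lane state, and the
		 * ACK we send is recorded in lane_out below.
		 */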
3076		if (vsw_check_flag(ldcp, OUTBOUND, VSW_RDX_INFO_RECV))
3077			return;
3078
3079		rdx_pkt->tag.vio_sid = ldcp->local_session;
3080		rdx_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
3081
3082		DUMP_TAG_PTR((vio_msg_tag_t *)rdx_pkt);
3083
3084		ldcp->lane_out.lstate |= VSW_RDX_ACK_SENT;
3085
3086		(void) vsw_send_msg(ldcp, (void *)rdx_pkt,
3087		    sizeof (vio_rdx_msg_t), B_TRUE);
3088
3089		vsw_next_milestone(ldcp);
3090		break;
3091
3092	case VIO_SUBTYPE_ACK:
3093		/*
3094		 * Should be handled in-band by callback handler.
3095		 */
3096		DERR(vswp, "%s: Unexpected VIO_SUBTYPE_ACK", __func__);
3097		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
3098		break;
3099
3100	case VIO_SUBTYPE_NACK:
3101		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
3102
3103		if (vsw_check_flag(ldcp, INBOUND, VSW_RDX_NACK_RECV))
3104			return;
3105
3106		ldcp->lane_in.lstate |= VSW_RDX_NACK_RECV;
3107		vsw_next_milestone(ldcp);
3108		break;
3109
3110	default:
3111		DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
3112		    rdx_pkt->tag.vio_subtype);
3113	}
3114
3115	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
3116}
3117
3118static void
3119vsw_process_data_pkt(vsw_ldc_t *ldcp, void *dpkt, vio_msg_tag_t *tagp,
3120	uint32_t msglen)
3121{
3122	uint16_t	env = tagp->vio_subtype_env;
3123	vsw_t		*vswp = ldcp->ldc_vswp;
3124
3125	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
3126
3127	/* session id check */
3128	if (ldcp->session_status & VSW_PEER_SESSION) {
3129		if (ldcp->peer_session != tagp->vio_sid) {
3130			DERR(vswp, "%s (chan %d): invalid session id (%llx)",
3131			    __func__, ldcp->ldc_id, tagp->vio_sid);
3132			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
3133			return;
3134		}
3135	}
3136
3137	/*
3138	 * It is an error for us to be getting data packets
3139	 * before the handshake has completed.
3140	 */
3141	if (ldcp->hphase != VSW_MILESTONE4) {
3142		DERR(vswp, "%s: got data packet before handshake complete "
3143		    "hphase %d (%x: %x)", __func__, ldcp->hphase,
3144		    ldcp->lane_in.lstate, ldcp->lane_out.lstate);
3145		DUMP_FLAGS(ldcp->lane_in.lstate);
3146		DUMP_FLAGS(ldcp->lane_out.lstate);
3147		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
3148		return;
3149	}
3150
3151	/*
3152	 * To reduce the locking contention, release the
3153	 * ldc_cblock here and re-acquire it once we are done
3154	 * receiving packets.
3155	 */
3156	mutex_exit(&ldcp->ldc_cblock);
3157	mutex_enter(&ldcp->ldc_rxlock);
3158
3159	/*
3160	 * Switch on vio_subtype envelope, then let lower routines
3161	 * decide if its an INFO, ACK or NACK packet.
3162	 */
3163	if (env == VIO_DRING_DATA) {
3164		vsw_process_data_dring_pkt(ldcp, dpkt);
3165	} else if (env == VIO_PKT_DATA) {
3166		ldcp->rx_pktdata(ldcp, dpkt, msglen);
3167	} else if (env == VIO_DESC_DATA) {
3168		vsw_process_data_ibnd_pkt(ldcp, dpkt);
3169	} else {
3170		DERR(vswp, "%s: unknown vio_subtype_env (%x)\n", __func__, env);
3171	}
3172
3173	mutex_exit(&ldcp->ldc_rxlock);
3174	mutex_enter(&ldcp->ldc_cblock);
3175
3176	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
3177}
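
/*
 * Data envelope dispatch used above:
 *
 *	VIO_DRING_DATA -> vsw_process_data_dring_pkt()
 *	VIO_PKT_DATA   -> ldcp->rx_pktdata() (priority frames when
 *			  pri-eth-types are configured; otherwise a nop)
 *	VIO_DESC_DATA  -> vsw_process_data_ibnd_pkt() (in-band descriptors)
 */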
3178
3179#define	SND_DRING_NACK(ldcp, pkt) \
3180	do { pkt->tag.vio_subtype = VIO_SUBTYPE_NACK; \
3181	pkt->tag.vio_sid = ldcp->local_session; \
3182	(void) vsw_send_msg(ldcp, (void *)pkt, \
3183	    sizeof (vio_dring_msg_t), B_TRUE); } while (0)
3184
3185static void
3186vsw_process_data_dring_pkt(vsw_ldc_t *ldcp, void *dpkt)
3187{
3188	vio_dring_msg_t		*dring_pkt;
3189	vnet_public_desc_t	*pub_addr = NULL;
3190	vsw_private_desc_t	*priv_addr = NULL;
3191	dring_info_t		*dp = NULL;
3192	vsw_t			*vswp = ldcp->ldc_vswp;
3193	mblk_t			*mp = NULL;
3194	mblk_t			*bp = NULL;
3195	mblk_t			*bpt = NULL;
3196	size_t			nbytes = 0;
3197	uint64_t		ncookies = 0;
3198	uint64_t		chain = 0;
3199	uint64_t		len;
3200	uint32_t		pos, start, datalen;
3201	uint32_t		range_start, range_end;
3202	int32_t			end, num, cnt = 0;
3203	int			i, rv, msg_rv = 0;
3204	boolean_t		ack_needed = B_FALSE;
3205	boolean_t		prev_desc_ack = B_FALSE;
3206	int			read_attempts = 0;
3207	struct ether_header	*ehp;
3208
3209	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
3210
3211	/*
3212	 * We know this is a data/dring packet so
3213	 * cast it into the correct structure.
3214	 */
3215	dring_pkt = (vio_dring_msg_t *)dpkt;
3216
3217	/*
3218	 * Switch on the vio_subtype. If it's INFO then we need to
3219	 * process the data. If it's an ACK we need to make sure
3220	 * it makes sense (i.e. did we send an earlier data/info),
3221	 * and if it's a NACK then we may attempt a retry.
3222	 */
3223	switch (dring_pkt->tag.vio_subtype) {
3224	case VIO_SUBTYPE_INFO:
3225		D2(vswp, "%s(%lld): VIO_SUBTYPE_INFO", __func__, ldcp->ldc_id);
3226
3227		READ_ENTER(&ldcp->lane_in.dlistrw);
3228		if ((dp = vsw_ident2dring(&ldcp->lane_in,
3229		    dring_pkt->dring_ident)) == NULL) {
3230			RW_EXIT(&ldcp->lane_in.dlistrw);
3231
3232			DERR(vswp, "%s(%lld): unable to find dring from "
3233			    "ident 0x%llx", __func__, ldcp->ldc_id,
3234			    dring_pkt->dring_ident);
3235
3236			SND_DRING_NACK(ldcp, dring_pkt);
3237			return;
3238		}
3239
3240		start = pos = dring_pkt->start_idx;
3241		end = dring_pkt->end_idx;
3242		len = dp->num_descriptors;
3243
3244		range_start = range_end = pos;
3245
3246		D2(vswp, "%s(%lld): start index %ld : end %ld\n",
3247		    __func__, ldcp->ldc_id, start, end);
3248
3249		if (end == -1) {
3250			num = -1;
3251		} else if (end >= 0) {
3252			num = end >= pos ? end - pos + 1: (len - pos + 1) + end;
3253
3254			/* basic sanity check */
3255			if (end >= len) {
3256				RW_EXIT(&ldcp->lane_in.dlistrw);
3257				DERR(vswp, "%s(%lld): endpoint %lld outside "
3258				    "ring length %lld", __func__,
3259				    ldcp->ldc_id, end, len);
3260
3261				SND_DRING_NACK(ldcp, dring_pkt);
3262				return;
3263			}
3264		} else {
3265			RW_EXIT(&ldcp->lane_in.dlistrw);
3266			DERR(vswp, "%s(%lld): invalid endpoint %lld",
3267			    __func__, ldcp->ldc_id, end);
3268			SND_DRING_NACK(ldcp, dring_pkt);
3269			return;
3270		}
3271
3272		while (cnt != num) {
3273vsw_recheck_desc:
3274			if ((rv = ldc_mem_dring_acquire(dp->handle,
3275			    pos, pos)) != 0) {
3276				RW_EXIT(&ldcp->lane_in.dlistrw);
3277				DERR(vswp, "%s(%lld): unable to acquire "
3278				    "descriptor at pos %d: err %d",
3279				    __func__, ldcp->ldc_id, pos, rv);
3280				SND_DRING_NACK(ldcp, dring_pkt);
3281				ldcp->ldc_stats.ierrors++;
3282				return;
3283			}
3284
3285			pub_addr = (vnet_public_desc_t *)dp->pub_addr + pos;
3286
3287			/*
3288			 * When given a bounded range of descriptors
3289			 * to process, it's an error to hit a descriptor
3290			 * which is not ready. In the non-bounded case
3291			 * (end_idx == -1) this simply indicates we have
3292			 * reached the end of the current active range.
3293			 */
3294			if (pub_addr->hdr.dstate != VIO_DESC_READY) {
3295				/* unbound - no error */
3296				if (end == -1) {
3297					if (read_attempts == vsw_read_attempts)
3298						break;
3299
3300					delay(drv_usectohz(vsw_desc_delay));
3301					read_attempts++;
3302					goto vsw_recheck_desc;
3303				}
3304
3305				/* bounded - error - so NACK back */
3306				RW_EXIT(&ldcp->lane_in.dlistrw);
3307				DERR(vswp, "%s(%lld): descriptor not READY "
3308				    "(%d)", __func__, ldcp->ldc_id,
3309				    pub_addr->hdr.dstate);
3310				SND_DRING_NACK(ldcp, dring_pkt);
3311				return;
3312			}
3313
3314			DTRACE_PROBE1(read_attempts, int, read_attempts);
3315
3316			range_end = pos;
3317
3318			/*
3319			 * If we ACK'd the previous descriptor then now
3320			 * record the new range start position for later
3321			 * ACK's.
3322			 */
3323			if (prev_desc_ack) {
3324				range_start = pos;
3325
3326				D2(vswp, "%s(%lld): updating range start to be "
3327				    "%d", __func__, ldcp->ldc_id, range_start);
3328
3329				prev_desc_ack = B_FALSE;
3330			}
3331
3332			/*
3333			 * Data is padded to align on an 8 byte boundary;
3334			 * datalen is the actual data length, i.e. minus that
3335			 * padding.
3336			 */
3337			datalen = pub_addr->nbytes;
3338
3339			/*
3340			 * Does the peer wish us to ACK when we have finished
3341			 * with this descriptor?
3342			 */
3343			if (pub_addr->hdr.ack)
3344				ack_needed = B_TRUE;
3345
3346			D2(vswp, "%s(%lld): processing desc at pos %lld addr"
3347			    " 0x%llx : dstate 0x%lx : datalen 0x%lx",
3348			    __func__, ldcp->ldc_id, pos, pub_addr,
3349			    pub_addr->hdr.dstate, datalen);
3350
3351			/*
3352			 * Mark that we are starting to process descriptor.
3353			 */
3354			pub_addr->hdr.dstate = VIO_DESC_ACCEPTED;
3355
3356			/*
3357			 * Ensure that we ask ldc for an aligned
3358			 * number of bytes.
3359			 */
3360			nbytes = (datalen + VNET_IPALIGN + 7) & ~7;
3361
3362			mp = vio_multipool_allocb(&ldcp->vmp, nbytes);
3363			if (mp == NULL) {
3364				ldcp->ldc_stats.rx_vio_allocb_fail++;
3365				/*
3366				 * No free receive buffers available, so
3367				 * fallback onto allocb(9F). Make sure that
3368				 * we get a data buffer which is a multiple
3369				 * of 8 as this is required by ldc_mem_copy.
3370				 */
3371				DTRACE_PROBE(allocb);
3372				if ((mp = allocb(datalen + VNET_IPALIGN + 8,
3373				    BPRI_MED)) == NULL) {
3374					DERR(vswp, "%s(%ld): allocb failed",
3375					    __func__, ldcp->ldc_id);
3376					pub_addr->hdr.dstate = VIO_DESC_DONE;
3377					(void) ldc_mem_dring_release(dp->handle,
3378					    pos, pos);
3379					ldcp->ldc_stats.ierrors++;
3380					ldcp->ldc_stats.rx_allocb_fail++;
3381					break;
3382				}
3383			}
3384
3385			ncookies = pub_addr->ncookies;
3386			rv = ldc_mem_copy(ldcp->ldc_handle,
3387			    (caddr_t)mp->b_rptr, 0, &nbytes,
3388			    pub_addr->memcookie, ncookies, LDC_COPY_IN);
3389
3390			if (rv != 0) {
3391				DERR(vswp, "%s(%d): unable to copy in data "
3392				    "from %d cookies in desc %d (rv %d)",
3393				    __func__, ldcp->ldc_id, ncookies, pos, rv);
3394				freemsg(mp);
3395
3396				pub_addr->hdr.dstate = VIO_DESC_DONE;
3397				(void) ldc_mem_dring_release(dp->handle,
3398				    pos, pos);
3399				ldcp->ldc_stats.ierrors++;
3400				break;
3401			} else {
3402				D2(vswp, "%s(%d): copied in %ld bytes"
3403				    " using %d cookies", __func__,
3404				    ldcp->ldc_id, nbytes, ncookies);
3405			}
3406
3407			/* adjust the read pointer to skip over the padding */
3408			mp->b_rptr += VNET_IPALIGN;
3409
3410			/* point to the actual end of data */
3411			mp->b_wptr = mp->b_rptr + datalen;
3412
3413			/* update statistics */
3414			ehp = (struct ether_header *)mp->b_rptr;
3415			if (IS_BROADCAST(ehp))
3416				ldcp->ldc_stats.brdcstrcv++;
3417			else if (IS_MULTICAST(ehp))
3418				ldcp->ldc_stats.multircv++;
3419
3420			ldcp->ldc_stats.ipackets++;
3421			ldcp->ldc_stats.rbytes += datalen;
3422
3423			/*
3424			 * IPALIGN space can be used for VLAN_TAG
3425			 */
3426			(void) vsw_vlan_frame_pretag(ldcp->ldc_port,
3427			    VSW_VNETPORT, mp);
3428
3429			/* build a chain of received packets */
3430			if (bp == NULL) {
3431				/* first pkt */
3432				bp = mp;
3433				bp->b_next = bp->b_prev = NULL;
3434				bpt = bp;
3435				chain = 1;
3436			} else {
3437				mp->b_next = mp->b_prev = NULL;
3438				bpt->b_next = mp;
3439				bpt = mp;
3440				chain++;
3441			}
3442
3443			/* mark we are finished with this descriptor */
3444			pub_addr->hdr.dstate = VIO_DESC_DONE;
3445
3446			(void) ldc_mem_dring_release(dp->handle, pos, pos);
3447
3448			/*
3449			 * Send an ACK back to peer if requested.
3450			 */
3451			if (ack_needed) {
3452				ack_needed = B_FALSE;
3453
3454				dring_pkt->start_idx = range_start;
3455				dring_pkt->end_idx = range_end;
3456
3457				DERR(vswp, "%s(%lld): processed %d %d, ACK"
3458				    " requested", __func__, ldcp->ldc_id,
3459				    dring_pkt->start_idx, dring_pkt->end_idx);
3460
3461				dring_pkt->dring_process_state = VIO_DP_ACTIVE;
3462				dring_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
3463				dring_pkt->tag.vio_sid = ldcp->local_session;
3464
3465				msg_rv = vsw_send_msg(ldcp, (void *)dring_pkt,
3466				    sizeof (vio_dring_msg_t), B_FALSE);
3467
3468				/*
3469				 * Check if ACK was successfully sent. If not
3470				 * we break and deal with that below.
3471				 */
3472				if (msg_rv != 0)
3473					break;
3474
3475				prev_desc_ack = B_TRUE;
3476				range_start = pos;
3477			}
3478
3479			/* next descriptor */
3480			pos = (pos + 1) % len;
3481			cnt++;
3482
3483			/*
3484			 * Break out of loop here and stop processing to
3485			 * allow some other network device (or disk) to
3486			 * get access to the cpu.
3487			 */
3488			if (chain > vsw_chain_len) {
3489				D3(vswp, "%s(%lld): switching chain of %d "
3490				    "msgs", __func__, ldcp->ldc_id, chain);
3491				break;
3492			}
3493		}
3494		RW_EXIT(&ldcp->lane_in.dlistrw);
3495
3496		/*
3497		 * If when we attempted to send the ACK we found that the
3498		 * channel had been reset then now handle this. We deal with
3499		 * it here as we cannot reset the channel while holding the
3500		 * dlistrw lock, and we don't want to acquire/release it
3501		 * continuously in the above loop, as a channel reset should
3502		 * be a rare event.
3503		 */
3504		if (msg_rv == ECONNRESET) {
3505			vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
3506			break;
3507		}
3508
3509		/* send the chain of packets to be switched */
3510		if (bp != NULL) {
3511			DTRACE_PROBE1(vsw_rcv_msgs, int, chain);
3512			D3(vswp, "%s(%lld): switching chain of %d msgs",
3513			    __func__, ldcp->ldc_id, chain);
3514			vswp->vsw_switch_frame(vswp, bp, VSW_VNETPORT,
3515			    ldcp->ldc_port, NULL);
3516		}
3517
3518		DTRACE_PROBE1(msg_cnt, int, cnt);
3519
3520		/*
3521		 * We are now finished so ACK back with the state
3522		 * set to STOPPED so our peer knows we are finished.
3523		 */
3524		dring_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
3525		dring_pkt->tag.vio_sid = ldcp->local_session;
3526
3527		dring_pkt->dring_process_state = VIO_DP_STOPPED;
3528
3529		DTRACE_PROBE(stop_process_sent);
3530
3531		/*
3532		 * We have not processed any more descriptors beyond
3533		 * the last one we ACK'd.
3534		 */
3535		if (prev_desc_ack)
3536			range_start = range_end;
3537
3538		dring_pkt->start_idx = range_start;
3539		dring_pkt->end_idx = range_end;
3540
3541		D2(vswp, "%s(%lld) processed : %d : %d, now stopping",
3542		    __func__, ldcp->ldc_id, dring_pkt->start_idx,
3543		    dring_pkt->end_idx);
3544
3545		(void) vsw_send_msg(ldcp, (void *)dring_pkt,
3546		    sizeof (vio_dring_msg_t), B_TRUE);
3547		break;
3548
3549	case VIO_SUBTYPE_ACK:
3550		D2(vswp, "%s(%lld): VIO_SUBTYPE_ACK", __func__, ldcp->ldc_id);
3551		/*
3552		 * Verify that the relevant descriptors are all
3553		 * marked as DONE
3554		 */
3555		READ_ENTER(&ldcp->lane_out.dlistrw);
3556		if ((dp = vsw_ident2dring(&ldcp->lane_out,
3557		    dring_pkt->dring_ident)) == NULL) {
3558			RW_EXIT(&ldcp->lane_out.dlistrw);
3559			DERR(vswp, "%s: unknown ident in ACK", __func__);
3560			return;
3561		}
3562
3563		start = dring_pkt->start_idx;
3564		end = dring_pkt->end_idx;
3565		len = dp->num_descriptors;
3566
3569		mutex_enter(&dp->dlock);
3570		dp->last_ack_recv = end;
3571		ldcp->ldc_stats.dring_data_acks++;
3572		mutex_exit(&dp->dlock);
3573
3574		(void) vsw_reclaim_dring(dp, start);
3575
3576		/*
3577		 * If our peer is stopping processing descriptors then
3578		 * we check to make sure it has processed all the descriptors
3579		 * we have updated. If not then we send it a new message
3580		 * to prompt it to restart.
3581		 */
3582		if (dring_pkt->dring_process_state == VIO_DP_STOPPED) {
3583			DTRACE_PROBE(stop_process_recv);
3584			D2(vswp, "%s(%lld): got stopping msg : %d : %d",
3585			    __func__, ldcp->ldc_id, dring_pkt->start_idx,
3586			    dring_pkt->end_idx);
3587
3588			/*
3589			 * Check next descriptor in public section of ring.
3590			 * If its marked as READY then we need to prompt our
3591			 * peer to start processing the ring again.
3592			 */
3593			i = (end + 1) % len;
3594			pub_addr = (vnet_public_desc_t *)dp->pub_addr + i;
3595			priv_addr = (vsw_private_desc_t *)dp->priv_addr + i;
3596
3597			/*
3598			 * Hold the restart lock across all of this to
3599			 * make sure that it's not possible for us to
3600			 * decide that a msg needs to be sent while
3601			 * the sending code, having already checked, is
3602			 * about to exit.
3603			 */
3604			mutex_enter(&dp->restart_lock);
3605			ldcp->ldc_stats.dring_stopped_acks++;
3606			mutex_enter(&priv_addr->dstate_lock);
3607			if (pub_addr->hdr.dstate == VIO_DESC_READY) {
3608
3609				mutex_exit(&priv_addr->dstate_lock);
3610
3611				dring_pkt->tag.vio_subtype = VIO_SUBTYPE_INFO;
3612				dring_pkt->tag.vio_sid = ldcp->local_session;
3613
3614				dring_pkt->start_idx = (end + 1) % len;
3615				dring_pkt->end_idx = -1;
3616
3617				D2(vswp, "%s(%lld) : sending restart msg:"
3618				    " %d : %d", __func__, ldcp->ldc_id,
3619				    dring_pkt->start_idx, dring_pkt->end_idx);
3620
3621				msg_rv = vsw_send_msg(ldcp, (void *)dring_pkt,
3622				    sizeof (vio_dring_msg_t), B_FALSE);
3623				ldcp->ldc_stats.dring_data_msgs++;
3624
3625			} else {
3626				mutex_exit(&priv_addr->dstate_lock);
3627				dp->restart_reqd = B_TRUE;
3628			}
3629			mutex_exit(&dp->restart_lock);
3630		}
3631		RW_EXIT(&ldcp->lane_out.dlistrw);
3632
3633		/* only do channel reset after dropping dlistrw lock */
3634		if (msg_rv == ECONNRESET)
3635			vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
3636
3637		break;
3638
3639	case VIO_SUBTYPE_NACK:
3640		DWARN(vswp, "%s(%lld): VIO_SUBTYPE_NACK",
3641		    __func__, ldcp->ldc_id);
3642		/*
3643		 * Something is badly wrong if we are getting NACK's
3644		 * for our data pkts. So reset the channel.
3645		 */
3646		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
3647
3648		break;
3649
3650	default:
3651		DERR(vswp, "%s(%lld): Unknown vio_subtype %x\n", __func__,
3652		    ldcp->ldc_id, dring_pkt->tag.vio_subtype);
3653	}
3654
3655	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
3656}
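
/*
 * Receive path for the VIO_SUBTYPE_INFO case above, in brief: acquire
 * each READY descriptor, copy the frame in with ldc_mem_copy(), mark
 * it DONE and release it, chain the mblks, then hand the chain to
 * vswp->vsw_switch_frame(); ranges are ACK'd as the peer requests and
 * a final ACK with VIO_DP_STOPPED tells the peer we have stopped
 * processing.
 */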
3657
3658/*
3659 * dummy pkt data handler function for vnet protocol version 1.0
3660 */
3661static void
3662vsw_process_pkt_data_nop(void *arg1, void *arg2, uint32_t msglen)
3663{
3664	_NOTE(ARGUNUSED(arg1, arg2, msglen))
3665}
3666
3667/*
3668 * This function handles raw pkt data messages received over the channel.
3669 * Currently, only priority-eth-type frames are received through this mechanism.
3670 * In this case, the frame(data) is present within the message itself which
3671 * is copied into an mblk before switching it.
3672 */
3673static void
3674vsw_process_pkt_data(void *arg1, void *arg2, uint32_t msglen)
3675{
3676	vsw_ldc_t		*ldcp = (vsw_ldc_t *)arg1;
3677	vio_raw_data_msg_t	*dpkt = (vio_raw_data_msg_t *)arg2;
3678	uint32_t		size;
3679	mblk_t			*mp;
3680	vsw_t			*vswp = ldcp->ldc_vswp;
3681	vgen_stats_t		*statsp = &ldcp->ldc_stats;
3682	lane_t			*lp = &ldcp->lane_out;
3683
3684	size = msglen - VIO_PKT_DATA_HDRSIZE;
3685	if (size < ETHERMIN || size > lp->mtu) {
3686		(void) atomic_inc_32(&statsp->rx_pri_fail);
3687		DWARN(vswp, "%s(%lld) invalid size(%d)\n", __func__,
3688		    ldcp->ldc_id, size);
3689		return;
3690	}
3691
3692	mp = vio_multipool_allocb(&ldcp->vmp, size + VLAN_TAGSZ);
3693	if (mp == NULL) {
3694		mp = allocb(size + VLAN_TAGSZ, BPRI_MED);
3695		if (mp == NULL) {
3696			(void) atomic_inc_32(&statsp->rx_pri_fail);
3697			DWARN(vswp, "%s(%lld) allocb failure, "
3698			    "unable to process priority frame\n", __func__,
3699			    ldcp->ldc_id);
3700			return;
3701		}
3702	}
3703
3704	/* skip over the extra space for vlan tag */
3705	mp->b_rptr += VLAN_TAGSZ;
3706
3707	/* copy the frame from the payload of raw data msg into the mblk */
3708	bcopy(dpkt->data, mp->b_rptr, size);
3709	mp->b_wptr = mp->b_rptr + size;
3710
3711	/* update stats */
3712	(void) atomic_inc_64(&statsp->rx_pri_packets);
3713	(void) atomic_add_64(&statsp->rx_pri_bytes, size);
3714
3715	/*
3716	 * VLAN_TAGSZ of extra space has been pre-alloc'd if tag is needed.
3717	 */
3718	(void) vsw_vlan_frame_pretag(ldcp->ldc_port, VSW_VNETPORT, mp);
3719
3720	/* switch the frame to destination */
3721	vswp->vsw_switch_frame(vswp, mp, VSW_VNETPORT, ldcp->ldc_port, NULL);
3722}
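
/*
 * For reference, the receive arithmetic above in terms of the raw data
 * message layout (a sketch; the actual field offsets come from the
 * vio_raw_data_msg_t definition in the VIO headers):
 *
 *	+------------------------+----------------------------------+
 *	| msg header             | data[] (the ethernet frame)      |
 *	+------------------------+----------------------------------+
 *	|<-VIO_PKT_DATA_HDRSIZE->|<--- size = msglen - HDRSIZE ---->|
 *
 * A message is accepted only if ETHERMIN <= size <= lane_out.mtu; the
 * frame is then copied out, optionally pre-tagged, and switched.
 */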
3723
3724/*
3725 * Process an in-band descriptor message (most likely from
3726 * OBP).
3727 */
3728static void
3729vsw_process_data_ibnd_pkt(vsw_ldc_t *ldcp, void *pkt)
3730{
3731	vnet_ibnd_desc_t	*ibnd_desc;
3732	dring_info_t		*dp = NULL;
3733	vsw_private_desc_t	*priv_addr = NULL;
3734	vsw_t			*vswp = ldcp->ldc_vswp;
3735	mblk_t			*mp = NULL;
3736	size_t			nbytes = 0;
3737	size_t			off = 0;
3738	uint64_t		idx = 0;
3739	uint32_t		num = 1, len, datalen = 0;
3740	uint64_t		ncookies = 0;
3741	int			i, rv;
3742	int			j = 0;
3743
3744	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
3745
3746	ibnd_desc = (vnet_ibnd_desc_t *)pkt;
3747
3748	switch (ibnd_desc->hdr.tag.vio_subtype) {
3749	case VIO_SUBTYPE_INFO:
3750		D1(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
3751
3752		if (vsw_check_flag(ldcp, INBOUND, VSW_DRING_INFO_RECV))
3753			return;
3754
3755		/*
3756		 * Data is padded to align on an 8-byte boundary;
3757		 * nbytes is the actual data length, i.e. excluding
3758		 * that padding.
3759		 */
3760		datalen = ibnd_desc->nbytes;
3761
3762		D2(vswp, "%s(%lld): processing inband desc : "
3763		    "datalen 0x%x", __func__, ldcp->ldc_id, datalen);
3764
3765		ncookies = ibnd_desc->ncookies;
3766
3767		/*
3768		 * allocb(9F) returns an aligned data block. We
3769		 * need to ensure that we ask ldc for an aligned
3770		 * number of bytes also.
3771		 */
3772		nbytes = datalen;
3773		if (nbytes & 0x7) {
3774			off = 8 - (nbytes & 0x7);
3775			nbytes += off;
3776		}
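
		/*
		 * e.g. datalen = 61 gives off = 8 - (61 & 0x7) = 3 and
		 * nbytes = 64, so ldc_mem_copy() below is asked for a
		 * whole number of 8-byte words, while b_wptr is later
		 * set from datalen, the true frame length.
		 */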
3777
3778		/* alloc extra space for VLAN_TAG */
3779		mp = allocb(datalen + 8, BPRI_MED);
3780		if (mp == NULL) {
3781			DERR(vswp, "%s(%lld): allocb failed",
3782			    __func__, ldcp->ldc_id);
3783			ldcp->ldc_stats.rx_allocb_fail++;
3784			return;
3785		}
3786
3787		/* skip over the extra space for VLAN_TAG */
3788		mp->b_rptr += 8;
3789
3790		rv = ldc_mem_copy(ldcp->ldc_handle, (caddr_t)mp->b_rptr,
3791		    0, &nbytes, ibnd_desc->memcookie, (uint64_t)ncookies,
3792		    LDC_COPY_IN);
3793
3794		if (rv != 0) {
3795			DERR(vswp, "%s(%lld): unable to copy in data from "
3796			    "%ld cookie(s)", __func__, ldcp->ldc_id, ncookies);
3797			freemsg(mp);
3798			ldcp->ldc_stats.ierrors++;
3799			return;
3800		}
3801
3802		D2(vswp, "%s(%lld): copied in %ld bytes using %ld cookies",
3803		    __func__, ldcp->ldc_id, nbytes, ncookies);
3804
3805		/* point to the actual end of data */
3806		mp->b_wptr = mp->b_rptr + datalen;
3807		ldcp->ldc_stats.ipackets++;
3808		ldcp->ldc_stats.rbytes += datalen;
3809
3810		/*
3811		 * We ACK back every in-band descriptor message we process
3812		 */
3813		ibnd_desc->hdr.tag.vio_subtype = VIO_SUBTYPE_ACK;
3814		ibnd_desc->hdr.tag.vio_sid = ldcp->local_session;
3815		(void) vsw_send_msg(ldcp, (void *)ibnd_desc,
3816		    sizeof (vnet_ibnd_desc_t), B_TRUE);
3817
3818		/*
3819		 * there is extra space alloc'd for VLAN_TAG
3820		 */
3821		(void) vsw_vlan_frame_pretag(ldcp->ldc_port, VSW_VNETPORT, mp);
3822
3823		/* send the packet to be switched */
3824		vswp->vsw_switch_frame(vswp, mp, VSW_VNETPORT,
3825		    ldcp->ldc_port, NULL);
3826
3827		break;
3828
3829	case VIO_SUBTYPE_ACK:
3830		D1(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
3831
3832		/* Verify the ACK is valid */
3833		idx = ibnd_desc->hdr.desc_handle;
3834
3835		if (idx >= vsw_ntxds) {
3836			cmn_err(CE_WARN, "!vsw%d: corrupted ACK received "
3837			    "(idx %ld)", vswp->instance, idx);
3838			return;
3839		}
3840
3841		if ((dp = ldcp->lane_out.dringp) == NULL) {
3842			DERR(vswp, "%s: no dring found", __func__);
3843			return;
3844		}
3845
3846		len = dp->num_descriptors;
3847		/*
3848		 * If the descriptor we are being ACK'ed for is not the
3849		 * one we expected, then pkts were lost somewhere: either a
3850		 * msg we tried to send, or a previous ACK msg from our peer,
3851		 * went missing. In either case we now reclaim the descriptors
3852		 * in the range from the last ACK we received up to the
3853		 * current ACK.
3854		 */
3855		if (idx != dp->last_ack_recv) {
3856			DWARN(vswp, "%s: dropped pkts detected, (%ld, %ld)",
3857			    __func__, dp->last_ack_recv, idx);
3858			num = idx >= dp->last_ack_recv ?
3859			    idx - dp->last_ack_recv + 1:
3860			    (len - dp->last_ack_recv + 1) + idx;
3861		}
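
		/*
		 * e.g. with len = 8, last_ack_recv = 6 and idx = 1, the
		 * wrap-around case above gives num = (8 - 6 + 1) + 1 = 4,
		 * so descriptors 6, 7, 0 and 1 are reclaimed below.
		 */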
3862
3863		/*
3864		 * When we sent the in-band message to our peer we
3865		 * marked the copy in our private ring as READY. We now
3866		 * check that the descriptor we are being ACK'ed for is in
3867		 * fact READY, i.e. it is one we have shared with our peer.
3868		 *
3869		 * If it's not, we flag an error, but still reset the
3870		 * descriptor back to FREE.
3871		 */
3872		for (i = dp->last_ack_recv; j < num; i = (i + 1) % len, j++) {
3873			priv_addr = (vsw_private_desc_t *)dp->priv_addr + i;
3874			mutex_enter(&priv_addr->dstate_lock);
3875			if (priv_addr->dstate != VIO_DESC_READY) {
3876				DERR(vswp, "%s: (%ld) desc at index %d not "
3877				    "READY (0x%lx)", __func__,
3878				    ldcp->ldc_id, i, priv_addr->dstate);
3879				DERR(vswp, "%s: bound %d: ncookies %ld : "
3880				    "datalen %ld", __func__,
3881				    priv_addr->bound, priv_addr->ncookies,
3882				    priv_addr->datalen);
3883			}
3884			D2(vswp, "%s: (%lld) freeing descp at %d", __func__,
3885			    ldcp->ldc_id, i);
3886			/* release resources associated with sent msg */
3887			priv_addr->datalen = 0;
3888			priv_addr->dstate = VIO_DESC_FREE;
3889			mutex_exit(&priv_addr->dstate_lock);
3890		}
3891		/* update to next expected value */
3892		dp->last_ack_recv = (idx + 1) % dp->num_descriptors;
3893
3894		break;
3895
3896	case VIO_SUBTYPE_NACK:
3897		DERR(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
3898
3899		/*
3900		 * We should only get a NACK if our peer doesn't like
3901		 * something about a message we have sent it. If this
3902		 * happens we just release the resources associated with
3903		 * the message. (We are relying on higher layers to decide
3904		 * whether or not to resend.)
3905		 */
3906
3907		/* limit check */
3908		idx = ibnd_desc->hdr.desc_handle;
3909
3910		if (idx >= vsw_ntxds) {
3911			DERR(vswp, "%s: corrupted NACK received (idx %lld)",
3912			    __func__, idx);
3913			return;
3914		}
3915
3916		if ((dp = ldcp->lane_out.dringp) == NULL) {
3917			DERR(vswp, "%s: no dring found", __func__);
3918			return;
3919		}
3920
3921		priv_addr = (vsw_private_desc_t *)dp->priv_addr;
3922
3923		/* move to correct location in ring */
3924		priv_addr += idx;
3925
3926		/* release resources associated with sent msg */
3927		mutex_enter(&priv_addr->dstate_lock);
3928		priv_addr->datalen = 0;
3929		priv_addr->dstate = VIO_DESC_FREE;
3930		mutex_exit(&priv_addr->dstate_lock);
3931
3932		break;
3933
3934	default:
3935		DERR(vswp, "%s(%lld): Unknown vio_subtype %x\n", __func__,
3936		    ldcp->ldc_id, ibnd_desc->hdr.tag.vio_subtype);
3937	}
3938
3939	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
3940}
3941
3942static void
3943vsw_process_err_pkt(vsw_ldc_t *ldcp, void *epkt, vio_msg_tag_t *tagp)
3944{
3945	_NOTE(ARGUNUSED(epkt))
3946
3947	vsw_t		*vswp = ldcp->ldc_vswp;
3948	uint16_t	env = tagp->vio_subtype_env;
3949
3950	D1(vswp, "%s (%lld): enter\n", __func__, ldcp->ldc_id);
3951
3952	/*
3953	 * Error vio_subtypes have yet to be defined. So for
3954	 * the moment we can't do anything.
3955	 */
3956	D2(vswp, "%s: (%x) vio_subtype env", __func__, env);
3957
3958	D1(vswp, "%s (%lld): exit\n", __func__, ldcp->ldc_id);
3959}
3960
3961/* transmit the packet over the given port */
3962int
3963vsw_portsend(vsw_port_t *port, mblk_t *mp, mblk_t *mpt, uint32_t count)
3964{
3965	vsw_ldc_list_t 	*ldcl = &port->p_ldclist;
3966	vsw_ldc_t 	*ldcp;
3967	int		status = 0;
3968	uint32_t	n;
3969
3970	READ_ENTER(&ldcl->lockrw);
3971	/*
3972	 * Note: for now, we have a single channel.
3973	 */
3974	ldcp = ldcl->head;
3975	if (ldcp == NULL) {
3976		DERR(port->p_vswp, "vsw_portsend: no ldc: dropping packet\n");
3977		freemsgchain(mp);
3978		RW_EXIT(&ldcl->lockrw);
3979		return (1);
3980	}
3981
3982	n = vsw_vlan_frame_untag(port, VSW_VNETPORT, &mp, &mpt);
3983
3984	count -= n;
3985	if (count == 0) {
3986		goto vsw_portsend_exit;
3987	}
3988
3989	status = ldcp->tx(ldcp, mp, mpt, count);
3990
3991vsw_portsend_exit:
3992	RW_EXIT(&ldcl->lockrw);
3993
3994	return (status);
3995}
3996
3997/*
3998 * Break up frames into two separate chains: normal and
3999 * priority, based on the frame type. The number of
4000 * priority frames is also counted and returned.
4001 *
4002 * Params:
4003 * 	vswp:	pointer to the instance of vsw
4004 *	np:	head of packet chain to be broken
4005 *	npt:	tail of packet chain to be broken
4006 *
4007 * Returns:
4008 *	np:	head of normal data packets
4009 *	npt:	tail of normal data packets
4010 *	hp:	head of high priority packets
4011 *	hpt:	tail of high priority packets
4012 */
4013static uint32_t
4014vsw_get_pri_packets(vsw_t *vswp, mblk_t **np, mblk_t **npt,
4015	mblk_t **hp, mblk_t **hpt)
4016{
4017	mblk_t			*tmp = NULL;
4018	mblk_t			*smp = NULL;
4019	mblk_t			*hmp = NULL;	/* high prio pkts head */
4020	mblk_t			*hmpt = NULL;	/* high prio pkts tail */
4021	mblk_t			*nmp = NULL;	/* normal pkts head */
4022	mblk_t			*nmpt = NULL;	/* normal pkts tail */
4023	uint32_t		count = 0;
4024	int			i;
4025	struct ether_header	*ehp;
4026	uint32_t		num_types;
4027	uint16_t		*types;
4028
4029	tmp = *np;
4030	while (tmp != NULL) {
4031
4032		smp = tmp;
4033		tmp = tmp->b_next;
4034		smp->b_next = NULL;
4035		smp->b_prev = NULL;
4036
4037		ehp = (struct ether_header *)smp->b_rptr;
4038		num_types = vswp->pri_num_types;
4039		types = vswp->pri_types;
4040		for (i = 0; i < num_types; i++) {
4041			if (ehp->ether_type == types[i]) {
4042				/* high priority frame */
4043
4044				if (hmp != NULL) {
4045					hmpt->b_next = smp;
4046					hmpt = smp;
4047				} else {
4048					hmp = hmpt = smp;
4049				}
4050				count++;
4051				break;
4052			}
4053		}
4054		if (i == num_types) {
4055			/* normal data frame */
4056
4057			if (nmp != NULL) {
4058				nmpt->b_next = smp;
4059				nmpt = smp;
4060			} else {
4061				nmp = nmpt = smp;
4062			}
4063		}
4064	}
4065
4066	*hp = hmp;
4067	*hpt = hmpt;
4068	*np = nmp;
4069	*npt = nmpt;
4070
4071	return (count);
4072}
4073
4074/*
4075 * Wrapper function to transmit normal and/or priority frames over the channel.
4076 */
4077static int
4078vsw_ldctx_pri(void *arg, mblk_t *mp, mblk_t *mpt, uint32_t count)
4079{
4080	vsw_ldc_t 		*ldcp = (vsw_ldc_t *)arg;
4081	mblk_t			*tmp;
4082	mblk_t			*smp;
4083	mblk_t			*hmp;	/* high prio pkts head */
4084	mblk_t			*hmpt;	/* high prio pkts tail */
4085	mblk_t			*nmp;	/* normal pkts head */
4086	mblk_t			*nmpt;	/* normal pkts tail */
4087	uint32_t		n = 0;
4088	vsw_t			*vswp = ldcp->ldc_vswp;
4089
4090	ASSERT(VSW_PRI_ETH_DEFINED(vswp));
4091	ASSERT(count != 0);
4092
4093	nmp = mp;
4094	nmpt = mpt;
4095
4096	/* gather any priority frames from the chain of packets */
4097	n = vsw_get_pri_packets(vswp, &nmp, &nmpt, &hmp, &hmpt);
4098
4099	/* transmit priority frames */
4100	tmp = hmp;
4101	while (tmp != NULL) {
4102		smp = tmp;
4103		tmp = tmp->b_next;
4104		smp->b_next = NULL;
4105		vsw_ldcsend_pkt(ldcp, smp);
4106	}
4107
4108	count -= n;
4109
4110	if (count == 0) {
4111		/* no normal data frames to process */
4112		return (0);
4113	}
4114
4115	return (vsw_ldctx(ldcp, nmp, nmpt, count));
4116}
4117
4118/*
4119 * Wrapper function to transmit normal frames over the channel.
4120 */
4121static int
4122vsw_ldctx(void *arg, mblk_t *mp, mblk_t *mpt, uint32_t count)
4123{
4124	vsw_ldc_t 	*ldcp = (vsw_ldc_t *)arg;
4125	mblk_t		*tmp = NULL;
4126
4127	ASSERT(count != 0);
4128	/*
4129	 * If the TX thread is enabled, then queue the
4130	 * ordinary frames and signal the tx thread.
4131	 */
4132	if (ldcp->tx_thread != NULL) {
4133
4134		mutex_enter(&ldcp->tx_thr_lock);
4135
4136		if ((ldcp->tx_cnt + count) >= vsw_max_tx_qcount) {
4137			/*
4138			 * If we have reached the queue limit,
4139			 * do not queue the new packets;
4140			 * drop them instead.
4141			 */
4142			ldcp->ldc_stats.tx_qfull += count;
4143			mutex_exit(&ldcp->tx_thr_lock);
4144			freemsgchain(mp);
4145			goto exit;
4146		}
4147		if (ldcp->tx_mhead == NULL) {
4148			ldcp->tx_mhead = mp;
4149			ldcp->tx_mtail = mpt;
4150			cv_signal(&ldcp->tx_thr_cv);
4151		} else {
4152			ldcp->tx_mtail->b_next = mp;
4153			ldcp->tx_mtail = mpt;
4154		}
4155		ldcp->tx_cnt += count;
4156		mutex_exit(&ldcp->tx_thr_lock);
4157	} else {
4158		while (mp != NULL) {
4159			tmp = mp->b_next;
4160			mp->b_next = mp->b_prev = NULL;
4161			(void) vsw_ldcsend(ldcp, mp, 1);
4162			mp = tmp;
4163		}
4164	}
4165
4166exit:
4167	return (0);
4168}
4169
4170/*
4171 * This function transmits the frame in the payload of a raw data
4172 * (VIO_PKT_DATA) message. Thus, it provides an Out-Of-Band path to
4173 * send special frames with high priorities, without going through
4174 * the normal data path, which uses the descriptor ring mechanism.
4175 */
4176static void
4177vsw_ldcsend_pkt(vsw_ldc_t *ldcp, mblk_t *mp)
4178{
4179	vio_raw_data_msg_t	*pkt;
4180	mblk_t			*bp;
4181	mblk_t			*nmp = NULL;
4182	caddr_t			dst;
4183	uint32_t		mblksz;
4184	uint32_t		size;
4185	uint32_t		nbytes;
4186	int			rv;
4187	vsw_t			*vswp = ldcp->ldc_vswp;
4188	vgen_stats_t		*statsp = &ldcp->ldc_stats;
4189
4190	if ((!(ldcp->lane_out.lstate & VSW_LANE_ACTIVE)) ||
4191	    (ldcp->ldc_status != LDC_UP) || (ldcp->ldc_handle == NULL)) {
4192		(void) atomic_inc_32(&statsp->tx_pri_fail);
4193		DWARN(vswp, "%s(%lld) status(%d) lstate(0x%llx), dropping "
4194		    "packet\n", __func__, ldcp->ldc_id, ldcp->ldc_status,
4195		    ldcp->lane_out.lstate);
4196		goto send_pkt_exit;
4197	}
4198
4199	size = msgsize(mp);
4200
4201	/* is the frame bigger than the payload a raw data msg can carry? */
4202	if (size > (size_t)(ldcp->msglen - VIO_PKT_DATA_HDRSIZE)) {
4203		(void) atomic_inc_32(&statsp->tx_pri_fail);
4204		DWARN(vswp, "%s(%lld) invalid size(%d)\n", __func__,
4205		    ldcp->ldc_id, size);
4206		goto send_pkt_exit;
4207	}
4208
4209	if (size < ETHERMIN)
4210		size = ETHERMIN;
4211
4212	/* alloc space for a raw data message */
4213	nmp = vio_allocb(vswp->pri_tx_vmp);
4214	if (nmp == NULL) {
4215		(void) atomic_inc_32(&statsp->tx_pri_fail);
4216		DWARN(vswp, "vio_allocb failed\n");
4217		goto send_pkt_exit;
4218	}
4219	pkt = (vio_raw_data_msg_t *)nmp->b_rptr;
4220
4221	/* copy frame into the payload of raw data message */
4222	dst = (caddr_t)pkt->data;
4223	for (bp = mp; bp != NULL; bp = bp->b_cont) {
4224		mblksz = MBLKL(bp);
4225		bcopy(bp->b_rptr, dst, mblksz);
4226		dst += mblksz;
4227	}
4228
4229	/* setup the raw data msg */
4230	pkt->tag.vio_msgtype = VIO_TYPE_DATA;
4231	pkt->tag.vio_subtype = VIO_SUBTYPE_INFO;
4232	pkt->tag.vio_subtype_env = VIO_PKT_DATA;
4233	pkt->tag.vio_sid = ldcp->local_session;
4234	nbytes = VIO_PKT_DATA_HDRSIZE + size;
4235
4236	/* send the msg over ldc */
4237	rv = vsw_send_msg(ldcp, (void *)pkt, nbytes, B_TRUE);
4238	if (rv != 0) {
4239		(void) atomic_inc_32(&statsp->tx_pri_fail);
4240		DWARN(vswp, "%s(%lld) Error sending priority frame\n", __func__,
4241		    ldcp->ldc_id);
4242		goto send_pkt_exit;
4243	}
4244
4245	/* update stats */
4246	(void) atomic_inc_64(&statsp->tx_pri_packets);
4247	(void) atomic_add_64(&statsp->tx_pri_bytes, size);
4248
4249send_pkt_exit:
4250	if (nmp != NULL)
4251		freemsg(nmp);
4252	freemsg(mp);
4253}
4254
4255/*
4256 * Transmit the packet over the given LDC channel.
4257 *
4258 * The 'retries' argument indicates how many times a packet
4259 * is retried before it is dropped. Note: the retry is done
4260 * only for a resource-related failure; for all other failures
4261 * the packet is dropped immediately.
4262 */
4263static int
4264vsw_ldcsend(vsw_ldc_t *ldcp, mblk_t *mp, uint32_t retries)
4265{
4266	int i;
4267	int rc;
4268	int status = 0;
4269	vsw_port_t *port = ldcp->ldc_port;
4270	dring_info_t *dp = NULL;
4271
4272
4273	for (i = 0; i < retries; ) {
4274		/*
4275		 * Send the message out using the appropriate
4276		 * transmit function which will free mblock when it
4277		 * is finished with it.
4278		 */
4279		mutex_enter(&port->tx_lock);
4280		if (port->transmit != NULL) {
4281			status = (*port->transmit)(ldcp, mp);
4282		}
4283		if (status == LDC_TX_SUCCESS) {
4284			mutex_exit(&port->tx_lock);
4285			break;
4286		}
4287		i++;	/* count this attempt */
4288
4289		/* If it's the last retry, then bump the oerrors count */
4290		if ((i == retries) && (status == LDC_TX_NORESOURCES)) {
4291			ldcp->ldc_stats.oerrors++;
4292		}
4293		mutex_exit(&port->tx_lock);
4294
4295		if (status != LDC_TX_NORESOURCES) {
4296			/*
4297			 * No retrying required for errors unrelated
4298			 * to resources.
4299			 */
4300			break;
4301		}
4302		READ_ENTER(&ldcp->lane_out.dlistrw);
4303		if (((dp = ldcp->lane_out.dringp) != NULL) &&
4304		    ((VSW_VER_GTEQ(ldcp, 1, 2) &&
4305		    (ldcp->lane_out.xfer_mode & VIO_DRING_MODE_V1_2)) ||
4306		    ((VSW_VER_LT(ldcp, 1, 2) &&
4307		    (ldcp->lane_out.xfer_mode == VIO_DRING_MODE_V1_0))))) {
4308			rc = vsw_reclaim_dring(dp, dp->end_idx);
4309		} else {
4310			/*
4311			 * If there is no dring or the xfer_mode is
4312			 * set to DESC_MODE (i.e., OBP), then simply break here.
4313			 */
4314			RW_EXIT(&ldcp->lane_out.dlistrw);
4315			break;
4316		}
4317		RW_EXIT(&ldcp->lane_out.dlistrw);
4318
4319		/*
4320		 * Delay only if none were reclaimed
4321		 * and it's not the last retry.
4322		 */
4323		if ((rc == 0) && (i < retries)) {
4324			delay(drv_usectohz(vsw_ldc_tx_delay));
4325		}
4326	}
4327	freemsg(mp);
4328	return (status);
4329}
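
/*
 * Typical usage: the tx worker thread later in this file retries up to
 * the vsw_ldc_tx_retries tunable, sleeping vsw_ldc_tx_delay between
 * attempts while descriptors may still be reclaimed:
 *
 *	(void) vsw_ldcsend(ldcp, mp, vsw_ldc_tx_retries);
 *
 * whereas the direct path in vsw_ldctx() passes a retry count of 1, so
 * an LDC_TX_NORESOURCES failure there drops the packet immediately.
 */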
4330
4331/*
4332 * Send packet out via descriptor ring to a logical device.
4333 */
4334static int
4335vsw_dringsend(vsw_ldc_t *ldcp, mblk_t *mp)
4336{
4337	vio_dring_msg_t		dring_pkt;
4338	dring_info_t		*dp = NULL;
4339	vsw_private_desc_t	*priv_desc = NULL;
4340	vnet_public_desc_t	*pub = NULL;
4341	vsw_t			*vswp = ldcp->ldc_vswp;
4342	mblk_t			*bp;
4343	size_t			n, size;
4344	caddr_t			bufp;
4345	int			idx;
4346	int			status = LDC_TX_SUCCESS;
4347	struct ether_header	*ehp = (struct ether_header *)mp->b_rptr;
4348	lane_t			*lp = &ldcp->lane_out;
4349
4350	D1(vswp, "%s(%lld): enter\n", __func__, ldcp->ldc_id);
4351
4352	/* TODO: make test a macro */
4353	if ((!(ldcp->lane_out.lstate & VSW_LANE_ACTIVE)) ||
4354	    (ldcp->ldc_status != LDC_UP) || (ldcp->ldc_handle == NULL)) {
4355		DWARN(vswp, "%s(%lld) status(%d) lstate(0x%llx), dropping "
4356		    "packet\n", __func__, ldcp->ldc_id, ldcp->ldc_status,
4357		    ldcp->lane_out.lstate);
4358		ldcp->ldc_stats.oerrors++;
4359		return (LDC_TX_FAILURE);
4360	}
4361
4362	/*
4363	 * Note - using first ring only; this may change
4364	 * in the future.
4365	 */
4366	READ_ENTER(&ldcp->lane_out.dlistrw);
4367	if ((dp = ldcp->lane_out.dringp) == NULL) {
4368		RW_EXIT(&ldcp->lane_out.dlistrw);
4369		DERR(vswp, "%s(%lld): no dring for outbound lane on"
4370		    " channel %d", __func__, ldcp->ldc_id, ldcp->ldc_id);
4371		ldcp->ldc_stats.oerrors++;
4372		return (LDC_TX_FAILURE);
4373	}
4374
4375	size = msgsize(mp);
4376	if (size > (size_t)lp->mtu) {
4377		RW_EXIT(&ldcp->lane_out.dlistrw);
4378		DERR(vswp, "%s(%lld) invalid size (%ld)\n", __func__,
4379		    ldcp->ldc_id, size);
4380		ldcp->ldc_stats.oerrors++;
4381		return (LDC_TX_FAILURE);
4382	}
4383
4384	/*
4385	 * Find a free descriptor
4386	 *
4387	 * Note: for the moment we are assuming that we will only
4388	 * have one dring going from the switch to each of its
4389	 * peers. This may change in the future.
4390	 */
4391	if (vsw_dring_find_free_desc(dp, &priv_desc, &idx) != 0) {
4392		D2(vswp, "%s(%lld): no descriptor available for ring "
4393		    "at 0x%llx", __func__, ldcp->ldc_id, dp);
4394
4395		/* nothing more we can do */
4396		status = LDC_TX_NORESOURCES;
4397		ldcp->ldc_stats.tx_no_desc++;
4398		goto vsw_dringsend_free_exit;
4399	} else {
4400		D2(vswp, "%s(%lld): free private descriptor found at pos %ld "
4401		    "addr 0x%llx\n", __func__, ldcp->ldc_id, idx, priv_desc);
4402	}
4403
4404	/* copy data into the descriptor */
4405	bufp = priv_desc->datap;
4406	bufp += VNET_IPALIGN;
4407	for (bp = mp, n = 0; bp != NULL; bp = bp->b_cont) {
4408		n = MBLKL(bp);
4409		bcopy(bp->b_rptr, bufp, n);
4410		bufp += n;
4411	}
4412
4413	priv_desc->datalen = (size < (size_t)ETHERMIN) ? ETHERMIN : size;
4414
4415	pub = priv_desc->descp;
4416	pub->nbytes = priv_desc->datalen;
4417
4418	/* update statistics */
4419	if (IS_BROADCAST(ehp))
4420		ldcp->ldc_stats.brdcstxmt++;
4421	else if (IS_MULTICAST(ehp))
4422		ldcp->ldc_stats.multixmt++;
4423	ldcp->ldc_stats.opackets++;
4424	ldcp->ldc_stats.obytes += priv_desc->datalen;
4425
4426	mutex_enter(&priv_desc->dstate_lock);
4427	pub->hdr.dstate = VIO_DESC_READY;
4428	mutex_exit(&priv_desc->dstate_lock);
4429
4430	/*
4431	 * Determine whether or not we need to send a message to our
4432	 * peer prompting them to read our newly updated descriptor(s).
4433	 */
4434	mutex_enter(&dp->restart_lock);
4435	if (dp->restart_reqd) {
4436		dp->restart_reqd = B_FALSE;
4437		ldcp->ldc_stats.dring_data_msgs++;
4438		mutex_exit(&dp->restart_lock);
4439
4440		/*
4441		 * Send a vio_dring_msg to peer to prompt them to read
4442		 * the updated descriptor ring.
4443		 */
4444		dring_pkt.tag.vio_msgtype = VIO_TYPE_DATA;
4445		dring_pkt.tag.vio_subtype = VIO_SUBTYPE_INFO;
4446		dring_pkt.tag.vio_subtype_env = VIO_DRING_DATA;
4447		dring_pkt.tag.vio_sid = ldcp->local_session;
4448
4449		/* Note - for now using first ring */
4450		dring_pkt.dring_ident = dp->ident;
4451
4452		/*
4453		 * If last_ack_recv is -1 then we know we've not
4454		 * received any ACKs yet; this must be the first
4455		 * msg sent, so set the start to the beginning of the ring.
4456		 */
4457		mutex_enter(&dp->dlock);
4458		if (dp->last_ack_recv == -1) {
4459			dring_pkt.start_idx = 0;
4460		} else {
4461			dring_pkt.start_idx =
4462			    (dp->last_ack_recv + 1) % dp->num_descriptors;
4463		}
4464		dring_pkt.end_idx = -1;
4465		mutex_exit(&dp->dlock);
4466
4467		D3(vswp, "%s(%lld): dring 0x%llx : ident 0x%llx\n", __func__,
4468		    ldcp->ldc_id, dp, dring_pkt.dring_ident);
4469		D3(vswp, "%s(%lld): start %lld : end %lld :\n",
4470		    __func__, ldcp->ldc_id, dring_pkt.start_idx,
4471		    dring_pkt.end_idx);
4472
4473		RW_EXIT(&ldcp->lane_out.dlistrw);
4474
4475		(void) vsw_send_msg(ldcp, (void *)&dring_pkt,
4476		    sizeof (vio_dring_msg_t), B_TRUE);
4477
4478		return (status);
4479
4480	} else {
4481		mutex_exit(&dp->restart_lock);
4482		D2(vswp, "%s(%lld): updating descp %d", __func__,
4483		    ldcp->ldc_id, idx);
4484	}
4485
4486vsw_dringsend_free_exit:
4487
4488	RW_EXIT(&ldcp->lane_out.dlistrw);
4489
4490	D1(vswp, "%s(%lld): exit\n", __func__, ldcp->ldc_id);
4491	return (status);
4492}
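
/*
 * A sketch of the restart_reqd protocol used above: restart_reqd is
 * armed when the ring is created and re-armed once the transmit side
 * finds no more work outstanding (see the VIO_SUBTYPE_ACK handling at
 * the top of this section, where either a restart msg is sent or
 * restart_reqd is set again). Only the first packet queued while the
 * flag is armed generates a VIO_DRING_DATA message; frames placed in
 * READY descriptors in the meantime are picked up by the peer's
 * ongoing scan, keeping it to roughly one data message per burst
 * rather than one per packet.
 */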
4493
4494/*
4495 * Send an in-band descriptor message over ldc.
4496 */
4497static int
4498vsw_descrsend(vsw_ldc_t *ldcp, mblk_t *mp)
4499{
4500	vsw_t			*vswp = ldcp->ldc_vswp;
4501	vnet_ibnd_desc_t	ibnd_msg;
4502	vsw_private_desc_t	*priv_desc = NULL;
4503	dring_info_t		*dp = NULL;
4504	size_t			n, size = 0;
4505	caddr_t			bufp;
4506	mblk_t			*bp;
4507	int			idx, i;
4508	int			status = LDC_TX_SUCCESS;
4509	static int		warn_msg = 1;
4510	lane_t			*lp = &ldcp->lane_out;
4511
4512	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
4513
4514	ASSERT(mp != NULL);
4515
4516	if ((!(ldcp->lane_out.lstate & VSW_LANE_ACTIVE)) ||
4517	    (ldcp->ldc_status != LDC_UP) || (ldcp->ldc_handle == NULL)) {
4518		DERR(vswp, "%s(%lld) status(%d) state (0x%llx), dropping pkt",
4519		    __func__, ldcp->ldc_id, ldcp->ldc_status,
4520		    ldcp->lane_out.lstate);
4521		ldcp->ldc_stats.oerrors++;
4522		return (LDC_TX_FAILURE);
4523	}
4524
4525	/*
4526	 * We only expect a single dring to exist, which we use
4527	 * as an internal buffer, rather than a transfer channel.
4528	 */
4529	READ_ENTER(&ldcp->lane_out.dlistrw);
4530	if ((dp = ldcp->lane_out.dringp) == NULL) {
4531		DERR(vswp, "%s(%lld): no dring for outbound lane",
4532		    __func__, ldcp->ldc_id);
4533		DERR(vswp, "%s(%lld) status(%d) state (0x%llx)", __func__,
4534		    ldcp->ldc_id, ldcp->ldc_status, ldcp->lane_out.lstate);
4535		RW_EXIT(&ldcp->lane_out.dlistrw);
4536		ldcp->ldc_stats.oerrors++;
4537		return (LDC_TX_FAILURE);
4538	}
4539
4540	size = msgsize(mp);
4541	if (size > (size_t)lp->mtu) {
4542		RW_EXIT(&ldcp->lane_out.dlistrw);
4543		DERR(vswp, "%s(%lld) invalid size (%ld)\n", __func__,
4544		    ldcp->ldc_id, size);
4545		ldcp->ldc_stats.oerrors++;
4546		return (LDC_TX_FAILURE);
4547	}
4548
4549	/*
4550	 * Find a free descriptor in our buffer ring
4551	 */
4552	if (vsw_dring_find_free_desc(dp, &priv_desc, &idx) != 0) {
4553		RW_EXIT(&ldcp->lane_out.dlistrw);
4554		if (warn_msg) {
4555			DERR(vswp, "%s(%lld): no descriptor available for ring "
4556			    "at 0x%llx", __func__, ldcp->ldc_id, dp);
4557			warn_msg = 0;
4558		}
4559
4560		/* nothing more we can do */
4561		status = LDC_TX_NORESOURCES;
4562		goto vsw_descrsend_free_exit;
4563	} else {
4564		D2(vswp, "%s(%lld): free private descriptor found at pos "
4565		    "%ld addr 0x%x\n", __func__, ldcp->ldc_id, idx, priv_desc);
4566		warn_msg = 1;
4567	}
4568
4569	/* copy data into the descriptor */
4570	bufp = priv_desc->datap;
4571	for (bp = mp, n = 0; bp != NULL; bp = bp->b_cont) {
4572		n = MBLKL(bp);
4573		bcopy(bp->b_rptr, bufp, n);
4574		bufp += n;
4575	}
4576
4577	priv_desc->datalen = (size < (size_t)ETHERMIN) ? ETHERMIN : size;
4578
4579	/* create and send the in-band descp msg */
4580	ibnd_msg.hdr.tag.vio_msgtype = VIO_TYPE_DATA;
4581	ibnd_msg.hdr.tag.vio_subtype = VIO_SUBTYPE_INFO;
4582	ibnd_msg.hdr.tag.vio_subtype_env = VIO_DESC_DATA;
4583	ibnd_msg.hdr.tag.vio_sid = ldcp->local_session;
4584
4585	/*
4586	 * Copy the mem cookies describing the data from the
4587	 * private region of the descriptor ring into the inband
4588	 * descriptor.
4589	 */
4590	for (i = 0; i < priv_desc->ncookies; i++) {
4591		bcopy(&priv_desc->memcookie[i], &ibnd_msg.memcookie[i],
4592		    sizeof (ldc_mem_cookie_t));
4593	}
4594
4595	ibnd_msg.hdr.desc_handle = idx;
4596	ibnd_msg.ncookies = priv_desc->ncookies;
4597	ibnd_msg.nbytes = size;
4598
4599	ldcp->ldc_stats.opackets++;
4600	ldcp->ldc_stats.obytes += size;
4601
4602	RW_EXIT(&ldcp->lane_out.dlistrw);
4603
4604	(void) vsw_send_msg(ldcp, (void *)&ibnd_msg,
4605	    sizeof (vnet_ibnd_desc_t), B_TRUE);
4606
4607vsw_descrsend_free_exit:
4608
4609	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
4610	return (status);
4611}
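
/*
 * Note on the two transmit paths above: vsw_dringsend() is the dring
 * mode path, while vsw_descrsend() serves VIO_DESC_MODE peers (i.e.,
 * OBP), re-using the private ring purely as a staging buffer and
 * describing the data to the peer via in-band mem cookies instead.
 * Presumably port->transmit, invoked from vsw_ldcsend() above, is
 * pointed at one or the other when the handshake completes; that
 * assignment is outside this section.
 */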
4612
4613static void
4614vsw_send_ver(void *arg)
4615{
4616	vsw_ldc_t	*ldcp = (vsw_ldc_t *)arg;
4617	vsw_t		*vswp = ldcp->ldc_vswp;
4618	lane_t		*lp = &ldcp->lane_out;
4619	vio_ver_msg_t	ver_msg;
4620
4621	D1(vswp, "%s enter", __func__);
4622
4623	ver_msg.tag.vio_msgtype = VIO_TYPE_CTRL;
4624	ver_msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
4625	ver_msg.tag.vio_subtype_env = VIO_VER_INFO;
4626	ver_msg.tag.vio_sid = ldcp->local_session;
4627
4628	if (vsw_obp_ver_proto_workaround == B_FALSE) {
4629		ver_msg.ver_major = vsw_versions[0].ver_major;
4630		ver_msg.ver_minor = vsw_versions[0].ver_minor;
4631	} else {
4632		/* use the major/minor that we've ack'd */
4633		lane_t	*lpi = &ldcp->lane_in;
4634		ver_msg.ver_major = lpi->ver_major;
4635		ver_msg.ver_minor = lpi->ver_minor;
4636	}
4637	ver_msg.dev_class = VDEV_NETWORK_SWITCH;
4638
4639	lp->lstate |= VSW_VER_INFO_SENT;
4640	lp->ver_major = ver_msg.ver_major;
4641	lp->ver_minor = ver_msg.ver_minor;
4642
4643	DUMP_TAG(ver_msg.tag);
4644
4645	(void) vsw_send_msg(ldcp, &ver_msg, sizeof (vio_ver_msg_t), B_TRUE);
4646
4647	D1(vswp, "%s (%d): exit", __func__, ldcp->ldc_id);
4648}
4649
4650static void
4651vsw_send_attr(vsw_ldc_t *ldcp)
4652{
4653	vsw_t			*vswp = ldcp->ldc_vswp;
4654	lane_t			*lp = &ldcp->lane_out;
4655	vnet_attr_msg_t		attr_msg;
4656
4657	D1(vswp, "%s (%ld) enter", __func__, ldcp->ldc_id);
4658
4659	/*
4660	 * Subtype is set to INFO by default
4661	 */
4662	attr_msg.tag.vio_msgtype = VIO_TYPE_CTRL;
4663	attr_msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
4664	attr_msg.tag.vio_subtype_env = VIO_ATTR_INFO;
4665	attr_msg.tag.vio_sid = ldcp->local_session;
4666
4667	/* payload copied from default settings for lane */
4668	attr_msg.mtu = lp->mtu;
4669	attr_msg.addr_type = lp->addr_type;
4670	attr_msg.xfer_mode = lp->xfer_mode;
4671	attr_msg.ack_freq = lp->ack_freq;
4672
4673	READ_ENTER(&vswp->if_lockrw);
4674	attr_msg.addr = vnet_macaddr_strtoul((vswp->if_addr).ether_addr_octet);
4675	RW_EXIT(&vswp->if_lockrw);
4676
4677	ldcp->lane_out.lstate |= VSW_ATTR_INFO_SENT;
4678
4679	DUMP_TAG(attr_msg.tag);
4680
4681	(void) vsw_send_msg(ldcp, &attr_msg, sizeof (vnet_attr_msg_t), B_TRUE);
4682
4683	D1(vswp, "%s (%ld) exit", __func__, ldcp->ldc_id);
4684}
4685
4686/*
4687 * Create dring info msg (which also results in the creation of
4688 * a dring).
4689 */
4690static vio_dring_reg_msg_t *
4691vsw_create_dring_info_pkt(vsw_ldc_t *ldcp)
4692{
4693	vio_dring_reg_msg_t	*mp;
4694	dring_info_t		*dp;
4695	vsw_t			*vswp = ldcp->ldc_vswp;
4696
4697	D1(vswp, "vsw_create_dring_info_pkt enter\n");
4698
4699	/*
4700	 * If we can't create a dring, there is obviously no
4701	 * point in sending a message.
4702	 */
4703	if ((dp = vsw_create_dring(ldcp)) == NULL)
4704		return (NULL);
4705
4706	mp = kmem_zalloc(sizeof (vio_dring_reg_msg_t), KM_SLEEP);
4707
4708	mp->tag.vio_msgtype = VIO_TYPE_CTRL;
4709	mp->tag.vio_subtype = VIO_SUBTYPE_INFO;
4710	mp->tag.vio_subtype_env = VIO_DRING_REG;
4711	mp->tag.vio_sid = ldcp->local_session;
4712
4713	/* payload */
4714	mp->num_descriptors = dp->num_descriptors;
4715	mp->descriptor_size = dp->descriptor_size;
4716	mp->options = dp->options;
4717	mp->ncookies = dp->ncookies;
4718	bcopy(&dp->cookie[0], &mp->cookie[0], sizeof (ldc_mem_cookie_t));
4719
4720	mp->dring_ident = 0;
4721
4722	D1(vswp, "vsw_create_dring_info_pkt exit\n");
4723
4724	return (mp);
4725}
4726
4727static void
4728vsw_send_dring_info(vsw_ldc_t *ldcp)
4729{
4730	vio_dring_reg_msg_t	*dring_msg;
4731	vsw_t			*vswp = ldcp->ldc_vswp;
4732
4733	D1(vswp, "%s: (%ld) enter", __func__, ldcp->ldc_id);
4734
4735	dring_msg = vsw_create_dring_info_pkt(ldcp);
4736	if (dring_msg == NULL) {
4737		cmn_err(CE_WARN, "!vsw%d: %s: error creating msg",
4738		    vswp->instance, __func__);
4739		return;
4740	}
4741
4742	ldcp->lane_out.lstate |= VSW_DRING_INFO_SENT;
4743
4744	DUMP_TAG_PTR((vio_msg_tag_t *)dring_msg);
4745
4746	(void) vsw_send_msg(ldcp, dring_msg,
4747	    sizeof (vio_dring_reg_msg_t), B_TRUE);
4748
4749	kmem_free(dring_msg, sizeof (vio_dring_reg_msg_t));
4750
4751	D1(vswp, "%s: (%ld) exit", __func__, ldcp->ldc_id);
4752}
4753
4754static void
4755vsw_send_rdx(vsw_ldc_t *ldcp)
4756{
4757	vsw_t		*vswp = ldcp->ldc_vswp;
4758	vio_rdx_msg_t	rdx_msg;
4759
4760	D1(vswp, "%s (%ld) enter", __func__, ldcp->ldc_id);
4761
4762	rdx_msg.tag.vio_msgtype = VIO_TYPE_CTRL;
4763	rdx_msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
4764	rdx_msg.tag.vio_subtype_env = VIO_RDX;
4765	rdx_msg.tag.vio_sid = ldcp->local_session;
4766
4767	ldcp->lane_in.lstate |= VSW_RDX_INFO_SENT;
4768
4769	DUMP_TAG(rdx_msg.tag);
4770
4771	(void) vsw_send_msg(ldcp, &rdx_msg, sizeof (vio_rdx_msg_t), B_TRUE);
4772
4773	D1(vswp, "%s (%ld) exit", __func__, ldcp->ldc_id);
4774}
4775
4776/*
4777 * Generic routine to send message out over ldc channel.
4778 *
4779 * It is possible that when we attempt to write over the ldc channel
4780 * that we get notified that it has been reset. Depending on the value
4781 * of the handle_reset flag we either handle that event here or simply
4782 * notify the caller that the channel was reset.
4783 */
4784int
4785vsw_send_msg(vsw_ldc_t *ldcp, void *msgp, int size, boolean_t handle_reset)
4786{
4787	int			rv;
	int			wretries = vsw_wretries;
4788	size_t			msglen = size;
4789	vio_msg_tag_t		*tag = (vio_msg_tag_t *)msgp;
4790	vsw_t			*vswp = ldcp->ldc_vswp;
4791	vio_dring_msg_t		*dmsg;
4792	vio_raw_data_msg_t	*rmsg;
4793	vnet_ibnd_desc_t	*imsg;
4794	boolean_t		data_msg = B_FALSE;
4795
4796	D1(vswp, "vsw_send_msg (%lld) enter : sending %d bytes",
4797	    ldcp->ldc_id, size);
4798
4799	D2(vswp, "send_msg: type 0x%llx", tag->vio_msgtype);
4800	D2(vswp, "send_msg: stype 0x%llx", tag->vio_subtype);
4801	D2(vswp, "send_msg: senv 0x%llx", tag->vio_subtype_env);
4802
4803	mutex_enter(&ldcp->ldc_txlock);
4804
4805	if (tag->vio_subtype == VIO_SUBTYPE_INFO) {
4806		if (tag->vio_subtype_env == VIO_DRING_DATA) {
4807			dmsg = (vio_dring_msg_t *)tag;
4808			dmsg->seq_num = ldcp->lane_out.seq_num;
4809			data_msg = B_TRUE;
4810		} else if (tag->vio_subtype_env == VIO_PKT_DATA) {
4811			rmsg = (vio_raw_data_msg_t *)tag;
4812			rmsg->seq_num = ldcp->lane_out.seq_num;
4813			data_msg = B_TRUE;
4814		} else if (tag->vio_subtype_env == VIO_DESC_DATA) {
4815			imsg = (vnet_ibnd_desc_t *)tag;
4816			imsg->hdr.seq_num = ldcp->lane_out.seq_num;
4817			data_msg = B_TRUE;
4818		}
4819	}
4820
	/* retry on a local count so the vsw_wretries tunable is preserved */
4821	do {
4822		msglen = size;
4823		rv = ldc_write(ldcp->ldc_handle, (caddr_t)msgp, &msglen);
4824	} while (rv == EWOULDBLOCK && --wretries > 0);
4825
4826	if (rv == 0 && data_msg == B_TRUE) {
4827		ldcp->lane_out.seq_num++;
4828	}
4829
4830	if ((rv != 0) || (msglen != size)) {
4831		DERR(vswp, "vsw_send_msg:ldc_write failed: chan(%lld) rv(%d) "
4832		    "size (%d) msglen(%d)\n", ldcp->ldc_id, rv, size, msglen);
4833		ldcp->ldc_stats.oerrors++;
4834	}
4835
4836	mutex_exit(&ldcp->ldc_txlock);
4837
4838	/*
4839	 * If channel has been reset we either handle it here or
4840	 * simply report back that it has been reset and let caller
4841	 * decide what to do.
4842	 */
4843	if (rv == ECONNRESET) {
4844		DWARN(vswp, "%s (%lld) channel reset", __func__, ldcp->ldc_id);
4845
4846		/*
4847		 * N.B - must never be holding the dlistrw lock when
4848		 * we do a reset of the channel.
4849		 */
4850		if (handle_reset) {
4851			vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
4852		}
4853	}
4854
4855	return (rv);
4856}
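
/*
 * Usage note: callers already holding lane_out.dlistrw pass
 * handle_reset == B_FALSE and handle a reset themselves after dropping
 * the lock, as in the dring data path earlier in this file:
 *
 *	msg_rv = vsw_send_msg(ldcp, (void *)dring_pkt,
 *	    sizeof (vio_dring_msg_t), B_FALSE);
 *	...
 *	RW_EXIT(&ldcp->lane_out.dlistrw);
 *	if (msg_rv == ECONNRESET)
 *		vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
 *
 * Callers not holding that lock can simply pass B_TRUE.
 */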
4857
4858/*
4859 * Remove the specified address from the list of addresses maintained
4860 * in this port node.
4861 */
4862mcst_addr_t *
4863vsw_del_addr(uint8_t devtype, void *arg, uint64_t addr)
4864{
4865	vsw_t		*vswp = NULL;
4866	vsw_port_t	*port = NULL;
4867	mcst_addr_t	*prev_p = NULL;
4868	mcst_addr_t	*curr_p = NULL;
4869
4870	D1(NULL, "%s: enter : devtype %d : addr 0x%llx",
4871	    __func__, devtype, addr);
4872
4873	if (devtype == VSW_VNETPORT) {
4874		port = (vsw_port_t *)arg;
4875		mutex_enter(&port->mca_lock);
4876		prev_p = curr_p = port->mcap;
4877	} else {
4878		vswp = (vsw_t *)arg;
4879		mutex_enter(&vswp->mca_lock);
4880		prev_p = curr_p = vswp->mcap;
4881	}
4882
4883	while (curr_p != NULL) {
4884		if (curr_p->addr == addr) {
4885			D2(NULL, "%s: address found", __func__);
4886			/* match found */
4887			if (prev_p == curr_p) {
4888				/* list head */
4889				if (devtype == VSW_VNETPORT)
4890					port->mcap = curr_p->nextp;
4891				else
4892					vswp->mcap = curr_p->nextp;
4893			} else {
4894				prev_p->nextp = curr_p->nextp;
4895			}
4896			break;
4897		} else {
4898			prev_p = curr_p;
4899			curr_p = curr_p->nextp;
4900		}
4901	}
4902
4903	if (devtype == VSW_VNETPORT)
4904		mutex_exit(&port->mca_lock);
4905	else
4906		mutex_exit(&vswp->mca_lock);
4907
4908	D1(NULL, "%s: exit", __func__);
4909
4910	return (curr_p);
4911}
4912
4913/*
4914 * Creates a descriptor ring (dring) and links it into the
4915 * list of outbound drings for this channel.
4916 *
4917 * Returns NULL if creation failed.
4918 */
4919static dring_info_t *
4920vsw_create_dring(vsw_ldc_t *ldcp)
4921{
4922	vsw_private_desc_t	*priv_addr = NULL;
4923	vsw_t			*vswp = ldcp->ldc_vswp;
4924	ldc_mem_info_t		minfo;
4925	dring_info_t		*dp, *tp;
4926	int			i;
4927
4928	dp = (dring_info_t *)kmem_zalloc(sizeof (dring_info_t), KM_SLEEP);
4929
4930	mutex_init(&dp->dlock, NULL, MUTEX_DRIVER, NULL);
4931
4932	/* create public section of ring */
4933	if ((ldc_mem_dring_create(vsw_ntxds,
4934	    VSW_PUB_SIZE, &dp->handle)) != 0) {
4935
4936		DERR(vswp, "vsw_create_dring(%lld): ldc dring create "
4937		    "failed", ldcp->ldc_id);
4938		goto create_fail_exit;
4939	}
4940
4941	ASSERT(dp->handle != NULL);
4942
4943	/*
4944	 * Get the base address of the public section of the ring.
4945	 */
4946	if ((ldc_mem_dring_info(dp->handle, &minfo)) != 0) {
4947		DERR(vswp, "vsw_create_dring(%lld): dring info failed\n",
4948		    ldcp->ldc_id);
4949		goto dring_fail_exit;
4950	} else {
4951		ASSERT(minfo.vaddr != 0);
4952		dp->pub_addr = minfo.vaddr;
4953	}
4954
4955	dp->num_descriptors = vsw_ntxds;
4956	dp->descriptor_size = VSW_PUB_SIZE;
4957	dp->options = VIO_TX_DRING;
4958	dp->ncookies = 1;	/* guaranteed by ldc */
4959
4960	/*
4961	 * create private portion of ring
4962	 */
4963	dp->priv_addr = (vsw_private_desc_t *)kmem_zalloc(
4964	    (sizeof (vsw_private_desc_t) * vsw_ntxds), KM_SLEEP);
4965
4966	if (vsw_setup_ring(ldcp, dp)) {
4967		DERR(vswp, "%s: unable to setup ring", __func__);
4968		goto dring_fail_exit;
4969	}
4970
4971	/* haven't used any descriptors yet */
4972	dp->end_idx = 0;
4973	dp->last_ack_recv = -1;
4974
4975	/* bind dring to the channel */
4976	if ((ldc_mem_dring_bind(ldcp->ldc_handle, dp->handle,
4977	    LDC_SHADOW_MAP, LDC_MEM_RW,
4978	    &dp->cookie[0], &dp->ncookies)) != 0) {
4979		DERR(vswp, "vsw_create_dring: unable to bind to channel "
4980		    "%lld", ldcp->ldc_id);
4981		goto dring_fail_exit;
4982	}
4983
4984	mutex_init(&dp->restart_lock, NULL, MUTEX_DRIVER, NULL);
4985	dp->restart_reqd = B_TRUE;
4986
4987	/*
4988	 * Only ever create rings for the outgoing lane. Link the
4989	 * new ring onto the end of the list.
4990	 */
4991	WRITE_ENTER(&ldcp->lane_out.dlistrw);
4992	if (ldcp->lane_out.dringp == NULL) {
4993		D2(vswp, "vsw_create_dring: adding first outbound ring");
4994		ldcp->lane_out.dringp = dp;
4995	} else {
4996		tp = ldcp->lane_out.dringp;
4997		while (tp->next != NULL)
4998			tp = tp->next;
4999
5000		tp->next = dp;
5001	}
5002	RW_EXIT(&ldcp->lane_out.dlistrw);
5003
5004	return (dp);
5005
5006dring_fail_exit:
5007	(void) ldc_mem_dring_destroy(dp->handle);
5008
5009create_fail_exit:
5010	if (dp->priv_addr != NULL) {
5011		priv_addr = dp->priv_addr;
5012		for (i = 0; i < vsw_ntxds; i++) {
5013			if (priv_addr->memhandle != NULL)
5014				(void) ldc_mem_free_handle(
5015				    priv_addr->memhandle);
5016			priv_addr++;
5017		}
5018		kmem_free(dp->priv_addr,
5019		    (sizeof (vsw_private_desc_t) * vsw_ntxds));
5020	}
5021	mutex_destroy(&dp->dlock);
5022
5023	kmem_free(dp, sizeof (dring_info_t));
5024	return (NULL);
5025}
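
/*
 * Lifecycle summary for the ring built above, in terms of the calls in
 * this file: ldc_mem_dring_create() makes the public section,
 * vsw_setup_ring() allocates the data buffers and binds a mem handle
 * per descriptor, and ldc_mem_dring_bind() exports the ring over the
 * channel. Teardown in vsw_free_ring() below reverses this with
 * ldc_mem_unbind_handle()/ldc_mem_free_handle() per descriptor and
 * ldc_mem_dring_unbind()/ldc_mem_dring_destroy() for the ring itself.
 */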
5026
5027/*
5028 * Create a ring consisting of just a private portion and link
5029 * it into the list of rings for the outbound lane.
5030 *
5031 * This type of ring is used primarily for temporary data
5032 * storage (i.e. as data buffers).
5033 */
5034void
5035vsw_create_privring(vsw_ldc_t *ldcp)
5036{
5037	dring_info_t		*dp, *tp;
5038	vsw_t			*vswp = ldcp->ldc_vswp;
5039
5040	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
5041
5042	dp = kmem_zalloc(sizeof (dring_info_t), KM_SLEEP);
5043
5044	mutex_init(&dp->dlock, NULL, MUTEX_DRIVER, NULL);
5045
5046	/* no public section */
5047	dp->pub_addr = NULL;
5048
5049	dp->priv_addr = kmem_zalloc(
5050	    (sizeof (vsw_private_desc_t) * vsw_ntxds), KM_SLEEP);
5051
5052	dp->num_descriptors = vsw_ntxds;
5053
5054	if (vsw_setup_ring(ldcp, dp)) {
5055		DERR(vswp, "%s: setup of ring failed", __func__);
5056		kmem_free(dp->priv_addr,
5057		    (sizeof (vsw_private_desc_t) * vsw_ntxds));
5058		mutex_destroy(&dp->dlock);
5059		kmem_free(dp, sizeof (dring_info_t));
5060		return;
5061	}
5062
5063	/* haven't used any descriptors yet */
5064	dp->end_idx = 0;
5065
5066	mutex_init(&dp->restart_lock, NULL, MUTEX_DRIVER, NULL);
5067	dp->restart_reqd = B_TRUE;
5068
5069	/*
5070	 * Only ever create rings for the outgoing lane. Link the
5071	 * new ring onto the end of the list.
5072	 */
5073	WRITE_ENTER(&ldcp->lane_out.dlistrw);
5074	if (ldcp->lane_out.dringp == NULL) {
5075		D2(vswp, "%s: adding first outbound privring", __func__);
5076		ldcp->lane_out.dringp = dp;
5077	} else {
5078		tp = ldcp->lane_out.dringp;
5079		while (tp->next != NULL)
5080			tp = tp->next;
5081
5082		tp->next = dp;
5083	}
5084	RW_EXIT(&ldcp->lane_out.dlistrw);
5085
5086	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
5087}
5088
5089/*
5090 * Set up the descriptors in the dring. Returns 0 on success, 1 on
5091 * failure.
5092 */
5093int
5094vsw_setup_ring(vsw_ldc_t *ldcp, dring_info_t *dp)
5095{
5096	vnet_public_desc_t	*pub_addr = NULL;
5097	vsw_private_desc_t	*priv_addr = NULL;
5098	vsw_t			*vswp = ldcp->ldc_vswp;
5099	uint64_t		*tmpp;
5100	uint64_t		offset = 0;
5101	uint32_t		ncookies = 0;
5102	static char		*name = "vsw_setup_ring";
5103	int			i, j, nc, rv;
5104	size_t			data_sz;
5105
5106	priv_addr = dp->priv_addr;
5107	pub_addr = dp->pub_addr;
5108
5109	/* public section may be null but private should never be */
5110	ASSERT(priv_addr != NULL);
5111
5112	/*
5113	 * Allocate the region of memory which will be used to hold
5114	 * the data the descriptors will refer to.
5115	 */
5116	data_sz = vswp->max_frame_size + VNET_IPALIGN + VNET_LDCALIGN;
5117	data_sz = VNET_ROUNDUP_2K(data_sz);
5118	dp->desc_data_sz = data_sz;
5119	dp->data_sz = vsw_ntxds * data_sz;
5120	dp->data_addr = kmem_alloc(dp->data_sz, KM_SLEEP);
5121
5122	D2(vswp, "%s: allocated %lld bytes at 0x%llx\n", name,
5123	    dp->data_sz, dp->data_addr);
5124
5125	tmpp = (uint64_t *)dp->data_addr;
5126	offset = dp->desc_data_sz / sizeof (*tmpp);	/* in uint64_t units */
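
	/*
	 * e.g. with a standard ethernet max_frame_size, data_sz rounds up
	 * to the next 2K multiple (the exact value depends on VNET_IPALIGN
	 * and VNET_LDCALIGN), giving one fixed 2K slot per descriptor and
	 * dp->data_sz = vsw_ntxds * 2048 in the common case.
	 */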
5127
5128	/*
5129	 * Initialise some of the private and public (if they exist)
5130	 * descriptor fields.
5131	 */
5132	for (i = 0; i < vsw_ntxds; i++) {
5133		mutex_init(&priv_addr->dstate_lock, NULL, MUTEX_DRIVER, NULL);
5134
5135		if ((ldc_mem_alloc_handle(ldcp->ldc_handle,
5136		    &priv_addr->memhandle)) != 0) {
5137			DERR(vswp, "%s: alloc mem handle failed", name);
5138			goto setup_ring_cleanup;
5139		}
5140
5141		priv_addr->datap = (void *)tmpp;
5142
5143		rv = ldc_mem_bind_handle(priv_addr->memhandle,
5144		    (caddr_t)priv_addr->datap, dp->desc_data_sz,
5145		    LDC_SHADOW_MAP, LDC_MEM_R|LDC_MEM_W,
5146		    &(priv_addr->memcookie[0]), &ncookies);
5147		if (rv != 0) {
5148			DERR(vswp, "%s(%lld): ldc_mem_bind_handle failed "
5149			    "(rv %d)", name, ldcp->ldc_id, rv);
5150			goto setup_ring_cleanup;
5151		}
5152		priv_addr->bound = 1;
5153
5154		D2(vswp, "%s: %d: memcookie 0 : addr 0x%llx : size 0x%llx",
5155		    name, i, priv_addr->memcookie[0].addr,
5156		    priv_addr->memcookie[0].size);
5157
5158		if (ncookies >= (uint32_t)(VSW_MAX_COOKIES + 1)) {
5159			DERR(vswp, "%s(%lld) ldc_mem_bind_handle returned "
5160			    "invalid num of cookies (%d) for size 0x%llx",
5161		    name, ldcp->ldc_id, ncookies, dp->desc_data_sz);
5162
5163			goto setup_ring_cleanup;
5164		} else {
5165			for (j = 1; j < ncookies; j++) {
5166				rv = ldc_mem_nextcookie(priv_addr->memhandle,
5167				    &(priv_addr->memcookie[j]));
5168				if (rv != 0) {
5169					DERR(vswp, "%s: ldc_mem_nextcookie "
5170					    "failed rv (%d)", name, rv);
5171					goto setup_ring_cleanup;
5172				}
5173				D3(vswp, "%s: memcookie %d : addr 0x%llx : "
5174				    "size 0x%llx", name, j,
5175				    priv_addr->memcookie[j].addr,
5176				    priv_addr->memcookie[j].size);
5177			}
5178
5179		}
5180		priv_addr->ncookies = ncookies;
5181		priv_addr->dstate = VIO_DESC_FREE;
5182
5183		if (pub_addr != NULL) {
5184
5185			/* link pub and private sides */
5186			priv_addr->descp = pub_addr;
5187
5188			pub_addr->ncookies = priv_addr->ncookies;
5189
5190			for (nc = 0; nc < pub_addr->ncookies; nc++) {
5191				bcopy(&priv_addr->memcookie[nc],
5192				    &pub_addr->memcookie[nc],
5193				    sizeof (ldc_mem_cookie_t));
5194			}
5195
5196			pub_addr->hdr.dstate = VIO_DESC_FREE;
5197			pub_addr++;
5198		}
5199
5200		/*
5201		 * move to next element in the dring and the next
5202		 * position in the data buffer.
5203		 */
5204		priv_addr++;
5205		tmpp += offset;
5206	}
5207
5208	return (0);
5209
5210setup_ring_cleanup:
5211	priv_addr = dp->priv_addr;
5212
5213	for (j = 0; j < i; j++) {
5214		(void) ldc_mem_unbind_handle(priv_addr->memhandle);
5215		(void) ldc_mem_free_handle(priv_addr->memhandle);
5216
5217		mutex_destroy(&priv_addr->dstate_lock);
5218
5219		priv_addr++;
5220	}
5221	kmem_free(dp->data_addr, dp->data_sz);
5222
5223	return (1);
5224}
5225
5226/*
5227 * Searches the private section of a ring for a free descriptor,
5228 * starting at the location of the last free descriptor found
5229 * previously.
5230 *
5231 * Returns 0 if a free descriptor is available, and updates the state
5232 * of the private descriptor to VIO_DESC_READY; otherwise returns 1.
5233 *
5234 * FUTURE: might need to return contiguous range of descriptors
5235 * as dring info msg assumes all will be contiguous.
5236 */
5237static int
5238vsw_dring_find_free_desc(dring_info_t *dringp,
5239		vsw_private_desc_t **priv_p, int *idx)
5240{
5241	vsw_private_desc_t	*addr = NULL;
5242	int			num = vsw_ntxds;
5243	int			ret = 1;
5244
5245	D1(NULL, "%s enter\n", __func__);
5246
5247	ASSERT(dringp->priv_addr != NULL);
5248
5249	D2(NULL, "%s: searching ring, dringp 0x%llx : start pos %lld",
5250	    __func__, dringp, dringp->end_idx);
5251
5252	addr = (vsw_private_desc_t *)dringp->priv_addr + dringp->end_idx;
5253
5254	mutex_enter(&addr->dstate_lock);
5255	if (addr->dstate == VIO_DESC_FREE) {
5256		addr->dstate = VIO_DESC_READY;
5257		*priv_p = addr;
5258		*idx = dringp->end_idx;
5259		dringp->end_idx = (dringp->end_idx + 1) % num;
5260		ret = 0;
5261
5262	}
5263	mutex_exit(&addr->dstate_lock);
5264
5265	/* ring full */
5266	if (ret == 1) {
5267		D2(NULL, "%s: no descp free: started at %d", __func__,
5268		    dringp->end_idx);
5269	}
5270
5271	D1(NULL, "%s: exit\n", __func__);
5272
5273	return (ret);
5274}
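
/*
 * The search above is O(1) rather than a scan: end_idx always points at
 * the next candidate slot, and a successful claim of the last slot wraps
 * end_idx back to 0 via the modulo arithmetic. The per-descriptor
 * dstate_lock makes the claim atomic against vsw_reclaim_dring()
 * freeing descriptors concurrently.
 */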
5275
5276/*
5277 * Map from a dring identifier to the ring itself. Returns
5278 * pointer to ring or NULL if no match found.
5279 *
5280 * Should be called with dlistrw rwlock held as reader.
5281 */
5282static dring_info_t *
5283vsw_ident2dring(lane_t *lane, uint64_t ident)
5284{
5285	dring_info_t	*dp = NULL;
5286
5287	if ((dp = lane->dringp) == NULL) {
5288		return (NULL);
5289	} else {
5290		if (dp->ident == ident)
5291			return (dp);
5292
5293		while (dp != NULL) {
5294			if (dp->ident == ident)
5295				break;
5296			dp = dp->next;
5297		}
5298	}
5299
5300	return (dp);
5301}
5302
5303/*
5304 * Set the default lane attributes. These are copied into
5305 * the attr msg we send to our peer. If they are not acceptable
5306 * then (currently) the handshake ends.
5307 */
5308static void
5309vsw_set_lane_attr(vsw_t *vswp, lane_t *lp)
5310{
5311	bzero(lp, sizeof (lane_t));
5312
5313	READ_ENTER(&vswp->if_lockrw);
5314	ether_copy(&(vswp->if_addr), &(lp->addr));
5315	RW_EXIT(&vswp->if_lockrw);
5316
5317	lp->mtu = vswp->max_frame_size;
5318	lp->addr_type = ADDR_TYPE_MAC;
5319	lp->xfer_mode = VIO_DRING_MODE_V1_0;
5320	lp->ack_freq = 0;	/* for shared mode */
5321	lp->seq_num = VNET_ISS;
5322}
5323
5324/*
5325 * Verify that the attributes are acceptable.
5326 *
5327 * FUTURE: If some attributes are not acceptable, change them
5328 * to our desired values.
5329 */
5330static int
5331vsw_check_attr(vnet_attr_msg_t *pkt, vsw_ldc_t *ldcp)
5332{
5333	int			ret = 0;
5334	struct ether_addr	ea;
5335	vsw_port_t		*port = ldcp->ldc_port;
5336	lane_t			*lp = &ldcp->lane_out;
5337
5338	D1(NULL, "vsw_check_attr enter\n");
5339
5340	if ((pkt->xfer_mode != VIO_DESC_MODE) &&
5341	    (pkt->xfer_mode != lp->xfer_mode)) {
5342		D2(NULL, "vsw_check_attr: unknown mode %x\n", pkt->xfer_mode);
5343		ret = 1;
5344	}
5345
5346	/* Only support MAC addresses at the moment. */
5347	if ((pkt->addr_type != ADDR_TYPE_MAC) || (pkt->addr == 0)) {
5348		D2(NULL, "vsw_check_attr: invalid addr_type %x, "
5349		    "or address 0x%llx\n", pkt->addr_type, pkt->addr);
5350		ret = 1;
5351	}
5352
5353	/*
5354	 * MAC address supplied by device should match that stored
5355	 * in the vsw-port OBP node. Need to decide what to do if they
5356	 * don't match; for the moment just warn but don't fail.
5357	 */
5358	vnet_macaddr_ultostr(pkt->addr, ea.ether_addr_octet);
5359	if (ether_cmp(&ea, &port->p_macaddr) != 0) {
5360		DERR(NULL, "vsw_check_attr: device supplied address "
5361		    "0x%llx doesn't match node address 0x%llx\n",
5362		    pkt->addr, port->p_macaddr);
5363	}
5364
5365	/*
5366	 * Ack freq only makes sense in pkt mode; in shared
5367	 * mode the ring descriptors say whether or not to
5368	 * send back an ACK.
5369	 */
5370	if ((VSW_VER_GTEQ(ldcp, 1, 2) &&
5371	    (pkt->xfer_mode & VIO_DRING_MODE_V1_2)) ||
5372	    (VSW_VER_LT(ldcp, 1, 2) &&
5373	    (pkt->xfer_mode == VIO_DRING_MODE_V1_0))) {
5374		if (pkt->ack_freq > 0) {
5375			D2(NULL, "vsw_check_attr: non zero ack freq "
5376			    " in SHM mode\n");
5377			ret = 1;
5378		}
5379	}
5380
5381	/*
5382	 * Note: for the moment we only support ETHER
5383	 * frames. This may change in the future.
5384	 */
5385	if ((pkt->mtu > lp->mtu) || (pkt->mtu <= 0)) {
5386		D2(NULL, "vsw_check_attr: invalid MTU (0x%llx)\n",
5387		    pkt->mtu);
5388		ret = 1;
5389	}
5390
5391	D1(NULL, "vsw_check_attr exit\n");
5392
5393	return (ret);
5394}
5395
5396/*
5397 * Returns 1 if there is a problem, 0 otherwise.
5398 */
5399static int
5400vsw_check_dring_info(vio_dring_reg_msg_t *pkt)
5401{
5404	int	ret = 0;
5405
5406	D1(NULL, "vsw_check_dring_info enter\n");
5407
5408	if ((pkt->num_descriptors == 0) ||
5409	    (pkt->descriptor_size == 0) ||
5410	    (pkt->ncookies != 1)) {
5411		DERR(NULL, "vsw_check_dring_info: invalid dring msg");
5412		ret = 1;
5413	}
5414
5415	D1(NULL, "vsw_check_dring_info exit\n");
5416
5417	return (ret);
5418}
5419
5420/*
5421 * Returns 1 if two memory cookies match. Otherwise returns 0.
5422 */
5423static int
5424vsw_mem_cookie_match(ldc_mem_cookie_t *m1, ldc_mem_cookie_t *m2)
5425{
5426	if ((m1->addr != m2->addr) ||
5427	    (m1->size != m2->size)) {
5428		return (0);
5429	} else {
5430		return (1);
5431	}
5432}
5433
5434/*
5435 * Returns 1 if ring described in reg message matches that
5436 * described by dring_info structure. Otherwise returns 0.
5437 */
5438static int
5439vsw_dring_match(dring_info_t *dp, vio_dring_reg_msg_t *msg)
5440{
5441	if ((msg->descriptor_size != dp->descriptor_size) ||
5442	    (msg->num_descriptors != dp->num_descriptors) ||
5443	    (msg->ncookies != dp->ncookies) ||
5444	    !(vsw_mem_cookie_match(&msg->cookie[0], &dp->cookie[0]))) {
5445		return (0);
5446	} else {
5447		return (1);
5448	}
5449
5450}
5451
5452static caddr_t
5453vsw_print_ethaddr(uint8_t *a, char *ebuf)
5454{
5455	(void) sprintf(ebuf, "%x:%x:%x:%x:%x:%x",
5456	    a[0], a[1], a[2], a[3], a[4], a[5]);
5457	return (ebuf);
5458}
5459
5460/*
5461 * Reset and free all the resources associated with
5462 * the channel.
5463 */
5464static void
5465vsw_free_lane_resources(vsw_ldc_t *ldcp, uint64_t dir)
5466{
5467	dring_info_t		*dp, *dpp;
5468	lane_t			*lp = NULL;
5469	int			rv = 0;
5470
5471	ASSERT(ldcp != NULL);
5472
5473	D1(ldcp->ldc_vswp, "%s (%lld): enter", __func__, ldcp->ldc_id);
5474
5475	if (dir == INBOUND) {
5476		D2(ldcp->ldc_vswp, "%s: freeing INBOUND lane"
5477		    " of channel %lld", __func__, ldcp->ldc_id);
5478		lp = &ldcp->lane_in;
5479	} else {
5480		D2(ldcp->ldc_vswp, "%s: freeing OUTBOUND lane"
5481		    " of channel %lld", __func__, ldcp->ldc_id);
5482		lp = &ldcp->lane_out;
5483	}
5484
5485	lp->lstate = VSW_LANE_INACTIV;
5486	lp->seq_num = VNET_ISS;
5487
5488	if (lp->dringp) {
5489		if (dir == INBOUND) {
5490			WRITE_ENTER(&lp->dlistrw);
5491			dp = lp->dringp;
5492			while (dp != NULL) {
5493				dpp = dp->next;
5494				if (dp->handle != NULL)
5495					(void) ldc_mem_dring_unmap(dp->handle);
5496				kmem_free(dp, sizeof (dring_info_t));
5497				dp = dpp;
5498			}
5499			RW_EXIT(&lp->dlistrw);
5500		} else {
5501			/*
5502			 * unbind, destroy exported dring, free dring struct
5503			 */
5504			WRITE_ENTER(&lp->dlistrw);
5505			dp = lp->dringp;
5506			rv = vsw_free_ring(dp);
5507			RW_EXIT(&lp->dlistrw);
5508		}
5509		if (rv == 0) {
5510			lp->dringp = NULL;
5511		}
5512	}
5513
5514	D1(ldcp->ldc_vswp, "%s (%lld): exit", __func__, ldcp->ldc_id);
5515}
5516
5517/*
5518 * Free ring and all associated resources.
5519 *
5520 * Should be called with dlistrw rwlock held as writer.
5521 */
5522static int
5523vsw_free_ring(dring_info_t *dp)
5524{
5525	vsw_private_desc_t	*paddr = NULL;
5526	dring_info_t		*dpp;
5527	int			i, rv = 1;
5528
5529	while (dp != NULL) {
5530		mutex_enter(&dp->dlock);
5531		dpp = dp->next;
5532		if (dp->priv_addr != NULL) {
5533			/*
5534			 * First unbind and free the memory handles
5535			 * stored in each descriptor within the ring.
5536			 */
5537			for (i = 0; i < vsw_ntxds; i++) {
5538				paddr = (vsw_private_desc_t *)
5539				    dp->priv_addr + i;
5540				if (paddr->memhandle != NULL) {
5541					if (paddr->bound == 1) {
5542						rv = ldc_mem_unbind_handle(
5543						    paddr->memhandle);
5544
5545						if (rv != 0) {
5546							DERR(NULL, "error "
5547							"unbinding handle for "
5548							"ring 0x%llx at pos %d",
5549							    dp, i);
5550							mutex_exit(&dp->dlock);
5551							return (rv);
5552						}
5553						paddr->bound = 0;
5554					}
5555
5556					rv = ldc_mem_free_handle(
5557					    paddr->memhandle);
5558					if (rv != 0) {
5559						DERR(NULL, "error freeing "
5560						    "handle for ring 0x%llx "
5561						    "at pos %d", dp, i);
5562						mutex_exit(&dp->dlock);
5563						return (rv);
5564					}
5565					paddr->memhandle = NULL;
5566				}
5567				mutex_destroy(&paddr->dstate_lock);
5568			}
5569			kmem_free(dp->priv_addr,
5570			    (sizeof (vsw_private_desc_t) * vsw_ntxds));
5571		}
5572
5573		/*
5574		 * Now unbind and destroy the ring itself.
5575		 */
5576		if (dp->handle != NULL) {
5577			(void) ldc_mem_dring_unbind(dp->handle);
5578			(void) ldc_mem_dring_destroy(dp->handle);
5579		}
5580
5581		if (dp->data_addr != NULL) {
5582			kmem_free(dp->data_addr, dp->data_sz);
5583		}
5584
5585		mutex_exit(&dp->dlock);
5586		mutex_destroy(&dp->dlock);
5587		mutex_destroy(&dp->restart_lock);
5588		kmem_free(dp, sizeof (dring_info_t));
5589
5590		dp = dpp;
5591	}
5592	return (0);
5593}
5594
5595/*
5596 * vsw_ldc_rx_worker -- A per LDC worker thread to receive data.
5597 * This thread is woken up by the LDC interrupt handler to process
5598 * LDC packets and receive data.
5599 */
5600static void
5601vsw_ldc_rx_worker(void *arg)
5602{
5603	callb_cpr_t	cprinfo;
5604	vsw_ldc_t *ldcp = (vsw_ldc_t *)arg;
5605	vsw_t *vswp = ldcp->ldc_vswp;
5606
5607	D1(vswp, "%s(%lld):enter\n", __func__, ldcp->ldc_id);
5608	CALLB_CPR_INIT(&cprinfo, &ldcp->rx_thr_lock, callb_generic_cpr,
5609	    "vsw_rx_thread");
5610	mutex_enter(&ldcp->rx_thr_lock);
5611	ldcp->rx_thr_flags |= VSW_WTHR_RUNNING;
5612	while (!(ldcp->rx_thr_flags & VSW_WTHR_STOP)) {
5613
5614		CALLB_CPR_SAFE_BEGIN(&cprinfo);
5615		/*
5616		 * Wait until the data is received or a stop
5617		 * request is received.
5618		 */
5619		while (!(ldcp->rx_thr_flags &
5620		    (VSW_WTHR_DATARCVD | VSW_WTHR_STOP))) {
5621			cv_wait(&ldcp->rx_thr_cv, &ldcp->rx_thr_lock);
5622		}
5623		CALLB_CPR_SAFE_END(&cprinfo, &ldcp->rx_thr_lock)
5624
5625		/*
5626		 * First process the stop request.
5627		 */
5628		if (ldcp->rx_thr_flags & VSW_WTHR_STOP) {
5629			D2(vswp, "%s(%lld):Rx thread stopped\n",
5630			    __func__, ldcp->ldc_id);
5631			break;
5632		}
5633		ldcp->rx_thr_flags &= ~VSW_WTHR_DATARCVD;
5634		mutex_exit(&ldcp->rx_thr_lock);
5635		D1(vswp, "%s(%lld):calling vsw_process_pkt\n",
5636		    __func__, ldcp->ldc_id);
5637		mutex_enter(&ldcp->ldc_cblock);
5638		vsw_process_pkt(ldcp);
5639		mutex_exit(&ldcp->ldc_cblock);
5640		mutex_enter(&ldcp->rx_thr_lock);
5641	}
5642
5643	/*
5644	 * Update the run status and wakeup the thread that
5645	 * has sent the stop request.
5646	 */
5647	ldcp->rx_thr_flags &= ~VSW_WTHR_RUNNING;
5648	cv_signal(&ldcp->rx_thr_cv);
5649	CALLB_CPR_EXIT(&cprinfo);
5650	D1(vswp, "%s(%lld):exit\n", __func__, ldcp->ldc_id);
5651	thread_exit();
5652}
5653
5654/* vsw_stop_rx_thread -- Co-ordinate with receive thread to stop it */
5655static void
5656vsw_stop_rx_thread(vsw_ldc_t *ldcp)
5657{
5658	vsw_t *vswp = ldcp->ldc_vswp;
5659
5660	D1(vswp, "%s(%lld):enter\n", __func__, ldcp->ldc_id);
5661	/*
5662	 * Send a stop request by setting the stop flag and
5663	 * wait until the receive thread stops.
5664	 */
5665	mutex_enter(&ldcp->rx_thr_lock);
5666	if (ldcp->rx_thr_flags & VSW_WTHR_RUNNING) {
5667		ldcp->rx_thr_flags |= VSW_WTHR_STOP;
5668		cv_signal(&ldcp->rx_thr_cv);
5669		while (ldcp->rx_thr_flags & VSW_WTHR_RUNNING) {
5670			cv_wait(&ldcp->rx_thr_cv, &ldcp->rx_thr_lock);
5671		}
5672	}
5673	mutex_exit(&ldcp->rx_thr_lock);
5674	ldcp->rx_thread = NULL;
5675	D1(vswp, "%s(%lld):exit\n", __func__, ldcp->ldc_id);
5676}
5677
5678/*
5679 * vsw_ldc_tx_worker -- A per LDC worker thread to transmit data.
5680 * This thread is woken up by vsw_portsend to transmit
5681 * packets.
5682 */
static void
vsw_ldc_tx_worker(void *arg)
{
	callb_cpr_t	cprinfo;
	vsw_ldc_t *ldcp = (vsw_ldc_t *)arg;
	vsw_t *vswp = ldcp->ldc_vswp;
	mblk_t *mp;
	mblk_t *tmp;

	D1(vswp, "%s(%lld):enter\n", __func__, ldcp->ldc_id);
	CALLB_CPR_INIT(&cprinfo, &ldcp->tx_thr_lock, callb_generic_cpr,
	    "vsw_tx_thread");
	mutex_enter(&ldcp->tx_thr_lock);
	ldcp->tx_thr_flags |= VSW_WTHR_RUNNING;
	while (!(ldcp->tx_thr_flags & VSW_WTHR_STOP)) {

		CALLB_CPR_SAFE_BEGIN(&cprinfo);
		/*
		 * Wait until there is data to transmit or a stop
		 * request is received.
		 */
		while (!(ldcp->tx_thr_flags & VSW_WTHR_STOP) &&
		    (ldcp->tx_mhead == NULL)) {
			cv_wait(&ldcp->tx_thr_cv, &ldcp->tx_thr_lock);
		}
		CALLB_CPR_SAFE_END(&cprinfo, &ldcp->tx_thr_lock)

		/*
		 * First process the stop request.
		 */
		if (ldcp->tx_thr_flags & VSW_WTHR_STOP) {
			D2(vswp, "%s(%lld):tx thread stopped\n",
			    __func__, ldcp->ldc_id);
			break;
		}
		mp = ldcp->tx_mhead;
		ldcp->tx_mhead = ldcp->tx_mtail = NULL;
		ldcp->tx_cnt = 0;
		mutex_exit(&ldcp->tx_thr_lock);
		D2(vswp, "%s(%lld):calling vsw_ldcsend\n",
		    __func__, ldcp->ldc_id);
		while (mp != NULL) {
			tmp = mp->b_next;
			mp->b_next = mp->b_prev = NULL;
			(void) vsw_ldcsend(ldcp, mp, vsw_ldc_tx_retries);
			mp = tmp;
		}
		mutex_enter(&ldcp->tx_thr_lock);
	}

	/*
	 * Update the run status and wake up the thread that
	 * sent the stop request.
	 */
	ldcp->tx_thr_flags &= ~VSW_WTHR_RUNNING;
	cv_signal(&ldcp->tx_thr_cv);
	CALLB_CPR_EXIT(&cprinfo);
	D1(vswp, "%s(%lld):exit\n", __func__, ldcp->ldc_id);
	thread_exit();
}

/* vsw_stop_tx_thread -- Coordinate with the transmit thread to stop it */
static void
vsw_stop_tx_thread(vsw_ldc_t *ldcp)
{
	vsw_t *vswp = ldcp->ldc_vswp;

	D1(vswp, "%s(%lld):enter\n", __func__, ldcp->ldc_id);
	/*
	 * Send a stop request by setting the stop flag, then
	 * wait until the transmit thread stops.
	 */
	mutex_enter(&ldcp->tx_thr_lock);
	if (ldcp->tx_thr_flags & VSW_WTHR_RUNNING) {
		ldcp->tx_thr_flags |= VSW_WTHR_STOP;
		cv_signal(&ldcp->tx_thr_cv);
		while (ldcp->tx_thr_flags & VSW_WTHR_RUNNING) {
			cv_wait(&ldcp->tx_thr_cv, &ldcp->tx_thr_lock);
		}
	}
	mutex_exit(&ldcp->tx_thr_lock);
	ldcp->tx_thread = NULL;
	D1(vswp, "%s(%lld):exit\n", __func__, ldcp->ldc_id);
}

/* vsw_reclaim_dring -- reclaim descriptors */
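/*
 * Walk at most num_descriptors entries starting at 'start', moving
 * descriptors the peer has marked VIO_DESC_DONE back to
 * VIO_DESC_FREE. Since the ring is consumed in order, the walk stops
 * at the first descriptor that is not yet DONE; the return value is
 * the number of descriptors reclaimed.
 */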
static int
vsw_reclaim_dring(dring_info_t *dp, int start)
{
	int i, j, len;
	vsw_private_desc_t *priv_addr;
	vnet_public_desc_t *pub_addr;

	len = dp->num_descriptors;

	D2(NULL, "%s: start index %d\n", __func__, start);

	j = 0;
	for (i = start; j < len; i = (i + 1) % len, j++) {
		pub_addr = (vnet_public_desc_t *)dp->pub_addr + i;
		priv_addr = (vsw_private_desc_t *)dp->priv_addr + i;

		mutex_enter(&priv_addr->dstate_lock);
		if (pub_addr->hdr.dstate != VIO_DESC_DONE) {
			mutex_exit(&priv_addr->dstate_lock);
			break;
		}
		pub_addr->hdr.dstate = VIO_DESC_FREE;
		priv_addr->dstate = VIO_DESC_FREE;
		/* clear all the fields */
		priv_addr->datalen = 0;
		pub_addr->hdr.ack = 0;
		mutex_exit(&priv_addr->dstate_lock);

		D3(NULL, "reclaiming desc:%d pub state:0x%llx priv state 0x%llx",
		    i, pub_addr->hdr.dstate, priv_addr->dstate);
	}
	return (j);
}

/*
 * Debugging routines
 */
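/*
 * Dump the state of every vsw instance, its ports, and their
 * channels to the console. Only reader locks are taken, so this is
 * intended to be safe to run alongside normal data-path activity.
 */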
static void
display_state(void)
{
	vsw_t		*vswp;
	vsw_port_list_t	*plist;
	vsw_port_t	*port;
	vsw_ldc_list_t	*ldcl;
	vsw_ldc_t	*ldcp;
	extern vsw_t	*vsw_head;

	cmn_err(CE_NOTE, "***** system state *****");

	for (vswp = vsw_head; vswp; vswp = vswp->next) {
		plist = &vswp->plist;
		READ_ENTER(&plist->lockrw);
		cmn_err(CE_CONT, "vsw instance %d has %d ports attached\n",
		    vswp->instance, plist->num_ports);

		for (port = plist->head; port != NULL; port = port->p_next) {
			ldcl = &port->p_ldclist;
			cmn_err(CE_CONT, "port %d : %d ldcs attached\n",
			    port->p_instance, port->num_ldcs);
			READ_ENTER(&ldcl->lockrw);
			ldcp = ldcl->head;
			for (; ldcp != NULL; ldcp = ldcp->ldc_next) {
				cmn_err(CE_CONT, "chan %lu : dev %d : "
				    "status %d : phase %u\n",
				    ldcp->ldc_id, ldcp->dev_class,
				    ldcp->ldc_status, ldcp->hphase);
				cmn_err(CE_CONT, "chan %lu : lsession %lu : "
				    "psession %lu\n", ldcp->ldc_id,
				    ldcp->local_session, ldcp->peer_session);

				cmn_err(CE_CONT, "Inbound lane:\n");
				display_lane(&ldcp->lane_in);
				cmn_err(CE_CONT, "Outbound lane:\n");
				display_lane(&ldcp->lane_out);
			}
			RW_EXIT(&ldcl->lockrw);
		}
		RW_EXIT(&plist->lockrw);
	}
	cmn_err(CE_NOTE, "***** system state *****");
}

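/*
 * Dump a single lane: negotiated version, handshake state bits (see
 * dump_flags()), MTU, address and transfer mode, followed by each
 * descriptor ring attached to the lane.
 */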
static void
display_lane(lane_t *lp)
{
	dring_info_t	*drp;

	cmn_err(CE_CONT, "ver 0x%x:0x%x : state %lx : mtu 0x%lx\n",
	    lp->ver_major, lp->ver_minor, lp->lstate, lp->mtu);
	cmn_err(CE_CONT, "addr_type %d : addr 0x%lx : xmode %d\n",
	    lp->addr_type, lp->addr, lp->xfer_mode);
	cmn_err(CE_CONT, "dringp 0x%lx\n", (uint64_t)lp->dringp);

	cmn_err(CE_CONT, "Dring info:\n");
	for (drp = lp->dringp; drp != NULL; drp = drp->next) {
		cmn_err(CE_CONT, "\tnum_desc %u : dsize %u\n",
		    drp->num_descriptors, drp->descriptor_size);
		cmn_err(CE_CONT, "\thandle 0x%lx\n", drp->handle);
		cmn_err(CE_CONT, "\tpub_addr 0x%lx : priv_addr 0x%lx\n",
		    (uint64_t)drp->pub_addr, (uint64_t)drp->priv_addr);
		cmn_err(CE_CONT, "\tident 0x%lx : end_idx %lu\n",
		    drp->ident, drp->end_idx);
		display_ring(drp);
	}
}

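/*
 * Summarize a descriptor ring: count how many of the first vsw_ntxds
 * public and private descriptors are in the VIO_DESC_FREE state.
 */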
static void
display_ring(dring_info_t *dringp)
{
	uint64_t		i;
	uint64_t		priv_count = 0;
	uint64_t		pub_count = 0;
	vnet_public_desc_t	*pub_addr = NULL;
	vsw_private_desc_t	*priv_addr = NULL;

	for (i = 0; i < vsw_ntxds; i++) {
		if (dringp->pub_addr != NULL) {
			pub_addr = (vnet_public_desc_t *)dringp->pub_addr + i;

			if (pub_addr->hdr.dstate == VIO_DESC_FREE)
				pub_count++;
		}

		if (dringp->priv_addr != NULL) {
			priv_addr = (vsw_private_desc_t *)dringp->priv_addr + i;

			if (priv_addr->dstate == VIO_DESC_FREE)
				priv_count++;
		}
	}
	cmn_err(CE_CONT, "\t%lu elements: %lu priv free: %lu pub free\n",
	    i, priv_count, pub_count);
}

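/*
 * Decode a lane state word (presumably the lane_t lstate value shown
 * by display_lane()) into the symbolic VSW_* handshake flag names.
 */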
static void
dump_flags(uint64_t state)
{
	int	i;

	typedef struct flag_name {
		int	flag_val;
		char	*flag_name;
	} flag_name_t;

	flag_name_t	flags[] = {
		{ VSW_VER_INFO_SENT, "VSW_VER_INFO_SENT" },
		{ VSW_VER_INFO_RECV, "VSW_VER_INFO_RECV" },
		{ VSW_VER_ACK_RECV, "VSW_VER_ACK_RECV" },
		{ VSW_VER_ACK_SENT, "VSW_VER_ACK_SENT" },
		{ VSW_VER_NACK_RECV, "VSW_VER_NACK_RECV" },
		{ VSW_VER_NACK_SENT, "VSW_VER_NACK_SENT" },
		{ VSW_ATTR_INFO_SENT, "VSW_ATTR_INFO_SENT" },
		{ VSW_ATTR_INFO_RECV, "VSW_ATTR_INFO_RECV" },
		{ VSW_ATTR_ACK_SENT, "VSW_ATTR_ACK_SENT" },
		{ VSW_ATTR_ACK_RECV, "VSW_ATTR_ACK_RECV" },
		{ VSW_ATTR_NACK_SENT, "VSW_ATTR_NACK_SENT" },
		{ VSW_ATTR_NACK_RECV, "VSW_ATTR_NACK_RECV" },
		{ VSW_DRING_INFO_SENT, "VSW_DRING_INFO_SENT" },
		{ VSW_DRING_INFO_RECV, "VSW_DRING_INFO_RECV" },
		{ VSW_DRING_ACK_SENT, "VSW_DRING_ACK_SENT" },
		{ VSW_DRING_ACK_RECV, "VSW_DRING_ACK_RECV" },
		{ VSW_DRING_NACK_SENT, "VSW_DRING_NACK_SENT" },
		{ VSW_DRING_NACK_RECV, "VSW_DRING_NACK_RECV" },
		{ VSW_RDX_INFO_SENT, "VSW_RDX_INFO_SENT" },
		{ VSW_RDX_INFO_RECV, "VSW_RDX_INFO_RECV" },
		{ VSW_RDX_ACK_SENT, "VSW_RDX_ACK_SENT" },
		{ VSW_RDX_ACK_RECV, "VSW_RDX_ACK_RECV" },
		{ VSW_RDX_NACK_SENT, "VSW_RDX_NACK_SENT" },
		{ VSW_RDX_NACK_RECV, "VSW_RDX_NACK_RECV" },
		{ VSW_MCST_INFO_SENT, "VSW_MCST_INFO_SENT" },
		{ VSW_MCST_INFO_RECV, "VSW_MCST_INFO_RECV" },
		{ VSW_MCST_ACK_SENT, "VSW_MCST_ACK_SENT" },
		{ VSW_MCST_ACK_RECV, "VSW_MCST_ACK_RECV" },
		{ VSW_MCST_NACK_SENT, "VSW_MCST_NACK_SENT" },
		{ VSW_MCST_NACK_RECV, "VSW_MCST_NACK_RECV" },
		{ VSW_LANE_ACTIVE, "VSW_LANE_ACTIVE" }
	};

	DERR(NULL, "DUMP_FLAGS: %llx\n", state);
	for (i = 0; i < sizeof (flags)/sizeof (flag_name_t); i++) {
		if (state & flags[i].flag_val)
			DERR(NULL, "DUMP_FLAGS %s", flags[i].flag_name);
	}
}
5954