1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
24 * Use is subject to license terms.
25 */
26
27#ifdef DEBUG
28#define	XNB_DEBUG 1
29#endif /* DEBUG */
30
31#include "xnb.h"
32
33#include <sys/sunddi.h>
34#include <sys/sunndi.h>
35#include <sys/modctl.h>
36#include <sys/conf.h>
37#include <sys/mac.h>
38#include <sys/mac_impl.h> /* For mac_fix_cksum(). */
39#include <sys/dlpi.h>
40#include <sys/strsubr.h>
41#include <sys/strsun.h>
42#include <sys/types.h>
43#include <sys/pattr.h>
44#include <vm/seg_kmem.h>
45#include <vm/hat_i86.h>
46#include <xen/sys/xenbus_impl.h>
47#include <xen/sys/xendev.h>
48#include <sys/balloon_impl.h>
49#include <sys/evtchn_impl.h>
50#include <sys/gnttab.h>
51#include <vm/vm_dep.h>
52#include <sys/note.h>
53#include <sys/gld.h>
54#include <inet/ip.h>
55#include <inet/ip_impl.h>
56
/*
 * The terms "transmit" and "receive" are used from the point of view
 * of the peer domU: packets originating from the peer domU are
 * "transmitted" onward to the rest of the system, and packets
 * destined for the peer domU are "received" from the rest of the
 * system.
 */
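
/*
 * Concretely, xnb_from_peer() consumes the peer's transmit ring and
 * the resulting packets are handed to the flavour driver by
 * xnb_intr(), while xnb_to_peer() and xnb_copy_to_peer() place
 * packets on the peer's receive ring; the "tx" and "rx" statistics
 * below follow the same convention.
 */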
62
63/*
64 * Should we allow guests to manipulate multicast group membership?
65 */
66static boolean_t	xnb_multicast_control = B_TRUE;
67
68static boolean_t	xnb_connect_rings(dev_info_t *);
69static void		xnb_disconnect_rings(dev_info_t *);
70static void		xnb_oe_state_change(dev_info_t *, ddi_eventcookie_t,
71    void *, void *);
72static void		xnb_hp_state_change(dev_info_t *, ddi_eventcookie_t,
73    void *, void *);
74
75static int	xnb_txbuf_constructor(void *, void *, int);
76static void	xnb_txbuf_destructor(void *, void *);
77static void	xnb_tx_notify_peer(xnb_t *, boolean_t);
78static void	xnb_tx_mark_complete(xnb_t *, RING_IDX, int16_t);
79
80mblk_t		*xnb_to_peer(xnb_t *, mblk_t *);
81mblk_t		*xnb_copy_to_peer(xnb_t *, mblk_t *);
82
83static void		setup_gop(xnb_t *, gnttab_copy_t *, uchar_t *,
84    size_t, size_t, size_t, grant_ref_t);
85#pragma inline(setup_gop)
86static boolean_t	is_foreign(void *);
87#pragma inline(is_foreign)
88
89#define	INVALID_GRANT_HANDLE	((grant_handle_t)-1)
90#define	INVALID_GRANT_REF	((grant_ref_t)-1)
91
92static kmutex_t	xnb_alloc_page_lock;
93
/*
 * On a 32 bit PAE system physical and machine addresses are larger
 * than 32 bits.  ddi_btop() on such systems takes an unsigned long
 * argument, and so addresses above 4G are truncated before ddi_btop()
 * gets to see them.  To avoid this, code the shift operation here.
 */
100#define	xnb_btop(addr)	((addr) >> PAGESHIFT)
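
/*
 * For example, a machine address of 0x100000000 (4G) narrowed to a
 * 32 bit unsigned long becomes 0, so ddi_btop() would return page 0
 * instead of 0x100000; xnb_btop() avoids the narrowing entirely.
 */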
101
102/* DMA attributes for transmit and receive data */
103static ddi_dma_attr_t buf_dma_attr = {
104	DMA_ATTR_V0,		/* version of this structure */
105	0,			/* lowest usable address */
106	0xffffffffffffffffULL,	/* highest usable address */
107	0x7fffffff,		/* maximum DMAable byte count */
108	MMU_PAGESIZE,		/* alignment in bytes */
109	0x7ff,			/* bitmap of burst sizes */
110	1,			/* minimum transfer */
111	0xffffffffU,		/* maximum transfer */
112	0xffffffffffffffffULL,	/* maximum segment length */
113	1,			/* maximum number of segments */
114	1,			/* granularity */
115	0,			/* flags (reserved) */
116};
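
/*
 * With a single DMA segment and page alignment these attributes
 * guarantee that each transmit buffer allocated against them is one
 * page-aligned, physically contiguous region; xnb_txbuf_constructor()
 * relies on this when it derives xt_mfn from the single DMA cookie.
 */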
117
118/* DMA access attributes for data: NOT to be byte swapped. */
119static ddi_device_acc_attr_t data_accattr = {
120	DDI_DEVICE_ATTR_V0,
121	DDI_NEVERSWAP_ACC,
122	DDI_STRICTORDER_ACC
123};
124
125/*
126 * Statistics.
127 */
128static const char * const aux_statistics[] = {
129	"rx_cksum_deferred",
130	"tx_cksum_no_need",
131	"rx_rsp_notok",
132	"tx_notify_deferred",
133	"tx_notify_sent",
134	"rx_notify_deferred",
135	"rx_notify_sent",
136	"tx_too_early",
137	"rx_too_early",
138	"rx_allocb_failed",
139	"tx_allocb_failed",
140	"rx_foreign_page",
141	"mac_full",
142	"spurious_intr",
143	"allocation_success",
144	"allocation_failure",
145	"small_allocation_success",
146	"small_allocation_failure",
147	"other_allocation_failure",
148	"rx_pageboundary_crossed",
149	"rx_cpoparea_grown",
150	"csum_hardware",
151	"csum_software",
152	"tx_overflow_page",
153	"tx_unexpected_flags",
154};
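
/*
 * These can be examined from userland with kstat(1M), e.g.
 * "kstat -n aux_statistics"; the module name is that of the flavour
 * driver (e.g. xnbo or xnbu) which called xnb_attach().
 */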
155
156static int
157xnb_ks_aux_update(kstat_t *ksp, int flag)
158{
159	xnb_t *xnbp;
160	kstat_named_t *knp;
161
162	if (flag != KSTAT_READ)
163		return (EACCES);
164
165	xnbp = ksp->ks_private;
166	knp = ksp->ks_data;
167
168	/*
169	 * Assignment order should match that of the names in
170	 * aux_statistics.
171	 */
172	(knp++)->value.ui64 = xnbp->xnb_stat_rx_cksum_deferred;
173	(knp++)->value.ui64 = xnbp->xnb_stat_tx_cksum_no_need;
174	(knp++)->value.ui64 = xnbp->xnb_stat_rx_rsp_notok;
175	(knp++)->value.ui64 = xnbp->xnb_stat_tx_notify_deferred;
176	(knp++)->value.ui64 = xnbp->xnb_stat_tx_notify_sent;
177	(knp++)->value.ui64 = xnbp->xnb_stat_rx_notify_deferred;
178	(knp++)->value.ui64 = xnbp->xnb_stat_rx_notify_sent;
179	(knp++)->value.ui64 = xnbp->xnb_stat_tx_too_early;
180	(knp++)->value.ui64 = xnbp->xnb_stat_rx_too_early;
181	(knp++)->value.ui64 = xnbp->xnb_stat_rx_allocb_failed;
182	(knp++)->value.ui64 = xnbp->xnb_stat_tx_allocb_failed;
183	(knp++)->value.ui64 = xnbp->xnb_stat_rx_foreign_page;
184	(knp++)->value.ui64 = xnbp->xnb_stat_mac_full;
185	(knp++)->value.ui64 = xnbp->xnb_stat_spurious_intr;
186	(knp++)->value.ui64 = xnbp->xnb_stat_allocation_success;
187	(knp++)->value.ui64 = xnbp->xnb_stat_allocation_failure;
188	(knp++)->value.ui64 = xnbp->xnb_stat_small_allocation_success;
189	(knp++)->value.ui64 = xnbp->xnb_stat_small_allocation_failure;
190	(knp++)->value.ui64 = xnbp->xnb_stat_other_allocation_failure;
191	(knp++)->value.ui64 = xnbp->xnb_stat_rx_pagebndry_crossed;
192	(knp++)->value.ui64 = xnbp->xnb_stat_rx_cpoparea_grown;
193	(knp++)->value.ui64 = xnbp->xnb_stat_csum_hardware;
194	(knp++)->value.ui64 = xnbp->xnb_stat_csum_software;
195	(knp++)->value.ui64 = xnbp->xnb_stat_tx_overflow_page;
196	(knp++)->value.ui64 = xnbp->xnb_stat_tx_unexpected_flags;
197
198	return (0);
199}
200
201static boolean_t
202xnb_ks_init(xnb_t *xnbp)
203{
204	int nstat = sizeof (aux_statistics) /
205	    sizeof (aux_statistics[0]);
206	const char * const *cp = aux_statistics;
207	kstat_named_t *knp;
208
209	/*
210	 * Create and initialise kstats.
211	 */
212	xnbp->xnb_kstat_aux = kstat_create(ddi_driver_name(xnbp->xnb_devinfo),
213	    ddi_get_instance(xnbp->xnb_devinfo), "aux_statistics", "net",
214	    KSTAT_TYPE_NAMED, nstat, 0);
215	if (xnbp->xnb_kstat_aux == NULL)
216		return (B_FALSE);
217
218	xnbp->xnb_kstat_aux->ks_private = xnbp;
219	xnbp->xnb_kstat_aux->ks_update = xnb_ks_aux_update;
220
221	knp = xnbp->xnb_kstat_aux->ks_data;
222	while (nstat > 0) {
223		kstat_named_init(knp, *cp, KSTAT_DATA_UINT64);
224
225		knp++;
226		cp++;
227		nstat--;
228	}
229
230	kstat_install(xnbp->xnb_kstat_aux);
231
232	return (B_TRUE);
233}
234
235static void
236xnb_ks_free(xnb_t *xnbp)
237{
238	kstat_delete(xnbp->xnb_kstat_aux);
239}
240
241/*
242 * Calculate and insert the transport checksum for an arbitrary packet.
243 */
244static mblk_t *
245xnb_software_csum(xnb_t *xnbp, mblk_t *mp)
246{
247	_NOTE(ARGUNUSED(xnbp));
248
249	/*
250	 * XXPV dme: shouldn't rely on mac_fix_cksum(), not least
251	 * because it doesn't cover all of the interesting cases :-(
252	 */
253	mac_hcksum_set(mp, 0, 0, 0, 0, HCK_FULLCKSUM);
254
255	return (mac_fix_cksum(mp));
256}
257
258mblk_t *
259xnb_process_cksum_flags(xnb_t *xnbp, mblk_t *mp, uint32_t capab)
260{
261	struct ether_header *ehp;
262	uint16_t sap;
263	uint32_t offset;
264	ipha_t *ipha;
265
266	ASSERT(mp->b_next == NULL);
267
268	/*
269	 * Check that the packet is contained in a single mblk.  In
270	 * the "from peer" path this is true today, but may change
271	 * when scatter gather support is added.  In the "to peer"
272	 * path we cannot be sure, but in most cases it will be true
273	 * (in the xnbo case the packet has come from a MAC device
274	 * which is unlikely to split packets).
275	 */
276	if (mp->b_cont != NULL)
277		goto software;
278
279	/*
	 * If the MAC has no hardware capability, don't do any further
281	 * checking.
282	 */
283	if (capab == 0)
284		goto software;
285
286	ASSERT(MBLKL(mp) >= sizeof (struct ether_header));
287	ehp = (struct ether_header *)mp->b_rptr;
288
289	if (ntohs(ehp->ether_type) == VLAN_TPID) {
290		struct ether_vlan_header *evhp;
291
292		ASSERT(MBLKL(mp) >= sizeof (struct ether_vlan_header));
293		evhp = (struct ether_vlan_header *)mp->b_rptr;
294		sap = ntohs(evhp->ether_type);
295		offset = sizeof (struct ether_vlan_header);
296	} else {
297		sap = ntohs(ehp->ether_type);
298		offset = sizeof (struct ether_header);
299	}
300
301	/*
302	 * We only attempt to do IPv4 packets in hardware.
303	 */
304	if (sap != ETHERTYPE_IP)
305		goto software;
306
307	/*
308	 * We know that this is an IPv4 packet.
309	 */
310	ipha = (ipha_t *)(mp->b_rptr + offset);
311
312	switch (ipha->ipha_protocol) {
313	case IPPROTO_TCP:
314	case IPPROTO_UDP: {
315		uint32_t start, length, stuff, cksum;
316		uint16_t *stuffp;
317
318		/*
		 * This is a TCP/IPv4 or UDP/IPv4 packet, for which we
		 * can use either full IPv4 or partial checksum offload.
321		 */
322		if ((capab & (HCKSUM_INET_FULL_V4|HCKSUM_INET_PARTIAL)) == 0)
323			break;
324
325		start = IP_SIMPLE_HDR_LENGTH;
326		length = ntohs(ipha->ipha_length);
327		if (ipha->ipha_protocol == IPPROTO_TCP) {
328			stuff = start + TCP_CHECKSUM_OFFSET;
329			cksum = IP_TCP_CSUM_COMP;
330		} else {
331			stuff = start + UDP_CHECKSUM_OFFSET;
332			cksum = IP_UDP_CSUM_COMP;
333		}
334		stuffp = (uint16_t *)(mp->b_rptr + offset + stuff);
335
336		if (capab & HCKSUM_INET_FULL_V4) {
337			/*
338			 * Some devices require that the checksum
339			 * field of the packet is zero for full
340			 * offload.
341			 */
342			*stuffp = 0;
343
344			mac_hcksum_set(mp, 0, 0, 0, 0, HCK_FULLCKSUM);
345
346			xnbp->xnb_stat_csum_hardware++;
347
348			return (mp);
349		}
350
351		if (capab & HCKSUM_INET_PARTIAL) {
352			if (*stuffp == 0) {
353				ipaddr_t src, dst;
354
355				/*
356				 * Older Solaris guests don't insert
357				 * the pseudo-header checksum, so we
358				 * calculate it here.
359				 */
360				src = ipha->ipha_src;
361				dst = ipha->ipha_dst;
362
363				cksum += (dst >> 16) + (dst & 0xFFFF);
364				cksum += (src >> 16) + (src & 0xFFFF);
365				cksum += length - IP_SIMPLE_HDR_LENGTH;
366
367				cksum = (cksum >> 16) + (cksum & 0xFFFF);
368				cksum = (cksum >> 16) + (cksum & 0xFFFF);
369
370				ASSERT(cksum <= 0xFFFF);
371
372				*stuffp = (uint16_t)(cksum ? cksum : ~cksum);
373			}
374
375			mac_hcksum_set(mp, start, stuff, length, 0,
376			    HCK_PARTIALCKSUM);
377
378			xnbp->xnb_stat_csum_hardware++;
379
380			return (mp);
381		}
382
383		/* NOTREACHED */
384		break;
385	}
386
387	default:
388		/* Use software. */
389		break;
390	}
391
392software:
393	/*
394	 * We are not able to use any offload so do the whole thing in
395	 * software.
396	 */
397	xnbp->xnb_stat_csum_software++;
398
399	return (xnb_software_csum(xnbp, mp));
400}
401
402int
403xnb_attach(dev_info_t *dip, xnb_flavour_t *flavour, void *flavour_data)
404{
405	xnb_t *xnbp;
406	char *xsname;
407	char cachename[32];
408
409	xnbp = kmem_zalloc(sizeof (*xnbp), KM_SLEEP);
410
411	xnbp->xnb_flavour = flavour;
412	xnbp->xnb_flavour_data = flavour_data;
413	xnbp->xnb_devinfo = dip;
414	xnbp->xnb_evtchn = INVALID_EVTCHN;
415	xnbp->xnb_irq = B_FALSE;
416	xnbp->xnb_tx_ring_handle = INVALID_GRANT_HANDLE;
417	xnbp->xnb_rx_ring_handle = INVALID_GRANT_HANDLE;
418	xnbp->xnb_connected = B_FALSE;
419	xnbp->xnb_hotplugged = B_FALSE;
420	xnbp->xnb_detachable = B_FALSE;
421	xnbp->xnb_peer = xvdi_get_oeid(dip);
422	xnbp->xnb_be_status = XNB_STATE_INIT;
423	xnbp->xnb_fe_status = XNB_STATE_INIT;
424
425	xnbp->xnb_tx_buf_count = 0;
426
427	xnbp->xnb_rx_hv_copy = B_FALSE;
428	xnbp->xnb_multicast_control = B_FALSE;
429
430	xnbp->xnb_rx_va = vmem_alloc(heap_arena, PAGESIZE, VM_SLEEP);
431	ASSERT(xnbp->xnb_rx_va != NULL);
432
433	if (ddi_get_iblock_cookie(dip, 0, &xnbp->xnb_icookie)
434	    != DDI_SUCCESS)
435		goto failure;
436
437	/* Allocated on demand, when/if we enter xnb_copy_to_peer(). */
438	xnbp->xnb_rx_cpop = NULL;
439	xnbp->xnb_rx_cpop_count = 0;
440
441	mutex_init(&xnbp->xnb_tx_lock, NULL, MUTEX_DRIVER,
442	    xnbp->xnb_icookie);
443	mutex_init(&xnbp->xnb_rx_lock, NULL, MUTEX_DRIVER,
444	    xnbp->xnb_icookie);
445	mutex_init(&xnbp->xnb_state_lock, NULL, MUTEX_DRIVER,
446	    xnbp->xnb_icookie);
447
448	/* Set driver private pointer now. */
449	ddi_set_driver_private(dip, xnbp);
450
451	(void) sprintf(cachename, "xnb_tx_buf_cache_%d", ddi_get_instance(dip));
452	xnbp->xnb_tx_buf_cache = kmem_cache_create(cachename,
453	    sizeof (xnb_txbuf_t), 0,
454	    xnb_txbuf_constructor, xnb_txbuf_destructor,
455	    NULL, xnbp, NULL, 0);
456	if (xnbp->xnb_tx_buf_cache == NULL)
457		goto failure_0;
458
459	if (!xnb_ks_init(xnbp))
460		goto failure_1;
461
462	/*
463	 * Receive notification of changes in the state of the
464	 * driver in the guest domain.
465	 */
466	if (xvdi_add_event_handler(dip, XS_OE_STATE, xnb_oe_state_change,
467	    NULL) != DDI_SUCCESS)
468		goto failure_2;
469
470	/*
471	 * Receive notification of hotplug events.
472	 */
473	if (xvdi_add_event_handler(dip, XS_HP_STATE, xnb_hp_state_change,
474	    NULL) != DDI_SUCCESS)
475		goto failure_2;
476
477	xsname = xvdi_get_xsname(dip);
478
479	if (xenbus_printf(XBT_NULL, xsname,
480	    "feature-multicast-control", "%d",
481	    xnb_multicast_control ? 1 : 0) != 0)
482		goto failure_3;
483
484	if (xenbus_printf(XBT_NULL, xsname,
485	    "feature-rx-copy", "%d",  1) != 0)
486		goto failure_3;
487	/*
488	 * Linux domUs seem to depend on "feature-rx-flip" being 0
489	 * in addition to "feature-rx-copy" being 1. It seems strange
490	 * to use four possible states to describe a binary decision,
491	 * but we might as well play nice.
492	 */
493	if (xenbus_printf(XBT_NULL, xsname,
494	    "feature-rx-flip", "%d", 0) != 0)
495		goto failure_3;
496
497	(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateInitWait);
498	(void) xvdi_post_event(dip, XEN_HP_ADD);
499
500	return (DDI_SUCCESS);
501
502failure_3:
503	xvdi_remove_event_handler(dip, NULL);
504
505failure_2:
506	xnb_ks_free(xnbp);
507
508failure_1:
509	kmem_cache_destroy(xnbp->xnb_tx_buf_cache);
510
511failure_0:
512	mutex_destroy(&xnbp->xnb_state_lock);
513	mutex_destroy(&xnbp->xnb_rx_lock);
514	mutex_destroy(&xnbp->xnb_tx_lock);
515
516failure:
517	vmem_free(heap_arena, xnbp->xnb_rx_va, PAGESIZE);
518	kmem_free(xnbp, sizeof (*xnbp));
519	return (DDI_FAILURE);
520}
521
522void
523xnb_detach(dev_info_t *dip)
524{
525	xnb_t *xnbp = ddi_get_driver_private(dip);
526
527	ASSERT(xnbp != NULL);
528	ASSERT(!xnbp->xnb_connected);
529	ASSERT(xnbp->xnb_tx_buf_count == 0);
530
531	xnb_disconnect_rings(dip);
532
533	xvdi_remove_event_handler(dip, NULL);
534
535	xnb_ks_free(xnbp);
536
537	kmem_cache_destroy(xnbp->xnb_tx_buf_cache);
538
539	ddi_set_driver_private(dip, NULL);
540
541	mutex_destroy(&xnbp->xnb_state_lock);
542	mutex_destroy(&xnbp->xnb_rx_lock);
543	mutex_destroy(&xnbp->xnb_tx_lock);
544
545	if (xnbp->xnb_rx_cpop_count > 0)
546		kmem_free(xnbp->xnb_rx_cpop, sizeof (xnbp->xnb_rx_cpop[0])
547		    * xnbp->xnb_rx_cpop_count);
548
549	ASSERT(xnbp->xnb_rx_va != NULL);
550	vmem_free(heap_arena, xnbp->xnb_rx_va, PAGESIZE);
551
552	kmem_free(xnbp, sizeof (*xnbp));
553}
554
555/*
556 * Allocate a page from the hypervisor to be flipped to the peer.
557 *
558 * Try to get pages in batches to reduce the overhead of calls into
559 * the balloon driver.
560 */
561static mfn_t
562xnb_alloc_page(xnb_t *xnbp)
563{
564#define	WARNING_RATE_LIMIT 100
565#define	BATCH_SIZE 256
566	static mfn_t mfns[BATCH_SIZE];	/* common across all instances */
567	static int nth = BATCH_SIZE;
568	mfn_t mfn;
569
570	mutex_enter(&xnb_alloc_page_lock);
571	if (nth == BATCH_SIZE) {
572		if (balloon_alloc_pages(BATCH_SIZE, mfns) != BATCH_SIZE) {
573			xnbp->xnb_stat_allocation_failure++;
574			mutex_exit(&xnb_alloc_page_lock);
575
576			/*
577			 * Try for a single page in low memory situations.
578			 */
579			if (balloon_alloc_pages(1, &mfn) != 1) {
580				if ((xnbp->xnb_stat_small_allocation_failure++
581				    % WARNING_RATE_LIMIT) == 0)
582					cmn_err(CE_WARN, "xnb_alloc_page: "
583					    "Cannot allocate memory to "
584					    "transfer packets to peer.");
585				return (0);
586			} else {
587				xnbp->xnb_stat_small_allocation_success++;
588				return (mfn);
589			}
590		}
591
592		nth = 0;
593		xnbp->xnb_stat_allocation_success++;
594	}
595
596	mfn = mfns[nth++];
597	mutex_exit(&xnb_alloc_page_lock);
598
599	ASSERT(mfn != 0);
600
601	return (mfn);
602#undef BATCH_SIZE
603#undef WARNING_RATE_LIMIT
604}
605
606/*
607 * Free a page back to the hypervisor.
608 *
609 * This happens only in the error path, so batching is not worth the
610 * complication.
611 */
612static void
613xnb_free_page(xnb_t *xnbp, mfn_t mfn)
614{
615	_NOTE(ARGUNUSED(xnbp));
616	int r;
617	pfn_t pfn;
618
619	pfn = xen_assign_pfn(mfn);
620	pfnzero(pfn, 0, PAGESIZE);
621	xen_release_pfn(pfn);
622
623	if ((r = balloon_free_pages(1, &mfn, NULL, NULL)) != 1) {
624		cmn_err(CE_WARN, "free_page: cannot decrease memory "
625		    "reservation (%d): page kept but unusable (mfn = 0x%lx).",
626		    r, mfn);
627	}
628}
629
630/*
 * Similar to RING_HAS_UNCONSUMED_REQUESTS(&xnbp->xnb_rx_ring) but using
632 * local variables. Used in both xnb_to_peer() and xnb_copy_to_peer().
633 */
634#define	XNB_RING_HAS_UNCONSUMED_REQUESTS(_r)		\
635	((((_r)->sring->req_prod - loop) <		\
636		(RING_SIZE(_r) - (loop - prod))) ?	\
637	    ((_r)->sring->req_prod - loop) :		\
638	    (RING_SIZE(_r) - (loop - prod)))
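
/*
 * Both callers keep "loop" as a local copy of req_cons and "prod" as
 * a local copy of rsp_prod_pvt, so the expression evaluates to the
 * lesser of the requests the peer has posted but we have not yet
 * consumed (req_prod - loop) and the number of further requests we
 * can take on before the responses would fill the ring
 * (RING_SIZE - (loop - prod)).
 */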
639
640/*
641 * Pass packets to the peer using page flipping.
642 */
643mblk_t *
644xnb_to_peer(xnb_t *xnbp, mblk_t *mp)
645{
646	mblk_t *free = mp, *prev = NULL;
647	size_t len;
648	gnttab_transfer_t *gop;
649	boolean_t notify;
650	RING_IDX loop, prod, end;
651
652	/*
653	 * For each packet the sequence of operations is:
654	 *
655	 * 1. get a new page from the hypervisor.
656	 * 2. get a request slot from the ring.
657	 * 3. copy the data into the new page.
658	 * 4. transfer the page to the peer.
659	 * 5. update the request slot.
660	 * 6. kick the peer.
661	 * 7. free mp.
662	 *
663	 * In order to reduce the number of hypercalls, we prepare
664	 * several packets for the peer and perform a single hypercall
665	 * to transfer them.
666	 */
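
	/*
	 * In the loop below, steps 1-3 and 5.1 are done per packet and
	 * the grant transfer entries (step 4) are only filled in; the
	 * single GNTTABOP_transfer hypercall that actually moves the
	 * pages is issued after the loop, followed by a second pass
	 * for step 5.2, one notification (step 6) and the freeing of
	 * the consumed mblks (step 7).
	 */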
667
668	mutex_enter(&xnbp->xnb_rx_lock);
669
670	/*
671	 * If we are not connected to the peer or have not yet
672	 * finished hotplug it is too early to pass packets to the
673	 * peer.
674	 */
675	if (!(xnbp->xnb_connected && xnbp->xnb_hotplugged)) {
676		mutex_exit(&xnbp->xnb_rx_lock);
677		DTRACE_PROBE(flip_rx_too_early);
678		xnbp->xnb_stat_rx_too_early++;
679		return (mp);
680	}
681
682	loop = xnbp->xnb_rx_ring.req_cons;
683	prod = xnbp->xnb_rx_ring.rsp_prod_pvt;
684	gop = xnbp->xnb_rx_top;
685
686	while ((mp != NULL) &&
687	    XNB_RING_HAS_UNCONSUMED_REQUESTS(&xnbp->xnb_rx_ring)) {
688
689		mfn_t mfn;
690		pfn_t pfn;
691		netif_rx_request_t *rxreq;
692		netif_rx_response_t *rxresp;
693		char *valoop;
694		mblk_t *ml;
695		uint16_t cksum_flags;
696
697		/* 1 */
698		if ((mfn = xnb_alloc_page(xnbp)) == 0) {
699			xnbp->xnb_stat_rx_defer++;
700			break;
701		}
702
703		/* 2 */
704		rxreq = RING_GET_REQUEST(&xnbp->xnb_rx_ring, loop);
705
706#ifdef XNB_DEBUG
707		if (!(rxreq->id < NET_RX_RING_SIZE))
708			cmn_err(CE_PANIC, "xnb_to_peer: "
709			    "id %d out of range in request 0x%p",
710			    rxreq->id, (void *)rxreq);
711#endif /* XNB_DEBUG */
712
713		/* Assign a pfn and map the new page at the allocated va. */
714		pfn = xen_assign_pfn(mfn);
715		hat_devload(kas.a_hat, xnbp->xnb_rx_va, PAGESIZE,
716		    pfn, PROT_READ | PROT_WRITE, HAT_LOAD);
717
718		/* 3 */
719		len = 0;
720		valoop = xnbp->xnb_rx_va;
721		for (ml = mp; ml != NULL; ml = ml->b_cont) {
722			size_t chunk = ml->b_wptr - ml->b_rptr;
723
724			bcopy(ml->b_rptr, valoop, chunk);
725			valoop += chunk;
726			len += chunk;
727		}
728
729		ASSERT(len < PAGESIZE);
730
731		/* Release the pfn. */
732		hat_unload(kas.a_hat, xnbp->xnb_rx_va, PAGESIZE,
733		    HAT_UNLOAD_UNMAP);
734		xen_release_pfn(pfn);
735
736		/* 4 */
737		gop->mfn = mfn;
738		gop->domid = xnbp->xnb_peer;
739		gop->ref = rxreq->gref;
740
741		/* 5.1 */
742		rxresp = RING_GET_RESPONSE(&xnbp->xnb_rx_ring, prod);
743		rxresp->offset = 0;
744		rxresp->flags = 0;
745
746		cksum_flags = xnbp->xnb_flavour->xf_cksum_to_peer(xnbp, mp);
747		if (cksum_flags != 0)
748			xnbp->xnb_stat_rx_cksum_deferred++;
749		rxresp->flags |= cksum_flags;
750
751		rxresp->id = RING_GET_REQUEST(&xnbp->xnb_rx_ring, prod)->id;
752		rxresp->status = len;
753
754		loop++;
755		prod++;
756		gop++;
757		prev = mp;
758		mp = mp->b_next;
759	}
760
761	/*
762	 * Did we actually do anything?
763	 */
764	if (loop == xnbp->xnb_rx_ring.req_cons) {
765		mutex_exit(&xnbp->xnb_rx_lock);
766		return (mp);
767	}
768
769	end = loop;
770
771	/*
772	 * Unlink the end of the 'done' list from the remainder.
773	 */
774	ASSERT(prev != NULL);
775	prev->b_next = NULL;
776
777	if (HYPERVISOR_grant_table_op(GNTTABOP_transfer, xnbp->xnb_rx_top,
778	    loop - xnbp->xnb_rx_ring.req_cons) != 0) {
779		cmn_err(CE_WARN, "xnb_to_peer: transfer operation failed");
780	}
781
782	loop = xnbp->xnb_rx_ring.req_cons;
783	prod = xnbp->xnb_rx_ring.rsp_prod_pvt;
784	gop = xnbp->xnb_rx_top;
785
786	while (loop < end) {
787		int16_t status = NETIF_RSP_OKAY;
788
789		if (gop->status != 0) {
790			status = NETIF_RSP_ERROR;
791
792			/*
793			 * If the status is anything other than
794			 * GNTST_bad_page then we don't own the page
795			 * any more, so don't try to give it back.
796			 */
797			if (gop->status != GNTST_bad_page)
798				gop->mfn = 0;
799		} else {
800			/* The page is no longer ours. */
801			gop->mfn = 0;
802		}
803
804		if (gop->mfn != 0)
805			/*
806			 * Give back the page, as we won't be using
807			 * it.
808			 */
809			xnb_free_page(xnbp, gop->mfn);
810		else
811			/*
812			 * We gave away a page, update our accounting
813			 * now.
814			 */
815			balloon_drv_subtracted(1);
816
817		/* 5.2 */
818		if (status != NETIF_RSP_OKAY) {
819			RING_GET_RESPONSE(&xnbp->xnb_rx_ring, prod)->status =
820			    status;
821		} else {
822			xnbp->xnb_stat_ipackets++;
823			xnbp->xnb_stat_rbytes += len;
824		}
825
826		loop++;
827		prod++;
828		gop++;
829	}
830
831	xnbp->xnb_rx_ring.req_cons = loop;
832	xnbp->xnb_rx_ring.rsp_prod_pvt = prod;
833
834	/* 6 */
835	/* LINTED: constant in conditional context */
836	RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xnbp->xnb_rx_ring, notify);
837	if (notify) {
838		ec_notify_via_evtchn(xnbp->xnb_evtchn);
839		xnbp->xnb_stat_rx_notify_sent++;
840	} else {
841		xnbp->xnb_stat_rx_notify_deferred++;
842	}
843
844	if (mp != NULL)
845		xnbp->xnb_stat_rx_defer++;
846
847	mutex_exit(&xnbp->xnb_rx_lock);
848
849	/* Free mblk_t's that we consumed. */
850	freemsgchain(free);
851
852	return (mp);
853}
854
855/* Helper functions for xnb_copy_to_peer(). */
856
857/*
858 * Grow the array of copy operation descriptors.
859 */
860static boolean_t
861grow_cpop_area(xnb_t *xnbp)
862{
863	size_t count;
864	gnttab_copy_t *new;
865
866	ASSERT(MUTEX_HELD(&xnbp->xnb_rx_lock));
867
868	count = xnbp->xnb_rx_cpop_count + CPOP_DEFCNT;
869
870	if ((new = kmem_alloc(sizeof (new[0]) * count, KM_NOSLEEP)) == NULL) {
871		xnbp->xnb_stat_other_allocation_failure++;
872		return (B_FALSE);
873	}
874
875	bcopy(xnbp->xnb_rx_cpop, new,
876	    sizeof (xnbp->xnb_rx_cpop[0]) * xnbp->xnb_rx_cpop_count);
877
878	kmem_free(xnbp->xnb_rx_cpop,
879	    sizeof (xnbp->xnb_rx_cpop[0]) * xnbp->xnb_rx_cpop_count);
880
881	xnbp->xnb_rx_cpop = new;
882	xnbp->xnb_rx_cpop_count = count;
883
884	xnbp->xnb_stat_rx_cpoparea_grown++;
885
886	return (B_TRUE);
887}
888
889/*
890 * Check whether an address is on a page that's foreign to this domain.
891 */
892static boolean_t
893is_foreign(void *addr)
894{
895	pfn_t pfn = hat_getpfnum(kas.a_hat, addr);
896
897	return ((pfn & PFN_IS_FOREIGN_MFN) == PFN_IS_FOREIGN_MFN);
898}
899
900/*
901 * Insert a newly allocated mblk into a chain, replacing the old one.
902 */
903static mblk_t *
904replace_msg(mblk_t *mp, size_t len, mblk_t *mp_prev, mblk_t *ml_prev)
905{
906	uint32_t	start, stuff, end, value, flags;
907	mblk_t		*new_mp;
908
909	new_mp = copyb(mp);
910	if (new_mp == NULL) {
		cmn_err(CE_PANIC, "replace_msg: cannot alloc new message "
		    "for %p, len %lu", (void *)mp, len);
913	}
914
915	mac_hcksum_get(mp, &start, &stuff, &end, &value, &flags);
916	mac_hcksum_set(new_mp, start, stuff, end, value, flags);
917
918	new_mp->b_next = mp->b_next;
919	new_mp->b_prev = mp->b_prev;
920	new_mp->b_cont = mp->b_cont;
921
922	/* Make sure we only overwrite pointers to the mblk being replaced. */
923	if (mp_prev != NULL && mp_prev->b_next == mp)
924		mp_prev->b_next = new_mp;
925
926	if (ml_prev != NULL && ml_prev->b_cont == mp)
927		ml_prev->b_cont = new_mp;
928
929	mp->b_next = mp->b_prev = mp->b_cont = NULL;
930	freemsg(mp);
931
932	return (new_mp);
933}
934
935/*
936 * Set all the fields in a gnttab_copy_t.
937 */
938static void
939setup_gop(xnb_t *xnbp, gnttab_copy_t *gp, uchar_t *rptr,
940    size_t s_off, size_t d_off, size_t len, grant_ref_t d_ref)
941{
942	ASSERT(xnbp != NULL && gp != NULL);
943
944	gp->source.offset = s_off;
945	gp->source.u.gmfn = pfn_to_mfn(hat_getpfnum(kas.a_hat, (caddr_t)rptr));
946	gp->source.domid = DOMID_SELF;
947
948	gp->len = (uint16_t)len;
949	gp->flags = GNTCOPY_dest_gref;
950	gp->status = 0;
951
952	gp->dest.u.ref = d_ref;
953	gp->dest.offset = d_off;
954	gp->dest.domid = xnbp->xnb_peer;
955}
956
957/*
958 * Pass packets to the peer using hypervisor copy operations.
959 */
960mblk_t *
961xnb_copy_to_peer(xnb_t *xnbp, mblk_t *mp)
962{
963	mblk_t		*free = mp, *mp_prev = NULL, *saved_mp = mp;
964	mblk_t		*ml, *ml_prev;
965	boolean_t	notify;
966	RING_IDX	loop, prod;
967	int		i;
968
969	/*
970	 * If the peer does not pre-post buffers for received packets,
971	 * use page flipping to pass packets to it.
972	 */
973	if (!xnbp->xnb_rx_hv_copy)
974		return (xnb_to_peer(xnbp, mp));
975
976	/*
977	 * For each packet the sequence of operations is:
978	 *
979	 *  1. get a request slot from the ring.
980	 *  2. set up data for hypercall (see NOTE below)
	 *  3. have the hypervisor copy the data
982	 *  4. update the request slot.
983	 *  5. kick the peer.
984	 *
985	 * NOTE ad 2.
986	 *  In order to reduce the number of hypercalls, we prepare
987	 *  several mblks (mp->b_cont != NULL) for the peer and
988	 *  perform a single hypercall to transfer them.  We also have
	 *  to set up a separate copy operation for every page.
990	 *
991	 * If we have more than one packet (mp->b_next != NULL), we do
992	 * this whole dance repeatedly.
993	 */
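
	/*
	 * Note that, unlike the flip path above, the GNTTABOP_copy
	 * hypercall below is issued once per packet and covers all of
	 * the per-page copy operations accumulated for that packet's
	 * mblk chain.
	 */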
994
995	mutex_enter(&xnbp->xnb_rx_lock);
996
997	if (!(xnbp->xnb_connected && xnbp->xnb_hotplugged)) {
998		mutex_exit(&xnbp->xnb_rx_lock);
999		DTRACE_PROBE(copy_rx_too_early);
1000		xnbp->xnb_stat_rx_too_early++;
1001		return (mp);
1002	}
1003
1004	loop = xnbp->xnb_rx_ring.req_cons;
1005	prod = xnbp->xnb_rx_ring.rsp_prod_pvt;
1006
1007	while ((mp != NULL) &&
1008	    XNB_RING_HAS_UNCONSUMED_REQUESTS(&xnbp->xnb_rx_ring)) {
1009		netif_rx_request_t	*rxreq;
1010		size_t			d_offset, len;
1011		int			item_count;
1012		gnttab_copy_t		*gop_cp;
1013		netif_rx_response_t	*rxresp;
1014		uint16_t		cksum_flags;
1015		int16_t			status = NETIF_RSP_OKAY;
1016
1017		/* 1 */
1018		rxreq = RING_GET_REQUEST(&xnbp->xnb_rx_ring, loop);
1019
1020#ifdef XNB_DEBUG
1021		if (!(rxreq->id < NET_RX_RING_SIZE))
1022			cmn_err(CE_PANIC, "xnb_copy_to_peer: "
1023			    "id %d out of range in request 0x%p",
1024			    rxreq->id, (void *)rxreq);
1025#endif /* XNB_DEBUG */
1026
1027		/* 2 */
1028		d_offset = 0;
1029		len = 0;
1030		item_count = 0;
1031
1032		gop_cp = xnbp->xnb_rx_cpop;
1033
1034		/*
1035		 * We walk the b_cont pointers and set up a
1036		 * gnttab_copy_t for each sub-page chunk in each data
1037		 * block.
1038		 */
1039		/* 2a */
1040		for (ml = mp, ml_prev = NULL; ml != NULL; ml = ml->b_cont) {
1041			size_t	chunk = ml->b_wptr - ml->b_rptr;
1042			uchar_t	*r_tmp,	*rpt_align;
1043			size_t	r_offset;
1044
1045			/*
1046			 * The hypervisor will not allow us to
1047			 * reference a foreign page (e.g. one
1048			 * belonging to another domain) by mfn in the
1049			 * copy operation. If the data in this mblk is
1050			 * on such a page we must copy the data into a
1051			 * local page before initiating the hypervisor
1052			 * copy operation.
1053			 */
1054			if (is_foreign(ml->b_rptr) || is_foreign(ml->b_wptr)) {
1055				mblk_t *ml_new = replace_msg(ml, chunk,
1056				    mp_prev, ml_prev);
1057
1058				/* We can still use old ml, but not *ml! */
1059				if (free == ml)
1060					free = ml_new;
1061				if (mp == ml)
1062					mp = ml_new;
1063				ml = ml_new;
1064
1065				xnbp->xnb_stat_rx_foreign_page++;
1066			}
1067
1068			rpt_align = (uchar_t *)ALIGN2PAGE(ml->b_rptr);
1069			r_offset = (uint16_t)(ml->b_rptr - rpt_align);
1070			r_tmp = ml->b_rptr;
1071
1072			if (d_offset + chunk > PAGESIZE)
1073				cmn_err(CE_PANIC, "xnb_copy_to_peer: mp %p "
				    "(svd: %p), ml %p, rpt_align %p, d_offset "
1075				    "(%lu) + chunk (%lu) > PAGESIZE %d!",
1076				    (void *)mp, (void *)saved_mp, (void *)ml,
1077				    (void *)rpt_align,
1078				    d_offset, chunk, (int)PAGESIZE);
1079
1080			while (chunk > 0) {
1081				size_t part_len;
1082
1083				if (item_count == xnbp->xnb_rx_cpop_count) {
1084					if (!grow_cpop_area(xnbp))
1085						goto failure;
1086					gop_cp = &xnbp->xnb_rx_cpop[item_count];
1087				}
1088				/*
1089				 * If our mblk crosses a page boundary, we need
				 * to do a separate copy for each page.
1091				 */
1092				if (r_offset + chunk > PAGESIZE) {
1093					part_len = PAGESIZE - r_offset;
1094
1095					DTRACE_PROBE3(mblk_page_crossed,
1096					    (mblk_t *), ml, int, chunk, int,
1097					    (int)r_offset);
1098
1099					xnbp->xnb_stat_rx_pagebndry_crossed++;
1100				} else {
1101					part_len = chunk;
1102				}
1103
1104				setup_gop(xnbp, gop_cp, r_tmp, r_offset,
1105				    d_offset, part_len, rxreq->gref);
1106
1107				chunk -= part_len;
1108
1109				len += part_len;
1110				d_offset += part_len;
1111				r_tmp += part_len;
1112				/*
1113				 * The 2nd, 3rd ... last copies will always
1114				 * start at r_tmp, therefore r_offset is 0.
1115				 */
1116				r_offset = 0;
1117				gop_cp++;
1118				item_count++;
1119			}
1120			ml_prev = ml;
1121
1122			DTRACE_PROBE4(mblk_loop_end, (mblk_t *), ml, int,
1123			    chunk, int, len, int, item_count);
1124		}
1125		/* 3 */
1126		if (HYPERVISOR_grant_table_op(GNTTABOP_copy, xnbp->xnb_rx_cpop,
1127		    item_count) != 0) {
1128			cmn_err(CE_WARN, "xnb_copy_to_peer: copy op. failed");
1129			DTRACE_PROBE(HV_granttableopfailed);
1130		}
1131
1132		/* 4 */
1133		rxresp = RING_GET_RESPONSE(&xnbp->xnb_rx_ring, prod);
1134		rxresp->offset = 0;
1135
1136		rxresp->flags = 0;
1137
1138		DTRACE_PROBE4(got_RX_rsp, int, (int)rxresp->id, int,
1139		    (int)rxresp->offset, int, (int)rxresp->flags, int,
1140		    (int)rxresp->status);
1141
1142		cksum_flags = xnbp->xnb_flavour->xf_cksum_to_peer(xnbp, mp);
1143		if (cksum_flags != 0)
1144			xnbp->xnb_stat_rx_cksum_deferred++;
1145		rxresp->flags |= cksum_flags;
1146
1147		rxresp->id = RING_GET_REQUEST(&xnbp->xnb_rx_ring, prod)->id;
1148		rxresp->status = len;
1149
1150		DTRACE_PROBE4(RX_rsp_set, int, (int)rxresp->id, int,
1151		    (int)rxresp->offset, int, (int)rxresp->flags, int,
1152		    (int)rxresp->status);
1153
1154		for (i = 0; i < item_count; i++) {
1155			if (xnbp->xnb_rx_cpop[i].status != 0) {
1156				DTRACE_PROBE2(cpop_status_nonnull, int,
1157				    (int)xnbp->xnb_rx_cpop[i].status,
1158				    int, i);
1159				status = NETIF_RSP_ERROR;
1160			}
1161		}
1162
1163		/* 5.2 */
1164		if (status != NETIF_RSP_OKAY) {
1165			RING_GET_RESPONSE(&xnbp->xnb_rx_ring, prod)->status =
1166			    status;
1167			xnbp->xnb_stat_rx_rsp_notok++;
1168		} else {
1169			xnbp->xnb_stat_ipackets++;
1170			xnbp->xnb_stat_rbytes += len;
1171		}
1172
1173		loop++;
1174		prod++;
1175		mp_prev = mp;
1176		mp = mp->b_next;
1177	}
1178failure:
1179	/*
1180	 * Did we actually do anything?
1181	 */
1182	if (loop == xnbp->xnb_rx_ring.req_cons) {
1183		mutex_exit(&xnbp->xnb_rx_lock);
1184		return (mp);
1185	}
1186
1187	/*
1188	 * Unlink the end of the 'done' list from the remainder.
1189	 */
1190	ASSERT(mp_prev != NULL);
1191	mp_prev->b_next = NULL;
1192
1193	xnbp->xnb_rx_ring.req_cons = loop;
1194	xnbp->xnb_rx_ring.rsp_prod_pvt = prod;
1195
1196	/* 6 */
1197	/* LINTED: constant in conditional context */
1198	RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xnbp->xnb_rx_ring, notify);
1199	if (notify) {
1200		ec_notify_via_evtchn(xnbp->xnb_evtchn);
1201		xnbp->xnb_stat_rx_notify_sent++;
1202	} else {
1203		xnbp->xnb_stat_rx_notify_deferred++;
1204	}
1205
1206	if (mp != NULL)
1207		xnbp->xnb_stat_rx_defer++;
1208
1209	mutex_exit(&xnbp->xnb_rx_lock);
1210
1211	/* Free mblk_t structs we have consumed. */
1212	freemsgchain(free);
1213
1214	return (mp);
1215}
1216
1218static void
1219xnb_tx_notify_peer(xnb_t *xnbp, boolean_t force)
1220{
1221	boolean_t notify;
1222
1223	ASSERT(MUTEX_HELD(&xnbp->xnb_tx_lock));
1224
1225	/* LINTED: constant in conditional context */
1226	RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xnbp->xnb_tx_ring, notify);
1227	if (notify || force) {
1228		ec_notify_via_evtchn(xnbp->xnb_evtchn);
1229		xnbp->xnb_stat_tx_notify_sent++;
1230	} else {
1231		xnbp->xnb_stat_tx_notify_deferred++;
1232	}
1233}
1234
1235static void
1236xnb_tx_mark_complete(xnb_t *xnbp, RING_IDX id, int16_t status)
1237{
1238	RING_IDX i;
1239	netif_tx_response_t *txresp;
1240
1241	ASSERT(MUTEX_HELD(&xnbp->xnb_tx_lock));
1242
1243	i = xnbp->xnb_tx_ring.rsp_prod_pvt;
1244
1245	txresp = RING_GET_RESPONSE(&xnbp->xnb_tx_ring, i);
1246	txresp->id = id;
1247	txresp->status = status;
1248
1249	xnbp->xnb_tx_ring.rsp_prod_pvt = i + 1;
1250
1251	/*
1252	 * Note that we don't push the change to the peer here - that
	 * is the caller's responsibility.
1254	 */
1255}
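
/*
 * Callers typically queue a number of completions with
 * xnb_tx_mark_complete() and then make a single call to
 * xnb_tx_notify_peer(), as xnb_from_peer() does.
 */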
1256
1257static void
1258xnb_txbuf_recycle(xnb_txbuf_t *txp)
1259{
1260	xnb_t *xnbp = txp->xt_xnbp;
1261
1262	kmem_cache_free(xnbp->xnb_tx_buf_cache, txp);
1263
1264	xnbp->xnb_tx_buf_outstanding--;
1265}
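
/*
 * xnb_txbuf_recycle() is installed as the desballoc(9F) free routine
 * for the mblk built over xt_buf in xnb_from_peer(), so a buffer
 * returns to the cache when the rest of the stack frees that mblk.
 */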
1266
1267static int
1268xnb_txbuf_constructor(void *buf, void *arg, int kmflag)
1269{
1270	_NOTE(ARGUNUSED(kmflag));
1271	xnb_txbuf_t *txp = buf;
1272	xnb_t *xnbp = arg;
1273	size_t len;
1274	ddi_dma_cookie_t dma_cookie;
1275	uint_t ncookies;
1276
1277	txp->xt_free_rtn.free_func = xnb_txbuf_recycle;
1278	txp->xt_free_rtn.free_arg = (caddr_t)txp;
1279	txp->xt_xnbp = xnbp;
1280	txp->xt_next = NULL;
1281
1282	if (ddi_dma_alloc_handle(xnbp->xnb_devinfo, &buf_dma_attr,
1283	    0, 0, &txp->xt_dma_handle) != DDI_SUCCESS)
1284		goto failure;
1285
1286	if (ddi_dma_mem_alloc(txp->xt_dma_handle, PAGESIZE, &data_accattr,
1287	    DDI_DMA_STREAMING, 0, 0, &txp->xt_buf, &len,
1288	    &txp->xt_acc_handle) != DDI_SUCCESS)
1289		goto failure_1;
1290
1291	if (ddi_dma_addr_bind_handle(txp->xt_dma_handle, NULL, txp->xt_buf,
1292	    len, DDI_DMA_RDWR | DDI_DMA_STREAMING, DDI_DMA_DONTWAIT, 0,
1293	    &dma_cookie, &ncookies)
1294	    != DDI_DMA_MAPPED)
1295		goto failure_2;
1296	ASSERT(ncookies == 1);
1297
1298	txp->xt_mfn = xnb_btop(dma_cookie.dmac_laddress);
1299	txp->xt_buflen = dma_cookie.dmac_size;
1300
1301	DTRACE_PROBE(txbuf_allocated);
1302
1303	atomic_add_32(&xnbp->xnb_tx_buf_count, 1);
1304	xnbp->xnb_tx_buf_outstanding++;
1305
1306	return (0);
1307
1308failure_2:
1309	ddi_dma_mem_free(&txp->xt_acc_handle);
1310
1311failure_1:
1312	ddi_dma_free_handle(&txp->xt_dma_handle);
1313
1314failure:
1315
1316	return (-1);
1317}
1318
1319static void
1320xnb_txbuf_destructor(void *buf, void *arg)
1321{
1322	xnb_txbuf_t *txp = buf;
1323	xnb_t *xnbp = arg;
1324
1325	(void) ddi_dma_unbind_handle(txp->xt_dma_handle);
1326	ddi_dma_mem_free(&txp->xt_acc_handle);
1327	ddi_dma_free_handle(&txp->xt_dma_handle);
1328
1329	atomic_add_32(&xnbp->xnb_tx_buf_count, -1);
1330}
1331
1332/*
1333 * Take packets from the peer and deliver them onward.
1334 */
1335static mblk_t *
1336xnb_from_peer(xnb_t *xnbp)
1337{
1338	RING_IDX start, end, loop;
1339	gnttab_copy_t *cop;
1340	xnb_txbuf_t **txpp;
1341	netif_tx_request_t *txreq;
1342	boolean_t work_to_do, need_notify = B_FALSE;
1343	mblk_t *head, *tail;
1344	int n_data_req, i;
1345
1346	ASSERT(MUTEX_HELD(&xnbp->xnb_tx_lock));
1347
1348	head = tail = NULL;
1349around:
1350
1351	/* LINTED: constant in conditional context */
1352	RING_FINAL_CHECK_FOR_REQUESTS(&xnbp->xnb_tx_ring, work_to_do);
1353	if (!work_to_do) {
1354finished:
1355		xnb_tx_notify_peer(xnbp, need_notify);
1356
1357		return (head);
1358	}
1359
1360	start = xnbp->xnb_tx_ring.req_cons;
1361	end = xnbp->xnb_tx_ring.sring->req_prod;
1362
1363	if ((end - start) > NET_TX_RING_SIZE) {
1364		/*
1365		 * This usually indicates that the frontend driver is
1366		 * misbehaving, as it's not possible to have more than
1367		 * NET_TX_RING_SIZE ring elements in play at any one
1368		 * time.
1369		 *
1370		 * We reset the ring pointers to the state declared by
1371		 * the frontend and try to carry on.
1372		 */
1373		cmn_err(CE_WARN, "xnb_from_peer: domain %d tried to give us %u "
1374		    "items in the ring, resetting and trying to recover.",
1375		    xnbp->xnb_peer, (end - start));
1376
1377		/* LINTED: constant in conditional context */
1378		BACK_RING_ATTACH(&xnbp->xnb_tx_ring,
1379		    (netif_tx_sring_t *)xnbp->xnb_tx_ring_addr, PAGESIZE);
1380
1381		goto around;
1382	}
1383
1384	loop = start;
1385	cop = xnbp->xnb_tx_cop;
1386	txpp = xnbp->xnb_tx_bufp;
1387	n_data_req = 0;
1388
1389	while (loop < end) {
1390		static const uint16_t acceptable_flags =
1391		    NETTXF_csum_blank |
1392		    NETTXF_data_validated |
1393		    NETTXF_extra_info;
1394		uint16_t unexpected_flags;
1395
1396		txreq = RING_GET_REQUEST(&xnbp->xnb_tx_ring, loop);
1397
1398		unexpected_flags = txreq->flags & ~acceptable_flags;
1399		if (unexpected_flags != 0) {
1400			/*
1401			 * The peer used flag bits that we do not
1402			 * recognize.
1403			 */
1404			cmn_err(CE_WARN, "xnb_from_peer: "
1405			    "unexpected flag bits (0x%x) from peer "
1406			    "in transmit request",
1407			    unexpected_flags);
1408			xnbp->xnb_stat_tx_unexpected_flags++;
1409
1410			/* Mark this entry as failed. */
1411			xnb_tx_mark_complete(xnbp, txreq->id, NETIF_RSP_ERROR);
1412			need_notify = B_TRUE;
1413
1414		} else if (txreq->flags & NETTXF_extra_info) {
1415			struct netif_extra_info *erp;
1416			boolean_t status;
1417
1418			loop++; /* Consume another slot in the ring. */
1419			ASSERT(loop <= end);
1420
1421			erp = (struct netif_extra_info *)
1422			    RING_GET_REQUEST(&xnbp->xnb_tx_ring, loop);
1423
1424			switch (erp->type) {
1425			case XEN_NETIF_EXTRA_TYPE_MCAST_ADD:
1426				ASSERT(xnbp->xnb_multicast_control);
1427				status = xnbp->xnb_flavour->xf_mcast_add(xnbp,
1428				    &erp->u.mcast.addr);
1429				break;
1430			case XEN_NETIF_EXTRA_TYPE_MCAST_DEL:
1431				ASSERT(xnbp->xnb_multicast_control);
1432				status = xnbp->xnb_flavour->xf_mcast_del(xnbp,
1433				    &erp->u.mcast.addr);
1434				break;
1435			default:
1436				status = B_FALSE;
1437				cmn_err(CE_WARN, "xnb_from_peer: "
1438				    "unknown extra type %d", erp->type);
1439				break;
1440			}
1441
1442			xnb_tx_mark_complete(xnbp, txreq->id,
1443			    status ? NETIF_RSP_OKAY : NETIF_RSP_ERROR);
1444			need_notify = B_TRUE;
1445
1446		} else if ((txreq->offset > PAGESIZE) ||
1447		    (txreq->offset + txreq->size > PAGESIZE)) {
1448			/*
1449			 * Peer attempted to refer to data beyond the
1450			 * end of the granted page.
1451			 */
1452			cmn_err(CE_WARN, "xnb_from_peer: "
1453			    "attempt to refer beyond the end of granted "
1454			    "page in txreq (offset %d, size %d).",
1455			    txreq->offset, txreq->size);
1456			xnbp->xnb_stat_tx_overflow_page++;
1457
1458			/* Mark this entry as failed. */
1459			xnb_tx_mark_complete(xnbp, txreq->id, NETIF_RSP_ERROR);
1460			need_notify = B_TRUE;
1461
1462		} else {
1463			xnb_txbuf_t *txp;
1464
1465			txp = kmem_cache_alloc(xnbp->xnb_tx_buf_cache,
1466			    KM_NOSLEEP);
1467			if (txp == NULL)
1468				break;
1469
1470			txp->xt_mblk = desballoc((unsigned char *)txp->xt_buf,
1471			    txp->xt_buflen, 0, &txp->xt_free_rtn);
1472			if (txp->xt_mblk == NULL) {
1473				kmem_cache_free(xnbp->xnb_tx_buf_cache, txp);
1474				break;
1475			}
1476
1477			txp->xt_idx = loop;
1478			txp->xt_id = txreq->id;
1479
1480			cop->source.u.ref = txreq->gref;
1481			cop->source.domid = xnbp->xnb_peer;
1482			cop->source.offset = txreq->offset;
1483
1484			cop->dest.u.gmfn = txp->xt_mfn;
1485			cop->dest.domid = DOMID_SELF;
1486			cop->dest.offset = 0;
1487
1488			cop->len = txreq->size;
1489			cop->flags = GNTCOPY_source_gref;
1490			cop->status = 0;
1491
1492			*txpp = txp;
1493
1494			txpp++;
1495			cop++;
1496			n_data_req++;
1497
1498			ASSERT(n_data_req <= NET_TX_RING_SIZE);
1499		}
1500
1501		loop++;
1502	}
1503
1504	xnbp->xnb_tx_ring.req_cons = loop;
1505
1506	if (n_data_req == 0)
1507		goto around;
1508
1509	if (HYPERVISOR_grant_table_op(GNTTABOP_copy,
1510	    xnbp->xnb_tx_cop, n_data_req) != 0) {
1511
1512		cmn_err(CE_WARN, "xnb_from_peer: copy operation failed");
1513
1514		txpp = xnbp->xnb_tx_bufp;
1515		i = n_data_req;
1516		while (i > 0) {
1517			kmem_cache_free(xnbp->xnb_tx_buf_cache, *txpp);
1518			txpp++;
1519			i--;
1520		}
1521
1522		goto finished;
1523	}
1524
1525	txpp = xnbp->xnb_tx_bufp;
1526	cop = xnbp->xnb_tx_cop;
1527	i = n_data_req;
1528
1529	while (i > 0) {
1530		xnb_txbuf_t *txp = *txpp;
1531
1532		txreq = RING_GET_REQUEST(&xnbp->xnb_tx_ring, txp->xt_idx);
1533
1534		if (cop->status != 0) {
1535#ifdef XNB_DEBUG
1536			cmn_err(CE_WARN, "xnb_from_peer: "
1537			    "txpp 0x%p failed (%d)",
1538			    (void *)*txpp, cop->status);
1539#endif /* XNB_DEBUG */
1540			xnb_tx_mark_complete(xnbp, txp->xt_id, NETIF_RSP_ERROR);
1541			freemsg(txp->xt_mblk);
1542		} else {
1543			mblk_t *mp;
1544
1545			mp = txp->xt_mblk;
1546			mp->b_rptr = mp->b_wptr = (unsigned char *)txp->xt_buf;
1547			mp->b_wptr += txreq->size;
1548			mp->b_next = NULL;
1549
1550			/*
1551			 * If there are checksum flags, process them
1552			 * appropriately.
1553			 */
1554			if ((txreq->flags &
1555			    (NETTXF_csum_blank | NETTXF_data_validated))
1556			    != 0) {
1557				mp = xnbp->xnb_flavour->xf_cksum_from_peer(xnbp,
1558				    mp, txreq->flags);
1559				xnbp->xnb_stat_tx_cksum_no_need++;
1560
1561				txp->xt_mblk = mp;
1562			}
1563
1564			if (head == NULL) {
1565				ASSERT(tail == NULL);
1566				head = mp;
1567			} else {
1568				ASSERT(tail != NULL);
1569				tail->b_next = mp;
1570			}
1571			tail = mp;
1572
1573			xnbp->xnb_stat_opackets++;
1574			xnbp->xnb_stat_obytes += txreq->size;
1575
1576			xnb_tx_mark_complete(xnbp, txp->xt_id, NETIF_RSP_OKAY);
1577		}
1578
1579		txpp++;
1580		cop++;
1581		i--;
1582	}
1583
1584	goto around;
1585	/* NOTREACHED */
1586}
1587
1588static uint_t
1589xnb_intr(caddr_t arg)
1590{
1591	xnb_t *xnbp = (xnb_t *)arg;
1592	mblk_t *mp;
1593
1594	xnbp->xnb_stat_intr++;
1595
1596	mutex_enter(&xnbp->xnb_tx_lock);
1597
1598	ASSERT(xnbp->xnb_connected);
1599
1600	mp = xnb_from_peer(xnbp);
1601
1602	mutex_exit(&xnbp->xnb_tx_lock);
1603
1604	if (!xnbp->xnb_hotplugged) {
1605		xnbp->xnb_stat_tx_too_early++;
1606		goto fail;
1607	}
1608	if (mp == NULL) {
1609		xnbp->xnb_stat_spurious_intr++;
1610		goto fail;
1611	}
1612
1613	xnbp->xnb_flavour->xf_from_peer(xnbp, mp);
1614
1615	return (DDI_INTR_CLAIMED);
1616
1617fail:
1618	freemsgchain(mp);
1619	return (DDI_INTR_CLAIMED);
1620}
1621
1622/*
1623 * Read our configuration from xenstore.
1624 */
1625boolean_t
1626xnb_read_xs_config(xnb_t *xnbp)
1627{
1628	char *xsname;
1629	char mac[ETHERADDRL * 3];
1630
1631	xsname = xvdi_get_xsname(xnbp->xnb_devinfo);
1632
1633	if (xenbus_scanf(XBT_NULL, xsname,
1634	    "mac", "%s", mac) != 0) {
		cmn_err(CE_WARN, "xnb_read_xs_config: "
1636		    "cannot read mac address from %s",
1637		    xsname);
1638		return (B_FALSE);
1639	}
1640
1641	if (ether_aton(mac, xnbp->xnb_mac_addr) != ETHERADDRL) {
1642		cmn_err(CE_WARN,
		    "xnb_read_xs_config: cannot parse mac address %s",
1644		    mac);
1645		return (B_FALSE);
1646	}
1647
1648	return (B_TRUE);
1649}
1650
1651/*
1652 * Read the configuration of the peer from xenstore.
1653 */
1654boolean_t
1655xnb_read_oe_config(xnb_t *xnbp)
1656{
1657	char *oename;
1658	int i;
1659
1660	oename = xvdi_get_oename(xnbp->xnb_devinfo);
1661
1662	if (xenbus_gather(XBT_NULL, oename,
1663	    "event-channel", "%u", &xnbp->xnb_fe_evtchn,
1664	    "tx-ring-ref", "%lu", &xnbp->xnb_tx_ring_ref,
1665	    "rx-ring-ref", "%lu", &xnbp->xnb_rx_ring_ref,
1666	    NULL) != 0) {
1667		cmn_err(CE_WARN, "xnb_read_oe_config: "
1668		    "cannot read other-end details from %s",
1669		    oename);
1670		return (B_FALSE);
1671	}
1672
1673	/*
1674	 * Check whether our peer requests receive side hypervisor
1675	 * copy.
1676	 */
1677	if (xenbus_scanf(XBT_NULL, oename,
1678	    "request-rx-copy", "%d", &i) != 0)
1679		i = 0;
1680	if (i != 0)
1681		xnbp->xnb_rx_hv_copy = B_TRUE;
1682
1683	/*
1684	 * Check whether our peer requests multicast_control.
1685	 */
1686	if (xenbus_scanf(XBT_NULL, oename,
1687	    "request-multicast-control", "%d", &i) != 0)
1688		i = 0;
1689	if (i != 0)
1690		xnbp->xnb_multicast_control = B_TRUE;
1691
1692	/*
1693	 * The Linux backend driver here checks to see if the peer has
1694	 * set 'feature-no-csum-offload'. This is used to indicate
1695	 * that the guest cannot handle receiving packets without a
1696	 * valid checksum. We don't check here, because packets passed
1697	 * to the peer _always_ have a valid checksum.
1698	 *
1699	 * There are three cases:
1700	 *
1701	 * - the NIC is dedicated: packets from the wire should always
1702	 *   have a valid checksum. If the hardware validates the
1703	 *   checksum then the relevant bit will be set in the packet
1704	 *   attributes and we will inform the peer. It can choose to
1705	 *   ignore the hardware verification.
1706	 *
1707	 * - the NIC is shared (VNIC) and a packet originates from the
1708	 *   wire: this is the same as the case above - the packets
1709	 *   will have a valid checksum.
1710	 *
1711	 * - the NIC is shared (VNIC) and a packet originates from the
1712	 *   host: the MAC layer ensures that all such packets have a
1713	 *   valid checksum by calculating one if the stack did not.
1714	 */
1715
1716	return (B_TRUE);
1717}
1718
1719void
1720xnb_start_connect(xnb_t *xnbp)
1721{
1722	dev_info_t  *dip = xnbp->xnb_devinfo;
1723
1724	if (!xnb_connect_rings(dip)) {
1725		cmn_err(CE_WARN, "xnb_start_connect: "
1726		    "cannot connect rings");
1727		goto failed;
1728	}
1729
1730	if (!xnbp->xnb_flavour->xf_start_connect(xnbp)) {
1731		cmn_err(CE_WARN, "xnb_start_connect: "
1732		    "flavour failed to connect");
1733		goto failed;
1734	}
1735
1736	(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateConnected);
1737	return;
1738
1739failed:
1740	xnbp->xnb_flavour->xf_peer_disconnected(xnbp);
1741	xnb_disconnect_rings(dip);
1742	(void) xvdi_switch_state(dip, XBT_NULL,
1743	    XenbusStateClosed);
1744	(void) xvdi_post_event(dip, XEN_HP_REMOVE);
1745}
1746
1747static boolean_t
1748xnb_connect_rings(dev_info_t *dip)
1749{
1750	xnb_t *xnbp = ddi_get_driver_private(dip);
1751	struct gnttab_map_grant_ref map_op;
1752
1753	/*
1754	 * Cannot attempt to connect the rings if already connected.
1755	 */
1756	ASSERT(!xnbp->xnb_connected);
1757
1758	/*
1759	 * 1. allocate a vaddr for the tx page, one for the rx page.
1760	 * 2. call GNTTABOP_map_grant_ref to map the relevant pages
1761	 *    into the allocated vaddr (one for tx, one for rx).
1762	 * 3. call EVTCHNOP_bind_interdomain to have the event channel
1763	 *    bound to this domain.
1764	 * 4. associate the event channel with an interrupt.
1765	 * 5. enable the interrupt.
1766	 */
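
	/*
	 * If any of these steps fails, the partially constructed state
	 * (mapped ring pages, bound event channel) is torn down by
	 * xnb_disconnect_rings(), which xnb_start_connect() calls on
	 * its failure path.
	 */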
1767
1768	/* 1.tx */
1769	xnbp->xnb_tx_ring_addr = vmem_xalloc(heap_arena, PAGESIZE, PAGESIZE,
1770	    0, 0, 0, 0, VM_SLEEP);
1771	ASSERT(xnbp->xnb_tx_ring_addr != NULL);
1772
1773	/* 2.tx */
1774	map_op.host_addr = (uint64_t)((long)xnbp->xnb_tx_ring_addr);
1775	map_op.flags = GNTMAP_host_map;
1776	map_op.ref = xnbp->xnb_tx_ring_ref;
1777	map_op.dom = xnbp->xnb_peer;
1778	hat_prepare_mapping(kas.a_hat, xnbp->xnb_tx_ring_addr, NULL);
1779	if (xen_map_gref(GNTTABOP_map_grant_ref, &map_op, 1, B_FALSE) != 0 ||
1780	    map_op.status != 0) {
1781		cmn_err(CE_WARN, "xnb_connect_rings: cannot map tx-ring page.");
1782		goto fail;
1783	}
1784	xnbp->xnb_tx_ring_handle = map_op.handle;
1785
1786	/* LINTED: constant in conditional context */
1787	BACK_RING_INIT(&xnbp->xnb_tx_ring,
1788	    (netif_tx_sring_t *)xnbp->xnb_tx_ring_addr, PAGESIZE);
1789
1790	/* 1.rx */
1791	xnbp->xnb_rx_ring_addr = vmem_xalloc(heap_arena, PAGESIZE, PAGESIZE,
1792	    0, 0, 0, 0, VM_SLEEP);
1793	ASSERT(xnbp->xnb_rx_ring_addr != NULL);
1794
1795	/* 2.rx */
1796	map_op.host_addr = (uint64_t)((long)xnbp->xnb_rx_ring_addr);
1797	map_op.flags = GNTMAP_host_map;
1798	map_op.ref = xnbp->xnb_rx_ring_ref;
1799	map_op.dom = xnbp->xnb_peer;
1800	hat_prepare_mapping(kas.a_hat, xnbp->xnb_rx_ring_addr, NULL);
1801	if (xen_map_gref(GNTTABOP_map_grant_ref, &map_op, 1, B_FALSE) != 0 ||
1802	    map_op.status != 0) {
1803		cmn_err(CE_WARN, "xnb_connect_rings: cannot map rx-ring page.");
1804		goto fail;
1805	}
1806	xnbp->xnb_rx_ring_handle = map_op.handle;
1807
1808	/* LINTED: constant in conditional context */
1809	BACK_RING_INIT(&xnbp->xnb_rx_ring,
1810	    (netif_rx_sring_t *)xnbp->xnb_rx_ring_addr, PAGESIZE);
1811
1812	/* 3 */
1813	if (xvdi_bind_evtchn(dip, xnbp->xnb_fe_evtchn) != DDI_SUCCESS) {
1814		cmn_err(CE_WARN, "xnb_connect_rings: "
		    "cannot bind event channel %d", xnbp->xnb_fe_evtchn);
1816		xnbp->xnb_evtchn = INVALID_EVTCHN;
1817		goto fail;
1818	}
1819	xnbp->xnb_evtchn = xvdi_get_evtchn(dip);
1820
1821	/*
1822	 * It would be good to set the state to XenbusStateConnected
1823	 * here as well, but then what if ddi_add_intr() failed?
1824	 * Changing the state in the store will be noticed by the peer
1825	 * and cannot be "taken back".
1826	 */
1827	mutex_enter(&xnbp->xnb_tx_lock);
1828	mutex_enter(&xnbp->xnb_rx_lock);
1829
1830	xnbp->xnb_connected = B_TRUE;
1831
1832	mutex_exit(&xnbp->xnb_rx_lock);
1833	mutex_exit(&xnbp->xnb_tx_lock);
1834
1835	/* 4, 5 */
1836	if (ddi_add_intr(dip, 0, NULL, NULL, xnb_intr, (caddr_t)xnbp)
1837	    != DDI_SUCCESS) {
1838		cmn_err(CE_WARN, "xnb_connect_rings: cannot add interrupt");
1839		goto fail;
1840	}
1841	xnbp->xnb_irq = B_TRUE;
1842
1843	return (B_TRUE);
1844
1845fail:
1846	mutex_enter(&xnbp->xnb_tx_lock);
1847	mutex_enter(&xnbp->xnb_rx_lock);
1848
1849	xnbp->xnb_connected = B_FALSE;
1850
1851	mutex_exit(&xnbp->xnb_rx_lock);
1852	mutex_exit(&xnbp->xnb_tx_lock);
1853
1854	return (B_FALSE);
1855}
1856
1857static void
1858xnb_disconnect_rings(dev_info_t *dip)
1859{
1860	xnb_t *xnbp = ddi_get_driver_private(dip);
1861
1862	if (xnbp->xnb_irq) {
1863		ddi_remove_intr(dip, 0, NULL);
1864		xnbp->xnb_irq = B_FALSE;
1865	}
1866
1867	if (xnbp->xnb_evtchn != INVALID_EVTCHN) {
1868		xvdi_free_evtchn(dip);
1869		xnbp->xnb_evtchn = INVALID_EVTCHN;
1870	}
1871
1872	if (xnbp->xnb_rx_ring_handle != INVALID_GRANT_HANDLE) {
1873		struct gnttab_unmap_grant_ref unmap_op;
1874
1875		unmap_op.host_addr = (uint64_t)(uintptr_t)
1876		    xnbp->xnb_rx_ring_addr;
1877		unmap_op.dev_bus_addr = 0;
1878		unmap_op.handle = xnbp->xnb_rx_ring_handle;
1879		if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
1880		    &unmap_op, 1) != 0)
1881			cmn_err(CE_WARN, "xnb_disconnect_rings: "
1882			    "cannot unmap rx-ring page (%d)",
1883			    unmap_op.status);
1884
1885		xnbp->xnb_rx_ring_handle = INVALID_GRANT_HANDLE;
1886	}
1887
1888	if (xnbp->xnb_rx_ring_addr != NULL) {
1889		hat_release_mapping(kas.a_hat, xnbp->xnb_rx_ring_addr);
1890		vmem_free(heap_arena, xnbp->xnb_rx_ring_addr, PAGESIZE);
1891		xnbp->xnb_rx_ring_addr = NULL;
1892	}
1893
1894	if (xnbp->xnb_tx_ring_handle != INVALID_GRANT_HANDLE) {
1895		struct gnttab_unmap_grant_ref unmap_op;
1896
1897		unmap_op.host_addr = (uint64_t)(uintptr_t)
1898		    xnbp->xnb_tx_ring_addr;
1899		unmap_op.dev_bus_addr = 0;
1900		unmap_op.handle = xnbp->xnb_tx_ring_handle;
1901		if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
1902		    &unmap_op, 1) != 0)
1903			cmn_err(CE_WARN, "xnb_disconnect_rings: "
1904			    "cannot unmap tx-ring page (%d)",
1905			    unmap_op.status);
1906
1907		xnbp->xnb_tx_ring_handle = INVALID_GRANT_HANDLE;
1908	}
1909
1910	if (xnbp->xnb_tx_ring_addr != NULL) {
1911		hat_release_mapping(kas.a_hat, xnbp->xnb_tx_ring_addr);
1912		vmem_free(heap_arena, xnbp->xnb_tx_ring_addr, PAGESIZE);
1913		xnbp->xnb_tx_ring_addr = NULL;
1914	}
1915}
1916
1917static void
1918xnb_oe_state_change(dev_info_t *dip, ddi_eventcookie_t id,
1919    void *arg, void *impl_data)
1920{
1921	_NOTE(ARGUNUSED(id, arg));
1922	xnb_t *xnbp = ddi_get_driver_private(dip);
1923	XenbusState new_state = *(XenbusState *)impl_data;
1924
1925	ASSERT(xnbp != NULL);
1926
1927	switch (new_state) {
1928	case XenbusStateConnected:
1929		/* spurious state change */
1930		if (xnbp->xnb_connected)
1931			return;
1932
1933		if (!xnb_read_oe_config(xnbp) ||
1934		    !xnbp->xnb_flavour->xf_peer_connected(xnbp)) {
1935			cmn_err(CE_WARN, "xnb_oe_state_change: "
1936			    "read otherend config error");
1937			(void) xvdi_switch_state(dip, XBT_NULL,
1938			    XenbusStateClosed);
1939			(void) xvdi_post_event(dip, XEN_HP_REMOVE);
1940
1941			break;
1942		}
1943
1945		mutex_enter(&xnbp->xnb_state_lock);
1946		xnbp->xnb_fe_status = XNB_STATE_READY;
1947		if (xnbp->xnb_be_status == XNB_STATE_READY)
1948			xnb_start_connect(xnbp);
1949		mutex_exit(&xnbp->xnb_state_lock);
1950
1951		/*
		 * Now that we've attempted to connect, it's reasonable
1953		 * to allow an attempt to detach.
1954		 */
1955		xnbp->xnb_detachable = B_TRUE;
1956
1957		break;
1958
1959	case XenbusStateClosing:
1960		(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateClosing);
1961
1962		break;
1963
1964	case XenbusStateClosed:
1965		xnbp->xnb_flavour->xf_peer_disconnected(xnbp);
1966
1967		mutex_enter(&xnbp->xnb_tx_lock);
1968		mutex_enter(&xnbp->xnb_rx_lock);
1969
1970		xnb_disconnect_rings(dip);
1971		xnbp->xnb_connected = B_FALSE;
1972
1973		mutex_exit(&xnbp->xnb_rx_lock);
1974		mutex_exit(&xnbp->xnb_tx_lock);
1975
1976		(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateClosed);
1977		(void) xvdi_post_event(dip, XEN_HP_REMOVE);
1978		/*
		 * In all likelihood this is already set (in the above
1980		 * case), but if the peer never attempted to connect
1981		 * and the domain is destroyed we get here without
1982		 * having been through the case above, so we set it to
1983		 * be sure.
1984		 */
1985		xnbp->xnb_detachable = B_TRUE;
1986
1987		break;
1988
1989	default:
1990		break;
1991	}
1992}
1993
1994static void
1995xnb_hp_state_change(dev_info_t *dip, ddi_eventcookie_t id,
1996    void *arg, void *impl_data)
1997{
1998	_NOTE(ARGUNUSED(id, arg));
1999	xnb_t *xnbp = ddi_get_driver_private(dip);
2000	xendev_hotplug_state_t state = *(xendev_hotplug_state_t *)impl_data;
2001
2002	ASSERT(xnbp != NULL);
2003
2004	switch (state) {
2005	case Connected:
2006		/* spurious hotplug event */
2007		if (xnbp->xnb_hotplugged)
2008			break;
2009
2010		if (!xnb_read_xs_config(xnbp))
2011			break;
2012
2013		if (!xnbp->xnb_flavour->xf_hotplug_connected(xnbp))
2014			break;
2015
2016		mutex_enter(&xnbp->xnb_tx_lock);
2017		mutex_enter(&xnbp->xnb_rx_lock);
2018
2019		xnbp->xnb_hotplugged = B_TRUE;
2020
2021		mutex_exit(&xnbp->xnb_rx_lock);
2022		mutex_exit(&xnbp->xnb_tx_lock);
2023
2024		mutex_enter(&xnbp->xnb_state_lock);
2025		xnbp->xnb_be_status = XNB_STATE_READY;
2026		if (xnbp->xnb_fe_status == XNB_STATE_READY)
2027			xnb_start_connect(xnbp);
2028		mutex_exit(&xnbp->xnb_state_lock);
2029
2030		break;
2031
2032	default:
2033		break;
2034	}
2035}
2036
2037static struct modldrv modldrv = {
2038	&mod_miscops, "xnb",
2039};
2040
2041static struct modlinkage modlinkage = {
2042	MODREV_1, &modldrv, NULL
2043};
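
/*
 * xnb is a misc module (mod_miscops): it does not attach devices
 * itself, but provides xnb_attach(), xnb_detach() and the rest of the
 * common backend machinery to the flavour drivers (e.g. xnbo and
 * xnbu) that are layered on top of it.
 */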
2044
2045int
2046_init(void)
2047{
2048	int i;
2049
2050	mutex_init(&xnb_alloc_page_lock, NULL, MUTEX_DRIVER, NULL);
2051
2052	i = mod_install(&modlinkage);
2053	if (i != DDI_SUCCESS)
2054		mutex_destroy(&xnb_alloc_page_lock);
2055
2056	return (i);
2057}
2058
2059int
2060_info(struct modinfo *modinfop)
2061{
2062	return (mod_info(&modlinkage, modinfop));
2063}
2064
2065int
2066_fini(void)
2067{
2068	int i;
2069
2070	i = mod_remove(&modlinkage);
2071	if (i == DDI_SUCCESS)
2072		mutex_destroy(&xnb_alloc_page_lock);
2073
2074	return (i);
2075}
2076