1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24 * Use is subject to license terms.
25 */
26
27/*
28 * xdf.c - Xen Virtual Block Device Driver
29 * TODO:
30 *	- support alternate block size (currently only DEV_BSIZE supported)
31 *	- revalidate geometry for removable devices
32 *
33 * This driver exports Solaris disk device nodes, accepts IO requests from
34 * those nodes, and services those requests by talking to a backend device
35 * in another domain.
36 *
37 * Communication with the backend device is done via a ringbuffer (which is
38 * managed via xvdi interfaces) and dma memory (which is managed via ddi
39 * interfaces).
40 *
41 * Communication with the backend device is dependent upon establishing a
42 * connection to the backend device.  This connection process involves
43 * reading device configuration information from xenbus and publishing
44 * some frontend runtime configuration parameters via the xenbus (for
45 * consumption by the backend).  Once we've published runtime configuration
46 * information via the xenbus, the backend device can enter the connected
47 * state and we'll enter the XD_CONNECTED state.  But before we can allow
48 * random IO to begin, we need to do IO to the backend device to determine
49 * the device label and if flush operations are supported.  Once this is
50 * done we enter the XD_READY state and can process any IO operations.
51 *
52 * We receive notifications of xenbus state changes for the backend device
53 * (aka, the "other end") via the xdf_oe_change() callback.  This callback
54 * is single threaded, meaning that we can't receive new notifications of
55 * other end state changes while we're processing an outstanding
56 * notification of an other end state change.  Therefore we can't do any
57 * blocking operations from the xdf_oe_change() callback.  This is why we
58 * have a separate taskq (xdf_ready_tq) which exists to do the necessary
59 * IO to get us from the XD_CONNECTED to the XD_READY state.  All IO
60 * generated by the xdf_ready_tq thread (xdf_ready_tq_thread) will go
61 * through xdf_lb_rdwr(), which is a synchronous IO interface.  IOs
62 * generated by the xdf_ready_tq_thread thread have priority over all
63 * other IO requests.
64 *
65 * We also communicate with the backend device via the xenbus "media-req"
66 * (XBP_MEDIA_REQ) property.  For more information on this see the
67 * comments in blkif.h.
68 */
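
/*
 * Summary of the typical state progression described above (illustrative
 * only; see xdf_setstate_init(), xdf_setstate_connected(),
 * xdf_setstate_ready(), and xdf_disconnect() below for the authoritative
 * transitions):
 *
 *	XD_UNKNOWN/XD_CLOSED -- xdf_setstate_init() -------> XD_INIT
 *	XD_INIT ------------- xdf_setstate_connected() ----> XD_CONNECTED
 *	XD_CONNECTED -------- xdf_ready_tq thread ---------> XD_READY
 *
 * Failures and backend disconnects funnel through xdf_disconnect(), which
 * returns us to XD_UNKNOWN or XD_CLOSED.
 */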
69
70#include <io/xdf.h>
71
72#include <sys/conf.h>
73#include <sys/dkio.h>
74#include <sys/promif.h>
75#include <sys/sysmacros.h>
76#include <sys/kstat.h>
77#include <sys/mach_mmu.h>
78#ifdef XPV_HVM_DRIVER
79#include <sys/xpv_support.h>
80#include <sys/sunndi.h>
81#else /* !XPV_HVM_DRIVER */
82#include <sys/evtchn_impl.h>
83#endif /* !XPV_HVM_DRIVER */
84#include <public/io/xenbus.h>
85#include <xen/sys/xenbus_impl.h>
86#include <sys/scsi/generic/inquiry.h>
87#include <xen/io/blkif_impl.h>
88#include <sys/fdio.h>
89#include <sys/cdio.h>
90
91/*
92 * DEBUG_EVAL can be used to include debug-only statements without
93 * having to wrap them in '#ifdef DEBUG' blocks.
94 */
95#ifdef DEBUG
96#define	DEBUG_EVAL(x)	(x)
97#else /* !DEBUG */
98#define	DEBUG_EVAL(x)
99#endif /* !DEBUG */
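
/*
 * For example, the other end state change callback below records its
 * current thread only on DEBUG builds:
 *
 *	DEBUG_EVAL(vdp->xdf_oe_change_thread = curthread);
 */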
100
101#define	XDF_DRAIN_MSEC_DELAY		(50*1000)	/* 00.05 sec */
102#define	XDF_DRAIN_RETRY_COUNT		200		/* 10.00 sec */
103
104#define	INVALID_DOMID	((domid_t)-1)
105#define	FLUSH_DISKCACHE	0x1
106#define	WRITE_BARRIER	0x2
107#define	DEFAULT_FLUSH_BLOCK	156 /* block to write to cause a cache flush */
108#define	USE_WRITE_BARRIER(vdp)						\
109	((vdp)->xdf_feature_barrier && !(vdp)->xdf_flush_supported)
110#define	USE_FLUSH_DISKCACHE(vdp)					\
111	((vdp)->xdf_feature_barrier && (vdp)->xdf_flush_supported)
112#define	IS_WRITE_BARRIER(vdp, bp)					\
113	(!IS_READ(bp) && USE_WRITE_BARRIER(vdp) &&			\
114	((bp)->b_un.b_addr == (vdp)->xdf_cache_flush_block))
115#define	IS_FLUSH_DISKCACHE(bp)						\
116	(!IS_READ(bp) && USE_FLUSH_DISKCACHE(vdp) && ((bp)->b_bcount == 0))
117
118#define	VREQ_DONE(vreq)							\
119	VOID2BOOLEAN(((vreq)->v_status == VREQ_DMAWIN_DONE) &&		\
120	    (((vreq)->v_flush_diskcache == FLUSH_DISKCACHE) ||		\
121	    (((vreq)->v_dmaw + 1) == (vreq)->v_ndmaws)))
122
123#define	BP_VREQ(bp)		((v_req_t *)((bp)->av_back))
124#define	BP_VREQ_SET(bp, vreq)	(((bp)->av_back = (buf_t *)(vreq)))
125
126extern int		do_polled_io;
127
128/* run-time tunables that we don't want the compiler to optimize away */
129volatile int		xdf_debug = 0;
130volatile boolean_t	xdf_barrier_flush_disable = B_FALSE;
131
132/* per module globals */
133major_t			xdf_major;
134static void		*xdf_ssp;
135static kmem_cache_t	*xdf_vreq_cache;
136static kmem_cache_t	*xdf_gs_cache;
137static int		xdf_maxphys = XB_MAXPHYS;
138static diskaddr_t	xdf_flush_block = DEFAULT_FLUSH_BLOCK;
139static int		xdf_fbrewrites;	/* flush block re-write count */
140
141/* misc public functions (used by xdf_shell.c) */
142int xdf_lb_rdwr(dev_info_t *, uchar_t, void *, diskaddr_t, size_t, void *);
143int xdf_lb_getinfo(dev_info_t *, int, void *, void *);
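
#if 0
/*
 * Illustrative sketch only (not compiled): one way a consumer such as
 * xdf_shell.c might read a single sector through the synchronous
 * xdf_lb_rdwr() interface.  The wrapper name and arguments are
 * hypothetical; real callers supply their own dip, buffer, and offset.
 */
static int
xdf_example_read_sector(dev_info_t *dip, void *buf, diskaddr_t blkno,
    size_t secsize)
{
	/* TG_READ/TG_WRITE select the transfer direction */
	return (xdf_lb_rdwr(dip, TG_READ, buf, blkno, secsize, NULL));
}
#endif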
144
145/*  misc private functions */
146static void xdf_io_start(xdf_t *);
147
148/* callbacks from common label */
149static cmlb_tg_ops_t xdf_lb_ops = {
150	TG_DK_OPS_VERSION_1,
151	xdf_lb_rdwr,
152	xdf_lb_getinfo
153};
154
155/*
156 * I/O buffer DMA attributes
157 * Make sure one DMA window contains at most BLKIF_MAX_SEGMENTS_PER_REQUEST segments
158 */
159static ddi_dma_attr_t xb_dma_attr = {
160	DMA_ATTR_V0,
161	(uint64_t)0,			/* lowest address */
162	(uint64_t)0xffffffffffffffff,	/* highest usable address */
163	(uint64_t)0xffffff,		/* DMA counter limit max */
164	(uint64_t)XB_BSIZE,		/* alignment in bytes */
165	XB_BSIZE - 1,			/* bitmap of burst sizes */
166	XB_BSIZE,			/* min transfer */
167	(uint64_t)XB_MAX_XFER, 		/* maximum transfer */
168	(uint64_t)PAGEOFFSET,		/* 1 page segment length  */
169	BLKIF_MAX_SEGMENTS_PER_REQUEST,	/* maximum number of segments */
170	XB_BSIZE,			/* granularity */
171	0,				/* flags (reserved) */
172};
173
174static ddi_device_acc_attr_t xc_acc_attr = {
175	DDI_DEVICE_ATTR_V0,
176	DDI_NEVERSWAP_ACC,
177	DDI_STRICTORDER_ACC
178};
179
180static void
181xdf_timeout_handler(void *arg)
182{
183	xdf_t *vdp = arg;
184
185	mutex_enter(&vdp->xdf_dev_lk);
186	vdp->xdf_timeout_id = 0;
187	mutex_exit(&vdp->xdf_dev_lk);
188
189	/* new timeout thread could be re-scheduled */
190	xdf_io_start(vdp);
191}
192
193/*
194 * callback invoked when DMA/GTE resources become available
195 *
196 * Note: we only register one callback function with the grant table
197 * subsystem since we only have one 'struct gnttab_free_callback' in xdf_t.
198 */
199static int
200xdf_dmacallback(caddr_t arg)
201{
202	xdf_t *vdp = (xdf_t *)arg;
203	ASSERT(vdp != NULL);
204
205	DPRINTF(DMA_DBG, ("xdf@%s: DMA callback started\n",
206	    vdp->xdf_addr));
207
208	ddi_trigger_softintr(vdp->xdf_softintr_id);
209	return (DDI_DMA_CALLBACK_DONE);
210}
211
212static ge_slot_t *
213gs_get(xdf_t *vdp, int isread)
214{
215	grant_ref_t gh;
216	ge_slot_t *gs;
217
218	/* first, try to alloc the GTEs needed by this slot */
219	if (gnttab_alloc_grant_references(
220	    BLKIF_MAX_SEGMENTS_PER_REQUEST, &gh) == -1) {
221		if (vdp->xdf_gnt_callback.next == NULL) {
222			SETDMACBON(vdp);
223			gnttab_request_free_callback(
224			    &vdp->xdf_gnt_callback,
225			    (void (*)(void *))xdf_dmacallback,
226			    (void *)vdp,
227			    BLKIF_MAX_SEGMENTS_PER_REQUEST);
228		}
229		return (NULL);
230	}
231
232	gs = kmem_cache_alloc(xdf_gs_cache, KM_NOSLEEP);
233	if (gs == NULL) {
234		gnttab_free_grant_references(gh);
235		if (vdp->xdf_timeout_id == 0)
236			/* restart I/O after one second */
237			vdp->xdf_timeout_id =
238			    timeout(xdf_timeout_handler, vdp, hz);
239		return (NULL);
240	}
241
242	/* init gs_slot */
243	gs->gs_oeid = vdp->xdf_peer;
244	gs->gs_isread = isread;
245	gs->gs_ghead = gh;
246	gs->gs_ngrefs = 0;
247
248	return (gs);
249}
250
251static void
252gs_free(ge_slot_t *gs)
253{
254	int		i;
255
256	/* release all grant table entry resources used in this slot */
257	for (i = 0; i < gs->gs_ngrefs; i++)
258		gnttab_end_foreign_access(gs->gs_ge[i], !gs->gs_isread, 0);
259	gnttab_free_grant_references(gs->gs_ghead);
260	list_remove(&gs->gs_vreq->v_gs, gs);
261	kmem_cache_free(xdf_gs_cache, gs);
262}
263
264static grant_ref_t
265gs_grant(ge_slot_t *gs, mfn_t mfn)
266{
267	grant_ref_t gr = gnttab_claim_grant_reference(&gs->gs_ghead);
268
269	ASSERT(gr != -1);
270	ASSERT(gs->gs_ngrefs < BLKIF_MAX_SEGMENTS_PER_REQUEST);
271	gs->gs_ge[gs->gs_ngrefs++] = gr;
272	gnttab_grant_foreign_access_ref(gr, gs->gs_oeid, mfn, !gs->gs_isread);
273
274	return (gr);
275}
276
277/*
278 * Alloc a vreq for this bp
279 * bp->av_back contains the pointer to the vreq upon return
280 */
281static v_req_t *
282vreq_get(xdf_t *vdp, buf_t *bp)
283{
284	v_req_t *vreq = NULL;
285
286	ASSERT(BP_VREQ(bp) == NULL);
287
288	vreq = kmem_cache_alloc(xdf_vreq_cache, KM_NOSLEEP);
289	if (vreq == NULL) {
290		if (vdp->xdf_timeout_id == 0)
291			/* restart I/O after one second */
292			vdp->xdf_timeout_id =
293			    timeout(xdf_timeout_handler, vdp, hz);
294		return (NULL);
295	}
296	bzero(vreq, sizeof (v_req_t));
297	list_create(&vreq->v_gs, sizeof (ge_slot_t),
298	    offsetof(ge_slot_t, gs_vreq_link));
299	vreq->v_buf = bp;
300	vreq->v_status = VREQ_INIT;
301	vreq->v_runq = B_FALSE;
302	BP_VREQ_SET(bp, vreq);
303	/* init of other fields in vreq is up to the caller */
304
305	list_insert_head(&vdp->xdf_vreq_act, (void *)vreq);
306
307	return (vreq);
308}
309
310static void
311vreq_free(xdf_t *vdp, v_req_t *vreq)
312{
313	buf_t	*bp = vreq->v_buf;
314
315	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
316	ASSERT(BP_VREQ(bp) == vreq);
317
318	list_remove(&vdp->xdf_vreq_act, vreq);
319
320	if (vreq->v_flush_diskcache == FLUSH_DISKCACHE)
321		goto done;
322
323	switch (vreq->v_status) {
324	case VREQ_DMAWIN_DONE:
325	case VREQ_GS_ALLOCED:
326	case VREQ_DMABUF_BOUND:
327		(void) ddi_dma_unbind_handle(vreq->v_dmahdl);
328		/*FALLTHRU*/
329	case VREQ_DMAMEM_ALLOCED:
330		if (!ALIGNED_XFER(bp)) {
331			ASSERT(vreq->v_abuf != NULL);
332			if (!IS_ERROR(bp) && IS_READ(bp))
333				bcopy(vreq->v_abuf, bp->b_un.b_addr,
334				    bp->b_bcount);
335			ddi_dma_mem_free(&vreq->v_align);
336		}
337		/*FALLTHRU*/
338	case VREQ_MEMDMAHDL_ALLOCED:
339		if (!ALIGNED_XFER(bp))
340			ddi_dma_free_handle(&vreq->v_memdmahdl);
341		/*FALLTHRU*/
342	case VREQ_DMAHDL_ALLOCED:
343		ddi_dma_free_handle(&vreq->v_dmahdl);
344		break;
345	default:
346		break;
347	}
348done:
349	ASSERT(!vreq->v_runq);
350	list_destroy(&vreq->v_gs);
351	kmem_cache_free(xdf_vreq_cache, vreq);
352}
353
354/*
355 * Snarf new data if our flush block was re-written
356 */
357static void
358check_fbwrite(xdf_t *vdp, buf_t *bp, daddr_t blkno)
359{
360	int nblks;
361	boolean_t mapin;
362
363	if (IS_WRITE_BARRIER(vdp, bp))
364		return; /* write was a flush write */
365
366	mapin = B_FALSE;
367	nblks = bp->b_bcount >> DEV_BSHIFT;
368	if (xdf_flush_block >= blkno && xdf_flush_block < (blkno + nblks)) {
369		xdf_fbrewrites++;
370		if (bp->b_flags & (B_PAGEIO | B_PHYS)) {
371			mapin = B_TRUE;
372			bp_mapin(bp);
373		}
374		bcopy(bp->b_un.b_addr +
375		    ((xdf_flush_block - blkno) << DEV_BSHIFT),
376		    vdp->xdf_cache_flush_block, DEV_BSIZE);
377		if (mapin)
378			bp_mapout(bp);
379	}
380}
381
382/*
383 * Initialize the DMA and grant table resources for the buf
384 */
385static int
386vreq_setup(xdf_t *vdp, v_req_t *vreq)
387{
388	int rc;
389	ddi_dma_attr_t dmaattr;
390	uint_t ndcs, ndws;
391	ddi_dma_handle_t dh;
392	ddi_dma_handle_t mdh;
393	ddi_dma_cookie_t dc;
394	ddi_acc_handle_t abh;
395	caddr_t	aba;
396	ge_slot_t *gs;
397	size_t bufsz;
398	off_t off;
399	size_t sz;
400	buf_t *bp = vreq->v_buf;
401	int dma_flags = (IS_READ(bp) ? DDI_DMA_READ : DDI_DMA_WRITE) |
402	    DDI_DMA_STREAMING | DDI_DMA_PARTIAL;
403
404	switch (vreq->v_status) {
405	case VREQ_INIT:
406		if (IS_FLUSH_DISKCACHE(bp)) {
407			if ((gs = gs_get(vdp, IS_READ(bp))) == NULL) {
408				DPRINTF(DMA_DBG, ("xdf@%s: "
409				    "get ge_slot failed\n", vdp->xdf_addr));
410				return (DDI_FAILURE);
411			}
412			vreq->v_blkno = 0;
413			vreq->v_nslots = 1;
414			vreq->v_flush_diskcache = FLUSH_DISKCACHE;
415			vreq->v_status = VREQ_GS_ALLOCED;
416			gs->gs_vreq = vreq;
417			list_insert_head(&vreq->v_gs, gs);
418			return (DDI_SUCCESS);
419		}
420
421		if (IS_WRITE_BARRIER(vdp, bp))
422			vreq->v_flush_diskcache = WRITE_BARRIER;
423		vreq->v_blkno = bp->b_blkno +
424		    (diskaddr_t)(uintptr_t)bp->b_private;
425		/* See if we wrote new data to our flush block */
426		if (!IS_READ(bp) && USE_WRITE_BARRIER(vdp))
427			check_fbwrite(vdp, bp, vreq->v_blkno);
428		vreq->v_status = VREQ_INIT_DONE;
429		/*FALLTHRU*/
430
431	case VREQ_INIT_DONE:
432		/*
433		 * alloc DMA handle
434		 */
435		rc = ddi_dma_alloc_handle(vdp->xdf_dip, &xb_dma_attr,
436		    xdf_dmacallback, (caddr_t)vdp, &dh);
437		if (rc != DDI_SUCCESS) {
438			SETDMACBON(vdp);
439			DPRINTF(DMA_DBG, ("xdf@%s: DMA handle alloc failed\n",
440			    vdp->xdf_addr));
441			return (DDI_FAILURE);
442		}
443
444		vreq->v_dmahdl = dh;
445		vreq->v_status = VREQ_DMAHDL_ALLOCED;
446		/*FALLTHRU*/
447
448	case VREQ_DMAHDL_ALLOCED:
449		/*
450		 * alloc dma handle for 512-byte aligned buf
451		 */
452		if (!ALIGNED_XFER(bp)) {
453			/*
454			 * XXPV: we need to temporarily enlarge the seg
455			 * boundary and s/g length to work around CR6381968
456			 */
457			dmaattr = xb_dma_attr;
458			dmaattr.dma_attr_seg = (uint64_t)-1;
459			dmaattr.dma_attr_sgllen = INT_MAX;
460			rc = ddi_dma_alloc_handle(vdp->xdf_dip, &dmaattr,
461			    xdf_dmacallback, (caddr_t)vdp, &mdh);
462			if (rc != DDI_SUCCESS) {
463				SETDMACBON(vdp);
464				DPRINTF(DMA_DBG, ("xdf@%s: "
465				    "unaligned buf DMAhandle alloc failed\n",
466				    vdp->xdf_addr));
467				return (DDI_FAILURE);
468			}
469			vreq->v_memdmahdl = mdh;
470			vreq->v_status = VREQ_MEMDMAHDL_ALLOCED;
471		}
472		/*FALLTHRU*/
473
474	case VREQ_MEMDMAHDL_ALLOCED:
475		/*
476		 * alloc 512-byte aligned buf
477		 */
478		if (!ALIGNED_XFER(bp)) {
479			if (bp->b_flags & (B_PAGEIO | B_PHYS))
480				bp_mapin(bp);
481			rc = ddi_dma_mem_alloc(vreq->v_memdmahdl,
482			    roundup(bp->b_bcount, XB_BSIZE), &xc_acc_attr,
483			    DDI_DMA_STREAMING, xdf_dmacallback, (caddr_t)vdp,
484			    &aba, &bufsz, &abh);
485			if (rc != DDI_SUCCESS) {
486				SETDMACBON(vdp);
487				DPRINTF(DMA_DBG, ("xdf@%s: "
488				    "DMA mem allocation failed\n",
489				    vdp->xdf_addr));
490				return (DDI_FAILURE);
491			}
492
493			vreq->v_abuf = aba;
494			vreq->v_align = abh;
495			vreq->v_status = VREQ_DMAMEM_ALLOCED;
496
497			ASSERT(bufsz >= bp->b_bcount);
498			if (!IS_READ(bp))
499				bcopy(bp->b_un.b_addr, vreq->v_abuf,
500				    bp->b_bcount);
501		}
502		/*FALLTHRU*/
503
504	case VREQ_DMAMEM_ALLOCED:
505		/*
506		 * dma bind
507		 */
508		if (ALIGNED_XFER(bp)) {
509			rc = ddi_dma_buf_bind_handle(vreq->v_dmahdl, bp,
510			    dma_flags, xdf_dmacallback, (caddr_t)vdp,
511			    &dc, &ndcs);
512		} else {
513			rc = ddi_dma_addr_bind_handle(vreq->v_dmahdl,
514			    NULL, vreq->v_abuf, bp->b_bcount, dma_flags,
515			    xdf_dmacallback, (caddr_t)vdp, &dc, &ndcs);
516		}
517		if (rc == DDI_DMA_MAPPED || rc == DDI_DMA_PARTIAL_MAP) {
518			/* get num of dma windows */
519			if (rc == DDI_DMA_PARTIAL_MAP) {
520				rc = ddi_dma_numwin(vreq->v_dmahdl, &ndws);
521				ASSERT(rc == DDI_SUCCESS);
522			} else {
523				ndws = 1;
524			}
525		} else {
526			SETDMACBON(vdp);
527			DPRINTF(DMA_DBG, ("xdf@%s: DMA bind failed\n",
528			    vdp->xdf_addr));
529			return (DDI_FAILURE);
530		}
531
532		vreq->v_dmac = dc;
533		vreq->v_dmaw = 0;
534		vreq->v_ndmacs = ndcs;
535		vreq->v_ndmaws = ndws;
536		vreq->v_nslots = ndws;
537		vreq->v_status = VREQ_DMABUF_BOUND;
538		/*FALLTHRU*/
539
540	case VREQ_DMABUF_BOUND:
541		/*
542		 * get a ge_slot; if gs_get() fails it registers the DMA
543		 * callback (unless one was set previously)
544		 */
545		if ((gs = gs_get(vdp, IS_READ(bp))) == NULL) {
546			DPRINTF(DMA_DBG, ("xdf@%s: get ge_slot failed\n",
547			    vdp->xdf_addr));
548			return (DDI_FAILURE);
549		}
550
551		vreq->v_status = VREQ_GS_ALLOCED;
552		gs->gs_vreq = vreq;
553		list_insert_head(&vreq->v_gs, gs);
554		break;
555
556	case VREQ_GS_ALLOCED:
557		/* nothing needs to be done */
558		break;
559
560	case VREQ_DMAWIN_DONE:
561		/*
562		 * move to the next dma window
563		 */
564		ASSERT((vreq->v_dmaw + 1) < vreq->v_ndmaws);
565
566		/* get a ge_slot for this DMA window */
567		if ((gs = gs_get(vdp, IS_READ(bp))) == NULL) {
568			DPRINTF(DMA_DBG, ("xdf@%s: get ge_slot failed\n",
569			    vdp->xdf_addr));
570			return (DDI_FAILURE);
571		}
572
573		vreq->v_dmaw++;
574		VERIFY(ddi_dma_getwin(vreq->v_dmahdl, vreq->v_dmaw, &off, &sz,
575		    &vreq->v_dmac, &vreq->v_ndmacs) == DDI_SUCCESS);
576		vreq->v_status = VREQ_GS_ALLOCED;
577		gs->gs_vreq = vreq;
578		list_insert_head(&vreq->v_gs, gs);
579		break;
580
581	default:
582		return (DDI_FAILURE);
583	}
584
585	return (DDI_SUCCESS);
586}
587
588static int
589xdf_cmlb_attach(xdf_t *vdp)
590{
591	dev_info_t	*dip = vdp->xdf_dip;
592
593	return (cmlb_attach(dip, &xdf_lb_ops,
594	    XD_IS_CD(vdp) ? DTYPE_RODIRECT : DTYPE_DIRECT,
595	    XD_IS_RM(vdp),
596	    B_TRUE,
597	    XD_IS_CD(vdp) ? DDI_NT_CD_XVMD : DDI_NT_BLOCK_XVMD,
598#if defined(XPV_HVM_DRIVER)
599	    (XD_IS_CD(vdp) ? 0 : CMLB_CREATE_ALTSLICE_VTOC_16_DTYPE_DIRECT) |
600	    CMLB_INTERNAL_MINOR_NODES,
601#else /* !XPV_HVM_DRIVER */
602	    XD_IS_CD(vdp) ? 0 : CMLB_FAKE_LABEL_ONE_PARTITION,
603#endif /* !XPV_HVM_DRIVER */
604	    vdp->xdf_vd_lbl, NULL));
605}
606
607static void
608xdf_io_err(buf_t *bp, int err, size_t resid)
609{
610	bioerror(bp, err);
611	if (resid == 0)
612		bp->b_resid = bp->b_bcount;
613	biodone(bp);
614}
615
616static void
617xdf_kstat_enter(xdf_t *vdp, buf_t *bp)
618{
619	v_req_t *vreq = BP_VREQ(bp);
620
621	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
622
623	if (vdp->xdf_xdev_iostat == NULL)
624		return;
625	if ((vreq != NULL) && vreq->v_runq) {
626		kstat_runq_enter(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
627	} else {
628		kstat_waitq_enter(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
629	}
630}
631
632static void
633xdf_kstat_exit(xdf_t *vdp, buf_t *bp)
634{
635	v_req_t *vreq = BP_VREQ(bp);
636
637	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
638
639	if (vdp->xdf_xdev_iostat == NULL)
640		return;
641	if ((vreq != NULL) && vreq->v_runq) {
642		kstat_runq_exit(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
643	} else {
644		kstat_waitq_exit(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
645	}
646}
647
648static void
649xdf_kstat_waitq_to_runq(xdf_t *vdp, buf_t *bp)
650{
651	v_req_t *vreq = BP_VREQ(bp);
652
653	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
654	ASSERT(!vreq->v_runq);
655
656	vreq->v_runq = B_TRUE;
657	if (vdp->xdf_xdev_iostat == NULL)
658		return;
659	kstat_waitq_to_runq(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
660}
661
662static void
663xdf_kstat_runq_to_waitq(xdf_t *vdp, buf_t *bp)
664{
665	v_req_t *vreq = BP_VREQ(bp);
666
667	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
668	ASSERT(vreq->v_runq);
669
670	vreq->v_runq = B_FALSE;
671	if (vdp->xdf_xdev_iostat == NULL)
672		return;
673	kstat_runq_back_to_waitq(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
674}
675
676int
677xdf_kstat_create(dev_info_t *dip, char *ks_module, int instance)
678{
679	xdf_t		*vdp = (xdf_t *)ddi_get_driver_private(dip);
680	kstat_t		*kstat;
681	buf_t		*bp;
682
683	if ((kstat = kstat_create(
684	    ks_module, instance, NULL, "disk",
685	    KSTAT_TYPE_IO, 1, KSTAT_FLAG_PERSISTENT)) == NULL)
686		return (-1);
687
688	/* See comment about locking in xdf_kstat_delete(). */
689	mutex_enter(&vdp->xdf_iostat_lk);
690	mutex_enter(&vdp->xdf_dev_lk);
691
692	/* only one kstat can exist at a time */
693	if (vdp->xdf_xdev_iostat != NULL) {
694		mutex_exit(&vdp->xdf_dev_lk);
695		mutex_exit(&vdp->xdf_iostat_lk);
696		kstat_delete(kstat);
697		return (-1);
698	}
699
700	vdp->xdf_xdev_iostat = kstat;
701	vdp->xdf_xdev_iostat->ks_lock = &vdp->xdf_dev_lk;
702	kstat_install(vdp->xdf_xdev_iostat);
703
704	/*
705	 * Now that we've created a kstat, we need to update the waitq and
706	 * runq counts for the kstat to reflect our current state.
707	 *
708	 * For a buf_t structure to be on the runq, it must have a ring
709	 * buffer slot associated with it.  To get a ring buffer slot the
710	 * buf must first have a v_req_t and a ge_slot_t associated with it.
711	 * Then when it is granted a ring buffer slot, v_runq will be set to
712	 * true.
713	 *
714	 * For a buf_t structure to be on the waitq, it must not be on the
715	 * runq.  So to find all the buf_t's that should be on waitq, we
716	 * walk the active buf list and add any buf_t's which aren't on the
717	 * runq to the waitq.
718	 */
719	bp = vdp->xdf_f_act;
720	while (bp != NULL) {
721		xdf_kstat_enter(vdp, bp);
722		bp = bp->av_forw;
723	}
724	if (vdp->xdf_ready_tq_bp != NULL)
725		xdf_kstat_enter(vdp, vdp->xdf_ready_tq_bp);
726
727	mutex_exit(&vdp->xdf_dev_lk);
728	mutex_exit(&vdp->xdf_iostat_lk);
729	return (0);
730}
731
732void
733xdf_kstat_delete(dev_info_t *dip)
734{
735	xdf_t		*vdp = (xdf_t *)ddi_get_driver_private(dip);
736	kstat_t		*kstat;
737	buf_t		*bp;
738
739	/*
740	 * The locking order here is xdf_iostat_lk and then xdf_dev_lk.
741	 * xdf_dev_lk is used to protect the xdf_xdev_iostat pointer
742	 * and the contents of our kstat.  xdf_iostat_lk is used
743	 * to protect the allocation and freeing of the actual kstat.
744	 * xdf_dev_lk can't be used for this purpose because kstat
745	 * readers use it to access the contents of the kstat and
746	 * hence it can't be held when calling kstat_delete().
747	 */
748	mutex_enter(&vdp->xdf_iostat_lk);
749	mutex_enter(&vdp->xdf_dev_lk);
750
751	if (vdp->xdf_xdev_iostat == NULL) {
752		mutex_exit(&vdp->xdf_dev_lk);
753		mutex_exit(&vdp->xdf_iostat_lk);
754		return;
755	}
756
757	/*
758	 * We're about to destroy the kstat structures, so it isn't really
759	 * necessary to update the runq and waitq counts.  But, since this
760	 * isn't a hot code path we can afford to be a little pedantic and
761	 * go ahead and decrement the runq and waitq kstat counters to zero
762	 * before freeing them.  This helps us ensure that we've gotten all
763	 * our accounting correct.
764	 *
765	 * For an explanation of how we determine which buffers go on the
766	 * runq vs which go on the waitq, see the comments in
767	 * xdf_kstat_create().
768	 */
769	bp = vdp->xdf_f_act;
770	while (bp != NULL) {
771		xdf_kstat_exit(vdp, bp);
772		bp = bp->av_forw;
773	}
774	if (vdp->xdf_ready_tq_bp != NULL)
775		xdf_kstat_exit(vdp, vdp->xdf_ready_tq_bp);
776
777	kstat = vdp->xdf_xdev_iostat;
778	vdp->xdf_xdev_iostat = NULL;
779	mutex_exit(&vdp->xdf_dev_lk);
780	kstat_delete(kstat);
781	mutex_exit(&vdp->xdf_iostat_lk);
782}
783
784/*
785 * Add an IO request onto the active queue.
786 *
787 * We have to detect IOs generated by xdf_ready_tq_thread.  These IOs
788 * are used to establish a connection to the backend, so they receive
789 * priority over all other IOs.  Since xdf_ready_tq_thread only does
790 * synchronous IO, there can only be one xdf_ready_tq_thread request at any
791 * given time and we record the buf associated with that request in
792 * xdf_ready_tq_bp.
793 */
794static void
795xdf_bp_push(xdf_t *vdp, buf_t *bp)
796{
797	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
798	ASSERT(bp->av_forw == NULL);
799
800	xdf_kstat_enter(vdp, bp);
801
802	if (curthread == vdp->xdf_ready_tq_thread) {
803		/* new IO requests from the ready thread */
804		ASSERT(vdp->xdf_ready_tq_bp == NULL);
805		vdp->xdf_ready_tq_bp = bp;
806		return;
807	}
808
809	/* this is a normal IO request */
810	ASSERT(bp != vdp->xdf_ready_tq_bp);
811
812	if (vdp->xdf_f_act == NULL) {
813		/* this is the only IO on the active queue */
814		ASSERT(vdp->xdf_l_act == NULL);
815		ASSERT(vdp->xdf_i_act == NULL);
816		vdp->xdf_f_act = vdp->xdf_l_act = vdp->xdf_i_act = bp;
817		return;
818	}
819
820	/* add this IO to the tail of the active queue */
821	vdp->xdf_l_act->av_forw = bp;
822	vdp->xdf_l_act = bp;
823	if (vdp->xdf_i_act == NULL)
824		vdp->xdf_i_act = bp;
825}
826
827static void
828xdf_bp_pop(xdf_t *vdp, buf_t *bp)
829{
830	buf_t	*bp_iter;
831
832	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
833	ASSERT(VREQ_DONE(BP_VREQ(bp)));
834
835	if (vdp->xdf_ready_tq_bp == bp) {
836		/* we're done with a ready thread IO request */
837		ASSERT(bp->av_forw == NULL);
838		vdp->xdf_ready_tq_bp = NULL;
839		return;
840	}
841
842	/* we're done with a normal IO request */
843	ASSERT((bp->av_forw != NULL) || (bp == vdp->xdf_l_act));
844	ASSERT((bp->av_forw == NULL) || (bp != vdp->xdf_l_act));
845	ASSERT(VREQ_DONE(BP_VREQ(vdp->xdf_f_act)));
846	ASSERT(vdp->xdf_f_act != vdp->xdf_i_act);
847
848	if (bp == vdp->xdf_f_act) {
849		/* This IO was at the head of our active queue. */
850		vdp->xdf_f_act = bp->av_forw;
851		if (bp == vdp->xdf_l_act)
852			vdp->xdf_l_act = NULL;
853	} else {
854		/* This IO finished before some other pending IOs. */
855		bp_iter = vdp->xdf_f_act;
856		while (bp != bp_iter->av_forw) {
857			bp_iter = bp_iter->av_forw;
858			ASSERT(VREQ_DONE(BP_VREQ(bp_iter)));
859			ASSERT(bp_iter != vdp->xdf_i_act);
860		}
861		bp_iter->av_forw = bp->av_forw;
862		if (bp == vdp->xdf_l_act)
863			vdp->xdf_l_act = bp_iter;
864	}
865	bp->av_forw = NULL;
866}
867
868static buf_t *
869xdf_bp_next(xdf_t *vdp)
870{
871	v_req_t	*vreq;
872	buf_t	*bp;
873
874	if (vdp->xdf_state == XD_CONNECTED) {
875		/*
876		 * If we're in the XD_CONNECTED state, we only service IOs
877		 * from the xdf_ready_tq_thread thread.
878		 */
879		if ((bp = vdp->xdf_ready_tq_bp) == NULL)
880			return (NULL);
881		if (((vreq = BP_VREQ(bp)) == NULL) || (!VREQ_DONE(vreq)))
882			return (bp);
883		return (NULL);
884	}
885
886	/* if we're not in the XD_CONNECTED or XD_READY state we can't do IO */
887	if (vdp->xdf_state != XD_READY)
888		return (NULL);
889
890	ASSERT(vdp->xdf_ready_tq_bp == NULL);
891	for (;;) {
892		if ((bp = vdp->xdf_i_act) == NULL)
893			return (NULL);
894		if (((vreq = BP_VREQ(bp)) == NULL) || (!VREQ_DONE(vreq)))
895			return (bp);
896
897		/* advance the active buf index pointer */
898		vdp->xdf_i_act = bp->av_forw;
899	}
900}
901
902static void
903xdf_io_fini(xdf_t *vdp, uint64_t id, int bioerr)
904{
905	ge_slot_t	*gs = (ge_slot_t *)(uintptr_t)id;
906	v_req_t		*vreq = gs->gs_vreq;
907	buf_t		*bp = vreq->v_buf;
908
909	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
910	ASSERT(BP_VREQ(bp) == vreq);
911
912	gs_free(gs);
913
914	if (bioerr != 0)
915		bioerror(bp, bioerr);
916	ASSERT(vreq->v_nslots > 0);
917	if (--vreq->v_nslots > 0)
918		return;
919
920	/* remove this IO from our active queue */
921	xdf_bp_pop(vdp, bp);
922
923	ASSERT(vreq->v_runq);
924	xdf_kstat_exit(vdp, bp);
925	vreq->v_runq = B_FALSE;
926	vreq_free(vdp, vreq);
927
928	if (IS_ERROR(bp)) {
929		xdf_io_err(bp, geterror(bp), 0);
930	} else if (bp->b_resid != 0) {
931		/* Partial transfers are an error */
932		xdf_io_err(bp, EIO, bp->b_resid);
933	} else {
934		biodone(bp);
935	}
936}
937
938/*
939 * xdf interrupt handler
940 */
941static uint_t
942xdf_intr_locked(xdf_t *vdp)
943{
944	xendev_ring_t *xbr;
945	blkif_response_t *resp;
946	int bioerr;
947	uint64_t id;
948	uint8_t op;
949	uint16_t status;
950	ddi_acc_handle_t acchdl;
951
952	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
953
954	if ((xbr = vdp->xdf_xb_ring) == NULL)
955		return (DDI_INTR_UNCLAIMED);
956
957	acchdl = vdp->xdf_xb_ring_hdl;
958
959	/*
960	 * complete all requests which have a response
961	 */
962	while (resp = xvdi_ring_get_response(xbr)) {
963		id = ddi_get64(acchdl, &resp->id);
964		op = ddi_get8(acchdl, &resp->operation);
965		status = ddi_get16(acchdl, (uint16_t *)&resp->status);
966		DPRINTF(INTR_DBG, ("resp: op %d id %"PRIu64" status %d\n",
967		    op, id, status));
968
969		if (status != BLKIF_RSP_OKAY) {
970			DPRINTF(IO_DBG, ("xdf@%s: I/O error while %s",
971			    vdp->xdf_addr,
972			    (op == BLKIF_OP_READ) ? "reading" : "writing"));
973			bioerr = EIO;
974		} else {
975			bioerr = 0;
976		}
977
978		xdf_io_fini(vdp, id, bioerr);
979	}
980	return (DDI_INTR_CLAIMED);
981}
982
983/*
984 * xdf_intr runs at PIL 5, so no one else can grab xdf_dev_lk and
985 * block at a lower pil.
986 */
987static uint_t
988xdf_intr(caddr_t arg)
989{
990	xdf_t *vdp = (xdf_t *)arg;
991	int rv;
992
993	mutex_enter(&vdp->xdf_dev_lk);
994	rv = xdf_intr_locked(vdp);
995	mutex_exit(&vdp->xdf_dev_lk);
996
997	if (!do_polled_io)
998		xdf_io_start(vdp);
999
1000	return (rv);
1001}
1002
1003static void
1004xdf_ring_push(xdf_t *vdp)
1005{
1006	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
1007
1008	if (vdp->xdf_xb_ring == NULL)
1009		return;
1010
1011	if (xvdi_ring_push_request(vdp->xdf_xb_ring)) {
1012		DPRINTF(IO_DBG, (
1013		    "xdf@%s: xdf_ring_push: sent request(s) to backend\n",
1014		    vdp->xdf_addr));
1015	}
1016
1017	if (xvdi_get_evtchn(vdp->xdf_dip) != INVALID_EVTCHN)
1018		xvdi_notify_oe(vdp->xdf_dip);
1019}
1020
1021static int
1022xdf_ring_drain_locked(xdf_t *vdp)
1023{
1024	int		pollc, rv = 0;
1025
1026	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
1027
1028	if (xdf_debug & SUSRES_DBG)
1029		xen_printf("xdf_ring_drain: start\n");
1030
1031	for (pollc = 0; pollc < XDF_DRAIN_RETRY_COUNT; pollc++) {
1032		if (vdp->xdf_xb_ring == NULL)
1033			goto out;
1034
1035		if (xvdi_ring_has_unconsumed_responses(vdp->xdf_xb_ring))
1036			(void) xdf_intr_locked(vdp);
1037		if (!xvdi_ring_has_incomp_request(vdp->xdf_xb_ring))
1038			goto out;
1039		xdf_ring_push(vdp);
1040
1041		/* file-backed devices can be slow */
1042		mutex_exit(&vdp->xdf_dev_lk);
1043#ifdef XPV_HVM_DRIVER
1044		(void) HYPERVISOR_yield();
1045#endif /* XPV_HVM_DRIVER */
1046		delay(drv_usectohz(XDF_DRAIN_MSEC_DELAY));
1047		mutex_enter(&vdp->xdf_dev_lk);
1048	}
1049	cmn_err(CE_WARN, "xdf@%s: xdf_ring_drain: timeout", vdp->xdf_addr);
1050
1051out:
1052	if (vdp->xdf_xb_ring != NULL) {
1053		if (xvdi_ring_has_incomp_request(vdp->xdf_xb_ring) ||
1054		    xvdi_ring_has_unconsumed_responses(vdp->xdf_xb_ring))
1055			rv = EIO;
1056	}
1057	if (xdf_debug & SUSRES_DBG)
1058		xen_printf("xdf@%s: xdf_ring_drain: end, err=%d\n",
1059		    vdp->xdf_addr, rv);
1060	return (rv);
1061}
1062
1063static int
1064xdf_ring_drain(xdf_t *vdp)
1065{
1066	int rv;
1067	mutex_enter(&vdp->xdf_dev_lk);
1068	rv = xdf_ring_drain_locked(vdp);
1069	mutex_exit(&vdp->xdf_dev_lk);
1070	return (rv);
1071}
1072
1073/*
1074 * Destroy all v_req_t, grant table entries, and our ring buffer.
1075 */
1076static void
1077xdf_ring_destroy(xdf_t *vdp)
1078{
1079	v_req_t		*vreq;
1080	buf_t		*bp;
1081	ge_slot_t	*gs;
1082
1083	ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
1084	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
1085
1086	if ((vdp->xdf_state != XD_INIT) &&
1087	    (vdp->xdf_state != XD_CONNECTED) &&
1088	    (vdp->xdf_state != XD_READY)) {
1089		ASSERT(vdp->xdf_xb_ring == NULL);
1090		ASSERT(vdp->xdf_xb_ring_hdl == NULL);
1091		ASSERT(vdp->xdf_peer == INVALID_DOMID);
1092		ASSERT(vdp->xdf_evtchn == INVALID_EVTCHN);
1093		ASSERT(list_is_empty(&vdp->xdf_vreq_act));
1094		return;
1095	}
1096
1097	/*
1098	 * We don't want to receive async notifications from the backend
1099	 * when it finishes processing ring entries.
1100	 */
1101#ifdef XPV_HVM_DRIVER
1102	ec_unbind_evtchn(vdp->xdf_evtchn);
1103#else /* !XPV_HVM_DRIVER */
1104	(void) ddi_remove_intr(vdp->xdf_dip, 0, NULL);
1105#endif /* !XPV_HVM_DRIVER */
1106
1107	/*
1108	 * Drain any requests in the ring.  We need to do this before we
1109	 * can free grant table entries, because if active ring entries
1110	 * point to grants, then the backend could be trying to access
1111	 * those grants.
1112	 */
1113	(void) xdf_ring_drain_locked(vdp);
1114
1115	/* We're done talking to the backend so free up our event channel */
1116	xvdi_free_evtchn(vdp->xdf_dip);
1117	vdp->xdf_evtchn = INVALID_EVTCHN;
1118
1119	while ((vreq = list_head(&vdp->xdf_vreq_act)) != NULL) {
1120		bp = vreq->v_buf;
1121		ASSERT(BP_VREQ(bp) == vreq);
1122
1123		/* Free up any grant table entries associated with this IO */
1124		while ((gs = list_head(&vreq->v_gs)) != NULL)
1125			gs_free(gs);
1126
1127		/* If this IO was on the runq, move it back to the waitq. */
1128		if (vreq->v_runq)
1129			xdf_kstat_runq_to_waitq(vdp, bp);
1130
1131		/*
1132		 * Reset any buf IO state since we're going to re-issue the
1133		 * IO when we reconnect.
1134		 */
1135		vreq_free(vdp, vreq);
1136		BP_VREQ_SET(bp, NULL);
1137		bioerror(bp, 0);
1138	}
1139
1140	/* reset the active queue index pointer */
1141	vdp->xdf_i_act = vdp->xdf_f_act;
1142
1143	/* Destroy the ring */
1144	xvdi_free_ring(vdp->xdf_xb_ring);
1145	vdp->xdf_xb_ring = NULL;
1146	vdp->xdf_xb_ring_hdl = NULL;
1147	vdp->xdf_peer = INVALID_DOMID;
1148}
1149
1150void
1151xdfmin(struct buf *bp)
1152{
1153	if (bp->b_bcount > xdf_maxphys)
1154		bp->b_bcount = xdf_maxphys;
1155}
1156
1157/*
1158 * Check if we have a pending "eject" media request.
1159 */
1160static int
1161xdf_eject_pending(xdf_t *vdp)
1162{
1163	dev_info_t	*dip = vdp->xdf_dip;
1164	char		*xsname, *str;
1165
1166	if (!vdp->xdf_media_req_supported)
1167		return (B_FALSE);
1168
1169	if (((xsname = xvdi_get_xsname(dip)) == NULL) ||
1170	    (xenbus_read_str(xsname, XBP_MEDIA_REQ, &str) != 0))
1171		return (B_FALSE);
1172
1173	if (strcmp(str, XBV_MEDIA_REQ_EJECT) != 0) {
1174		strfree(str);
1175		return (B_FALSE);
1176	}
1177	strfree(str);
1178	return (B_TRUE);
1179}
1180
1181/*
1182 * Generate a media request.
1183 */
1184static int
1185xdf_media_req(xdf_t *vdp, char *req, boolean_t media_required)
1186{
1187	dev_info_t	*dip = vdp->xdf_dip;
1188	char		*xsname;
1189
1190	/*
1191	 * We can't be holding xdf_dev_lk because xenbus_printf() can
1192	 * block while waiting for a PIL 1 interrupt message.  This
1193	 * would cause a deadlock with xdf_intr() which needs to grab
1194	 * xdf_dev_lk as well and runs at PIL 5.
1195	 */
1196	ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
1197	ASSERT(MUTEX_NOT_HELD(&vdp->xdf_dev_lk));
1198
1199	if ((xsname = xvdi_get_xsname(dip)) == NULL)
1200		return (ENXIO);
1201
1202	/* Check if we support media requests */
1203	if (!XD_IS_CD(vdp) || !vdp->xdf_media_req_supported)
1204		return (ENOTTY);
1205
1206	/* If an eject is pending then don't allow any new requests */
1207	if (xdf_eject_pending(vdp))
1208		return (ENXIO);
1209
1210	/* Make sure that there is media present */
1211	if (media_required && (vdp->xdf_xdev_nblocks == 0))
1212		return (ENXIO);
1213
1214	/* We only allow operations when the device is ready and connected */
1215	if (vdp->xdf_state != XD_READY)
1216		return (EIO);
1217
1218	if (xenbus_printf(XBT_NULL, xsname, XBP_MEDIA_REQ, "%s", req) != 0)
1219		return (EIO);
1220
1221	return (0);
1222}
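
/*
 * Illustrative (hypothetical) usage, e.g. from an eject ioctl path,
 * honoring the locking requirements noted above:
 *
 *	mutex_enter(&vdp->xdf_cb_lk);
 *	rv = xdf_media_req(vdp, XBV_MEDIA_REQ_EJECT, B_TRUE);
 *	mutex_exit(&vdp->xdf_cb_lk);
 */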
1223
1224/*
1225 * populate a single blkif_request_t w/ a buf
1226 */
1227static void
1228xdf_process_rreq(xdf_t *vdp, struct buf *bp, blkif_request_t *rreq)
1229{
1230	grant_ref_t	gr;
1231	uint8_t		fsect, lsect;
1232	size_t		bcnt;
1233	paddr_t		dma_addr;
1234	off_t		blk_off;
1235	dev_info_t	*dip = vdp->xdf_dip;
1236	blkif_vdev_t	vdev = xvdi_get_vdevnum(dip);
1237	v_req_t		*vreq = BP_VREQ(bp);
1238	uint64_t	blkno = vreq->v_blkno;
1239	uint_t		ndmacs = vreq->v_ndmacs;
1240	ddi_acc_handle_t acchdl = vdp->xdf_xb_ring_hdl;
1241	int		seg = 0;
1242	int		isread = IS_READ(bp);
1243	ge_slot_t	*gs = list_head(&vreq->v_gs);
1244
1245	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
1246	ASSERT(vreq->v_status == VREQ_GS_ALLOCED);
1247
1248	if (isread)
1249		ddi_put8(acchdl, &rreq->operation, BLKIF_OP_READ);
1250	else {
1251		switch (vreq->v_flush_diskcache) {
1252		case FLUSH_DISKCACHE:
1253			ddi_put8(acchdl, &rreq->operation,
1254			    BLKIF_OP_FLUSH_DISKCACHE);
1255			ddi_put16(acchdl, &rreq->handle, vdev);
1256			ddi_put64(acchdl, &rreq->id,
1257			    (uint64_t)(uintptr_t)(gs));
1258			ddi_put8(acchdl, &rreq->nr_segments, 0);
1259			vreq->v_status = VREQ_DMAWIN_DONE;
1260			return;
1261		case WRITE_BARRIER:
1262			ddi_put8(acchdl, &rreq->operation,
1263			    BLKIF_OP_WRITE_BARRIER);
1264			break;
1265		default:
1266			if (!vdp->xdf_wce)
1267				ddi_put8(acchdl, &rreq->operation,
1268				    BLKIF_OP_WRITE_BARRIER);
1269			else
1270				ddi_put8(acchdl, &rreq->operation,
1271				    BLKIF_OP_WRITE);
1272			break;
1273		}
1274	}
1275
1276	ddi_put16(acchdl, &rreq->handle, vdev);
1277	ddi_put64(acchdl, &rreq->sector_number, blkno);
1278	ddi_put64(acchdl, &rreq->id, (uint64_t)(uintptr_t)(gs));
1279
1280	/*
1281	 * loop until all segments are populated or we run out of dma cookies
1282	 */
1283	for (;;) {
1284		/*
1285		 * Each segment of a blkif request can transfer up to
1286		 * one 4K page of data.
1287		 */
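		/*
		 * Worked example (assuming XB_BSIZE is 512): a page-aligned
		 * 4K cookie yields blk_off = 0, fsect = 0, lsect = 7 (eight
		 * 512-byte sectors); a 1K cookie starting 512 bytes into a
		 * page yields blk_off = 512, fsect = 1, lsect = 2.
		 */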
1288		bcnt = vreq->v_dmac.dmac_size;
1289		dma_addr = vreq->v_dmac.dmac_laddress;
1290		blk_off = (uint_t)((paddr_t)XB_SEGOFFSET & dma_addr);
1291		fsect = blk_off >> XB_BSHIFT;
1292		lsect = fsect + (bcnt >> XB_BSHIFT) - 1;
1293
1294		ASSERT(bcnt <= PAGESIZE);
1295		ASSERT((bcnt % XB_BSIZE) == 0);
1296		ASSERT((blk_off & XB_BMASK) == 0);
1297		ASSERT(fsect < XB_MAX_SEGLEN / XB_BSIZE &&
1298		    lsect < XB_MAX_SEGLEN / XB_BSIZE);
1299
1300		gr = gs_grant(gs, PATOMA(dma_addr) >> PAGESHIFT);
1301		ddi_put32(acchdl, &rreq->seg[seg].gref, gr);
1302		ddi_put8(acchdl, &rreq->seg[seg].first_sect, fsect);
1303		ddi_put8(acchdl, &rreq->seg[seg].last_sect, lsect);
1304
1305		DPRINTF(IO_DBG, (
1306		    "xdf@%s: seg%d: dmacS %lu blk_off %ld\n",
1307		    vdp->xdf_addr, seg, vreq->v_dmac.dmac_size, blk_off));
1308		DPRINTF(IO_DBG, (
1309		    "xdf@%s: seg%d: fs %d ls %d gr %d dma 0x%"PRIx64"\n",
1310		    vdp->xdf_addr, seg, fsect, lsect, gr, dma_addr));
1311
1312		blkno += (bcnt >> XB_BSHIFT);
1313		seg++;
1314		ASSERT(seg <= BLKIF_MAX_SEGMENTS_PER_REQUEST);
1315		if (--ndmacs) {
1316			ddi_dma_nextcookie(vreq->v_dmahdl, &vreq->v_dmac);
1317			continue;
1318		}
1319
1320		vreq->v_status = VREQ_DMAWIN_DONE;
1321		vreq->v_blkno = blkno;
1322		break;
1323	}
1324	ddi_put8(acchdl,  &rreq->nr_segments, seg);
1325	DPRINTF(IO_DBG, (
1326	    "xdf@%s: xdf_process_rreq: request id=%"PRIx64" ready\n",
1327	    vdp->xdf_addr, rreq->id));
1328}
1329
1330static void
1331xdf_io_start(xdf_t *vdp)
1332{
1333	struct buf	*bp;
1334	v_req_t		*vreq;
1335	blkif_request_t	*rreq;
1336	boolean_t	rreqready = B_FALSE;
1337
1338	mutex_enter(&vdp->xdf_dev_lk);
1339
1340	/*
1341	 * Populate the ring request(s).  Loop until there is no buf to
1342	 * transfer or no free slot available in I/O ring.
1343	 */
1344	for (;;) {
1345		/* don't start any new IO if we're suspending */
1346		if (vdp->xdf_suspending)
1347			break;
1348		if ((bp = xdf_bp_next(vdp)) == NULL)
1349			break;
1350
1351		/* if the buf doesn't already have a vreq, allocate one */
1352		if (((vreq = BP_VREQ(bp)) == NULL) &&
1353		    ((vreq = vreq_get(vdp, bp)) == NULL))
1354			break;
1355
1356		/* alloc DMA/GTE resources */
1357		if (vreq_setup(vdp, vreq) != DDI_SUCCESS)
1358			break;
1359
1360		/* get next blkif_request in the ring */
1361		if ((rreq = xvdi_ring_get_request(vdp->xdf_xb_ring)) == NULL)
1362			break;
1363		bzero(rreq, sizeof (blkif_request_t));
1364		rreqready = B_TRUE;
1365
1366		/* populate blkif_request with this buf */
1367		xdf_process_rreq(vdp, bp, rreq);
1368
1369		/*
1370		 * This buffer/vreq pair has been allocated ring buffer
1371		 * resources, so if it isn't already on our runq, add it.
1372		 */
1373		if (!vreq->v_runq)
1374			xdf_kstat_waitq_to_runq(vdp, bp);
1375	}
1376
1377	/* Send the request(s) to the backend */
1378	if (rreqready)
1379		xdf_ring_push(vdp);
1380
1381	mutex_exit(&vdp->xdf_dev_lk);
1382}
1383
1384
1385/* check if a partition is open; -1 means check all partitions on the disk */
1386static boolean_t
1387xdf_isopen(xdf_t *vdp, int partition)
1388{
1389	int i;
1390	ulong_t parbit;
1391	boolean_t rval = B_FALSE;
1392
1393	ASSERT((partition == -1) ||
1394	    ((partition >= 0) && (partition < XDF_PEXT)));
1395
1396	if (partition == -1)
1397		parbit = (ulong_t)-1;
1398	else
1399		parbit = 1 << partition;
1400
1401	for (i = 0; i < OTYPCNT; i++) {
1402		if (vdp->xdf_vd_open[i] & parbit)
1403			rval = B_TRUE;
1404	}
1405
1406	return (rval);
1407}
1408
1409/*
1410 * The connection should never be closed as long as someone is holding
1411 * us open, there is pending IO, or someone is waiting for a
1412 * connection.
1413 */
1414static boolean_t
1415xdf_busy(xdf_t *vdp)
1416{
1417	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
1418
1419	if ((vdp->xdf_xb_ring != NULL) &&
1420	    xvdi_ring_has_unconsumed_responses(vdp->xdf_xb_ring)) {
1421		ASSERT(vdp->xdf_state != XD_CLOSED);
1422		return (B_TRUE);
1423	}
1424
1425	if (!list_is_empty(&vdp->xdf_vreq_act) || (vdp->xdf_f_act != NULL)) {
1426		ASSERT(vdp->xdf_state != XD_CLOSED);
1427		return (B_TRUE);
1428	}
1429
1430	if (xdf_isopen(vdp, -1)) {
1431		ASSERT(vdp->xdf_state != XD_CLOSED);
1432		return (B_TRUE);
1433	}
1434
1435	if (vdp->xdf_connect_req > 0) {
1436		ASSERT(vdp->xdf_state != XD_CLOSED);
1437		return (B_TRUE);
1438	}
1439
1440	return (B_FALSE);
1441}
1442
1443static void
1444xdf_set_state(xdf_t *vdp, xdf_state_t new_state)
1445{
1446	ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
1447	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
1448	DPRINTF(DDI_DBG, ("xdf@%s: state change %d -> %d\n",
1449	    vdp->xdf_addr, vdp->xdf_state, new_state));
1450	vdp->xdf_state = new_state;
1451	cv_broadcast(&vdp->xdf_dev_cv);
1452}
1453
1454static void
1455xdf_disconnect(xdf_t *vdp, xdf_state_t new_state, boolean_t quiet)
1456{
1457	dev_info_t	*dip = vdp->xdf_dip;
1458	boolean_t	busy;
1459
1460	ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
1461	ASSERT(MUTEX_NOT_HELD(&vdp->xdf_dev_lk));
1462	ASSERT((new_state == XD_UNKNOWN) || (new_state == XD_CLOSED));
1463
1464	/* Check if we're already there. */
1465	if (vdp->xdf_state == new_state)
1466		return;
1467
1468	mutex_enter(&vdp->xdf_dev_lk);
1469	busy = xdf_busy(vdp);
1470
1471	/* If we're already closed then there's nothing to do. */
1472	if (vdp->xdf_state == XD_CLOSED) {
1473		ASSERT(!busy);
1474		xdf_set_state(vdp, new_state);
1475		mutex_exit(&vdp->xdf_dev_lk);
1476		return;
1477	}
1478
1479#ifdef DEBUG
1480	/* UhOh.  Warn the user that something bad has happened. */
1481	if (!quiet && busy && (vdp->xdf_state == XD_READY) &&
1482	    (vdp->xdf_xdev_nblocks != 0)) {
1483		cmn_err(CE_WARN, "xdf@%s: disconnected while in use",
1484		    vdp->xdf_addr);
1485	}
1486#endif /* DEBUG */
1487
1488	xdf_ring_destroy(vdp);
1489
1490	/* If we're busy then we can only go into the unknown state */
1491	xdf_set_state(vdp, (busy) ? XD_UNKNOWN : new_state);
1492	mutex_exit(&vdp->xdf_dev_lk);
1493
1494	/* if we're closed now, let the other end know */
1495	if (vdp->xdf_state == XD_CLOSED)
1496		(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateClosed);
1497}
1498
1499
1500/*
1501 * Kick-off connect process
1502 * Status should be XD_UNKNOWN or XD_CLOSED
1503 * On success, status will be changed to XD_INIT
1504 * On error, it will be changed to XD_UNKNOWN
1505 */
1506static int
1507xdf_setstate_init(xdf_t *vdp)
1508{
1509	dev_info_t		*dip = vdp->xdf_dip;
1510	xenbus_transaction_t	xbt;
1511	grant_ref_t		gref;
1512	char			*xsname, *str;
1513	int 			rv;
1514
1515	ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
1516	ASSERT(MUTEX_NOT_HELD(&vdp->xdf_dev_lk));
1517	ASSERT((vdp->xdf_state == XD_UNKNOWN) ||
1518	    (vdp->xdf_state == XD_CLOSED));
1519
1520	DPRINTF(DDI_DBG,
1521	    ("xdf@%s: starting connection process\n", vdp->xdf_addr));
1522
1523	/*
1524	 * If an eject is pending then don't allow a new connection.
1525	 * (Only the backend can clear a pending eject media request.)
1526	 */
1527	if (xdf_eject_pending(vdp))
1528		return (DDI_FAILURE);
1529
1530	if ((xsname = xvdi_get_xsname(dip)) == NULL)
1531		goto errout;
1532
1533	if ((vdp->xdf_peer = xvdi_get_oeid(dip)) == INVALID_DOMID)
1534		goto errout;
1535
1536	(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateInitialising);
1537
1538	/*
1539	 * Sanity check for the existence of the xenbus device-type property.
1540	 * This property might not exist if our xenbus device nodes were
1541	 * forcibly destroyed while we were still connected to the backend.
1542	 */
1543	if (xenbus_read_str(xsname, XBP_DEV_TYPE, &str) != 0)
1544		goto errout;
1545	strfree(str);
1546
1547	if (xvdi_alloc_evtchn(dip) != DDI_SUCCESS)
1548		goto errout;
1549
1550	vdp->xdf_evtchn = xvdi_get_evtchn(dip);
1551#ifdef XPV_HVM_DRIVER
1552	ec_bind_evtchn_to_handler(vdp->xdf_evtchn, IPL_VBD, xdf_intr, vdp);
1553#else /* !XPV_HVM_DRIVER */
1554	if (ddi_add_intr(dip, 0, NULL, NULL, xdf_intr, (caddr_t)vdp) !=
1555	    DDI_SUCCESS) {
1556		cmn_err(CE_WARN, "xdf@%s: xdf_setstate_init: "
1557		    "failed to add intr handler", vdp->xdf_addr);
1558		goto errout1;
1559	}
1560#endif /* !XPV_HVM_DRIVER */
1561
1562	if (xvdi_alloc_ring(dip, BLKIF_RING_SIZE,
1563	    sizeof (union blkif_sring_entry), &gref, &vdp->xdf_xb_ring) !=
1564	    DDI_SUCCESS) {
1565		cmn_err(CE_WARN, "xdf@%s: failed to alloc comm ring",
1566		    vdp->xdf_addr);
1567		goto errout2;
1568	}
1569	vdp->xdf_xb_ring_hdl = vdp->xdf_xb_ring->xr_acc_hdl; /* ugly!! */
1570
1571	/*
1572	 * Write into xenstore the info needed by backend
1573	 */
1574trans_retry:
1575	if (xenbus_transaction_start(&xbt)) {
1576		cmn_err(CE_WARN, "xdf@%s: failed to start transaction",
1577		    vdp->xdf_addr);
1578		xvdi_fatal_error(dip, EIO, "connect transaction init");
1579		goto fail_trans;
1580	}
1581
1582	/*
1583	 * XBP_PROTOCOL is written by the domain builder in the case of PV
1584	 * domains. However, it is not written for HVM domains, so let's
1585	 * write it here.
1586	 */
1587	if (((rv = xenbus_printf(xbt, xsname,
1588	    XBP_MEDIA_REQ, "%s", XBV_MEDIA_REQ_NONE)) != 0) ||
1589	    ((rv = xenbus_printf(xbt, xsname,
1590	    XBP_RING_REF, "%u", gref)) != 0) ||
1591	    ((rv = xenbus_printf(xbt, xsname,
1592	    XBP_EVENT_CHAN, "%u", vdp->xdf_evtchn)) != 0) ||
1593	    ((rv = xenbus_printf(xbt, xsname,
1594	    XBP_PROTOCOL, "%s", XEN_IO_PROTO_ABI_NATIVE)) != 0) ||
1595	    ((rv = xvdi_switch_state(dip, xbt, XenbusStateInitialised)) > 0)) {
1596		(void) xenbus_transaction_end(xbt, 1);
1597		xvdi_fatal_error(dip, rv, "connect transaction setup");
1598		goto fail_trans;
1599	}
1600
1601	/* kick-off connect process */
1602	if (rv = xenbus_transaction_end(xbt, 0)) {
1603		if (rv == EAGAIN)
1604			goto trans_retry;
1605		xvdi_fatal_error(dip, rv, "connect transaction commit");
1606		goto fail_trans;
1607	}
1608
1609	ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
1610	mutex_enter(&vdp->xdf_dev_lk);
1611	xdf_set_state(vdp, XD_INIT);
1612	mutex_exit(&vdp->xdf_dev_lk);
1613
1614	return (DDI_SUCCESS);
1615
1616fail_trans:
1617	xvdi_free_ring(vdp->xdf_xb_ring);
1618errout2:
1619#ifdef XPV_HVM_DRIVER
1620	ec_unbind_evtchn(vdp->xdf_evtchn);
1621#else /* !XPV_HVM_DRIVER */
1622	(void) ddi_remove_intr(vdp->xdf_dip, 0, NULL);
1623#endif /* !XPV_HVM_DRIVER */
1624errout1:
1625	xvdi_free_evtchn(dip);
1626	vdp->xdf_evtchn = INVALID_EVTCHN;
1627errout:
1628	xdf_disconnect(vdp, XD_UNKNOWN, B_FALSE);
1629	cmn_err(CE_WARN, "xdf@%s: failed to start connection to backend",
1630	    vdp->xdf_addr);
1631	return (DDI_FAILURE);
1632}
1633
1634int
1635xdf_get_flush_block(xdf_t *vdp)
1636{
1637	/*
1638	 * Get a sector-size (xdf_xdev_secsize) aligned buffer
1639	 */
1640	vdp->xdf_flush_mem = kmem_alloc(vdp->xdf_xdev_secsize * 2, KM_SLEEP);
1641	vdp->xdf_cache_flush_block =
1642	    (char *)P2ROUNDUP((uintptr_t)(vdp->xdf_flush_mem),
1643	    (int)vdp->xdf_xdev_secsize);
1644
1645	if (xdf_lb_rdwr(vdp->xdf_dip, TG_READ, vdp->xdf_cache_flush_block,
1646	    xdf_flush_block, vdp->xdf_xdev_secsize, NULL) != 0)
1647		return (DDI_FAILURE);
1648	return (DDI_SUCCESS);
1649}
1650
1651static void
1652xdf_setstate_ready(void *arg)
1653{
1654	xdf_t	*vdp = (xdf_t *)arg;
1655
1656	vdp->xdf_ready_tq_thread = curthread;
1657
1658	/*
1659	 * We've created all the minor nodes via cmlb_attach() using default
1660	 * values in xdf_attach() so that it is possible to block in xdf_open(),
1661	 * in case anyone (say, the booting thread) tries to open the device
1662	 * before we are connected to the backend.  Now that we are almost
1663	 * connected, refresh those minor nodes with the latest info we have.
1664	 */
1665	mutex_enter(&vdp->xdf_dev_lk);
1666	if (vdp->xdf_cmbl_reattach) {
1667		vdp->xdf_cmbl_reattach = B_FALSE;
1668
1669		mutex_exit(&vdp->xdf_dev_lk);
1670		if (xdf_cmlb_attach(vdp) != 0) {
1671			xdf_disconnect(vdp, XD_UNKNOWN, B_FALSE);
1672			return;
1673		}
1674		mutex_enter(&vdp->xdf_dev_lk);
1675	}
1676
1677	/* If we're not still trying to get to the ready state, then bail. */
1678	if (vdp->xdf_state != XD_CONNECTED) {
1679		mutex_exit(&vdp->xdf_dev_lk);
1680		return;
1681	}
1682	mutex_exit(&vdp->xdf_dev_lk);
1683
1684	/*
1685	 * If backend has feature-barrier, see if it supports disk
1686	 * cache flush op.
1687	 */
1688	vdp->xdf_flush_supported = B_FALSE;
1689	if (vdp->xdf_feature_barrier) {
1690		/*
1691		 * Pretend we already know flush is supported so probe
1692		 * will attempt the correct op.
1693		 */
1694		vdp->xdf_flush_supported = B_TRUE;
1695		if (xdf_lb_rdwr(vdp->xdf_dip, TG_WRITE, NULL, 0, 0, 0) == 0) {
1696			vdp->xdf_flush_supported = B_TRUE;
1697		} else {
1698			vdp->xdf_flush_supported = B_FALSE;
1699			/*
1700			 * If the other end does not support the cache flush op
1701			 * then we must use a barrier-write to force disk
1702			 * cache flushing.  Barrier writes require that a data
1703			 * block actually be written.
1704			 * Cache a block to barrier-write when we are
1705			 * asked to perform a flush.
1706			 * XXX - would it be better to just copy 1 block
1707			 * (512 bytes) from whatever write we did last
1708			 * and rewrite that block?
1709			 */
1710			if (xdf_get_flush_block(vdp) != DDI_SUCCESS) {
1711				xdf_disconnect(vdp, XD_UNKNOWN, B_FALSE);
1712				return;
1713			}
1714		}
1715	}
1716
1717	mutex_enter(&vdp->xdf_cb_lk);
1718	mutex_enter(&vdp->xdf_dev_lk);
1719	if (vdp->xdf_state == XD_CONNECTED)
1720		xdf_set_state(vdp, XD_READY);
1721	mutex_exit(&vdp->xdf_dev_lk);
1722
1723	/* Restart any currently queued up io */
1724	xdf_io_start(vdp);
1725
1726	mutex_exit(&vdp->xdf_cb_lk);
1727}
1728
1729/*
1730 * synthetic geometry
1731 */
1732#define	XDF_NSECTS	256
1733#define	XDF_NHEADS	16
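
/*
 * For example, with this fake geometry a 20 GB vbd (41943040 512-byte
 * blocks) is presented as 41943040 / (16 * 256) = 10240 cylinders.
 */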
1734
1735static void
1736xdf_synthetic_pgeom(dev_info_t *dip, cmlb_geom_t *geomp)
1737{
1738	xdf_t *vdp;
1739	uint_t ncyl;
1740
1741	vdp = ddi_get_soft_state(xdf_ssp, ddi_get_instance(dip));
1742
1743	ncyl = vdp->xdf_xdev_nblocks / (XDF_NHEADS * XDF_NSECTS);
1744
1745	bzero(geomp, sizeof (*geomp));
1746	geomp->g_ncyl = ncyl == 0 ? 1 : ncyl;
1747	geomp->g_acyl = 0;
1748	geomp->g_nhead = XDF_NHEADS;
1749	geomp->g_nsect = XDF_NSECTS;
1750	geomp->g_secsize = vdp->xdf_xdev_secsize;
1751	geomp->g_capacity = vdp->xdf_xdev_nblocks;
1752	geomp->g_intrlv = 0;
1753	geomp->g_rpm = 7200;
1754}
1755
1756/*
1757 * Finish other initialization after we've connected to backend
1758 * Status should be XD_INIT before calling this routine
1759 * On success, status should be changed to XD_CONNECTED.
1760 * On error, status should stay XD_INIT
1761 */
1762static int
1763xdf_setstate_connected(xdf_t *vdp)
1764{
1765	dev_info_t	*dip = vdp->xdf_dip;
1766	cmlb_geom_t	pgeom;
1767	diskaddr_t	nblocks = 0;
1768	uint_t		secsize = 0;
1769	char		*oename, *xsname, *str;
1770	uint_t		dinfo;
1771
1772	ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
1773	ASSERT(MUTEX_NOT_HELD(&vdp->xdf_dev_lk));
1774	ASSERT(vdp->xdf_state == XD_INIT);
1775
1776	if (((xsname = xvdi_get_xsname(dip)) == NULL) ||
1777	    ((oename = xvdi_get_oename(dip)) == NULL))
1778		return (DDI_FAILURE);
1779
1780	/* Make sure the other end is XenbusStateConnected */
1781	if (xenbus_read_driver_state(oename) != XenbusStateConnected)
1782		return (DDI_FAILURE);
1783
1784	/* Determine if feature barrier is supported by backend */
1785	if (!(vdp->xdf_feature_barrier = xenbus_exists(oename, XBP_FB)))
1786		cmn_err(CE_NOTE, "!xdf@%s: feature-barrier not supported",
1787		    vdp->xdf_addr);
1788
1789	/*
1790	 * Probe backend.  Read the device size into xdf_xdev_nblocks
1791	 * and set the VDISK_READONLY, VDISK_CDROM, and VDISK_REMOVABLE
1792	 * flags in xdf_dinfo.  If the emulated device type is "cdrom",
1793	 * we always set VDISK_CDROM, regardless of whether it's present in
1794	 * the xenbus info parameter.
1795	 */
1796	if (xenbus_gather(XBT_NULL, oename,
1797	    XBP_SECTORS, "%"SCNu64, &nblocks,
1798	    XBP_SECTOR_SIZE, "%u", &secsize,
1799	    XBP_INFO, "%u", &dinfo,
1800	    NULL) != 0) {
1801		cmn_err(CE_WARN, "xdf@%s: xdf_setstate_connected: "
1802		    "cannot read backend info", vdp->xdf_addr);
1803		return (DDI_FAILURE);
1804	}
1805	if (xenbus_read_str(xsname, XBP_DEV_TYPE, &str) != 0) {
1806		cmn_err(CE_WARN, "xdf@%s: cannot read device-type",
1807		    vdp->xdf_addr);
1808		return (DDI_FAILURE);
1809	}
1810	if (strcmp(str, XBV_DEV_TYPE_CD) == 0)
1811		dinfo |= VDISK_CDROM;
1812	strfree(str);
1813
1814	if (secsize == 0 || !(ISP2(secsize / DEV_BSIZE)))
1815		secsize = DEV_BSIZE;
1816	vdp->xdf_xdev_nblocks = nblocks;
1817	vdp->xdf_xdev_secsize = secsize;
1818#ifdef _ILP32
1819	if (vdp->xdf_xdev_nblocks > DK_MAX_BLOCKS) {
1820		cmn_err(CE_WARN, "xdf@%s: xdf_setstate_connected: "
1821		    "backend disk device too large with %llu blocks for"
1822		    " 32-bit kernel", vdp->xdf_addr, vdp->xdf_xdev_nblocks);
1823		xvdi_fatal_error(dip, EFBIG, "reading backend info");
1824		return (DDI_FAILURE);
1825	}
1826#endif
1827
1828	/*
1829	 * If the physical geometry for a fixed disk has been explicitly
1830	 * set then make sure that the specified physical geometry isn't
1831	 * larger than the device we connected to.
1832	 */
1833	if (vdp->xdf_pgeom_fixed &&
1834	    (vdp->xdf_pgeom.g_capacity > vdp->xdf_xdev_nblocks)) {
1835		cmn_err(CE_WARN,
1836		    "xdf@%s: connect failed, fixed geometry too large",
1837		    vdp->xdf_addr);
1838		return (DDI_FAILURE);
1839	}
1840
1841	vdp->xdf_media_req_supported = xenbus_exists(oename, XBP_MEDIA_REQ_SUP);
1842
1843	/* mark the vbd as ready for I/O */
1844	mutex_enter(&vdp->xdf_dev_lk);
1845	xdf_set_state(vdp, XD_CONNECTED);
1846
1847	/* check if the cmlb label should be updated */
1848	xdf_synthetic_pgeom(dip, &pgeom);
1849	if ((vdp->xdf_dinfo != dinfo) ||
1850	    (!vdp->xdf_pgeom_fixed &&
1851	    (memcmp(&vdp->xdf_pgeom, &pgeom, sizeof (pgeom)) != 0))) {
1852		vdp->xdf_cmbl_reattach = B_TRUE;
1853
1854		vdp->xdf_dinfo = dinfo;
1855		if (!vdp->xdf_pgeom_fixed)
1856			vdp->xdf_pgeom = pgeom;
1857	}
1858
1859	if (XD_IS_CD(vdp) || XD_IS_RM(vdp)) {
1860		if (vdp->xdf_xdev_nblocks == 0) {
1861			vdp->xdf_mstate = DKIO_EJECTED;
1862			cv_broadcast(&vdp->xdf_mstate_cv);
1863		} else {
1864			vdp->xdf_mstate = DKIO_INSERTED;
1865			cv_broadcast(&vdp->xdf_mstate_cv);
1866		}
1867	} else {
1868		if (vdp->xdf_mstate != DKIO_NONE) {
1869			vdp->xdf_mstate = DKIO_NONE;
1870			cv_broadcast(&vdp->xdf_mstate_cv);
1871		}
1872	}
1873
1874	mutex_exit(&vdp->xdf_dev_lk);
1875
1876	cmn_err(CE_CONT, "?xdf@%s: %"PRIu64" blocks", vdp->xdf_addr,
1877	    (uint64_t)vdp->xdf_xdev_nblocks);
1878
1879	/* Restart any currently queued up io */
1880	xdf_io_start(vdp);
1881
1882	/*
1883	 * To get to the ready state we have to do IO to the backend device,
1884	 * but we can't initiate IO from the other end change callback thread
1885	 * (which is the current context we're executing in.)  This is because
1886	 * if the other end disconnects while we're doing IO from the callback
1887	 * thread, then we can't receive that disconnect event and we hang
1888	 * waiting for an IO that can never complete.
1889	 */
1890	(void) ddi_taskq_dispatch(vdp->xdf_ready_tq, xdf_setstate_ready, vdp,
1891	    DDI_SLEEP);
1892
1893	(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateConnected);
1894	return (DDI_SUCCESS);
1895}
1896
1897/*ARGSUSED*/
1898static void
1899xdf_oe_change(dev_info_t *dip, ddi_eventcookie_t id, void *arg, void *impl_data)
1900{
1901	XenbusState new_state = *(XenbusState *)impl_data;
1902	xdf_t *vdp = (xdf_t *)ddi_get_driver_private(dip);
1903
1904	DPRINTF(DDI_DBG, ("xdf@%s: otherend state change to %d!\n",
1905	    vdp->xdf_addr, new_state));
1906
1907	mutex_enter(&vdp->xdf_cb_lk);
1908
1909	/* We assume that this callback is single threaded */
1910	ASSERT(vdp->xdf_oe_change_thread == NULL);
1911	DEBUG_EVAL(vdp->xdf_oe_change_thread = curthread);
1912
1913	/* ignore any backend state changes if we're suspending/suspended */
1914	if (vdp->xdf_suspending || (vdp->xdf_state == XD_SUSPEND)) {
1915		DEBUG_EVAL(vdp->xdf_oe_change_thread = NULL);
1916		mutex_exit(&vdp->xdf_cb_lk);
1917		return;
1918	}
1919
1920	switch (new_state) {
1921	case XenbusStateUnknown:
1922	case XenbusStateInitialising:
1923	case XenbusStateInitWait:
1924	case XenbusStateInitialised:
1925		if (vdp->xdf_state == XD_INIT)
1926			break;
1927
1928		xdf_disconnect(vdp, XD_UNKNOWN, B_FALSE);
1929		if (xdf_setstate_init(vdp) != DDI_SUCCESS)
1930			break;
1931		ASSERT(vdp->xdf_state == XD_INIT);
1932		break;
1933
1934	case XenbusStateConnected:
1935		if ((vdp->xdf_state == XD_CONNECTED) ||
1936		    (vdp->xdf_state == XD_READY))
1937			break;
1938
1939		if (vdp->xdf_state != XD_INIT) {
1940			xdf_disconnect(vdp, XD_UNKNOWN, B_FALSE);
1941			if (xdf_setstate_init(vdp) != DDI_SUCCESS)
1942				break;
1943			ASSERT(vdp->xdf_state == XD_INIT);
1944		}
1945
1946		if (xdf_setstate_connected(vdp) != DDI_SUCCESS) {
1947			xdf_disconnect(vdp, XD_UNKNOWN, B_FALSE);
1948			break;
1949		}
1950		ASSERT(vdp->xdf_state == XD_CONNECTED);
1951		break;
1952
1953	case XenbusStateClosing:
1954		if (xdf_isopen(vdp, -1)) {
1955			cmn_err(CE_NOTE,
1956			    "xdf@%s: hot-unplug failed, still in use",
1957			    vdp->xdf_addr);
1958			break;
1959		}
1960		/*FALLTHROUGH*/
1961	case XenbusStateClosed:
1962		xdf_disconnect(vdp, XD_CLOSED, B_FALSE);
1963		break;
1964	}
1965
1966	/* notify anybody waiting for oe state change */
1967	cv_broadcast(&vdp->xdf_dev_cv);
1968	DEBUG_EVAL(vdp->xdf_oe_change_thread = NULL);
1969	mutex_exit(&vdp->xdf_cb_lk);
1970}
1971
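/*
 * Try to move the device into the XD_READY state, optionally blocking
 * until the connection attempt resolves.  Only one thread at a time acts
 * as the connection thread; if a connection isn't established within the
 * current reset interval that thread disconnects, doubles the interval,
 * and tries again.  Returns the resulting device state.  The caller must
 * hold both xdf_cb_lk and xdf_dev_lk.
 */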
1972static int
1973xdf_connect_locked(xdf_t *vdp, boolean_t wait)
1974{
1975	int	rv, timeouts = 0, reset = 20;
1976
1977	ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
1978	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
1979
1980	/* we can't connect once we're in the closed state */
1981	if (vdp->xdf_state == XD_CLOSED)
1982		return (XD_CLOSED);
1983
1984	vdp->xdf_connect_req++;
1985	while (vdp->xdf_state != XD_READY) {
1986		mutex_exit(&vdp->xdf_dev_lk);
1987
1988		/* only one thread at a time can be the connection thread */
1989		if (vdp->xdf_connect_thread == NULL)
1990			vdp->xdf_connect_thread = curthread;
1991
1992		if (vdp->xdf_connect_thread == curthread) {
1993			if ((timeouts > 0) && ((timeouts % reset) == 0)) {
1994				/*
1995				 * If we haven't established a connection
1996				 * within the reset time, then disconnect
1997				 * so we can try again, and double the reset
1998				 * time.  The reset time starts at 2 sec.
1999				 */
2000				(void) xdf_disconnect(vdp, XD_UNKNOWN, B_TRUE);
2001				reset *= 2;
2002			}
2003			if (vdp->xdf_state == XD_UNKNOWN)
2004				(void) xdf_setstate_init(vdp);
2005			if (vdp->xdf_state == XD_INIT)
2006				(void) xdf_setstate_connected(vdp);
2007		}
2008
2009		mutex_enter(&vdp->xdf_dev_lk);
2010		if (!wait || (vdp->xdf_state == XD_READY))
2011			goto out;
2012
2013		mutex_exit((&vdp->xdf_cb_lk));
2014		if (vdp->xdf_connect_thread != curthread) {
2015			rv = cv_wait_sig(&vdp->xdf_dev_cv, &vdp->xdf_dev_lk);
2016		} else {
2017			/* delay for 0.1 sec */
2018			rv = cv_reltimedwait_sig(&vdp->xdf_dev_cv,
2019			    &vdp->xdf_dev_lk, drv_usectohz(100*1000),
2020			    TR_CLOCK_TICK);
2021			if (rv == -1)
2022				timeouts++;
2023		}
2024		mutex_exit((&vdp->xdf_dev_lk));
2025		mutex_enter((&vdp->xdf_cb_lk));
2026		mutex_enter((&vdp->xdf_dev_lk));
2027		if (rv == 0)
2028			goto out;
2029	}
2030
2031out:
2032	ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
2033	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
2034
2035	if (vdp->xdf_connect_thread == curthread) {
2036		/*
2037		 * wake up someone else so they can become the connection
2038		 * thread.
2039		 */
2040		cv_signal(&vdp->xdf_dev_cv);
2041		vdp->xdf_connect_thread = NULL;
2042	}
2043
2044	/* Try to lock the media */
2045	mutex_exit((&vdp->xdf_dev_lk));
2046	(void) xdf_media_req(vdp, XBV_MEDIA_REQ_LOCK, B_TRUE);
2047	mutex_enter((&vdp->xdf_dev_lk));
2048
2049	vdp->xdf_connect_req--;
2050	return (vdp->xdf_state);
2051}
2052
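/*
 * Soft interrupt handler: clear the pending DMA callback flag and restart
 * any queued I/O.
 */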
2053static uint_t
2054xdf_iorestart(caddr_t arg)
2055{
2056	xdf_t *vdp = (xdf_t *)arg;
2057
2058	ASSERT(vdp != NULL);
2059
2060	mutex_enter(&vdp->xdf_dev_lk);
2061	ASSERT(ISDMACBON(vdp));
2062	SETDMACBOFF(vdp);
2063	mutex_exit(&vdp->xdf_dev_lk);
2064
2065	xdf_io_start(vdp);
2066
2067	return (DDI_INTR_CLAIMED);
2068}
2069
2070#if defined(XPV_HVM_DRIVER)
2071
2072typedef struct xdf_hvm_entry {
2073	list_node_t	xdf_he_list;
2074	char		*xdf_he_path;
2075	dev_info_t	*xdf_he_dip;
2076} xdf_hvm_entry_t;
2077
2078static list_t xdf_hvm_list;
2079static kmutex_t xdf_hvm_list_lock;
2080
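/*
 * Look up an entry in xdf_hvm_list by device path and/or dip.  The caller
 * must hold xdf_hvm_list_lock.
 */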
2081static xdf_hvm_entry_t *
2082i_xdf_hvm_find(const char *path, dev_info_t *dip)
2083{
2084	xdf_hvm_entry_t	*i;
2085
2086	ASSERT((path != NULL) || (dip != NULL));
2087	ASSERT(MUTEX_HELD(&xdf_hvm_list_lock));
2088
2089	i = list_head(&xdf_hvm_list);
2090	while (i != NULL) {
2091		if ((path != NULL) && strcmp(i->xdf_he_path, path) != 0) {
2092			i = list_next(&xdf_hvm_list, i);
2093			continue;
2094		}
2095		if ((dip != NULL) && (i->xdf_he_dip != dip)) {
2096			i = list_next(&xdf_hvm_list, i);
2097			continue;
2098		}
2099		break;
2100	}
2101	return (i);
2102}
2103
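/*
 * Find the xdf instance registered for the given device path and, if one
 * exists, return its dip with a hold placed on it.  Returns NULL if no
 * matching instance is found.
 */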
2104dev_info_t *
2105xdf_hvm_hold(const char *path)
2106{
2107	xdf_hvm_entry_t	*i;
2108	dev_info_t	*dip;
2109
2110	mutex_enter(&xdf_hvm_list_lock);
2111	i = i_xdf_hvm_find(path, NULL);
2112	if (i == NULL) {
2113		mutex_exit(&xdf_hvm_list_lock);
2114		return (NULL);
2115	}
2116	ndi_hold_devi(dip = i->xdf_he_dip);
2117	mutex_exit(&xdf_hvm_list_lock);
2118	return (dip);
2119}
2120
2121static void
2122xdf_hvm_add(dev_info_t *dip)
2123{
2124	xdf_hvm_entry_t	*i;
2125	char		*path;
2126
2127	/* figure out the path for the dip */
2128	path = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
2129	(void) ddi_pathname(dip, path);
2130
2131	i = kmem_alloc(sizeof (*i), KM_SLEEP);
2132	i->xdf_he_dip = dip;
2133	i->xdf_he_path = i_ddi_strdup(path, KM_SLEEP);
2134
2135	mutex_enter(&xdf_hvm_list_lock);
2136	ASSERT(i_xdf_hvm_find(path, NULL) == NULL);
2137	ASSERT(i_xdf_hvm_find(NULL, dip) == NULL);
2138	list_insert_head(&xdf_hvm_list, i);
2139	mutex_exit(&xdf_hvm_list_lock);
2140
2141	kmem_free(path, MAXPATHLEN);
2142}
2143
2144static void
2145xdf_hvm_rm(dev_info_t *dip)
2146{
2147	xdf_hvm_entry_t	*i;
2148
2149	mutex_enter(&xdf_hvm_list_lock);
2150	VERIFY((i = i_xdf_hvm_find(NULL, dip)) != NULL);
2151	list_remove(&xdf_hvm_list, i);
2152	mutex_exit(&xdf_hvm_list_lock);
2153
2154	kmem_free(i->xdf_he_path, strlen(i->xdf_he_path) + 1);
2155	kmem_free(i, sizeof (*i));
2156}
2157
2158static void
2159xdf_hvm_init(void)
2160{
2161	list_create(&xdf_hvm_list, sizeof (xdf_hvm_entry_t),
2162	    offsetof(xdf_hvm_entry_t, xdf_he_list));
2163	mutex_init(&xdf_hvm_list_lock, NULL, MUTEX_DEFAULT, NULL);
2164}
2165
2166static void
2167xdf_hvm_fini(void)
2168{
2169	ASSERT(list_head(&xdf_hvm_list) == NULL);
2170	list_destroy(&xdf_hvm_list);
2171	mutex_destroy(&xdf_hvm_list_lock);
2172}
2173
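/*
 * Wait for the backend hotplug scripts to finish running and then attempt
 * to connect to the backend device.  Returns B_TRUE if the device reaches
 * the XD_READY state.
 */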
2174boolean_t
2175xdf_hvm_connect(dev_info_t *dip)
2176{
2177	xdf_t	*vdp = (xdf_t *)ddi_get_driver_private(dip);
2178	char	*oename, *str;
2179	int	rv;
2180
2181	mutex_enter(&vdp->xdf_cb_lk);
2182
2183	/*
2184	 * Before trying to establish a connection we need to wait for the
2185	 * backend hotplug scripts to have run.  Once they have run, the
2186	 * "<oename>/hotplug-status" property will be set to "connected".
2187	 */
2188	for (;;) {
2189		ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
2190
2191		/*
2192		 * Get the xenbus path to the backend device.  Note that
2193		 * we can't cache this path (and we look it up on each pass
2194		 * through this loop) because it could change during
2195		 * suspend, resume, and migration operations.
2196		 */
2197		if ((oename = xvdi_get_oename(dip)) == NULL) {
2198			mutex_exit(&vdp->xdf_cb_lk);
2199			return (B_FALSE);
2200		}
2201
2202		str = NULL;
2203		if ((xenbus_read_str(oename, XBP_HP_STATUS, &str) == 0) &&
2204		    (strcmp(str, XBV_HP_STATUS_CONN) == 0))
2205			break;
2206
2207		if (str != NULL)
2208			strfree(str);
2209
2210		/* wait for an update to "<oename>/hotplug-status" */
2211		if (cv_wait_sig(&vdp->xdf_hp_status_cv, &vdp->xdf_cb_lk) == 0) {
2212			/* we got interrupted by a signal */
2213			mutex_exit(&vdp->xdf_cb_lk);
2214			return (B_FALSE);
2215		}
2216	}
2217
2218	/* Good news.  The backend hotplug scripts have been run. */
2219	ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
2220	ASSERT(strcmp(str, XBV_HP_STATUS_CONN) == 0);
2221	strfree(str);
2222
2223	/*
2224	 * If we're emulating a cd device and if the backend doesn't support
2225	 * media request operations, then we're not going to bother trying
2226	 * to establish a connection, for a couple of reasons.  First off,
2227	 * media request support is required for operations like eject and
2228	 * media locking.  Second, other backend platforms like Linux don't
2229	 * support hvm pv cdrom access.  They don't even have a backend pv
2230	 * driver for cdrom device nodes, so we don't want to block forever
2231	 * waiting for a connection to a backend driver that doesn't exist.
2232	 */
2233	if (XD_IS_CD(vdp) && !xenbus_exists(oename, XBP_MEDIA_REQ_SUP)) {
2234		mutex_exit(&vdp->xdf_cb_lk);
2235		return (B_FALSE);
2236	}
2237
2238	mutex_enter(&vdp->xdf_dev_lk);
2239	rv = xdf_connect_locked(vdp, B_TRUE);
2240	mutex_exit(&vdp->xdf_dev_lk);
2241	mutex_exit(&vdp->xdf_cb_lk);
2242
2243	return ((rv == XD_READY) ? B_TRUE : B_FALSE);
2244}
2245
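/*
 * Set a fixed physical geometry for the device.  The requested geometry is
 * sanity checked (and, if we're already connected, checked against the
 * size of the backend device) before being applied, after which the cmlb
 * label is invalidated to force re-validation.
 */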
2246int
2247xdf_hvm_setpgeom(dev_info_t *dip, cmlb_geom_t *geomp)
2248{
2249	xdf_t	*vdp = (xdf_t *)ddi_get_driver_private(dip);
2250
2251	/* sanity check the requested physical geometry */
2252	mutex_enter(&vdp->xdf_dev_lk);
2253	if ((geomp->g_secsize != XB_BSIZE) ||
2254	    (geomp->g_capacity == 0)) {
2255		mutex_exit(&vdp->xdf_dev_lk);
2256		return (EINVAL);
2257	}
2258
2259	/*
2260	 * If we've already connected to the backend device then make sure
2261	 * we're not defining a physical geometry larger than our backend
2262	 * device.
2263	 */
2264	if ((vdp->xdf_xdev_nblocks != 0) &&
2265	    (geomp->g_capacity > vdp->xdf_xdev_nblocks)) {
2266		mutex_exit(&vdp->xdf_dev_lk);
2267		return (EINVAL);
2268	}
2269
2270	bzero(&vdp->xdf_pgeom, sizeof (vdp->xdf_pgeom));
2271	vdp->xdf_pgeom.g_ncyl = geomp->g_ncyl;
2272	vdp->xdf_pgeom.g_acyl = geomp->g_acyl;
2273	vdp->xdf_pgeom.g_nhead = geomp->g_nhead;
2274	vdp->xdf_pgeom.g_nsect = geomp->g_nsect;
2275	vdp->xdf_pgeom.g_secsize = geomp->g_secsize;
2276	vdp->xdf_pgeom.g_capacity = geomp->g_capacity;
2277	vdp->xdf_pgeom.g_intrlv = geomp->g_intrlv;
2278	vdp->xdf_pgeom.g_rpm = geomp->g_rpm;
2279
2280	vdp->xdf_pgeom_fixed = B_TRUE;
2281	mutex_exit(&vdp->xdf_dev_lk);
2282
2283	/* force a re-validation */
2284	cmlb_invalidate(vdp->xdf_vd_lbl, NULL);
2285
2286	return (0);
2287}
2288
2289boolean_t
2290xdf_is_cd(dev_info_t *dip)
2291{
2292	xdf_t		*vdp = (xdf_t *)ddi_get_driver_private(dip);
2293	boolean_t	rv;
2294
2295	mutex_enter(&vdp->xdf_cb_lk);
2296	rv = XD_IS_CD(vdp);
2297	mutex_exit(&vdp->xdf_cb_lk);
2298	return (rv);
2299}
2300
2301boolean_t
2302xdf_is_rm(dev_info_t *dip)
2303{
2304	xdf_t		*vdp = (xdf_t *)ddi_get_driver_private(dip);
2305	boolean_t	rv;
2306
2307	mutex_enter(&vdp->xdf_cb_lk);
2308	rv = XD_IS_RM(vdp);
2309	mutex_exit(&vdp->xdf_cb_lk);
2310	return (rv);
2311}
2312
2313boolean_t
2314xdf_media_req_supported(dev_info_t *dip)
2315{
2316	xdf_t		*vdp = (xdf_t *)ddi_get_driver_private(dip);
2317	boolean_t	rv;
2318
2319	mutex_enter(&vdp->xdf_cb_lk);
2320	rv = vdp->xdf_media_req_supported;
2321	mutex_exit(&vdp->xdf_cb_lk);
2322	return (rv);
2323}
2324
2325#endif /* XPV_HVM_DRIVER */
2326
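/*
 * cmlb callback: report the current capacity of the device.
 */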
2327static int
2328xdf_lb_getcap(dev_info_t *dip, diskaddr_t *capp)
2329{
2330	xdf_t *vdp;
2331	vdp = ddi_get_soft_state(xdf_ssp, ddi_get_instance(dip));
2332
2333	if (vdp == NULL)
2334		return (ENXIO);
2335
2336	mutex_enter(&vdp->xdf_dev_lk);
2337	*capp = vdp->xdf_pgeom.g_capacity;
2338	DPRINTF(LBL_DBG, ("xdf@%s:capacity %llu\n", vdp->xdf_addr, *capp));
2339	mutex_exit(&vdp->xdf_dev_lk);
2340	return (0);
2341}
2342
2343static int
2344xdf_lb_getpgeom(dev_info_t *dip, cmlb_geom_t *geomp)
2345{
2346	xdf_t *vdp;
2347
2348	if ((vdp = ddi_get_soft_state(xdf_ssp, ddi_get_instance(dip))) == NULL)
2349		return (ENXIO);
2350	*geomp = vdp->xdf_pgeom;
2351	return (0);
2352}
2353
2354/*
2355 * No real HBA, no geometry available from it
2356 */
2357/*ARGSUSED*/
2358static int
2359xdf_lb_getvgeom(dev_info_t *dip, cmlb_geom_t *geomp)
2360{
2361	return (EINVAL);
2362}
2363
2364static int
2365xdf_lb_getattribute(dev_info_t *dip, tg_attribute_t *tgattributep)
2366{
2367	xdf_t *vdp;
2368
2369	if (!(vdp = ddi_get_soft_state(xdf_ssp, ddi_get_instance(dip))))
2370		return (ENXIO);
2371
2372	if (XD_IS_RO(vdp))
2373		tgattributep->media_is_writable = 0;
2374	else
2375		tgattributep->media_is_writable = 1;
2376	return (0);
2377}
2378
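/*
 * cmlb entry point for querying device geometry, capacity, block size,
 * and attributes.
 */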
2379/* ARGSUSED3 */
2380int
2381xdf_lb_getinfo(dev_info_t *dip, int cmd, void *arg, void *tg_cookie)
2382{
2383	int instance;
2384	xdf_t   *vdp;
2385
2386	instance = ddi_get_instance(dip);
2387
2388	if ((vdp = ddi_get_soft_state(xdf_ssp, instance)) == NULL)
2389		return (ENXIO);
2390
2391	switch (cmd) {
2392	case TG_GETPHYGEOM:
2393		return (xdf_lb_getpgeom(dip, (cmlb_geom_t *)arg));
2394	case TG_GETVIRTGEOM:
2395		return (xdf_lb_getvgeom(dip, (cmlb_geom_t *)arg));
2396	case TG_GETCAPACITY:
2397		return (xdf_lb_getcap(dip, (diskaddr_t *)arg));
2398	case TG_GETBLOCKSIZE:
2399		mutex_enter(&vdp->xdf_cb_lk);
2400		*(uint32_t *)arg = vdp->xdf_xdev_secsize;
2401		mutex_exit(&vdp->xdf_cb_lk);
2402		return (0);
2403	case TG_GETATTR:
2404		return (xdf_lb_getattribute(dip, (tg_attribute_t *)arg));
2405	default:
2406		return (ENOTTY);
2407	}
2408}
2409
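/*
 * Issue a single synchronous read or write: build a buf for the request,
 * queue it, kick off I/O processing, and wait for it to complete.  When
 * called from the ready taskq thread the ring is drained by polling.
 */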
2410/* ARGSUSED5 */
2411int
2412xdf_lb_rdwr(dev_info_t *dip, uchar_t cmd, void *bufp,
2413    diskaddr_t start, size_t reqlen, void *tg_cookie)
2414{
2415	xdf_t *vdp;
2416	struct buf *bp;
2417	int err = 0;
2418
2419	vdp = ddi_get_soft_state(xdf_ssp, ddi_get_instance(dip));
2420
2421	/* We don't allow IO from the oe_change callback thread */
2422	ASSERT(curthread != vdp->xdf_oe_change_thread);
2423
2424	if ((start + ((reqlen / (vdp->xdf_xdev_secsize / DEV_BSIZE))
2425	    >> DEV_BSHIFT)) > vdp->xdf_pgeom.g_capacity)
2426		return (EINVAL);
2427
2428	bp = getrbuf(KM_SLEEP);
2429	if (cmd == TG_READ)
2430		bp->b_flags = B_BUSY | B_READ;
2431	else
2432		bp->b_flags = B_BUSY | B_WRITE;
2433
2434	bp->b_un.b_addr = bufp;
2435	bp->b_bcount = reqlen;
2436	bp->b_blkno = start * (vdp->xdf_xdev_secsize / DEV_BSIZE);
2437	bp->b_edev = DDI_DEV_T_NONE; /* don't have dev_t */
2438
2439	mutex_enter(&vdp->xdf_dev_lk);
2440	xdf_bp_push(vdp, bp);
2441	mutex_exit(&vdp->xdf_dev_lk);
2442	xdf_io_start(vdp);
2443	if (curthread == vdp->xdf_ready_tq_thread)
2444		(void) xdf_ring_drain(vdp);
2445	err = biowait(bp);
2446	ASSERT(bp->b_flags & B_DONE);
2447	freerbuf(bp);
2448	return (err);
2449}
2450
2451/*
2452 * Lock the current media.  Set the media state to "lock".
2453 * (Media locks are only respected by the backend driver.)
2454 */
2455static int
2456xdf_ioctl_mlock(xdf_t *vdp)
2457{
2458	int rv;
2459	mutex_enter(&vdp->xdf_cb_lk);
2460	rv = xdf_media_req(vdp, XBV_MEDIA_REQ_LOCK, B_TRUE);
2461	mutex_exit(&vdp->xdf_cb_lk);
2462	return (rv);
2463}
2464
2465/*
2466 * Release a media lock.  Set the media state to "none".
2467 */
2468static int
2469xdf_ioctl_munlock(xdf_t *vdp)
2470{
2471	int rv;
2472	mutex_enter(&vdp->xdf_cb_lk);
2473	rv = xdf_media_req(vdp, XBV_MEDIA_REQ_NONE, B_TRUE);
2474	mutex_exit(&vdp->xdf_cb_lk);
2475	return (rv);
2476}
2477
2478/*
2479 * Eject the current media.  Ignores any media locks.  (Media locks
2480 * are only for the benefit of the backend.)
2481 */
2482static int
2483xdf_ioctl_eject(xdf_t *vdp)
2484{
2485	int rv;
2486
2487	mutex_enter(&vdp->xdf_cb_lk);
2488	if ((rv = xdf_media_req(vdp, XBV_MEDIA_REQ_EJECT, B_FALSE)) != 0) {
2489		mutex_exit(&vdp->xdf_cb_lk);
2490		return (rv);
2491	}
2492
2493	/*
2494	 * We've set the media request xenbus parameter to eject, so now
2495	 * disconnect from the backend, wait for the backend to clear
2496	 * the media request xenbus parameter, and then we can reconnect
2497	 * to the backend.
2498	 */
2499	(void) xdf_disconnect(vdp, XD_UNKNOWN, B_TRUE);
2500	mutex_enter(&vdp->xdf_dev_lk);
2501	if (xdf_connect_locked(vdp, B_TRUE) != XD_READY) {
2502		mutex_exit(&vdp->xdf_dev_lk);
2503		mutex_exit(&vdp->xdf_cb_lk);
2504		return (EIO);
2505	}
2506	mutex_exit(&vdp->xdf_dev_lk);
2507	mutex_exit(&vdp->xdf_cb_lk);
2508	return (0);
2509}
2510
2511/*
2512 * Watch for media state changes.  This can be an insertion of a device
2513 * (triggered by a 'xm block-configure' request in another domain) or
2514 * the ejection of a device (triggered by a local "eject" operation).
2515 * For a full description of the DKIOCSTATE ioctl behavior see dkio(7I).
2516 */
2517static int
2518xdf_dkstate(xdf_t *vdp, enum dkio_state mstate)
2519{
2520	enum dkio_state		prev_state;
2521
2522	mutex_enter(&vdp->xdf_cb_lk);
2523	prev_state = vdp->xdf_mstate;
2524
2525	if (vdp->xdf_mstate == mstate) {
2526		while (vdp->xdf_mstate == prev_state) {
2527			if (cv_wait_sig(&vdp->xdf_mstate_cv,
2528			    &vdp->xdf_cb_lk) == 0) {
2529				mutex_exit(&vdp->xdf_cb_lk);
2530				return (EINTR);
2531			}
2532		}
2533	}
2534
2535	if ((prev_state != DKIO_INSERTED) &&
2536	    (vdp->xdf_mstate == DKIO_INSERTED)) {
2537		(void) xdf_media_req(vdp, XBV_MEDIA_REQ_LOCK, B_TRUE);
2538		mutex_exit(&vdp->xdf_cb_lk);
2539		return (0);
2540	}
2541
2542	mutex_exit(&vdp->xdf_cb_lk);
2543	return (0);
2544}
2545
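/*
 * Character device ioctl entry point.  Label and partition related ioctls
 * are passed through to cmlb; media, cache, and disk-info ioctls are
 * handled here.
 */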
2546/*ARGSUSED*/
2547static int
2548xdf_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
2549    int *rvalp)
2550{
2551	minor_t		minor = getminor(dev);
2552	int		part = XDF_PART(minor);
2553	xdf_t		*vdp;
2554	int		rv;
2555
2556	if (((vdp = ddi_get_soft_state(xdf_ssp, XDF_INST(minor))) == NULL) ||
2557	    (!xdf_isopen(vdp, part)))
2558		return (ENXIO);
2559
2560	DPRINTF(IOCTL_DBG, ("xdf@%s:ioctl: cmd %d (0x%x)\n",
2561	    vdp->xdf_addr, cmd, cmd));
2562
2563	switch (cmd) {
2564	default:
2565		return (ENOTTY);
2566	case DKIOCG_PHYGEOM:
2567	case DKIOCG_VIRTGEOM:
2568	case DKIOCGGEOM:
2569	case DKIOCSGEOM:
2570	case DKIOCGAPART:
2571	case DKIOCSAPART:
2572	case DKIOCGVTOC:
2573	case DKIOCSVTOC:
2574	case DKIOCPARTINFO:
2575	case DKIOCGEXTVTOC:
2576	case DKIOCSEXTVTOC:
2577	case DKIOCEXTPARTINFO:
2578	case DKIOCGMBOOT:
2579	case DKIOCSMBOOT:
2580	case DKIOCGETEFI:
2581	case DKIOCSETEFI:
2582	case DKIOCSETEXTPART:
2583	case DKIOCPARTITION:
2584		return (cmlb_ioctl(vdp->xdf_vd_lbl, dev, cmd, arg, mode, credp,
2585		    rvalp, NULL));
2586	case FDEJECT:
2587	case DKIOCEJECT:
2588	case CDROMEJECT:
2589		return (xdf_ioctl_eject(vdp));
2590	case DKIOCLOCK:
2591		return (xdf_ioctl_mlock(vdp));
2592	case DKIOCUNLOCK:
2593		return (xdf_ioctl_munlock(vdp));
2594	case CDROMREADOFFSET: {
2595		int offset = 0;
2596		if (!XD_IS_CD(vdp))
2597			return (ENOTTY);
2598		if (ddi_copyout(&offset, (void *)arg, sizeof (int), mode))
2599			return (EFAULT);
2600		return (0);
2601	}
2602	case DKIOCGMEDIAINFO: {
2603		struct dk_minfo media_info;
2604
2605		media_info.dki_lbsize = vdp->xdf_xdev_secsize;
2606		media_info.dki_capacity = vdp->xdf_pgeom.g_capacity;
2607		if (XD_IS_CD(vdp))
2608			media_info.dki_media_type = DK_CDROM;
2609		else
2610			media_info.dki_media_type = DK_FIXED_DISK;
2611
2612		if (ddi_copyout(&media_info, (void *)arg,
2613		    sizeof (struct dk_minfo), mode))
2614			return (EFAULT);
2615		return (0);
2616	}
2617	case DKIOCINFO: {
2618		struct dk_cinfo info;
2619
2620		/* controller information */
2621		if (XD_IS_CD(vdp))
2622			info.dki_ctype = DKC_CDROM;
2623		else
2624			info.dki_ctype = DKC_VBD;
2625
2626		info.dki_cnum = 0;
2627		(void) strncpy((char *)(&info.dki_cname), "xdf", 8);
2628
2629		/* unit information */
2630		info.dki_unit = ddi_get_instance(vdp->xdf_dip);
2631		(void) strncpy((char *)(&info.dki_dname), "xdf", 8);
2632		info.dki_flags = DKI_FMTVOL;
2633		info.dki_partition = part;
2634		info.dki_maxtransfer = maxphys / DEV_BSIZE;
2635		info.dki_addr = 0;
2636		info.dki_space = 0;
2637		info.dki_prio = 0;
2638		info.dki_vec = 0;
2639
2640		if (ddi_copyout(&info, (void *)arg, sizeof (info), mode))
2641			return (EFAULT);
2642		return (0);
2643	}
2644	case DKIOCSTATE: {
2645		enum dkio_state mstate;
2646
2647		if (ddi_copyin((void *)arg, &mstate,
2648		    sizeof (mstate), mode) != 0)
2649			return (EFAULT);
2650		if ((rv = xdf_dkstate(vdp, mstate)) != 0)
2651			return (rv);
2652		mstate = vdp->xdf_mstate;
2653		if (ddi_copyout(&mstate, (void *)arg,
2654		    sizeof (mstate), mode) != 0)
2655			return (EFAULT);
2656		return (0);
2657	}
2658	case DKIOCREMOVABLE: {
2659		int i = BOOLEAN2VOID(XD_IS_RM(vdp));
2660		if (ddi_copyout(&i, (caddr_t)arg, sizeof (i), mode))
2661			return (EFAULT);
2662		return (0);
2663	}
2664	case DKIOCGETWCE: {
2665		int i = BOOLEAN2VOID(vdp->xdf_wce);
2666		if (ddi_copyout(&i, (void *)arg, sizeof (i), mode))
2667			return (EFAULT);
2668		return (0);
2669	}
2670	case DKIOCSETWCE: {
2671		int i;
2672		if (ddi_copyin((void *)arg, &i, sizeof (i), mode))
2673			return (EFAULT);
2674		vdp->xdf_wce = VOID2BOOLEAN(i);
2675		return (0);
2676	}
2677	case DKIOCFLUSHWRITECACHE: {
2678		struct dk_callback *dkc = (struct dk_callback *)arg;
2679
2680		if (vdp->xdf_flush_supported) {
2681			rv = xdf_lb_rdwr(vdp->xdf_dip, TG_WRITE,
2682			    NULL, 0, 0, (void *)dev);
2683		} else if (vdp->xdf_feature_barrier &&
2684		    !xdf_barrier_flush_disable) {
2685			rv = xdf_lb_rdwr(vdp->xdf_dip, TG_WRITE,
2686			    vdp->xdf_cache_flush_block, xdf_flush_block,
2687			    vdp->xdf_xdev_secsize, (void *)dev);
2688		} else {
2689			return (ENOTTY);
2690		}
2691		if ((mode & FKIOCTL) && (dkc != NULL) &&
2692		    (dkc->dkc_callback != NULL)) {
2693			(*dkc->dkc_callback)(dkc->dkc_cookie, rv);
2694			/* need to return 0 after calling callback */
2695			rv = 0;
2696		}
2697		return (rv);
2698	}
2699	}
2700	/*NOTREACHED*/
2701}
2702
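/*
 * Block device strategy entry point.  Validate the request against the
 * partition (or whole disk) limits, adjust the block number and count for
 * the backend sector size, and queue the buf for processing.
 */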
2703static int
2704xdf_strategy(struct buf *bp)
2705{
2706	xdf_t	*vdp;
2707	minor_t minor;
2708	diskaddr_t p_blkct, p_blkst;
2709	daddr_t blkno;
2710	ulong_t nblks;
2711	int part;
2712
2713	minor = getminor(bp->b_edev);
2714	part = XDF_PART(minor);
2715	vdp = ddi_get_soft_state(xdf_ssp, XDF_INST(minor));
2716
2717	mutex_enter(&vdp->xdf_dev_lk);
2718	if (!xdf_isopen(vdp, part)) {
2719		mutex_exit(&vdp->xdf_dev_lk);
2720		xdf_io_err(bp, ENXIO, 0);
2721		return (0);
2722	}
2723
2724	/* We don't allow IO from the oe_change callback thread */
2725	ASSERT(curthread != vdp->xdf_oe_change_thread);
2726
2727	/* Check for writes to a read only device */
2728	if (!IS_READ(bp) && XD_IS_RO(vdp)) {
2729		mutex_exit(&vdp->xdf_dev_lk);
2730		xdf_io_err(bp, EROFS, 0);
2731		return (0);
2732	}
2733
2734	/* Check if this I/O is accessing a partition or the entire disk */
2735	if ((long)bp->b_private == XB_SLICE_NONE) {
2736		/* This I/O is using an absolute offset */
2737		p_blkct = vdp->xdf_xdev_nblocks;
2738		p_blkst = 0;
2739	} else {
2740		/* This I/O is using a partition relative offset */
2741		mutex_exit(&vdp->xdf_dev_lk);
2742		if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkct,
2743		    &p_blkst, NULL, NULL, NULL)) {
2744			xdf_io_err(bp, ENXIO, 0);
2745			return (0);
2746		}
2747		mutex_enter(&vdp->xdf_dev_lk);
2748	}
2749
2750	/*
2751	 * Adjust the real blkno and bcount according to the underlying
2752	 * physical sector size.
2753	 */
2754	blkno = bp->b_blkno / (vdp->xdf_xdev_secsize / XB_BSIZE);
2755
2756	/* check for a starting block beyond the disk or partition limit */
2757	if (blkno > p_blkct) {
2758		DPRINTF(IO_DBG, ("xdf@%s: block %lld exceeds VBD size %"PRIu64,
2759		    vdp->xdf_addr, (longlong_t)blkno, (uint64_t)p_blkct));
2760		mutex_exit(&vdp->xdf_dev_lk);
2761		xdf_io_err(bp, EINVAL, 0);
2762		return (0);
2763	}
2764
2765	/* Legacy: don't set the error flag in this case */
2766	if (blkno == p_blkct) {
2767		mutex_exit(&vdp->xdf_dev_lk);
2768		bp->b_resid = bp->b_bcount;
2769		biodone(bp);
2770		return (0);
2771	}
2772
2773	/* sanitize the input buf */
2774	bioerror(bp, 0);
2775	bp->b_resid = 0;
2776	bp->av_back = bp->av_forw = NULL;
2777
2778	/* Adjust for a partial transfer; this will result in an error later */
2779	if (vdp->xdf_xdev_secsize != 0 &&
2780	    vdp->xdf_xdev_secsize != XB_BSIZE) {
2781		nblks = bp->b_bcount / vdp->xdf_xdev_secsize;
2782	} else {
2783		nblks = bp->b_bcount >> XB_BSHIFT;
2784	}
2785
2786	if ((blkno + nblks) > p_blkct) {
2787		if (vdp->xdf_xdev_secsize != 0 &&
2788		    vdp->xdf_xdev_secsize != XB_BSIZE) {
2789			bp->b_resid =
2790			    ((blkno + nblks) - p_blkct) *
2791			    vdp->xdf_xdev_secsize;
2792		} else {
2793			bp->b_resid =
2794			    ((blkno + nblks) - p_blkct) <<
2795			    XB_BSHIFT;
2796		}
2797		bp->b_bcount -= bp->b_resid;
2798	}
2799
2800	DPRINTF(IO_DBG, ("xdf@%s: strategy blk %lld len %lu\n",
2801	    vdp->xdf_addr, (longlong_t)blkno, (ulong_t)bp->b_bcount));
2802
2803	/* Fix up the buf struct */
2804	bp->b_flags |= B_BUSY;
2805	bp->b_private = (void *)(uintptr_t)p_blkst;
2806
2807	xdf_bp_push(vdp, bp);
2808	mutex_exit(&vdp->xdf_dev_lk);
2809	xdf_io_start(vdp);
2810	if (do_polled_io)
2811		(void) xdf_ring_drain(vdp);
2812	return (0);
2813}
2814
2815/*ARGSUSED*/
2816static int
2817xdf_read(dev_t dev, struct uio *uiop, cred_t *credp)
2818{
2819	xdf_t	*vdp;
2820	minor_t minor;
2821	diskaddr_t p_blkcnt;
2822	int part;
2823
2824	minor = getminor(dev);
2825	if ((vdp = ddi_get_soft_state(xdf_ssp, XDF_INST(minor))) == NULL)
2826		return (ENXIO);
2827
2828	DPRINTF(IO_DBG, ("xdf@%s: read offset 0x%"PRIx64"\n",
2829	    vdp->xdf_addr, (int64_t)uiop->uio_offset));
2830
2831	part = XDF_PART(minor);
2832	if (!xdf_isopen(vdp, part))
2833		return (ENXIO);
2834
2835	if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkcnt,
2836	    NULL, NULL, NULL, NULL))
2837		return (ENXIO);
2838
2839	if (uiop->uio_loffset >= XB_DTOB(p_blkcnt, vdp))
2840		return (ENOSPC);
2841
2842	if (U_INVAL(uiop))
2843		return (EINVAL);
2844
2845	return (physio(xdf_strategy, NULL, dev, B_READ, xdfmin, uiop));
2846}
2847
2848/*ARGSUSED*/
2849static int
2850xdf_write(dev_t dev, struct uio *uiop, cred_t *credp)
2851{
2852	xdf_t *vdp;
2853	minor_t minor;
2854	diskaddr_t p_blkcnt;
2855	int part;
2856
2857	minor = getminor(dev);
2858	if ((vdp = ddi_get_soft_state(xdf_ssp, XDF_INST(minor))) == NULL)
2859		return (ENXIO);
2860
2861	DPRINTF(IO_DBG, ("xdf@%s: write offset 0x%"PRIx64"\n",
2862	    vdp->xdf_addr, (int64_t)uiop->uio_offset));
2863
2864	part = XDF_PART(minor);
2865	if (!xdf_isopen(vdp, part))
2866		return (ENXIO);
2867
2868	if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkcnt,
2869	    NULL, NULL, NULL, NULL))
2870		return (ENXIO);
2871
2872	if (uiop->uio_loffset >= XB_DTOB(p_blkcnt, vdp))
2873		return (ENOSPC);
2874
2875	if (U_INVAL(uiop))
2876		return (EINVAL);
2877
2878	return (physio(xdf_strategy, NULL, dev, B_WRITE, xdfmin, uiop));
2879}
2880
2881/*ARGSUSED*/
2882static int
2883xdf_aread(dev_t dev, struct aio_req *aiop, cred_t *credp)
2884{
2885	xdf_t	*vdp;
2886	minor_t minor;
2887	struct uio *uiop = aiop->aio_uio;
2888	diskaddr_t p_blkcnt;
2889	int part;
2890
2891	minor = getminor(dev);
2892	if ((vdp = ddi_get_soft_state(xdf_ssp, XDF_INST(minor))) == NULL)
2893		return (ENXIO);
2894
2895	part = XDF_PART(minor);
2896	if (!xdf_isopen(vdp, part))
2897		return (ENXIO);
2898
2899	if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkcnt,
2900	    NULL, NULL, NULL, NULL))
2901		return (ENXIO);
2902
2903	if (uiop->uio_loffset >= XB_DTOB(p_blkcnt, vdp))
2904		return (ENOSPC);
2905
2906	if (U_INVAL(uiop))
2907		return (EINVAL);
2908
2909	return (aphysio(xdf_strategy, anocancel, dev, B_READ, xdfmin, aiop));
2910}
2911
2912/*ARGSUSED*/
2913static int
2914xdf_awrite(dev_t dev, struct aio_req *aiop, cred_t *credp)
2915{
2916	xdf_t *vdp;
2917	minor_t minor;
2918	struct uio *uiop = aiop->aio_uio;
2919	diskaddr_t p_blkcnt;
2920	int part;
2921
2922	minor = getminor(dev);
2923	if ((vdp = ddi_get_soft_state(xdf_ssp, XDF_INST(minor))) == NULL)
2924		return (ENXIO);
2925
2926	part = XDF_PART(minor);
2927	if (!xdf_isopen(vdp, part))
2928		return (ENXIO);
2929
2930	if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkcnt,
2931	    NULL, NULL, NULL, NULL))
2932		return (ENXIO);
2933
2934	if (uiop->uio_loffset >= XB_DTOB(p_blkcnt, vdp))
2935		return (ENOSPC);
2936
2937	if (U_INVAL(uiop))
2938		return (EINVAL);
2939
2940	return (aphysio(xdf_strategy, anocancel, dev, B_WRITE, xdfmin, aiop));
2941}
2942
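/*
 * Crash dump entry point.  Queue the dump buf and poll the ring until the
 * request has been drained.
 */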
2943static int
2944xdf_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk)
2945{
2946	struct buf dumpbuf, *dbp = &dumpbuf;
2947	xdf_t	*vdp;
2948	minor_t minor;
2949	int err = 0;
2950	int part;
2951	diskaddr_t p_blkcnt, p_blkst;
2952
2953	minor = getminor(dev);
2954	if ((vdp = ddi_get_soft_state(xdf_ssp, XDF_INST(minor))) == NULL)
2955		return (ENXIO);
2956
2957	DPRINTF(IO_DBG, ("xdf@%s: dump addr (0x%p) blk (%ld) nblks (%d)\n",
2958	    vdp->xdf_addr, (void *)addr, blkno, nblk));
2959
2960	/* We don't allow IO from the oe_change callback thread */
2961	ASSERT(curthread != vdp->xdf_oe_change_thread);
2962
2963	part = XDF_PART(minor);
2964	if (!xdf_isopen(vdp, part))
2965		return (ENXIO);
2966
2967	if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkcnt, &p_blkst,
2968	    NULL, NULL, NULL))
2969		return (ENXIO);
2970
2971	if ((blkno + nblk) >
2972	    (p_blkcnt * (vdp->xdf_xdev_secsize / XB_BSIZE))) {
2973		cmn_err(CE_WARN, "xdf@%s: block %ld exceeds VBD size %"PRIu64,
2974		    vdp->xdf_addr, (daddr_t)((blkno + nblk) /
2975		    (vdp->xdf_xdev_secsize / XB_BSIZE)), (uint64_t)p_blkcnt);
2976		return (EINVAL);
2977	}
2978
2979	bioinit(dbp);
2980	dbp->b_flags = B_BUSY;
2981	dbp->b_un.b_addr = addr;
2982	dbp->b_bcount = nblk << DEV_BSHIFT;
2983	dbp->b_blkno = blkno;
2984	dbp->b_edev = dev;
2985	dbp->b_private = (void *)(uintptr_t)p_blkst;
2986
2987	mutex_enter(&vdp->xdf_dev_lk);
2988	xdf_bp_push(vdp, dbp);
2989	mutex_exit(&vdp->xdf_dev_lk);
2990	xdf_io_start(vdp);
2991	err = xdf_ring_drain(vdp);
2992	biofini(dbp);
2993	return (err);
2994}
2995
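/*
 * Close entry point.  Clear the open state for the specified partition and
 * open type, tracking layered open counts separately.
 */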
2996/*ARGSUSED*/
2997static int
2998xdf_close(dev_t dev, int flag, int otyp, struct cred *credp)
2999{
3000	minor_t	minor;
3001	xdf_t	*vdp;
3002	int part;
3003	ulong_t parbit;
3004
3005	minor = getminor(dev);
3006	if ((vdp = ddi_get_soft_state(xdf_ssp, XDF_INST(minor))) == NULL)
3007		return (ENXIO);
3008
3009	mutex_enter(&vdp->xdf_dev_lk);
3010	part = XDF_PART(minor);
3011	if (!xdf_isopen(vdp, part)) {
3012		mutex_exit(&vdp->xdf_dev_lk);
3013		return (ENXIO);
3014	}
3015	parbit = 1 << part;
3016
3017	ASSERT((vdp->xdf_vd_open[otyp] & parbit) != 0);
3018	if (otyp == OTYP_LYR) {
3019		ASSERT(vdp->xdf_vd_lyropen[part] > 0);
3020		if (--vdp->xdf_vd_lyropen[part] == 0)
3021			vdp->xdf_vd_open[otyp] &= ~parbit;
3022	} else {
3023		vdp->xdf_vd_open[otyp] &= ~parbit;
3024	}
3025	vdp->xdf_vd_exclopen &= ~parbit;
3026
3027	mutex_exit(&vdp->xdf_dev_lk);
3028	return (0);
3029}
3030
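/*
 * Open entry point.  For blocking opens we wait for a connection to the
 * backend and require a valid label with a non-zero partition size;
 * non-blocking (FNDELAY/FNONBLOCK) opens skip both requirements.
 */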
3031static int
3032xdf_open(dev_t *devp, int flag, int otyp, cred_t *credp)
3033{
3034	minor_t	minor;
3035	xdf_t	*vdp;
3036	int part;
3037	ulong_t parbit;
3038	diskaddr_t p_blkct = 0;
3039	boolean_t firstopen;
3040	boolean_t nodelay;
3041
3042	minor = getminor(*devp);
3043	if ((vdp = ddi_get_soft_state(xdf_ssp, XDF_INST(minor))) == NULL)
3044		return (ENXIO);
3045
3046	nodelay = (flag & (FNDELAY | FNONBLOCK));
3047
3048	DPRINTF(DDI_DBG, ("xdf@%s: opening\n", vdp->xdf_addr));
3049
3050	/* block (cv_wait) until we're connected or the attempt fails */
3051	mutex_enter(&vdp->xdf_cb_lk);
3052	mutex_enter(&vdp->xdf_dev_lk);
3053	if (!nodelay && (xdf_connect_locked(vdp, B_TRUE) != XD_READY)) {
3054		mutex_exit(&vdp->xdf_dev_lk);
3055		mutex_exit(&vdp->xdf_cb_lk);
3056		return (ENXIO);
3057	}
3058	mutex_exit(&vdp->xdf_cb_lk);
3059
3060	if ((flag & FWRITE) && XD_IS_RO(vdp)) {
3061		mutex_exit(&vdp->xdf_dev_lk);
3062		return (EROFS);
3063	}
3064
3065	part = XDF_PART(minor);
3066	parbit = 1 << part;
3067	if ((vdp->xdf_vd_exclopen & parbit) ||
3068	    ((flag & FEXCL) && xdf_isopen(vdp, part))) {
3069		mutex_exit(&vdp->xdf_dev_lk);
3070		return (EBUSY);
3071	}
3072
3073	/* are we the first one to open this node? */
3074	firstopen = !xdf_isopen(vdp, -1);
3075
3076	if (otyp == OTYP_LYR)
3077		vdp->xdf_vd_lyropen[part]++;
3078
3079	vdp->xdf_vd_open[otyp] |= parbit;
3080
3081	if (flag & FEXCL)
3082		vdp->xdf_vd_exclopen |= parbit;
3083
3084	mutex_exit(&vdp->xdf_dev_lk);
3085
3086	/* force a re-validation */
3087	if (firstopen)
3088		cmlb_invalidate(vdp->xdf_vd_lbl, NULL);
3089
3090	/* If this is a non-blocking open then we're done */
3091	if (nodelay)
3092		return (0);
3093
3094	/*
3095	 * This is a blocking open, so we require:
3096	 * - that the disk have a valid label on it
3097	 * - that the size of the partition that we're opening is non-zero
3098	 */
3099	if ((cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkct,
3100	    NULL, NULL, NULL, NULL) != 0) || (p_blkct == 0)) {
3101		(void) xdf_close(*devp, flag, otyp, credp);
3102		return (ENXIO);
3103	}
3104
3105	return (0);
3106}
3107
3108/*ARGSUSED*/
3109static void
3110xdf_watch_hp_status_cb(dev_info_t *dip, const char *path, void *arg)
3111{
3112	xdf_t *vdp = (xdf_t *)ddi_get_driver_private(dip);
3113	cv_broadcast(&vdp->xdf_hp_status_cv);
3114}
3115
3116static int
3117xdf_prop_op(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op, int flags,
3118	char *name, caddr_t valuep, int *lengthp)
3119{
3120	xdf_t	*vdp = ddi_get_soft_state(xdf_ssp, ddi_get_instance(dip));
3121
3122	/*
3123	 * Sanity check that if a dev_t or dip were specified, they
3124	 * correspond to this device driver.  On debug kernels we'll
3125	 * panic and on non-debug kernels we'll return failure.
3126	 */
3127	ASSERT(ddi_driver_major(dip) == xdf_major);
3128	ASSERT((dev == DDI_DEV_T_ANY) || (getmajor(dev) == xdf_major));
3129	if ((ddi_driver_major(dip) != xdf_major) ||
3130	    ((dev != DDI_DEV_T_ANY) && (getmajor(dev) != xdf_major)))
3131		return (DDI_PROP_NOT_FOUND);
3132
3133	if (vdp == NULL)
3134		return (ddi_prop_op(dev, dip, prop_op, flags,
3135		    name, valuep, lengthp));
3136
3137	return (cmlb_prop_op(vdp->xdf_vd_lbl,
3138	    dev, dip, prop_op, flags, name, valuep, lengthp,
3139	    XDF_PART(getminor(dev)), NULL));
3140}
3141
3142/*ARGSUSED*/
3143static int
3144xdf_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **rp)
3145{
3146	int	instance = XDF_INST(getminor((dev_t)arg));
3147	xdf_t	*vbdp;
3148
3149	switch (cmd) {
3150	case DDI_INFO_DEVT2DEVINFO:
3151		if ((vbdp = ddi_get_soft_state(xdf_ssp, instance)) == NULL) {
3152			*rp = NULL;
3153			return (DDI_FAILURE);
3154		}
3155		*rp = vbdp->xdf_dip;
3156		return (DDI_SUCCESS);
3157
3158	case DDI_INFO_DEVT2INSTANCE:
3159		*rp = (void *)(uintptr_t)instance;
3160		return (DDI_SUCCESS);
3161
3162	default:
3163		return (DDI_FAILURE);
3164	}
3165}
3166
3167/*ARGSUSED*/
3168static int
3169xdf_resume(dev_info_t *dip)
3170{
3171	xdf_t	*vdp;
3172	char	*oename;
3173
3174	if ((vdp = ddi_get_soft_state(xdf_ssp, ddi_get_instance(dip))) == NULL)
3175		goto err;
3176
3177	if (xdf_debug & SUSRES_DBG)
3178		xen_printf("xdf@%s: xdf_resume\n", vdp->xdf_addr);
3179
3180	mutex_enter(&vdp->xdf_cb_lk);
3181
3182	if (xvdi_resume(dip) != DDI_SUCCESS) {
3183		mutex_exit(&vdp->xdf_cb_lk);
3184		goto err;
3185	}
3186
3187	if (((oename = xvdi_get_oename(dip)) == NULL) ||
3188	    (xvdi_add_xb_watch_handler(dip, oename, XBP_HP_STATUS,
3189	    xdf_watch_hp_status_cb, NULL) != DDI_SUCCESS)) {
3190		mutex_exit(&vdp->xdf_cb_lk);
3191		goto err;
3192	}
3193
3194	mutex_enter(&vdp->xdf_dev_lk);
3195	ASSERT(vdp->xdf_state != XD_READY);
3196	xdf_set_state(vdp, XD_UNKNOWN);
3197	mutex_exit(&vdp->xdf_dev_lk);
3198
3199	if (xdf_setstate_init(vdp) != DDI_SUCCESS) {
3200		mutex_exit(&vdp->xdf_cb_lk);
3201		goto err;
3202	}
3203
3204	mutex_exit(&vdp->xdf_cb_lk);
3205
3206	if (xdf_debug & SUSRES_DBG)
3207		xen_printf("xdf@%s: xdf_resume: done\n", vdp->xdf_addr);
3208	return (DDI_SUCCESS);
3209err:
3210	if (xdf_debug & SUSRES_DBG)
3211		xen_printf("xdf@%s: xdf_resume: fail\n", vdp->xdf_addr);
3212	return (DDI_FAILURE);
3213}
3214
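/*
 * Attach entry point.  Allocate and initialize the per-instance soft
 * state, register xenbus watches and interrupt handlers, attach cmlb,
 * and kick off the initial connection to the backend.
 */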
3215static int
3216xdf_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
3217{
3218	int			n, instance = ddi_get_instance(dip);
3219	ddi_iblock_cookie_t	ibc, softibc;
3220	boolean_t		dev_iscd = B_FALSE;
3221	xdf_t			*vdp;
3222	char			*oename, *xsname, *str;
3223
3224	if ((n = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_NOTPROM,
3225	    "xdf_debug", 0)) != 0)
3226		xdf_debug = n;
3227
3228	switch (cmd) {
3229	case DDI_RESUME:
3230		return (xdf_resume(dip));
3231	case DDI_ATTACH:
3232		break;
3233	default:
3234		return (DDI_FAILURE);
3235	}
3236	/* DDI_ATTACH */
3237
3238	if (((xsname = xvdi_get_xsname(dip)) == NULL) ||
3239	    ((oename = xvdi_get_oename(dip)) == NULL))
3240		return (DDI_FAILURE);
3241
3242	/*
3243	 * Disable auto-detach.  This is necessary so that we don't get
3244	 * detached while we're disconnected from the back end.
3245	 */
3246	if ((ddi_prop_update_int(DDI_DEV_T_NONE, dip,
3247	    DDI_NO_AUTODETACH, 1) != DDI_PROP_SUCCESS))
3248		return (DDI_FAILURE);
3249
3250	/* driver handles kernel-issued IOCTLs */
3251	if (ddi_prop_create(DDI_DEV_T_NONE, dip,
3252	    DDI_PROP_CANSLEEP, DDI_KERNEL_IOCTL, NULL, 0) != DDI_PROP_SUCCESS)
3253		return (DDI_FAILURE);
3254
3255	if (ddi_get_iblock_cookie(dip, 0, &ibc) != DDI_SUCCESS)
3256		return (DDI_FAILURE);
3257
3258	if (ddi_get_soft_iblock_cookie(dip,
3259	    DDI_SOFTINT_LOW, &softibc) != DDI_SUCCESS)
3260		return (DDI_FAILURE);
3261
3262	if (xenbus_read_str(xsname, XBP_DEV_TYPE, &str) != 0) {
3263		cmn_err(CE_WARN, "xdf@%s: cannot read device-type",
3264		    ddi_get_name_addr(dip));
3265		return (DDI_FAILURE);
3266	}
3267	if (strcmp(str, XBV_DEV_TYPE_CD) == 0)
3268		dev_iscd = B_TRUE;
3269	strfree(str);
3270
3271	if (ddi_soft_state_zalloc(xdf_ssp, instance) != DDI_SUCCESS)
3272		return (DDI_FAILURE);
3273
3274	DPRINTF(DDI_DBG, ("xdf@%s: attaching\n", ddi_get_name_addr(dip)));
3275	vdp = ddi_get_soft_state(xdf_ssp, instance);
3276	ddi_set_driver_private(dip, vdp);
3277	vdp->xdf_dip = dip;
3278	vdp->xdf_addr = ddi_get_name_addr(dip);
3279	vdp->xdf_suspending = B_FALSE;
3280	vdp->xdf_media_req_supported = B_FALSE;
3281	vdp->xdf_peer = INVALID_DOMID;
3282	vdp->xdf_evtchn = INVALID_EVTCHN;
3283	list_create(&vdp->xdf_vreq_act, sizeof (v_req_t),
3284	    offsetof(v_req_t, v_link));
3285	cv_init(&vdp->xdf_dev_cv, NULL, CV_DEFAULT, NULL);
3286	cv_init(&vdp->xdf_hp_status_cv, NULL, CV_DEFAULT, NULL);
3287	cv_init(&vdp->xdf_mstate_cv, NULL, CV_DEFAULT, NULL);
3288	mutex_init(&vdp->xdf_dev_lk, NULL, MUTEX_DRIVER, (void *)ibc);
3289	mutex_init(&vdp->xdf_cb_lk, NULL, MUTEX_DRIVER, (void *)ibc);
3290	mutex_init(&vdp->xdf_iostat_lk, NULL, MUTEX_DRIVER, (void *)ibc);
3291	vdp->xdf_cmbl_reattach = B_TRUE;
3292	if (dev_iscd) {
3293		vdp->xdf_dinfo |= VDISK_CDROM;
3294		vdp->xdf_mstate = DKIO_EJECTED;
3295	} else {
3296		vdp->xdf_mstate = DKIO_NONE;
3297	}
3298
3299	if ((vdp->xdf_ready_tq = ddi_taskq_create(dip, "xdf_ready_tq",
3300	    1, TASKQ_DEFAULTPRI, 0)) == NULL)
3301		goto errout0;
3302
3303	if (xvdi_add_xb_watch_handler(dip, oename, XBP_HP_STATUS,
3304	    xdf_watch_hp_status_cb, NULL) != DDI_SUCCESS)
3305		goto errout0;
3306
3307	if (ddi_add_softintr(dip, DDI_SOFTINT_LOW, &vdp->xdf_softintr_id,
3308	    &softibc, NULL, xdf_iorestart, (caddr_t)vdp) != DDI_SUCCESS) {
3309		cmn_err(CE_WARN, "xdf@%s: failed to add softintr",
3310		    ddi_get_name_addr(dip));
3311		goto errout0;
3312	}
3313
3314	/*
3315	 * Initialize the physical geometry structure.  Note that currently
3316	 * we don't know the size of the backend device so the number
3317	 * of blocks on the device will be initialized to zero.  Once
3318	 * we connect to the backend device we'll update the physical
3319	 * geometry to reflect the real size of the device.
3320	 */
3321	xdf_synthetic_pgeom(dip, &vdp->xdf_pgeom);
3322	vdp->xdf_pgeom_fixed = B_FALSE;
3323
3324	/*
3325	 * Create the default device minor nodes for a non-removable disk.
3326	 * We'll adjust the minor nodes once we're connected to the backend.
3327	 */
3328	cmlb_alloc_handle(&vdp->xdf_vd_lbl);
3329	if (xdf_cmlb_attach(vdp) != 0) {
3330		cmn_err(CE_WARN,
3331		    "xdf@%s: attach failed, cmlb attach failed",
3332		    ddi_get_name_addr(dip));
3333		goto errout0;
3334	}
3335
3336	/*
3337	 * We ship with cache-enabled disks
3338	 */
3339	vdp->xdf_wce = B_TRUE;
3340
3341	mutex_enter(&vdp->xdf_cb_lk);
3342	/* Watch for backend XenbusState changes */
3343	if (xvdi_add_event_handler(dip,
3344	    XS_OE_STATE, xdf_oe_change, NULL) != DDI_SUCCESS) {
3345		mutex_exit(&vdp->xdf_cb_lk);
3346		goto errout0;
3347	}
3348
3349	if (xdf_setstate_init(vdp) != DDI_SUCCESS) {
3350		cmn_err(CE_WARN, "xdf@%s: start connection failed",
3351		    ddi_get_name_addr(dip));
3352		mutex_exit(&vdp->xdf_cb_lk);
3353		goto errout1;
3354	}
3355	mutex_exit(&vdp->xdf_cb_lk);
3356
3357#if defined(XPV_HVM_DRIVER)
3358
3359	xdf_hvm_add(dip);
3360
3361	/* Report our version to dom0.  */
3362	if (xenbus_printf(XBT_NULL, "guest/xdf", "version", "%d",
3363	    HVMPV_XDF_VERS))
3364		cmn_err(CE_WARN, "xdf: couldn't write version\n");
3365
3366#else /* !XPV_HVM_DRIVER */
3367
3368	/* create kstat for iostat(1M) */
3369	if (xdf_kstat_create(dip, "xdf", instance) != 0) {
3370		cmn_err(CE_WARN, "xdf@%s: failed to create kstat",
3371		    ddi_get_name_addr(dip));
3372		goto errout1;
3373	}
3374
3375#endif /* !XPV_HVM_DRIVER */
3376
3377	ddi_report_dev(dip);
3378	DPRINTF(DDI_DBG, ("xdf@%s: attached\n", vdp->xdf_addr));
3379	return (DDI_SUCCESS);
3380
3381errout1:
3382	(void) xvdi_switch_state(vdp->xdf_dip, XBT_NULL, XenbusStateClosed);
3383	xvdi_remove_event_handler(dip, XS_OE_STATE);
3384errout0:
3385	if (vdp->xdf_vd_lbl != NULL) {
3386		cmlb_detach(vdp->xdf_vd_lbl, NULL);
3387		cmlb_free_handle(&vdp->xdf_vd_lbl);
3388		vdp->xdf_vd_lbl = NULL;
3389	}
3390	if (vdp->xdf_softintr_id != NULL)
3391		ddi_remove_softintr(vdp->xdf_softintr_id);
3392	xvdi_remove_xb_watch_handlers(dip);
3393	if (vdp->xdf_ready_tq != NULL)
3394		ddi_taskq_destroy(vdp->xdf_ready_tq);
3395	mutex_destroy(&vdp->xdf_cb_lk);
3396	mutex_destroy(&vdp->xdf_dev_lk);
3397	cv_destroy(&vdp->xdf_dev_cv);
3398	cv_destroy(&vdp->xdf_hp_status_cv);
3399	ddi_soft_state_free(xdf_ssp, instance);
3400	ddi_set_driver_private(dip, NULL);
3401	ddi_prop_remove_all(dip);
3402	cmn_err(CE_WARN, "xdf@%s: attach failed", ddi_get_name_addr(dip));
3403	return (DDI_FAILURE);
3404}
3405
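/*
 * Suspend entry point.  Tear down the I/O ring and move the device into
 * the XD_SUSPEND state.
 */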
3406static int
3407xdf_suspend(dev_info_t *dip)
3408{
3409	int		instance = ddi_get_instance(dip);
3410	xdf_t		*vdp;
3411
3412	if ((vdp = ddi_get_soft_state(xdf_ssp, instance)) == NULL)
3413		return (DDI_FAILURE);
3414
3415	if (xdf_debug & SUSRES_DBG)
3416		xen_printf("xdf@%s: xdf_suspend\n", vdp->xdf_addr);
3417
3418	xvdi_suspend(dip);
3419
3420	mutex_enter(&vdp->xdf_cb_lk);
3421	mutex_enter(&vdp->xdf_dev_lk);
3422
3423	vdp->xdf_suspending = B_TRUE;
3424	xdf_ring_destroy(vdp);
3425	xdf_set_state(vdp, XD_SUSPEND);
3426	vdp->xdf_suspending = B_FALSE;
3427
3428	mutex_exit(&vdp->xdf_dev_lk);
3429	mutex_exit(&vdp->xdf_cb_lk);
3430
3431	if (xdf_debug & SUSRES_DBG)
3432		xen_printf("xdf@%s: xdf_suspend: done\n", vdp->xdf_addr);
3433
3434	return (DDI_SUCCESS);
3435}
3436
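/*
 * Detach entry point.  The device must first reach the XD_CLOSED state;
 * once it has, all per-instance resources are torn down.
 */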
3437static int
3438xdf_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
3439{
3440	xdf_t *vdp;
3441	int instance;
3442
3443	switch (cmd) {
3444
3445	case DDI_PM_SUSPEND:
3446		break;
3447
3448	case DDI_SUSPEND:
3449		return (xdf_suspend(dip));
3450
3451	case DDI_DETACH:
3452		break;
3453
3454	default:
3455		return (DDI_FAILURE);
3456	}
3457
3458	instance = ddi_get_instance(dip);
3459	DPRINTF(DDI_DBG, ("xdf@%s: detaching\n", ddi_get_name_addr(dip)));
3460	vdp = ddi_get_soft_state(xdf_ssp, instance);
3461
3462	if (vdp == NULL)
3463		return (DDI_FAILURE);
3464
3465	mutex_enter(&vdp->xdf_cb_lk);
3466	xdf_disconnect(vdp, XD_CLOSED, B_FALSE);
3467	if (vdp->xdf_state != XD_CLOSED) {
3468		mutex_exit(&vdp->xdf_cb_lk);
3469		return (DDI_FAILURE);
3470	}
3471	mutex_exit(&vdp->xdf_cb_lk);
3472
3473	ASSERT(!ISDMACBON(vdp));
3474
3475#if defined(XPV_HVM_DRIVER)
3476	xdf_hvm_rm(dip);
3477#endif /* XPV_HVM_DRIVER */
3478
3479	if (vdp->xdf_timeout_id != 0)
3480		(void) untimeout(vdp->xdf_timeout_id);
3481
3482	xvdi_remove_event_handler(dip, XS_OE_STATE);
3483	ddi_taskq_destroy(vdp->xdf_ready_tq);
3484
3485	cmlb_detach(vdp->xdf_vd_lbl, NULL);
3486	cmlb_free_handle(&vdp->xdf_vd_lbl);
3487
3488	/* we'll support backend running in domU later */
3489#ifdef	DOMU_BACKEND
3490	(void) xvdi_post_event(dip, XEN_HP_REMOVE);
3491#endif
3492
3493	list_destroy(&vdp->xdf_vreq_act);
3494	ddi_prop_remove_all(dip);
3495	xdf_kstat_delete(dip);
3496	ddi_remove_softintr(vdp->xdf_softintr_id);
3497	xvdi_remove_xb_watch_handlers(dip);
3498	ddi_set_driver_private(dip, NULL);
3499	cv_destroy(&vdp->xdf_dev_cv);
3500	mutex_destroy(&vdp->xdf_cb_lk);
3501	mutex_destroy(&vdp->xdf_dev_lk);
3502	if (vdp->xdf_cache_flush_block != NULL)
3503		kmem_free(vdp->xdf_flush_mem, 2 * vdp->xdf_xdev_secsize);
3504	ddi_soft_state_free(xdf_ssp, instance);
3505	return (DDI_SUCCESS);
3506}
3507
3508/*
3509 * Driver linkage structures.
3510 */
3511static struct cb_ops xdf_cbops = {
3512	xdf_open,
3513	xdf_close,
3514	xdf_strategy,
3515	nodev,
3516	xdf_dump,
3517	xdf_read,
3518	xdf_write,
3519	xdf_ioctl,
3520	nodev,
3521	nodev,
3522	nodev,
3523	nochpoll,
3524	xdf_prop_op,
3525	NULL,
3526	D_MP | D_NEW | D_64BIT,
3527	CB_REV,
3528	xdf_aread,
3529	xdf_awrite
3530};
3531
3532struct dev_ops xdf_devops = {
3533	DEVO_REV,		/* devo_rev */
3534	0,			/* devo_refcnt */
3535	xdf_getinfo,		/* devo_getinfo */
3536	nulldev,		/* devo_identify */
3537	nulldev,		/* devo_probe */
3538	xdf_attach,		/* devo_attach */
3539	xdf_detach,		/* devo_detach */
3540	nodev,			/* devo_reset */
3541	&xdf_cbops,		/* devo_cb_ops */
3542	NULL,			/* devo_bus_ops */
3543	NULL,			/* devo_power */
3544	ddi_quiesce_not_supported, /* devo_quiesce */
3545};
3546
3547/*
3548 * Module linkage structures.
3549 */
3550static struct modldrv modldrv = {
3551	&mod_driverops,		/* Type of module.  This one is a driver */
3552	"virtual block driver",	/* short description */
3553	&xdf_devops		/* driver specific ops */
3554};
3555
3556static struct modlinkage xdf_modlinkage = {
3557	MODREV_1, (void *)&modldrv, NULL
3558};
3559
3560/*
3561 * standard module entry points
3562 */
3563int
3564_init(void)
3565{
3566	int rc;
3567
3568	xdf_major = ddi_name_to_major("xdf");
3569	if (xdf_major == (major_t)-1)
3570		return (EINVAL);
3571
3572	if ((rc = ddi_soft_state_init(&xdf_ssp, sizeof (xdf_t), 0)) != 0)
3573		return (rc);
3574
3575	xdf_vreq_cache = kmem_cache_create("xdf_vreq_cache",
3576	    sizeof (v_req_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
3577	xdf_gs_cache = kmem_cache_create("xdf_gs_cache",
3578	    sizeof (ge_slot_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
3579
3580#if defined(XPV_HVM_DRIVER)
3581	xdf_hvm_init();
3582#endif /* XPV_HVM_DRIVER */
3583
3584	if ((rc = mod_install(&xdf_modlinkage)) != 0) {
3585#if defined(XPV_HVM_DRIVER)
3586		xdf_hvm_fini();
3587#endif /* XPV_HVM_DRIVER */
3588		kmem_cache_destroy(xdf_vreq_cache);
3589		kmem_cache_destroy(xdf_gs_cache);
3590		ddi_soft_state_fini(&xdf_ssp);
3591		return (rc);
3592	}
3593
3594	return (rc);
3595}
3596
3597int
3598_fini(void)
3599{
3600	int err;
3601	if ((err = mod_remove(&xdf_modlinkage)) != 0)
3602		return (err);
3603
3604#if defined(XPV_HVM_DRIVER)
3605	xdf_hvm_fini();
3606#endif /* XPV_HVM_DRIVER */
3607
3608	kmem_cache_destroy(xdf_vreq_cache);
3609	kmem_cache_destroy(xdf_gs_cache);
3610	ddi_soft_state_fini(&xdf_ssp);
3611
3612	return (0);
3613}
3614
3615int
3616_info(struct modinfo *modinfop)
3617{
3618	return (mod_info(&xdf_modlinkage, modinfop));
3619}
3620