1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24 * Use is subject to license terms.
25 */
26
27/*
28 * xdf.c - Xen Virtual Block Device Driver
29 * TODO:
30 *	- support alternate block size (currently only DEV_BSIZE supported)
31 *	- revalidate geometry for removable devices
32 *
33 * This driver exports Solaris disk device nodes, accepts IO requests from
34 * those nodes, and services those requests by talking to a backend device
35 * in another domain.
36 *
37 * Communication with the backend device is done via a ringbuffer (which is
38 * managed via xvdi interfaces) and dma memory (which is managed via ddi
39 * interfaces).
40 *
41 * Communication with the backend device is dependent upon establishing a
42 * connection to the backend device.  This connection process involves
43 * reading device configuration information from xenbus and publishing
44 * some frontend runtime configuration parameters via the xenbus (for
45 * consumption by the backend).  Once we've published runtime configuration
46 * information via the xenbus, the backend device can enter the connected
47 * state and we'll enter the XD_CONNECTED state.  But before we can allow
48 * random IO to begin, we need to do IO to the backend device to determine
49 * the device label and if flush operations are supported.  Once this is
50 * done we enter the XD_READY state and can process any IO operations.
51 *
52 * We receive notifications of xenbus state changes for the backend device
53 * (aka, the "other end") via the xdf_oe_change() callback.  This callback
54 * is single threaded, meaning that we can't receive new notifications of
55 * other end state changes while we're processing an outstanding
56 * notification of an other end state change.  Therefore we can't do any
57 * blocking operations from the xdf_oe_change() callback.  This is why we
58 * have a separate taskq (xdf_ready_tq) which exists to do the necessary
59 * IO to get us from the XD_CONNECTED to the XD_READY state.  All IO
60 * generated by the xdf_ready_tq thread (xdf_ready_tq_thread) will go
61 * through xdf_lb_rdwr(), which is a synchronous IO interface.  IOs
62 * generated by the xdf_ready_tq_thread thread have priority over all
63 * other IO requests.
64 *
65 * We also communicate with the backend device via the xenbus "media-req"
66 * (XBP_MEDIA_REQ) property.  For more information on this see the
67 * comments in blkif.h.
68 */
69
70#include <io/xdf.h>
71
72#include <sys/conf.h>
73#include <sys/dkio.h>
74#include <sys/promif.h>
75#include <sys/sysmacros.h>
76#include <sys/kstat.h>
77#include <sys/mach_mmu.h>
78#ifdef XPV_HVM_DRIVER
79#include <sys/xpv_support.h>
80#include <sys/sunndi.h>
81#else /* !XPV_HVM_DRIVER */
82#include <sys/evtchn_impl.h>
83#endif /* !XPV_HVM_DRIVER */
84#include <public/io/xenbus.h>
85#include <xen/sys/xenbus_impl.h>
86#include <sys/scsi/generic/inquiry.h>
87#include <xen/io/blkif_impl.h>
88#include <sys/fdio.h>
89#include <sys/cdio.h>
90
91/*
92 * DEBUG_EVAL can be used to include debug only statements without
93 * having to use '#ifdef DEBUG' statements
94 */
95#ifdef DEBUG
96#define	DEBUG_EVAL(x)	(x)
97#else /* !DEBUG */
98#define	DEBUG_EVAL(x)
99#endif /* !DEBUG */
100
101#define	XDF_DRAIN_MSEC_DELAY		(50*1000)	/* 00.05 sec */
102#define	XDF_DRAIN_RETRY_COUNT		200		/* 10.00 sec */
103
104#define	INVALID_DOMID	((domid_t)-1)
105#define	FLUSH_DISKCACHE	0x1
106#define	WRITE_BARRIER	0x2
107#define	DEFAULT_FLUSH_BLOCK	156 /* block to write to cause a cache flush */
108#define	USE_WRITE_BARRIER(vdp)						\
109	((vdp)->xdf_feature_barrier && !(vdp)->xdf_flush_supported)
110#define	USE_FLUSH_DISKCACHE(vdp)					\
111	((vdp)->xdf_feature_barrier && (vdp)->xdf_flush_supported)
112#define	IS_WRITE_BARRIER(vdp, bp)					\
113	(!IS_READ(bp) && USE_WRITE_BARRIER(vdp) &&			\
114	((bp)->b_un.b_addr == (vdp)->xdf_cache_flush_block))
115#define	IS_FLUSH_DISKCACHE(bp)						\
116	(!IS_READ(bp) && USE_FLUSH_DISKCACHE(vdp) && ((bp)->b_bcount == 0))
117
118#define	VREQ_DONE(vreq)							\
119	VOID2BOOLEAN(((vreq)->v_status == VREQ_DMAWIN_DONE) &&		\
120	    (((vreq)->v_flush_diskcache == FLUSH_DISKCACHE) ||		\
121	    (((vreq)->v_dmaw + 1) == (vreq)->v_ndmaws)))
122
123#define	BP_VREQ(bp)		((v_req_t *)((bp)->av_back))
124#define	BP_VREQ_SET(bp, vreq)	(((bp)->av_back = (buf_t *)(vreq)))
125
126extern int		do_polled_io;
127
128/* run-time tunables that we don't want the compiler to optimize away */
129volatile int		xdf_debug = 0;
130volatile boolean_t	xdf_barrier_flush_disable = B_FALSE;
131
132/* per module globals */
133major_t			xdf_major;
134static void		*xdf_ssp;
135static kmem_cache_t	*xdf_vreq_cache;
136static kmem_cache_t	*xdf_gs_cache;
137static int		xdf_maxphys = XB_MAXPHYS;
138static diskaddr_t	xdf_flush_block = DEFAULT_FLUSH_BLOCK;
139static int		xdf_fbrewrites;	/* flush block re-write count */
140
141/* misc public functions (used by xdf_shell.c) */
142int xdf_lb_rdwr(dev_info_t *, uchar_t, void *, diskaddr_t, size_t, void *);
143int xdf_lb_getinfo(dev_info_t *, int, void *, void *);
144
145/*  misc private functions */
146static void xdf_io_start(xdf_t *);
147
147/* callbacks from common label */
149static cmlb_tg_ops_t xdf_lb_ops = {
150	TG_DK_OPS_VERSION_1,
151	xdf_lb_rdwr,
152	xdf_lb_getinfo
153};
154
155/*
156 * I/O buffer DMA attributes
156 * Make sure one DMA window contains at most BLKIF_MAX_SEGMENTS_PER_REQUEST segments
158 */
159static ddi_dma_attr_t xb_dma_attr = {
160	DMA_ATTR_V0,
161	(uint64_t)0,			/* lowest address */
162	(uint64_t)0xffffffffffffffff,	/* highest usable address */
163	(uint64_t)0xffffff,		/* DMA counter limit max */
164	(uint64_t)XB_BSIZE,		/* alignment in bytes */
165	XB_BSIZE - 1,			/* bitmap of burst sizes */
166	XB_BSIZE,			/* min transfer */
167	(uint64_t)XB_MAX_XFER, 		/* maximum transfer */
168	(uint64_t)PAGEOFFSET,		/* 1 page segment length  */
169	BLKIF_MAX_SEGMENTS_PER_REQUEST,	/* maximum number of segments */
170	XB_BSIZE,			/* granularity */
171	0,				/* flags (reserved) */
172};
173
174static ddi_device_acc_attr_t xc_acc_attr = {
175	DDI_DEVICE_ATTR_V0,
176	DDI_NEVERSWAP_ACC,
177	DDI_STRICTORDER_ACC
178};
179
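/*
 * Timeout callback, scheduled when a v_req_t or ge_slot_t allocation
 * failed; clear the pending timeout id and restart I/O processing.
 */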
180static void
181xdf_timeout_handler(void *arg)
182{
183	xdf_t *vdp = arg;
184
185	mutex_enter(&vdp->xdf_dev_lk);
186	vdp->xdf_timeout_id = 0;
187	mutex_exit(&vdp->xdf_dev_lk);
188
189	/* new timeout thread could be re-scheduled */
190	xdf_io_start(vdp);
191}
192
193/*
194 * callback func invoked when DMA/GTE resources become available
195 *
196 * Note: we only register one callback function to grant table subsystem
197 * since we only have one 'struct gnttab_free_callback' in xdf_t.
198 */
199static int
200xdf_dmacallback(caddr_t arg)
201{
202	xdf_t *vdp = (xdf_t *)arg;
203	ASSERT(vdp != NULL);
204
205	DPRINTF(DMA_DBG, ("xdf@%s: DMA callback started\n",
206	    vdp->xdf_addr));
207
208	ddi_trigger_softintr(vdp->xdf_softintr_id);
209	return (DDI_DMA_CALLBACK_DONE);
210}
211
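/*
 * Allocate a ge_slot_t (a group of grant table entries) for a single
 * I/O request.  If grant references or the slot itself can't be
 * allocated, arrange for I/O to be restarted later (via the grant table
 * free callback or a timeout) and return NULL.
 */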
212static ge_slot_t *
213gs_get(xdf_t *vdp, int isread)
214{
215	grant_ref_t gh;
216	ge_slot_t *gs;
217
218	/* try to alloc GTEs needed in this slot, first */
219	if (gnttab_alloc_grant_references(
220	    BLKIF_MAX_SEGMENTS_PER_REQUEST, &gh) == -1) {
221		if (vdp->xdf_gnt_callback.next == NULL) {
222			SETDMACBON(vdp);
223			gnttab_request_free_callback(
224			    &vdp->xdf_gnt_callback,
225			    (void (*)(void *))xdf_dmacallback,
226			    (void *)vdp,
227			    BLKIF_MAX_SEGMENTS_PER_REQUEST);
228		}
229		return (NULL);
230	}
231
232	gs = kmem_cache_alloc(xdf_gs_cache, KM_NOSLEEP);
233	if (gs == NULL) {
234		gnttab_free_grant_references(gh);
235		if (vdp->xdf_timeout_id == 0)
236			/* restart I/O after one second */
237			vdp->xdf_timeout_id =
238			    timeout(xdf_timeout_handler, vdp, hz);
239		return (NULL);
240	}
241
242	/* init gs_slot */
243	gs->gs_oeid = vdp->xdf_peer;
244	gs->gs_isread = isread;
245	gs->gs_ghead = gh;
246	gs->gs_ngrefs = 0;
247
248	return (gs);
249}
250
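/*
 * End foreign access for all grant references held by a ge_slot_t,
 * unlink it from its v_req_t, and free it.
 */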
251static void
252gs_free(ge_slot_t *gs)
253{
254	int		i;
255
256	/* release all grant table entry resources used in this slot */
257	for (i = 0; i < gs->gs_ngrefs; i++)
258		gnttab_end_foreign_access(gs->gs_ge[i], !gs->gs_isread, 0);
259	gnttab_free_grant_references(gs->gs_ghead);
260	list_remove(&gs->gs_vreq->v_gs, gs);
261	kmem_cache_free(xdf_gs_cache, gs);
262}
263
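/*
 * Claim a grant reference from the slot and grant the backend domain
 * access to the page at mfn.
 */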
264static grant_ref_t
265gs_grant(ge_slot_t *gs, mfn_t mfn)
266{
267	grant_ref_t gr = gnttab_claim_grant_reference(&gs->gs_ghead);
268
269	ASSERT(gr != -1);
270	ASSERT(gs->gs_ngrefs < BLKIF_MAX_SEGMENTS_PER_REQUEST);
271	gs->gs_ge[gs->gs_ngrefs++] = gr;
272	gnttab_grant_foreign_access_ref(gr, gs->gs_oeid, mfn, !gs->gs_isread);
273
274	return (gr);
275}
276
277/*
278 * Alloc a vreq for this bp
279 * bp->av_back contains the pointer to the vreq upon return
280 */
281static v_req_t *
282vreq_get(xdf_t *vdp, buf_t *bp)
283{
284	v_req_t *vreq = NULL;
285
286	ASSERT(BP_VREQ(bp) == NULL);
287
288	vreq = kmem_cache_alloc(xdf_vreq_cache, KM_NOSLEEP);
289	if (vreq == NULL) {
290		if (vdp->xdf_timeout_id == 0)
291			/* restart I/O after one second */
292			vdp->xdf_timeout_id =
293			    timeout(xdf_timeout_handler, vdp, hz);
294		return (NULL);
295	}
296	bzero(vreq, sizeof (v_req_t));
297	list_create(&vreq->v_gs, sizeof (ge_slot_t),
298	    offsetof(ge_slot_t, gs_vreq_link));
299	vreq->v_buf = bp;
300	vreq->v_status = VREQ_INIT;
301	vreq->v_runq = B_FALSE;
302	BP_VREQ_SET(bp, vreq);
303	/* init of other fields in vreq is up to the caller */
304
305	list_insert_head(&vdp->xdf_vreq_act, (void *)vreq);
306
307	return (vreq);
308}
309
310static void
311vreq_free(xdf_t *vdp, v_req_t *vreq)
312{
313	buf_t	*bp = vreq->v_buf;
314
315	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
316	ASSERT(BP_VREQ(bp) == vreq);
317
318	list_remove(&vdp->xdf_vreq_act, vreq);
319
320	if (vreq->v_flush_diskcache == FLUSH_DISKCACHE)
321		goto done;
322
323	switch (vreq->v_status) {
324	case VREQ_DMAWIN_DONE:
325	case VREQ_GS_ALLOCED:
326	case VREQ_DMABUF_BOUND:
327		(void) ddi_dma_unbind_handle(vreq->v_dmahdl);
328		/*FALLTHRU*/
329	case VREQ_DMAMEM_ALLOCED:
330		if (!ALIGNED_XFER(bp)) {
331			ASSERT(vreq->v_abuf != NULL);
332			if (!IS_ERROR(bp) && IS_READ(bp))
333				bcopy(vreq->v_abuf, bp->b_un.b_addr,
334				    bp->b_bcount);
335			ddi_dma_mem_free(&vreq->v_align);
336		}
337		/*FALLTHRU*/
338	case VREQ_MEMDMAHDL_ALLOCED:
339		if (!ALIGNED_XFER(bp))
340			ddi_dma_free_handle(&vreq->v_memdmahdl);
341		/*FALLTHRU*/
342	case VREQ_DMAHDL_ALLOCED:
343		ddi_dma_free_handle(&vreq->v_dmahdl);
344		break;
345	default:
346		break;
347	}
348done:
349	ASSERT(!vreq->v_runq);
350	list_destroy(&vreq->v_gs);
351	kmem_cache_free(xdf_vreq_cache, vreq);
352}
353
354/*
355 * Snarf new data if our flush block was re-written
356 */
357static void
358check_fbwrite(xdf_t *vdp, buf_t *bp, daddr_t blkno)
359{
360	int nblks;
361	boolean_t mapin;
362
363	if (IS_WRITE_BARRIER(vdp, bp))
364		return; /* write was a flush write */
365
366	mapin = B_FALSE;
367	nblks = bp->b_bcount >> DEV_BSHIFT;
368	if (xdf_flush_block >= blkno && xdf_flush_block < (blkno + nblks)) {
369		xdf_fbrewrites++;
370		if (bp->b_flags & (B_PAGEIO | B_PHYS)) {
371			mapin = B_TRUE;
372			bp_mapin(bp);
373		}
374		bcopy(bp->b_un.b_addr +
375		    ((xdf_flush_block - blkno) << DEV_BSHIFT),
376		    vdp->xdf_cache_flush_block, DEV_BSIZE);
377		if (mapin)
378			bp_mapout(bp);
379	}
380}
381
382/*
383 * Initialize the DMA and grant table resources for the buf
384 */
385static int
386vreq_setup(xdf_t *vdp, v_req_t *vreq)
387{
388	int rc;
389	ddi_dma_attr_t dmaattr;
390	uint_t ndcs, ndws;
391	ddi_dma_handle_t dh;
392	ddi_dma_handle_t mdh;
393	ddi_dma_cookie_t dc;
394	ddi_acc_handle_t abh;
395	caddr_t	aba;
396	ge_slot_t *gs;
397	size_t bufsz;
398	off_t off;
399	size_t sz;
400	buf_t *bp = vreq->v_buf;
401	int dma_flags = (IS_READ(bp) ? DDI_DMA_READ : DDI_DMA_WRITE) |
402	    DDI_DMA_STREAMING | DDI_DMA_PARTIAL;
403
404	switch (vreq->v_status) {
405	case VREQ_INIT:
406		if (IS_FLUSH_DISKCACHE(bp)) {
407			if ((gs = gs_get(vdp, IS_READ(bp))) == NULL) {
408				DPRINTF(DMA_DBG, ("xdf@%s: "
409				    "get ge_slot failed\n", vdp->xdf_addr));
410				return (DDI_FAILURE);
411			}
412			vreq->v_blkno = 0;
413			vreq->v_nslots = 1;
414			vreq->v_flush_diskcache = FLUSH_DISKCACHE;
415			vreq->v_status = VREQ_GS_ALLOCED;
416			gs->gs_vreq = vreq;
417			list_insert_head(&vreq->v_gs, gs);
418			return (DDI_SUCCESS);
419		}
420
421		if (IS_WRITE_BARRIER(vdp, bp))
422			vreq->v_flush_diskcache = WRITE_BARRIER;
423		vreq->v_blkno = bp->b_blkno +
424		    (diskaddr_t)(uintptr_t)bp->b_private;
425		/* See if we wrote new data to our flush block */
426		if (!IS_READ(bp) && USE_WRITE_BARRIER(vdp))
427			check_fbwrite(vdp, bp, vreq->v_blkno);
428		vreq->v_status = VREQ_INIT_DONE;
429		/*FALLTHRU*/
430
431	case VREQ_INIT_DONE:
432		/*
433		 * alloc DMA handle
434		 */
435		rc = ddi_dma_alloc_handle(vdp->xdf_dip, &xb_dma_attr,
436		    xdf_dmacallback, (caddr_t)vdp, &dh);
437		if (rc != DDI_SUCCESS) {
438			SETDMACBON(vdp);
439			DPRINTF(DMA_DBG, ("xdf@%s: DMA handle alloc failed\n",
440			    vdp->xdf_addr));
441			return (DDI_FAILURE);
442		}
443
444		vreq->v_dmahdl = dh;
445		vreq->v_status = VREQ_DMAHDL_ALLOCED;
446		/*FALLTHRU*/
447
448	case VREQ_DMAHDL_ALLOCED:
449		/*
450		 * alloc dma handle for 512-byte aligned buf
451		 */
452		if (!ALIGNED_XFER(bp)) {
453			/*
454			 * XXPV: we need to temporarily enlarge the seg
455			 * boundary and s/g length to work around CR6381968
456			 */
457			dmaattr = xb_dma_attr;
458			dmaattr.dma_attr_seg = (uint64_t)-1;
459			dmaattr.dma_attr_sgllen = INT_MAX;
460			rc = ddi_dma_alloc_handle(vdp->xdf_dip, &dmaattr,
461			    xdf_dmacallback, (caddr_t)vdp, &mdh);
462			if (rc != DDI_SUCCESS) {
463				SETDMACBON(vdp);
464				DPRINTF(DMA_DBG, ("xdf@%s: "
465				    "unaligned buf DMAhandle alloc failed\n",
466				    vdp->xdf_addr));
467				return (DDI_FAILURE);
468			}
469			vreq->v_memdmahdl = mdh;
470			vreq->v_status = VREQ_MEMDMAHDL_ALLOCED;
471		}
472		/*FALLTHRU*/
473
474	case VREQ_MEMDMAHDL_ALLOCED:
475		/*
476		 * alloc 512-byte aligned buf
477		 */
478		if (!ALIGNED_XFER(bp)) {
479			if (bp->b_flags & (B_PAGEIO | B_PHYS))
480				bp_mapin(bp);
481
482			rc = ddi_dma_mem_alloc(vreq->v_memdmahdl,
483			    roundup(bp->b_bcount, XB_BSIZE), &xc_acc_attr,
484			    DDI_DMA_STREAMING, xdf_dmacallback, (caddr_t)vdp,
485			    &aba, &bufsz, &abh);
486			if (rc != DDI_SUCCESS) {
487				SETDMACBON(vdp);
488				DPRINTF(DMA_DBG, ("xdf@%s: "
489				    "DMA mem allocation failed\n",
490				    vdp->xdf_addr));
491				return (DDI_FAILURE);
492			}
493
494			vreq->v_abuf = aba;
495			vreq->v_align = abh;
496			vreq->v_status = VREQ_DMAMEM_ALLOCED;
497
498			ASSERT(bufsz >= bp->b_bcount);
499			if (!IS_READ(bp))
500				bcopy(bp->b_un.b_addr, vreq->v_abuf,
501				    bp->b_bcount);
502		}
503		/*FALLTHRU*/
504
505	case VREQ_DMAMEM_ALLOCED:
506		/*
507		 * dma bind
508		 */
509		if (ALIGNED_XFER(bp)) {
510			rc = ddi_dma_buf_bind_handle(vreq->v_dmahdl, bp,
511			    dma_flags, xdf_dmacallback, (caddr_t)vdp,
512			    &dc, &ndcs);
513		} else {
514			rc = ddi_dma_addr_bind_handle(vreq->v_dmahdl,
515			    NULL, vreq->v_abuf, bp->b_bcount, dma_flags,
516			    xdf_dmacallback, (caddr_t)vdp, &dc, &ndcs);
517		}
518		if (rc == DDI_DMA_MAPPED || rc == DDI_DMA_PARTIAL_MAP) {
519			/* get num of dma windows */
520			if (rc == DDI_DMA_PARTIAL_MAP) {
521				rc = ddi_dma_numwin(vreq->v_dmahdl, &ndws);
522				ASSERT(rc == DDI_SUCCESS);
523			} else {
524				ndws = 1;
525			}
526		} else {
527			SETDMACBON(vdp);
528			DPRINTF(DMA_DBG, ("xdf@%s: DMA bind failed\n",
529			    vdp->xdf_addr));
530			return (DDI_FAILURE);
531		}
532
533		vreq->v_dmac = dc;
534		vreq->v_dmaw = 0;
535		vreq->v_ndmacs = ndcs;
536		vreq->v_ndmaws = ndws;
537		vreq->v_nslots = ndws;
538		vreq->v_status = VREQ_DMABUF_BOUND;
539		/*FALLTHRU*/
540
541	case VREQ_DMABUF_BOUND:
542		/*
543		 * get ge_slot, callback is set upon failure from gs_get(),
544		 * if not set previously
545		 */
546		if ((gs = gs_get(vdp, IS_READ(bp))) == NULL) {
547			DPRINTF(DMA_DBG, ("xdf@%s: get ge_slot failed\n",
548			    vdp->xdf_addr));
549			return (DDI_FAILURE);
550		}
551
552		vreq->v_status = VREQ_GS_ALLOCED;
553		gs->gs_vreq = vreq;
554		list_insert_head(&vreq->v_gs, gs);
555		break;
556
557	case VREQ_GS_ALLOCED:
558		/* nothing needs to be done */
559		break;
560
561	case VREQ_DMAWIN_DONE:
562		/*
563		 * move to the next dma window
564		 */
565		ASSERT((vreq->v_dmaw + 1) < vreq->v_ndmaws);
566
567		/* get a ge_slot for this DMA window */
568		if ((gs = gs_get(vdp, IS_READ(bp))) == NULL) {
569			DPRINTF(DMA_DBG, ("xdf@%s: get ge_slot failed\n",
570			    vdp->xdf_addr));
571			return (DDI_FAILURE);
572		}
573
574		vreq->v_dmaw++;
575		VERIFY(ddi_dma_getwin(vreq->v_dmahdl, vreq->v_dmaw, &off, &sz,
576		    &vreq->v_dmac, &vreq->v_ndmacs) == DDI_SUCCESS);
577		vreq->v_status = VREQ_GS_ALLOCED;
578		gs->gs_vreq = vreq;
579		list_insert_head(&vreq->v_gs, gs);
580		break;
581
582	default:
583		return (DDI_FAILURE);
584	}
585
586	return (DDI_SUCCESS);
587}
588
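/* Register this device with the common disk label (cmlb) framework. */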
589static int
590xdf_cmlb_attach(xdf_t *vdp)
591{
592	dev_info_t	*dip = vdp->xdf_dip;
593
594	return (cmlb_attach(dip, &xdf_lb_ops,
595	    XD_IS_CD(vdp) ? DTYPE_RODIRECT : DTYPE_DIRECT,
596	    XD_IS_RM(vdp),
597	    B_TRUE,
598	    XD_IS_CD(vdp) ? DDI_NT_CD_XVMD : DDI_NT_BLOCK_XVMD,
599#if defined(XPV_HVM_DRIVER)
600	    (XD_IS_CD(vdp) ? 0 : CMLB_CREATE_ALTSLICE_VTOC_16_DTYPE_DIRECT) |
601	    CMLB_INTERNAL_MINOR_NODES,
602#else /* !XPV_HVM_DRIVER */
603	    XD_IS_CD(vdp) ? 0 : CMLB_FAKE_LABEL_ONE_PARTITION,
604#endif /* !XPV_HVM_DRIVER */
605	    vdp->xdf_vd_lbl, NULL));
606}
607
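/*
 * Complete a buf with an error.  A resid of 0 means that nothing was
 * transferred, so the entire b_bcount is reported as residual.
 */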
608static void
609xdf_io_err(buf_t *bp, int err, size_t resid)
610{
611	bioerror(bp, err);
612	if (resid == 0)
613		bp->b_resid = bp->b_bcount;
614	biodone(bp);
615}
616
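/*
 * kstat queue accounting.  A buf is counted on the runq once it has been
 * assigned ring buffer resources (vreq->v_runq is set); otherwise it is
 * counted on the waitq.  All updates are done under xdf_dev_lk.
 */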
617static void
618xdf_kstat_enter(xdf_t *vdp, buf_t *bp)
619{
620	v_req_t *vreq = BP_VREQ(bp);
621
622	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
623
624	if (vdp->xdf_xdev_iostat == NULL)
625		return;
626	if ((vreq != NULL) && vreq->v_runq) {
627		kstat_runq_enter(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
628	} else {
629		kstat_waitq_enter(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
630	}
631}
632
633static void
634xdf_kstat_exit(xdf_t *vdp, buf_t *bp)
635{
636	v_req_t *vreq = BP_VREQ(bp);
637
638	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
639
640	if (vdp->xdf_xdev_iostat == NULL)
641		return;
642	if ((vreq != NULL) && vreq->v_runq) {
643		kstat_runq_exit(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
644	} else {
645		kstat_waitq_exit(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
646	}
647}
648
649static void
650xdf_kstat_waitq_to_runq(xdf_t *vdp, buf_t *bp)
651{
652	v_req_t *vreq = BP_VREQ(bp);
653
654	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
655	ASSERT(!vreq->v_runq);
656
657	vreq->v_runq = B_TRUE;
658	if (vdp->xdf_xdev_iostat == NULL)
659		return;
660	kstat_waitq_to_runq(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
661}
662
663static void
664xdf_kstat_runq_to_waitq(xdf_t *vdp, buf_t *bp)
665{
666	v_req_t *vreq = BP_VREQ(bp);
667
668	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
669	ASSERT(vreq->v_runq);
670
671	vreq->v_runq = B_FALSE;
672	if (vdp->xdf_xdev_iostat == NULL)
673		return;
674	kstat_runq_back_to_waitq(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
675}
676
677int
678xdf_kstat_create(dev_info_t *dip, char *ks_module, int instance)
679{
680	xdf_t		*vdp = (xdf_t *)ddi_get_driver_private(dip);
681	kstat_t		*kstat;
682	buf_t		*bp;
683
684	if ((kstat = kstat_create(
685	    ks_module, instance, NULL, "disk",
686	    KSTAT_TYPE_IO, 1, KSTAT_FLAG_PERSISTENT)) == NULL)
687		return (-1);
688
689	/* See comment about locking in xdf_kstat_delete(). */
690	mutex_enter(&vdp->xdf_iostat_lk);
691	mutex_enter(&vdp->xdf_dev_lk);
692
693	/* only one kstat can exist at a time */
694	if (vdp->xdf_xdev_iostat != NULL) {
695		mutex_exit(&vdp->xdf_dev_lk);
696		mutex_exit(&vdp->xdf_iostat_lk);
697		kstat_delete(kstat);
698		return (-1);
699	}
700
701	vdp->xdf_xdev_iostat = kstat;
702	vdp->xdf_xdev_iostat->ks_lock = &vdp->xdf_dev_lk;
703	kstat_install(vdp->xdf_xdev_iostat);
704
705	/*
706	 * Now that we've created a kstat, we need to update the waitq and
707	 * runq counts for the kstat to reflect our current state.
708	 *
709	 * For a buf_t structure to be on the runq, it must have a ring
710	 * buffer slot associated with it.  To get a ring buffer slot the
711	 * buf must first have a v_req_t and a ge_slot_t associated with it.
712	 * Then when it is granted a ring buffer slot, v_runq will be set to
713	 * true.
714	 *
715	 * For a buf_t structure to be on the waitq, it must not be on the
716	 * runq.  So to find all the buf_t's that should be on waitq, we
717	 * walk the active buf list and add any buf_t's which aren't on the
718	 * runq to the waitq.
719	 */
720	bp = vdp->xdf_f_act;
721	while (bp != NULL) {
722		xdf_kstat_enter(vdp, bp);
723		bp = bp->av_forw;
724	}
725	if (vdp->xdf_ready_tq_bp != NULL)
726		xdf_kstat_enter(vdp, vdp->xdf_ready_tq_bp);
727
728	mutex_exit(&vdp->xdf_dev_lk);
729	mutex_exit(&vdp->xdf_iostat_lk);
730	return (0);
731}
732
733void
734xdf_kstat_delete(dev_info_t *dip)
735{
736	xdf_t		*vdp = (xdf_t *)ddi_get_driver_private(dip);
737	kstat_t		*kstat;
738	buf_t		*bp;
739
740	/*
741	 * The locking order here is xdf_iostat_lk and then xdf_dev_lk.
742	 * xdf_dev_lk is used to protect the xdf_xdev_iostat pointer
743	 * and the contents of our kstat.  xdf_iostat_lk is used
744	 * to protect the allocation and freeing of the actual kstat.
745	 * xdf_dev_lk can't be used for this purpose because kstat
746	 * readers use it to access the contents of the kstat and
747	 * hence it can't be held when calling kstat_delete().
748	 */
749	mutex_enter(&vdp->xdf_iostat_lk);
750	mutex_enter(&vdp->xdf_dev_lk);
751
752	if (vdp->xdf_xdev_iostat == NULL) {
753		mutex_exit(&vdp->xdf_dev_lk);
754		mutex_exit(&vdp->xdf_iostat_lk);
755		return;
756	}
757
758	/*
759	 * We're about to destroy the kstat structures, so it isn't really
760	 * necessary to update the runq and waitq counts.  But, since this
761	 * isn't a hot code path we can afford to be a little pedantic and
762	 * go ahead and decrement the runq and waitq kstat counters to zero
763	 * before free'ing them.  This helps us ensure that we've gotten all
764	 * our accounting correct.
765	 *
766	 * For an explanation of how we determine which buffers go on the
767	 * runq vs which go on the waitq, see the comments in
768	 * xdf_kstat_create().
769	 */
770	bp = vdp->xdf_f_act;
771	while (bp != NULL) {
772		xdf_kstat_exit(vdp, bp);
773		bp = bp->av_forw;
774	}
775	if (vdp->xdf_ready_tq_bp != NULL)
776		xdf_kstat_exit(vdp, vdp->xdf_ready_tq_bp);
777
778	kstat = vdp->xdf_xdev_iostat;
779	vdp->xdf_xdev_iostat = NULL;
780	mutex_exit(&vdp->xdf_dev_lk);
781	kstat_delete(kstat);
782	mutex_exit(&vdp->xdf_iostat_lk);
783}
784
785/*
786 * Add an IO request onto the active queue.
787 *
788 * We have to detect IOs generated by xdf_ready_tq_thread.  These IOs
789 * are used to establish a connection to the backend, so they receive
790 * priority over all other IOs.  Since xdf_ready_tq_thread only does
791 * synchronous IO, there can only be one xdf_ready_tq_thread request at any
792 * given time and we record the buf associated with that request in
793 * xdf_ready_tq_bp.
794 */
795static void
796xdf_bp_push(xdf_t *vdp, buf_t *bp)
797{
798	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
799	ASSERT(bp->av_forw == NULL);
800
801	xdf_kstat_enter(vdp, bp);
802
803	if (curthread == vdp->xdf_ready_tq_thread) {
804		/* new IO requests from the ready thread */
805		ASSERT(vdp->xdf_ready_tq_bp == NULL);
806		vdp->xdf_ready_tq_bp = bp;
807		return;
808	}
809
810	/* this is a normal IO request */
811	ASSERT(bp != vdp->xdf_ready_tq_bp);
812
813	if (vdp->xdf_f_act == NULL) {
814		/* this is the only IO on the active queue */
815		ASSERT(vdp->xdf_l_act == NULL);
816		ASSERT(vdp->xdf_i_act == NULL);
817		vdp->xdf_f_act = vdp->xdf_l_act = vdp->xdf_i_act = bp;
818		return;
819	}
820
821	/* add this IO to the tail of the active queue */
822	vdp->xdf_l_act->av_forw = bp;
823	vdp->xdf_l_act = bp;
824	if (vdp->xdf_i_act == NULL)
825		vdp->xdf_i_act = bp;
826}
827
828static void
829xdf_bp_pop(xdf_t *vdp, buf_t *bp)
830{
831	buf_t	*bp_iter;
832
833	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
834	ASSERT(VREQ_DONE(BP_VREQ(bp)));
835
836	if (vdp->xdf_ready_tq_bp == bp) {
837		/* we're done with a ready thread IO request */
838		ASSERT(bp->av_forw == NULL);
839		vdp->xdf_ready_tq_bp = NULL;
840		return;
841	}
842
843	/* we're done with a normal IO request */
844	ASSERT((bp->av_forw != NULL) || (bp == vdp->xdf_l_act));
845	ASSERT((bp->av_forw == NULL) || (bp != vdp->xdf_l_act));
846	ASSERT(VREQ_DONE(BP_VREQ(vdp->xdf_f_act)));
847	ASSERT(vdp->xdf_f_act != vdp->xdf_i_act);
848
849	if (bp == vdp->xdf_f_act) {
850		/* This IO was at the head of our active queue. */
851		vdp->xdf_f_act = bp->av_forw;
852		if (bp == vdp->xdf_l_act)
853			vdp->xdf_l_act = NULL;
854	} else {
855		/* This IO finished before some other pending IOs. */
856		bp_iter = vdp->xdf_f_act;
857		while (bp != bp_iter->av_forw) {
858			bp_iter = bp_iter->av_forw;
859			ASSERT(VREQ_DONE(BP_VREQ(bp_iter)));
860			ASSERT(bp_iter != vdp->xdf_i_act);
861		}
862		bp_iter->av_forw = bp->av_forw;
863		if (bp == vdp->xdf_l_act)
864			vdp->xdf_l_act = bp_iter;
865	}
866	bp->av_forw = NULL;
867}
868
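/*
 * Return the next buf on the active queue that still needs processing,
 * or NULL if no I/O can be issued in the current state.
 */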
869static buf_t *
870xdf_bp_next(xdf_t *vdp)
871{
872	v_req_t	*vreq;
873	buf_t	*bp;
874
875	if (vdp->xdf_state == XD_CONNECTED) {
876		/*
877		 * If we're in the XD_CONNECTED state, we only service IOs
878		 * from the xdf_ready_tq_thread thread.
879		 */
880		if ((bp = vdp->xdf_ready_tq_bp) == NULL)
881			return (NULL);
882		if (((vreq = BP_VREQ(bp)) == NULL) || (!VREQ_DONE(vreq)))
883			return (bp);
884		return (NULL);
885	}
886
887	/* if we're not in the XD_CONNECTED or XD_READY state we can't do IO */
888	if (vdp->xdf_state != XD_READY)
889		return (NULL);
890
891	ASSERT(vdp->xdf_ready_tq_bp == NULL);
892	for (;;) {
893		if ((bp = vdp->xdf_i_act) == NULL)
894			return (NULL);
895		if (((vreq = BP_VREQ(bp)) == NULL) || (!VREQ_DONE(vreq)))
896			return (bp);
897
898		/* advance the active buf index pointer */
899		vdp->xdf_i_act = bp->av_forw;
900	}
901}
902
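/*
 * Complete the ring request identified by id (a ge_slot_t pointer).
 * Free the slot and, once all DMA windows for the buf have completed,
 * remove the buf from the active queue and biodone() it.
 */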
903static void
904xdf_io_fini(xdf_t *vdp, uint64_t id, int bioerr)
905{
906	ge_slot_t	*gs = (ge_slot_t *)(uintptr_t)id;
907	v_req_t		*vreq = gs->gs_vreq;
908	buf_t		*bp = vreq->v_buf;
909
910	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
911	ASSERT(BP_VREQ(bp) == vreq);
912
913	gs_free(gs);
914
915	if (bioerr != 0)
916		bioerror(bp, bioerr);
917	ASSERT(vreq->v_nslots > 0);
918	if (--vreq->v_nslots > 0)
919		return;
920
921	/* remove this IO from our active queue */
922	xdf_bp_pop(vdp, bp);
923
924	ASSERT(vreq->v_runq);
925	xdf_kstat_exit(vdp, bp);
926	vreq->v_runq = B_FALSE;
927	vreq_free(vdp, vreq);
928
929	if (IS_ERROR(bp)) {
930		xdf_io_err(bp, geterror(bp), 0);
931	} else if (bp->b_resid != 0) {
932		/* Partial transfers are an error */
933		xdf_io_err(bp, EIO, bp->b_resid);
934	} else {
935		biodone(bp);
936	}
937}
938
939/*
940 * xdf interrupt handler
941 */
942static uint_t
943xdf_intr_locked(xdf_t *vdp)
944{
945	xendev_ring_t *xbr;
946	blkif_response_t *resp;
947	int bioerr;
948	uint64_t id;
949	uint8_t op;
950	uint16_t status;
951	ddi_acc_handle_t acchdl;
952
953	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
954
955	if ((xbr = vdp->xdf_xb_ring) == NULL)
956		return (DDI_INTR_UNCLAIMED);
957
958	acchdl = vdp->xdf_xb_ring_hdl;
959
960	/*
961	 * complete all requests which have a response
962	 */
963	while (resp = xvdi_ring_get_response(xbr)) {
964		id = ddi_get64(acchdl, &resp->id);
965		op = ddi_get8(acchdl, &resp->operation);
966		status = ddi_get16(acchdl, (uint16_t *)&resp->status);
967		DPRINTF(INTR_DBG, ("resp: op %d id %"PRIu64" status %d\n",
968		    op, id, status));
969
970		if (status != BLKIF_RSP_OKAY) {
971			DPRINTF(IO_DBG, ("xdf@%s: I/O error while %s",
972			    vdp->xdf_addr,
973			    (op == BLKIF_OP_READ) ? "reading" : "writing"));
974			bioerr = EIO;
975		} else {
976			bioerr = 0;
977		}
978
979		xdf_io_fini(vdp, id, bioerr);
980	}
981	return (DDI_INTR_CLAIMED);
982}
983
984static uint_t
985xdf_intr(caddr_t arg)
986{
987	xdf_t *vdp = (xdf_t *)arg;
988	int rv;
989
990	mutex_enter(&vdp->xdf_dev_lk);
991	rv = xdf_intr_locked(vdp);
992	mutex_exit(&vdp->xdf_dev_lk);
993
994	if (!do_polled_io)
995		xdf_io_start(vdp);
996
997	return (rv);
998}
999
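/*
 * Push any newly produced ring requests to the backend and notify it
 * via the event channel.
 */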
1000static void
1001xdf_ring_push(xdf_t *vdp)
1002{
1003	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
1004
1005	if (vdp->xdf_xb_ring == NULL)
1006		return;
1007
1008	if (xvdi_ring_push_request(vdp->xdf_xb_ring)) {
1009		DPRINTF(IO_DBG, (
1010		    "xdf@%s: xdf_ring_push: sent request(s) to backend\n",
1011		    vdp->xdf_addr));
1012	}
1013
1014	if (xvdi_get_evtchn(vdp->xdf_dip) != INVALID_EVTCHN)
1015		xvdi_notify_oe(vdp->xdf_dip);
1016}
1017
1018static int
1019xdf_ring_drain_locked(xdf_t *vdp)
1020{
1021	int		pollc, rv = 0;
1022
1023	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
1024
1025	if (xdf_debug & SUSRES_DBG)
1026		xen_printf("xdf_ring_drain: start\n");
1027
1028	for (pollc = 0; pollc < XDF_DRAIN_RETRY_COUNT; pollc++) {
1029		if (vdp->xdf_xb_ring == NULL)
1030			goto out;
1031
1032		if (xvdi_ring_has_unconsumed_responses(vdp->xdf_xb_ring))
1033			(void) xdf_intr_locked(vdp);
1034		if (!xvdi_ring_has_incomp_request(vdp->xdf_xb_ring))
1035			goto out;
1036		xdf_ring_push(vdp);
1037
1038		/* file-backed devices can be slow */
1039		mutex_exit(&vdp->xdf_dev_lk);
1040#ifdef XPV_HVM_DRIVER
1041		(void) HYPERVISOR_yield();
1042#endif /* XPV_HVM_DRIVER */
1043		delay(drv_usectohz(XDF_DRAIN_MSEC_DELAY));
1044		mutex_enter(&vdp->xdf_dev_lk);
1045	}
1046	cmn_err(CE_WARN, "xdf@%s: xdf_ring_drain: timeout", vdp->xdf_addr);
1047
1048out:
1049	if (vdp->xdf_xb_ring != NULL) {
1050		if (xvdi_ring_has_incomp_request(vdp->xdf_xb_ring) ||
1051		    xvdi_ring_has_unconsumed_responses(vdp->xdf_xb_ring))
1052			rv = EIO;
1053	}
1054	if (xdf_debug & SUSRES_DBG)
1055		xen_printf("xdf@%s: xdf_ring_drain: end, err=%d\n",
1056		    vdp->xdf_addr, rv);
1057	return (rv);
1058}
1059
1060static int
1061xdf_ring_drain(xdf_t *vdp)
1062{
1063	int rv;
1064	mutex_enter(&vdp->xdf_dev_lk);
1065	rv = xdf_ring_drain_locked(vdp);
1066	mutex_exit(&vdp->xdf_dev_lk);
1067	return (rv);
1068}
1069
1070/*
1071 * Destroy all v_req_t, grant table entries, and our ring buffer.
1072 */
1073static void
1074xdf_ring_destroy(xdf_t *vdp)
1075{
1076	v_req_t		*vreq;
1077	buf_t		*bp;
1078	ge_slot_t	*gs;
1079
1080	ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
1081	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
1082
1083	if ((vdp->xdf_state != XD_INIT) &&
1084	    (vdp->xdf_state != XD_CONNECTED) &&
1085	    (vdp->xdf_state != XD_READY)) {
1086		ASSERT(vdp->xdf_xb_ring == NULL);
1087		ASSERT(vdp->xdf_xb_ring_hdl == NULL);
1088		ASSERT(vdp->xdf_peer == INVALID_DOMID);
1089		ASSERT(vdp->xdf_evtchn == INVALID_EVTCHN);
1090		ASSERT(list_is_empty(&vdp->xdf_vreq_act));
1091		return;
1092	}
1093
1094	/*
1095	 * We don't want to receive async notifications from the backend
1096	 * when it finishes processing ring entries.
1097	 */
1098#ifdef XPV_HVM_DRIVER
1099	ec_unbind_evtchn(vdp->xdf_evtchn);
1100#else /* !XPV_HVM_DRIVER */
1101	(void) ddi_remove_intr(vdp->xdf_dip, 0, NULL);
1102#endif /* !XPV_HVM_DRIVER */
1103
1104	/*
1105	 * Drain any requests in the ring.  We need to do this before we
1106	 * can free grant table entries, because if active ring entries
1107	 * point to grants, then the backend could be trying to access
1108	 * those grants.
1109	 */
1110	(void) xdf_ring_drain_locked(vdp);
1111
1112	/* We're done talking to the backend so free up our event channel */
1113	xvdi_free_evtchn(vdp->xdf_dip);
1114	vdp->xdf_evtchn = INVALID_EVTCHN;
1115
1116	while ((vreq = list_head(&vdp->xdf_vreq_act)) != NULL) {
1117		bp = vreq->v_buf;
1118		ASSERT(BP_VREQ(bp) == vreq);
1119
1120		/* Free up any grant table entries associated with this IO */
1121		while ((gs = list_head(&vreq->v_gs)) != NULL)
1122			gs_free(gs);
1123
1124		/* If this IO was on the runq, move it back to the waitq. */
1125		if (vreq->v_runq)
1126			xdf_kstat_runq_to_waitq(vdp, bp);
1127
1128		/*
1129		 * Reset any buf IO state since we're going to re-issue the
1130		 * IO when we reconnect.
1131		 */
1132		vreq_free(vdp, vreq);
1133		BP_VREQ_SET(bp, NULL);
1134		bioerror(bp, 0);
1135	}
1136
1137	/* reset the active queue index pointer */
1138	vdp->xdf_i_act = vdp->xdf_f_act;
1139
1140	/* Destroy the ring */
1141	xvdi_free_ring(vdp->xdf_xb_ring);
1142	vdp->xdf_xb_ring = NULL;
1143	vdp->xdf_xb_ring_hdl = NULL;
1144	vdp->xdf_peer = INVALID_DOMID;
1145}
1146
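/* Clamp a transfer to the largest size we can issue in one request. */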
1147void
1148xdfmin(struct buf *bp)
1149{
1150	if (bp->b_bcount > xdf_maxphys)
1151		bp->b_bcount = xdf_maxphys;
1152}
1153
1154/*
1155 * Check if we have a pending "eject" media request.
1156 */
1157static int
1158xdf_eject_pending(xdf_t *vdp)
1159{
1160	dev_info_t	*dip = vdp->xdf_dip;
1161	char		*xsname, *str;
1162
1163	if (!vdp->xdf_media_req_supported)
1164		return (B_FALSE);
1165
1166	if (((xsname = xvdi_get_xsname(dip)) == NULL) ||
1167	    (xenbus_read_str(xsname, XBP_MEDIA_REQ, &str) != 0))
1168		return (B_FALSE);
1169
1170	if (strcmp(str, XBV_MEDIA_REQ_EJECT) != 0) {
1171		strfree(str);
1172		return (B_FALSE);
1173	}
1174	strfree(str);
1175	return (B_TRUE);
1176}
1177
1178/*
1179 * Generate a media request.
1180 */
1181static int
1182xdf_media_req(xdf_t *vdp, char *req, boolean_t media_required)
1183{
1184	dev_info_t	*dip = vdp->xdf_dip;
1185	char		*xsname;
1186
1187	ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
1188
1189	if ((xsname = xvdi_get_xsname(dip)) == NULL)
1190		return (ENXIO);
1191
1192	/* Check if we support media requests */
1193	if (!XD_IS_CD(vdp) || !vdp->xdf_media_req_supported)
1194		return (ENOTTY);
1195
1196	/* If an eject is pending then don't allow any new requests */
1197	if (xdf_eject_pending(vdp))
1198		return (ENXIO);
1199
1200	/* Make sure that there is media present */
1201	if (media_required && (vdp->xdf_xdev_nblocks == 0))
1202		return (ENXIO);
1203
1204	/* We only allow operations when the device is ready and connected */
1205	if (vdp->xdf_state != XD_READY)
1206		return (EIO);
1207
1208	if (xenbus_printf(XBT_NULL, xsname, XBP_MEDIA_REQ, "%s", req) != 0)
1209		return (EIO);
1210
1211	return (0);
1212}
1213
1214/*
1215 * populate a single blkif_request_t w/ a buf
1216 */
1217static void
1218xdf_process_rreq(xdf_t *vdp, struct buf *bp, blkif_request_t *rreq)
1219{
1220	grant_ref_t	gr;
1221	uint8_t		fsect, lsect;
1222	size_t		bcnt;
1223	paddr_t		dma_addr;
1224	off_t		blk_off;
1225	dev_info_t	*dip = vdp->xdf_dip;
1226	blkif_vdev_t	vdev = xvdi_get_vdevnum(dip);
1227	v_req_t		*vreq = BP_VREQ(bp);
1228	uint64_t	blkno = vreq->v_blkno;
1229	uint_t		ndmacs = vreq->v_ndmacs;
1230	ddi_acc_handle_t acchdl = vdp->xdf_xb_ring_hdl;
1231	int		seg = 0;
1232	int		isread = IS_READ(bp);
1233	ge_slot_t	*gs = list_head(&vreq->v_gs);
1234
1235	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
1236	ASSERT(vreq->v_status == VREQ_GS_ALLOCED);
1237
1238	if (isread)
1239		ddi_put8(acchdl, &rreq->operation, BLKIF_OP_READ);
1240	else {
1241		switch (vreq->v_flush_diskcache) {
1242		case FLUSH_DISKCACHE:
1243			ddi_put8(acchdl, &rreq->operation,
1244			    BLKIF_OP_FLUSH_DISKCACHE);
1245			ddi_put16(acchdl, &rreq->handle, vdev);
1246			ddi_put64(acchdl, &rreq->id,
1247			    (uint64_t)(uintptr_t)(gs));
1248			ddi_put8(acchdl, &rreq->nr_segments, 0);
1249			vreq->v_status = VREQ_DMAWIN_DONE;
1250			return;
1251		case WRITE_BARRIER:
1252			ddi_put8(acchdl, &rreq->operation,
1253			    BLKIF_OP_WRITE_BARRIER);
1254			break;
1255		default:
1256			if (!vdp->xdf_wce)
1257				ddi_put8(acchdl, &rreq->operation,
1258				    BLKIF_OP_WRITE_BARRIER);
1259			else
1260				ddi_put8(acchdl, &rreq->operation,
1261				    BLKIF_OP_WRITE);
1262			break;
1263		}
1264	}
1265
1266	ddi_put16(acchdl, &rreq->handle, vdev);
1267	ddi_put64(acchdl, &rreq->sector_number, blkno);
1268	ddi_put64(acchdl, &rreq->id, (uint64_t)(uintptr_t)(gs));
1269
1270	/*
1271	 * loop until all segments are populated or we run out of dma cookies in buf
1272	 */
1273	for (;;) {
1274		/*
1275		 * Each segment of a blkif request can transfer up to
1276		 * one 4K page of data.
1277		 */
1278		bcnt = vreq->v_dmac.dmac_size;
1279		dma_addr = vreq->v_dmac.dmac_laddress;
1280		blk_off = (uint_t)((paddr_t)XB_SEGOFFSET & dma_addr);
1281		fsect = blk_off >> XB_BSHIFT;
1282		lsect = fsect + (bcnt >> XB_BSHIFT) - 1;
1283
1284		ASSERT(bcnt <= PAGESIZE);
1285		ASSERT((bcnt % XB_BSIZE) == 0);
1286		ASSERT((blk_off & XB_BMASK) == 0);
1287		ASSERT(fsect < XB_MAX_SEGLEN / XB_BSIZE &&
1288		    lsect < XB_MAX_SEGLEN / XB_BSIZE);
1289
1290		gr = gs_grant(gs, PATOMA(dma_addr) >> PAGESHIFT);
1291		ddi_put32(acchdl, &rreq->seg[seg].gref, gr);
1292		ddi_put8(acchdl, &rreq->seg[seg].first_sect, fsect);
1293		ddi_put8(acchdl, &rreq->seg[seg].last_sect, lsect);
1294
1295		DPRINTF(IO_DBG, (
1296		    "xdf@%s: seg%d: dmacS %lu blk_off %ld\n",
1297		    vdp->xdf_addr, seg, vreq->v_dmac.dmac_size, blk_off));
1298		DPRINTF(IO_DBG, (
1299		    "xdf@%s: seg%d: fs %d ls %d gr %d dma 0x%"PRIx64"\n",
1300		    vdp->xdf_addr, seg, fsect, lsect, gr, dma_addr));
1301
1302		blkno += (bcnt >> XB_BSHIFT);
1303		seg++;
1304		ASSERT(seg <= BLKIF_MAX_SEGMENTS_PER_REQUEST);
1305		if (--ndmacs) {
1306			ddi_dma_nextcookie(vreq->v_dmahdl, &vreq->v_dmac);
1307			continue;
1308		}
1309
1310		vreq->v_status = VREQ_DMAWIN_DONE;
1311		vreq->v_blkno = blkno;
1312		break;
1313	}
1314	ddi_put8(acchdl,  &rreq->nr_segments, seg);
1315	DPRINTF(IO_DBG, (
1316	    "xdf@%s: xdf_process_rreq: request id=%"PRIx64" ready\n",
1317	    vdp->xdf_addr, rreq->id));
1318}
1319
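/*
 * Issue as many queued bufs as possible: set up DMA and grant table
 * resources for each, populate ring requests, and push them to the
 * backend.
 */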
1320static void
1321xdf_io_start(xdf_t *vdp)
1322{
1323	struct buf	*bp;
1324	v_req_t		*vreq;
1325	blkif_request_t	*rreq;
1326	boolean_t	rreqready = B_FALSE;
1327
1328	mutex_enter(&vdp->xdf_dev_lk);
1329
1330	/*
1331	 * Populate the ring request(s).  Loop until there is no buf to
1332	 * transfer or no free slot available in I/O ring.
1333	 */
1334	for (;;) {
1335		/* don't start any new IO if we're suspending */
1336		if (vdp->xdf_suspending)
1337			break;
1338		if ((bp = xdf_bp_next(vdp)) == NULL)
1339			break;
1340
1341		/* if the buf doesn't already have a vreq, allocate one */
1342		if (((vreq = BP_VREQ(bp)) == NULL) &&
1343		    ((vreq = vreq_get(vdp, bp)) == NULL))
1344			break;
1345
1346		/* alloc DMA/GTE resources */
1347		if (vreq_setup(vdp, vreq) != DDI_SUCCESS)
1348			break;
1349
1350		/* get next blkif_request in the ring */
1351		if ((rreq = xvdi_ring_get_request(vdp->xdf_xb_ring)) == NULL)
1352			break;
1353		bzero(rreq, sizeof (blkif_request_t));
1354		rreqready = B_TRUE;
1355
1356		/* populate blkif_request with this buf */
1357		xdf_process_rreq(vdp, bp, rreq);
1358
1359		/*
1360		 * This buffer/vreq pair has been allocated ring buffer
1361		 * resources, so if it isn't already in our runq, add it.
1362		 */
1363		if (!vreq->v_runq)
1364			xdf_kstat_waitq_to_runq(vdp, bp);
1365	}
1366
1367	/* Send the request(s) to the backend */
1368	if (rreqready)
1369		xdf_ring_push(vdp);
1370
1371	mutex_exit(&vdp->xdf_dev_lk);
1372}
1373
1374
1375/* check if partition is open, -1 - check all partitions on the disk */
1376static boolean_t
1377xdf_isopen(xdf_t *vdp, int partition)
1378{
1379	int i;
1380	ulong_t parbit;
1381	boolean_t rval = B_FALSE;
1382
1383	ASSERT((partition == -1) ||
1384	    ((partition >= 0) && (partition < XDF_PEXT)));
1385
1386	if (partition == -1)
1387		parbit = (ulong_t)-1;
1388	else
1389		parbit = 1 << partition;
1390
1391	for (i = 0; i < OTYPCNT; i++) {
1392		if (vdp->xdf_vd_open[i] & parbit)
1393			rval = B_TRUE;
1394	}
1395
1396	return (rval);
1397}
1398
1399/*
1400 * The connection should never be closed as long as someone is holding
1401 * us open, there is pending IO, or someone is waiting for a
1402 * connection.
1403 */
1404static boolean_t
1405xdf_busy(xdf_t *vdp)
1406{
1407	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
1408
1409	if ((vdp->xdf_xb_ring != NULL) &&
1410	    xvdi_ring_has_unconsumed_responses(vdp->xdf_xb_ring)) {
1411		ASSERT(vdp->xdf_state != XD_CLOSED);
1412		return (B_TRUE);
1413	}
1414
1415	if (!list_is_empty(&vdp->xdf_vreq_act) || (vdp->xdf_f_act != NULL)) {
1416		ASSERT(vdp->xdf_state != XD_CLOSED);
1417		return (B_TRUE);
1418	}
1419
1420	if (xdf_isopen(vdp, -1)) {
1421		ASSERT(vdp->xdf_state != XD_CLOSED);
1422		return (B_TRUE);
1423	}
1424
1425	if (vdp->xdf_connect_req > 0) {
1426		ASSERT(vdp->xdf_state != XD_CLOSED);
1427		return (B_TRUE);
1428	}
1429
1430	return (B_FALSE);
1431}
1432
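/* Update the frontend state and wake anyone waiting for a state change. */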
1433static void
1434xdf_set_state(xdf_t *vdp, xdf_state_t new_state)
1435{
1436	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
1437	DPRINTF(DDI_DBG, ("xdf@%s: state change %d -> %d\n",
1438	    vdp->xdf_addr, vdp->xdf_state, new_state));
1439	vdp->xdf_state = new_state;
1440	cv_broadcast(&vdp->xdf_dev_cv);
1441}
1442
1443static void
1444xdf_disconnect(xdf_t *vdp, xdf_state_t new_state, boolean_t quiet)
1445{
1446	dev_info_t	*dip = vdp->xdf_dip;
1447	boolean_t	busy;
1448
1449	ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
1450	ASSERT(MUTEX_NOT_HELD(&vdp->xdf_dev_lk));
1451	ASSERT((new_state == XD_UNKNOWN) || (new_state == XD_CLOSED));
1452
1453	/* Check if we're already there. */
1454	if (vdp->xdf_state == new_state)
1455		return;
1456
1457	mutex_enter(&vdp->xdf_dev_lk);
1458	busy = xdf_busy(vdp);
1459
1460	/* If we're already closed then there's nothing to do. */
1461	if (vdp->xdf_state == XD_CLOSED) {
1462		ASSERT(!busy);
1463		xdf_set_state(vdp, new_state);
1464		mutex_exit(&vdp->xdf_dev_lk);
1465		return;
1466	}
1467
1468#ifdef DEBUG
1469	/* UhOh.  Warn the user that something bad has happened. */
1470	if (!quiet && busy && (vdp->xdf_state == XD_READY) &&
1471	    (vdp->xdf_xdev_nblocks != 0)) {
1472		cmn_err(CE_WARN, "xdf@%s: disconnected while in use",
1473		    vdp->xdf_addr);
1474	}
1475#endif /* DEBUG */
1476
1477	xdf_ring_destroy(vdp);
1478
1479	/* If we're busy then we can only go into the unknown state */
1480	xdf_set_state(vdp, (busy) ? XD_UNKNOWN : new_state);
1481	mutex_exit(&vdp->xdf_dev_lk);
1482
1483	/* if we're closed now, let the other end know */
1484	if (vdp->xdf_state == XD_CLOSED)
1485		(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateClosed);
1486}
1487
1488
1489/*
1490 * Kick-off connect process
1491 * Status should be XD_UNKNOWN or XD_CLOSED
1492 * On success, status will be changed to XD_INIT
1493 * On error, it will be changed to XD_UNKNOWN
1494 */
1495static int
1496xdf_setstate_init(xdf_t *vdp)
1497{
1498	dev_info_t		*dip = vdp->xdf_dip;
1499	xenbus_transaction_t	xbt;
1500	grant_ref_t		gref;
1501	char			*xsname, *str;
1502	int 			rv;
1503
1504	ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
1505	ASSERT(MUTEX_NOT_HELD(&vdp->xdf_dev_lk));
1506	ASSERT((vdp->xdf_state == XD_UNKNOWN) ||
1507	    (vdp->xdf_state == XD_CLOSED));
1508
1509	DPRINTF(DDI_DBG,
1510	    ("xdf@%s: starting connection process\n", vdp->xdf_addr));
1511
1512	/*
1513	 * If an eject is pending then don't allow a new connection, but
1514	 * we want to return without displaying an error message.
1515	 */
1516	if (xdf_eject_pending(vdp)) {
1517		xdf_disconnect(vdp, XD_UNKNOWN, B_FALSE);
1518		return (DDI_FAILURE);
1519	}
1520
1521	if ((xsname = xvdi_get_xsname(dip)) == NULL)
1522		goto errout;
1523
1524	if ((vdp->xdf_peer = xvdi_get_oeid(dip)) == INVALID_DOMID)
1525		goto errout;
1526
1527	(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateInitialising);
1528
1529	/*
1530	 * Sanity check for the existence of the xenbus device-type property.
1531	 * This property might not exist if our xenbus device nodes were
1532	 * force destroyed while we were still connected to the backend.
1533	 */
1534	if (xenbus_read_str(xsname, XBP_DEV_TYPE, &str) != 0)
1535		goto errout;
1536	strfree(str);
1537
1538	if (xvdi_alloc_evtchn(dip) != DDI_SUCCESS)
1539		goto errout;
1540
1541	vdp->xdf_evtchn = xvdi_get_evtchn(dip);
1542#ifdef XPV_HVM_DRIVER
1543	ec_bind_evtchn_to_handler(vdp->xdf_evtchn, IPL_VBD, xdf_intr, vdp);
1544#else /* !XPV_HVM_DRIVER */
1545	if (ddi_add_intr(dip, 0, NULL, NULL, xdf_intr, (caddr_t)vdp) !=
1546	    DDI_SUCCESS) {
1547		cmn_err(CE_WARN, "xdf@%s: xdf_setstate_init: "
1548		    "failed to add intr handler", vdp->xdf_addr);
1549		goto errout1;
1550	}
1551#endif /* !XPV_HVM_DRIVER */
1552
1553	if (xvdi_alloc_ring(dip, BLKIF_RING_SIZE,
1554	    sizeof (union blkif_sring_entry), &gref, &vdp->xdf_xb_ring) !=
1555	    DDI_SUCCESS) {
1556		cmn_err(CE_WARN, "xdf@%s: failed to alloc comm ring",
1557		    vdp->xdf_addr);
1558		goto errout2;
1559	}
1560	vdp->xdf_xb_ring_hdl = vdp->xdf_xb_ring->xr_acc_hdl; /* ugly!! */
1561
1562	/*
1563	 * Write into xenstore the info needed by backend
1564	 */
1565trans_retry:
1566	if (xenbus_transaction_start(&xbt)) {
1567		cmn_err(CE_WARN, "xdf@%s: failed to start transaction",
1568		    vdp->xdf_addr);
1569		xvdi_fatal_error(dip, EIO, "connect transaction init");
1570		goto fail_trans;
1571	}
1572
1573	/*
1574	 * XBP_PROTOCOL is written by the domain builder in the case of PV
1575	 * domains. However, it is not written for HVM domains, so let's
1576	 * write it here.
1577	 */
1578	if (((rv = xenbus_printf(xbt, xsname,
1579	    XBP_MEDIA_REQ, "%s", XBV_MEDIA_REQ_NONE)) != 0) ||
1580	    ((rv = xenbus_printf(xbt, xsname,
1581	    XBP_RING_REF, "%u", gref)) != 0) ||
1582	    ((rv = xenbus_printf(xbt, xsname,
1583	    XBP_EVENT_CHAN, "%u", vdp->xdf_evtchn)) != 0) ||
1584	    ((rv = xenbus_printf(xbt, xsname,
1585	    XBP_PROTOCOL, "%s", XEN_IO_PROTO_ABI_NATIVE)) != 0) ||
1586	    ((rv = xvdi_switch_state(dip, xbt, XenbusStateInitialised)) > 0)) {
1587		(void) xenbus_transaction_end(xbt, 1);
1588		xvdi_fatal_error(dip, rv, "connect transaction setup");
1589		goto fail_trans;
1590	}
1591
1592	/* kick-off connect process */
1593	if (rv = xenbus_transaction_end(xbt, 0)) {
1594		if (rv == EAGAIN)
1595			goto trans_retry;
1596		xvdi_fatal_error(dip, rv, "connect transaction commit");
1597		goto fail_trans;
1598	}
1599
1600	ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
1601	mutex_enter(&vdp->xdf_dev_lk);
1602	xdf_set_state(vdp, XD_INIT);
1603	mutex_exit(&vdp->xdf_dev_lk);
1604
1605	return (DDI_SUCCESS);
1606
1607fail_trans:
1608	xvdi_free_ring(vdp->xdf_xb_ring);
1609errout2:
1610#ifdef XPV_HVM_DRIVER
1611	ec_unbind_evtchn(vdp->xdf_evtchn);
1612#else /* !XPV_HVM_DRIVER */
1613	(void) ddi_remove_intr(vdp->xdf_dip, 0, NULL);
1614#endif /* !XPV_HVM_DRIVER */
1615errout1:
1616	xvdi_free_evtchn(dip);
1617	vdp->xdf_evtchn = INVALID_EVTCHN;
1618errout:
1619	xdf_disconnect(vdp, XD_UNKNOWN, B_FALSE);
1620	cmn_err(CE_WARN, "xdf@%s: failed to start connection to backend",
1621	    vdp->xdf_addr);
1622	return (DDI_FAILURE);
1623}
1624
1625int
1626xdf_get_flush_block(xdf_t *vdp)
1627{
1628	/*
1629	 * Get a DEV_BSIZE aligned buffer
1630	 */
1631	vdp->xdf_flush_mem = kmem_alloc(DEV_BSIZE * 2, KM_SLEEP);
1632	vdp->xdf_cache_flush_block =
1633	    (char *)P2ROUNDUP((uintptr_t)(vdp->xdf_flush_mem), DEV_BSIZE);
1634	if (xdf_lb_rdwr(vdp->xdf_dip, TG_READ, vdp->xdf_cache_flush_block,
1635	    xdf_flush_block, DEV_BSIZE, NULL) != 0)
1636		return (DDI_FAILURE);
1637	return (DDI_SUCCESS);
1638}
1639
1640static void
1641xdf_setstate_ready(void *arg)
1642{
1643	xdf_t	*vdp = (xdf_t *)arg;
1644
1645	vdp->xdf_ready_tq_thread = curthread;
1646
1647	/*
1648	 * We've created all the minor nodes via cmlb_attach() using default
1649	 * values in xdf_attach() to make it possible to block in xdf_open(),
1650	 * in case anyone (say, the booting thread) tries to open the device
1651	 * before we're connected to the backend.  Now that we're almost
1652	 * connected, refresh those minor nodes with the latest info we have.
1653	 */
1654	mutex_enter(&vdp->xdf_dev_lk);
1655	if (vdp->xdf_cmbl_reattach) {
1656		vdp->xdf_cmbl_reattach = B_FALSE;
1657
1658		mutex_exit(&vdp->xdf_dev_lk);
1659		if (xdf_cmlb_attach(vdp) != 0) {
1660			xdf_disconnect(vdp, XD_UNKNOWN, B_FALSE);
1661			return;
1662		}
1663		mutex_enter(&vdp->xdf_dev_lk);
1664	}
1665
1666	/* If we're not still trying to get to the ready state, then bail. */
1667	if (vdp->xdf_state != XD_CONNECTED) {
1668		mutex_exit(&vdp->xdf_dev_lk);
1669		return;
1670	}
1671	mutex_exit(&vdp->xdf_dev_lk);
1672
1673	/*
1674	 * If backend has feature-barrier, see if it supports disk
1675	 * cache flush op.
1676	 */
1677	vdp->xdf_flush_supported = B_FALSE;
1678	if (vdp->xdf_feature_barrier) {
1679		/*
1680		 * Pretend we already know flush is supported so probe
1681		 * will attempt the correct op.
1682		 */
1683		vdp->xdf_flush_supported = B_TRUE;
1684		if (xdf_lb_rdwr(vdp->xdf_dip, TG_WRITE, NULL, 0, 0, 0) == 0) {
1685			vdp->xdf_flush_supported = B_TRUE;
1686		} else {
1687			vdp->xdf_flush_supported = B_FALSE;
1688			/*
1689			 * If the other end does not support the cache flush op
1690			 * then we must use a barrier-write to force disk
1691			 * cache flushing.  Barrier writes require that a data
1692			 * block actually be written.
1693			 * Cache a block to barrier-write when we are
1694			 * asked to perform a flush.
1695			 * XXX - would it be better to just copy 1 block
1696			 * (512 bytes) from whatever write we did last
1697			 * and rewrite that block?
1698			 */
1699			if (xdf_get_flush_block(vdp) != DDI_SUCCESS) {
1700				xdf_disconnect(vdp, XD_UNKNOWN, B_FALSE);
1701				return;
1702			}
1703		}
1704	}
1705
1706	mutex_enter(&vdp->xdf_cb_lk);
1707	mutex_enter(&vdp->xdf_dev_lk);
1708	if (vdp->xdf_state == XD_CONNECTED)
1709		xdf_set_state(vdp, XD_READY);
1710	mutex_exit(&vdp->xdf_dev_lk);
1711
1712	/* Restart any currently queued up io */
1713	xdf_io_start(vdp);
1714
1715	mutex_exit(&vdp->xdf_cb_lk);
1716}
1717
1718/*
1719 * synthetic geometry
1720 */
1721#define	XDF_NSECTS	256
1722#define	XDF_NHEADS	16
1723
1724static void
1725xdf_synthetic_pgeom(dev_info_t *dip, cmlb_geom_t *geomp)
1726{
1727	xdf_t *vdp;
1728	uint_t ncyl;
1729
1730	vdp = ddi_get_soft_state(xdf_ssp, ddi_get_instance(dip));
1731
1732	ncyl = vdp->xdf_xdev_nblocks / (XDF_NHEADS * XDF_NSECTS);
1733
1734	bzero(geomp, sizeof (*geomp));
1735	geomp->g_ncyl = ncyl == 0 ? 1 : ncyl;
1736	geomp->g_acyl = 0;
1737	geomp->g_nhead = XDF_NHEADS;
1738	geomp->g_nsect = XDF_NSECTS;
1739	geomp->g_secsize = XB_BSIZE;
1740	geomp->g_capacity = vdp->xdf_xdev_nblocks;
1741	geomp->g_intrlv = 0;
1742	geomp->g_rpm = 7200;
1743}
1744
1745/*
1746 * Finish other initialization after we've connected to backend
1747 * Status should be XD_INIT before calling this routine
1748 * On success, status should be changed to XD_CONNECTED.
1749 * On error, status should stay XD_INIT
1750 */
1751static int
1752xdf_setstate_connected(xdf_t *vdp)
1753{
1754	dev_info_t	*dip = vdp->xdf_dip;
1755	cmlb_geom_t	pgeom;
1756	diskaddr_t	nblocks = 0;
1757	char		*oename, *xsname, *str;
1758	uint_t		dinfo;
1759
1760	ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
1761	ASSERT(MUTEX_NOT_HELD(&vdp->xdf_dev_lk));
1762	ASSERT(vdp->xdf_state == XD_INIT);
1763
1764	if (((xsname = xvdi_get_xsname(dip)) == NULL) ||
1765	    ((oename = xvdi_get_oename(dip)) == NULL))
1766		return (DDI_FAILURE);
1767
1768	/* Determine if feature barrier is supported by backend */
1769	if (!(vdp->xdf_feature_barrier = xenbus_exists(oename, XBP_FB)))
1770		cmn_err(CE_NOTE, "xdf@%s: failed to read feature-barrier",
1771		    vdp->xdf_addr);
1772
1773	/*
1774	 * Probe backend.  Read the device size into xdf_xdev_nblocks
1775	 * and set the VDISK_READONLY, VDISK_CDROM, and VDISK_REMOVABLE
1776	 * flags in xdf_dinfo.  If the emulated device type is "cdrom",
1777	 * we always set VDISK_CDROM, regardless of if it's present in
1778	 * the xenbus info parameter.
1779	 */
1780	if (xenbus_gather(XBT_NULL, oename,
1781	    XBP_SECTORS, "%"SCNu64, &nblocks,
1782	    XBP_INFO, "%u", &dinfo,
1783	    NULL) != 0) {
1784		cmn_err(CE_WARN, "xdf@%s: xdf_setstate_connected: "
1785		    "cannot read backend info", vdp->xdf_addr);
1786		return (DDI_FAILURE);
1787	}
1788	if (xenbus_read_str(xsname, XBP_DEV_TYPE, &str) != 0) {
1789		cmn_err(CE_WARN, "xdf@%s: cannot read device-type",
1790		    vdp->xdf_addr);
1791		return (DDI_FAILURE);
1792	}
1793	if (strcmp(str, XBV_DEV_TYPE_CD) == 0)
1794		dinfo |= VDISK_CDROM;
1795	strfree(str);
1796
1797	vdp->xdf_xdev_nblocks = nblocks;
1798#ifdef _ILP32
1799	if (vdp->xdf_xdev_nblocks > DK_MAX_BLOCKS) {
1800		cmn_err(CE_WARN, "xdf@%s: xdf_setstate_connected: "
1801		    "backend disk device too large with %llu blocks for"
1802		    " 32-bit kernel", vdp->xdf_addr, vdp->xdf_xdev_nblocks);
1803		xvdi_fatal_error(dip, EFBIG, "reading backend info");
1804		return (DDI_FAILURE);
1805	}
1806#endif
1807
1808	/*
1809	 * If the physical geometry for a fixed disk has been explicitly
1810	 * set then make sure that the specified physical geometry isn't
1811	 * larger than the device we connected to.
1812	 */
1813	if (vdp->xdf_pgeom_fixed &&
1814	    (vdp->xdf_pgeom.g_capacity > vdp->xdf_xdev_nblocks)) {
1815		cmn_err(CE_WARN,
1816		    "xdf@%s: connect failed, fixed geometry too large",
1817		    vdp->xdf_addr);
1818		return (DDI_FAILURE);
1819	}
1820
1821	vdp->xdf_media_req_supported = xenbus_exists(oename, XBP_MEDIA_REQ_SUP);
1822
1823	/* mark the vbd as ready for I/O */
1824	mutex_enter(&vdp->xdf_dev_lk);
1825	xdf_set_state(vdp, XD_CONNECTED);
1826
1827	/* check if the cmlb label should be updated */
1828	xdf_synthetic_pgeom(dip, &pgeom);
1829	if ((vdp->xdf_dinfo != dinfo) ||
1830	    (!vdp->xdf_pgeom_fixed &&
1831	    (memcmp(&vdp->xdf_pgeom, &pgeom, sizeof (pgeom)) != 0))) {
1832		vdp->xdf_cmbl_reattach = B_TRUE;
1833
1834		vdp->xdf_dinfo = dinfo;
1835		if (!vdp->xdf_pgeom_fixed)
1836			vdp->xdf_pgeom = pgeom;
1837	}
1838
1839	if (XD_IS_CD(vdp) || XD_IS_RM(vdp)) {
1840		if (vdp->xdf_xdev_nblocks == 0) {
1841			vdp->xdf_mstate = DKIO_EJECTED;
1842			cv_broadcast(&vdp->xdf_mstate_cv);
1843		} else {
1844			vdp->xdf_mstate = DKIO_INSERTED;
1845			cv_broadcast(&vdp->xdf_mstate_cv);
1846		}
1847	} else {
1848		if (vdp->xdf_mstate != DKIO_NONE) {
1849			vdp->xdf_mstate = DKIO_NONE;
1850			cv_broadcast(&vdp->xdf_mstate_cv);
1851		}
1852	}
1853
1854	mutex_exit(&vdp->xdf_dev_lk);
1855
1856	cmn_err(CE_CONT, "?xdf@%s: %"PRIu64" blocks", vdp->xdf_addr,
1857	    (uint64_t)vdp->xdf_xdev_nblocks);
1858
1859	/* Restart any currently queued up io */
1860	xdf_io_start(vdp);
1861
1862	/*
1863	 * To get to the ready state we have to do IO to the backend device,
1864	 * but we can't initiate IO from the other end change callback thread
1865	 * (which is the current context we're executing in.)  This is because
1866	 * if the other end disconnects while we're doing IO from the callback
1867	 * thread, then we can't receive that disconnect event and we hang
1868	 * waiting for an IO that can never complete.
1869	 */
1870	(void) ddi_taskq_dispatch(vdp->xdf_ready_tq, xdf_setstate_ready, vdp,
1871	    DDI_SLEEP);
1872
1873	(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateConnected);
1874	return (DDI_SUCCESS);
1875}
1876
1877/*ARGSUSED*/
1878static void
1879xdf_oe_change(dev_info_t *dip, ddi_eventcookie_t id, void *arg, void *impl_data)
1880{
1881	XenbusState new_state = *(XenbusState *)impl_data;
1882	xdf_t *vdp = (xdf_t *)ddi_get_driver_private(dip);
1883
1884	DPRINTF(DDI_DBG, ("xdf@%s: otherend state change to %d!\n",
1885	    vdp->xdf_addr, new_state));
1886
1887	mutex_enter(&vdp->xdf_cb_lk);
1888
1889	/* We assume that this callback is single threaded */
1890	ASSERT(vdp->xdf_oe_change_thread == NULL);
1891	DEBUG_EVAL(vdp->xdf_oe_change_thread = curthread);
1892
1893	/* ignore any backend state changes if we're suspending/suspended */
1894	if (vdp->xdf_suspending || (vdp->xdf_state == XD_SUSPEND)) {
1895		DEBUG_EVAL(vdp->xdf_oe_change_thread = NULL);
1896		mutex_exit(&vdp->xdf_cb_lk);
1897		return;
1898	}
1899
1900	switch (new_state) {
1901	case XenbusStateUnknown:
1902	case XenbusStateInitialising:
1903	case XenbusStateInitWait:
1904	case XenbusStateInitialised:
1905		if (vdp->xdf_state == XD_INIT)
1906			break;
1907
1908		xdf_disconnect(vdp, XD_UNKNOWN, B_FALSE);
1909		if (xdf_setstate_init(vdp) != DDI_SUCCESS)
1910			break;
1911		ASSERT(vdp->xdf_state == XD_INIT);
1912		break;
1913
1914	case XenbusStateConnected:
1915		if ((vdp->xdf_state == XD_CONNECTED) ||
1916		    (vdp->xdf_state == XD_READY))
1917			break;
1918
1919		if (vdp->xdf_state != XD_INIT) {
1920			xdf_disconnect(vdp, XD_UNKNOWN, B_FALSE);
1921			if (xdf_setstate_init(vdp) != DDI_SUCCESS)
1922				break;
1923			ASSERT(vdp->xdf_state == XD_INIT);
1924		}
1925
1926		if (xdf_setstate_connected(vdp) != DDI_SUCCESS) {
1927			xdf_disconnect(vdp, XD_UNKNOWN, B_FALSE);
1928			break;
1929		}
1930		ASSERT(vdp->xdf_state == XD_CONNECTED);
1931		break;
1932
1933	case XenbusStateClosing:
1934		if (xdf_isopen(vdp, -1)) {
1935			cmn_err(CE_NOTE,
1936			    "xdf@%s: hot-unplug failed, still in use",
1937			    vdp->xdf_addr);
1938			break;
1939		}
1940		/*FALLTHROUGH*/
1941	case XenbusStateClosed:
1942		xdf_disconnect(vdp, XD_CLOSED, B_FALSE);
1943		break;
1944	}
1945
1946	/* notify anybody waiting for oe state change */
1947	cv_broadcast(&vdp->xdf_dev_cv);
1948	DEBUG_EVAL(vdp->xdf_oe_change_thread = NULL);
1949	mutex_exit(&vdp->xdf_cb_lk);
1950}
1951
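/*
 * Wait (optionally) for the vbd to reach the XD_READY state.  Both
 * xdf_cb_lk and xdf_dev_lk must be held on entry and are held again on
 * return, although xdf_dev_lk (and briefly xdf_cb_lk) is dropped while
 * we wait.  The xdf_connect_req count is bumped for the duration of the
 * call, and the resulting xdf_state is returned to the caller.
 */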
1952static int
1953xdf_connect_locked(xdf_t *vdp, boolean_t wait)
1954{
1955	int	rv;
1956
1957	ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
1958	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
1959
1960	/* we can't connect once we're in the closed state */
1961	if (vdp->xdf_state == XD_CLOSED)
1962		return (XD_CLOSED);
1963
1964	vdp->xdf_connect_req++;
1965	while (vdp->xdf_state != XD_READY) {
1966		mutex_exit(&vdp->xdf_dev_lk);
1967		if (vdp->xdf_state == XD_UNKNOWN)
1968			(void) xdf_setstate_init(vdp);
1969		mutex_enter(&vdp->xdf_dev_lk);
1970
1971		if (!wait || (vdp->xdf_state == XD_READY))
1972			goto out;
1973
1974		mutex_exit((&vdp->xdf_cb_lk));
1975		rv = cv_wait_sig(&vdp->xdf_dev_cv, &vdp->xdf_dev_lk);
1976		mutex_exit((&vdp->xdf_dev_lk));
1977		mutex_enter((&vdp->xdf_cb_lk));
1978		mutex_enter((&vdp->xdf_dev_lk));
1979		if (rv == 0)
1980			goto out;
1981	}
1982
1983out:
1984	ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
1985	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
1986
1987	/* Try to lock the media */
1988	(void) xdf_media_req(vdp, XBV_MEDIA_REQ_LOCK, B_TRUE);
1989
1990	vdp->xdf_connect_req--;
1991	return (vdp->xdf_state);
1992}
1993
1994static uint_t
1995xdf_iorestart(caddr_t arg)
1996{
1997	xdf_t *vdp = (xdf_t *)arg;
1998
1999	ASSERT(vdp != NULL);
2000
2001	mutex_enter(&vdp->xdf_dev_lk);
2002	ASSERT(ISDMACBON(vdp));
2003	SETDMACBOFF(vdp);
2004	mutex_exit(&vdp->xdf_dev_lk);
2005
2006	xdf_io_start(vdp);
2007
2008	return (DDI_INTR_CLAIMED);
2009}
2010
2011#if defined(XPV_HVM_DRIVER)
2012
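/*
 * In HVM mode we maintain a global list that maps device tree pathnames
 * to xdf dips.  Entries are added by xdf_hvm_add() at attach time and
 * removed by xdf_hvm_rm() at detach time; xdf_hvm_hold() uses the list
 * to look up (and hold) a dip by its pathname.  The list is protected
 * by xdf_hvm_list_lock.
 */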
2013typedef struct xdf_hvm_entry {
2014	list_node_t	xdf_he_list;
2015	char		*xdf_he_path;
2016	dev_info_t	*xdf_he_dip;
2017} xdf_hvm_entry_t;
2018
2019static list_t xdf_hvm_list;
2020static kmutex_t xdf_hvm_list_lock;
2021
2022static xdf_hvm_entry_t *
2023i_xdf_hvm_find(const char *path, dev_info_t *dip)
2024{
2025	xdf_hvm_entry_t	*i;
2026
2027	ASSERT((path != NULL) || (dip != NULL));
2028	ASSERT(MUTEX_HELD(&xdf_hvm_list_lock));
2029
2030	i = list_head(&xdf_hvm_list);
2031	while (i != NULL) {
2032		if ((path != NULL) && strcmp(i->xdf_he_path, path) != 0) {
2033			i = list_next(&xdf_hvm_list, i);
2034			continue;
2035		}
2036		if ((dip != NULL) && (i->xdf_he_dip != dip)) {
2037			i = list_next(&xdf_hvm_list, i);
2038			continue;
2039		}
2040		break;
2041	}
2042	return (i);
2043}
2044
2045dev_info_t *
2046xdf_hvm_hold(const char *path)
2047{
2048	xdf_hvm_entry_t	*i;
2049	dev_info_t	*dip;
2050
2051	mutex_enter(&xdf_hvm_list_lock);
2052	i = i_xdf_hvm_find(path, NULL);
2053	if (i == NULL) {
2054		mutex_exit(&xdf_hvm_list_lock);
2055		return (NULL);
2056	}
2057	ndi_hold_devi(dip = i->xdf_he_dip);
2058	mutex_exit(&xdf_hvm_list_lock);
2059	return (dip);
2060}
2061
2062static void
2063xdf_hvm_add(dev_info_t *dip)
2064{
2065	xdf_hvm_entry_t	*i;
2066	char		*path;
2067
2068	/* figure out the path for the dip */
2069	path = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
2070	(void) ddi_pathname(dip, path);
2071
2072	i = kmem_alloc(sizeof (*i), KM_SLEEP);
2073	i->xdf_he_dip = dip;
2074	i->xdf_he_path = i_ddi_strdup(path, KM_SLEEP);
2075
2076	mutex_enter(&xdf_hvm_list_lock);
2077	ASSERT(i_xdf_hvm_find(path, NULL) == NULL);
2078	ASSERT(i_xdf_hvm_find(NULL, dip) == NULL);
2079	list_insert_head(&xdf_hvm_list, i);
2080	mutex_exit(&xdf_hvm_list_lock);
2081
2082	kmem_free(path, MAXPATHLEN);
2083}
2084
2085static void
2086xdf_hvm_rm(dev_info_t *dip)
2087{
2088	xdf_hvm_entry_t	*i;
2089
2090	mutex_enter(&xdf_hvm_list_lock);
2091	VERIFY((i = i_xdf_hvm_find(NULL, dip)) != NULL);
2092	list_remove(&xdf_hvm_list, i);
2093	mutex_exit(&xdf_hvm_list_lock);
2094
2095	kmem_free(i->xdf_he_path, strlen(i->xdf_he_path) + 1);
2096	kmem_free(i, sizeof (*i));
2097}
2098
2099static void
2100xdf_hvm_init(void)
2101{
2102	list_create(&xdf_hvm_list, sizeof (xdf_hvm_entry_t),
2103	    offsetof(xdf_hvm_entry_t, xdf_he_list));
2104	mutex_init(&xdf_hvm_list_lock, NULL, MUTEX_DEFAULT, NULL);
2105}
2106
2107static void
2108xdf_hvm_fini(void)
2109{
2110	ASSERT(list_head(&xdf_hvm_list) == NULL);
2111	list_destroy(&xdf_hvm_list);
2112	mutex_destroy(&xdf_hvm_list_lock);
2113}
2114
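/*
 * Wait for the backend hotplug scripts to finish and then try to bring
 * the device all the way up.  Returns B_TRUE only if the device reaches
 * the XD_READY state.
 */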
2115boolean_t
2116xdf_hvm_connect(dev_info_t *dip)
2117{
2118	xdf_t	*vdp = (xdf_t *)ddi_get_driver_private(dip);
2119	char	*oename, *str;
2120	int	rv;
2121
2122	mutex_enter(&vdp->xdf_cb_lk);
2123	mutex_enter(&vdp->xdf_dev_lk);
2124
2125	/*
2126	 * Before trying to establish a connection we need to wait for the
2127	 * backend hotplug scripts to have run.  Once they are run the
2128	 * "<oename>/hotplug-status" property will be set to "connected".
2129	 */
2130	for (;;) {
2131		ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
2132		ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
2133
2134		/*
2135		 * Get the xenbus path to the backend device.  Note that
2136		 * we can't cache this path (and we look it up on each pass
2137		 * through this loop) because it could change during
2138		 * suspend, resume, and migration operations.
2139		 */
2140		if ((oename = xvdi_get_oename(dip)) == NULL) {
2141			mutex_exit(&vdp->xdf_dev_lk);
2142			mutex_exit(&vdp->xdf_cb_lk);
2143			return (B_FALSE);
2144		}
2145
2146		str = NULL;
2147		if ((xenbus_read_str(oename, XBP_HP_STATUS, &str) == 0) &&
2148		    (strcmp(str, XBV_HP_STATUS_CONN) == 0))
2149			break;
2150
2151		if (str != NULL)
2152			strfree(str);
2153
2154		/* wait for an update to "<oename>/hotplug-status" */
2155		mutex_exit(&vdp->xdf_dev_lk);
2156		if (cv_wait_sig(&vdp->xdf_hp_status_cv, &vdp->xdf_cb_lk) == 0) {
2157			/* we got interrupted by a signal */
2158			mutex_exit(&vdp->xdf_cb_lk);
2159			return (B_FALSE);
2160		}
2161		mutex_enter(&vdp->xdf_dev_lk);
2162	}
2163
2164	/* Good news.  The backend hotplug scripts have been run. */
2165	ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
2166	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
2167	ASSERT(strcmp(str, XBV_HP_STATUS_CONN) == 0);
2168	strfree(str);
2169
2170	/*
2171	 * If we're emulating a cd device and if the backend doesn't support
2172	 * media request operations, then we're not going to bother trying
2173	 * to establish a connection, for a couple of reasons.  First, media
2174	 * request support is required for operations like eject and
2175	 * media locking.  Second, other backend platforms like Linux don't
2176	 * support hvm pv cdrom access.  They don't even have a backend pv
2177	 * driver for cdrom device nodes, so we don't want to block forever
2178	 * waiting for a connection to a backend driver that doesn't exist.
2179	 */
2180	if (XD_IS_CD(vdp) && !xenbus_exists(oename, XBP_MEDIA_REQ_SUP)) {
2181		mutex_exit(&vdp->xdf_dev_lk);
2182		mutex_exit(&vdp->xdf_cb_lk);
2183		return (B_FALSE);
2184	}
2185
2186	rv = xdf_connect_locked(vdp, B_TRUE);
2187	mutex_exit(&vdp->xdf_dev_lk);
2188	mutex_exit(&vdp->xdf_cb_lk);
2189
2190	return ((rv == XD_READY) ? B_TRUE : B_FALSE);
2191}
2192
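/*
 * Install a caller-supplied fixed physical geometry.  The geometry is
 * sanity checked against the backend device size (if it is known yet),
 * copied into xdf_pgeom, and pinned by setting xdf_pgeom_fixed so that
 * it isn't overwritten when we (re)connect to the backend.  The cmlb
 * label is then invalidated to force a re-validation.
 */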
2193int
2194xdf_hvm_setpgeom(dev_info_t *dip, cmlb_geom_t *geomp)
2195{
2196	xdf_t	*vdp = (xdf_t *)ddi_get_driver_private(dip);
2197
2198	/* sanity check the requested physical geometry */
2199	mutex_enter(&vdp->xdf_dev_lk);
2200	if ((geomp->g_secsize != XB_BSIZE) ||
2201	    (geomp->g_capacity == 0)) {
2202		mutex_exit(&vdp->xdf_dev_lk);
2203		return (EINVAL);
2204	}
2205
2206	/*
2207	 * If we've already connected to the backend device then make sure
2208	 * we're not defining a physical geometry larger than our backend
2209	 * device.
2210	 */
2211	if ((vdp->xdf_xdev_nblocks != 0) &&
2212	    (geomp->g_capacity > vdp->xdf_xdev_nblocks)) {
2213		mutex_exit(&vdp->xdf_dev_lk);
2214		return (EINVAL);
2215	}
2216
2217	bzero(&vdp->xdf_pgeom, sizeof (vdp->xdf_pgeom));
2218	vdp->xdf_pgeom.g_ncyl = geomp->g_ncyl;
2219	vdp->xdf_pgeom.g_acyl = geomp->g_acyl;
2220	vdp->xdf_pgeom.g_nhead = geomp->g_nhead;
2221	vdp->xdf_pgeom.g_nsect = geomp->g_nsect;
2222	vdp->xdf_pgeom.g_secsize = geomp->g_secsize;
2223	vdp->xdf_pgeom.g_capacity = geomp->g_capacity;
2224	vdp->xdf_pgeom.g_intrlv = geomp->g_intrlv;
2225	vdp->xdf_pgeom.g_rpm = geomp->g_rpm;
2226
2227	vdp->xdf_pgeom_fixed = B_TRUE;
2228	mutex_exit(&vdp->xdf_dev_lk);
2229
2230	/* force a re-validation */
2231	cmlb_invalidate(vdp->xdf_vd_lbl, NULL);
2232
2233	return (0);
2234}
2235
2236boolean_t
2237xdf_is_cd(dev_info_t *dip)
2238{
2239	xdf_t		*vdp = (xdf_t *)ddi_get_driver_private(dip);
2240	boolean_t	rv;
2241
2242	mutex_enter(&vdp->xdf_cb_lk);
2243	rv = XD_IS_CD(vdp);
2244	mutex_exit(&vdp->xdf_cb_lk);
2245	return (rv);
2246}
2247
2248boolean_t
2249xdf_is_rm(dev_info_t *dip)
2250{
2251	xdf_t		*vdp = (xdf_t *)ddi_get_driver_private(dip);
2252	boolean_t	rv;
2253
2254	mutex_enter(&vdp->xdf_cb_lk);
2255	rv = XD_IS_RM(vdp);
2256	mutex_exit(&vdp->xdf_cb_lk);
2257	return (rv);
2258}
2259
2260boolean_t
2261xdf_media_req_supported(dev_info_t *dip)
2262{
2263	xdf_t		*vdp = (xdf_t *)ddi_get_driver_private(dip);
2264	boolean_t	rv;
2265
2266	mutex_enter(&vdp->xdf_cb_lk);
2267	rv = vdp->xdf_media_req_supported;
2268	mutex_exit(&vdp->xdf_cb_lk);
2269	return (rv);
2270}
2271
2272#endif /* XPV_HVM_DRIVER */
2273
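/*
 * cmlb callbacks.  The common disk label code calls back into the driver
 * through xdf_lb_getinfo() and xdf_lb_rdwr(); the xdf_lb_get*() routines
 * below are the helpers that service the individual TG_* queries.
 */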
2274static int
2275xdf_lb_getcap(dev_info_t *dip, diskaddr_t *capp)
2276{
2277	xdf_t *vdp;
2278	vdp = ddi_get_soft_state(xdf_ssp, ddi_get_instance(dip));
2279
2280	if (vdp == NULL)
2281		return (ENXIO);
2282
2283	mutex_enter(&vdp->xdf_dev_lk);
2284	*capp = vdp->xdf_pgeom.g_capacity;
2285	DPRINTF(LBL_DBG, ("xdf@%s:capacity %llu\n", vdp->xdf_addr, *capp));
2286	mutex_exit(&vdp->xdf_dev_lk);
2287	return (0);
2288}
2289
2290static int
2291xdf_lb_getpgeom(dev_info_t *dip, cmlb_geom_t *geomp)
2292{
2293	xdf_t *vdp;
2294
2295	if ((vdp = ddi_get_soft_state(xdf_ssp, ddi_get_instance(dip))) == NULL)
2296		return (ENXIO);
2297	*geomp = vdp->xdf_pgeom;
2298	return (0);
2299}
2300
2301/*
2302 * No real HBA, no geometry available from it
2303 */
2304/*ARGSUSED*/
2305static int
2306xdf_lb_getvgeom(dev_info_t *dip, cmlb_geom_t *geomp)
2307{
2308	return (EINVAL);
2309}
2310
2311static int
2312xdf_lb_getattribute(dev_info_t *dip, tg_attribute_t *tgattributep)
2313{
2314	xdf_t *vdp;
2315
2316	if (!(vdp = ddi_get_soft_state(xdf_ssp, ddi_get_instance(dip))))
2317		return (ENXIO);
2318
2319	if (XD_IS_RO(vdp))
2320		tgattributep->media_is_writable = 0;
2321	else
2322		tgattributep->media_is_writable = 1;
2323	return (0);
2324}
2325
2326/* ARGSUSED3 */
2327int
2328xdf_lb_getinfo(dev_info_t *dip, int cmd, void *arg, void *tg_cookie)
2329{
2330	switch (cmd) {
2331	case TG_GETPHYGEOM:
2332		return (xdf_lb_getpgeom(dip, (cmlb_geom_t *)arg));
2333	case TG_GETVIRTGEOM:
2334		return (xdf_lb_getvgeom(dip, (cmlb_geom_t *)arg));
2335	case TG_GETCAPACITY:
2336		return (xdf_lb_getcap(dip, (diskaddr_t *)arg));
2337	case TG_GETBLOCKSIZE:
2338		*(uint32_t *)arg = XB_BSIZE;
2339		return (0);
2340	case TG_GETATTR:
2341		return (xdf_lb_getattribute(dip, (tg_attribute_t *)arg));
2342	default:
2343		return (ENOTTY);
2344	}
2345}
2346
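/*
 * Synchronous read/write entry point.  A buf is constructed for the
 * request, pushed onto the I/O queue, and we biowait() for it to
 * complete; if we're running on the ready taskq thread the ring is also
 * drained by polling.  Illustrative (hypothetical) usage, reading the
 * first block of the device into a local buffer:
 *
 *	char blk[DEV_BSIZE];
 *	int err = xdf_lb_rdwr(dip, TG_READ, blk, 0, DEV_BSIZE, NULL);
 */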
2347/* ARGSUSED5 */
2348int
2349xdf_lb_rdwr(dev_info_t *dip, uchar_t cmd, void *bufp,
2350    diskaddr_t start, size_t reqlen, void *tg_cookie)
2351{
2352	xdf_t *vdp;
2353	struct buf *bp;
2354	int err = 0;
2355
2356	vdp = ddi_get_soft_state(xdf_ssp, ddi_get_instance(dip));
2357
2358	/* We don't allow IO from the oe_change callback thread */
2359	ASSERT(curthread != vdp->xdf_oe_change_thread);
2360
2361	if ((start + (reqlen >> DEV_BSHIFT)) > vdp->xdf_pgeom.g_capacity)
2362		return (EINVAL);
2363
2364	bp = getrbuf(KM_SLEEP);
2365	if (cmd == TG_READ)
2366		bp->b_flags = B_BUSY | B_READ;
2367	else
2368		bp->b_flags = B_BUSY | B_WRITE;
2369	bp->b_un.b_addr = bufp;
2370	bp->b_bcount = reqlen;
2371	bp->b_blkno = start;
2372	bp->b_edev = DDI_DEV_T_NONE; /* don't have dev_t */
2373
2374	mutex_enter(&vdp->xdf_dev_lk);
2375	xdf_bp_push(vdp, bp);
2376	mutex_exit(&vdp->xdf_dev_lk);
2377	xdf_io_start(vdp);
2378	if (curthread == vdp->xdf_ready_tq_thread)
2379		(void) xdf_ring_drain(vdp);
2380	err = biowait(bp);
2381	ASSERT(bp->b_flags & B_DONE);
2382	freerbuf(bp);
2383	return (err);
2384}
2385
2386/*
2387 * Lock the current media.  Set the media state to "lock".
2388 * (Media locks are only respected by the backend driver.)
2389 */
2390static int
2391xdf_ioctl_mlock(xdf_t *vdp)
2392{
2393	int rv;
2394	mutex_enter(&vdp->xdf_cb_lk);
2395	rv = xdf_media_req(vdp, XBV_MEDIA_REQ_LOCK, B_TRUE);
2396	mutex_exit(&vdp->xdf_cb_lk);
2397	return (rv);
2398}
2399
2400/*
2401 * Release a media lock.  Set the media state to "none".
2402 */
2403static int
2404xdf_ioctl_munlock(xdf_t *vdp)
2405{
2406	int rv;
2407	mutex_enter(&vdp->xdf_cb_lk);
2408	rv = xdf_media_req(vdp, XBV_MEDIA_REQ_NONE, B_TRUE);
2409	mutex_exit(&vdp->xdf_cb_lk);
2410	return (rv);
2411}
2412
2413/*
2414 * Eject the current media.  Ignores any media locks.  (Media locks
2415 * are only for the benefit of the backend.)
2416 */
2417static int
2418xdf_ioctl_eject(xdf_t *vdp)
2419{
2420	int rv;
2421
2422	mutex_enter(&vdp->xdf_cb_lk);
2423	if ((rv = xdf_media_req(vdp, XBV_MEDIA_REQ_EJECT, B_FALSE)) != 0) {
2424		mutex_exit(&vdp->xdf_cb_lk);
2425		return (rv);
2426	}
2427
2428	/*
2429	 * We've set the media request xenbus parameter to eject, so now
2430	 * disconnect from the backend, wait for the backend to clear
2431	 * the media request xenbus parameter, and then we can reconnect
2432	 * to the backend.
2433	 */
2434	(void) xdf_disconnect(vdp, XD_UNKNOWN, B_TRUE);
2435	mutex_enter(&vdp->xdf_dev_lk);
2436	if (xdf_connect_locked(vdp, B_TRUE) != XD_READY) {
2437		mutex_exit(&vdp->xdf_dev_lk);
2438		mutex_exit(&vdp->xdf_cb_lk);
2439		return (EIO);
2440	}
2441	mutex_exit(&vdp->xdf_dev_lk);
2442	mutex_exit(&vdp->xdf_cb_lk);
2443	return (0);
2444}
2445
2446/*
2447 * Watch for media state changes.  This can be an insertion of a device
2448 * (triggered by an 'xm block-configure' request in another domain) or
2449 * the ejection of a device (triggered by a local "eject" operation).
2450 * For a full description of the DKIOCSTATE ioctl behavior see dkio(7I).
2451 */
2452static int
2453xdf_dkstate(xdf_t *vdp, enum dkio_state mstate)
2454{
2455	enum dkio_state		prev_state;
2456
2457	mutex_enter(&vdp->xdf_cb_lk);
2458	prev_state = vdp->xdf_mstate;
2459
2460	if (vdp->xdf_mstate == mstate) {
2461		while (vdp->xdf_mstate == prev_state) {
2462			if (cv_wait_sig(&vdp->xdf_mstate_cv,
2463			    &vdp->xdf_cb_lk) == 0) {
2464				mutex_exit(&vdp->xdf_cb_lk);
2465				return (EINTR);
2466			}
2467		}
2468	}
2469
2470	if ((prev_state != DKIO_INSERTED) &&
2471	    (vdp->xdf_mstate == DKIO_INSERTED)) {
2472		(void) xdf_media_req(vdp, XBV_MEDIA_REQ_LOCK, B_TRUE);
2473		mutex_exit(&vdp->xdf_cb_lk);
2474		return (0);
2475	}
2476
2477	mutex_exit(&vdp->xdf_cb_lk);
2478	return (0);
2479}
2480
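/*
 * Disk ioctl entry point.  Label and partition ioctls are passed through
 * to cmlb_ioctl(); eject, lock, and unlock are translated into backend
 * media requests; DKIOCFLUSHWRITECACHE is serviced either by a native
 * flush operation or, when only write barriers are available, by writing
 * the dedicated cache-flush block.  Everything else is handled inline.
 */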
2481/*ARGSUSED*/
2482static int
2483xdf_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
2484    int *rvalp)
2485{
2486	minor_t		minor = getminor(dev);
2487	int		part = XDF_PART(minor);
2488	xdf_t		*vdp;
2489	int		rv;
2490
2491	if (((vdp = ddi_get_soft_state(xdf_ssp, XDF_INST(minor))) == NULL) ||
2492	    (!xdf_isopen(vdp, part)))
2493		return (ENXIO);
2494
2495	DPRINTF(IOCTL_DBG, ("xdf@%s:ioctl: cmd %d (0x%x)\n",
2496	    vdp->xdf_addr, cmd, cmd));
2497
2498	switch (cmd) {
2499	default:
2500		return (ENOTTY);
2501	case DKIOCG_PHYGEOM:
2502	case DKIOCG_VIRTGEOM:
2503	case DKIOCGGEOM:
2504	case DKIOCSGEOM:
2505	case DKIOCGAPART:
2506	case DKIOCSAPART:
2507	case DKIOCGVTOC:
2508	case DKIOCSVTOC:
2509	case DKIOCPARTINFO:
2510	case DKIOCGEXTVTOC:
2511	case DKIOCSEXTVTOC:
2512	case DKIOCEXTPARTINFO:
2513	case DKIOCGMBOOT:
2514	case DKIOCSMBOOT:
2515	case DKIOCGETEFI:
2516	case DKIOCSETEFI:
2517	case DKIOCPARTITION:
2518		return (cmlb_ioctl(vdp->xdf_vd_lbl, dev, cmd, arg, mode, credp,
2519		    rvalp, NULL));
2520	case FDEJECT:
2521	case DKIOCEJECT:
2522	case CDROMEJECT:
2523		return (xdf_ioctl_eject(vdp));
2524	case DKIOCLOCK:
2525		return (xdf_ioctl_mlock(vdp));
2526	case DKIOCUNLOCK:
2527		return (xdf_ioctl_munlock(vdp));
2528	case CDROMREADOFFSET: {
2529		int offset = 0;
2530		if (!XD_IS_CD(vdp))
2531			return (ENOTTY);
2532		if (ddi_copyout(&offset, (void *)arg, sizeof (int), mode))
2533			return (EFAULT);
2534		return (0);
2535	}
2536	case DKIOCGMEDIAINFO: {
2537		struct dk_minfo media_info;
2538
2539		media_info.dki_lbsize = DEV_BSIZE;
2540		media_info.dki_capacity = vdp->xdf_pgeom.g_capacity;
2541		if (XD_IS_CD(vdp))
2542			media_info.dki_media_type = DK_CDROM;
2543		else
2544			media_info.dki_media_type = DK_FIXED_DISK;
2545
2546		if (ddi_copyout(&media_info, (void *)arg,
2547		    sizeof (struct dk_minfo), mode))
2548			return (EFAULT);
2549		return (0);
2550	}
2551	case DKIOCINFO: {
2552		struct dk_cinfo info;
2553
2554		/* controller information */
2555		if (XD_IS_CD(vdp))
2556			info.dki_ctype = DKC_CDROM;
2557		else
2558			info.dki_ctype = DKC_VBD;
2559
2560		info.dki_cnum = 0;
2561		(void) strncpy((char *)(&info.dki_cname), "xdf", 8);
2562
2563		/* unit information */
2564		info.dki_unit = ddi_get_instance(vdp->xdf_dip);
2565		(void) strncpy((char *)(&info.dki_dname), "xdf", 8);
2566		info.dki_flags = DKI_FMTVOL;
2567		info.dki_partition = part;
2568		info.dki_maxtransfer = maxphys / DEV_BSIZE;
2569		info.dki_addr = 0;
2570		info.dki_space = 0;
2571		info.dki_prio = 0;
2572		info.dki_vec = 0;
2573
2574		if (ddi_copyout(&info, (void *)arg, sizeof (info), mode))
2575			return (EFAULT);
2576		return (0);
2577	}
2578	case DKIOCSTATE: {
2579		enum dkio_state mstate;
2580
2581		if (ddi_copyin((void *)arg, &mstate,
2582		    sizeof (mstate), mode) != 0)
2583			return (EFAULT);
2584		if ((rv = xdf_dkstate(vdp, mstate)) != 0)
2585			return (rv);
2586		mstate = vdp->xdf_mstate;
2587		if (ddi_copyout(&mstate, (void *)arg,
2588		    sizeof (mstate), mode) != 0)
2589			return (EFAULT);
2590		return (0);
2591	}
2592	case DKIOCREMOVABLE: {
2593		int i = BOOLEAN2VOID(XD_IS_RM(vdp));
2594		if (ddi_copyout(&i, (caddr_t)arg, sizeof (i), mode))
2595			return (EFAULT);
2596		return (0);
2597	}
2598	case DKIOCGETWCE: {
2599		int i = BOOLEAN2VOID(vdp->xdf_wce);
2600		if (ddi_copyout(&i, (void *)arg, sizeof (i), mode))
2601			return (EFAULT);
2602		return (0);
2603	}
2604	case DKIOCSETWCE: {
2605		int i;
2606		if (ddi_copyin((void *)arg, &i, sizeof (i), mode))
2607			return (EFAULT);
2608		vdp->xdf_wce = VOID2BOOLEAN(i);
2609		return (0);
2610	}
2611	case DKIOCFLUSHWRITECACHE: {
2612		struct dk_callback *dkc = (struct dk_callback *)arg;
2613
2614		if (vdp->xdf_flush_supported) {
2615			rv = xdf_lb_rdwr(vdp->xdf_dip, TG_WRITE,
2616			    NULL, 0, 0, (void *)dev);
2617		} else if (vdp->xdf_feature_barrier &&
2618		    !xdf_barrier_flush_disable) {
2619			rv = xdf_lb_rdwr(vdp->xdf_dip, TG_WRITE,
2620			    vdp->xdf_cache_flush_block, xdf_flush_block,
2621			    DEV_BSIZE, (void *)dev);
2622		} else {
2623			return (ENOTTY);
2624		}
2625		if ((mode & FKIOCTL) && (dkc != NULL) &&
2626		    (dkc->dkc_callback != NULL)) {
2627			(*dkc->dkc_callback)(dkc->dkc_cookie, rv);
2628			/* need to return 0 after calling callback */
2629			rv = 0;
2630		}
2631		return (rv);
2632	}
2633	}
2634	/*NOTREACHED*/
2635}
2636
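/*
 * strategy(9E) entry point.  The request is validated against the open
 * partition (or against the whole device when b_private is
 * XB_SLICE_NONE), clipped if it runs past the end of the partition,
 * tagged with the partition's starting block in b_private, and queued
 * for xdf_io_start().  When do_polled_io is set the ring is drained
 * synchronously.
 */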
2637static int
2638xdf_strategy(struct buf *bp)
2639{
2640	xdf_t	*vdp;
2641	minor_t minor;
2642	diskaddr_t p_blkct, p_blkst;
2643	ulong_t nblks;
2644	int part;
2645
2646	minor = getminor(bp->b_edev);
2647	part = XDF_PART(minor);
2648	vdp = ddi_get_soft_state(xdf_ssp, XDF_INST(minor));
2649
2650	mutex_enter(&vdp->xdf_dev_lk);
2651	if (!xdf_isopen(vdp, part)) {
2652		mutex_exit(&vdp->xdf_dev_lk);
2653		xdf_io_err(bp, ENXIO, 0);
2654		return (0);
2655	}
2656
2657	/* We don't allow IO from the oe_change callback thread */
2658	ASSERT(curthread != vdp->xdf_oe_change_thread);
2659
2660	/* Check for writes to a read only device */
2661	if (!IS_READ(bp) && XD_IS_RO(vdp)) {
2662		mutex_exit(&vdp->xdf_dev_lk);
2663		xdf_io_err(bp, EROFS, 0);
2664		return (0);
2665	}
2666
2667	/* Check if this I/O is accessing a partition or the entire disk */
2668	if ((long)bp->b_private == XB_SLICE_NONE) {
2669		/* This I/O is using an absolute offset */
2670		p_blkct = vdp->xdf_xdev_nblocks;
2671		p_blkst = 0;
2672	} else {
2673		/* This I/O is using a partition relative offset */
2674		mutex_exit(&vdp->xdf_dev_lk);
2675		if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkct,
2676		    &p_blkst, NULL, NULL, NULL)) {
2677			xdf_io_err(bp, ENXIO, 0);
2678			return (0);
2679		}
2680		mutex_enter(&vdp->xdf_dev_lk);
2681	}
2682
2683	/* check for a starting block beyond the disk or partition limit */
2684	if (bp->b_blkno > p_blkct) {
2685		DPRINTF(IO_DBG, ("xdf@%s: block %lld exceeds VBD size %"PRIu64,
2686		    vdp->xdf_addr, (longlong_t)bp->b_blkno, (uint64_t)p_blkct));
2687		xdf_io_err(bp, EINVAL, 0);
2688		return (0);
2689	}
2690
2691	/* Legacy: don't set the error flag in this case */
2692	if (bp->b_blkno == p_blkct) {
2693		bp->b_resid = bp->b_bcount;
2694		biodone(bp);
2695		return (0);
2696	}
2697
2698	/* sanitize the input buf */
2699	bioerror(bp, 0);
2700	bp->b_resid = 0;
2701	bp->av_back = bp->av_forw = NULL;
2702
2703	/* Adjust for a partial transfer; this will result in an error later */
2704	nblks = bp->b_bcount >> XB_BSHIFT;
2705	if ((bp->b_blkno + nblks) > p_blkct) {
2706		bp->b_resid = ((bp->b_blkno + nblks) - p_blkct) << XB_BSHIFT;
2707		bp->b_bcount -= bp->b_resid;
2708	}
2709
2710	DPRINTF(IO_DBG, ("xdf@%s: strategy blk %lld len %lu\n",
2711	    vdp->xdf_addr, (longlong_t)bp->b_blkno, (ulong_t)bp->b_bcount));
2712
2713	/* Fix up the buf struct */
2714	bp->b_flags |= B_BUSY;
2715	bp->b_private = (void *)(uintptr_t)p_blkst;
2716
2717	xdf_bp_push(vdp, bp);
2718	mutex_exit(&vdp->xdf_dev_lk);
2719	xdf_io_start(vdp);
2720	if (do_polled_io)
2721		(void) xdf_ring_drain(vdp);
2722	return (0);
2723}
2724
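/*
 * read(9E)/write(9E) and their async counterparts below are thin
 * wrappers: they validate that the partition is open and has a known
 * size, then hand the transfer to physio()/aphysio() with xdf_strategy()
 * doing the real work.
 */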
2725/*ARGSUSED*/
2726static int
2727xdf_read(dev_t dev, struct uio *uiop, cred_t *credp)
2728{
2729	xdf_t	*vdp;
2730	minor_t minor;
2731	diskaddr_t p_blkcnt;
2732	int part;
2733
2734	minor = getminor(dev);
2735	if ((vdp = ddi_get_soft_state(xdf_ssp, XDF_INST(minor))) == NULL)
2736		return (ENXIO);
2737
2738	DPRINTF(IO_DBG, ("xdf@%s: read offset 0x%"PRIx64"\n",
2739	    vdp->xdf_addr, (int64_t)uiop->uio_offset));
2740
2741	part = XDF_PART(minor);
2742	if (!xdf_isopen(vdp, part))
2743		return (ENXIO);
2744
2745	if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkcnt,
2746	    NULL, NULL, NULL, NULL))
2747		return (ENXIO);
2748
2749	if (U_INVAL(uiop))
2750		return (EINVAL);
2751
2752	return (physio(xdf_strategy, NULL, dev, B_READ, xdfmin, uiop));
2753}
2754
2755/*ARGSUSED*/
2756static int
2757xdf_write(dev_t dev, struct uio *uiop, cred_t *credp)
2758{
2759	xdf_t *vdp;
2760	minor_t minor;
2761	diskaddr_t p_blkcnt;
2762	int part;
2763
2764	minor = getminor(dev);
2765	if ((vdp = ddi_get_soft_state(xdf_ssp, XDF_INST(minor))) == NULL)
2766		return (ENXIO);
2767
2768	DPRINTF(IO_DBG, ("xdf@%s: write offset 0x%"PRIx64"\n",
2769	    vdp->xdf_addr, (int64_t)uiop->uio_offset));
2770
2771	part = XDF_PART(minor);
2772	if (!xdf_isopen(vdp, part))
2773		return (ENXIO);
2774
2775	if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkcnt,
2776	    NULL, NULL, NULL, NULL))
2777		return (ENXIO);
2778
2779	if (uiop->uio_loffset >= XB_DTOB(p_blkcnt))
2780		return (ENOSPC);
2781
2782	if (U_INVAL(uiop))
2783		return (EINVAL);
2784
2785	return (physio(xdf_strategy, NULL, dev, B_WRITE, xdfmin, uiop));
2786}
2787
2788/*ARGSUSED*/
2789static int
2790xdf_aread(dev_t dev, struct aio_req *aiop, cred_t *credp)
2791{
2792	xdf_t	*vdp;
2793	minor_t minor;
2794	struct uio *uiop = aiop->aio_uio;
2795	diskaddr_t p_blkcnt;
2796	int part;
2797
2798	minor = getminor(dev);
2799	if ((vdp = ddi_get_soft_state(xdf_ssp, XDF_INST(minor))) == NULL)
2800		return (ENXIO);
2801
2802	part = XDF_PART(minor);
2803	if (!xdf_isopen(vdp, part))
2804		return (ENXIO);
2805
2806	if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkcnt,
2807	    NULL, NULL, NULL, NULL))
2808		return (ENXIO);
2809
2810	if (uiop->uio_loffset >= XB_DTOB(p_blkcnt))
2811		return (ENOSPC);
2812
2813	if (U_INVAL(uiop))
2814		return (EINVAL);
2815
2816	return (aphysio(xdf_strategy, anocancel, dev, B_READ, xdfmin, aiop));
2817}
2818
2819/*ARGSUSED*/
2820static int
2821xdf_awrite(dev_t dev, struct aio_req *aiop, cred_t *credp)
2822{
2823	xdf_t *vdp;
2824	minor_t minor;
2825	struct uio *uiop = aiop->aio_uio;
2826	diskaddr_t p_blkcnt;
2827	int part;
2828
2829	minor = getminor(dev);
2830	if ((vdp = ddi_get_soft_state(xdf_ssp, XDF_INST(minor))) == NULL)
2831		return (ENXIO);
2832
2833	part = XDF_PART(minor);
2834	if (!xdf_isopen(vdp, part))
2835		return (ENXIO);
2836
2837	if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkcnt,
2838	    NULL, NULL, NULL, NULL))
2839		return (ENXIO);
2840
2841	if (uiop->uio_loffset >= XB_DTOB(p_blkcnt))
2842		return (ENOSPC);
2843
2844	if (U_INVAL(uiop))
2845		return (EINVAL);
2846
2847	return (aphysio(xdf_strategy, anocancel, dev, B_WRITE, xdfmin, aiop));
2848}
2849
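/*
 * dump(9E) entry point, used when writing a crash dump to the device.
 * The buf is built on the stack, queued, and the ring is drained by
 * polling rather than by waiting for an interrupt.
 */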
2850static int
2851xdf_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk)
2852{
2853	struct buf dumpbuf, *dbp = &dumpbuf;
2854	xdf_t	*vdp;
2855	minor_t minor;
2856	int err = 0;
2857	int part;
2858	diskaddr_t p_blkcnt, p_blkst;
2859
2860	minor = getminor(dev);
2861	if ((vdp = ddi_get_soft_state(xdf_ssp, XDF_INST(minor))) == NULL)
2862		return (ENXIO);
2863
2864	DPRINTF(IO_DBG, ("xdf@%s: dump addr (0x%p) blk (%ld) nblks (%d)\n",
2865	    vdp->xdf_addr, (void *)addr, blkno, nblk));
2866
2867	/* We don't allow IO from the oe_change callback thread */
2868	ASSERT(curthread != vdp->xdf_oe_change_thread);
2869
2870	part = XDF_PART(minor);
2871	if (!xdf_isopen(vdp, part))
2872		return (ENXIO);
2873
2874	if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkcnt, &p_blkst,
2875	    NULL, NULL, NULL))
2876		return (ENXIO);
2877
2878	if ((blkno + nblk) > p_blkcnt) {
2879		cmn_err(CE_WARN, "xdf@%s: block %ld exceeds VBD size %"PRIu64,
2880		    vdp->xdf_addr, blkno + nblk, (uint64_t)p_blkcnt);
2881		return (EINVAL);
2882	}
2883
2884	bioinit(dbp);
2885	dbp->b_flags = B_BUSY;
2886	dbp->b_un.b_addr = addr;
2887	dbp->b_bcount = nblk << DEV_BSHIFT;
2888	dbp->b_blkno = blkno;
2889	dbp->b_edev = dev;
2890	dbp->b_private = (void *)(uintptr_t)p_blkst;
2891
2892	mutex_enter(&vdp->xdf_dev_lk);
2893	xdf_bp_push(vdp, dbp);
2894	mutex_exit(&vdp->xdf_dev_lk);
2895	xdf_io_start(vdp);
2896	err = xdf_ring_drain(vdp);
2897	biofini(dbp);
2898	return (err);
2899}
2900
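/*
 * open(9E)/close(9E) bookkeeping.  Per-partition opens are tracked as
 * bit masks in xdf_vd_open[] (indexed by open type), layered opens are
 * counted in xdf_vd_lyropen[], and exclusive opens are recorded in
 * xdf_vd_exclopen.
 */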
2901/*ARGSUSED*/
2902static int
2903xdf_close(dev_t dev, int flag, int otyp, struct cred *credp)
2904{
2905	minor_t	minor;
2906	xdf_t	*vdp;
2907	int part;
2908	ulong_t parbit;
2909
2910	minor = getminor(dev);
2911	if ((vdp = ddi_get_soft_state(xdf_ssp, XDF_INST(minor))) == NULL)
2912		return (ENXIO);
2913
2914	mutex_enter(&vdp->xdf_dev_lk);
2915	part = XDF_PART(minor);
2916	if (!xdf_isopen(vdp, part)) {
2917		mutex_exit(&vdp->xdf_dev_lk);
2918		return (ENXIO);
2919	}
2920	parbit = 1 << part;
2921
2922	ASSERT((vdp->xdf_vd_open[otyp] & parbit) != 0);
2923	if (otyp == OTYP_LYR) {
2924		ASSERT(vdp->xdf_vd_lyropen[part] > 0);
2925		if (--vdp->xdf_vd_lyropen[part] == 0)
2926			vdp->xdf_vd_open[otyp] &= ~parbit;
2927	} else {
2928		vdp->xdf_vd_open[otyp] &= ~parbit;
2929	}
2930	vdp->xdf_vd_exclopen &= ~parbit;
2931
2932	mutex_exit(&vdp->xdf_dev_lk);
2933	return (0);
2934}
2935
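/*
 * A blocking open waits for the device to reach XD_READY (via
 * xdf_connect_locked()) and then requires a valid label with a non-zero
 * sized partition; an FNDELAY/FNONBLOCK open skips both checks.  The
 * first open of any partition forces a cmlb label re-validation.
 */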
2936static int
2937xdf_open(dev_t *devp, int flag, int otyp, cred_t *credp)
2938{
2939	minor_t	minor;
2940	xdf_t	*vdp;
2941	int part;
2942	ulong_t parbit;
2943	diskaddr_t p_blkct = 0;
2944	boolean_t firstopen;
2945	boolean_t nodelay;
2946
2947	minor = getminor(*devp);
2948	if ((vdp = ddi_get_soft_state(xdf_ssp, XDF_INST(minor))) == NULL)
2949		return (ENXIO);
2950
2951	nodelay = (flag & (FNDELAY | FNONBLOCK));
2952
2953	DPRINTF(DDI_DBG, ("xdf@%s: opening\n", vdp->xdf_addr));
2954
2955	/* do cv_wait until connected or failed */
2956	mutex_enter(&vdp->xdf_cb_lk);
2957	mutex_enter(&vdp->xdf_dev_lk);
2958	if (!nodelay && (xdf_connect_locked(vdp, B_TRUE) != XD_READY)) {
2959		mutex_exit(&vdp->xdf_dev_lk);
2960		mutex_exit(&vdp->xdf_cb_lk);
2961		return (ENXIO);
2962	}
2963	mutex_exit(&vdp->xdf_cb_lk);
2964
2965	if ((flag & FWRITE) && XD_IS_RO(vdp)) {
2966		mutex_exit(&vdp->xdf_dev_lk);
2967		return (EROFS);
2968	}
2969
2970	part = XDF_PART(minor);
2971	parbit = 1 << part;
2972	if ((vdp->xdf_vd_exclopen & parbit) ||
2973	    ((flag & FEXCL) && xdf_isopen(vdp, part))) {
2974		mutex_exit(&vdp->xdf_dev_lk);
2975		return (EBUSY);
2976	}
2977
2978	/* are we the first one to open this node? */
2979	firstopen = !xdf_isopen(vdp, -1);
2980
2981	if (otyp == OTYP_LYR)
2982		vdp->xdf_vd_lyropen[part]++;
2983
2984	vdp->xdf_vd_open[otyp] |= parbit;
2985
2986	if (flag & FEXCL)
2987		vdp->xdf_vd_exclopen |= parbit;
2988
2989	mutex_exit(&vdp->xdf_dev_lk);
2990
2991	/* force a re-validation */
2992	if (firstopen)
2993		cmlb_invalidate(vdp->xdf_vd_lbl, NULL);
2994
2995	/* If this is a non-blocking open then we're done */
2996	if (nodelay)
2997		return (0);
2998
2999	/*
3000	 * This is a blocking open, so we require:
3001	 * - that the disk have a valid label on it
3002	 * - that the size of the partition that we're opening is non-zero
3003	 */
3004	if ((cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkct,
3005	    NULL, NULL, NULL, NULL) != 0) || (p_blkct == 0)) {
3006		(void) xdf_close(*devp, flag, otyp, credp);
3007		return (ENXIO);
3008	}
3009
3010	return (0);
3011}
3012
3013/*ARGSUSED*/
3014static void
3015xdf_watch_hp_status_cb(dev_info_t *dip, const char *path, void *arg)
3016{
3017	xdf_t *vdp = (xdf_t *)ddi_get_driver_private(dip);
3018	cv_broadcast(&vdp->xdf_hp_status_cv);
3019}
3020
3021static int
3022xdf_prop_op(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op, int flags,
3023	char *name, caddr_t valuep, int *lengthp)
3024{
3025	xdf_t	*vdp = ddi_get_soft_state(xdf_ssp, ddi_get_instance(dip));
3026
3027	/*
3028	 * Sanity check that any dev_t or dip specified corresponds to
3029	 * this device driver.  On debug kernels we'll panic and on
3030	 * non-debug kernels we'll return failure.
3031	 */
3032	ASSERT(ddi_driver_major(dip) == xdf_major);
3033	ASSERT((dev == DDI_DEV_T_ANY) || (getmajor(dev) == xdf_major));
3034	if ((ddi_driver_major(dip) != xdf_major) ||
3035	    ((dev != DDI_DEV_T_ANY) && (getmajor(dev) != xdf_major)))
3036		return (DDI_PROP_NOT_FOUND);
3037
3038	if (vdp == NULL)
3039		return (ddi_prop_op(dev, dip, prop_op, flags,
3040		    name, valuep, lengthp));
3041
3042	return (cmlb_prop_op(vdp->xdf_vd_lbl,
3043	    dev, dip, prop_op, flags, name, valuep, lengthp,
3044	    XDF_PART(getminor(dev)), NULL));
3045}
3046
3047/*ARGSUSED*/
3048static int
3049xdf_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **rp)
3050{
3051	int	instance = XDF_INST(getminor((dev_t)arg));
3052	xdf_t	*vbdp;
3053
3054	switch (cmd) {
3055	case DDI_INFO_DEVT2DEVINFO:
3056		if ((vbdp = ddi_get_soft_state(xdf_ssp, instance)) == NULL) {
3057			*rp = NULL;
3058			return (DDI_FAILURE);
3059		}
3060		*rp = vbdp->xdf_dip;
3061		return (DDI_SUCCESS);
3062
3063	case DDI_INFO_DEVT2INSTANCE:
3064		*rp = (void *)(uintptr_t)instance;
3065		return (DDI_SUCCESS);
3066
3067	default:
3068		return (DDI_FAILURE);
3069	}
3070}
3071
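/*
 * DDI_RESUME handler.  Re-register with xvdi, re-establish the watch on
 * the backend's hotplug-status node, reset the state machine to
 * XD_UNKNOWN, and kick off a new connection handshake via
 * xdf_setstate_init().
 */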
3072/*ARGSUSED*/
3073static int
3074xdf_resume(dev_info_t *dip)
3075{
3076	xdf_t	*vdp;
3077	char	*oename;
3078
3079	if ((vdp = ddi_get_soft_state(xdf_ssp, ddi_get_instance(dip))) == NULL)
3080		goto err;
3081
3082	if (xdf_debug & SUSRES_DBG)
3083		xen_printf("xdf@%s: xdf_resume\n", vdp->xdf_addr);
3084
3085	mutex_enter(&vdp->xdf_cb_lk);
3086
3087	if (xvdi_resume(dip) != DDI_SUCCESS) {
3088		mutex_exit(&vdp->xdf_cb_lk);
3089		goto err;
3090	}
3091
3092	if (((oename = xvdi_get_oename(dip)) == NULL) ||
3093	    (xvdi_add_xb_watch_handler(dip, oename, XBP_HP_STATUS,
3094	    xdf_watch_hp_status_cb, NULL) != DDI_SUCCESS)) {
3095		mutex_exit(&vdp->xdf_cb_lk);
3096		goto err;
3097	}
3098
3099	mutex_enter(&vdp->xdf_dev_lk);
3100	ASSERT(vdp->xdf_state != XD_READY);
3101	xdf_set_state(vdp, XD_UNKNOWN);
3102	mutex_exit(&vdp->xdf_dev_lk);
3103
3104	if (xdf_setstate_init(vdp) != DDI_SUCCESS) {
3105		mutex_exit(&vdp->xdf_cb_lk);
3106		goto err;
3107	}
3108
3109	mutex_exit(&vdp->xdf_cb_lk);
3110
3111	if (xdf_debug & SUSRES_DBG)
3112		xen_printf("xdf@%s: xdf_resume: done\n", vdp->xdf_addr);
3113	return (DDI_SUCCESS);
3114err:
3115	if (xdf_debug & SUSRES_DBG)
3116		xen_printf("xdf@%s: xdf_resume: fail\n", vdp->xdf_addr);
3117	return (DDI_FAILURE);
3118}
3119
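/*
 * attach(9E).  For DDI_ATTACH we allocate and initialize the soft state
 * (locks, cvs, the ready taskq, the hotplug-status watch, and the soft
 * interrupt used to restart I/O), attach a cmlb handle with a synthetic
 * geometry, register the backend state change handler, and start the
 * connection handshake.  DDI_RESUME is handed off to xdf_resume().
 */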
3120static int
3121xdf_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
3122{
3123	int			n, instance = ddi_get_instance(dip);
3124	ddi_iblock_cookie_t	ibc, softibc;
3125	boolean_t		dev_iscd = B_FALSE;
3126	xdf_t			*vdp;
3127	char			*oename, *xsname, *str;
3128
3129	if ((n = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_NOTPROM,
3130	    "xdf_debug", 0)) != 0)
3131		xdf_debug = n;
3132
3133	switch (cmd) {
3134	case DDI_RESUME:
3135		return (xdf_resume(dip));
3136	case DDI_ATTACH:
3137		break;
3138	default:
3139		return (DDI_FAILURE);
3140	}
3141	/* DDI_ATTACH */
3142
3143	if (((xsname = xvdi_get_xsname(dip)) == NULL) ||
3144	    ((oename = xvdi_get_oename(dip)) == NULL))
3145		return (DDI_FAILURE);
3146
3147	/*
3148	 * Disable auto-detach.  This is necessary so that we don't get
3149	 * detached while we're disconnected from the back end.
3150	 */
3151	if ((ddi_prop_update_int(DDI_DEV_T_NONE, dip,
3152	    DDI_NO_AUTODETACH, 1) != DDI_PROP_SUCCESS))
3153		return (DDI_FAILURE);
3154
3155	/* driver handles kernel-issued IOCTLs */
3156	if (ddi_prop_create(DDI_DEV_T_NONE, dip,
3157	    DDI_PROP_CANSLEEP, DDI_KERNEL_IOCTL, NULL, 0) != DDI_PROP_SUCCESS)
3158		return (DDI_FAILURE);
3159
3160	if (ddi_get_iblock_cookie(dip, 0, &ibc) != DDI_SUCCESS)
3161		return (DDI_FAILURE);
3162
3163	if (ddi_get_soft_iblock_cookie(dip,
3164	    DDI_SOFTINT_LOW, &softibc) != DDI_SUCCESS)
3165		return (DDI_FAILURE);
3166
3167	if (xenbus_read_str(xsname, XBP_DEV_TYPE, &str) != 0) {
3168		cmn_err(CE_WARN, "xdf@%s: cannot read device-type",
3169		    ddi_get_name_addr(dip));
3170		return (DDI_FAILURE);
3171	}
3172	if (strcmp(str, XBV_DEV_TYPE_CD) == 0)
3173		dev_iscd = B_TRUE;
3174	strfree(str);
3175
3176	if (ddi_soft_state_zalloc(xdf_ssp, instance) != DDI_SUCCESS)
3177		return (DDI_FAILURE);
3178
3179	DPRINTF(DDI_DBG, ("xdf@%s: attaching\n", ddi_get_name_addr(dip)));
3180	vdp = ddi_get_soft_state(xdf_ssp, instance);
3181	ddi_set_driver_private(dip, vdp);
3182	vdp->xdf_dip = dip;
3183	vdp->xdf_addr = ddi_get_name_addr(dip);
3184	vdp->xdf_suspending = B_FALSE;
3185	vdp->xdf_media_req_supported = B_FALSE;
3186	vdp->xdf_peer = INVALID_DOMID;
3187	vdp->xdf_evtchn = INVALID_EVTCHN;
3188	list_create(&vdp->xdf_vreq_act, sizeof (v_req_t),
3189	    offsetof(v_req_t, v_link));
3190	cv_init(&vdp->xdf_dev_cv, NULL, CV_DEFAULT, NULL);
3191	cv_init(&vdp->xdf_hp_status_cv, NULL, CV_DEFAULT, NULL);
3192	cv_init(&vdp->xdf_mstate_cv, NULL, CV_DEFAULT, NULL);
3193	mutex_init(&vdp->xdf_dev_lk, NULL, MUTEX_DRIVER, (void *)ibc);
3194	mutex_init(&vdp->xdf_cb_lk, NULL, MUTEX_DRIVER, (void *)ibc);
3195	mutex_init(&vdp->xdf_iostat_lk, NULL, MUTEX_DRIVER, (void *)ibc);
3196	vdp->xdf_cmbl_reattach = B_TRUE;
3197	if (dev_iscd) {
3198		vdp->xdf_dinfo |= VDISK_CDROM;
3199		vdp->xdf_mstate = DKIO_EJECTED;
3200	} else {
3201		vdp->xdf_mstate = DKIO_NONE;
3202	}
3203
3204	if ((vdp->xdf_ready_tq = ddi_taskq_create(dip, "xdf_ready_tq",
3205	    1, TASKQ_DEFAULTPRI, 0)) == NULL)
3206		goto errout0;
3207
3208	if (xvdi_add_xb_watch_handler(dip, oename, XBP_HP_STATUS,
3209	    xdf_watch_hp_status_cb, NULL) != DDI_SUCCESS)
3210		goto errout0;
3211
3212	if (ddi_add_softintr(dip, DDI_SOFTINT_LOW, &vdp->xdf_softintr_id,
3213	    &softibc, NULL, xdf_iorestart, (caddr_t)vdp) != DDI_SUCCESS) {
3214		cmn_err(CE_WARN, "xdf@%s: failed to add softintr",
3215		    ddi_get_name_addr(dip));
3216		goto errout0;
3217	}
3218
3219	/*
3220	 * Initialize the physical geometry structure.  Note that currently
3221	 * we don't know the size of the backend device so the number
3222	 * of blocks on the device will be initialized to zero.  Once
3223	 * we connect to the backend device we'll update the physical
3224	 * geometry to reflect the real size of the device.
3225	 */
3226	xdf_synthetic_pgeom(dip, &vdp->xdf_pgeom);
3227	vdp->xdf_pgeom_fixed = B_FALSE;
3228
3229	/*
3230	 * Create the default device minor nodes (for a non-removable disk);
3231	 * we will adjust the minor nodes once we are connected to the backend.
3232	 */
3233	cmlb_alloc_handle(&vdp->xdf_vd_lbl);
3234	if (xdf_cmlb_attach(vdp) != 0) {
3235		cmn_err(CE_WARN,
3236		    "xdf@%s: attach failed, cmlb attach failed",
3237		    ddi_get_name_addr(dip));
3238		goto errout0;
3239	}
3240
3241	/*
3242	 * We ship with cache-enabled disks
3243	 */
3244	vdp->xdf_wce = B_TRUE;
3245
3246	mutex_enter(&vdp->xdf_cb_lk);
3247	/* Watch backend XenbusState change */
3248	if (xvdi_add_event_handler(dip,
3249	    XS_OE_STATE, xdf_oe_change, NULL) != DDI_SUCCESS) {
3250		mutex_exit(&vdp->xdf_cb_lk);
3251		goto errout0;
3252	}
3253
3254	if (xdf_setstate_init(vdp) != DDI_SUCCESS) {
3255		cmn_err(CE_WARN, "xdf@%s: start connection failed",
3256		    ddi_get_name_addr(dip));
3257		mutex_exit(&vdp->xdf_cb_lk);
3258		goto errout1;
3259	}
3260	mutex_exit(&vdp->xdf_cb_lk);
3261
3262#if defined(XPV_HVM_DRIVER)
3263
3264	xdf_hvm_add(dip);
3265
3266	/* Report our version to dom0.  */
3267	if (xenbus_printf(XBT_NULL, "hvmpv/xdf", "version", "%d",
3268	    HVMPV_XDF_VERS))
3269		cmn_err(CE_WARN, "xdf: couldn't write version\n");
3270
3271#else /* !XPV_HVM_DRIVER */
3272
3273	/* create kstat for iostat(1M) */
3274	if (xdf_kstat_create(dip, "xdf", instance) != 0) {
3275		cmn_err(CE_WARN, "xdf@%s: failed to create kstat",
3276		    ddi_get_name_addr(dip));
3277		goto errout1;
3278	}
3279
3280#endif /* !XPV_HVM_DRIVER */
3281
3282	ddi_report_dev(dip);
3283	DPRINTF(DDI_DBG, ("xdf@%s: attached\n", vdp->xdf_addr));
3284	return (DDI_SUCCESS);
3285
3286errout1:
3287	(void) xvdi_switch_state(vdp->xdf_dip, XBT_NULL, XenbusStateClosed);
3288	xvdi_remove_event_handler(dip, XS_OE_STATE);
3289errout0:
3290	if (vdp->xdf_vd_lbl != NULL) {
3291		cmlb_detach(vdp->xdf_vd_lbl, NULL);
3292		cmlb_free_handle(&vdp->xdf_vd_lbl);
3293		vdp->xdf_vd_lbl = NULL;
3294	}
3295	if (vdp->xdf_softintr_id != NULL)
3296		ddi_remove_softintr(vdp->xdf_softintr_id);
3297	xvdi_remove_xb_watch_handlers(dip);
3298	if (vdp->xdf_ready_tq != NULL)
3299		ddi_taskq_destroy(vdp->xdf_ready_tq);
3300	mutex_destroy(&vdp->xdf_cb_lk);
3301	mutex_destroy(&vdp->xdf_dev_lk);
3302	cv_destroy(&vdp->xdf_dev_cv);
3303	cv_destroy(&vdp->xdf_hp_status_cv);
3304	ddi_soft_state_free(xdf_ssp, instance);
3305	ddi_set_driver_private(dip, NULL);
3306	ddi_prop_remove_all(dip);
3307	cmn_err(CE_WARN, "xdf@%s: attach failed", ddi_get_name_addr(dip));
3308	return (DDI_FAILURE);
3309}
3310
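/*
 * DDI_SUSPEND handler.  Tear down the I/O ring and move the state
 * machine to XD_SUSPEND; xdf_resume() rebuilds the connection on the
 * way back.
 */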
3311static int
3312xdf_suspend(dev_info_t *dip)
3313{
3314	int		instance = ddi_get_instance(dip);
3315	xdf_t		*vdp;
3316
3317	if ((vdp = ddi_get_soft_state(xdf_ssp, instance)) == NULL)
3318		return (DDI_FAILURE);
3319
3320	if (xdf_debug & SUSRES_DBG)
3321		xen_printf("xdf@%s: xdf_suspend\n", vdp->xdf_addr);
3322
3323	xvdi_suspend(dip);
3324
3325	mutex_enter(&vdp->xdf_cb_lk);
3326	mutex_enter(&vdp->xdf_dev_lk);
3327
3328	vdp->xdf_suspending = B_TRUE;
3329	xdf_ring_destroy(vdp);
3330	xdf_set_state(vdp, XD_SUSPEND);
3331	vdp->xdf_suspending = B_FALSE;
3332
3333	mutex_exit(&vdp->xdf_dev_lk);
3334	mutex_exit(&vdp->xdf_cb_lk);
3335
3336	if (xdf_debug & SUSRES_DBG)
3337		xen_printf("xdf@%s: xdf_suspend: done\n", vdp->xdf_addr);
3338
3339	return (DDI_SUCCESS);
3340}
3341
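/*
 * detach(9E).  DDI_DETACH only succeeds once the device can be moved to
 * the XD_CLOSED state; it then unwinds everything that xdf_attach() set
 * up.  DDI_SUSPEND is handed off to xdf_suspend().
 */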
3342static int
3343xdf_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
3344{
3345	xdf_t *vdp;
3346	int instance;
3347
3348	switch (cmd) {
3349
3350	case DDI_PM_SUSPEND:
3351		break;
3352
3353	case DDI_SUSPEND:
3354		return (xdf_suspend(dip));
3355
3356	case DDI_DETACH:
3357		break;
3358
3359	default:
3360		return (DDI_FAILURE);
3361	}
3362
3363	instance = ddi_get_instance(dip);
3364	DPRINTF(DDI_DBG, ("xdf@%s: detaching\n", ddi_get_name_addr(dip)));
3365	vdp = ddi_get_soft_state(xdf_ssp, instance);
3366
3367	if (vdp == NULL)
3368		return (DDI_FAILURE);
3369
3370	mutex_enter(&vdp->xdf_cb_lk);
3371	xdf_disconnect(vdp, XD_CLOSED, B_FALSE);
3372	if (vdp->xdf_state != XD_CLOSED) {
3373		mutex_exit(&vdp->xdf_cb_lk);
3374		return (DDI_FAILURE);
3375	}
3376	mutex_exit(&vdp->xdf_cb_lk);
3377
3378	ASSERT(!ISDMACBON(vdp));
3379
3380#if defined(XPV_HVM_DRIVER)
3381	xdf_hvm_rm(dip);
3382#endif /* XPV_HVM_DRIVER */
3383
3384	if (vdp->xdf_timeout_id != 0)
3385		(void) untimeout(vdp->xdf_timeout_id);
3386
3387	xvdi_remove_event_handler(dip, XS_OE_STATE);
3388	ddi_taskq_destroy(vdp->xdf_ready_tq);
3389
3390	cmlb_detach(vdp->xdf_vd_lbl, NULL);
3391	cmlb_free_handle(&vdp->xdf_vd_lbl);
3392
3393	/* we'll support backend running in domU later */
3394#ifdef	DOMU_BACKEND
3395	(void) xvdi_post_event(dip, XEN_HP_REMOVE);
3396#endif
3397
3398	list_destroy(&vdp->xdf_vreq_act);
3399	ddi_prop_remove_all(dip);
3400	xdf_kstat_delete(dip);
3401	ddi_remove_softintr(vdp->xdf_softintr_id);
3402	xvdi_remove_xb_watch_handlers(dip);
3403	ddi_set_driver_private(dip, NULL);
3404	cv_destroy(&vdp->xdf_dev_cv);
3405	mutex_destroy(&vdp->xdf_cb_lk);
3406	mutex_destroy(&vdp->xdf_dev_lk);
3407	if (vdp->xdf_cache_flush_block != NULL)
3408		kmem_free(vdp->xdf_flush_mem, 2 * DEV_BSIZE);
3409	ddi_soft_state_free(xdf_ssp, instance);
3410	return (DDI_SUCCESS);
3411}
3412
3413/*
3414 * Driver linkage structures.
3415 */
3416static struct cb_ops xdf_cbops = {
3417	xdf_open,
3418	xdf_close,
3419	xdf_strategy,
3420	nodev,
3421	xdf_dump,
3422	xdf_read,
3423	xdf_write,
3424	xdf_ioctl,
3425	nodev,
3426	nodev,
3427	nodev,
3428	nochpoll,
3429	xdf_prop_op,
3430	NULL,
3431	D_MP | D_NEW | D_64BIT,
3432	CB_REV,
3433	xdf_aread,
3434	xdf_awrite
3435};
3436
3437struct dev_ops xdf_devops = {
3438	DEVO_REV,		/* devo_rev */
3439	0,			/* devo_refcnt */
3440	xdf_getinfo,		/* devo_getinfo */
3441	nulldev,		/* devo_identify */
3442	nulldev,		/* devo_probe */
3443	xdf_attach,		/* devo_attach */
3444	xdf_detach,		/* devo_detach */
3445	nodev,			/* devo_reset */
3446	&xdf_cbops,		/* devo_cb_ops */
3447	NULL,			/* devo_bus_ops */
3448	NULL,			/* devo_power */
3449	ddi_quiesce_not_supported, /* devo_quiesce */
3450};
3451
3452/*
3453 * Module linkage structures.
3454 */
3455static struct modldrv modldrv = {
3456	&mod_driverops,		/* Type of module.  This one is a driver */
3457	"virtual block driver",	/* short description */
3458	&xdf_devops		/* driver specific ops */
3459};
3460
3461static struct modlinkage xdf_modlinkage = {
3462	MODREV_1, (void *)&modldrv, NULL
3463};
3464
3465/*
3466 * standard module entry points
3467 */
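/*
 * _init() creates the v_req_t and ge_slot_t kmem caches (and, in the HVM
 * case, the pathname-to-dip list) before installing the module; _fini()
 * tears them down again once mod_remove() has succeeded.
 */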
3468int
3469_init(void)
3470{
3471	int rc;
3472
3473	xdf_major = ddi_name_to_major("xdf");
3474	if (xdf_major == (major_t)-1)
3475		return (EINVAL);
3476
3477	if ((rc = ddi_soft_state_init(&xdf_ssp, sizeof (xdf_t), 0)) != 0)
3478		return (rc);
3479
3480	xdf_vreq_cache = kmem_cache_create("xdf_vreq_cache",
3481	    sizeof (v_req_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
3482	xdf_gs_cache = kmem_cache_create("xdf_gs_cache",
3483	    sizeof (ge_slot_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
3484
3485#if defined(XPV_HVM_DRIVER)
3486	xdf_hvm_init();
3487#endif /* XPV_HVM_DRIVER */
3488
3489	if ((rc = mod_install(&xdf_modlinkage)) != 0) {
3490#if defined(XPV_HVM_DRIVER)
3491		xdf_hvm_fini();
3492#endif /* XPV_HVM_DRIVER */
3493		kmem_cache_destroy(xdf_vreq_cache);
3494		kmem_cache_destroy(xdf_gs_cache);
3495		ddi_soft_state_fini(&xdf_ssp);
3496		return (rc);
3497	}
3498
3499	return (rc);
3500}
3501
3502int
3503_fini(void)
3504{
3505
3506	int err;
3507	if ((err = mod_remove(&xdf_modlinkage)) != 0)
3508		return (err);
3509
3510#if defined(XPV_HVM_DRIVER)
3511	xdf_hvm_fini();
3512#endif /* XPV_HVM_DRIVER */
3513
3514	kmem_cache_destroy(xdf_vreq_cache);
3515	kmem_cache_destroy(xdf_gs_cache);
3516	ddi_soft_state_fini(&xdf_ssp);
3517
3518	return (0);
3519}
3520
3521int
3522_info(struct modinfo *modinfop)
3523{
3524	return (mod_info(&xdf_modlinkage, modinfop));
3525}
3526