xdf.c revision 7656:2621e50fdf4a
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24 * Use is subject to license terms.
25 */
26
27/*
28 * xdf.c - Xen Virtual Block Device Driver
29 * TODO:
30 *	- support alternate block size (currently only DEV_BSIZE supported)
31 *	- revalidate geometry for removable devices
32 */
33
34#include <sys/ddi.h>
35#include <sys/sunddi.h>
36#include <sys/conf.h>
37#include <sys/cmlb.h>
38#include <sys/dkio.h>
39#include <sys/promif.h>
40#include <sys/sysmacros.h>
41#include <sys/kstat.h>
42#include <sys/mach_mmu.h>
43#ifdef XPV_HVM_DRIVER
44#include <sys/xpv_support.h>
45#include <sys/sunndi.h>
46#endif /* XPV_HVM_DRIVER */
47#include <public/io/xenbus.h>
48#include <xen/sys/xenbus_impl.h>
49#include <xen/sys/xendev.h>
50#include <sys/gnttab.h>
51#include <sys/scsi/generic/inquiry.h>
52#include <xen/io/blkif_impl.h>
53#include <io/xdf.h>
54
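/*
 * Cache-flush handling:  if the backend advertises "feature-barrier"
 * and also accepts the explicit BLKIF_OP_FLUSH_DISKCACHE operation,
 * flush requests are issued as zero-length writes (IS_FLUSH_DISKCACHE).
 * If the backend only supports barrier writes, a cached copy of block
 * xdf_flush_block is rewritten with BLKIF_OP_WRITE_BARRIER instead
 * (IS_WRITE_BARRIER), since a barrier write must carry real data.
 */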
55#define	FLUSH_DISKCACHE	0x1
56#define	WRITE_BARRIER	0x2
57#define	DEFAULT_FLUSH_BLOCK	156 /* block to write to cause a cache flush */
58#define	USE_WRITE_BARRIER(vdp)				\
59	((vdp)->xdf_feature_barrier && !(vdp)->xdf_flush_supported)
60#define	USE_FLUSH_DISKCACHE(vdp)			\
61	((vdp)->xdf_feature_barrier && (vdp)->xdf_flush_supported)
62#define	IS_WRITE_BARRIER(vdp, bp)			\
63	(!IS_READ(bp) && USE_WRITE_BARRIER(vdp) &&	\
64	((bp)->b_un.b_addr == (vdp)->xdf_cache_flush_block))
65#define	IS_FLUSH_DISKCACHE(bp)				\
66	(!IS_READ(bp) && USE_FLUSH_DISKCACHE(vdp) && ((bp)->b_bcount == 0))
67
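/*
 * Driver-wide state: the soft state anchor for per-instance xdf_t
 * structures, kmem caches for v_req_t and ge_slot_t allocations, and
 * the tunables xdf_maxphys, xdfdebug, xdf_flush_block and
 * xdf_barrier_flush_disable.
 */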
68static void *vbd_ss;
69static kmem_cache_t *xdf_vreq_cache;
70static kmem_cache_t *xdf_gs_cache;
71static int xdf_maxphys = XB_MAXPHYS;
72int xdfdebug = 0;
73extern int do_polled_io;
74diskaddr_t xdf_flush_block = DEFAULT_FLUSH_BLOCK;
75int	xdf_barrier_flush_disable = 0;
76
77/*
78 * dev_ops and cb_ops entrypoints
79 */
80static int xdf_getinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
81static int xdf_attach(dev_info_t *, ddi_attach_cmd_t);
82static int xdf_detach(dev_info_t *, ddi_detach_cmd_t);
83static int xdf_reset(dev_info_t *, ddi_reset_cmd_t);
84static int xdf_open(dev_t *, int, int, cred_t *);
85static int xdf_close(dev_t, int, int, struct cred *);
86static int xdf_strategy(struct buf *);
87static int xdf_read(dev_t, struct uio *, cred_t *);
88static int xdf_aread(dev_t, struct aio_req *, cred_t *);
89static int xdf_write(dev_t, struct uio *, cred_t *);
90static int xdf_awrite(dev_t, struct aio_req *, cred_t *);
91static int xdf_dump(dev_t, caddr_t, daddr_t, int);
92static int xdf_ioctl(dev_t, int, intptr_t, int, cred_t *, int *);
93static uint_t xdf_intr(caddr_t);
94static int xdf_prop_op(dev_t, dev_info_t *, ddi_prop_op_t, int, char *,
95    caddr_t, int *);
96
97/*
98 * misc private functions
99 */
100static int xdf_suspend(dev_info_t *);
101static int xdf_resume(dev_info_t *);
102static int xdf_start_connect(xdf_t *);
103static int xdf_start_disconnect(xdf_t *);
104static int xdf_post_connect(xdf_t *);
105static void xdf_post_disconnect(xdf_t *);
106static void xdf_oe_change(dev_info_t *, ddi_eventcookie_t, void *, void *);
107static void xdf_iostart(xdf_t *);
108static void xdf_iofini(xdf_t *, uint64_t, int);
109static int xdf_prepare_rreq(xdf_t *, struct buf *, blkif_request_t *);
110static int xdf_drain_io(xdf_t *);
111static boolean_t xdf_isopen(xdf_t *, int);
112static int xdf_check_state_transition(xdf_t *, XenbusState);
113static int xdf_connect(xdf_t *, boolean_t);
114static int xdf_dmacallback(caddr_t);
115static void xdf_timeout_handler(void *);
116static uint_t xdf_iorestart(caddr_t);
117static v_req_t *vreq_get(xdf_t *, buf_t *);
118static void vreq_free(xdf_t *, v_req_t *);
119static int vreq_setup(xdf_t *, v_req_t *);
120static ge_slot_t *gs_get(xdf_t *, int);
121static void gs_free(xdf_t *, ge_slot_t *);
122static grant_ref_t gs_grant(ge_slot_t *, mfn_t);
123static void unexpectedie(xdf_t *);
124static void xdfmin(struct buf *);
125static void xdf_synthetic_pgeom(dev_info_t *, cmlb_geom_t *);
126extern int xdf_kstat_create(dev_info_t *, char *, int);
127extern void xdf_kstat_delete(dev_info_t *);
128
129#if defined(XPV_HVM_DRIVER)
130static void xdf_hvm_add(dev_info_t *);
131static void xdf_hvm_rm(dev_info_t *);
132static void xdf_hvm_init(void);
133static void xdf_hvm_fini(void);
134#endif /* XPV_HVM_DRIVER */
135
136static 	struct cb_ops xdf_cbops = {
137	xdf_open,
138	xdf_close,
139	xdf_strategy,
140	nodev,
141	xdf_dump,
142	xdf_read,
143	xdf_write,
144	xdf_ioctl,
145	nodev,
146	nodev,
147	nodev,
148	nochpoll,
149	xdf_prop_op,
150	NULL,
151	D_MP | D_NEW | D_64BIT,
152	CB_REV,
153	xdf_aread,
154	xdf_awrite
155};
156
157struct dev_ops xdf_devops = {
158	DEVO_REV,		/* devo_rev */
159	0,			/* devo_refcnt */
160	xdf_getinfo,		/* devo_getinfo */
161	nulldev,		/* devo_identify */
162	nulldev,		/* devo_probe */
163	xdf_attach,		/* devo_attach */
164	xdf_detach,		/* devo_detach */
165	xdf_reset,		/* devo_reset */
166	&xdf_cbops,		/* devo_cb_ops */
167	(struct bus_ops *)NULL,	/* devo_bus_ops */
168	NULL,			/* devo_power */
169	ddi_quiesce_not_supported,	/* devo_quiesce */
170};
171
172static struct modldrv modldrv = {
173	&mod_driverops,		/* Type of module.  This one is a driver */
174	"virtual block driver",	/* short description */
175	&xdf_devops		/* driver specific ops */
176};
177
178static struct modlinkage xdf_modlinkage = {
179	MODREV_1, (void *)&modldrv, NULL
180};
181
182/*
183 * I/O buffer DMA attributes
184 * Make sure: one DMA window contains BLKIF_MAX_SEGMENTS_PER_REQUEST at most
185 */
186static ddi_dma_attr_t xb_dma_attr = {
187	DMA_ATTR_V0,
188	(uint64_t)0,			/* lowest address */
189	(uint64_t)0xffffffffffffffff,	/* highest usable address */
190	(uint64_t)0xffffff,		/* DMA counter limit max */
191	(uint64_t)XB_BSIZE,		/* alignment in bytes */
192	XB_BSIZE - 1,			/* bitmap of burst sizes */
193	XB_BSIZE,			/* min transfer */
194	(uint64_t)XB_MAX_XFER, 		/* maximum transfer */
195	(uint64_t)PAGEOFFSET,		/* 1 page segment length  */
196	BLKIF_MAX_SEGMENTS_PER_REQUEST,	/* maximum number of segments */
197	XB_BSIZE,			/* granularity */
198	0,				/* flags (reserved) */
199};
200
201static ddi_device_acc_attr_t xc_acc_attr = {
202	DDI_DEVICE_ATTR_V0,
203	DDI_NEVERSWAP_ACC,
204	DDI_STRICTORDER_ACC
205};
206
207/* callbacks from the common labeling module (cmlb) */
208
209int xdf_lb_rdwr(dev_info_t *, uchar_t, void *, diskaddr_t, size_t, void *);
210int xdf_lb_getinfo(dev_info_t *, int, void *, void *);
211
212static cmlb_tg_ops_t xdf_lb_ops = {
213	TG_DK_OPS_VERSION_1,
214	xdf_lb_rdwr,
215	xdf_lb_getinfo
216};
217
218int
219_init(void)
220{
221	int rc;
222
223	if ((rc = ddi_soft_state_init(&vbd_ss, sizeof (xdf_t), 0)) != 0)
224		return (rc);
225
226	xdf_vreq_cache = kmem_cache_create("xdf_vreq_cache",
227	    sizeof (v_req_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
228	xdf_gs_cache = kmem_cache_create("xdf_gs_cache",
229	    sizeof (ge_slot_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
230
231#if defined(XPV_HVM_DRIVER)
232	xdf_hvm_init();
233#endif /* XPV_HVM_DRIVER */
234
235	if ((rc = mod_install(&xdf_modlinkage)) != 0) {
236#if defined(XPV_HVM_DRIVER)
237		xdf_hvm_fini();
238#endif /* XPV_HVM_DRIVER */
239		kmem_cache_destroy(xdf_vreq_cache);
240		kmem_cache_destroy(xdf_gs_cache);
241		ddi_soft_state_fini(&vbd_ss);
242		return (rc);
243	}
244
245	return (rc);
246}
247
248int
249_fini(void)
250{
251
252	int err;
253	if ((err = mod_remove(&xdf_modlinkage)) != 0)
254		return (err);
255
256#if defined(XPV_HVM_DRIVER)
257	xdf_hvm_fini();
258#endif /* XPV_HVM_DRIVER */
259
260	kmem_cache_destroy(xdf_vreq_cache);
261	kmem_cache_destroy(xdf_gs_cache);
262	ddi_soft_state_fini(&vbd_ss);
263
264	return (0);
265}
266
267int
268_info(struct modinfo *modinfop)
269{
270	return (mod_info(&xdf_modlinkage, modinfop));
271}
272
273/*ARGSUSED*/
274static int
275xdf_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **rp)
276{
277	int instance;
278	xdf_t *vbdp;
279
280	instance = XDF_INST(getminor((dev_t)arg));
281
282	switch (cmd) {
283	case DDI_INFO_DEVT2DEVINFO:
284		if ((vbdp = ddi_get_soft_state(vbd_ss, instance)) == NULL) {
285			*rp = NULL;
286			return (DDI_FAILURE);
287		}
288		*rp = vbdp->xdf_dip;
289		return (DDI_SUCCESS);
290
291	case DDI_INFO_DEVT2INSTANCE:
292		*rp = (void *)(uintptr_t)instance;
293		return (DDI_SUCCESS);
294
295	default:
296		return (DDI_FAILURE);
297	}
298}
299
300static int
301xdf_prop_op(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op, int mod_flags,
302	char *name, caddr_t valuep, int *lengthp)
303{
304	xdf_t	*vdp;
305
306	if ((vdp = ddi_get_soft_state(vbd_ss, ddi_get_instance(dip))) == NULL)
307		return (ddi_prop_op(dev, dip, prop_op, mod_flags,
308		    name, valuep, lengthp));
309
310	return (cmlb_prop_op(vdp->xdf_vd_lbl,
311	    dev, dip, prop_op, mod_flags, name, valuep, lengthp,
312	    XDF_PART(getminor(dev)), NULL));
313}
314
315static int
316xdf_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
317{
318	xdf_t *vdp;
319	ddi_iblock_cookie_t softibc;
320	int instance;
321
322	xdfdebug = ddi_prop_get_int(DDI_DEV_T_ANY, devi, DDI_PROP_NOTPROM,
323	    "xdfdebug", 0);
324
325	switch (cmd) {
326		case DDI_ATTACH:
327			break;
328
329		case DDI_RESUME:
330			return (xdf_resume(devi));
331
332		default:
333			return (DDI_FAILURE);
334	}
335
336	instance = ddi_get_instance(devi);
337	if (ddi_soft_state_zalloc(vbd_ss, instance) != DDI_SUCCESS)
338		return (DDI_FAILURE);
339
340	DPRINTF(DDI_DBG, ("xdf%d: attaching\n", instance));
341	vdp = ddi_get_soft_state(vbd_ss, instance);
342	ddi_set_driver_private(devi, vdp);
343	vdp->xdf_dip = devi;
344	cv_init(&vdp->xdf_dev_cv, NULL, CV_DEFAULT, NULL);
345
346	if (ddi_get_iblock_cookie(devi, 0, &vdp->xdf_ibc) != DDI_SUCCESS) {
347		cmn_err(CE_WARN, "xdf@%s: failed to get iblock cookie",
348		    ddi_get_name_addr(devi));
349		goto errout0;
350	}
351	mutex_init(&vdp->xdf_dev_lk, NULL, MUTEX_DRIVER, (void *)vdp->xdf_ibc);
352	mutex_init(&vdp->xdf_cb_lk, NULL, MUTEX_DRIVER, (void *)vdp->xdf_ibc);
353	mutex_init(&vdp->xdf_iostat_lk, NULL, MUTEX_DRIVER,
354	    (void *)vdp->xdf_ibc);
355
356	if (ddi_get_soft_iblock_cookie(devi, DDI_SOFTINT_LOW, &softibc)
357	    != DDI_SUCCESS) {
358		cmn_err(CE_WARN, "xdf@%s: failed to get softintr iblock cookie",
359		    ddi_get_name_addr(devi));
360		goto errout0;
361	}
362	if (ddi_add_softintr(devi, DDI_SOFTINT_LOW, &vdp->xdf_softintr_id,
363	    &softibc, NULL, xdf_iorestart, (caddr_t)vdp) != DDI_SUCCESS) {
364		cmn_err(CE_WARN, "xdf@%s: failed to add softintr",
365		    ddi_get_name_addr(devi));
366		goto errout0;
367	}
368
369#if !defined(XPV_HVM_DRIVER)
370	/* create kstat for iostat(1M) */
371	if (xdf_kstat_create(devi, "xdf", instance) != 0) {
372		cmn_err(CE_WARN, "xdf@%s: failed to create kstat",
373		    ddi_get_name_addr(devi));
374		goto errout0;
375	}
376#endif /* !XPV_HVM_DRIVER */
377
378	/* driver handles kernel-issued IOCTLs */
379	if (ddi_prop_create(DDI_DEV_T_NONE, devi, DDI_PROP_CANSLEEP,
380	    DDI_KERNEL_IOCTL, NULL, 0) != DDI_PROP_SUCCESS) {
381		cmn_err(CE_WARN, "xdf@%s: cannot create DDI_KERNEL_IOCTL prop",
382		    ddi_get_name_addr(devi));
383		goto errout0;
384	}
385
386	/*
387	 * Initialize the physical geometry structure.  Note that currently
388	 * we don't know the size of the backend device so the number
389	 * of blocks on the device will be initialized to zero.  Once
390	 * we connect to the backend device we'll update the physical
391	 * geometry to reflect the real size of the device.
392	 */
393	xdf_synthetic_pgeom(devi, &vdp->xdf_pgeom);
394
395	/*
396	 * Create the default device minor nodes (non-removable disk).
397	 * We will adjust the minor nodes once connected to the backend.
398	 */
399	cmlb_alloc_handle(&vdp->xdf_vd_lbl);
400	if (cmlb_attach(devi, &xdf_lb_ops, DTYPE_DIRECT, 0, 1,
401	    DDI_NT_BLOCK_XVMD,
402#if defined(XPV_HVM_DRIVER)
403	    CMLB_CREATE_ALTSLICE_VTOC_16_DTYPE_DIRECT |
404	    CMLB_INTERNAL_MINOR_NODES,
405#else /* !XPV_HVM_DRIVER */
406	    CMLB_FAKE_LABEL_ONE_PARTITION,
407#endif /* !XPV_HVM_DRIVER */
408	    vdp->xdf_vd_lbl, NULL) != 0) {
409		cmn_err(CE_WARN, "xdf@%s: default cmlb attach failed",
410		    ddi_get_name_addr(devi));
411		goto errout0;
412	}
413
414	/*
415	 * We ship with cache-enabled disks
416	 */
417	vdp->xdf_wce = 1;
418
419	mutex_enter(&vdp->xdf_cb_lk);
420
421	/* Watch backend XenbusState change */
422	if (xvdi_add_event_handler(devi, XS_OE_STATE,
423	    xdf_oe_change) != DDI_SUCCESS) {
424		mutex_exit(&vdp->xdf_cb_lk);
425		goto errout0;
426	}
427
428	if (xdf_start_connect(vdp) != DDI_SUCCESS) {
429		cmn_err(CE_WARN, "xdf@%s: start connection failed",
430		    ddi_get_name_addr(devi));
431		(void) xdf_start_disconnect(vdp);
432		mutex_exit(&vdp->xdf_cb_lk);
433		goto errout1;
434	}
435
436	mutex_exit(&vdp->xdf_cb_lk);
437
438	list_create(&vdp->xdf_vreq_act, sizeof (v_req_t),
439	    offsetof(v_req_t, v_link));
440	list_create(&vdp->xdf_gs_act, sizeof (ge_slot_t),
441	    offsetof(ge_slot_t, link));
442
443#if defined(XPV_HVM_DRIVER)
444	xdf_hvm_add(devi);
445
446	(void) ddi_prop_update_int(DDI_DEV_T_NONE, devi, DDI_NO_AUTODETACH, 1);
447
448	/*
449	 * Report our version to dom0.
450	 */
451	if (xenbus_printf(XBT_NULL, "hvmpv/xdf", "version", "%d",
452	    HVMPV_XDF_VERS))
453		cmn_err(CE_WARN, "xdf: couldn't write version\n");
454#endif /* XPV_HVM_DRIVER */
455
456	ddi_report_dev(devi);
457
458	DPRINTF(DDI_DBG, ("xdf%d: attached\n", instance));
459
460	return (DDI_SUCCESS);
461
462errout1:
463	xvdi_remove_event_handler(devi, XS_OE_STATE);
464errout0:
465	if (vdp->xdf_vd_lbl != NULL) {
466		cmlb_detach(vdp->xdf_vd_lbl, NULL);
467		cmlb_free_handle(&vdp->xdf_vd_lbl);
468		vdp->xdf_vd_lbl = NULL;
469	}
470#if !defined(XPV_HVM_DRIVER)
471	xdf_kstat_delete(devi);
472#endif /* !XPV_HVM_DRIVER */
473	if (vdp->xdf_softintr_id != NULL)
474		ddi_remove_softintr(vdp->xdf_softintr_id);
475	if (vdp->xdf_ibc != NULL) {
476		mutex_destroy(&vdp->xdf_cb_lk);
477		mutex_destroy(&vdp->xdf_dev_lk);
478	}
479	cv_destroy(&vdp->xdf_dev_cv);
480	ddi_soft_state_free(vbd_ss, instance);
481	ddi_set_driver_private(devi, NULL);
482	ddi_prop_remove_all(devi);
483	cmn_err(CE_WARN, "xdf@%s: attach failed", ddi_get_name_addr(devi));
484	return (DDI_FAILURE);
485}
486
487static int
488xdf_detach(dev_info_t *devi, ddi_detach_cmd_t cmd)
489{
490	xdf_t *vdp;
491	int instance;
492
493	switch (cmd) {
494
495	case DDI_PM_SUSPEND:
496		break;
497
498	case DDI_SUSPEND:
499		return (xdf_suspend(devi));
500
501	case DDI_DETACH:
502		break;
503
504	default:
505		return (DDI_FAILURE);
506	}
507
508	instance = ddi_get_instance(devi);
509	DPRINTF(DDI_DBG, ("xdf%d: detaching\n", instance));
510	vdp = ddi_get_soft_state(vbd_ss, instance);
511
512	if (vdp == NULL)
513		return (DDI_FAILURE);
514
515	mutex_enter(&vdp->xdf_dev_lk);
516	if (xdf_isopen(vdp, -1)) {
517		mutex_exit(&vdp->xdf_dev_lk);
518		return (DDI_FAILURE);
519	}
520
521	if (vdp->xdf_status != XD_CLOSED) {
522		mutex_exit(&vdp->xdf_dev_lk);
523		return (DDI_FAILURE);
524	}
525
526#if defined(XPV_HVM_DRIVER)
527	xdf_hvm_rm(devi);
528#endif /* XPV_HVM_DRIVER */
529
530	ASSERT(!ISDMACBON(vdp));
531	mutex_exit(&vdp->xdf_dev_lk);
532
533	if (vdp->xdf_timeout_id != 0)
534		(void) untimeout(vdp->xdf_timeout_id);
535
536	xvdi_remove_event_handler(devi, XS_OE_STATE);
537
538	/* we'll support backend running in domU later */
539#ifdef	DOMU_BACKEND
540	(void) xvdi_post_event(devi, XEN_HP_REMOVE);
541#endif
542
543	list_destroy(&vdp->xdf_vreq_act);
544	list_destroy(&vdp->xdf_gs_act);
545	ddi_prop_remove_all(devi);
546	xdf_kstat_delete(devi);
547	ddi_remove_softintr(vdp->xdf_softintr_id);
548	ddi_set_driver_private(devi, NULL);
549	cv_destroy(&vdp->xdf_dev_cv);
550	mutex_destroy(&vdp->xdf_cb_lk);
551	mutex_destroy(&vdp->xdf_dev_lk);
552	if (vdp->xdf_cache_flush_block != NULL)
553		kmem_free(vdp->xdf_flush_mem, 2 * DEV_BSIZE);
554	ddi_soft_state_free(vbd_ss, instance);
555	return (DDI_SUCCESS);
556}
557
558static int
559xdf_suspend(dev_info_t *devi)
560{
561	xdf_t *vdp;
562	int instance;
563	enum xdf_state st;
564
565	instance = ddi_get_instance(devi);
566
567	if (xdfdebug & SUSRES_DBG)
568		xen_printf("xdf_suspend: xdf#%d\n", instance);
569
570	if ((vdp = ddi_get_soft_state(vbd_ss, instance)) == NULL)
571		return (DDI_FAILURE);
572
573	xvdi_suspend(devi);
574
575	mutex_enter(&vdp->xdf_cb_lk);
576	mutex_enter(&vdp->xdf_dev_lk);
577	st = vdp->xdf_status;
578	/* change status to stop further I/O requests */
579	if (st == XD_READY)
580		vdp->xdf_status = XD_SUSPEND;
581	mutex_exit(&vdp->xdf_dev_lk);
582	mutex_exit(&vdp->xdf_cb_lk);
583
584	/* make sure no more I/O responses left in the ring buffer */
585	if ((st == XD_INIT) || (st == XD_READY)) {
586#ifdef XPV_HVM_DRIVER
587		ec_unbind_evtchn(vdp->xdf_evtchn);
588		xvdi_free_evtchn(devi);
589#else /* !XPV_HVM_DRIVER */
590		(void) ddi_remove_intr(devi, 0, NULL);
591#endif /* !XPV_HVM_DRIVER */
592		(void) xdf_drain_io(vdp);
593		/*
594		 * No need to tear down the ring buffer here; it will
595		 * simply be re-initialized during resume when we call
596		 * xvdi_alloc_ring().
597		 */
598	}
599
600	if (xdfdebug & SUSRES_DBG)
601		xen_printf("xdf_suspend: SUCCESS\n");
602
603	return (DDI_SUCCESS);
604}
605
606/*ARGSUSED*/
607static int
608xdf_resume(dev_info_t *devi)
609{
610	xdf_t *vdp;
611	int instance;
612
613	instance = ddi_get_instance(devi);
614	if (xdfdebug & SUSRES_DBG)
615		xen_printf("xdf_resume: xdf%d\n", instance);
616
617	if ((vdp = ddi_get_soft_state(vbd_ss, instance)) == NULL)
618		return (DDI_FAILURE);
619
620	mutex_enter(&vdp->xdf_cb_lk);
621
622	if (xvdi_resume(devi) != DDI_SUCCESS) {
623		mutex_exit(&vdp->xdf_cb_lk);
624		return (DDI_FAILURE);
625	}
626
627	mutex_enter(&vdp->xdf_dev_lk);
628	ASSERT(vdp->xdf_status != XD_READY);
629	vdp->xdf_status = XD_UNKNOWN;
630	mutex_exit(&vdp->xdf_dev_lk);
631
632	if (xdf_start_connect(vdp) != DDI_SUCCESS) {
633		mutex_exit(&vdp->xdf_cb_lk);
634		return (DDI_FAILURE);
635	}
636
637	mutex_exit(&vdp->xdf_cb_lk);
638
639	if (xdfdebug & SUSRES_DBG)
640		xen_printf("xdf_resume: done\n");
641	return (DDI_SUCCESS);
642}
643
644/*ARGSUSED*/
645static int
646xdf_reset(dev_info_t *devi, ddi_reset_cmd_t cmd)
647{
648	xdf_t *vdp;
649	int instance;
650
651	instance = ddi_get_instance(devi);
652	DPRINTF(DDI_DBG, ("xdf%d: resetting\n", instance));
653	if ((vdp = ddi_get_soft_state(vbd_ss, instance)) == NULL)
654		return (DDI_FAILURE);
655
656	/*
657	 * wait for any outstanding I/O to complete
658	 */
659	(void) xdf_drain_io(vdp);
660
661	DPRINTF(DDI_DBG, ("xdf%d: reset complete\n", instance));
662	return (DDI_SUCCESS);
663}
664
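/*
 * Open the device.  Unless FNDELAY/FNONBLOCK was given, block until the
 * backend connection reaches XD_READY.  Enforce read-only and exclusive
 * open semantics, track opens per otyp (with a reference count for
 * layered opens), and verify that the selected partition has a non-zero
 * size (except for CD/DVD media).  The first open also forces cmlb to
 * re-validate the label.
 */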
665static int
666xdf_open(dev_t *devp, int flag, int otyp, cred_t *credp)
667{
668	minor_t	minor;
669	xdf_t	*vdp;
670	int part;
671	ulong_t parbit;
672	diskaddr_t p_blkct = 0;
673	boolean_t firstopen;
674	boolean_t nodelay;
675
676	minor = getminor(*devp);
677	if ((vdp = ddi_get_soft_state(vbd_ss, XDF_INST(minor))) == NULL)
678		return (ENXIO);
679
680	nodelay = (flag & (FNDELAY | FNONBLOCK));
681
682	DPRINTF(DDI_DBG, ("xdf%d: opening\n", XDF_INST(minor)));
683
684	/* do cv_wait until connected or failed */
685	mutex_enter(&vdp->xdf_dev_lk);
686	if (!nodelay && (xdf_connect(vdp, B_TRUE) != XD_READY)) {
687		mutex_exit(&vdp->xdf_dev_lk);
688		return (ENXIO);
689	}
690
691	if ((flag & FWRITE) && XD_IS_RO(vdp)) {
692		mutex_exit(&vdp->xdf_dev_lk);
693		return (EROFS);
694	}
695
696	part = XDF_PART(minor);
697	parbit = 1 << part;
698	if ((vdp->xdf_vd_exclopen & parbit) ||
699	    ((flag & FEXCL) && xdf_isopen(vdp, part))) {
700		mutex_exit(&vdp->xdf_dev_lk);
701		return (EBUSY);
702	}
703
704	/* are we the first one to open this node? */
705	firstopen = !xdf_isopen(vdp, -1);
706
707	if (otyp == OTYP_LYR)
708		vdp->xdf_vd_lyropen[part]++;
709
710	vdp->xdf_vd_open[otyp] |= parbit;
711
712	if (flag & FEXCL)
713		vdp->xdf_vd_exclopen |= parbit;
714
715	mutex_exit(&vdp->xdf_dev_lk);
716
717	/* force a re-validation */
718	if (firstopen)
719		cmlb_invalidate(vdp->xdf_vd_lbl, NULL);
720
721	/*
722	 * check size
723	 * Check the partition size.  Ignore CD/DVD devices, which may
724	 * contain a zero-sized s0.
725	if (!nodelay && !XD_IS_CD(vdp) &&
726	    ((cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkct,
727	    NULL, NULL, NULL, NULL) != 0) || (p_blkct == 0))) {
728		(void) xdf_close(*devp, flag, otyp, credp);
729		return (ENXIO);
730	}
731
732	return (0);
733}
734
735/*ARGSUSED*/
736static int
737xdf_close(dev_t dev, int flag, int otyp, struct cred *credp)
738{
739	minor_t	minor;
740	xdf_t	*vdp;
741	int part;
742	ulong_t parbit;
743
744	minor = getminor(dev);
745	if ((vdp = ddi_get_soft_state(vbd_ss, XDF_INST(minor))) == NULL)
746		return (ENXIO);
747
748	mutex_enter(&vdp->xdf_dev_lk);
749	part = XDF_PART(minor);
750	if (!xdf_isopen(vdp, part)) {
751		mutex_exit(&vdp->xdf_dev_lk);
752		return (ENXIO);
753	}
754	parbit = 1 << part;
755
756	ASSERT((vdp->xdf_vd_open[otyp] & parbit) != 0);
757	if (otyp == OTYP_LYR) {
758		ASSERT(vdp->xdf_vd_lyropen[part] > 0);
759		if (--vdp->xdf_vd_lyropen[part] == 0)
760			vdp->xdf_vd_open[otyp] &= ~parbit;
761	} else {
762		vdp->xdf_vd_open[otyp] &= ~parbit;
763	}
764	vdp->xdf_vd_exclopen &= ~parbit;
765
766	mutex_exit(&vdp->xdf_dev_lk);
767	return (0);
768}
769
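/*
 * Main I/O entry point.  Validate the request against the partition (or
 * against the whole disk for XB_SLICE_NONE bufs), trim transfers that
 * run past the end, then queue the buf on the active list (xdf_f_act /
 * xdf_l_act) and call xdf_iostart() to hand it to the backend.  For
 * polled I/O the queue is drained synchronously via xdf_drain_io().
 */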
770static int
771xdf_strategy(struct buf *bp)
772{
773	xdf_t	*vdp;
774	minor_t minor;
775	diskaddr_t p_blkct, p_blkst;
776	ulong_t nblks;
777	int part;
778
779	minor = getminor(bp->b_edev);
780	part = XDF_PART(minor);
781
782	vdp = ddi_get_soft_state(vbd_ss, XDF_INST(minor));
783	if ((vdp == NULL) || !xdf_isopen(vdp, part)) {
784		bioerror(bp, ENXIO);
785		bp->b_resid = bp->b_bcount;
786		biodone(bp);
787		return (0);
788	}
789
790	/* Check for writes to a read only device */
791	if (!IS_READ(bp) && XD_IS_RO(vdp)) {
792		bioerror(bp, EROFS);
793		bp->b_resid = bp->b_bcount;
794		biodone(bp);
795		return (0);
796	}
797
798	/* Check if this I/O is accessing a partition or the entire disk */
799	if ((long)bp->b_private == XB_SLICE_NONE) {
800		/* This I/O is using an absolute offset */
801		p_blkct = vdp->xdf_xdev_nblocks;
802		p_blkst = 0;
803	} else {
804		/* This I/O is using a partition relative offset */
805		if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkct,
806		    &p_blkst, NULL, NULL, NULL)) {
807			bioerror(bp, ENXIO);
808			bp->b_resid = bp->b_bcount;
809			biodone(bp);
810			return (0);
811		}
812	}
813
814	/* check for a starting block beyond the disk or partition limit */
815	if (bp->b_blkno > p_blkct) {
816		DPRINTF(IO_DBG, ("xdf: block %lld exceeds VBD size %"PRIu64,
817		    (longlong_t)bp->b_blkno, (uint64_t)p_blkct));
818		bioerror(bp, EINVAL);
819		bp->b_resid = bp->b_bcount;
820		biodone(bp);
821		return (0);
822	}
823
824	/* Legacy: don't set the error flag in this case */
825	if (bp->b_blkno == p_blkct) {
826		bp->b_resid = bp->b_bcount;
827		biodone(bp);
828		return (0);
829	}
830
831	/* Adjust for partial transfer */
832	nblks = bp->b_bcount >> XB_BSHIFT;
833	if ((bp->b_blkno + nblks) > p_blkct) {
834		bp->b_resid = ((bp->b_blkno + nblks) - p_blkct) << XB_BSHIFT;
835		bp->b_bcount -= bp->b_resid;
836	}
837
838	DPRINTF(IO_DBG, ("xdf: strategy blk %lld len %lu\n",
839	    (longlong_t)bp->b_blkno, (ulong_t)bp->b_bcount));
840
841	/* Fix up the buf struct */
842	bp->b_flags |= B_BUSY;
843	bp->av_forw = bp->av_back = NULL; /* not tagged with a v_req */
844	bp->b_private = (void *)(uintptr_t)p_blkst;
845
846	mutex_enter(&vdp->xdf_dev_lk);
847	if (vdp->xdf_xdev_iostat != NULL)
848		kstat_waitq_enter(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
849	if (vdp->xdf_f_act == NULL) {
850		vdp->xdf_f_act = vdp->xdf_l_act = bp;
851	} else {
852		vdp->xdf_l_act->av_forw = bp;
853		vdp->xdf_l_act = bp;
854	}
855	mutex_exit(&vdp->xdf_dev_lk);
856
857	xdf_iostart(vdp);
858	if (do_polled_io)
859		(void) xdf_drain_io(vdp);
860	return (0);
861}
862
863/*ARGSUSED*/
864static int
865xdf_read(dev_t dev, struct uio *uiop, cred_t *credp)
866{
867
868	xdf_t	*vdp;
869	minor_t minor;
870	diskaddr_t p_blkcnt;
871	int part;
872
873	minor = getminor(dev);
874	if ((vdp = ddi_get_soft_state(vbd_ss, XDF_INST(minor))) == NULL)
875		return (ENXIO);
876
877	DPRINTF(IO_DBG, ("xdf: read offset 0x%"PRIx64"\n",
878	    (int64_t)uiop->uio_offset));
879
880	part = XDF_PART(minor);
881	if (!xdf_isopen(vdp, part))
882		return (ENXIO);
883
884	if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkcnt,
885	    NULL, NULL, NULL, NULL))
886		return (ENXIO);
887
888	if (U_INVAL(uiop))
889		return (EINVAL);
890
891	return (physio(xdf_strategy, NULL, dev, B_READ, xdfmin, uiop));
892}
893
894/*ARGSUSED*/
895static int
896xdf_write(dev_t dev, struct uio *uiop, cred_t *credp)
897{
898	xdf_t *vdp;
899	minor_t minor;
900	diskaddr_t p_blkcnt;
901	int part;
902
903	minor = getminor(dev);
904	if ((vdp = ddi_get_soft_state(vbd_ss, XDF_INST(minor))) == NULL)
905		return (ENXIO);
906
907	DPRINTF(IO_DBG, ("xdf: write offset 0x%"PRIx64"\n",
908	    (int64_t)uiop->uio_offset));
909
910	part = XDF_PART(minor);
911	if (!xdf_isopen(vdp, part))
912		return (ENXIO);
913
914	if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkcnt,
915	    NULL, NULL, NULL, NULL))
916		return (ENXIO);
917
918	if (uiop->uio_loffset >= XB_DTOB(p_blkcnt))
919		return (ENOSPC);
920
921	if (U_INVAL(uiop))
922		return (EINVAL);
923
924	return (physio(xdf_strategy, NULL, dev, B_WRITE, minphys, uiop));
925}
926
927/*ARGSUSED*/
928static int
929xdf_aread(dev_t dev, struct aio_req *aiop, cred_t *credp)
930{
931	xdf_t	*vdp;
932	minor_t minor;
933	struct uio *uiop = aiop->aio_uio;
934	diskaddr_t p_blkcnt;
935	int part;
936
937	minor = getminor(dev);
938	if ((vdp = ddi_get_soft_state(vbd_ss, XDF_INST(minor))) == NULL)
939		return (ENXIO);
940
941	part = XDF_PART(minor);
942	if (!xdf_isopen(vdp, part))
943		return (ENXIO);
944
945	if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkcnt,
946	    NULL, NULL, NULL, NULL))
947		return (ENXIO);
948
949	if (uiop->uio_loffset >= XB_DTOB(p_blkcnt))
950		return (ENOSPC);
951
952	if (U_INVAL(uiop))
953		return (EINVAL);
954
955	return (aphysio(xdf_strategy, anocancel, dev, B_READ, minphys, aiop));
956}
957
958/*ARGSUSED*/
959static int
960xdf_awrite(dev_t dev, struct aio_req *aiop, cred_t *credp)
961{
962	xdf_t *vdp;
963	minor_t minor;
964	struct uio *uiop = aiop->aio_uio;
965	diskaddr_t p_blkcnt;
966	int part;
967
968	minor = getminor(dev);
969	if ((vdp = ddi_get_soft_state(vbd_ss, XDF_INST(minor))) == NULL)
970		return (ENXIO);
971
972	part = XDF_PART(minor);
973	if (!xdf_isopen(vdp, part))
974		return (ENXIO);
975
976	if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkcnt,
977	    NULL, NULL, NULL, NULL))
978		return (ENXIO);
979
980	if (uiop->uio_loffset >= XB_DTOB(p_blkcnt))
981		return (ENOSPC);
982
983	if (U_INVAL(uiop))
984		return (EINVAL);
985
986	return (aphysio(xdf_strategy, anocancel, dev, B_WRITE, minphys, aiop));
987}
988
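/*
 * Dump entry point, used for writing kernel crash dumps.  Build a buf
 * on the stack, queue it like a normal request and poll for completion
 * with xdf_drain_io().
 */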
989static int
990xdf_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk)
991{
992	struct buf dumpbuf, *dbp;
993	xdf_t	*vdp;
994	minor_t minor;
995	int err = 0;
996	int part;
997	diskaddr_t p_blkcnt, p_blkst;
998
999	minor = getminor(dev);
1000	if ((vdp = ddi_get_soft_state(vbd_ss, XDF_INST(minor))) == NULL)
1001		return (ENXIO);
1002
1003	DPRINTF(IO_DBG, ("xdf: dump addr (0x%p) blk (%ld) nblks (%d)\n",
1004	    (void *)addr, blkno, nblk));
1005
1006	part = XDF_PART(minor);
1007	if (!xdf_isopen(vdp, part))
1008		return (ENXIO);
1009
1010	if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkcnt, &p_blkst,
1011	    NULL, NULL, NULL))
1012		return (ENXIO);
1013
1014	if ((blkno + nblk) > p_blkcnt) {
1015		cmn_err(CE_WARN, "xdf: block %ld exceeds VBD size %"PRIu64,
1016		    blkno + nblk, (uint64_t)p_blkcnt);
1017		return (EINVAL);
1018	}
1019
1020	dbp = &dumpbuf;
1021	bioinit(dbp);
1022	dbp->b_flags = B_BUSY;
1023	dbp->b_un.b_addr = addr;
1024	dbp->b_bcount = nblk << DEV_BSHIFT;
1025	dbp->b_blkno = blkno;
1026	dbp->b_edev = dev;
1027	dbp->b_private = (void *)(uintptr_t)p_blkst;
1028
1029	mutex_enter(&vdp->xdf_dev_lk);
1030	if (vdp->xdf_xdev_iostat != NULL)
1031		kstat_waitq_enter(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
1032	if (vdp->xdf_f_act == NULL) {
1033		vdp->xdf_f_act = vdp->xdf_l_act = dbp;
1034	} else {
1035		vdp->xdf_l_act->av_forw = dbp;
1036		vdp->xdf_l_act = dbp;
1037	}
1038	dbp->av_forw = NULL;
1039	dbp->av_back = NULL;
1040	mutex_exit(&vdp->xdf_dev_lk);
1041	xdf_iostart(vdp);
1042	err = xdf_drain_io(vdp);
1043	biofini(dbp);
1044	return (err);
1045}
1046
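/*
 * ioctl handler.  Media and controller queries are answered locally,
 * label/geometry/EFI ioctls are passed through to cmlb_ioctl(), and
 * DKIOCFLUSHWRITECACHE is translated into either an explicit flush
 * operation or a barrier write of the cached flush block, depending on
 * what the backend supports.
 */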
1047/*ARGSUSED*/
1048static int
1049xdf_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
1050    int *rvalp)
1051{
1052	int instance;
1053	xdf_t	*vdp;
1054	minor_t minor;
1055	int part;
1056
1057	minor = getminor(dev);
1058	instance = XDF_INST(minor);
1059
1060	if ((vdp = ddi_get_soft_state(vbd_ss, instance)) == NULL)
1061		return (ENXIO);
1062
1063	DPRINTF(IOCTL_DBG, ("xdf%d:ioctl: cmd %d (0x%x)\n",
1064	    instance, cmd, cmd));
1065
1066	part = XDF_PART(minor);
1067	if (!xdf_isopen(vdp, part))
1068		return (ENXIO);
1069
1070	switch (cmd) {
1071	case DKIOCGMEDIAINFO: {
1072		struct dk_minfo	media_info;
1073
1074		media_info.dki_lbsize = DEV_BSIZE;
1075		media_info.dki_capacity = vdp->xdf_pgeom.g_capacity;
1076		media_info.dki_media_type = DK_FIXED_DISK;
1077
1078		if (ddi_copyout(&media_info, (void *)arg,
1079		    sizeof (struct dk_minfo), mode)) {
1080			return (EFAULT);
1081		} else {
1082			return (0);
1083		}
1084	}
1085
1086	case DKIOCINFO: {
1087		struct dk_cinfo info;
1088
1089		/* controller information */
1090		if (XD_IS_CD(vdp))
1091			info.dki_ctype = DKC_CDROM;
1092		else
1093			info.dki_ctype = DKC_VBD;
1094
1095		info.dki_cnum = 0;
1096		(void) strncpy((char *)(&info.dki_cname), "xdf", 8);
1097
1098		/* unit information */
1099		info.dki_unit = ddi_get_instance(vdp->xdf_dip);
1100		(void) strncpy((char *)(&info.dki_dname), "xdf", 8);
1101		info.dki_flags = DKI_FMTVOL;
1102		info.dki_partition = part;
1103		info.dki_maxtransfer = maxphys / DEV_BSIZE;
1104		info.dki_addr = 0;
1105		info.dki_space = 0;
1106		info.dki_prio = 0;
1107		info.dki_vec = 0;
1108
1109		if (ddi_copyout(&info, (void *)arg, sizeof (info), mode))
1110			return (EFAULT);
1111		else
1112			return (0);
1113	}
1114
1115	case DKIOCSTATE: {
1116		enum dkio_state	dkstate = DKIO_INSERTED;
1117		if (ddi_copyout(&dkstate, (void *)arg, sizeof (dkstate),
1118		    mode) != 0)
1119			return (EFAULT);
1120		return (0);
1121	}
1122
1123	/*
1124	 * is media removable?
1125	 */
1126	case DKIOCREMOVABLE: {
1127		int i = XD_IS_RM(vdp) ? 1 : 0;
1128		if (ddi_copyout(&i, (caddr_t)arg, sizeof (int), mode))
1129			return (EFAULT);
1130		return (0);
1131	}
1132
1133	case DKIOCG_PHYGEOM:
1134	case DKIOCG_VIRTGEOM:
1135	case DKIOCGGEOM:
1136	case DKIOCSGEOM:
1137	case DKIOCGAPART:
1138	case DKIOCSAPART:
1139	case DKIOCGVTOC:
1140	case DKIOCSVTOC:
1141	case DKIOCPARTINFO:
1142	case DKIOCGEXTVTOC:
1143	case DKIOCSEXTVTOC:
1144	case DKIOCEXTPARTINFO:
1145	case DKIOCGMBOOT:
1146	case DKIOCSMBOOT:
1147	case DKIOCGETEFI:
1148	case DKIOCSETEFI:
1149	case DKIOCPARTITION: {
1150		int rc;
1151
1152		rc = cmlb_ioctl(vdp->xdf_vd_lbl, dev, cmd, arg, mode, credp,
1153		    rvalp, NULL);
1154		return (rc);
1155	}
1156
1157	case DKIOCGETWCE:
1158		if (ddi_copyout(&vdp->xdf_wce, (void *)arg,
1159		    sizeof (vdp->xdf_wce), mode))
1160			return (EFAULT);
1161		return (0);
1162	case DKIOCSETWCE:
1163		if (ddi_copyin((void *)arg, &vdp->xdf_wce,
1164		    sizeof (vdp->xdf_wce), mode))
1165			return (EFAULT);
1166		return (0);
1167	case DKIOCFLUSHWRITECACHE: {
1168		int rc;
1169		struct dk_callback *dkc = (struct dk_callback *)arg;
1170
1171		if (vdp->xdf_flush_supported) {
1172			rc = xdf_lb_rdwr(vdp->xdf_dip, TG_WRITE,
1173			    NULL, 0, 0, (void *)dev);
1174		} else if (vdp->xdf_feature_barrier &&
1175		    !xdf_barrier_flush_disable) {
1176			rc = xdf_lb_rdwr(vdp->xdf_dip, TG_WRITE,
1177			    vdp->xdf_cache_flush_block, xdf_flush_block,
1178			    DEV_BSIZE, (void *)dev);
1179		} else {
1180			return (ENOTTY);
1181		}
1182		if ((mode & FKIOCTL) && (dkc != NULL) &&
1183		    (dkc->dkc_callback != NULL)) {
1184			(*dkc->dkc_callback)(dkc->dkc_cookie, rc);
1185			/* need to return 0 after calling callback */
1186			rc = 0;
1187		}
1188		return (rc);
1189	}
1190
1191	default:
1192		return (ENOTTY);
1193	}
1194}
1195
1196/*
1197 * xdf interrupt handler
1198 */
1199static uint_t
1200xdf_intr(caddr_t arg)
1201{
1202	xdf_t *vdp = (xdf_t *)arg;
1203	xendev_ring_t *xbr;
1204	blkif_response_t *resp;
1205	int bioerr;
1206	uint64_t id;
1207	extern int do_polled_io;
1208	uint8_t op;
1209	uint16_t status;
1210	ddi_acc_handle_t acchdl;
1211
1212	mutex_enter(&vdp->xdf_dev_lk);
1213
1214	if ((xbr = vdp->xdf_xb_ring) == NULL) {
1215		mutex_exit(&vdp->xdf_dev_lk);
1216		return (DDI_INTR_UNCLAIMED);
1217	}
1218
1219	acchdl = vdp->xdf_xb_ring_hdl;
1220
1221	/*
1222	 * complete all requests which have a response
1223	 */
1224	while (resp = xvdi_ring_get_response(xbr)) {
1225		id = ddi_get64(acchdl, &resp->id);
1226		op = ddi_get8(acchdl, &resp->operation);
1227		status = ddi_get16(acchdl, (uint16_t *)&resp->status);
1228		DPRINTF(INTR_DBG, ("resp: op %d id %"PRIu64" status %d\n",
1229		    op, id, status));
1230
1231		/*
1232		 * XXPV - close connection to the backend and restart
1233		 */
1234		if (status != BLKIF_RSP_OKAY) {
1235			DPRINTF(IO_DBG, ("xdf@%s: I/O error while %s",
1236			    ddi_get_name_addr(vdp->xdf_dip),
1237			    (op == BLKIF_OP_READ) ? "reading" : "writing"));
1238			bioerr = EIO;
1239		} else {
1240			bioerr = 0;
1241		}
1242
1243		xdf_iofini(vdp, id, bioerr);
1244	}
1245
1246	mutex_exit(&vdp->xdf_dev_lk);
1247
1248	if (!do_polled_io)
1249		xdf_iostart(vdp);
1250
1251	return (DDI_INTR_CLAIMED);
1252}
1253
1254int xdf_fbrewrites;	/* how many times was our flush block rewritten */
1255
1256/*
1257 * Snarf new data if our flush block was re-written
1258 */
1259static void
1260check_fbwrite(xdf_t *vdp, buf_t *bp, daddr_t blkno)
1261{
1262	int nblks;
1263	boolean_t mapin;
1264
1265	if (IS_WRITE_BARRIER(vdp, bp))
1266		return; /* write was a flush write */
1267
1268	mapin = B_FALSE;
1269	nblks = bp->b_bcount >> DEV_BSHIFT;
1270	if (xdf_flush_block >= blkno && xdf_flush_block < (blkno + nblks)) {
1271		xdf_fbrewrites++;
1272		if (bp->b_flags & (B_PAGEIO | B_PHYS)) {
1273			mapin = B_TRUE;
1274			bp_mapin(bp);
1275		}
1276		bcopy(bp->b_un.b_addr +
1277		    ((xdf_flush_block - blkno) << DEV_BSHIFT),
1278		    vdp->xdf_cache_flush_block, DEV_BSIZE);
1279		if (mapin)
1280			bp_mapout(bp);
1281	}
1282}
1283
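/*
 * Per-slot I/O completion, called for each ring response: free the
 * grant-table slot and, once the last outstanding slot of the v_req has
 * completed, update the I/O kstats, free the v_req and biodone() the
 * buf.
 */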
1284static void
1285xdf_iofini(xdf_t *vdp, uint64_t id, int bioerr)
1286{
1287	ge_slot_t *gs = (ge_slot_t *)(uintptr_t)id;
1288	v_req_t *vreq = gs->vreq;
1289	buf_t *bp = vreq->v_buf;
1290
1291	gs_free(vdp, gs);
1292	if (bioerr)
1293		bioerror(bp, bioerr);
1294	vreq->v_nslots--;
1295	if (vreq->v_nslots != 0)
1296		return;
1297
1298	XDF_UPDATE_IO_STAT(vdp, bp);
1299	if (vdp->xdf_xdev_iostat != NULL)
1300		kstat_runq_exit(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
1301
1302	if (IS_ERROR(bp))
1303		bp->b_resid = bp->b_bcount;
1304
1305	vreq_free(vdp, vreq);
1306	biodone(bp);
1307}
1308
1309/*
1310 * return value of xdf_prepare_rreq()
1311 * used in xdf_iostart()
1312 */
1313#define	XF_PARTIAL	0 /* rreq is full, not all I/O in buf transferred */
1314#define	XF_COMP		1 /* no more I/O left in buf */
1315
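/*
 * Push queued bufs to the backend: for each buf on the active list,
 * allocate a v_req and its DMA/grant resources, fill in ring requests
 * until the ring is full or the queue is empty, then push the requests
 * and notify the backend.
 */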
1316static void
1317xdf_iostart(xdf_t *vdp)
1318{
1319	xendev_ring_t *xbr;
1320	struct buf *bp;
1321	blkif_request_t *rreq;
1322	int retval;
1323	int rreqready = 0;
1324
1325	xbr = vdp->xdf_xb_ring;
1326
1327	/*
1328	 * populate the ring request(s)
1329	 *
1330	 * loop until there is no buf to transfer or no free slot
1331	 * available in I/O ring
1332	 */
1333	mutex_enter(&vdp->xdf_dev_lk);
1334
1335	for (;;) {
1336		if (vdp->xdf_status != XD_READY)
1337			break;
1338
1339		/* active buf queue empty? */
1340		if ((bp = vdp->xdf_f_act) == NULL)
1341			break;
1342
1343		/* try to grab a vreq for this bp */
1344		if ((BP2VREQ(bp) == NULL) && (vreq_get(vdp, bp) == NULL))
1345				break;
1346		/* alloc DMA/GTE resources */
1347		if (vreq_setup(vdp, BP2VREQ(bp)) != DDI_SUCCESS)
1348			break;
1349
1350		/* get next blkif_request in the ring */
1351		if ((rreq = xvdi_ring_get_request(xbr)) == NULL)
1352			break;
1353		bzero(rreq, sizeof (blkif_request_t));
1354
1355		/* populate blkif_request with this buf */
1356		rreqready++;
1357		retval = xdf_prepare_rreq(vdp, bp, rreq);
1358		if (retval == XF_COMP) {
1359			/* finish this bp, switch to next one */
1360			if (vdp->xdf_xdev_iostat != NULL)
1361				kstat_waitq_to_runq(
1362				    KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
1363			vdp->xdf_f_act = bp->av_forw;
1364			bp->av_forw = NULL;
1365		}
1366	}
1367
1368	/*
1369	 * Send the request(s) to the backend
1370	 */
1371	if (rreqready) {
1372		if (xvdi_ring_push_request(xbr)) {
1373			DPRINTF(IO_DBG, ("xdf_iostart: "
1374			    "sent request(s) to backend\n"));
1375			xvdi_notify_oe(vdp->xdf_dip);
1376		}
1377	}
1378
1379	mutex_exit(&vdp->xdf_dev_lk);
1380}
1381
1382/*
1383 * populate a single blkif_request_t w/ a buf
1384 */
1385static int
1386xdf_prepare_rreq(xdf_t *vdp, struct buf *bp, blkif_request_t *rreq)
1387{
1388	int		rval;
1389	grant_ref_t	gr;
1390	uint8_t		fsect, lsect;
1391	size_t		bcnt;
1392	paddr_t		dma_addr;
1393	off_t		blk_off;
1394	dev_info_t	*dip = vdp->xdf_dip;
1395	blkif_vdev_t	vdev = xvdi_get_vdevnum(dip);
1396	v_req_t		*vreq = BP2VREQ(bp);
1397	uint64_t	blkno = vreq->v_blkno;
1398	uint_t		ndmacs = vreq->v_ndmacs;
1399	ddi_acc_handle_t acchdl = vdp->xdf_xb_ring_hdl;
1400	int		seg = 0;
1401	int		isread = IS_READ(bp);
1402
1403	if (isread)
1404		ddi_put8(acchdl, &rreq->operation, BLKIF_OP_READ);
1405	else {
1406		switch (vreq->v_flush_diskcache) {
1407		case FLUSH_DISKCACHE:
1408			ddi_put8(acchdl, &rreq->operation,
1409			    BLKIF_OP_FLUSH_DISKCACHE);
1410			ddi_put16(acchdl, &rreq->handle, vdev);
1411			ddi_put64(acchdl, &rreq->id,
1412			    (uint64_t)(uintptr_t)(vreq->v_gs));
1413			ddi_put8(acchdl, &rreq->nr_segments, 0);
1414			return (XF_COMP);
1415		case WRITE_BARRIER:
1416			ddi_put8(acchdl, &rreq->operation,
1417			    BLKIF_OP_WRITE_BARRIER);
1418			break;
1419		default:
1420			if (!vdp->xdf_wce)
1421				ddi_put8(acchdl, &rreq->operation,
1422				    BLKIF_OP_WRITE_BARRIER);
1423			else
1424				ddi_put8(acchdl, &rreq->operation,
1425				    BLKIF_OP_WRITE);
1426			break;
1427		}
1428	}
1429
1430	ddi_put16(acchdl, &rreq->handle, vdev);
1431	ddi_put64(acchdl, &rreq->sector_number, blkno);
1432	ddi_put64(acchdl, &rreq->id, (uint64_t)(uintptr_t)(vreq->v_gs));
1433
1434	/*
1435	 * Loop until all segments are populated or we run out of DMA cookies
1436	 */
1437	for (;;) {
1438	/*
1439	 * Each segment of a blkif request can transfer up to
1440	 * one 4K page of data.
1441	 */
1442		bcnt = vreq->v_dmac.dmac_size;
1443		ASSERT(bcnt <= PAGESIZE);
1444		ASSERT((bcnt % XB_BSIZE) == 0);
1445		dma_addr = vreq->v_dmac.dmac_laddress;
1446		blk_off = (uint_t)((paddr_t)XB_SEGOFFSET & dma_addr);
1447		ASSERT((blk_off & XB_BMASK) == 0);
1448		fsect = blk_off >> XB_BSHIFT;
1449		lsect = fsect + (bcnt >> XB_BSHIFT) - 1;
1450		ASSERT(fsect < XB_MAX_SEGLEN / XB_BSIZE &&
1451		    lsect < XB_MAX_SEGLEN / XB_BSIZE);
1452		DPRINTF(IO_DBG, ("  ""seg%d: dmacS %lu blk_off %ld\n",
1453		    seg, vreq->v_dmac.dmac_size, blk_off));
1454		gr = gs_grant(vreq->v_gs, PATOMA(dma_addr) >> PAGESHIFT);
1455		ddi_put32(acchdl, &rreq->seg[seg].gref, gr);
1456		ddi_put8(acchdl, &rreq->seg[seg].first_sect, fsect);
1457		ddi_put8(acchdl, &rreq->seg[seg].last_sect, lsect);
1458		DPRINTF(IO_DBG, ("  ""seg%d: fs %d ls %d gr %d dma 0x%"PRIx64
1459		    "\n", seg, fsect, lsect, gr, dma_addr));
1460
1461		blkno += (bcnt >> XB_BSHIFT);
1462		seg++;
1463		ASSERT(seg <= BLKIF_MAX_SEGMENTS_PER_REQUEST);
1464		if (--ndmacs) {
1465			ddi_dma_nextcookie(vreq->v_dmahdl, &vreq->v_dmac);
1466			continue;
1467		}
1468
1469		vreq->v_status = VREQ_DMAWIN_DONE;
1470		vreq->v_blkno = blkno;
1471		if (vreq->v_dmaw + 1 == vreq->v_ndmaws)
1472			/* last win */
1473			rval = XF_COMP;
1474		else
1475			rval = XF_PARTIAL;
1476		break;
1477	}
1478	ddi_put8(acchdl,  &rreq->nr_segments, seg);
1479	DPRINTF(IO_DBG, ("xdf_prepare_rreq: request id=%"PRIx64" ready\n",
1480	    rreq->id));
1481
1482	return (rval);
1483}
1484
1485#define	XDF_QSEC	50000	/* .005 second */
1486#define	XDF_POLLCNT	12	/* loop for 12 times before time out */
1487
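/*
 * Poll the ring until all outstanding requests have completed, calling
 * the interrupt handler directly to reap responses.  Used for polled
 * I/O, reset and suspend.  Gives up with EIO after XDF_POLLCNT
 * iterations with increasing delays.
 */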
1488static int
1489xdf_drain_io(xdf_t *vdp)
1490{
1491	int pollc, rval;
1492	xendev_ring_t *xbr;
1493
1494	if (xdfdebug & SUSRES_DBG)
1495		xen_printf("xdf_drain_io: start\n");
1496
1497	mutex_enter(&vdp->xdf_dev_lk);
1498
1499	if ((vdp->xdf_status != XD_READY) && (vdp->xdf_status != XD_SUSPEND))
1500		goto out;
1501
1502	rval = 0;
1503	xbr = vdp->xdf_xb_ring;
1504	ASSERT(xbr != NULL);
1505
1506	for (pollc = 0; pollc < XDF_POLLCNT; pollc++) {
1507		if (xvdi_ring_has_unconsumed_responses(xbr)) {
1508			mutex_exit(&vdp->xdf_dev_lk);
1509			(void) xdf_intr((caddr_t)vdp);
1510			mutex_enter(&vdp->xdf_dev_lk);
1511		}
1512		if (!xvdi_ring_has_incomp_request(xbr))
1513			goto out;
1514
1515#ifndef	XPV_HVM_DRIVER
1516		(void) HYPERVISOR_yield();
1517#endif /* XPV_HVM_DRIVER */
1518		/*
1519		 * file-backed devices can be slow
1520		 */
1521		drv_usecwait(XDF_QSEC << pollc);
1522	}
1523	cmn_err(CE_WARN, "xdf_polled_io: timeout");
1524	rval = EIO;
1525out:
1526	mutex_exit(&vdp->xdf_dev_lk);
1527	if (xdfdebug & SUSRES_DBG)
1528		xen_printf("xdf_drain_io: end, err=%d\n", rval);
1529	return (rval);
1530}
1531
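/*
 * Synchronous block read/write, used as the cmlb callback for label I/O
 * and internally for the cache-flush probe and barrier writes.  Build a
 * raw buf, queue it on the active list, start I/O and biowait() for it.
 */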
1532/* ARGSUSED5 */
1533int
1534xdf_lb_rdwr(dev_info_t *devi, uchar_t cmd, void *bufp,
1535    diskaddr_t start, size_t reqlen, void *tg_cookie)
1536{
1537	xdf_t *vdp;
1538	struct buf *bp;
1539	int err = 0;
1540
1541	vdp = ddi_get_soft_state(vbd_ss, ddi_get_instance(devi));
1542	if (vdp == NULL)
1543		return (ENXIO);
1544
1545	if ((start + (reqlen >> DEV_BSHIFT)) > vdp->xdf_pgeom.g_capacity)
1546		return (EINVAL);
1547
1548	bp = getrbuf(KM_SLEEP);
1549	if (cmd == TG_READ)
1550		bp->b_flags = B_BUSY | B_READ;
1551	else
1552		bp->b_flags = B_BUSY | B_WRITE;
1553	bp->b_un.b_addr = bufp;
1554	bp->b_bcount = reqlen;
1555	bp->b_blkno = start;
1556	bp->b_edev = DDI_DEV_T_NONE; /* don't have dev_t */
1557
1558	mutex_enter(&vdp->xdf_dev_lk);
1559	if (vdp->xdf_xdev_iostat != NULL)
1560		kstat_waitq_enter(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
1561	if (vdp->xdf_f_act == NULL) {
1562		vdp->xdf_f_act = vdp->xdf_l_act = bp;
1563	} else {
1564		vdp->xdf_l_act->av_forw = bp;
1565		vdp->xdf_l_act = bp;
1566	}
1567	mutex_exit(&vdp->xdf_dev_lk);
1568	xdf_iostart(vdp);
1569	err = biowait(bp);
1570
1571	ASSERT(bp->b_flags & B_DONE);
1572
1573	freerbuf(bp);
1574	return (err);
1575}
1576
1577/*
1578 * synthetic geometry
1579 */
1580#define	XDF_NSECTS	256
1581#define	XDF_NHEADS	16
1582
1583static void
1584xdf_synthetic_pgeom(dev_info_t *devi, cmlb_geom_t *geomp)
1585{
1586	xdf_t *vdp;
1587	uint_t ncyl;
1588
1589	vdp = ddi_get_soft_state(vbd_ss, ddi_get_instance(devi));
1590
1591	ncyl = vdp->xdf_xdev_nblocks / (XDF_NHEADS * XDF_NSECTS);
1592
1593	geomp->g_ncyl = ncyl == 0 ? 1 : ncyl;
1594	geomp->g_acyl = 0;
1595	geomp->g_nhead = XDF_NHEADS;
1596	geomp->g_secsize = XB_BSIZE;
1597	geomp->g_nsect = XDF_NSECTS;
1598	geomp->g_intrlv = 0;
1599	geomp->g_rpm = 7200;
1600	geomp->g_capacity = vdp->xdf_xdev_nblocks;
1601}
1602
1603static int
1604xdf_lb_getcap(dev_info_t *devi, diskaddr_t *capp)
1605{
1606	xdf_t *vdp;
1607
1608	vdp = ddi_get_soft_state(vbd_ss, ddi_get_instance(devi));
1609
1610	if (vdp == NULL)
1611		return (ENXIO);
1612
1613	mutex_enter(&vdp->xdf_dev_lk);
1614	*capp = vdp->xdf_pgeom.g_capacity;
1615	DPRINTF(LBL_DBG, ("capacity %llu\n", *capp));
1616	mutex_exit(&vdp->xdf_dev_lk);
1617	return (0);
1618}
1619
1620static int
1621xdf_lb_getpgeom(dev_info_t *devi, cmlb_geom_t *geomp)
1622{
1623	xdf_t *vdp;
1624
1625	if ((vdp = ddi_get_soft_state(vbd_ss, ddi_get_instance(devi))) == NULL)
1626		return (ENXIO);
1627	*geomp = vdp->xdf_pgeom;
1628	return (0);
1629}
1630
1631/*
1632 * No real HBA, no geometry available from it
1633 */
1634/*ARGSUSED*/
1635static int
1636xdf_lb_getvgeom(dev_info_t *devi, cmlb_geom_t *geomp)
1637{
1638	return (EINVAL);
1639}
1640
1641static int
1642xdf_lb_getattribute(dev_info_t *devi, tg_attribute_t *tgattributep)
1643{
1644	xdf_t *vdp;
1645
1646	if (!(vdp = ddi_get_soft_state(vbd_ss, ddi_get_instance(devi))))
1647		return (ENXIO);
1648
1649	if (XD_IS_RO(vdp))
1650		tgattributep->media_is_writable = 0;
1651	else
1652		tgattributep->media_is_writable = 1;
1653	return (0);
1654}
1655
1656/* ARGSUSED3 */
1657int
1658xdf_lb_getinfo(dev_info_t *devi, int cmd, void *arg, void *tg_cookie)
1659{
1660	switch (cmd) {
1661	case TG_GETPHYGEOM:
1662		return (xdf_lb_getpgeom(devi, (cmlb_geom_t *)arg));
1663	case TG_GETVIRTGEOM:
1664		return (xdf_lb_getvgeom(devi, (cmlb_geom_t *)arg));
1665	case TG_GETCAPACITY:
1666		return (xdf_lb_getcap(devi, (diskaddr_t *)arg));
1667	case TG_GETBLOCKSIZE:
1668		*(uint32_t *)arg = XB_BSIZE;
1669		return (0);
1670	case TG_GETATTR:
1671		return (xdf_lb_getattribute(devi, (tg_attribute_t *)arg));
1672	default:
1673		return (ENOTTY);
1674	}
1675}
1676
1677/*
1678 * Kick-off connect process
1679 * Status should be XD_UNKNOWN or XD_CLOSED
1680 * On success, status will be changed to XD_INIT
1681 * On error, status won't be changed
1682 */
1683static int
1684xdf_start_connect(xdf_t *vdp)
1685{
1686	char *xsnode;
1687	grant_ref_t gref;
1688	xenbus_transaction_t xbt;
1689	int rv;
1690	dev_info_t *dip = vdp->xdf_dip;
1691
1692	if ((vdp->xdf_peer = xvdi_get_oeid(dip)) == (domid_t)-1)
1693		goto errout;
1694
1695	if (xvdi_alloc_evtchn(dip) != DDI_SUCCESS) {
1696		cmn_err(CE_WARN, "xdf@%s: failed to alloc event channel",
1697		    ddi_get_name_addr(dip));
1698		goto errout;
1699	}
1700	vdp->xdf_evtchn = xvdi_get_evtchn(dip);
1701#ifdef XPV_HVM_DRIVER
1702	ec_bind_evtchn_to_handler(vdp->xdf_evtchn, IPL_VBD, xdf_intr, vdp);
1703#else /* !XPV_HVM_DRIVER */
1704	if (ddi_add_intr(dip, 0, NULL, NULL, xdf_intr, (caddr_t)vdp) !=
1705	    DDI_SUCCESS) {
1706		cmn_err(CE_WARN, "xdf_start_connect: xdf@%s: "
1707		    "failed to add intr handler", ddi_get_name_addr(dip));
1708		goto errout1;
1709	}
1710#endif /* !XPV_HVM_DRIVER */
1711
1712	if (xvdi_alloc_ring(dip, BLKIF_RING_SIZE,
1713	    sizeof (union blkif_sring_entry), &gref, &vdp->xdf_xb_ring) !=
1714	    DDI_SUCCESS) {
1715		cmn_err(CE_WARN, "xdf@%s: failed to alloc comm ring",
1716		    ddi_get_name_addr(dip));
1717		goto errout2;
1718	}
1719	vdp->xdf_xb_ring_hdl = vdp->xdf_xb_ring->xr_acc_hdl; /* ugly!! */
1720
1721	/*
1722	 * Write into xenstore the info needed by backend
1723	 */
1724	if ((xsnode = xvdi_get_xsname(dip)) == NULL) {
1725		cmn_err(CE_WARN, "xdf@%s: "
1726		    "failed to get xenstore node path",
1727		    ddi_get_name_addr(dip));
1728		goto fail_trans;
1729	}
1730trans_retry:
1731	if (xenbus_transaction_start(&xbt)) {
1732		cmn_err(CE_WARN, "xdf@%s: failed to start transaction",
1733		    ddi_get_name_addr(dip));
1734		xvdi_fatal_error(dip, EIO, "transaction start");
1735		goto fail_trans;
1736	}
1737
1738	if (rv = xenbus_printf(xbt, xsnode, "ring-ref", "%u", gref)) {
1739		cmn_err(CE_WARN, "xdf@%s: failed to write ring-ref",
1740		    ddi_get_name_addr(dip));
1741		xvdi_fatal_error(dip, rv, "writing ring-ref");
1742		goto abort_trans;
1743	}
1744
1745	if (rv = xenbus_printf(xbt, xsnode, "event-channel", "%u",
1746	    vdp->xdf_evtchn)) {
1747		cmn_err(CE_WARN, "xdf@%s: failed to write event-channel",
1748		    ddi_get_name_addr(dip));
1749		xvdi_fatal_error(dip, rv, "writing event-channel");
1750		goto abort_trans;
1751	}
1752
1753	/*
1754	 * "protocol" is written by the domain builder in the case of PV
1755	 * domains. However, it is not written for HVM domains, so let's
1756	 * write it here.
1757	 */
1758	if (rv = xenbus_printf(xbt, xsnode, "protocol", "%s",
1759	    XEN_IO_PROTO_ABI_NATIVE)) {
1760		cmn_err(CE_WARN, "xdf@%s: failed to write protocol",
1761		    ddi_get_name_addr(dip));
1762		xvdi_fatal_error(dip, rv, "writing protocol");
1763		goto abort_trans;
1764	}
1765
1766	if ((rv = xvdi_switch_state(dip, xbt, XenbusStateInitialised)) > 0) {
1767		cmn_err(CE_WARN, "xdf@%s: "
1768		    "failed to switch state to XenbusStateInitialised",
1769		    ddi_get_name_addr(dip));
1770		xvdi_fatal_error(dip, rv, "writing state");
1771		goto abort_trans;
1772	}
1773
1774	/* kick-off connect process */
1775	if (rv = xenbus_transaction_end(xbt, 0)) {
1776		if (rv == EAGAIN)
1777			goto trans_retry;
1778		cmn_err(CE_WARN, "xdf@%s: failed to end transaction",
1779		    ddi_get_name_addr(dip));
1780		xvdi_fatal_error(dip, rv, "completing transaction");
1781		goto fail_trans;
1782	}
1783
1784	ASSERT(mutex_owned(&vdp->xdf_cb_lk));
1785	mutex_enter(&vdp->xdf_dev_lk);
1786	vdp->xdf_status = XD_INIT;
1787	mutex_exit(&vdp->xdf_dev_lk);
1788
1789	return (DDI_SUCCESS);
1790
1791abort_trans:
1792	(void) xenbus_transaction_end(xbt, 1);
1793fail_trans:
1794	xvdi_free_ring(vdp->xdf_xb_ring);
1795errout2:
1796#ifdef XPV_HVM_DRIVER
1797	ec_unbind_evtchn(vdp->xdf_evtchn);
1798#else /* !XPV_HVM_DRIVER */
1799	(void) ddi_remove_intr(vdp->xdf_dip, 0, NULL);
1800#endif /* !XPV_HVM_DRIVER */
1801errout1:
1802	xvdi_free_evtchn(dip);
1803errout:
1804	cmn_err(CE_WARN, "xdf@%s: fail to kick-off connecting",
1805	    ddi_get_name_addr(dip));
1806	return (DDI_FAILURE);
1807}
1808
1809/*
1810 * Kick-off disconnect process
1811 * Status won't be changed
1812 */
1813static int
1814xdf_start_disconnect(xdf_t *vdp)
1815{
1816	if (xvdi_switch_state(vdp->xdf_dip, XBT_NULL, XenbusStateClosed) > 0) {
1817		cmn_err(CE_WARN, "xdf@%s: fail to kick-off disconnecting",
1818		    ddi_get_name_addr(vdp->xdf_dip));
1819		return (DDI_FAILURE);
1820	}
1821
1822	return (DDI_SUCCESS);
1823}
1824
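/*
 * Read and cache the block at xdf_flush_block.  When the backend lacks
 * an explicit flush operation, this block is rewritten as a barrier
 * write to force a disk cache flush (see DKIOCFLUSHWRITECACHE and
 * check_fbwrite()).
 */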
1825int
1826xdf_get_flush_block(xdf_t *vdp)
1827{
1828	/*
1829	 * Get a DEV_BSIZE-aligned buffer
1830	 */
1831	vdp->xdf_flush_mem = kmem_alloc(DEV_BSIZE * 2, KM_SLEEP);
1832	vdp->xdf_cache_flush_block =
1833	    (char *)P2ROUNDUP((uintptr_t)(vdp->xdf_flush_mem), DEV_BSIZE);
1834	if (xdf_lb_rdwr(vdp->xdf_dip, TG_READ, vdp->xdf_cache_flush_block,
1835	    xdf_flush_block, DEV_BSIZE, NULL) != 0)
1836		return (DDI_FAILURE);
1837	return (DDI_SUCCESS);
1838}
1839
1840/*
1841 * Finish other initialization after we've connected to backend
1842 * Status should be XD_INIT before calling this routine
1843 * On success, status should be changed to XD_READY
1844 * On error, status should stay XD_INIT
1845 */
1846static int
1847xdf_post_connect(xdf_t *vdp)
1848{
1849	int rv;
1850	uint_t len;
1851	char *type;
1852	char *barrier;
1853	dev_info_t *devi = vdp->xdf_dip;
1854
1855	/*
1856	 * Determine if feature barrier is supported by backend
1857	 */
1858	if (xenbus_read(XBT_NULL, xvdi_get_oename(devi),
1859	    "feature-barrier", (void **)&barrier, &len) == 0) {
1860		vdp->xdf_feature_barrier = 1;
1861		kmem_free(barrier, len);
1862	} else {
1863		cmn_err(CE_NOTE, "xdf@%s: failed to read feature-barrier",
1864		    ddi_get_name_addr(vdp->xdf_dip));
1865		vdp->xdf_feature_barrier = 0;
1866	}
1867
1868	/* probe backend */
1869	if (rv = xenbus_gather(XBT_NULL, xvdi_get_oename(devi),
1870	    "sectors", "%"SCNu64, &vdp->xdf_xdev_nblocks,
1871	    "info", "%u", &vdp->xdf_xdev_info, NULL)) {
1872		cmn_err(CE_WARN, "xdf_post_connect: xdf@%s: "
1873		    "cannot read backend info", ddi_get_name_addr(devi));
1874		xvdi_fatal_error(devi, rv, "reading backend info");
1875		return (DDI_FAILURE);
1876	}
1877
1878	/*
1879	 * Make sure that the device we're connecting isn't smaller than
1880	 * the old connected device.
1881	 */
1882	if (vdp->xdf_xdev_nblocks < vdp->xdf_pgeom.g_capacity) {
1883		cmn_err(CE_WARN, "xdf_post_connect: xdf@%s: "
1884		    "backend disk device shrank", ddi_get_name_addr(devi));
1885		/* XXX:  call xvdi_fatal_error() here? */
1886		xvdi_fatal_error(devi, rv, "reading backend info");
1887		return (DDI_FAILURE);
1888	}
1889
1890#ifdef _ILP32
1891	if (vdp->xdf_xdev_nblocks > DK_MAX_BLOCKS) {
1892		cmn_err(CE_WARN, "xdf_post_connect: xdf@%s: "
1893		    "backend disk device too large with %llu blocks for"
1894		    " 32-bit kernel", ddi_get_name_addr(devi),
1895		    vdp->xdf_xdev_nblocks);
1896		xvdi_fatal_error(devi, rv, "reading backend info");
1897		return (DDI_FAILURE);
1898	}
1899#endif
1900
1901
1902	/*
1903	 * Only update the physical geometry to reflect the new device
1904	 * size if this is the first time we're connecting to the backend
1905	 * device.  Once we assign a physical geometry to a device it stays
1906	 * fixed until:
1907	 *	- we get detach and re-attached (at which point we
1908	 *	  automatically assign a new physical geometry).
1909	 *	- someone calls TG_SETPHYGEOM to explicitly set the
1910	 *	  physical geometry.
1911	 */
1912	if (vdp->xdf_pgeom.g_capacity == 0)
1913		xdf_synthetic_pgeom(devi, &vdp->xdf_pgeom);
1914
1915	/* fix disk type */
1916	if (xenbus_read(XBT_NULL, xvdi_get_xsname(devi), "device-type",
1917	    (void **)&type, &len) != 0) {
1918		cmn_err(CE_WARN, "xdf_post_connect: xdf@%s: "
1919		    "cannot read device-type", ddi_get_name_addr(devi));
1920		xvdi_fatal_error(devi, rv, "reading device-type");
1921		return (DDI_FAILURE);
1922	}
1923	if (strcmp(type, "cdrom") == 0)
1924		vdp->xdf_xdev_info |= VDISK_CDROM;
1925	kmem_free(type, len);
1926
1927	/*
1928	 * We have already created all the minor nodes via cmlb_attach(),
1929	 * using default values in xdf_attach(), so that xdf_open() can block
1930	 * if anyone (say, the booting thread) tries to open the device
1931	 * before we are connected to the backend.  Now that we are almost
1932	 * connected, refresh those minor nodes with the latest backend info.
1933	 *
1934	 * Don't do this if xdf is already open (which can happen during
1935	 * resume), because cmlb_attach() would invalidate the label info and
1936	 * confuse anyone who already has the node open.
1937	 */
1938	if (!xdf_isopen(vdp, -1) && (XD_IS_CD(vdp) || XD_IS_RM(vdp))) {
1939		/* re-init cmlb w/ latest info we got from backend */
1940		if (cmlb_attach(devi, &xdf_lb_ops,
1941		    XD_IS_CD(vdp) ? DTYPE_RODIRECT : DTYPE_DIRECT,
1942		    XD_IS_RM(vdp), 1,
1943		    XD_IS_CD(vdp) ? DDI_NT_CD_XVMD : DDI_NT_BLOCK_XVMD,
1944#if defined(XPV_HVM_DRIVER)
1945		    CMLB_CREATE_ALTSLICE_VTOC_16_DTYPE_DIRECT |
1946		    CMLB_INTERNAL_MINOR_NODES,
1947#else /* !XPV_HVM_DRIVER */
1948		    CMLB_FAKE_LABEL_ONE_PARTITION,
1949#endif /* !XPV_HVM_DRIVER */
1950		    vdp->xdf_vd_lbl, NULL) != 0) {
1951			cmn_err(CE_WARN, "xdf@%s: cmlb attach failed",
1952			    ddi_get_name_addr(devi));
1953			return (DDI_FAILURE);
1954		}
1955	}
1956
1957	/* mark the vbd as ready for I/O */
1958	ASSERT(mutex_owned(&vdp->xdf_cb_lk));
1959	mutex_enter(&vdp->xdf_dev_lk);
1960	vdp->xdf_status = XD_READY;
1961	mutex_exit(&vdp->xdf_dev_lk);
1962	/*
1963	 * If backend has feature-barrier, see if it supports disk
1964	 * cache flush op.
1965	 */
1966	vdp->xdf_flush_supported = 0;
1967	if (vdp->xdf_feature_barrier) {
1968		/*
1969		 * Pretend we already know flush is supported so probe
1970		 * will attempt the correct op.
1971		 */
1972		vdp->xdf_flush_supported = 1;
1973		if (xdf_lb_rdwr(vdp->xdf_dip, TG_WRITE, NULL, 0, 0, 0) == 0) {
1974			vdp->xdf_flush_supported = 1;
1975		} else {
1976			vdp->xdf_flush_supported = 0;
1977			/*
1978			 * If the other end does not support the cache flush op
1979			 * then we must use a barrier-write to force disk
1980			 * cache flushing.  Barrier writes require that a data
1981			 * block actually be written.
1982			 * Cache a block to barrier-write when we are
1983			 * asked to perform a flush.
1984			 * XXX - would it be better to just copy 1 block
1985			 * (512 bytes) from whatever write we did last
1986			 * and rewrite that block?
1987			 */
1988			if (xdf_get_flush_block(vdp) != DDI_SUCCESS)
1989				return (DDI_FAILURE);
1990		}
1991	}
1992
1993	cmn_err(CE_CONT, "?xdf@%s: %"PRIu64" blocks", ddi_get_name_addr(devi),
1994	    (uint64_t)vdp->xdf_xdev_nblocks);
1995
1996	return (DDI_SUCCESS);
1997}
1998
1999/*
2000 * Finish the remaining teardown after we've disconnected from the backend,
2001 * when status is XD_CLOSING or XD_INIT.  On return, status is XD_CLOSED.
2002 */
2003static void
2004xdf_post_disconnect(xdf_t *vdp)
2005{
2006#ifdef XPV_HVM_DRIVER
2007	ec_unbind_evtchn(vdp->xdf_evtchn);
2008#else /* !XPV_HVM_DRIVER */
2009	(void) ddi_remove_intr(vdp->xdf_dip, 0, NULL);
2010#endif /* !XPV_HVM_DRIVER */
2011	xvdi_free_evtchn(vdp->xdf_dip);
2012	xvdi_free_ring(vdp->xdf_xb_ring);
2013	vdp->xdf_xb_ring = NULL;
2014	vdp->xdf_xb_ring_hdl = NULL;
2015	vdp->xdf_peer = (domid_t)-1;
2016
2017	ASSERT(mutex_owned(&vdp->xdf_cb_lk));
2018	mutex_enter(&vdp->xdf_dev_lk);
2019	vdp->xdf_status = XD_CLOSED;
2020	mutex_exit(&vdp->xdf_dev_lk);
2021}
2022
2023/*ARGSUSED*/
2024static void
2025xdf_oe_change(dev_info_t *dip, ddi_eventcookie_t id, void *arg, void *impl_data)
2026{
2027	XenbusState new_state = *(XenbusState *)impl_data;
2028	xdf_t *vdp = (xdf_t *)ddi_get_driver_private(dip);
2029	boolean_t unexpect_die = B_FALSE;
2030	int status;
2031
2032	DPRINTF(DDI_DBG, ("xdf@%s: otherend state change to %d!\n",
2033	    ddi_get_name_addr(dip), new_state));
2034
2035	mutex_enter(&vdp->xdf_cb_lk);
2036
2037	if (xdf_check_state_transition(vdp, new_state) == DDI_FAILURE) {
2038		mutex_exit(&vdp->xdf_cb_lk);
2039		return;
2040	}
2041
2042	switch (new_state) {
2043	case XenbusStateInitialising:
2044		ASSERT(vdp->xdf_status == XD_CLOSED);
2045		/*
2046		 * backend recovered from a previous failure,
2047		 * kick off the connect process again
2048		 */
2049		if (xdf_start_connect(vdp) != DDI_SUCCESS) {
2050			cmn_err(CE_WARN, "xdf@%s:"
2051			    " failed to start reconnecting to backend",
2052			    ddi_get_name_addr(dip));
2053		}
2054		break;
2055	case XenbusStateConnected:
2056		ASSERT(vdp->xdf_status == XD_INIT);
2057		(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateConnected);
2058		/* finish final init after connect */
2059		if (xdf_post_connect(vdp) != DDI_SUCCESS)
2060			(void) xdf_start_disconnect(vdp);
2061		break;
2062	case XenbusStateClosing:
2063		if (vdp->xdf_status == XD_READY) {
2064			mutex_enter(&vdp->xdf_dev_lk);
2065			if (xdf_isopen(vdp, -1)) {
2066				cmn_err(CE_NOTE, "xdf@%s: hot-unplug failed, "
2067				    "still in use", ddi_get_name_addr(dip));
2068				mutex_exit(&vdp->xdf_dev_lk);
2069				break;
2070			} else {
2071				vdp->xdf_status = XD_CLOSING;
2072			}
2073			mutex_exit(&vdp->xdf_dev_lk);
2074		}
2075		(void) xdf_start_disconnect(vdp);
2076		break;
2077	case XenbusStateClosed:
2078		/* first check if BE closed unexpectedly */
2079		mutex_enter(&vdp->xdf_dev_lk);
2080		if (xdf_isopen(vdp, -1)) {
2081			unexpect_die = B_TRUE;
2082			unexpectedie(vdp);
2083			cmn_err(CE_WARN, "xdf@%s: backend closed, "
2084			    "reconnecting...", ddi_get_name_addr(dip));
2085		}
2086		mutex_exit(&vdp->xdf_dev_lk);
2087
2088		if (vdp->xdf_status == XD_READY) {
2089			mutex_enter(&vdp->xdf_dev_lk);
2090			vdp->xdf_status = XD_CLOSING;
2091			mutex_exit(&vdp->xdf_dev_lk);
2092
2093#ifdef	DOMU_BACKEND
2094			(void) xvdi_post_event(dip, XEN_HP_REMOVE);
2095#endif
2096
2097			xdf_post_disconnect(vdp);
2098			(void) xvdi_switch_state(dip, XBT_NULL,
2099			    XenbusStateClosed);
2100		} else if ((vdp->xdf_status == XD_INIT) ||
2101		    (vdp->xdf_status == XD_CLOSING)) {
2102			xdf_post_disconnect(vdp);
2103		} else {
2104			mutex_enter(&vdp->xdf_dev_lk);
2105			vdp->xdf_status = XD_CLOSED;
2106			mutex_exit(&vdp->xdf_dev_lk);
2107		}
2108	}
2109
2110	/* notify anybody waiting for oe state change */
2111	mutex_enter(&vdp->xdf_dev_lk);
2112	cv_broadcast(&vdp->xdf_dev_cv);
2113	mutex_exit(&vdp->xdf_dev_lk);
2114
2115	status = vdp->xdf_status;
2116	mutex_exit(&vdp->xdf_cb_lk);
2117
2118	if (status == XD_READY) {
2119		xdf_iostart(vdp);
2120	} else if ((status == XD_CLOSED) && !unexpect_die) {
2121		/* interface is closed successfully, remove all minor nodes */
2122		if (vdp->xdf_vd_lbl != NULL) {
2123			cmlb_detach(vdp->xdf_vd_lbl, NULL);
2124			cmlb_free_handle(&vdp->xdf_vd_lbl);
2125			vdp->xdf_vd_lbl = NULL;
2126		}
2127	}
2128}
2129
2130/* check if a partition is open; -1 means check all partitions on the disk */
2131static boolean_t
2132xdf_isopen(xdf_t *vdp, int partition)
2133{
2134	int i;
2135	ulong_t parbit;
2136	boolean_t rval = B_FALSE;
2137
2138	ASSERT((partition == -1) ||
2139	    ((partition >= 0) && (partition < XDF_PEXT)));
2140
2141	if (partition == -1)
2142		parbit = (ulong_t)-1;
2143	else
2144		parbit = 1 << partition;
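	/*
	 * xdf_vd_open[] keeps one bitmask of open partitions per open type
	 * (otyp); the device counts as open if any type has any of the
	 * requested partition bits set.
	 */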
2145
2146	for (i = 0; i < OTYPCNT; i++) {
2147		if (vdp->xdf_vd_open[i] & parbit)
2148			rval = B_TRUE;
2149	}
2150
2151	return (rval);
2152}
2153
2154/*
2155 * xdf_check_state_transition() checks whether the reported XenbusState
2156 * change is a valid transition from our current status.
2157 * The new state is written by the backend domain, or manually in dom0
2158 * via xenstore-write.
2159 */
2160static int
2161xdf_check_state_transition(xdf_t *vdp, XenbusState oestate)
2162{
2163	int status;
2164	int stcheck;
2165#define	STOK	0 /* needs further processing */
2166#define	STNOP	1 /* no action needs to be taken */
2167#define	STBUG	2 /* unexpected state change, could be a bug */
2168
2169	status = vdp->xdf_status;
2170	stcheck = STOK;
2171
2172	switch (status) {
2173	case XD_UNKNOWN:
2174		if ((oestate == XenbusStateUnknown)		||
2175		    (oestate == XenbusStateConnected))
2176			stcheck = STBUG;
2177		else if ((oestate == XenbusStateInitialising)	||
2178		    (oestate == XenbusStateInitWait)		||
2179		    (oestate == XenbusStateInitialised))
2180			stcheck = STNOP;
2181		break;
2182	case XD_INIT:
2183		if (oestate == XenbusStateUnknown)
2184			stcheck = STBUG;
2185		else if ((oestate == XenbusStateInitialising)	||
2186		    (oestate == XenbusStateInitWait)		||
2187		    (oestate == XenbusStateInitialised))
2188			stcheck = STNOP;
2189		break;
2190	case XD_READY:
2191		if ((oestate == XenbusStateUnknown)		||
2192		    (oestate == XenbusStateInitialising)	||
2193		    (oestate == XenbusStateInitWait)		||
2194		    (oestate == XenbusStateInitialised))
2195			stcheck = STBUG;
2196		else if (oestate == XenbusStateConnected)
2197			stcheck = STNOP;
2198		break;
2199	case XD_CLOSING:
2200		if ((oestate == XenbusStateUnknown)		||
2201		    (oestate == XenbusStateInitialising)	||
2202		    (oestate == XenbusStateInitWait)		||
2203		    (oestate == XenbusStateInitialised)		||
2204		    (oestate == XenbusStateConnected))
2205			stcheck = STBUG;
2206		else if (oestate == XenbusStateClosing)
2207			stcheck = STNOP;
2208		break;
2209	case XD_CLOSED:
2210		if ((oestate == XenbusStateUnknown)		||
2211		    (oestate == XenbusStateConnected))
2212			stcheck = STBUG;
2213		else if ((oestate == XenbusStateInitWait)	||
2214		    (oestate == XenbusStateInitialised)		||
2215		    (oestate == XenbusStateClosing)		||
2216		    (oestate == XenbusStateClosed))
2217			stcheck = STNOP;
2218		break;
2219	case XD_SUSPEND:
2220	default:
2221		stcheck = STBUG;
2222	}
2223
2224	if (stcheck == STOK)
2225		return (DDI_SUCCESS);
2226
2227	if (stcheck == STBUG)
2228		cmn_err(CE_NOTE, "xdf@%s: unexpected otherend "
2229		    "state change to %d when status is %d",
2230		    ddi_get_name_addr(vdp->xdf_dip), oestate, status);
2231
2232	return (DDI_FAILURE);
2233}
2234
2235static int
2236xdf_connect(xdf_t *vdp, boolean_t wait)
2237{
2238	ASSERT(mutex_owned(&vdp->xdf_dev_lk));
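	/*
	 * Wait for the device to reach XD_READY.  Give up early if the
	 * caller doesn't want to wait, if the status has already moved
	 * past XD_READY (XD_CLOSING and beyond, which won't go back to
	 * XD_READY without a reconnect), or if cv_wait_sig() is
	 * interrupted by a signal.
	 */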
2239	while (vdp->xdf_status != XD_READY) {
2240		if (!wait || (vdp->xdf_status > XD_READY))
2241			break;
2242
2243		if (cv_wait_sig(&vdp->xdf_dev_cv, &vdp->xdf_dev_lk) == 0)
2244			break;
2245	}
2246
2247	return (vdp->xdf_status);
2248}
2249
2250/*
2251 * callback invoked when DMA/GTE resources become available
2252 *
2253 * Note: we only register one callback function with the grant table
2254 * subsystem since we only have one 'struct gnttab_free_callback' in xdf_t.
2255 */
2256static int
2257xdf_dmacallback(caddr_t arg)
2258{
2259	xdf_t *vdp = (xdf_t *)arg;
2260	ASSERT(vdp != NULL);
2261
2262	DPRINTF(DMA_DBG, ("xdf@%s: DMA callback started\n",
2263	    ddi_get_name_addr(vdp->xdf_dip)));
2264
2265	ddi_trigger_softintr(vdp->xdf_softintr_id);
2266	return (DDI_DMA_CALLBACK_DONE);
2267}
2268
2269static uint_t
2270xdf_iorestart(caddr_t arg)
2271{
2272	xdf_t *vdp = (xdf_t *)arg;
2273
2274	ASSERT(vdp != NULL);
2275
2276	mutex_enter(&vdp->xdf_dev_lk);
2277	ASSERT(ISDMACBON(vdp));
2278	SETDMACBOFF(vdp);
2279	mutex_exit(&vdp->xdf_dev_lk);
2280
2281	xdf_iostart(vdp);
2282
2283	return (DDI_INTR_CLAIMED);
2284}
2285
2286static void
2287xdf_timeout_handler(void *arg)
2288{
2289	xdf_t *vdp = arg;
2290
2291	mutex_enter(&vdp->xdf_dev_lk);
2292	vdp->xdf_timeout_id = 0;
2293	mutex_exit(&vdp->xdf_dev_lk);
2294
2295	/* a new timeout may be re-scheduled now that xdf_timeout_id is clear */
2296	xdf_iostart(vdp);
2297}
2298
2299/*
2300 * Alloc a vreq for this bp
2301 * bp->av_back contains the pointer to the vreq upon return
2302 */
2303static v_req_t *
2304vreq_get(xdf_t *vdp, buf_t *bp)
2305{
2306	v_req_t *vreq = NULL;
2307
2308	ASSERT(BP2VREQ(bp) == NULL);
2309
2310	vreq = kmem_cache_alloc(xdf_vreq_cache, KM_NOSLEEP);
2311	if (vreq == NULL) {
2312		if (vdp->xdf_timeout_id == 0)
2313			/* restart I/O after one second */
2314			vdp->xdf_timeout_id =
2315			    timeout(xdf_timeout_handler, vdp, hz);
2316		return (NULL);
2317	}
2318	bzero(vreq, sizeof (v_req_t));
2319
2320	list_insert_head(&vdp->xdf_vreq_act, (void *)vreq);
2321	bp->av_back = (buf_t *)vreq;
2322	vreq->v_buf = bp;
2323	vreq->v_status = VREQ_INIT;
2324	/* init of other fields in vreq is up to the caller */
2325
2326	return (vreq);
2327}
2328
2329static void
2330vreq_free(xdf_t *vdp, v_req_t *vreq)
2331{
2332	buf_t *bp = vreq->v_buf;
2333
2334	list_remove(&vdp->xdf_vreq_act, (void *)vreq);
2335
2336	if (vreq->v_flush_diskcache == FLUSH_DISKCACHE)
2337		goto done;
2338
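	/*
	 * Tear down in reverse order of setup: fall through from the
	 * current v_status so that each case releases only the resources
	 * that were actually acquired.  A flush-diskcache vreq owns no DMA
	 * resources at all, which is why it was short-circuited above.
	 */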
2339	switch (vreq->v_status) {
2340	case VREQ_DMAWIN_DONE:
2341	case VREQ_GS_ALLOCED:
2342	case VREQ_DMABUF_BOUND:
2343		(void) ddi_dma_unbind_handle(vreq->v_dmahdl);
2344		/*FALLTHRU*/
2345	case VREQ_DMAMEM_ALLOCED:
2346		if (!ALIGNED_XFER(bp)) {
2347			ASSERT(vreq->v_abuf != NULL);
2348			if (!IS_ERROR(bp) && IS_READ(bp))
2349				bcopy(vreq->v_abuf, bp->b_un.b_addr,
2350				    bp->b_bcount);
2351			ddi_dma_mem_free(&vreq->v_align);
2352		}
2353		/*FALLTHRU*/
2354	case VREQ_MEMDMAHDL_ALLOCED:
2355		if (!ALIGNED_XFER(bp))
2356			ddi_dma_free_handle(&vreq->v_memdmahdl);
2357		/*FALLTHRU*/
2358	case VREQ_DMAHDL_ALLOCED:
2359		ddi_dma_free_handle(&vreq->v_dmahdl);
2360		break;
2361	default:
2362		break;
2363	}
2364done:
2365	vreq->v_buf->av_back = NULL;
2366	kmem_cache_free(xdf_vreq_cache, vreq);
2367}
2368
2369/*
2370 * Initialize the DMA and grant table resources for the buf
2371 */
2372static int
2373vreq_setup(xdf_t *vdp, v_req_t *vreq)
2374{
2375	int rc;
2376	ddi_dma_attr_t dmaattr;
2377	uint_t ndcs, ndws;
2378	ddi_dma_handle_t dh;
2379	ddi_dma_handle_t mdh;
2380	ddi_dma_cookie_t dc;
2381	ddi_acc_handle_t abh;
2382	caddr_t	aba;
2383	ge_slot_t *gs;
2384	size_t bufsz;
2385	off_t off;
2386	size_t sz;
2387	buf_t *bp = vreq->v_buf;
2388	int dma_flags = (IS_READ(bp) ? DDI_DMA_READ : DDI_DMA_WRITE) |
2389	    DDI_DMA_STREAMING | DDI_DMA_PARTIAL;
2390
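	/*
	 * This switch is a restartable state machine: if a resource
	 * allocation below fails we return DDI_FAILURE with v_status
	 * recording how far we got, and a later retry (kicked off by the
	 * DMA/grant-table callback or the timeout handler) falls through
	 * from that state without repeating the earlier steps.
	 */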
2391	switch (vreq->v_status) {
2392	case VREQ_INIT:
2393		if (IS_FLUSH_DISKCACHE(bp)) {
2394			if ((gs = gs_get(vdp, IS_READ(bp))) == NULL) {
2395				DPRINTF(DMA_DBG, (
2396				    "xdf@%s: get ge_slot failed\n",
2397				    ddi_get_name_addr(vdp->xdf_dip)));
2398				return (DDI_FAILURE);
2399			}
2400			vreq->v_blkno = 0;
2401			vreq->v_nslots = 1;
2402			vreq->v_gs = gs;
2403			vreq->v_flush_diskcache = FLUSH_DISKCACHE;
2404			vreq->v_status = VREQ_GS_ALLOCED;
2405			gs->vreq = vreq;
2406			return (DDI_SUCCESS);
2407		}
2408
2409		if (IS_WRITE_BARRIER(vdp, bp))
2410			vreq->v_flush_diskcache = WRITE_BARRIER;
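		/*
		 * bp->b_blkno is partition-relative; the caller stashes the
		 * partition's starting block in bp->b_private, so add it in
		 * to get the absolute block number on the virtual disk.
		 */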
2411		vreq->v_blkno = bp->b_blkno +
2412		    (diskaddr_t)(uintptr_t)bp->b_private;
2413		bp->b_private = NULL;
2414		/* See if we wrote new data to our flush block */
2415		if (!IS_READ(bp) && USE_WRITE_BARRIER(vdp))
2416			check_fbwrite(vdp, bp, vreq->v_blkno);
2417		vreq->v_status = VREQ_INIT_DONE;
2418		/*FALLTHRU*/
2419
2420	case VREQ_INIT_DONE:
2421		/*
2422		 * alloc DMA handle
2423		 */
2424		rc = ddi_dma_alloc_handle(vdp->xdf_dip, &xb_dma_attr,
2425		    xdf_dmacallback, (caddr_t)vdp, &dh);
2426		if (rc != DDI_SUCCESS) {
2427			SETDMACBON(vdp);
2428			DPRINTF(DMA_DBG, ("xdf@%s: DMA handle alloc failed\n",
2429			    ddi_get_name_addr(vdp->xdf_dip)));
2430			return (DDI_FAILURE);
2431		}
2432
2433		vreq->v_dmahdl = dh;
2434		vreq->v_status = VREQ_DMAHDL_ALLOCED;
2435		/*FALLTHRU*/
2436
2437	case VREQ_DMAHDL_ALLOCED:
2438		/*
2439		 * alloc dma handle for 512-byte aligned buf
2440		 */
2441		if (!ALIGNED_XFER(bp)) {
2442			/*
2443			 * XXPV: we need to temporarily enlarge the seg
2444			 * boundary and s/g length to work around CR6381968
2445			 */
2446			dmaattr = xb_dma_attr;
2447			dmaattr.dma_attr_seg = (uint64_t)-1;
2448			dmaattr.dma_attr_sgllen = INT_MAX;
2449			rc = ddi_dma_alloc_handle(vdp->xdf_dip, &dmaattr,
2450			    xdf_dmacallback, (caddr_t)vdp, &mdh);
2451			if (rc != DDI_SUCCESS) {
2452				SETDMACBON(vdp);
2453				DPRINTF(DMA_DBG, ("xdf@%s: unaligned buf DMA "
2454				    "handle alloc failed\n",
2455				    ddi_get_name_addr(vdp->xdf_dip)));
2456				return (DDI_FAILURE);
2457			}
2458			vreq->v_memdmahdl = mdh;
2459			vreq->v_status = VREQ_MEMDMAHDL_ALLOCED;
2460		}
2461		/*FALLTHRU*/
2462
2463	case VREQ_MEMDMAHDL_ALLOCED:
2464		/*
2465		 * alloc 512-byte aligned buf
2466		 */
2467		if (!ALIGNED_XFER(bp)) {
2468			if (bp->b_flags & (B_PAGEIO | B_PHYS))
2469				bp_mapin(bp);
2470
2471			rc = ddi_dma_mem_alloc(vreq->v_memdmahdl,
2472			    roundup(bp->b_bcount, XB_BSIZE), &xc_acc_attr,
2473			    DDI_DMA_STREAMING, xdf_dmacallback, (caddr_t)vdp,
2474			    &aba, &bufsz, &abh);
2475			if (rc != DDI_SUCCESS) {
2476				SETDMACBON(vdp);
2477				DPRINTF(DMA_DBG, (
2478				    "xdf@%s: DMA mem allocation failed\n",
2479				    ddi_get_name_addr(vdp->xdf_dip)));
2480				return (DDI_FAILURE);
2481			}
2482
2483			vreq->v_abuf = aba;
2484			vreq->v_align = abh;
2485			vreq->v_status = VREQ_DMAMEM_ALLOCED;
2486
2487			ASSERT(bufsz >= bp->b_bcount);
2488			if (!IS_READ(bp))
2489				bcopy(bp->b_un.b_addr, vreq->v_abuf,
2490				    bp->b_bcount);
2491		}
2492		/*FALLTHRU*/
2493
2494	case VREQ_DMAMEM_ALLOCED:
2495		/*
2496		 * dma bind
2497		 */
2498		if (ALIGNED_XFER(bp)) {
2499			rc = ddi_dma_buf_bind_handle(vreq->v_dmahdl, bp,
2500			    dma_flags, xdf_dmacallback, (caddr_t)vdp,
2501			    &dc, &ndcs);
2502		} else {
2503			rc = ddi_dma_addr_bind_handle(vreq->v_dmahdl,
2504			    NULL, vreq->v_abuf, bp->b_bcount, dma_flags,
2505			    xdf_dmacallback, (caddr_t)vdp, &dc, &ndcs);
2506		}
2507		if (rc == DDI_DMA_MAPPED || rc == DDI_DMA_PARTIAL_MAP) {
2508			/* get num of dma windows */
2509			if (rc == DDI_DMA_PARTIAL_MAP) {
2510				rc = ddi_dma_numwin(vreq->v_dmahdl, &ndws);
2511				ASSERT(rc == DDI_SUCCESS);
2512			} else {
2513				ndws = 1;
2514			}
2515		} else {
2516			SETDMACBON(vdp);
2517			DPRINTF(DMA_DBG, ("xdf@%s: DMA bind failed\n",
2518			    ddi_get_name_addr(vdp->xdf_dip)));
2519			return (DDI_FAILURE);
2520		}
2521
2522		vreq->v_dmac = dc;
2523		vreq->v_dmaw = 0;
2524		vreq->v_ndmacs = ndcs;
2525		vreq->v_ndmaws = ndws;
2526		vreq->v_nslots = ndws;
2527		vreq->v_status = VREQ_DMABUF_BOUND;
2528		/*FALLTHRU*/
2529
2530	case VREQ_DMABUF_BOUND:
2531		/*
2532		 * get a ge_slot; on failure gs_get() arms the free-resource
2533		 * callback if it isn't already set
2534		 */
2535		if ((gs = gs_get(vdp, IS_READ(bp))) == NULL) {
2536			DPRINTF(DMA_DBG, ("xdf@%s: get ge_slot failed\n",
2537			    ddi_get_name_addr(vdp->xdf_dip)));
2538			return (DDI_FAILURE);
2539		}
2540
2541		vreq->v_gs = gs;
2542		gs->vreq = vreq;
2543		vreq->v_status = VREQ_GS_ALLOCED;
2544		break;
2545
2546	case VREQ_GS_ALLOCED:
2547		/* nothing needs to be done */
2548		break;
2549
2550	case VREQ_DMAWIN_DONE:
2551		/*
2552		 * move to the next dma window
2553		 */
2554		ASSERT((vreq->v_dmaw + 1) < vreq->v_ndmaws);
2555
2556		/* get a ge_slot for this DMA window */
2557		if ((gs = gs_get(vdp, IS_READ(bp))) == NULL) {
2558			DPRINTF(DMA_DBG, ("xdf@%s: get ge_slot failed\n",
2559			    ddi_get_name_addr(vdp->xdf_dip)));
2560			return (DDI_FAILURE);
2561		}
2562
2563		vreq->v_gs = gs;
2564		gs->vreq = vreq;
2565		vreq->v_dmaw++;
2566		rc = ddi_dma_getwin(vreq->v_dmahdl, vreq->v_dmaw, &off, &sz,
2567		    &vreq->v_dmac, &vreq->v_ndmacs);
2568		ASSERT(rc == DDI_SUCCESS);
2569		vreq->v_status = VREQ_GS_ALLOCED;
2570		break;
2571
2572	default:
2573		return (DDI_FAILURE);
2574	}
2575
2576	return (DDI_SUCCESS);
2577}
2578
2579static ge_slot_t *
2580gs_get(xdf_t *vdp, int isread)
2581{
2582	grant_ref_t gh;
2583	ge_slot_t *gs;
2584
2585	/* first, try to alloc the GTEs needed for this slot */
2586	if (gnttab_alloc_grant_references(
2587	    BLKIF_MAX_SEGMENTS_PER_REQUEST, &gh) == -1) {
2588		if (vdp->xdf_gnt_callback.next == NULL) {
2589			SETDMACBON(vdp);
2590			gnttab_request_free_callback(
2591			    &vdp->xdf_gnt_callback,
2592			    (void (*)(void *))xdf_dmacallback,
2593			    (void *)vdp,
2594			    BLKIF_MAX_SEGMENTS_PER_REQUEST);
2595		}
2596		return (NULL);
2597	}
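	/*
	 * We now hold a batch of BLKIF_MAX_SEGMENTS_PER_REQUEST grant
	 * references headed by gh; gs_grant() claims them individually,
	 * one per data segment, as the request is built.
	 */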
2598
2599	gs = kmem_cache_alloc(xdf_gs_cache, KM_NOSLEEP);
2600	if (gs == NULL) {
2601		gnttab_free_grant_references(gh);
2602		if (vdp->xdf_timeout_id == 0)
2603			/* restart I/O after one second */
2604			vdp->xdf_timeout_id =
2605			    timeout(xdf_timeout_handler, vdp, hz);
2606		return (NULL);
2607	}
2608
2609	/* init the ge_slot */
2610	list_insert_head(&vdp->xdf_gs_act, (void *)gs);
2611	gs->oeid = vdp->xdf_peer;
2612	gs->isread = isread;
2613	gs->ghead = gh;
2614	gs->ngrefs = 0;
2615
2616	return (gs);
2617}
2618
2619static void
2620gs_free(xdf_t *vdp, ge_slot_t *gs)
2621{
2622	int i;
2623	grant_ref_t *gp = gs->ge;
2624	int ngrefs = gs->ngrefs;
2625	boolean_t isread = gs->isread;
2626
2627	list_remove(&vdp->xdf_gs_act, (void *)gs);
2628
2629	/* release all grant table entry resources used in this slot */
2630	for (i = 0; i < ngrefs; i++, gp++)
2631		gnttab_end_foreign_access(*gp, !isread, 0);
2632	gnttab_free_grant_references(gs->ghead);
2633
2634	kmem_cache_free(xdf_gs_cache, (void *)gs);
2635}
2636
2637static grant_ref_t
2638gs_grant(ge_slot_t *gs, mfn_t mfn)
2639{
2640	grant_ref_t gr = gnttab_claim_grant_reference(&gs->ghead);
2641
2642	ASSERT(gr != -1);
2643	ASSERT(gs->ngrefs < BLKIF_MAX_SEGMENTS_PER_REQUEST);
2644	gs->ge[gs->ngrefs++] = gr;
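	/*
	 * For a read the backend must write into our page, so grant it
	 * read/write access; for a write the backend only needs to read
	 * the page, so the grant is read-only (readonly == !isread).
	 */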
2645	gnttab_grant_foreign_access_ref(gr, gs->oeid, mfn, !gs->isread);
2646
2647	return (gr);
2648}
2649
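/*
 * Recover from the backend going away unexpectedly: drain any responses
 * already on the ring, release every grant table entry we hold, and move
 * the in-flight bufs back onto the waiting list so they can be reissued
 * once we reconnect.  Called with xdf_dev_lk held.
 */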
2650static void
2651unexpectedie(xdf_t *vdp)
2652{
2653	/* clean up I/Os in ring that have responses */
2654	if (xvdi_ring_has_unconsumed_responses(vdp->xdf_xb_ring)) {
2655		mutex_exit(&vdp->xdf_dev_lk);
2656		(void) xdf_intr((caddr_t)vdp);
2657		mutex_enter(&vdp->xdf_dev_lk);
2658	}
2659
2660	/* free up all grant table entries */
2661	while (!list_is_empty(&vdp->xdf_gs_act))
2662		gs_free(vdp, list_head(&vdp->xdf_gs_act));
2663
2664	/*
2665	 * move each bp back to the front of the waiting list (xdf_f_act);
2666	 * vreq_busy is updated in vreq_free()
2667	 */
2668	while (!list_is_empty(&vdp->xdf_vreq_act)) {
2669		v_req_t *vreq = list_head(&vdp->xdf_vreq_act);
2670		buf_t *bp = vreq->v_buf;
2671
2672		bp->av_back = NULL;
2673		bp->b_resid = bp->b_bcount;
2674		if (vdp->xdf_f_act == NULL) {
2675			vdp->xdf_f_act = vdp->xdf_l_act = bp;
2676		} else {
2677			/* move to the head of list */
2678			bp->av_forw = vdp->xdf_f_act;
2679			vdp->xdf_f_act = bp;
2680		}
2681		if (vdp->xdf_xdev_iostat != NULL)
2682			kstat_runq_back_to_waitq(
2683			    KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
2684		vreq_free(vdp, vreq);
2685	}
2686}
2687
2688static void
2689xdfmin(struct buf *bp)
2690{
2691	if (bp->b_bcount > xdf_maxphys)
2692		bp->b_bcount = xdf_maxphys;
2693}
2694
2695void
2696xdf_kstat_delete(dev_info_t *dip)
2697{
2698	xdf_t	*vdp = (xdf_t *)ddi_get_driver_private(dip);
2699	kstat_t	*kstat;
2700
2701	/*
2702	 * The locking order here is xdf_iostat_lk and then xdf_dev_lk.
2703	 * xdf_dev_lk is used to protect the xdf_xdev_iostat pointer
2704	 * and the contents of our kstat.  xdf_iostat_lk is used
2705	 * to protect the allocation and freeing of the actual kstat.
2706	 * xdf_dev_lk can't be used for this purpose because kstat
2707	 * readers use it to access the contents of the kstat and
2708	 * hence it can't be held when calling kstat_delete().
2709	 */
2710	mutex_enter(&vdp->xdf_iostat_lk);
2711	mutex_enter(&vdp->xdf_dev_lk);
2712
2713	if (vdp->xdf_xdev_iostat == NULL) {
2714		mutex_exit(&vdp->xdf_dev_lk);
2715		mutex_exit(&vdp->xdf_iostat_lk);
2716		return;
2717	}
2718
2719	kstat = vdp->xdf_xdev_iostat;
2720	vdp->xdf_xdev_iostat = NULL;
2721	mutex_exit(&vdp->xdf_dev_lk);
2722
2723	kstat_delete(kstat);
2724	mutex_exit(&vdp->xdf_iostat_lk);
2725}
2726
2727int
2728xdf_kstat_create(dev_info_t *dip, char *ks_module, int ks_instance)
2729{
2730	xdf_t	*vdp = (xdf_t *)ddi_get_driver_private(dip);
2731
2732	/* See comment about locking in xdf_kstat_delete(). */
2733	mutex_enter(&vdp->xdf_iostat_lk);
2734	mutex_enter(&vdp->xdf_dev_lk);
2735
2736	if (vdp->xdf_xdev_iostat != NULL) {
2737		mutex_exit(&vdp->xdf_dev_lk);
2738		mutex_exit(&vdp->xdf_iostat_lk);
2739		return (-1);
2740	}
2741
2742	if ((vdp->xdf_xdev_iostat = kstat_create(
2743	    ks_module, ks_instance, NULL, "disk",
2744	    KSTAT_TYPE_IO, 1, KSTAT_FLAG_PERSISTENT)) == NULL) {
2745		mutex_exit(&vdp->xdf_dev_lk);
2746		mutex_exit(&vdp->xdf_iostat_lk);
2747		return (-1);
2748	}
2749
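	/*
	 * kstat readers take xdf_dev_lk through ks_lock, which is why
	 * xdf_kstat_delete() must drop it before calling kstat_delete().
	 */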
2750	vdp->xdf_xdev_iostat->ks_lock = &vdp->xdf_dev_lk;
2751	kstat_install(vdp->xdf_xdev_iostat);
2752	mutex_exit(&vdp->xdf_dev_lk);
2753	mutex_exit(&vdp->xdf_iostat_lk);
2754
2755	return (0);
2756}
2757
2758#if defined(XPV_HVM_DRIVER)
2759
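/*
 * Registry mapping each xdf instance's devinfo path to its dip, so that an
 * HVM PV-disk consumer layered on top of xdf can look up and hold the
 * underlying xdf node by path (see xdf_hvm_hold()).
 */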
2760typedef struct xdf_hvm_entry {
2761	list_node_t	xdf_he_list;
2762	char		*xdf_he_path;
2763	dev_info_t	*xdf_he_dip;
2764} xdf_hvm_entry_t;
2765
2766static list_t xdf_hvm_list;
2767static kmutex_t xdf_hvm_list_lock;
2768
2769static xdf_hvm_entry_t *
2770i_xdf_hvm_find(char *path, dev_info_t *dip)
2771{
2772	xdf_hvm_entry_t	*i;
2773
2774	ASSERT((path != NULL) || (dip != NULL));
2775	ASSERT(MUTEX_HELD(&xdf_hvm_list_lock));
2776
2777	i = list_head(&xdf_hvm_list);
2778	while (i != NULL) {
2779		if ((path != NULL) && strcmp(i->xdf_he_path, path) != 0) {
2780			i = list_next(&xdf_hvm_list, i);
2781			continue;
2782		}
2783		if ((dip != NULL) && (i->xdf_he_dip != dip)) {
2784			i = list_next(&xdf_hvm_list, i);
2785			continue;
2786		}
2787		break;
2788	}
2789	return (i);
2790}
2791
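/*
 * Look up the xdf instance registered under the given devinfo path and
 * return its dip with a hold placed on it (ndi_hold_devi()); returns NULL
 * if no matching instance exists.  A rough sketch of how an HVM consumer
 * might use these exports (the path and geometry below are illustrative
 * only, not something this driver defines):
 *
 *	dev_info_t *dip = xdf_hvm_hold("/xpvd/xdf@768");
 *	if (dip != NULL) {
 *		if (xdf_hvm_connect(dip) != 0)
 *			cmn_err(CE_WARN, "pv disk failed to connect");
 *		else
 *			(void) xdf_hvm_setpgeom(dip, &my_geom);
 *		ndi_rele_devi(dip);
 *	}
 */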
2792dev_info_t *
2793xdf_hvm_hold(char *path)
2794{
2795	xdf_hvm_entry_t	*i;
2796	dev_info_t	*dip;
2797
2798	mutex_enter(&xdf_hvm_list_lock);
2799	i = i_xdf_hvm_find(path, NULL);
2800	if (i == NULL) {
2801		mutex_exit(&xdf_hvm_list_lock);
2802		return (NULL);
2803	}
2804	ndi_hold_devi(dip = i->xdf_he_dip);
2805	mutex_exit(&xdf_hvm_list_lock);
2806	return (dip);
2807}
2808
2809static void
2810xdf_hvm_add(dev_info_t *dip)
2811{
2812	xdf_hvm_entry_t	*i;
2813	char		*path;
2814
2815	/* figure out the path for the dip */
2816	path = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
2817	(void) ddi_pathname(dip, path);
2818
2819	i = kmem_alloc(sizeof (*i), KM_SLEEP);
2820	i->xdf_he_dip = dip;
2821	i->xdf_he_path = i_ddi_strdup(path, KM_SLEEP);
2822
2823	mutex_enter(&xdf_hvm_list_lock);
2824	ASSERT(i_xdf_hvm_find(path, NULL) == NULL);
2825	ASSERT(i_xdf_hvm_find(NULL, dip) == NULL);
2826	list_insert_head(&xdf_hvm_list, i);
2827	mutex_exit(&xdf_hvm_list_lock);
2828
2829	kmem_free(path, MAXPATHLEN);
2830}
2831
2832static void
2833xdf_hvm_rm(dev_info_t *dip)
2834{
2835	xdf_hvm_entry_t	*i;
2836
2837	mutex_enter(&xdf_hvm_list_lock);
2838	VERIFY((i = i_xdf_hvm_find(NULL, dip)) != NULL);
2839	list_remove(&xdf_hvm_list, i);
2840	mutex_exit(&xdf_hvm_list_lock);
2841
2842	kmem_free(i->xdf_he_path, strlen(i->xdf_he_path) + 1);
2843	kmem_free(i, sizeof (*i));
2844}
2845
2846static void
2847xdf_hvm_init(void)
2848{
2849	list_create(&xdf_hvm_list, sizeof (xdf_hvm_entry_t),
2850	    offsetof(xdf_hvm_entry_t, xdf_he_list));
2851	mutex_init(&xdf_hvm_list_lock, NULL, MUTEX_DEFAULT, NULL);
2852}
2853
2854static void
2855xdf_hvm_fini(void)
2856{
2857	ASSERT(list_head(&xdf_hvm_list) == NULL);
2858	list_destroy(&xdf_hvm_list);
2859	mutex_destroy(&xdf_hvm_list_lock);
2860}
2861
2862int
2863xdf_hvm_connect(dev_info_t *dip)
2864{
2865	xdf_t	*vdp = (xdf_t *)ddi_get_driver_private(dip);
2866	int	rv;
2867
2868	/* do cv_wait until connected or failed */
2869	mutex_enter(&vdp->xdf_dev_lk);
2870	rv = xdf_connect(vdp, B_TRUE);
2871	mutex_exit(&vdp->xdf_dev_lk);
2872	return ((rv == XD_READY) ? 0 : -1);
2873}
2874
2875int
2876xdf_hvm_setpgeom(dev_info_t *dip, cmlb_geom_t *geomp)
2877{
2878	xdf_t	*vdp = (xdf_t *)ddi_get_driver_private(dip);
2879
2880	/* sanity check the requested physical geometry */
2881	mutex_enter(&vdp->xdf_dev_lk);
2882	if ((geomp->g_secsize != XB_BSIZE) ||
2883	    (geomp->g_capacity == 0)) {
2884		mutex_exit(&vdp->xdf_dev_lk);
2885		return (EINVAL);
2886	}
2887
2888	/*
2889	 * If we've already connected to the backend device then make sure
2890	 * we're not defining a physical geometry larger than our backend
2891	 * device.
2892	 */
2893	if ((vdp->xdf_xdev_nblocks != 0) &&
2894	    (geomp->g_capacity > vdp->xdf_xdev_nblocks)) {
2895		mutex_exit(&vdp->xdf_dev_lk);
2896		return (EINVAL);
2897	}
2898
2899	vdp->xdf_pgeom = *geomp;
2900	mutex_exit(&vdp->xdf_dev_lk);
2901
2902	/* force a re-validation */
2903	cmlb_invalidate(vdp->xdf_vd_lbl, NULL);
2904
2905	return (0);
2906}
2907
2908#endif /* XPV_HVM_DRIVER */
2909