1/*      $NetBSD: xbdback_xenbus.c,v 1.55.2.1 2012/06/05 15:36:00 jdc Exp $      */
2
3/*
4 * Copyright (c) 2006 Manuel Bouyer.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 *    notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 *    notice, this list of conditions and the following disclaimer in the
13 *    documentation and/or other materials provided with the distribution.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
16 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
17 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
18 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
19 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
20 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
21 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
22 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
24 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 *
26 */
27
28#include <sys/cdefs.h>
29__KERNEL_RCSID(0, "$NetBSD: xbdback_xenbus.c,v 1.55.2.1 2012/06/05 15:36:00 jdc Exp $");
30
31#include <sys/atomic.h>
32#include <sys/buf.h>
33#include <sys/condvar.h>
34#include <sys/conf.h>
35#include <sys/disk.h>
36#include <sys/device.h>
37#include <sys/fcntl.h>
38#include <sys/kauth.h>
39#include <sys/kernel.h>
40#include <sys/kmem.h>
41#include <sys/kthread.h>
42#include <sys/malloc.h>
43#include <sys/mutex.h>
44#include <sys/param.h>
45#include <sys/queue.h>
46#include <sys/systm.h>
47#include <sys/time.h>
48#include <sys/types.h>
49#include <sys/vnode.h>
50
51#include <xen/xen.h>
52#include <xen/xen_shm.h>
53#include <xen/evtchn.h>
54#include <xen/xenbus.h>
55#include <xen/xen-public/io/protocols.h>
56
57/* #define XENDEBUG_VBD */
58#ifdef XENDEBUG_VBD
59#define XENPRINTF(x) printf x
60#else
61#define XENPRINTF(x)
62#endif
63
64#define BLKIF_RING_SIZE __RING_SIZE((blkif_sring_t *)0, PAGE_SIZE)
65
66/*
67 * Backend block device driver for Xen
68 */
69
70/* Max number of pages per request. The request may not be page aligned */
71#define BLKIF_MAX_PAGES_PER_REQUEST (BLKIF_MAX_SEGMENTS_PER_REQUEST + 1)
72
73/* Values are expressed in 512-byte sectors */
74#define VBD_BSIZE 512
75#define VBD_MAXSECT ((PAGE_SIZE / VBD_BSIZE) - 1)
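/*
 * With the usual 4 KiB page size this gives 8 sectors per page, so a
 * segment's first_sect/last_sect range from 0 to VBD_MAXSECT (7).
 */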
76
77struct xbdback_request;
78struct xbdback_io;
79struct xbdback_fragment;
80struct xbdback_instance;
81
82/*
83 * Status of an xbdback instance:
84 * WAITING: xbdback instance is connected, waiting for requests
85 * RUN: xbdi thread must be woken up, I/Os have to be processed
86 * DISCONNECTING: the instance is closing, no more I/Os can be scheduled
87 * DISCONNECTED: no I/Os, no ring, the thread should terminate.
88 */
89typedef enum {WAITING, RUN, DISCONNECTING, DISCONNECTED} xbdback_state_t;
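/*
 * Typical transitions, as implemented below: xbdback_wakeup_thread() moves
 * a WAITING instance to RUN, and the worker thread resets RUN to WAITING
 * before processing the ring; xbdback_disconnect() moves the instance to
 * DISCONNECTING, and it becomes DISCONNECTED once the last reference is
 * dropped (xbdback_finish_disconnect()).
 */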
90
91/*
92 * Each xbdback instance is managed by a single thread that handles all
93 * the I/O processing. As there are a variety of conditions that can block,
94 * everything will be done in a sort of continuation-passing style.
95 *
96 * When the execution has to block to delay processing, for example to
97 * allow the system to recover from a memory shortage (via the shared memory
98 * callback), the return value of a continuation can be set to NULL. In that
99 * case, the thread will go back to sleep and wait for the proper
100 * condition before it resumes processing requests from where it left off.
101 * Continuation state is "stored" in the xbdback instance (xbdi_cont and
102 * xbdi_cont_aux), and should only be manipulated by the instance thread.
103 *
104 * As xbdback(4) has to handle different sorts of asynchronous events (Xen
105 * event channels, biointr() soft interrupts, xenbus commands), the xbdi_lock
106 * mutex is used to protect specific elements of the xbdback instance from
107 * concurrent access: thread status and ring access (when pushing responses).
108 *
109 * Here's how the call graph is supposed to be for a single I/O:
110 *
111 * xbdback_co_main()
112 *        |
113 *        |               --> xbdback_co_cache_doflush() or NULL
114 *        |               |
115 *        |               - xbdback_co_cache_flush2() <- xbdback_co_do_io() <-
116 *        |                                            |                     |
117 *        |               |-> xbdback_co_cache_flush() -> xbdback_co_map_io()-
118 * xbdback_co_main_loop()-|
119 *        |               |-> xbdback_co_main_done() ---> xbdback_co_map_io()-
120 *        |                                           |                      |
121 *        |               -- xbdback_co_main_done2() <-- xbdback_co_do_io() <-
122 *        |               |
123 *        |               --> xbdback_co_main() or NULL
124 *        |
125 *     xbdback_co_io() -> xbdback_co_main_incr() -> xbdback_co_main_loop()
126 *        |
127 *     xbdback_co_io_gotreq()--+--> xbdback_co_map_io() ---
128 *        |                    |                          |
129 *  -> xbdback_co_io_loop()----|  <- xbdback_co_do_io() <--
130 *  |     |     |     |
131 *  |     |     |     |----------> xbdback_co_io_gotio()
132 *  |     |     |                         |
133 *  |     |   xbdback_co_main_incr()      |
134 *  |     |     |                         |
135 *  |     |   xbdback_co_main_loop()      |
136 *  |     |                               |
137 *  |  xbdback_co_io_gotio2() <-----------|
138 *  |     |           |
139 *  |     |           |----------> xbdback_co_io_gotfrag()
140 *  |     |                               |
141 *  -- xbdback_co_io_gotfrag2() <---------|
142 *        |
143 *     xbdback_co_main_incr() -> xbdback_co_main_loop()
144 */
145typedef void *(* xbdback_cont_t)(struct xbdback_instance *, void *);
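/*
 * Illustrative sketch only (hypothetical names, not part of this driver):
 * a continuation either returns an object to keep xbdback_trampoline()
 * iterating, or NULL to put the thread back to sleep until some event
 * (event channel interrupt, iodone, shm callback) wakes it up again.
 *
 *	static void *
 *	xbdback_co_example(struct xbdback_instance *xbdi, void *obj)
 *	{
 *		if (!resource_available)
 *			return NULL;	<- yield, retry once woken up
 *		xbdi->xbdi_cont = xbdback_co_next_step;
 *		return xbdi;		<- trampoline runs the next step
 *	}
 */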
146
147enum xbdi_proto {
148	XBDIP_NATIVE,
149	XBDIP_32,
150	XBDIP_64
151};
152
153/* we keep the xbdback instances in a linked list */
154struct xbdback_instance {
155	SLIST_ENTRY(xbdback_instance) next;
156	struct xenbus_device *xbdi_xbusd; /* our xenstore entry */
157	struct xenbus_watch xbdi_watch; /* to watch our store */
158	domid_t xbdi_domid;	/* attached to this domain */
159	uint32_t xbdi_handle;	/* domain-specific handle */
160	char xbdi_name[16];	/* name of this instance */
161	/* mutex that protects concurrent access to the xbdback instance */
162	kmutex_t xbdi_lock;
163	kcondvar_t xbdi_cv;	/* wait channel for thread work */
164	xbdback_state_t xbdi_status; /* thread's status */
165	/* backing device parameters */
166	dev_t xbdi_dev;
167	const struct bdevsw *xbdi_bdevsw; /* pointer to the device's bdevsw */
168	struct vnode *xbdi_vp;
169	uint64_t xbdi_size;
170	bool xbdi_ro; /* is device read-only ? */
171	/* parameters for the communication */
172	unsigned int xbdi_evtchn;
173	/* private parameters for communication */
174	blkif_back_ring_proto_t xbdi_ring;
175	enum xbdi_proto xbdi_proto;
176	grant_handle_t xbdi_ring_handle; /* to unmap the ring */
177	vaddr_t xbdi_ring_va; /* to unmap the ring */
178	/* disconnection must be postponed until all I/O is done */
179	int xbdi_refcnt;
180	/*
181	 * State for I/O processing/coalescing follows; this has to
182	 * live here instead of on the stack because of the
183	 * continuation-ness (see above).
184	 */
185	RING_IDX xbdi_req_prod; /* limit on request indices */
186	xbdback_cont_t xbdi_cont, xbdi_cont_aux;
187	SIMPLEQ_ENTRY(xbdback_instance) xbdi_on_hold; /* waiting on resources */
188	/* _request state: track requests fetched from ring */
189	struct xbdback_request *xbdi_req; /* if NULL, ignore following */
190	blkif_request_t xbdi_xen_req;
191	int xbdi_segno;
192	/* _io state: I/O associated to this instance */
193	struct xbdback_io *xbdi_io; /* if NULL, ignore next field */
194	daddr_t xbdi_next_sector;
195	uint8_t xbdi_last_fs, xbdi_this_fs; /* first sectors */
196	uint8_t xbdi_last_ls, xbdi_this_ls; /* last sectors */
197	grant_ref_t xbdi_thisgrt, xbdi_lastgrt; /* grants */
198	/* other state */
199	int xbdi_same_page; /* are we merging two segments on the same page? */
200	uint xbdi_pendingreqs; /* number of I/Os in flight */
201	int xbdi_errps; /* errors per second */
202	struct timeval xbdi_lasterr_time;    /* error time tracking */
203#ifdef DEBUG
204	struct timeval xbdi_lastfragio_time; /* fragmented I/O tracking */
205#endif
206};
207/* Manipulation of the above reference count. */
208#define xbdi_get(xbdip) atomic_inc_uint(&(xbdip)->xbdi_refcnt)
209#define xbdi_put(xbdip)                                      \
210do {                                                         \
211	if (atomic_dec_uint_nv(&(xbdip)->xbdi_refcnt) == 0)  \
212               xbdback_finish_disconnect(xbdip);             \
213} while (/* CONSTCOND */ 0)
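/*
 * A reference is held for the lifetime of the instance and for each request
 * being processed (read/write I/O or cache flush). The final xbdi_put(),
 * issued while disconnecting, calls xbdback_finish_disconnect() to mark the
 * instance DISCONNECTED and wake up xbdback_disconnect().
 */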
214
215SLIST_HEAD(, xbdback_instance) xbdback_instances;
216
217/*
218 * For each request from a guest, an xbdback_request is allocated from
219 * a pool.  This will describe the request until completion.  The
220 * request may require multiple IO operations to perform, so the
221 * per-IO information is not stored here.
222 */
223struct xbdback_request {
224	struct xbdback_instance *rq_xbdi; /* our xbd instance */
225	uint64_t rq_id;
226	int rq_iocount; /* reference count; or, number of outstanding I/O's */
227	int rq_ioerrs;
228	uint8_t rq_operation;
229};
230
231/*
232 * For each I/O operation associated with one of those requests, an
233 * xbdback_io is allocated from a pool.  It may correspond to multiple
234 * Xen disk requests, or parts of them, if several arrive at once that
235 * can be coalesced.
236 */
237struct xbdback_io {
238	/* The instance pointer is duplicated for convenience. */
239	struct xbdback_instance *xio_xbdi; /* our xbd instance */
240	uint8_t xio_operation;
241	union {
242		struct {
243			struct buf xio_buf; /* our I/O */
244			/* xbd requests involved */
245			SLIST_HEAD(, xbdback_fragment) xio_rq;
246			/* the virtual address to map the request at */
247			vaddr_t xio_vaddr;
248			/* grants to map */
249			grant_ref_t xio_gref[XENSHM_MAX_PAGES_PER_REQUEST];
250			/* grants release */
251			grant_handle_t xio_gh[XENSHM_MAX_PAGES_PER_REQUEST];
252			uint16_t xio_nrma; /* number of guest pages */
253			uint16_t xio_mapped; /* == 1: grants are mapped */
254		} xio_rw;
255		uint64_t xio_flush_id;
256	} u;
257};
258#define xio_buf		u.xio_rw.xio_buf
259#define xio_rq		u.xio_rw.xio_rq
260#define xio_vaddr	u.xio_rw.xio_vaddr
261#define xio_gref	u.xio_rw.xio_gref
262#define xio_gh		u.xio_rw.xio_gh
263#define xio_nrma	u.xio_rw.xio_nrma
264#define xio_mapped	u.xio_rw.xio_mapped
265
266#define xio_flush_id	u.xio_flush_id
267
268/*
269 * Rather than having the xbdback_io keep an array of the
270 * xbdback_requests involved, since the actual number will probably be
271 * small but might be as large as BLKIF_RING_SIZE, use a list.  This
272 * would be threaded through xbdback_request, but one of them might be
273 * part of multiple I/O's, alas.
274 */
275struct xbdback_fragment {
276	struct xbdback_request *car;
277	SLIST_ENTRY(xbdback_fragment) cdr;
278};
279
280/*
281 * Pools to manage the chains of block requests and I/O fragments
282 * submitted by the frontend.
283 */
284struct xbdback_pool {
285	struct pool_cache pc;
286	struct timeval last_warning;
287} xbdback_request_pool, xbdback_io_pool, xbdback_fragment_pool;
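/*
 * These pools are bootstrapped and primed in xbdbackattach() with enough
 * items to handle a whole ring of requests at once.
 */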
288
289SIMPLEQ_HEAD(xbdback_iqueue, xbdback_instance);
290static struct xbdback_iqueue xbdback_shmq;
291static int xbdback_shmcb; /* have we already registered a callback? */
292
293/* Interval between reports of I/O errors from frontend */
294struct timeval xbdback_err_intvl = { 1, 0 };
295
296#ifdef DEBUG
297struct timeval xbdback_fragio_intvl = { 60, 0 };
298#endif
299       void xbdbackattach(int);
300static int  xbdback_xenbus_create(struct xenbus_device *);
301static int  xbdback_xenbus_destroy(void *);
302static void xbdback_frontend_changed(void *, XenbusState);
303static void xbdback_backend_changed(struct xenbus_watch *,
304    const char **, unsigned int);
305static int  xbdback_evthandler(void *);
306
307static int  xbdback_connect(struct xbdback_instance *);
308static void xbdback_disconnect(struct xbdback_instance *);
309static void xbdback_finish_disconnect(struct xbdback_instance *);
310
311static struct xbdback_instance *xbdif_lookup(domid_t, uint32_t);
312
313static void *xbdback_co_main(struct xbdback_instance *, void *);
314static void *xbdback_co_main_loop(struct xbdback_instance *, void *);
315static void *xbdback_co_main_incr(struct xbdback_instance *, void *);
316static void *xbdback_co_main_done(struct xbdback_instance *, void *);
317static void *xbdback_co_main_done2(struct xbdback_instance *, void *);
318
319static void *xbdback_co_cache_flush(struct xbdback_instance *, void *);
320static void *xbdback_co_cache_flush2(struct xbdback_instance *, void *);
321static void *xbdback_co_cache_doflush(struct xbdback_instance *, void *);
322
323static void *xbdback_co_io(struct xbdback_instance *, void *);
324static void *xbdback_co_io_gotreq(struct xbdback_instance *, void *);
325static void *xbdback_co_io_loop(struct xbdback_instance *, void *);
326static void *xbdback_co_io_gotio(struct xbdback_instance *, void *);
327static void *xbdback_co_io_gotio2(struct xbdback_instance *, void *);
328static void *xbdback_co_io_gotfrag(struct xbdback_instance *, void *);
329static void *xbdback_co_io_gotfrag2(struct xbdback_instance *, void *);
330
331static void *xbdback_co_map_io(struct xbdback_instance *, void *);
332static void *xbdback_co_do_io(struct xbdback_instance *, void *);
333
334static void *xbdback_co_wait_shm_callback(struct xbdback_instance *, void *);
335
336static int  xbdback_shm_callback(void *);
337static void xbdback_io_error(struct xbdback_io *, int);
338static void xbdback_iodone(struct buf *);
339static void xbdback_send_reply(struct xbdback_instance *, uint64_t, int, int);
340
341static void *xbdback_map_shm(struct xbdback_io *);
342static void xbdback_unmap_shm(struct xbdback_io *);
343
344static void *xbdback_pool_get(struct xbdback_pool *,
345			      struct xbdback_instance *);
346static void xbdback_pool_put(struct xbdback_pool *, void *);
347static void xbdback_thread(void *);
348static void xbdback_wakeup_thread(struct xbdback_instance *);
349static void xbdback_trampoline(struct xbdback_instance *, void *);
350
351static struct xenbus_backend_driver xbd_backend_driver = {
352	.xbakd_create = xbdback_xenbus_create,
353	.xbakd_type = "vbd"
354};
355
356void
357xbdbackattach(int n)
358{
359	XENPRINTF(("xbdbackattach\n"));
360
361	/*
362	 * initialize the backend driver, register the control message handler
363	 * and send driver up message.
364	 */
365	SLIST_INIT(&xbdback_instances);
366	SIMPLEQ_INIT(&xbdback_shmq);
367	xbdback_shmcb = 0;
368
369	pool_cache_bootstrap(&xbdback_request_pool.pc,
370	    sizeof(struct xbdback_request), 0, 0, 0, "xbbrp", NULL,
371	    IPL_SOFTBIO, NULL, NULL, NULL);
372	pool_cache_bootstrap(&xbdback_io_pool.pc,
373	    sizeof(struct xbdback_io), 0, 0, 0, "xbbip", NULL,
374	    IPL_SOFTBIO, NULL, NULL, NULL);
375	pool_cache_bootstrap(&xbdback_fragment_pool.pc,
376	    sizeof(struct xbdback_fragment), 0, 0, 0, "xbbfp", NULL,
377	    IPL_SOFTBIO, NULL, NULL, NULL);
378
379	/* we allocate enough to handle a whole ring at once */
380	if (pool_prime(&xbdback_request_pool.pc.pc_pool, BLKIF_RING_SIZE) != 0)
381		printf("xbdback: failed to prime request pool\n");
382	if (pool_prime(&xbdback_io_pool.pc.pc_pool, BLKIF_RING_SIZE) != 0)
383		printf("xbdback: failed to prime io pool\n");
384	if (pool_prime(&xbdback_fragment_pool.pc.pc_pool,
385            BLKIF_MAX_SEGMENTS_PER_REQUEST * BLKIF_RING_SIZE) != 0)
386		printf("xbdback: failed to prime fragment pool\n");
387
388	xenbus_backend_register(&xbd_backend_driver);
389}
390
391static int
392xbdback_xenbus_create(struct xenbus_device *xbusd)
393{
394	struct xbdback_instance *xbdi;
395	long domid, handle;
396	int error, i;
397	char *ep;
398
399	if ((error = xenbus_read_ul(NULL, xbusd->xbusd_path,
400	    "frontend-id", &domid, 10)) != 0) {
401		aprint_error("xbdback: can't read %s/frontend-id: %d\n",
402		    xbusd->xbusd_path, error);
403		return error;
404	}
405
406	/*
407	 * get handle: this is the last component of the path, which is
408	 * a decimal number. $path/dev contains the device name, which is not
409	 * appropriate.
410	 */
411	for (i = strlen(xbusd->xbusd_path); i > 0; i--) {
412		if (xbusd->xbusd_path[i] == '/')
413			break;
414	}
415	if (i == 0) {
416		aprint_error("xbdback: can't parse %s\n",
417		    xbusd->xbusd_path);
418		return EFTYPE;
419	}
420	handle = strtoul(&xbusd->xbusd_path[i+1], &ep, 10);
421	if (*ep != '\0') {
422		aprint_error("xbdback: can't parse %s\n",
423		    xbusd->xbusd_path);
424		return EFTYPE;
425	}
426
427	if (xbdif_lookup(domid, handle) != NULL) {
428		return EEXIST;
429	}
430	xbdi = kmem_zalloc(sizeof(*xbdi), KM_SLEEP);
431
432	xbdi->xbdi_domid = domid;
433	xbdi->xbdi_handle = handle;
434	snprintf(xbdi->xbdi_name, sizeof(xbdi->xbdi_name), "xbdb%di%d",
435	    xbdi->xbdi_domid, xbdi->xbdi_handle);
436
437	/* initialize status and reference counter */
438	xbdi->xbdi_status = DISCONNECTED;
439	xbdi_get(xbdi);
440
441	mutex_init(&xbdi->xbdi_lock, MUTEX_DEFAULT, IPL_BIO);
442	cv_init(&xbdi->xbdi_cv, xbdi->xbdi_name);
443	SLIST_INSERT_HEAD(&xbdback_instances, xbdi, next);
444
445	xbusd->xbusd_u.b.b_cookie = xbdi;
446	xbusd->xbusd_u.b.b_detach = xbdback_xenbus_destroy;
447	xbusd->xbusd_otherend_changed = xbdback_frontend_changed;
448	xbdi->xbdi_xbusd = xbusd;
449
450	error = xenbus_watch_path2(xbusd, xbusd->xbusd_path, "physical-device",
451	    &xbdi->xbdi_watch, xbdback_backend_changed);
452	if (error) {
453		printf("failed to watch on %s/physical-device: %d\n",
454		    xbusd->xbusd_path, error);
455		goto fail;
456	}
457	xbdi->xbdi_watch.xbw_dev = xbusd;
458	error = xenbus_switch_state(xbusd, NULL, XenbusStateInitWait);
459	if (error) {
460		printf("failed to switch state on %s: %d\n",
461		    xbusd->xbusd_path, error);
462		goto fail2;
463	}
464	return 0;
465fail2:
466	unregister_xenbus_watch(&xbdi->xbdi_watch);
467fail:
468	kmem_free(xbdi, sizeof(*xbdi));
469	return error;
470}
471
472static int
473xbdback_xenbus_destroy(void *arg)
474{
475	struct xbdback_instance *xbdi = arg;
476	struct xenbus_device *xbusd = xbdi->xbdi_xbusd;
477	struct gnttab_unmap_grant_ref ungrop;
478	int err;
479
480	XENPRINTF(("xbdback_xenbus_destroy state %d\n", xbdi->xbdi_status));
481
482	xbdback_disconnect(xbdi);
483
484	/* unregister watch */
485	if (xbdi->xbdi_watch.node) {
486		unregister_xenbus_watch(&xbdi->xbdi_watch);
487		free(xbdi->xbdi_watch.node, M_DEVBUF);
488		xbdi->xbdi_watch.node = NULL;
489	}
490	/* unmap ring */
491	if (xbdi->xbdi_ring_va != 0) {
492		ungrop.host_addr = xbdi->xbdi_ring_va;
493		ungrop.handle = xbdi->xbdi_ring_handle;
494		ungrop.dev_bus_addr = 0;
495		err = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
496		    &ungrop, 1);
497		if (err)
498		    printf("xbdback %s: unmap_grant_ref failed: %d\n",
499			xbusd->xbusd_otherend, err);
500		uvm_km_free(kernel_map, xbdi->xbdi_ring_va,
501		    PAGE_SIZE, UVM_KMF_VAONLY);
502	}
503	/* close device */
504	if (xbdi->xbdi_size) {
505		const char *name;
506		struct dkwedge_info wi;
507		if (getdiskinfo(xbdi->xbdi_vp, &wi) == 0)
508			name = wi.dkw_devname;
509		else
510			name = "*unknown*";
511		printf("xbd backend: detach device %s for domain %d\n",
512		    name, xbdi->xbdi_domid);
513		vn_close(xbdi->xbdi_vp, FREAD, NOCRED);
514	}
515	SLIST_REMOVE(&xbdback_instances, xbdi, xbdback_instance, next);
516	mutex_destroy(&xbdi->xbdi_lock);
517	cv_destroy(&xbdi->xbdi_cv);
518	kmem_free(xbdi, sizeof(*xbdi));
519	return 0;
520}
521
522static int
523xbdback_connect(struct xbdback_instance *xbdi)
524{
525	int len, err;
526	struct gnttab_map_grant_ref grop;
527	struct gnttab_unmap_grant_ref ungrop;
528	evtchn_op_t evop;
529	u_long ring_ref, revtchn;
530	char *xsproto;
531	const char *proto;
532	struct xenbus_device *xbusd = xbdi->xbdi_xbusd;
533
534	XENPRINTF(("xbdback %s: connect\n", xbusd->xbusd_path));
535	/* read communication information */
536	err = xenbus_read_ul(NULL, xbusd->xbusd_otherend,
537	    "ring-ref", &ring_ref, 10);
538	if (err) {
539		xenbus_dev_fatal(xbusd, err, "reading %s/ring-ref",
540		    xbusd->xbusd_otherend);
541		return -1;
542	}
543	XENPRINTF(("xbdback %s: connect ring-ref %lu\n", xbusd->xbusd_path, ring_ref));
544	err = xenbus_read_ul(NULL, xbusd->xbusd_otherend,
545	    "event-channel", &revtchn, 10);
546	if (err) {
547		xenbus_dev_fatal(xbusd, err, "reading %s/event-channel",
548		    xbusd->xbusd_otherend);
549		return -1;
550	}
551	XENPRINTF(("xbdback %s: connect revtchn %lu\n", xbusd->xbusd_path, revtchn));
552	err = xenbus_read(NULL, xbusd->xbusd_otherend, "protocol",
553	    &len, &xsproto);
554	if (err) {
555		xbdi->xbdi_proto = XBDIP_NATIVE;
556		proto = "unspecified";
557		XENPRINTF(("xbdback %s: connect no xsproto\n", xbusd->xbusd_path));
558	} else {
559		XENPRINTF(("xbdback %s: connect xsproto %s\n", xbusd->xbusd_path, xsproto));
560		if (strcmp(xsproto, XEN_IO_PROTO_ABI_NATIVE) == 0) {
561			xbdi->xbdi_proto = XBDIP_NATIVE;
562			proto = XEN_IO_PROTO_ABI_NATIVE;
563		} else if (strcmp(xsproto, XEN_IO_PROTO_ABI_X86_32) == 0) {
564			xbdi->xbdi_proto = XBDIP_32;
565			proto = XEN_IO_PROTO_ABI_X86_32;
566		} else if (strcmp(xsproto, XEN_IO_PROTO_ABI_X86_64) == 0) {
567			xbdi->xbdi_proto = XBDIP_64;
568			proto = XEN_IO_PROTO_ABI_X86_64;
569		} else {
570			aprint_error("xbd domain %d: unknown proto %s\n",
571			    xbdi->xbdi_domid, xsproto);
572			free(xsproto, M_DEVBUF);
573			return -1;
574		}
575		free(xsproto, M_DEVBUF);
576	}
577
578	/* allocate VA space and map rings */
579	xbdi->xbdi_ring_va = uvm_km_alloc(kernel_map, PAGE_SIZE, 0,
580	    UVM_KMF_VAONLY);
581	if (xbdi->xbdi_ring_va == 0) {
582		xenbus_dev_fatal(xbusd, ENOMEM,
583		    "can't get VA for ring", xbusd->xbusd_otherend);
584		return -1;
585	}
586	XENPRINTF(("xbdback %s: connect va 0x%" PRIxVADDR "\n", xbusd->xbusd_path, xbdi->xbdi_ring_va));
587
588	grop.host_addr = xbdi->xbdi_ring_va;
589	grop.flags = GNTMAP_host_map;
590	grop.ref = ring_ref;
591	grop.dom = xbdi->xbdi_domid;
592	err = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
593	    &grop, 1);
594	if (err || grop.status) {
595		aprint_error("xbdback %s: can't map grant ref: %d/%d\n",
596		    xbusd->xbusd_path, err, grop.status);
597		xenbus_dev_fatal(xbusd, EINVAL,
598		    "can't map ring", xbusd->xbusd_otherend);
599		goto err;
600	}
601	xbdi->xbdi_ring_handle = grop.handle;
602	XENPRINTF(("xbdback %s: connect grhandle %d\n", xbusd->xbusd_path, grop.handle));
603
604	switch(xbdi->xbdi_proto) {
605	case XBDIP_NATIVE:
606	{
607		blkif_sring_t *sring = (void *)xbdi->xbdi_ring_va;
608		BACK_RING_INIT(&xbdi->xbdi_ring.ring_n, sring, PAGE_SIZE);
609		break;
610	}
611	case XBDIP_32:
612	{
613		blkif_x86_32_sring_t *sring = (void *)xbdi->xbdi_ring_va;
614		BACK_RING_INIT(&xbdi->xbdi_ring.ring_32, sring, PAGE_SIZE);
615		break;
616	}
617	case XBDIP_64:
618	{
619		blkif_x86_64_sring_t *sring = (void *)xbdi->xbdi_ring_va;
620		BACK_RING_INIT(&xbdi->xbdi_ring.ring_64, sring, PAGE_SIZE);
621		break;
622	}
623	}
624
625	evop.cmd = EVTCHNOP_bind_interdomain;
626	evop.u.bind_interdomain.remote_dom = xbdi->xbdi_domid;
627	evop.u.bind_interdomain.remote_port = revtchn;
628	err = HYPERVISOR_event_channel_op(&evop);
629	if (err) {
630		aprint_error("blkback %s: "
631		    "can't get event channel: %d\n",
632		    xbusd->xbusd_otherend, err);
633		xenbus_dev_fatal(xbusd, err,
634		    "can't bind event channel", xbusd->xbusd_otherend);
635		goto err2;
636	}
637	xbdi->xbdi_evtchn = evop.u.bind_interdomain.local_port;
638	XENPRINTF(("xbdback %s: connect evchannel %d\n", xbusd->xbusd_path, xbdi->xbdi_evtchn));
639
640	event_set_handler(xbdi->xbdi_evtchn, xbdback_evthandler,
641	    xbdi, IPL_BIO, xbdi->xbdi_name);
642	aprint_verbose("xbd backend domain %d handle %#x (%d) "
643	    "using event channel %d, protocol %s\n", xbdi->xbdi_domid,
644	    xbdi->xbdi_handle, xbdi->xbdi_handle, xbdi->xbdi_evtchn, proto);
645
646	/* enable the xbdback event handler machinery */
647	xbdi->xbdi_status = WAITING;
648	hypervisor_enable_event(xbdi->xbdi_evtchn);
649	hypervisor_notify_via_evtchn(xbdi->xbdi_evtchn);
650
651	if (kthread_create(IPL_NONE, KTHREAD_MPSAFE, NULL,
652	    xbdback_thread, xbdi, NULL, "%s", xbdi->xbdi_name) == 0)
653		return 0;
654
655err2:
656	/* unmap ring */
657	ungrop.host_addr = xbdi->xbdi_ring_va;
658	ungrop.handle = xbdi->xbdi_ring_handle;
659	ungrop.dev_bus_addr = 0;
660	err = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
661	    &ungrop, 1);
662	if (err)
663	    aprint_error("xbdback %s: unmap_grant_ref failed: %d\n",
664		xbusd->xbusd_path, err);
665
666err:
667	/* free ring VA space */
668	uvm_km_free(kernel_map, xbdi->xbdi_ring_va, PAGE_SIZE, UVM_KMF_VAONLY);
669	return -1;
670}
671
672/*
673 * Signal an xbdback thread to disconnect. Done in 'xenwatch' thread context.
674 */
675static void
676xbdback_disconnect(struct xbdback_instance *xbdi)
677{
678
679	mutex_enter(&xbdi->xbdi_lock);
680	if (xbdi->xbdi_status == DISCONNECTED) {
681		mutex_exit(&xbdi->xbdi_lock);
682		return;
683	}
684	hypervisor_mask_event(xbdi->xbdi_evtchn);
685	event_remove_handler(xbdi->xbdi_evtchn, xbdback_evthandler,
686	    xbdi);
687
688	/* signal thread that we want to disconnect, then wait for it */
689	xbdi->xbdi_status = DISCONNECTING;
690	cv_signal(&xbdi->xbdi_cv);
691
692	while (xbdi->xbdi_status != DISCONNECTED)
693		cv_wait(&xbdi->xbdi_cv, &xbdi->xbdi_lock);
694
695	mutex_exit(&xbdi->xbdi_lock);
696
697	xenbus_switch_state(xbdi->xbdi_xbusd, NULL, XenbusStateClosing);
698}
699
700static void
701xbdback_frontend_changed(void *arg, XenbusState new_state)
702{
703	struct xbdback_instance *xbdi = arg;
704	struct xenbus_device *xbusd = xbdi->xbdi_xbusd;
705
706	XENPRINTF(("xbdback %s: new state %d\n", xbusd->xbusd_path, new_state));
707	switch(new_state) {
708	case XenbusStateInitialising:
709		break;
710	case XenbusStateInitialised:
711	case XenbusStateConnected:
712		if (xbdi->xbdi_status == WAITING || xbdi->xbdi_status == RUN)
713			break;
714		xbdback_connect(xbdi);
715		break;
716	case XenbusStateClosing:
717		xbdback_disconnect(xbdi);
718		break;
719	case XenbusStateClosed:
720		/* otherend_changed() should handle it for us */
721		panic("xbdback_frontend_changed: closed\n");
722	case XenbusStateUnknown:
723	case XenbusStateInitWait:
724	default:
725		aprint_error("xbdback %s: invalid frontend state %d\n",
726		    xbusd->xbusd_path, new_state);
727	}
728	return;
729}
730
731static void
732xbdback_backend_changed(struct xenbus_watch *watch,
733    const char **vec, unsigned int len)
734{
735	struct xenbus_device *xbusd = watch->xbw_dev;
736	struct xbdback_instance *xbdi = xbusd->xbusd_u.b.b_cookie;
737	int err;
738	long dev;
739	char *mode;
740	struct xenbus_transaction *xbt;
741	const char *devname;
742	int major;
743
744	err = xenbus_read_ul(NULL, xbusd->xbusd_path, "physical-device",
745	    &dev, 10);
746	/*
747	 * An error can occur here, as the watch can fire just after being
748	 * registered. So we have to ignore the error :(
749	 */
750	if (err)
751		return;
752	/*
753	 * The watch can also fire after the device has been opened; don't
754	 * try to open it twice.
755	 */
756	if (xbdi->xbdi_vp != NULL) {
757		if (xbdi->xbdi_status == WAITING || xbdi->xbdi_status == RUN) {
758			if (xbdi->xbdi_dev != dev) {
759				printf("xbdback %s: changing physical device "
760				    "from %#"PRIx64" to %#lx not supported\n",
761				    xbusd->xbusd_path, xbdi->xbdi_dev, dev);
762			}
763		}
764		return;
765	}
766	xbdi->xbdi_dev = dev;
767	err = xenbus_read(NULL, xbusd->xbusd_path, "mode", NULL, &mode);
768	if (err) {
769		printf("xbdback: failed to read %s/mode: %d\n",
770		    xbusd->xbusd_path, err);
771		return;
772	}
773	if (mode[0] == 'w')
774		xbdi->xbdi_ro = false;
775	else
776		xbdi->xbdi_ro = true;
777	free(mode, M_DEVBUF);
778	major = major(xbdi->xbdi_dev);
779	devname = devsw_blk2name(major);
780	if (devname == NULL) {
781		printf("xbdback %s: unknown device 0x%"PRIx64"\n",
782		    xbusd->xbusd_path, xbdi->xbdi_dev);
783		return;
784	}
785	xbdi->xbdi_bdevsw = bdevsw_lookup(xbdi->xbdi_dev);
786	if (xbdi->xbdi_bdevsw == NULL) {
787		printf("xbdback %s: no bdevsw for device 0x%"PRIx64"\n",
788		    xbusd->xbusd_path, xbdi->xbdi_dev);
789		return;
790	}
791	err = bdevvp(xbdi->xbdi_dev, &xbdi->xbdi_vp);
792	if (err) {
793		printf("xbdback %s: can't open device 0x%"PRIx64": %d\n",
794		    xbusd->xbusd_path, xbdi->xbdi_dev, err);
795		return;
796	}
797	err = vn_lock(xbdi->xbdi_vp, LK_EXCLUSIVE | LK_RETRY);
798	if (err) {
799		printf("xbdback %s: can't vn_lock device 0x%"PRIx64": %d\n",
800		    xbusd->xbusd_path, xbdi->xbdi_dev, err);
801		vrele(xbdi->xbdi_vp);
802		return;
803	}
804	err = VOP_OPEN(xbdi->xbdi_vp, FREAD, NOCRED);
805	if (err) {
806		printf("xbdback %s: can't VOP_OPEN device 0x%"PRIx64": %d\n",
807		    xbusd->xbusd_path, xbdi->xbdi_dev, err);
808		vput(xbdi->xbdi_vp);
809		return;
810	}
811	VOP_UNLOCK(xbdi->xbdi_vp);
812
813	/* dk device; get wedge data */
814	struct dkwedge_info wi;
815	if ((err = getdiskinfo(xbdi->xbdi_vp, &wi)) == 0) {
816		xbdi->xbdi_size = wi.dkw_size;
817		printf("xbd backend: attach device %s (size %" PRIu64 ") "
818		    "for domain %d\n", wi.dkw_devname, xbdi->xbdi_size,
819		    xbdi->xbdi_domid);
820	} else {
821		/* If getdiskinfo() failed, set the device size to 0 and return */
822		printf("xbdback %s: can't DIOCGWEDGEINFO device "
823		    "0x%"PRIx64": %d\n", xbusd->xbusd_path,
824		    xbdi->xbdi_dev, err);
825		xbdi->xbdi_size = xbdi->xbdi_dev = 0;
826		vn_close(xbdi->xbdi_vp, FREAD, NOCRED);
827		xbdi->xbdi_vp = NULL;
828		return;
829	}
830again:
831	xbt = xenbus_transaction_start();
832	if (xbt == NULL) {
833		printf("xbdback %s: can't start transaction\n",
834		    xbusd->xbusd_path);
835		return;
836	}
837	err = xenbus_printf(xbt, xbusd->xbusd_path, "sectors", "%" PRIu64 ,
838	    xbdi->xbdi_size);
839	if (err) {
840		printf("xbdback: failed to write %s/sectors: %d\n",
841		    xbusd->xbusd_path, err);
842		goto abort;
843	}
844	err = xenbus_printf(xbt, xbusd->xbusd_path, "info", "%u",
845	    xbdi->xbdi_ro ? VDISK_READONLY : 0);
846	if (err) {
847		printf("xbdback: failed to write %s/info: %d\n",
848		    xbusd->xbusd_path, err);
849		goto abort;
850	}
851	err = xenbus_printf(xbt, xbusd->xbusd_path, "sector-size", "%lu",
852	    (u_long)DEV_BSIZE);
853	if (err) {
854		printf("xbdback: failed to write %s/sector-size: %d\n",
855		    xbusd->xbusd_path, err);
856		goto abort;
857	}
858	err = xenbus_printf(xbt, xbusd->xbusd_path, "feature-flush-cache",
859	    "%u", 1);
860	if (err) {
861		printf("xbdback: failed to write %s/feature-flush-cache: %d\n",
862		    xbusd->xbusd_path, err);
863		goto abort;
864	}
865	err = xenbus_transaction_end(xbt, 0);
866	if (err == EAGAIN)
867		goto again;
868	if (err) {
869		printf("xbdback %s: can't end transaction: %d\n",
870		    xbusd->xbusd_path, err);
871	}
872	err = xenbus_switch_state(xbusd, NULL, XenbusStateConnected);
873	if (err) {
874		printf("xbdback %s: can't switch state: %d\n",
875		    xbusd->xbusd_path, err);
876	}
877	return;
878abort:
879	xenbus_transaction_end(xbt, 1);
880}
881
882/*
883 * Used by an xbdi thread to signal that it is now disconnected.
884 */
885static void
886xbdback_finish_disconnect(struct xbdback_instance *xbdi)
887{
888	KASSERT(mutex_owned(&xbdi->xbdi_lock));
889	KASSERT(xbdi->xbdi_status == DISCONNECTING);
890
891	xbdi->xbdi_status = DISCONNECTED;
892
893	cv_signal(&xbdi->xbdi_cv);
894}
895
896static struct xbdback_instance *
897xbdif_lookup(domid_t dom, uint32_t handle)
898{
899	struct xbdback_instance *xbdi;
900
901	SLIST_FOREACH(xbdi, &xbdback_instances, next) {
902		if (xbdi->xbdi_domid == dom && xbdi->xbdi_handle == handle)
903			return xbdi;
904	}
905	return NULL;
906}
907
908static int
909xbdback_evthandler(void *arg)
910{
911	struct xbdback_instance *xbdi = arg;
912
913	XENPRINTF(("xbdback_evthandler domain %d: cont %p\n",
914	    xbdi->xbdi_domid, xbdi->xbdi_cont));
915
916	xbdback_wakeup_thread(xbdi);
917
918	return 1;
919}
920
921/*
922 * Main thread routine for one xbdback instance. Woken up by
923 * xbdback_evthandler when a domain has I/O work scheduled in an I/O ring.
924 */
925static void
926xbdback_thread(void *arg)
927{
928	struct xbdback_instance *xbdi = arg;
929
930	for (;;) {
931		mutex_enter(&xbdi->xbdi_lock);
932		switch (xbdi->xbdi_status) {
933		case WAITING:
934			cv_wait(&xbdi->xbdi_cv, &xbdi->xbdi_lock);
935			mutex_exit(&xbdi->xbdi_lock);
936			break;
937		case RUN:
938			xbdi->xbdi_status = WAITING; /* reset state */
939			mutex_exit(&xbdi->xbdi_lock);
940
941			if (xbdi->xbdi_cont == NULL) {
942				xbdi->xbdi_cont = xbdback_co_main;
943			}
944
945			xbdback_trampoline(xbdi, xbdi);
946			break;
947		case DISCONNECTING:
948			if (xbdi->xbdi_pendingreqs > 0) {
949				/* there are pending I/Os. Wait for them. */
950				cv_wait(&xbdi->xbdi_cv, &xbdi->xbdi_lock);
951				mutex_exit(&xbdi->xbdi_lock);
952				break;
953			}
954
955			/* All I/Os should have been processed by now,
956			 * xbdi_refcnt should drop to 0 */
957			xbdi_put(xbdi);
958			KASSERT(xbdi->xbdi_refcnt == 0);
959			mutex_exit(&xbdi->xbdi_lock);
960			kthread_exit(0);
961			break;
962		default:
963			panic("%s: invalid state %d",
964			    xbdi->xbdi_name, xbdi->xbdi_status);
965		}
966	}
967}
968
969static void *
970xbdback_co_main(struct xbdback_instance *xbdi, void *obj)
971{
972	(void)obj;
973
974	xbdi->xbdi_req_prod = xbdi->xbdi_ring.ring_n.sring->req_prod;
975	xen_rmb(); /* ensure we see all requests up to req_prod */
976	/*
977	 * Note that we'll eventually get a full ring of requests;
978	 * in that case, MASK_BLKIF_IDX(req_cons) == MASK_BLKIF_IDX(req_prod).
979	 */
980	xbdi->xbdi_cont = xbdback_co_main_loop;
981	return xbdi;
982}
983
984/*
985 * Fetch a blkif request from the ring, and pass control to the appropriate
986 * continuation.
987 * If someone asked for disconnection, do not fetch any more request from
988 * the ring.
989 */
990static void *
991xbdback_co_main_loop(struct xbdback_instance *xbdi, void *obj)
992{
993	blkif_request_t *req;
994	blkif_x86_32_request_t *req32;
995	blkif_x86_64_request_t *req64;
996
997	(void)obj;
998	req = &xbdi->xbdi_xen_req;
999	if (xbdi->xbdi_ring.ring_n.req_cons != xbdi->xbdi_req_prod) {
1000		switch(xbdi->xbdi_proto) {
1001		case XBDIP_NATIVE:
1002			memcpy(req, RING_GET_REQUEST(&xbdi->xbdi_ring.ring_n,
1003			    xbdi->xbdi_ring.ring_n.req_cons),
1004			    sizeof(blkif_request_t));
1005			break;
1006		case XBDIP_32:
1007			req32 = RING_GET_REQUEST(&xbdi->xbdi_ring.ring_32,
1008			    xbdi->xbdi_ring.ring_n.req_cons);
1009			req->operation = req32->operation;
1010			req->nr_segments = req32->nr_segments;
1011			req->handle = req32->handle;
1012			req->id = req32->id;
1013			req->sector_number = req32->sector_number;
1014			break;
1015
1016		case XBDIP_64:
1017			req64 = RING_GET_REQUEST(&xbdi->xbdi_ring.ring_64,
1018			    xbdi->xbdi_ring.ring_n.req_cons);
1019			req->operation = req64->operation;
1020			req->nr_segments = req64->nr_segments;
1021			req->handle = req64->handle;
1022			req->id = req64->id;
1023			req->sector_number = req64->sector_number;
1024			break;
1025		}
1026		XENPRINTF(("xbdback op %d req_cons 0x%x req_prod 0x%x "
1027		    "resp_prod 0x%x id %" PRIu64 "\n", req->operation,
1028			xbdi->xbdi_ring.ring_n.req_cons,
1029			xbdi->xbdi_req_prod,
1030			xbdi->xbdi_ring.ring_n.rsp_prod_pvt,
1031			req->id));
1032		switch(req->operation) {
1033		case BLKIF_OP_READ:
1034		case BLKIF_OP_WRITE:
1035			xbdi->xbdi_cont = xbdback_co_io;
1036			break;
1037		case BLKIF_OP_FLUSH_DISKCACHE:
1038			xbdi_get(xbdi);
1039			xbdi->xbdi_cont = xbdback_co_cache_flush;
1040			break;
1041		default:
1042			if (ratecheck(&xbdi->xbdi_lasterr_time,
1043			    &xbdback_err_intvl)) {
1044				printf("%s: unknown operation %d\n",
1045				    xbdi->xbdi_name, req->operation);
1046			}
1047			xbdback_send_reply(xbdi, req->id, req->operation,
1048			    BLKIF_RSP_ERROR);
1049			xbdi->xbdi_cont = xbdback_co_main_incr;
1050			break;
1051		}
1052	} else {
1053		xbdi->xbdi_cont = xbdback_co_main_done;
1054	}
1055	return xbdi;
1056}
1057
1058/*
1059 * Increment consumer index and move on to the next request. In case
1060 * we want to disconnect, leave continuation now.
1061 */
1062static void *
1063xbdback_co_main_incr(struct xbdback_instance *xbdi, void *obj)
1064{
1065	(void)obj;
1066	blkif_back_ring_t *ring = &xbdi->xbdi_ring.ring_n;
1067
1068	ring->req_cons++;
1069
1070	/*
1071	 * Do not bother with locking here when checking for xbdi_status: if
1072	 * we get a transient state, we will get the right value at
1073	 * the next increment.
1074	 */
1075	if (xbdi->xbdi_status == DISCONNECTING)
1076		xbdi->xbdi_cont = NULL;
1077	else
1078		xbdi->xbdi_cont = xbdback_co_main_loop;
1079
1080	/*
1081	 * Each time the thread processes a full ring of requests, give
1082	 * a chance to other threads to process I/Os too
1083	 */
1084	if ((ring->req_cons % BLKIF_RING_SIZE) == 0)
1085		yield();
1086
1087	return xbdi;
1088}
1089
1090/*
1091 * Ring processing is over. If there are any I/Os still pending for this
1092 * instance, handle them first.
1093 */
1094static void *
1095xbdback_co_main_done(struct xbdback_instance *xbdi, void *obj)
1096{
1097	(void)obj;
1098	if (xbdi->xbdi_io != NULL) {
1099		KASSERT(xbdi->xbdi_io->xio_operation == BLKIF_OP_READ ||
1100		    xbdi->xbdi_io->xio_operation == BLKIF_OP_WRITE);
1101		xbdi->xbdi_cont = xbdback_co_map_io;
1102		xbdi->xbdi_cont_aux = xbdback_co_main_done2;
1103	} else {
1104		xbdi->xbdi_cont = xbdback_co_main_done2;
1105	}
1106	return xbdi;
1107}
1108
1109/*
1110 * Check for requests in the instance's ring. In case there are, start again
1111 * from the beginning. If not, stall.
1112 */
1113static void *
1114xbdback_co_main_done2(struct xbdback_instance *xbdi, void *obj)
1115{
1116	int work_to_do;
1117
1118	RING_FINAL_CHECK_FOR_REQUESTS(&xbdi->xbdi_ring.ring_n, work_to_do);
1119	if (work_to_do)
1120		xbdi->xbdi_cont = xbdback_co_main;
1121	else
1122		xbdi->xbdi_cont = NULL;
1123
1124	return xbdi;
1125}
1126
1127/*
1128 * Frontend requested a cache flush operation.
1129 */
1130static void *
1131xbdback_co_cache_flush(struct xbdback_instance *xbdi, void *obj)
1132{
1133	(void)obj;
1134
1135	XENPRINTF(("xbdback_co_cache_flush %p %p\n", xbdi, obj));
1136	if (xbdi->xbdi_io != NULL) {
1137		/* Some I/Os are required for this instance. Process them. */
1138		KASSERT(xbdi->xbdi_io->xio_operation == BLKIF_OP_READ ||
1139		    xbdi->xbdi_io->xio_operation == BLKIF_OP_WRITE);
1140		KASSERT(xbdi->xbdi_pendingreqs > 0);
1141		xbdi->xbdi_cont = xbdback_co_map_io;
1142		xbdi->xbdi_cont_aux = xbdback_co_cache_flush2;
1143	} else {
1144		xbdi->xbdi_cont = xbdback_co_cache_flush2;
1145	}
1146	return xbdi;
1147}
1148
1149static void *
1150xbdback_co_cache_flush2(struct xbdback_instance *xbdi, void *obj)
1151{
1152	(void)obj;
1153	XENPRINTF(("xbdback_co_cache_flush2 %p %p\n", xbdi, obj));
1154	if (xbdi->xbdi_pendingreqs > 0) {
1155		/*
1156		 * There are pending requests.
1157		 * Event or iodone() will restart processing
1158		 */
1159		xbdi->xbdi_cont = NULL;
1160		xbdi_put(xbdi);
1161		return NULL;
1162	}
1163	xbdi->xbdi_cont = xbdback_co_cache_doflush;
1164	return xbdback_pool_get(&xbdback_io_pool, xbdi);
1165}
1166
1167/* Start the flush work */
1168static void *
1169xbdback_co_cache_doflush(struct xbdback_instance *xbdi, void *obj)
1170{
1171	struct xbdback_io *xbd_io;
1172
1173	XENPRINTF(("xbdback_co_cache_doflush %p %p\n", xbdi, obj));
1174	xbd_io = xbdi->xbdi_io = obj;
1175	xbd_io->xio_xbdi = xbdi;
1176	xbd_io->xio_operation = xbdi->xbdi_xen_req.operation;
1177	xbd_io->xio_flush_id = xbdi->xbdi_xen_req.id;
1178	xbdi->xbdi_cont = xbdback_co_do_io;
1179	return xbdi;
1180}
1181
1182/*
1183 * A read or write I/O request must be processed. Do some checks first,
1184 * then get the segment information directly from the ring request.
1185 */
1186static void *
1187xbdback_co_io(struct xbdback_instance *xbdi, void *obj)
1188{
1189	int i, error;
1190	blkif_request_t *req;
1191	blkif_x86_32_request_t *req32;
1192	blkif_x86_64_request_t *req64;
1193
1194	(void)obj;
1195
1196	/* some sanity checks */
1197	req = &xbdi->xbdi_xen_req;
1198	if (req->nr_segments < 1 ||
1199	    req->nr_segments > BLKIF_MAX_SEGMENTS_PER_REQUEST) {
1200		if (ratecheck(&xbdi->xbdi_lasterr_time,
1201		    &xbdback_err_intvl)) {
1202			printf("%s: invalid number of segments: %d\n",
1203			       xbdi->xbdi_name,
1204			       xbdi->xbdi_xen_req.nr_segments);
1205		}
1206		error = EINVAL;
1207		goto end;
1208	}
1209
1210	KASSERT(req->operation == BLKIF_OP_READ ||
1211	    req->operation == BLKIF_OP_WRITE);
1212	if (req->operation == BLKIF_OP_WRITE) {
1213		if (xbdi->xbdi_ro) {
1214			error = EROFS;
1215			goto end;
1216		}
1217	}
1218
1219	xbdi->xbdi_segno = 0;
1220
1221	/* copy request segments */
1222	switch(xbdi->xbdi_proto) {
1223	case XBDIP_NATIVE:
1224		/* already copied in xbdback_co_main_loop */
1225		break;
1226	case XBDIP_32:
1227		req32 = RING_GET_REQUEST(&xbdi->xbdi_ring.ring_32,
1228		    xbdi->xbdi_ring.ring_n.req_cons);
1229		for (i = 0; i < req->nr_segments; i++)
1230			req->seg[i] = req32->seg[i];
1231		break;
1232	case XBDIP_64:
1233		req64 = RING_GET_REQUEST(&xbdi->xbdi_ring.ring_64,
1234		    xbdi->xbdi_ring.ring_n.req_cons);
1235		for (i = 0; i < req->nr_segments; i++)
1236			req->seg[i] = req64->seg[i];
1237		break;
1238	}
1239
1240	xbdi->xbdi_cont = xbdback_co_io_gotreq;
1241	return xbdback_pool_get(&xbdback_request_pool, xbdi);
1242
1243 end:
1244	xbdback_send_reply(xbdi, xbdi->xbdi_xen_req.id,
1245	    xbdi->xbdi_xen_req.operation, error);
1246	xbdi->xbdi_cont = xbdback_co_main_incr;
1247	return xbdi;
1248}
1249
1250/*
1251 * We have fetched segment requests from the ring. In case there are already
1252 * I/Os prepared for this instance, we can try coalescing the requests
1253 * with these I/Os.
1254 */
1255static void *
1256xbdback_co_io_gotreq(struct xbdback_instance *xbdi, void *obj)
1257{
1258	struct xbdback_request *xrq;
1259
1260	xrq = xbdi->xbdi_req = obj;
1261
1262	xrq->rq_xbdi = xbdi;
1263	xrq->rq_iocount = 0;
1264	xrq->rq_ioerrs = 0;
1265	xrq->rq_id = xbdi->xbdi_xen_req.id;
1266	xrq->rq_operation = xbdi->xbdi_xen_req.operation;
1267	KASSERT(xbdi->xbdi_req->rq_operation == BLKIF_OP_READ ||
1268	    xbdi->xbdi_req->rq_operation == BLKIF_OP_WRITE);
1269
1270	/*
1271	 * Request-level reasons not to coalesce: different device,
1272	 * different op, or noncontiguous disk sectors (vs. previous
1273	 * request handed to us).
1274	 */
1275	xbdi->xbdi_cont = xbdback_co_io_loop;
1276	if (xbdi->xbdi_io != NULL) {
1277		struct xbdback_request *last_req;
1278		last_req = SLIST_FIRST(&xbdi->xbdi_io->xio_rq)->car;
1279		XENPRINTF(("xbdback_io domain %d: hoping for sector %" PRIu64
1280		    "; got %" PRIu64 "\n", xbdi->xbdi_domid,
1281		    xbdi->xbdi_next_sector,
1282		    xbdi->xbdi_xen_req.sector_number));
1283		if ((xrq->rq_operation != last_req->rq_operation)
1284		    || (xbdi->xbdi_xen_req.sector_number !=
1285		    xbdi->xbdi_next_sector)) {
1286			XENPRINTF(("xbdback_io domain %d: segment break\n",
1287			    xbdi->xbdi_domid));
1288			xbdi->xbdi_next_sector =
1289			    xbdi->xbdi_xen_req.sector_number;
1290			KASSERT(xbdi->xbdi_io->xio_operation == BLKIF_OP_READ ||
1291			    xbdi->xbdi_io->xio_operation == BLKIF_OP_WRITE);
1292			xbdi->xbdi_cont_aux = xbdback_co_io_loop;
1293			xbdi->xbdi_cont = xbdback_co_map_io;
1294		}
1295	} else {
1296		xbdi->xbdi_next_sector = xbdi->xbdi_xen_req.sector_number;
1297	}
1298	return xbdi;
1299}
1300
1301/* Handle coalescing of multiple segment requests into one I/O operation */
1302static void *
1303xbdback_co_io_loop(struct xbdback_instance *xbdi, void *obj)
1304{
1305	(void)obj;
1306	KASSERT(xbdi->xbdi_req->rq_operation == BLKIF_OP_READ ||
1307	    xbdi->xbdi_req->rq_operation == BLKIF_OP_WRITE);
1308	if (xbdi->xbdi_segno < xbdi->xbdi_xen_req.nr_segments) {
1309		uint8_t this_fs, this_ls, last_fs, last_ls;
1310		grant_ref_t thisgrt, lastgrt;
1311		/*
1312		 * Segment-level reason to coalesce: handling full
1313		 * pages, or adjacent sector ranges from the same page
1314		 * (and yes, the latter does happen).  But not if the
1315		 * array of client pseudo-physical pages is full.
1316		 */
1317		this_fs = xbdi->xbdi_xen_req.seg[xbdi->xbdi_segno].first_sect;
1318		this_ls = xbdi->xbdi_xen_req.seg[xbdi->xbdi_segno].last_sect;
1319		thisgrt = xbdi->xbdi_xen_req.seg[xbdi->xbdi_segno].gref;
1320		XENPRINTF(("xbdback_io domain %d: "
1321			   "first,last_sect[%d]=0%o,0%o\n",
1322			   xbdi->xbdi_domid, xbdi->xbdi_segno,
1323			   this_fs, this_ls));
1324		last_fs = xbdi->xbdi_last_fs = xbdi->xbdi_this_fs;
1325		last_ls = xbdi->xbdi_last_ls = xbdi->xbdi_this_ls;
1326		lastgrt = xbdi->xbdi_lastgrt = xbdi->xbdi_thisgrt;
1327		xbdi->xbdi_this_fs = this_fs;
1328		xbdi->xbdi_this_ls = this_ls;
1329		xbdi->xbdi_thisgrt = thisgrt;
1330		if (xbdi->xbdi_io != NULL) {
1331			if (last_ls == VBD_MAXSECT
1332			    && this_fs == 0
1333			    && xbdi->xbdi_io->xio_nrma
1334			    < XENSHM_MAX_PAGES_PER_REQUEST) {
1335				xbdi->xbdi_same_page = 0;
1336			} else if (last_ls + 1
1337				       == this_fs
1338#ifdef notyet
1339				   && (last_fas & ~PAGE_MASK)
1340				       == (this_fas & ~PAGE_MASK)
1341#else
1342				  && 0 /* can't know frame number yet */
1343#endif
1344			    ) {
1345#ifdef DEBUG
1346				if (ratecheck(&xbdi->xbdi_lastfragio_time,
1347				    &xbdback_fragio_intvl))
1348					printf("%s: domain is sending"
1349					    " excessively fragmented I/O\n",
1350					    xbdi->xbdi_name);
1351#endif
1352				printf("xbdback_io: would maybe glue "
1353				    "same page sec %d (%d->%d)\n",
1354				    xbdi->xbdi_segno, this_fs, this_ls);
1355				XENPRINTF(("xbdback_io domain %d: glue same "
1356				    "page", xbdi->xbdi_domid));
1357				panic("notyet!");
1358				xbdi->xbdi_same_page = 1;
1359			} else {
1360				KASSERT(xbdi->xbdi_io->xio_operation ==
1361				     BLKIF_OP_READ ||
1362				    xbdi->xbdi_io->xio_operation ==
1363				     BLKIF_OP_WRITE);
1364				xbdi->xbdi_cont_aux = xbdback_co_io_loop;
1365				xbdi->xbdi_cont = xbdback_co_map_io;
1366				return xbdi;
1367			}
1368		} else
1369			xbdi->xbdi_same_page = 0;
1370
1371		if (xbdi->xbdi_io == NULL) {
1372			xbdi->xbdi_cont = xbdback_co_io_gotio;
1373			return xbdback_pool_get(&xbdback_io_pool, xbdi);
1374		} else {
1375			xbdi->xbdi_cont = xbdback_co_io_gotio2;
1376		}
1377	} else {
1378		/* done with the loop over segments; get next request */
1379		xbdi->xbdi_cont = xbdback_co_main_incr;
1380	}
1381	return xbdi;
1382}
1383
1384/* Prepare an I/O buffer for an xbdback instance */
1385static void *
1386xbdback_co_io_gotio(struct xbdback_instance *xbdi, void *obj)
1387{
1388	struct xbdback_io *xbd_io;
1389	vaddr_t start_offset; /* start offset in vm area */
1390	int buf_flags;
1391
1392	xbdi_get(xbdi);
1393	atomic_inc_uint(&xbdi->xbdi_pendingreqs);
1394
1395	xbd_io = xbdi->xbdi_io = obj;
1396	buf_init(&xbd_io->xio_buf);
1397	xbd_io->xio_xbdi = xbdi;
1398	SLIST_INIT(&xbd_io->xio_rq);
1399	xbd_io->xio_nrma = 0;
1400	xbd_io->xio_mapped = 0;
1401	xbd_io->xio_operation = xbdi->xbdi_xen_req.operation;
1402
1403	start_offset = xbdi->xbdi_this_fs * VBD_BSIZE;
1404
1405	if (xbdi->xbdi_xen_req.operation == BLKIF_OP_WRITE) {
1406		buf_flags = B_WRITE;
1407	} else {
1408		buf_flags = B_READ;
1409	}
1410
1411	xbd_io->xio_buf.b_flags = buf_flags;
1412	xbd_io->xio_buf.b_cflags = 0;
1413	xbd_io->xio_buf.b_oflags = 0;
1414	xbd_io->xio_buf.b_iodone = xbdback_iodone;
1415	xbd_io->xio_buf.b_proc = NULL;
1416	xbd_io->xio_buf.b_vp = xbdi->xbdi_vp;
1417	xbd_io->xio_buf.b_objlock = xbdi->xbdi_vp->v_interlock;
1418	xbd_io->xio_buf.b_dev = xbdi->xbdi_dev;
1419	xbd_io->xio_buf.b_blkno = xbdi->xbdi_next_sector;
1420	xbd_io->xio_buf.b_bcount = 0;
1421	xbd_io->xio_buf.b_data = (void *)start_offset;
1422	xbd_io->xio_buf.b_private = xbd_io;
1423
1424	xbdi->xbdi_cont = xbdback_co_io_gotio2;
1425	return xbdi;
1426}
1427
1428/* Manage fragments */
1429static void *
1430xbdback_co_io_gotio2(struct xbdback_instance *xbdi, void *obj)
1431{
1432	(void)obj;
1433	if (xbdi->xbdi_segno == 0 || SLIST_EMPTY(&xbdi->xbdi_io->xio_rq)) {
1434		/* if this is the first segment of a new request */
1435		/* or if it's the first segment of the io */
1436		xbdi->xbdi_cont = xbdback_co_io_gotfrag;
1437		return xbdback_pool_get(&xbdback_fragment_pool, xbdi);
1438	}
1439	xbdi->xbdi_cont = xbdback_co_io_gotfrag2;
1440	return xbdi;
1441}
1442
1443/* Prepare the instance for its first fragment */
1444static void *
1445xbdback_co_io_gotfrag(struct xbdback_instance *xbdi, void *obj)
1446{
1447	struct xbdback_fragment *xbd_fr;
1448
1449	xbd_fr = obj;
1450	xbd_fr->car = xbdi->xbdi_req;
1451	SLIST_INSERT_HEAD(&xbdi->xbdi_io->xio_rq, xbd_fr, cdr);
1452	++xbdi->xbdi_req->rq_iocount;
1453
1454	xbdi->xbdi_cont = xbdback_co_io_gotfrag2;
1455	return xbdi;
1456}
1457
1458/* Last routine to manage segment fragments for one I/O */
1459static void *
1460xbdback_co_io_gotfrag2(struct xbdback_instance *xbdi, void *obj)
1461{
1462	struct xbdback_io *xbd_io;
1463	int seg_size;
1464	uint8_t this_fs, this_ls;
1465
1466	this_fs = xbdi->xbdi_this_fs;
1467	this_ls = xbdi->xbdi_this_ls;
1468	xbd_io = xbdi->xbdi_io;
1469	seg_size = this_ls - this_fs + 1;
1470
1471	if (seg_size < 0) {
1472		printf("xbdback_io domain %d: negative-size request (%d %d)\n",
1473		       xbdi->xbdi_domid, this_ls, this_fs);
1474		xbdback_io_error(xbdi->xbdi_io, EINVAL);
1475		xbdi->xbdi_io = NULL;
1476		xbdi->xbdi_cont = xbdback_co_main_incr;
1477		return xbdi;
1478	}
1479
1480	if (!xbdi->xbdi_same_page) {
1481		XENPRINTF(("xbdback_io domain %d: appending grant %u\n",
1482			   xbdi->xbdi_domid, (u_int)xbdi->xbdi_thisgrt));
1483		xbd_io->xio_gref[xbd_io->xio_nrma++] = xbdi->xbdi_thisgrt;
1484	}
1485
1486	xbd_io->xio_buf.b_bcount += (daddr_t)(seg_size * VBD_BSIZE);
1487	XENPRINTF(("xbdback_io domain %d: start sect %d size %d\n",
1488	    xbdi->xbdi_domid, (int)xbdi->xbdi_next_sector, seg_size));
1489
1490	/* Finally, the end of the segment loop! */
1491	xbdi->xbdi_next_sector += seg_size;
1492	++xbdi->xbdi_segno;
1493	xbdi->xbdi_cont = xbdback_co_io_loop;
1494	return xbdi;
1495}
1496
1497/*
1498 * Map the different I/O requests in backend's VA space.
1499 */
1500static void *
1501xbdback_co_map_io(struct xbdback_instance *xbdi, void *obj)
1502{
1503	(void)obj;
1504	XENPRINTF(("xbdback_io domain %d: flush sect %ld size %d ptr 0x%lx\n",
1505	    xbdi->xbdi_domid, (long)xbdi->xbdi_io->xio_buf.b_blkno,
1506	    (int)xbdi->xbdi_io->xio_buf.b_bcount, (long)xbdi->xbdi_io));
1507	xbdi->xbdi_cont = xbdback_co_do_io;
1508	return xbdback_map_shm(xbdi->xbdi_io);
1509}
1510
1511static void
1512xbdback_io_error(struct xbdback_io *xbd_io, int error)
1513{
1514	xbd_io->xio_buf.b_error = error;
1515	xbdback_iodone(&xbd_io->xio_buf);
1516}
1517
1518/*
1519 * Main xbdback I/O routine. It can either perform a flush operation or
1520 * schedule a read/write operation.
1521 */
1522static void *
1523xbdback_co_do_io(struct xbdback_instance *xbdi, void *obj)
1524{
1525	struct xbdback_io *xbd_io = xbdi->xbdi_io;
1526
1527	switch (xbd_io->xio_operation) {
1528	case BLKIF_OP_FLUSH_DISKCACHE:
1529	{
1530		int error;
1531		int force = 1;
1532
1533		error = VOP_IOCTL(xbdi->xbdi_vp, DIOCCACHESYNC, &force, FWRITE,
1534		    kauth_cred_get());
1535		if (error) {
1536			aprint_error("xbdback %s: DIOCCACHESYNC returned %d\n",
1537			    xbdi->xbdi_xbusd->xbusd_path, error);
1538			if (error == EOPNOTSUPP || error == ENOTTY)
1539				error = BLKIF_RSP_EOPNOTSUPP;
1540			else
1541				error = BLKIF_RSP_ERROR;
1542		} else
1543			error = BLKIF_RSP_OKAY;
1544		xbdback_send_reply(xbdi, xbd_io->xio_flush_id,
1545		    xbd_io->xio_operation, error);
1546		xbdback_pool_put(&xbdback_io_pool, xbd_io);
1547		xbdi_put(xbdi);
1548		xbdi->xbdi_io = NULL;
1549		xbdi->xbdi_cont = xbdback_co_main_incr;
1550		return xbdi;
1551	}
1552	case BLKIF_OP_READ:
1553	case BLKIF_OP_WRITE:
1554		xbd_io->xio_buf.b_data = (void *)
1555		    ((vaddr_t)xbd_io->xio_buf.b_data + xbd_io->xio_vaddr);
1556#ifdef DIAGNOSTIC
1557		{
1558		vaddr_t bdata = (vaddr_t)xbd_io->xio_buf.b_data;
1559		int nsegs =
1560		    ((((bdata + xbd_io->xio_buf.b_bcount - 1) & ~PAGE_MASK) -
1561		    (bdata & ~PAGE_MASK)) >> PAGE_SHIFT) + 1;
1562		if ((bdata & ~PAGE_MASK) != (xbd_io->xio_vaddr & ~PAGE_MASK)) {
1563			printf("xbdback_co_do_io: vaddr %#" PRIxVADDR
1564			    " bdata %#" PRIxVADDR "\n",
1565			    xbd_io->xio_vaddr, bdata);
1566			panic("xbdback_co_do_io: bdata page change");
1567		}
1568		if (nsegs > xbd_io->xio_nrma) {
1569			printf("xbdback_co_do_io: vaddr %#" PRIxVADDR
1570			    " bcount %#x doesn't fit in %d pages\n",
1571			    bdata, xbd_io->xio_buf.b_bcount, xbd_io->xio_nrma);
1572			panic("xbdback_co_do_io: not enough pages");
1573		}
1574		}
1575#endif
1576		if ((xbd_io->xio_buf.b_flags & B_READ) == 0) {
1577			mutex_enter(xbd_io->xio_buf.b_vp->v_interlock);
1578			xbd_io->xio_buf.b_vp->v_numoutput++;
1579			mutex_exit(xbd_io->xio_buf.b_vp->v_interlock);
1580		}
1581		bdev_strategy(&xbd_io->xio_buf);
1582		/* will call xbdback_iodone() asynchronously when done */
1583		xbdi->xbdi_io = NULL;
1584		xbdi->xbdi_cont = xbdi->xbdi_cont_aux;
1585		return xbdi;
1586	default:
1587		/* Should never happen */
1588		panic("xbdback_co_do_io: unsupported operation %d",
1589		    xbd_io->xio_operation);
1590	}
1591}
1592
1593/*
1594 * Called from softint(9) context when an I/O is done: for each request, send
1595 * back the associated reply to the domain.
1596 *
1597 * This gets reused by xbdback_io_error to report errors from other sources.
1598 */
1599static void
1600xbdback_iodone(struct buf *bp)
1601{
1602	struct xbdback_io *xbd_io;
1603	struct xbdback_instance *xbdi;
1604	int errp;
1605
1606	xbd_io = bp->b_private;
1607	xbdi = xbd_io->xio_xbdi;
1608
1609	XENPRINTF(("xbdback_io domain %d: iodone ptr 0x%lx\n",
1610		   xbdi->xbdi_domid, (long)xbd_io));
1611
1612	if (xbd_io->xio_mapped == 1)
1613		xbdback_unmap_shm(xbd_io);
1614
1615	if (bp->b_error != 0) {
1616		printf("xbd IO domain %d: error %d\n",
1617		       xbdi->xbdi_domid, bp->b_error);
1618		errp = 1;
1619	} else
1620		errp = 0;
1621
1622	/* for each constituent xbd request */
1623	while(!SLIST_EMPTY(&xbd_io->xio_rq)) {
1624		struct xbdback_fragment *xbd_fr;
1625		struct xbdback_request *xbd_req;
1626		struct xbdback_instance *rxbdi;
1627		int error;
1628
1629		xbd_fr = SLIST_FIRST(&xbd_io->xio_rq);
1630		xbd_req = xbd_fr->car;
1631		SLIST_REMOVE_HEAD(&xbd_io->xio_rq, cdr);
1632		xbdback_pool_put(&xbdback_fragment_pool, xbd_fr);
1633
1634		if (errp)
1635			++xbd_req->rq_ioerrs;
1636
1637		/* finalize it only if this was its last I/O */
1638		if (--xbd_req->rq_iocount > 0)
1639			continue;
1640
1641		rxbdi = xbd_req->rq_xbdi;
1642		KASSERT(xbdi == rxbdi);
1643
1644		error = xbd_req->rq_ioerrs > 0
1645		    ? BLKIF_RSP_ERROR
1646		    : BLKIF_RSP_OKAY;
1647
1648		XENPRINTF(("xbdback_io domain %d: end request %" PRIu64
1649		    " error=%d\n",
1650		    xbdi->xbdi_domid, xbd_req->rq_id, error));
1651		xbdback_send_reply(xbdi, xbd_req->rq_id,
1652		    xbd_req->rq_operation, error);
1653		xbdback_pool_put(&xbdback_request_pool, xbd_req);
1654	}
1655	xbdi_put(xbdi);
1656	atomic_dec_uint(&xbdi->xbdi_pendingreqs);
1657	buf_destroy(&xbd_io->xio_buf);
1658	xbdback_pool_put(&xbdback_io_pool, xbd_io);
1659
1660	xbdback_wakeup_thread(xbdi);
1661}
1662
1663/*
1664 * Wake up the per-instance xbdback thread.
1665 */
1666static void
1667xbdback_wakeup_thread(struct xbdback_instance *xbdi)
1668{
1669
1670	mutex_enter(&xbdi->xbdi_lock);
1671	/* only set RUN state when we are WAITING for work */
1672	if (xbdi->xbdi_status == WAITING)
1673	       xbdi->xbdi_status = RUN;
1674	mutex_exit(&xbdi->xbdi_lock);
1675
1676	cv_broadcast(&xbdi->xbdi_cv);
1677}
1678
1679/*
1680 * called once a request has completed. Place the reply in the ring and
1681 * notify the guest OS.
1682 */
1683static void
1684xbdback_send_reply(struct xbdback_instance *xbdi, uint64_t id,
1685    int op, int status)
1686{
1687	blkif_response_t *resp_n;
1688	blkif_x86_32_response_t *resp32;
1689	blkif_x86_64_response_t *resp64;
1690	int notify;
1691
1692	/*
1693	 * The ring can be accessed by the xbdback thread, xbdback_iodone()
1694	 * handler, or any handler that triggered the shm callback. So
1695	 * protect ring access via the xbdi_lock mutex.
1696	 */
1697	mutex_enter(&xbdi->xbdi_lock);
1698	switch (xbdi->xbdi_proto) {
1699	case XBDIP_NATIVE:
1700		resp_n = RING_GET_RESPONSE(&xbdi->xbdi_ring.ring_n,
1701		    xbdi->xbdi_ring.ring_n.rsp_prod_pvt);
1702		resp_n->id        = id;
1703		resp_n->operation = op;
1704		resp_n->status    = status;
1705		break;
1706	case XBDIP_32:
1707		resp32 = RING_GET_RESPONSE(&xbdi->xbdi_ring.ring_32,
1708		    xbdi->xbdi_ring.ring_n.rsp_prod_pvt);
1709		resp32->id        = id;
1710		resp32->operation = op;
1711		resp32->status    = status;
1712		break;
1713	case XBDIP_64:
1714		resp64 = RING_GET_RESPONSE(&xbdi->xbdi_ring.ring_64,
1715		    xbdi->xbdi_ring.ring_n.rsp_prod_pvt);
1716		resp64->id        = id;
1717		resp64->operation = op;
1718		resp64->status    = status;
1719		break;
1720	}
1721	xbdi->xbdi_ring.ring_n.rsp_prod_pvt++;
1722	RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xbdi->xbdi_ring.ring_n, notify);
1723	mutex_exit(&xbdi->xbdi_lock);
1724
1725	if (notify) {
1726		XENPRINTF(("xbdback_send_reply notify %d\n", xbdi->xbdi_domid));
1727		hypervisor_notify_via_evtchn(xbdi->xbdi_evtchn);
1728	}
1729}
1730
1731/*
1732 * Map multiple entries of an I/O request into backend's VA space.
1733 * The xbd_io->xio_gref array has to be filled out by the caller.
1734 */
1735static void *
1736xbdback_map_shm(struct xbdback_io *xbd_io)
1737{
1738	struct xbdback_instance *xbdi;
1739	struct xbdback_request *xbd_rq;
1740	int error, s;
1741
1742#ifdef XENDEBUG_VBD
1743	int i;
1744	printf("xbdback_map_shm map grant ");
1745	for (i = 0; i < xbd_io->xio_nrma; i++) {
1746		printf("%u ", (u_int)xbd_io->xio_gref[i]);
1747	}
1748#endif
1749
1750	KASSERT(xbd_io->xio_mapped == 0);
1751
1752	xbdi = xbd_io->xio_xbdi;
1753	xbd_rq = SLIST_FIRST(&xbd_io->xio_rq)->car;
1754
1755	error = xen_shm_map(xbd_io->xio_nrma, xbdi->xbdi_domid,
1756	    xbd_io->xio_gref, &xbd_io->xio_vaddr, xbd_io->xio_gh,
1757	    (xbd_rq->rq_operation == BLKIF_OP_WRITE) ? XSHM_RO : 0);
1758
1759	switch(error) {
1760	case 0:
1761#ifdef XENDEBUG_VBD
1762		printf("handle ");
1763		for (i = 0; i < xbd_io->xio_nrma; i++) {
1764			printf("%u ", (u_int)xbd_io->xio_gh[i]);
1765		}
1766		printf("\n");
1767#endif
1768		xbd_io->xio_mapped = 1;
1769		return xbdi;
1770	case ENOMEM:
1771		s = splvm();
1772		if (!xbdback_shmcb) {
1773			if (xen_shm_callback(xbdback_shm_callback, xbdi)
1774			    != 0) {
1775				splx(s);
1776				panic("xbdback_map_shm: "
1777				      "xen_shm_callback failed");
1778			}
1779			xbdback_shmcb = 1;
1780		}
1781		SIMPLEQ_INSERT_TAIL(&xbdback_shmq, xbdi, xbdi_on_hold);
1782		splx(s);
1783		/* Put the thread to sleep until the callback is called */
1784		xbdi->xbdi_cont = xbdback_co_wait_shm_callback;
1785		return NULL;
1786	default:
1787		printf("xbdback_map_shm: xen_shm error %d ", error);
1788		xbdback_io_error(xbdi->xbdi_io, error);
1789		xbdi->xbdi_io = NULL;
1790		xbdi->xbdi_cont = xbdi->xbdi_cont_aux;
1791		return xbdi;
1792	}
1793}
1794
1795static int
1796xbdback_shm_callback(void *arg)
1797{
1798        int error, s;
1799
1800	/*
1801	 * The shm callback may be executed at any level, including
1802	 * IPL_BIO and IPL_NET levels. Raise to the lowest priority level
1803	 * that can mask both.
1804	 */
1805	s = splvm();
1806	while(!SIMPLEQ_EMPTY(&xbdback_shmq)) {
1807		struct xbdback_instance *xbdi;
1808		struct xbdback_io *xbd_io;
1809		struct xbdback_request *xbd_rq;
1810
1811		xbdi = SIMPLEQ_FIRST(&xbdback_shmq);
1812		xbd_io = xbdi->xbdi_io;
1813		xbd_rq = SLIST_FIRST(&xbd_io->xio_rq)->car;
1814		KASSERT(xbd_io->xio_mapped == 0);
1815
1816		error = xen_shm_map(xbd_io->xio_nrma,
1817		    xbdi->xbdi_domid, xbd_io->xio_gref,
1818		    &xbd_io->xio_vaddr, xbd_io->xio_gh,
1819		    XSHM_CALLBACK |
1820		    ((xbd_rq->rq_operation == BLKIF_OP_WRITE) ? XSHM_RO: 0));
1821		switch(error) {
1822		case ENOMEM:
1823			splx(s);
1824			return -1; /* will try again later */
1825		case 0:
1826			SIMPLEQ_REMOVE_HEAD(&xbdback_shmq, xbdi_on_hold);
1827			xbd_io->xio_mapped = 1;
1828			xbdback_wakeup_thread(xbdi);
1829			break;
1830		default:
1831			SIMPLEQ_REMOVE_HEAD(&xbdback_shmq, xbdi_on_hold);
1832			printf("xbdback_shm_callback: xen_shm error %d\n",
1833			       error);
1834			xbdback_io_error(xbd_io, error);
1835			xbdi->xbdi_io = NULL;
1836			xbdback_wakeup_thread(xbdi);
1837			break;
1838		}
1839	}
1840	xbdback_shmcb = 0;
1841	splx(s);
1842	return 0;
1843}
1844
1845/*
1846 * Allows waiting for the shm callback to complete.
1847 */
1848static void *
1849xbdback_co_wait_shm_callback(struct xbdback_instance *xbdi, void *obj)
1850{
1851
1852	if (xbdi->xbdi_io == NULL || xbdi->xbdi_io->xio_mapped == 1) {
1853		/*
1854		 * Only proceed to next step when the callback reported
1855		 * success or failure.
1856		 */
1857		xbdi->xbdi_cont = xbdi->xbdi_cont_aux;
1858		return xbdi;
1859	} else {
1860		/* go back to sleep */
1861		return NULL;
1862	}
1863}
1864
1865/* unmap a request from our virtual address space (request is done) */
1866static void
1867xbdback_unmap_shm(struct xbdback_io *xbd_io)
1868{
1869#ifdef XENDEBUG_VBD
1870	int i;
1871	printf("xbdback_unmap_shm handle ");
1872	for (i = 0; i < xbd_io->xio_nrma; i++) {
1873		printf("%u ", (u_int)xbd_io->xio_gh[i]);
1874	}
1875	printf("\n");
1876#endif
1877
1878	KASSERT(xbd_io->xio_mapped == 1);
1879	xbd_io->xio_mapped = 0;
1880	xen_shm_unmap(xbd_io->xio_vaddr, xbd_io->xio_nrma,
1881	    xbd_io->xio_gh);
1882	xbd_io->xio_vaddr = -1;
1883}
1884
1885/* Obtain memory from a pool */
1886static void *
1887xbdback_pool_get(struct xbdback_pool *pp,
1888			      struct xbdback_instance *xbdi)
1889{
1890	return pool_cache_get(&pp->pc, PR_WAITOK);
1891}
1892
1893/* Restore memory to a pool */
1894static void
1895xbdback_pool_put(struct xbdback_pool *pp, void *item)
1896{
1897	pool_cache_put(&pp->pc, item);
1898}
1899
1900/*
1901 * Trampoline routine. Calls continuations in a loop and only exits when
1902 * either the returned object or the next callback is NULL.
1903 */
1904static void
1905xbdback_trampoline(struct xbdback_instance *xbdi, void *obj)
1906{
1907	xbdback_cont_t cont;
1908
1909	while(obj != NULL && xbdi->xbdi_cont != NULL) {
1910		cont = xbdi->xbdi_cont;
1911#ifdef DIAGNOSTIC
1912		xbdi->xbdi_cont = (xbdback_cont_t)0xDEADBEEF;
1913#endif
1914		obj = (*cont)(xbdi, obj);
1915#ifdef DIAGNOSTIC
1916		if (xbdi->xbdi_cont == (xbdback_cont_t)0xDEADBEEF) {
1917			printf("xbdback_trampoline: 0x%lx didn't set "
1918			       "xbdi->xbdi_cont!\n", (long)cont);
1919			panic("xbdback_trampoline: bad continuation");
1920		}
1921#endif
1922	}
1923}
1924