/* $NetBSD: xbdback_xenbus.c,v 1.55.2.1 2012/06/05 15:36:00 jdc Exp $ */

/*
 * Copyright (c) 2006 Manuel Bouyer.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: xbdback_xenbus.c,v 1.55.2.1 2012/06/05 15:36:00 jdc Exp $");

#include <sys/atomic.h>
#include <sys/buf.h>
#include <sys/condvar.h>
#include <sys/conf.h>
#include <sys/disk.h>
#include <sys/device.h>
#include <sys/fcntl.h>
#include <sys/kauth.h>
#include <sys/kernel.h>
#include <sys/kmem.h>
#include <sys/kthread.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/param.h>
#include <sys/queue.h>
#include <sys/systm.h>
#include <sys/time.h>
#include <sys/types.h>
#include <sys/vnode.h>

#include <xen/xen.h>
#include <xen/xen_shm.h>
#include <xen/evtchn.h>
#include <xen/xenbus.h>
#include <xen/xen-public/io/protocols.h>

/* #define XENDEBUG_VBD */
#ifdef XENDEBUG_VBD
#define XENPRINTF(x) printf x
#else
#define XENPRINTF(x)
#endif

#define BLKIF_RING_SIZE __RING_SIZE((blkif_sring_t *)0, PAGE_SIZE)

/*
 * Backend block device driver for Xen
 */

/* Max number of pages per request. The request may not be page aligned */
#define BLKIF_MAX_PAGES_PER_REQUEST (BLKIF_MAX_SEGMENTS_PER_REQUEST + 1)

/* Values are expressed in 512-byte sectors */
#define VBD_BSIZE 512
#define VBD_MAXSECT ((PAGE_SIZE / VBD_BSIZE) - 1)

struct xbdback_request;
struct xbdback_io;
struct xbdback_fragment;
struct xbdback_instance;

/*
 * Status of an xbdback instance:
 * WAITING: xbdback instance is connected, waiting for requests
 * RUN: xbdi thread must be woken up, I/Os have to be processed
 * DISCONNECTING: the instance is closing, no more I/Os can be scheduled
 * DISCONNECTED: no I/Os, no ring, the thread should terminate.
 */
typedef enum {WAITING, RUN, DISCONNECTING, DISCONNECTED} xbdback_state_t;

/*
 * Each xbdback instance is managed by a single thread that handles all
 * the I/O processing. As there are a variety of conditions that can block,
 * everything will be done in a sort of continuation-passing style.
 *
 * When the execution has to block to delay processing, for example to
 * allow the system to recover because of memory shortage (via shared memory
 * callback), the return value of a continuation can be set to NULL. In that
 * case, the thread will go back to sleep and wait for the proper
 * condition before it starts processing requests again from where it left
 * off. Continuation state is "stored" in the xbdback instance (xbdi_cont and
 * xbdi_cont_aux), and should only be manipulated by the instance thread.
 *
 * As xbdback(4) has to handle different sorts of asynchronous events (Xen
 * event channels, biointr() soft interrupts, xenbus commands), the xbdi_lock
 * mutex is used to protect specific elements of the xbdback instance from
 * concurrent access: thread status and ring access (when pushing responses).
 *
 * Here's how the call graph is supposed to be for a single I/O:
 *
 * xbdback_co_main()
 *        |
 *        |        --> xbdback_co_cache_doflush() or NULL
 *        |        |
 *        |        - xbdback_co_cache_flush2() <- xbdback_co_do_io() <-
 *        |        |                                                  |
 *        |        |-> xbdback_co_cache_flush() -> xbdback_co_map_io()-
 * xbdback_co_main_loop()-|
 *        |        |-> xbdback_co_main_done() ---> xbdback_co_map_io()-
 *        |        |                                                  |
 *        |        -- xbdback_co_main_done2() <-- xbdback_co_do_io() <-
 *        |                                       |
 *        |              --> xbdback_co_main() or NULL
 *        |
 * xbdback_co_io() -> xbdback_co_main_incr() -> xbdback_co_main_loop()
 *        |
 * xbdback_co_io_gotreq()--+--> xbdback_co_map_io() ---
 *        |                |                          |
 *  -> xbdback_co_io_loop()----|             <- xbdback_co_do_io() <--
 *  |     |                |   |
 *  |     |                |   |----------> xbdback_co_io_gotio()
 *  |     |                |                        |
 *  |     |      xbdback_co_main_incr()             |
 *  |     |                |                        |
 *  |     |      xbdback_co_main_loop()             |
 *  |     |                                         |
 *  |     xbdback_co_io_gotio2() <------------------|
 *  |     |                |
 *  |     |                |----------> xbdback_co_io_gotfrag()
 *  |     |                                  |
 *  -- xbdback_co_io_gotfrag2() <------------|
 *        |
 * xbdback_co_main_incr() -> xbdback_co_main_loop()
 */
typedef void *(* xbdback_cont_t)(struct xbdback_instance *, void *);

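/*
 * Illustrative sketch of the convention above (not part of the driver
 * logic; "resource_ready" and "next_step" are placeholders, not symbols
 * defined in this file).  A continuation either parks the thread by
 * returning NULL, leaving xbdi_cont pointing at the step to retry, or
 * advances the state machine and returns a non-NULL token so that
 * xbdback_trampoline() keeps iterating:
 *
 *	static void *
 *	example_cont(struct xbdback_instance *xbdi, void *obj)
 *	{
 *		if (!resource_ready)
 *			return NULL;
 *		xbdi->xbdi_cont = next_step;
 *		return xbdi;
 *	}
 */
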
enum xbdi_proto {
	XBDIP_NATIVE,
	XBDIP_32,
	XBDIP_64
};

/* we keep the xbdback instances in a linked list */
struct xbdback_instance {
	SLIST_ENTRY(xbdback_instance) next;
	struct xenbus_device *xbdi_xbusd; /* our xenstore entry */
	struct xenbus_watch xbdi_watch; /* to watch our store */
	domid_t xbdi_domid;	/* attached to this domain */
	uint32_t xbdi_handle;	/* domain-specific handle */
	char xbdi_name[16];	/* name of this instance */
	/* mutex that protects concurrent access to the xbdback instance */
	kmutex_t xbdi_lock;
	kcondvar_t xbdi_cv;	/* wait channel for thread work */
	xbdback_state_t xbdi_status; /* thread's status */
	/* backing device parameters */
	dev_t xbdi_dev;
	const struct bdevsw *xbdi_bdevsw; /* pointer to the device's bdevsw */
	struct vnode *xbdi_vp;
	uint64_t xbdi_size;
	bool xbdi_ro; /* is the device read-only? */
	/* parameters for the communication */
	unsigned int xbdi_evtchn;
	/* private parameters for communication */
	blkif_back_ring_proto_t xbdi_ring;
	enum xbdi_proto xbdi_proto;
	grant_handle_t xbdi_ring_handle; /* to unmap the ring */
	vaddr_t xbdi_ring_va; /* to unmap the ring */
	/* disconnection must be postponed until all I/O is done */
	int xbdi_refcnt;
	/*
	 * State for I/O processing/coalescing follows; this has to
	 * live here instead of on the stack because of the
	 * continuation-ness (see above).
	 */
	RING_IDX xbdi_req_prod; /* limit on request indices */
	xbdback_cont_t xbdi_cont, xbdi_cont_aux;
	SIMPLEQ_ENTRY(xbdback_instance) xbdi_on_hold; /* waiting on resources */
	/* _request state: track requests fetched from ring */
	struct xbdback_request *xbdi_req; /* if NULL, ignore following */
	blkif_request_t xbdi_xen_req;
	int xbdi_segno;
	/* _io state: I/O associated to this instance */
	struct xbdback_io *xbdi_io; /* if NULL, ignore next field */
	daddr_t xbdi_next_sector;
	uint8_t xbdi_last_fs, xbdi_this_fs; /* first sectors */
	uint8_t xbdi_last_ls, xbdi_this_ls; /* last sectors */
	grant_ref_t xbdi_thisgrt, xbdi_lastgrt; /* grants */
	/* other state */
	int xbdi_same_page; /* are we merging two segments on the same page? */
	uint xbdi_pendingreqs; /* number of I/Os in flight */
	int xbdi_errps; /* errors per second */
	struct timeval xbdi_lasterr_time; /* error time tracking */
#ifdef DEBUG
	struct timeval xbdi_lastfragio_time; /* fragmented I/O tracking */
#endif
};
/* Manipulation of the above reference count. */
#define xbdi_get(xbdip) atomic_inc_uint(&(xbdip)->xbdi_refcnt)
#define xbdi_put(xbdip)						\
do {								\
	if (atomic_dec_uint_nv(&(xbdip)->xbdi_refcnt) == 0)	\
		xbdback_finish_disconnect(xbdip);		\
} while (/* CONSTCOND */ 0)

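/*
 * Note on the reference count manipulated above (a summary of existing
 * behaviour, not new logic): xbdi_get() is taken once at instance creation
 * and once per in-flight I/O or cache-flush request; xbdi_put() is the
 * matching drop in xbdback_iodone(), in the cache-flush path, and when the
 * instance thread disconnects, so xbdback_finish_disconnect() only runs
 * once the last reference is gone.
 */
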
SLIST_HEAD(, xbdback_instance) xbdback_instances;

/*
 * For each request from a guest, an xbdback_request is allocated from
 * a pool.  This will describe the request until completion.  The
 * request may require multiple IO operations to perform, so the
 * per-IO information is not stored here.
 */
struct xbdback_request {
	struct xbdback_instance *rq_xbdi; /* our xbd instance */
	uint64_t rq_id;
	int rq_iocount; /* reference count; or, number of outstanding I/O's */
	int rq_ioerrs;
	uint8_t rq_operation;
};

/*
 * For each I/O operation associated with one of those requests, an
 * xbdback_io is allocated from a pool.  It may correspond to multiple
 * Xen disk requests, or parts of them, if several arrive at once that
 * can be coalesced.
 */
struct xbdback_io {
	/* The instance pointer is duplicated for convenience. */
	struct xbdback_instance *xio_xbdi; /* our xbd instance */
	uint8_t xio_operation;
	union {
		struct {
			struct buf xio_buf; /* our I/O */
			/* xbd requests involved */
			SLIST_HEAD(, xbdback_fragment) xio_rq;
			/* the virtual address to map the request at */
			vaddr_t xio_vaddr;
			/* grants to map */
			grant_ref_t xio_gref[XENSHM_MAX_PAGES_PER_REQUEST];
			/* grants to release */
			grant_handle_t xio_gh[XENSHM_MAX_PAGES_PER_REQUEST];
			uint16_t xio_nrma; /* number of guest pages */
			uint16_t xio_mapped; /* == 1: grants are mapped */
		} xio_rw;
		uint64_t xio_flush_id;
	} u;
};
#define xio_buf		u.xio_rw.xio_buf
#define xio_rq		u.xio_rw.xio_rq
#define xio_vaddr	u.xio_rw.xio_vaddr
#define xio_gref	u.xio_rw.xio_gref
#define xio_gh		u.xio_rw.xio_gh
#define xio_nrma	u.xio_rw.xio_nrma
#define xio_mapped	u.xio_rw.xio_mapped

#define xio_flush_id	u.xio_flush_id

/*
 * Rather than having the xbdback_io keep an array of the
 * xbdback_requests involved, since the actual number will probably be
 * small but might be as large as BLKIF_RING_SIZE, use a list.  This
 * would be threaded through xbdback_request, but one of them might be
 * part of multiple I/O's, alas.
 */
struct xbdback_fragment {
	struct xbdback_request *car;
	SLIST_ENTRY(xbdback_fragment) cdr;
};

/*
 * Pools to manage the chains of block requests and I/O fragments
 * submitted by the frontend.
 */
struct xbdback_pool {
	struct pool_cache pc;
	struct timeval last_warning;
} xbdback_request_pool, xbdback_io_pool, xbdback_fragment_pool;

SIMPLEQ_HEAD(xbdback_iqueue, xbdback_instance);
static struct xbdback_iqueue xbdback_shmq;
static int xbdback_shmcb; /* have we already registered a callback? */

/* Interval between reports of I/O errors from the frontend */
struct timeval xbdback_err_intvl = { 1, 0 };

#ifdef DEBUG
struct timeval xbdback_fragio_intvl = { 60, 0 };
#endif
void xbdbackattach(int);
static int xbdback_xenbus_create(struct xenbus_device *);
static int xbdback_xenbus_destroy(void *);
static void xbdback_frontend_changed(void *, XenbusState);
static void xbdback_backend_changed(struct xenbus_watch *,
    const char **, unsigned int);
static int xbdback_evthandler(void *);

static int xbdback_connect(struct xbdback_instance *);
static void xbdback_disconnect(struct xbdback_instance *);
static void xbdback_finish_disconnect(struct xbdback_instance *);

static struct xbdback_instance *xbdif_lookup(domid_t, uint32_t);

static void *xbdback_co_main(struct xbdback_instance *, void *);
static void *xbdback_co_main_loop(struct xbdback_instance *, void *);
static void *xbdback_co_main_incr(struct xbdback_instance *, void *);
static void *xbdback_co_main_done(struct xbdback_instance *, void *);
static void *xbdback_co_main_done2(struct xbdback_instance *, void *);

static void *xbdback_co_cache_flush(struct xbdback_instance *, void *);
static void *xbdback_co_cache_flush2(struct xbdback_instance *, void *);
static void *xbdback_co_cache_doflush(struct xbdback_instance *, void *);

static void *xbdback_co_io(struct xbdback_instance *, void *);
static void *xbdback_co_io_gotreq(struct xbdback_instance *, void *);
static void *xbdback_co_io_loop(struct xbdback_instance *, void *);
static void *xbdback_co_io_gotio(struct xbdback_instance *, void *);
static void *xbdback_co_io_gotio2(struct xbdback_instance *, void *);
static void *xbdback_co_io_gotfrag(struct xbdback_instance *, void *);
static void *xbdback_co_io_gotfrag2(struct xbdback_instance *, void *);

static void *xbdback_co_map_io(struct xbdback_instance *, void *);
static void *xbdback_co_do_io(struct xbdback_instance *, void *);

static void *xbdback_co_wait_shm_callback(struct xbdback_instance *, void *);

static int xbdback_shm_callback(void *);
static void xbdback_io_error(struct xbdback_io *, int);
static void xbdback_iodone(struct buf *);
static void xbdback_send_reply(struct xbdback_instance *, uint64_t, int, int);

static void *xbdback_map_shm(struct xbdback_io *);
static void xbdback_unmap_shm(struct xbdback_io *);

static void *xbdback_pool_get(struct xbdback_pool *,
    struct xbdback_instance *);
static void xbdback_pool_put(struct xbdback_pool *, void *);
static void xbdback_thread(void *);
static void xbdback_wakeup_thread(struct xbdback_instance *);
static void xbdback_trampoline(struct xbdback_instance *, void *);

static struct xenbus_backend_driver xbd_backend_driver = {
	.xbakd_create = xbdback_xenbus_create,
	.xbakd_type = "vbd"
};

void
xbdbackattach(int n)
{
	XENPRINTF(("xbdbackattach\n"));

	/*
	 * initialize the backend driver, register the control message handler
	 * and send driver up message.
	 */
	SLIST_INIT(&xbdback_instances);
	SIMPLEQ_INIT(&xbdback_shmq);
	xbdback_shmcb = 0;

	pool_cache_bootstrap(&xbdback_request_pool.pc,
	    sizeof(struct xbdback_request), 0, 0, 0, "xbbrp", NULL,
	    IPL_SOFTBIO, NULL, NULL, NULL);
	pool_cache_bootstrap(&xbdback_io_pool.pc,
	    sizeof(struct xbdback_io), 0, 0, 0, "xbbip", NULL,
	    IPL_SOFTBIO, NULL, NULL, NULL);
	pool_cache_bootstrap(&xbdback_fragment_pool.pc,
	    sizeof(struct xbdback_fragment), 0, 0, 0, "xbbfp", NULL,
	    IPL_SOFTBIO, NULL, NULL, NULL);

	/* we allocate enough to handle a whole ring at once */
	if (pool_prime(&xbdback_request_pool.pc.pc_pool, BLKIF_RING_SIZE) != 0)
		printf("xbdback: failed to prime request pool\n");
	if (pool_prime(&xbdback_io_pool.pc.pc_pool, BLKIF_RING_SIZE) != 0)
		printf("xbdback: failed to prime io pool\n");
	if (pool_prime(&xbdback_fragment_pool.pc.pc_pool,
	    BLKIF_MAX_SEGMENTS_PER_REQUEST * BLKIF_RING_SIZE) != 0)
		printf("xbdback: failed to prime fragment pool\n");

	xenbus_backend_register(&xbd_backend_driver);
}

static int
xbdback_xenbus_create(struct xenbus_device *xbusd)
{
	struct xbdback_instance *xbdi;
	long domid, handle;
	int error, i;
	char *ep;

	if ((error = xenbus_read_ul(NULL, xbusd->xbusd_path,
	    "frontend-id", &domid, 10)) != 0) {
		aprint_error("xbdback: can't read %s/frontend-id: %d\n",
		    xbusd->xbusd_path, error);
		return error;
	}

	/*
	 * get handle: this is the last component of the path; which is
	 * a decimal number. $path/dev contains the device name, which is not
	 * appropriate.
	 */
	for (i = strlen(xbusd->xbusd_path); i > 0; i--) {
		if (xbusd->xbusd_path[i] == '/')
			break;
	}
	if (i == 0) {
		aprint_error("xbdback: can't parse %s\n",
		    xbusd->xbusd_path);
		return EFTYPE;
	}
	handle = strtoul(&xbusd->xbusd_path[i+1], &ep, 10);
	if (*ep != '\0') {
		aprint_error("xbdback: can't parse %s\n",
		    xbusd->xbusd_path);
		return EFTYPE;
	}

	if (xbdif_lookup(domid, handle) != NULL) {
		return EEXIST;
	}
	xbdi = kmem_zalloc(sizeof(*xbdi), KM_SLEEP);

	xbdi->xbdi_domid = domid;
	xbdi->xbdi_handle = handle;
	snprintf(xbdi->xbdi_name, sizeof(xbdi->xbdi_name), "xbdb%di%d",
	    xbdi->xbdi_domid, xbdi->xbdi_handle);

	/* initialize status and reference counter */
	xbdi->xbdi_status = DISCONNECTED;
	xbdi_get(xbdi);

	mutex_init(&xbdi->xbdi_lock, MUTEX_DEFAULT, IPL_BIO);
	cv_init(&xbdi->xbdi_cv, xbdi->xbdi_name);
	SLIST_INSERT_HEAD(&xbdback_instances, xbdi, next);

	xbusd->xbusd_u.b.b_cookie = xbdi;
	xbusd->xbusd_u.b.b_detach = xbdback_xenbus_destroy;
	xbusd->xbusd_otherend_changed = xbdback_frontend_changed;
	xbdi->xbdi_xbusd = xbusd;

	error = xenbus_watch_path2(xbusd, xbusd->xbusd_path, "physical-device",
	    &xbdi->xbdi_watch, xbdback_backend_changed);
	if (error) {
		printf("failed to watch on %s/physical-device: %d\n",
		    xbusd->xbusd_path, error);
		goto fail;
	}
	xbdi->xbdi_watch.xbw_dev = xbusd;
	error = xenbus_switch_state(xbusd, NULL, XenbusStateInitWait);
	if (error) {
		printf("failed to switch state on %s: %d\n",
		    xbusd->xbusd_path, error);
		goto fail2;
	}
	return 0;
fail2:
	unregister_xenbus_watch(&xbdi->xbdi_watch);
fail:
	kmem_free(xbdi, sizeof(*xbdi));
	return error;
}

static int
xbdback_xenbus_destroy(void *arg)
{
	struct xbdback_instance *xbdi = arg;
	struct xenbus_device *xbusd = xbdi->xbdi_xbusd;
	struct gnttab_unmap_grant_ref ungrop;
	int err;

	XENPRINTF(("xbdback_xenbus_destroy state %d\n", xbdi->xbdi_status));

	xbdback_disconnect(xbdi);

	/* unregister watch */
	if (xbdi->xbdi_watch.node) {
		unregister_xenbus_watch(&xbdi->xbdi_watch);
		free(xbdi->xbdi_watch.node, M_DEVBUF);
		xbdi->xbdi_watch.node = NULL;
	}
	/* unmap ring */
	if (xbdi->xbdi_ring_va != 0) {
		ungrop.host_addr = xbdi->xbdi_ring_va;
		ungrop.handle = xbdi->xbdi_ring_handle;
		ungrop.dev_bus_addr = 0;
		err = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
		    &ungrop, 1);
		if (err)
			printf("xbdback %s: unmap_grant_ref failed: %d\n",
			    xbusd->xbusd_otherend, err);
		uvm_km_free(kernel_map, xbdi->xbdi_ring_va,
		    PAGE_SIZE, UVM_KMF_VAONLY);
	}
	/* close device */
	if (xbdi->xbdi_size) {
		const char *name;
		struct dkwedge_info wi;
		if (getdiskinfo(xbdi->xbdi_vp, &wi) == 0)
			name = wi.dkw_devname;
		else
			name = "*unknown*";
		printf("xbd backend: detach device %s for domain %d\n",
		    name, xbdi->xbdi_domid);
		vn_close(xbdi->xbdi_vp, FREAD, NOCRED);
	}
	SLIST_REMOVE(&xbdback_instances, xbdi, xbdback_instance, next);
	mutex_destroy(&xbdi->xbdi_lock);
	cv_destroy(&xbdi->xbdi_cv);
	kmem_free(xbdi, sizeof(*xbdi));
	return 0;
}

static int
xbdback_connect(struct xbdback_instance *xbdi)
{
	int len, err;
	struct gnttab_map_grant_ref grop;
	struct gnttab_unmap_grant_ref ungrop;
	evtchn_op_t evop;
	u_long ring_ref, revtchn;
	char *xsproto;
	const char *proto;
	struct xenbus_device *xbusd = xbdi->xbdi_xbusd;

	XENPRINTF(("xbdback %s: connect\n", xbusd->xbusd_path));
	/* read communication information */
	err = xenbus_read_ul(NULL, xbusd->xbusd_otherend,
	    "ring-ref", &ring_ref, 10);
	if (err) {
		xenbus_dev_fatal(xbusd, err, "reading %s/ring-ref",
		    xbusd->xbusd_otherend);
		return -1;
	}
	XENPRINTF(("xbdback %s: connect ring-ref %lu\n", xbusd->xbusd_path, ring_ref));
	err = xenbus_read_ul(NULL, xbusd->xbusd_otherend,
	    "event-channel", &revtchn, 10);
	if (err) {
		xenbus_dev_fatal(xbusd, err, "reading %s/event-channel",
		    xbusd->xbusd_otherend);
		return -1;
	}
	XENPRINTF(("xbdback %s: connect revtchn %lu\n", xbusd->xbusd_path, revtchn));
	err = xenbus_read(NULL, xbusd->xbusd_otherend, "protocol",
	    &len, &xsproto);
	if (err) {
		xbdi->xbdi_proto = XBDIP_NATIVE;
		proto = "unspecified";
		XENPRINTF(("xbdback %s: connect no xsproto\n", xbusd->xbusd_path));
	} else {
		XENPRINTF(("xbdback %s: connect xsproto %s\n", xbusd->xbusd_path, xsproto));
		if (strcmp(xsproto, XEN_IO_PROTO_ABI_NATIVE) == 0) {
			xbdi->xbdi_proto = XBDIP_NATIVE;
			proto = XEN_IO_PROTO_ABI_NATIVE;
		} else if (strcmp(xsproto, XEN_IO_PROTO_ABI_X86_32) == 0) {
			xbdi->xbdi_proto = XBDIP_32;
			proto = XEN_IO_PROTO_ABI_X86_32;
		} else if (strcmp(xsproto, XEN_IO_PROTO_ABI_X86_64) == 0) {
			xbdi->xbdi_proto = XBDIP_64;
			proto = XEN_IO_PROTO_ABI_X86_64;
		} else {
			aprint_error("xbd domain %d: unknown proto %s\n",
			    xbdi->xbdi_domid, xsproto);
			free(xsproto, M_DEVBUF);
			return -1;
		}
		free(xsproto, M_DEVBUF);
	}

	/* allocate VA space and map rings */
	xbdi->xbdi_ring_va = uvm_km_alloc(kernel_map, PAGE_SIZE, 0,
	    UVM_KMF_VAONLY);
	if (xbdi->xbdi_ring_va == 0) {
		xenbus_dev_fatal(xbusd, ENOMEM,
		    "can't get VA for ring", xbusd->xbusd_otherend);
		return -1;
	}
	XENPRINTF(("xbdback %s: connect va 0x%" PRIxVADDR "\n", xbusd->xbusd_path, xbdi->xbdi_ring_va));

	grop.host_addr = xbdi->xbdi_ring_va;
	grop.flags = GNTMAP_host_map;
	grop.ref = ring_ref;
	grop.dom = xbdi->xbdi_domid;
	err = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
	    &grop, 1);
	if (err || grop.status) {
		aprint_error("xbdback %s: can't map grant ref: %d/%d\n",
		    xbusd->xbusd_path, err, grop.status);
		xenbus_dev_fatal(xbusd, EINVAL,
		    "can't map ring", xbusd->xbusd_otherend);
		goto err;
	}
	xbdi->xbdi_ring_handle = grop.handle;
	XENPRINTF(("xbdback %s: connect grhandle %d\n", xbusd->xbusd_path, grop.handle));

	switch(xbdi->xbdi_proto) {
	case XBDIP_NATIVE:
	{
		blkif_sring_t *sring = (void *)xbdi->xbdi_ring_va;
		BACK_RING_INIT(&xbdi->xbdi_ring.ring_n, sring, PAGE_SIZE);
		break;
	}
	case XBDIP_32:
	{
		blkif_x86_32_sring_t *sring = (void *)xbdi->xbdi_ring_va;
		BACK_RING_INIT(&xbdi->xbdi_ring.ring_32, sring, PAGE_SIZE);
		break;
	}
	case XBDIP_64:
	{
		blkif_x86_64_sring_t *sring = (void *)xbdi->xbdi_ring_va;
		BACK_RING_INIT(&xbdi->xbdi_ring.ring_64, sring, PAGE_SIZE);
		break;
	}
	}

	evop.cmd = EVTCHNOP_bind_interdomain;
	evop.u.bind_interdomain.remote_dom = xbdi->xbdi_domid;
	evop.u.bind_interdomain.remote_port = revtchn;
	err = HYPERVISOR_event_channel_op(&evop);
	if (err) {
		aprint_error("blkback %s: "
		    "can't get event channel: %d\n",
		    xbusd->xbusd_otherend, err);
		xenbus_dev_fatal(xbusd, err,
		    "can't bind event channel", xbusd->xbusd_otherend);
		goto err2;
	}
	xbdi->xbdi_evtchn = evop.u.bind_interdomain.local_port;
	XENPRINTF(("xbdback %s: connect evchannel %d\n", xbusd->xbusd_path, xbdi->xbdi_evtchn));

	event_set_handler(xbdi->xbdi_evtchn, xbdback_evthandler,
	    xbdi, IPL_BIO, xbdi->xbdi_name);
	aprint_verbose("xbd backend domain %d handle %#x (%d) "
	    "using event channel %d, protocol %s\n", xbdi->xbdi_domid,
	    xbdi->xbdi_handle, xbdi->xbdi_handle, xbdi->xbdi_evtchn, proto);

	/* enable the xbdback event handler machinery */
	xbdi->xbdi_status = WAITING;
	hypervisor_enable_event(xbdi->xbdi_evtchn);
	hypervisor_notify_via_evtchn(xbdi->xbdi_evtchn);

	if (kthread_create(IPL_NONE, KTHREAD_MPSAFE, NULL,
	    xbdback_thread, xbdi, NULL, "%s", xbdi->xbdi_name) == 0)
		return 0;

err2:
	/* unmap ring */
	ungrop.host_addr = xbdi->xbdi_ring_va;
	ungrop.handle = xbdi->xbdi_ring_handle;
	ungrop.dev_bus_addr = 0;
	err = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
	    &ungrop, 1);
	if (err)
		aprint_error("xbdback %s: unmap_grant_ref failed: %d\n",
		    xbusd->xbusd_path, err);

err:
	/* free ring VA space */
	uvm_km_free(kernel_map, xbdi->xbdi_ring_va, PAGE_SIZE, UVM_KMF_VAONLY);
	return -1;
}

/*
 * Signal an xbdback thread to disconnect. Done in 'xenwatch' thread context.
 */
static void
xbdback_disconnect(struct xbdback_instance *xbdi)
{

	mutex_enter(&xbdi->xbdi_lock);
	if (xbdi->xbdi_status == DISCONNECTED) {
		mutex_exit(&xbdi->xbdi_lock);
		return;
	}
	hypervisor_mask_event(xbdi->xbdi_evtchn);
	event_remove_handler(xbdi->xbdi_evtchn, xbdback_evthandler,
	    xbdi);

	/* signal thread that we want to disconnect, then wait for it */
	xbdi->xbdi_status = DISCONNECTING;
	cv_signal(&xbdi->xbdi_cv);

	while (xbdi->xbdi_status != DISCONNECTED)
		cv_wait(&xbdi->xbdi_cv, &xbdi->xbdi_lock);

	mutex_exit(&xbdi->xbdi_lock);

	xenbus_switch_state(xbdi->xbdi_xbusd, NULL, XenbusStateClosing);
}

static void
xbdback_frontend_changed(void *arg, XenbusState new_state)
{
	struct xbdback_instance *xbdi = arg;
	struct xenbus_device *xbusd = xbdi->xbdi_xbusd;

	XENPRINTF(("xbdback %s: new state %d\n", xbusd->xbusd_path, new_state));
	switch(new_state) {
	case XenbusStateInitialising:
		break;
	case XenbusStateInitialised:
	case XenbusStateConnected:
		if (xbdi->xbdi_status == WAITING || xbdi->xbdi_status == RUN)
			break;
		xbdback_connect(xbdi);
		break;
	case XenbusStateClosing:
		xbdback_disconnect(xbdi);
		break;
	case XenbusStateClosed:
		/* otherend_changed() should handle it for us */
		panic("xbdback_frontend_changed: closed\n");
	case XenbusStateUnknown:
	case XenbusStateInitWait:
	default:
		aprint_error("xbdback %s: invalid frontend state %d\n",
		    xbusd->xbusd_path, new_state);
	}
	return;
}

static void
xbdback_backend_changed(struct xenbus_watch *watch,
    const char **vec, unsigned int len)
{
	struct xenbus_device *xbusd = watch->xbw_dev;
	struct xbdback_instance *xbdi = xbusd->xbusd_u.b.b_cookie;
	int err;
	long dev;
	char *mode;
	struct xenbus_transaction *xbt;
	const char *devname;
	int major;

	err = xenbus_read_ul(NULL, xbusd->xbusd_path, "physical-device",
	    &dev, 10);
	/*
	 * An error can occur as the watch can fire up just after being
	 * registered. So we have to ignore the error :(
	 */
	if (err)
		return;
	/*
	 * we can also fire up after having opened the device, don't try
	 * to do it twice.
	 */
	if (xbdi->xbdi_vp != NULL) {
		if (xbdi->xbdi_status == WAITING || xbdi->xbdi_status == RUN) {
			if (xbdi->xbdi_dev != dev) {
				printf("xbdback %s: changing physical device "
				    "from %#"PRIx64" to %#lx not supported\n",
				    xbusd->xbusd_path, xbdi->xbdi_dev, dev);
			}
		}
		return;
	}
	xbdi->xbdi_dev = dev;
	err = xenbus_read(NULL, xbusd->xbusd_path, "mode", NULL, &mode);
	if (err) {
		printf("xbdback: failed to read %s/mode: %d\n",
		    xbusd->xbusd_path, err);
		return;
	}
	if (mode[0] == 'w')
		xbdi->xbdi_ro = false;
	else
		xbdi->xbdi_ro = true;
	free(mode, M_DEVBUF);
	major = major(xbdi->xbdi_dev);
	devname = devsw_blk2name(major);
	if (devname == NULL) {
		printf("xbdback %s: unknown device 0x%"PRIx64"\n",
		    xbusd->xbusd_path, xbdi->xbdi_dev);
		return;
	}
	xbdi->xbdi_bdevsw = bdevsw_lookup(xbdi->xbdi_dev);
	if (xbdi->xbdi_bdevsw == NULL) {
		printf("xbdback %s: no bdevsw for device 0x%"PRIx64"\n",
		    xbusd->xbusd_path, xbdi->xbdi_dev);
		return;
	}
	err = bdevvp(xbdi->xbdi_dev, &xbdi->xbdi_vp);
	if (err) {
		printf("xbdback %s: can't open device 0x%"PRIx64": %d\n",
		    xbusd->xbusd_path, xbdi->xbdi_dev, err);
		return;
	}
	err = vn_lock(xbdi->xbdi_vp, LK_EXCLUSIVE | LK_RETRY);
	if (err) {
		printf("xbdback %s: can't vn_lock device 0x%"PRIx64": %d\n",
		    xbusd->xbusd_path, xbdi->xbdi_dev, err);
		vrele(xbdi->xbdi_vp);
		return;
	}
	err = VOP_OPEN(xbdi->xbdi_vp, FREAD, NOCRED);
	if (err) {
		printf("xbdback %s: can't VOP_OPEN device 0x%"PRIx64": %d\n",
		    xbusd->xbusd_path, xbdi->xbdi_dev, err);
		vput(xbdi->xbdi_vp);
		return;
	}
	VOP_UNLOCK(xbdi->xbdi_vp);

	/* dk device; get wedge data */
	struct dkwedge_info wi;
	if ((err = getdiskinfo(xbdi->xbdi_vp, &wi)) == 0) {
		xbdi->xbdi_size = wi.dkw_size;
		printf("xbd backend: attach device %s (size %" PRIu64 ") "
		    "for domain %d\n", wi.dkw_devname, xbdi->xbdi_size,
		    xbdi->xbdi_domid);
	} else {
		/* If we can't get the wedge info, set device size to 0 and return */
		printf("xbdback %s: can't DIOCGWEDGEINFO device "
		    "0x%"PRIx64": %d\n", xbusd->xbusd_path,
		    xbdi->xbdi_dev, err);
		xbdi->xbdi_size = xbdi->xbdi_dev = 0;
		vn_close(xbdi->xbdi_vp, FREAD, NOCRED);
		xbdi->xbdi_vp = NULL;
		return;
	}
again:
	xbt = xenbus_transaction_start();
	if (xbt == NULL) {
		printf("xbdback %s: can't start transaction\n",
		    xbusd->xbusd_path);
		return;
	}
	err = xenbus_printf(xbt, xbusd->xbusd_path, "sectors", "%" PRIu64 ,
	    xbdi->xbdi_size);
	if (err) {
		printf("xbdback: failed to write %s/sectors: %d\n",
		    xbusd->xbusd_path, err);
		goto abort;
	}
	err = xenbus_printf(xbt, xbusd->xbusd_path, "info", "%u",
	    xbdi->xbdi_ro ? VDISK_READONLY : 0);
	if (err) {
		printf("xbdback: failed to write %s/info: %d\n",
		    xbusd->xbusd_path, err);
		goto abort;
	}
	err = xenbus_printf(xbt, xbusd->xbusd_path, "sector-size", "%lu",
	    (u_long)DEV_BSIZE);
	if (err) {
		printf("xbdback: failed to write %s/sector-size: %d\n",
		    xbusd->xbusd_path, err);
		goto abort;
	}
	err = xenbus_printf(xbt, xbusd->xbusd_path, "feature-flush-cache",
	    "%u", 1);
	if (err) {
		printf("xbdback: failed to write %s/feature-flush-cache: %d\n",
		    xbusd->xbusd_path, err);
		goto abort;
	}
	err = xenbus_transaction_end(xbt, 0);
	if (err == EAGAIN)
		goto again;
	if (err) {
		printf("xbdback %s: can't end transaction: %d\n",
		    xbusd->xbusd_path, err);
	}
	err = xenbus_switch_state(xbusd, NULL, XenbusStateConnected);
	if (err) {
		printf("xbdback %s: can't switch state: %d\n",
		    xbusd->xbusd_path, err);
	}
	return;
abort:
	xenbus_transaction_end(xbt, 1);
}

/*
 * Used by an xbdi thread to signal that it is now disconnected.
 */
static void
xbdback_finish_disconnect(struct xbdback_instance *xbdi)
{
	KASSERT(mutex_owned(&xbdi->xbdi_lock));
	KASSERT(xbdi->xbdi_status == DISCONNECTING);

	xbdi->xbdi_status = DISCONNECTED;

	cv_signal(&xbdi->xbdi_cv);
}

static struct xbdback_instance *
xbdif_lookup(domid_t dom, uint32_t handle)
{
	struct xbdback_instance *xbdi;

	SLIST_FOREACH(xbdi, &xbdback_instances, next) {
		if (xbdi->xbdi_domid == dom && xbdi->xbdi_handle == handle)
			return xbdi;
	}
	return NULL;
}

static int
xbdback_evthandler(void *arg)
{
	struct xbdback_instance *xbdi = arg;

	XENPRINTF(("xbdback_evthandler domain %d: cont %p\n",
	    xbdi->xbdi_domid, xbdi->xbdi_cont));

	xbdback_wakeup_thread(xbdi);

	return 1;
}

/*
 * Main thread routine for one xbdback instance. Woken up by
 * xbdback_evthandler when a domain has I/O work scheduled in an I/O ring.
 */
static void
xbdback_thread(void *arg)
{
	struct xbdback_instance *xbdi = arg;

	for (;;) {
		mutex_enter(&xbdi->xbdi_lock);
		switch (xbdi->xbdi_status) {
		case WAITING:
			cv_wait(&xbdi->xbdi_cv, &xbdi->xbdi_lock);
			mutex_exit(&xbdi->xbdi_lock);
			break;
		case RUN:
			xbdi->xbdi_status = WAITING; /* reset state */
			mutex_exit(&xbdi->xbdi_lock);

			if (xbdi->xbdi_cont == NULL) {
				xbdi->xbdi_cont = xbdback_co_main;
			}

			xbdback_trampoline(xbdi, xbdi);
			break;
		case DISCONNECTING:
			if (xbdi->xbdi_pendingreqs > 0) {
				/* there are pending I/Os. Wait for them. */
				cv_wait(&xbdi->xbdi_cv, &xbdi->xbdi_lock);
				mutex_exit(&xbdi->xbdi_lock);
				break;
			}

			/* All I/Os should have been processed by now,
			 * xbdi_refcnt should drop to 0 */
			xbdi_put(xbdi);
			KASSERT(xbdi->xbdi_refcnt == 0);
			mutex_exit(&xbdi->xbdi_lock);
			kthread_exit(0);
			break;
		default:
			panic("%s: invalid state %d",
			    xbdi->xbdi_name, xbdi->xbdi_status);
		}
	}
}

static void *
xbdback_co_main(struct xbdback_instance *xbdi, void *obj)
{
	(void)obj;

	xbdi->xbdi_req_prod = xbdi->xbdi_ring.ring_n.sring->req_prod;
	xen_rmb(); /* ensure we see all requests up to req_prod */
	/*
	 * note that we'll eventually get a full ring of requests.
	 * in this case, MASK_BLKIF_IDX(req_cons) == MASK_BLKIF_IDX(req_prod)
	 */
	xbdi->xbdi_cont = xbdback_co_main_loop;
	return xbdi;
}

/*
 * Fetch a blkif request from the ring, and pass control to the appropriate
 * continuation.
 * If someone asked for disconnection, do not fetch any more requests from
 * the ring.
 */
static void *
xbdback_co_main_loop(struct xbdback_instance *xbdi, void *obj)
{
	blkif_request_t *req;
	blkif_x86_32_request_t *req32;
	blkif_x86_64_request_t *req64;

	(void)obj;
	req = &xbdi->xbdi_xen_req;
	if (xbdi->xbdi_ring.ring_n.req_cons != xbdi->xbdi_req_prod) {
		switch(xbdi->xbdi_proto) {
		case XBDIP_NATIVE:
			memcpy(req, RING_GET_REQUEST(&xbdi->xbdi_ring.ring_n,
			    xbdi->xbdi_ring.ring_n.req_cons),
			    sizeof(blkif_request_t));
			break;
		case XBDIP_32:
			req32 = RING_GET_REQUEST(&xbdi->xbdi_ring.ring_32,
			    xbdi->xbdi_ring.ring_n.req_cons);
			req->operation = req32->operation;
			req->nr_segments = req32->nr_segments;
			req->handle = req32->handle;
			req->id = req32->id;
			req->sector_number = req32->sector_number;
			break;

		case XBDIP_64:
			req64 = RING_GET_REQUEST(&xbdi->xbdi_ring.ring_64,
			    xbdi->xbdi_ring.ring_n.req_cons);
			req->operation = req64->operation;
			req->nr_segments = req64->nr_segments;
			req->handle = req64->handle;
			req->id = req64->id;
			req->sector_number = req64->sector_number;
			break;
		}
		XENPRINTF(("xbdback op %d req_cons 0x%x req_prod 0x%x "
		    "resp_prod 0x%x id %" PRIu64 "\n", req->operation,
		    xbdi->xbdi_ring.ring_n.req_cons,
		    xbdi->xbdi_req_prod,
		    xbdi->xbdi_ring.ring_n.rsp_prod_pvt,
		    req->id));
		switch(req->operation) {
		case BLKIF_OP_READ:
		case BLKIF_OP_WRITE:
			xbdi->xbdi_cont = xbdback_co_io;
			break;
		case BLKIF_OP_FLUSH_DISKCACHE:
			xbdi_get(xbdi);
			xbdi->xbdi_cont = xbdback_co_cache_flush;
			break;
		default:
			if (ratecheck(&xbdi->xbdi_lasterr_time,
			    &xbdback_err_intvl)) {
				printf("%s: unknown operation %d\n",
				    xbdi->xbdi_name, req->operation);
			}
			xbdback_send_reply(xbdi, req->id, req->operation,
			    BLKIF_RSP_ERROR);
			xbdi->xbdi_cont = xbdback_co_main_incr;
			break;
		}
	} else {
		xbdi->xbdi_cont = xbdback_co_main_done;
	}
	return xbdi;
}

/*
 * Increment consumer index and move on to the next request. In case
 * we want to disconnect, leave continuation now.
 */
static void *
xbdback_co_main_incr(struct xbdback_instance *xbdi, void *obj)
{
	(void)obj;
	blkif_back_ring_t *ring = &xbdi->xbdi_ring.ring_n;

	ring->req_cons++;

	/*
	 * Do not bother with locking here when checking for xbdi_status: if
	 * we get a transient state, we will get the right value at
	 * the next increment.
	 */
	if (xbdi->xbdi_status == DISCONNECTING)
		xbdi->xbdi_cont = NULL;
	else
		xbdi->xbdi_cont = xbdback_co_main_loop;

	/*
	 * Each time the thread processes a full ring of requests, give
	 * a chance to other threads to process I/Os too
	 */
	if ((ring->req_cons % BLKIF_RING_SIZE) == 0)
		yield();

	return xbdi;
}

/*
 * Ring processing is over. If there are any I/O still present for this
 * instance, handle them first.
 */
static void *
xbdback_co_main_done(struct xbdback_instance *xbdi, void *obj)
{
	(void)obj;
	if (xbdi->xbdi_io != NULL) {
		KASSERT(xbdi->xbdi_io->xio_operation == BLKIF_OP_READ ||
		    xbdi->xbdi_io->xio_operation == BLKIF_OP_WRITE);
		xbdi->xbdi_cont = xbdback_co_map_io;
		xbdi->xbdi_cont_aux = xbdback_co_main_done2;
	} else {
		xbdi->xbdi_cont = xbdback_co_main_done2;
	}
	return xbdi;
}

/*
 * Check for requests in the instance's ring. In case there are, start again
 * from the beginning. If not, stall.
 */
static void *
xbdback_co_main_done2(struct xbdback_instance *xbdi, void *obj)
{
	int work_to_do;

	RING_FINAL_CHECK_FOR_REQUESTS(&xbdi->xbdi_ring.ring_n, work_to_do);
	if (work_to_do)
		xbdi->xbdi_cont = xbdback_co_main;
	else
		xbdi->xbdi_cont = NULL;

	return xbdi;
}

/*
 * Frontend requested a cache flush operation.
 */
static void *
xbdback_co_cache_flush(struct xbdback_instance *xbdi, void *obj)
{
	(void)obj;

	XENPRINTF(("xbdback_co_cache_flush %p %p\n", xbdi, obj));
	if (xbdi->xbdi_io != NULL) {
		/* Some I/Os are required for this instance. Process them. */
		KASSERT(xbdi->xbdi_io->xio_operation == BLKIF_OP_READ ||
		    xbdi->xbdi_io->xio_operation == BLKIF_OP_WRITE);
		KASSERT(xbdi->xbdi_pendingreqs > 0);
		xbdi->xbdi_cont = xbdback_co_map_io;
		xbdi->xbdi_cont_aux = xbdback_co_cache_flush2;
	} else {
		xbdi->xbdi_cont = xbdback_co_cache_flush2;
	}
	return xbdi;
}

static void *
xbdback_co_cache_flush2(struct xbdback_instance *xbdi, void *obj)
{
	(void)obj;
	XENPRINTF(("xbdback_co_cache_flush2 %p %p\n", xbdi, obj));
	if (xbdi->xbdi_pendingreqs > 0) {
		/*
		 * There are pending requests.
		 * Event or iodone() will restart processing
		 */
		xbdi->xbdi_cont = NULL;
		xbdi_put(xbdi);
		return NULL;
	}
	xbdi->xbdi_cont = xbdback_co_cache_doflush;
	return xbdback_pool_get(&xbdback_io_pool, xbdi);
}

/* Start the flush work */
static void *
xbdback_co_cache_doflush(struct xbdback_instance *xbdi, void *obj)
{
	struct xbdback_io *xbd_io;

	XENPRINTF(("xbdback_co_cache_doflush %p %p\n", xbdi, obj));
	xbd_io = xbdi->xbdi_io = obj;
	xbd_io->xio_xbdi = xbdi;
	xbd_io->xio_operation = xbdi->xbdi_xen_req.operation;
	xbd_io->xio_flush_id = xbdi->xbdi_xen_req.id;
	xbdi->xbdi_cont = xbdback_co_do_io;
	return xbdi;
}

/*
 * A read or write I/O request must be processed. Do some checks first,
 * then get the segment information directly from the ring request.
 */
static void *
xbdback_co_io(struct xbdback_instance *xbdi, void *obj)
{
	int i, error;
	blkif_request_t *req;
	blkif_x86_32_request_t *req32;
	blkif_x86_64_request_t *req64;

	(void)obj;

	/* some sanity checks */
	req = &xbdi->xbdi_xen_req;
	if (req->nr_segments < 1 ||
	    req->nr_segments > BLKIF_MAX_SEGMENTS_PER_REQUEST) {
		if (ratecheck(&xbdi->xbdi_lasterr_time,
		    &xbdback_err_intvl)) {
			printf("%s: invalid number of segments: %d\n",
			    xbdi->xbdi_name,
			    xbdi->xbdi_xen_req.nr_segments);
		}
		error = EINVAL;
		goto end;
	}

	KASSERT(req->operation == BLKIF_OP_READ ||
	    req->operation == BLKIF_OP_WRITE);
	if (req->operation == BLKIF_OP_WRITE) {
		if (xbdi->xbdi_ro) {
			error = EROFS;
			goto end;
		}
	}

	xbdi->xbdi_segno = 0;

	/* copy request segments */
	switch(xbdi->xbdi_proto) {
	case XBDIP_NATIVE:
		/* already copied in xbdback_co_main_loop */
		break;
	case XBDIP_32:
		req32 = RING_GET_REQUEST(&xbdi->xbdi_ring.ring_32,
		    xbdi->xbdi_ring.ring_n.req_cons);
		for (i = 0; i < req->nr_segments; i++)
			req->seg[i] = req32->seg[i];
		break;
	case XBDIP_64:
		req64 = RING_GET_REQUEST(&xbdi->xbdi_ring.ring_64,
		    xbdi->xbdi_ring.ring_n.req_cons);
		for (i = 0; i < req->nr_segments; i++)
			req->seg[i] = req64->seg[i];
		break;
	}

	xbdi->xbdi_cont = xbdback_co_io_gotreq;
	return xbdback_pool_get(&xbdback_request_pool, xbdi);

 end:
	xbdback_send_reply(xbdi, xbdi->xbdi_xen_req.id,
	    xbdi->xbdi_xen_req.operation, error);
	xbdi->xbdi_cont = xbdback_co_main_incr;
	return xbdi;
}

/*
 * We have fetched segment requests from the ring. In case there are already
 * I/Os prepared for this instance, we can try coalescing the requests
 * with these I/Os.
 */
static void *
xbdback_co_io_gotreq(struct xbdback_instance *xbdi, void *obj)
{
	struct xbdback_request *xrq;

	xrq = xbdi->xbdi_req = obj;

	xrq->rq_xbdi = xbdi;
	xrq->rq_iocount = 0;
	xrq->rq_ioerrs = 0;
	xrq->rq_id = xbdi->xbdi_xen_req.id;
	xrq->rq_operation = xbdi->xbdi_xen_req.operation;
	KASSERT(xbdi->xbdi_req->rq_operation == BLKIF_OP_READ ||
	    xbdi->xbdi_req->rq_operation == BLKIF_OP_WRITE);

	/*
	 * Request-level reasons not to coalesce: different device,
	 * different op, or noncontiguous disk sectors (vs. previous
	 * request handed to us).
	 */
	xbdi->xbdi_cont = xbdback_co_io_loop;
	if (xbdi->xbdi_io != NULL) {
		struct xbdback_request *last_req;
		last_req = SLIST_FIRST(&xbdi->xbdi_io->xio_rq)->car;
		XENPRINTF(("xbdback_io domain %d: hoping for sector %" PRIu64
		    "; got %" PRIu64 "\n", xbdi->xbdi_domid,
		    xbdi->xbdi_next_sector,
		    xbdi->xbdi_xen_req.sector_number));
		if ((xrq->rq_operation != last_req->rq_operation)
		    || (xbdi->xbdi_xen_req.sector_number !=
		    xbdi->xbdi_next_sector)) {
			XENPRINTF(("xbdback_io domain %d: segment break\n",
			    xbdi->xbdi_domid));
			xbdi->xbdi_next_sector =
			    xbdi->xbdi_xen_req.sector_number;
			KASSERT(xbdi->xbdi_io->xio_operation == BLKIF_OP_READ ||
			    xbdi->xbdi_io->xio_operation == BLKIF_OP_WRITE);
			xbdi->xbdi_cont_aux = xbdback_co_io_loop;
			xbdi->xbdi_cont = xbdback_co_map_io;
		}
	} else {
		xbdi->xbdi_next_sector = xbdi->xbdi_xen_req.sector_number;
	}
	return xbdi;
}

/* Handle coalescing of multiple segment requests into one I/O work */
static void *
xbdback_co_io_loop(struct xbdback_instance *xbdi, void *obj)
{
	(void)obj;
	KASSERT(xbdi->xbdi_req->rq_operation == BLKIF_OP_READ ||
	    xbdi->xbdi_req->rq_operation == BLKIF_OP_WRITE);
	if (xbdi->xbdi_segno < xbdi->xbdi_xen_req.nr_segments) {
		uint8_t this_fs, this_ls, last_fs, last_ls;
		grant_ref_t thisgrt, lastgrt;
		/*
		 * Segment-level reason to coalesce: handling full
		 * pages, or adjacent sector ranges from the same page
		 * (and yes, this latter does happen). But not if the
		 * array of client pseudo-physical pages is full.
		 */
		this_fs = xbdi->xbdi_xen_req.seg[xbdi->xbdi_segno].first_sect;
		this_ls = xbdi->xbdi_xen_req.seg[xbdi->xbdi_segno].last_sect;
		thisgrt = xbdi->xbdi_xen_req.seg[xbdi->xbdi_segno].gref;
		XENPRINTF(("xbdback_io domain %d: "
		    "first,last_sect[%d]=0%o,0%o\n",
		    xbdi->xbdi_domid, xbdi->xbdi_segno,
		    this_fs, this_ls));
		last_fs = xbdi->xbdi_last_fs = xbdi->xbdi_this_fs;
		last_ls = xbdi->xbdi_last_ls = xbdi->xbdi_this_ls;
		lastgrt = xbdi->xbdi_lastgrt = xbdi->xbdi_thisgrt;
		xbdi->xbdi_this_fs = this_fs;
		xbdi->xbdi_this_ls = this_ls;
		xbdi->xbdi_thisgrt = thisgrt;
		if (xbdi->xbdi_io != NULL) {
			if (last_ls == VBD_MAXSECT
			    && this_fs == 0
			    && xbdi->xbdi_io->xio_nrma
			    < XENSHM_MAX_PAGES_PER_REQUEST) {
				xbdi->xbdi_same_page = 0;
			} else if (last_ls + 1
			    == this_fs
#ifdef notyet
			    && (last_fas & ~PAGE_MASK)
			    == (this_fas & ~PAGE_MASK)
#else
			    && 0 /* can't know frame number yet */
#endif
			    ) {
#ifdef DEBUG
				if (ratecheck(&xbdi->xbdi_lastfragio_time,
				    &xbdback_fragio_intvl))
					printf("%s: domain is sending"
					    " excessively fragmented I/O\n",
					    xbdi->xbdi_name);
#endif
				printf("xbdback_io: would maybe glue "
				    "same page sec %d (%d->%d)\n",
				    xbdi->xbdi_segno, this_fs, this_ls);
				XENPRINTF(("xbdback_io domain %d: glue same "
				    "page", xbdi->xbdi_domid));
				panic("notyet!");
				xbdi->xbdi_same_page = 1;
			} else {
				KASSERT(xbdi->xbdi_io->xio_operation ==
				    BLKIF_OP_READ ||
				    xbdi->xbdi_io->xio_operation ==
				    BLKIF_OP_WRITE);
				xbdi->xbdi_cont_aux = xbdback_co_io_loop;
				xbdi->xbdi_cont = xbdback_co_map_io;
				return xbdi;
			}
		} else
			xbdi->xbdi_same_page = 0;

		if (xbdi->xbdi_io == NULL) {
			xbdi->xbdi_cont = xbdback_co_io_gotio;
			return xbdback_pool_get(&xbdback_io_pool, xbdi);
		} else {
			xbdi->xbdi_cont = xbdback_co_io_gotio2;
		}
	} else {
		/* done with the loop over segments; get next request */
		xbdi->xbdi_cont = xbdback_co_main_incr;
	}
	return xbdi;
}

/* Prepare an I/O buffer for a xbdback instance */
static void *
xbdback_co_io_gotio(struct xbdback_instance *xbdi, void *obj)
{
	struct xbdback_io *xbd_io;
	vaddr_t start_offset; /* start offset in vm area */
	int buf_flags;

	xbdi_get(xbdi);
	atomic_inc_uint(&xbdi->xbdi_pendingreqs);

	xbd_io = xbdi->xbdi_io = obj;
	buf_init(&xbd_io->xio_buf);
	xbd_io->xio_xbdi = xbdi;
	SLIST_INIT(&xbd_io->xio_rq);
	xbd_io->xio_nrma = 0;
	xbd_io->xio_mapped = 0;
	xbd_io->xio_operation = xbdi->xbdi_xen_req.operation;

	start_offset = xbdi->xbdi_this_fs * VBD_BSIZE;

	if (xbdi->xbdi_xen_req.operation == BLKIF_OP_WRITE) {
		buf_flags = B_WRITE;
	} else {
		buf_flags = B_READ;
	}

	xbd_io->xio_buf.b_flags = buf_flags;
	xbd_io->xio_buf.b_cflags = 0;
	xbd_io->xio_buf.b_oflags = 0;
	xbd_io->xio_buf.b_iodone = xbdback_iodone;
	xbd_io->xio_buf.b_proc = NULL;
	xbd_io->xio_buf.b_vp = xbdi->xbdi_vp;
	xbd_io->xio_buf.b_objlock = xbdi->xbdi_vp->v_interlock;
	xbd_io->xio_buf.b_dev = xbdi->xbdi_dev;
	xbd_io->xio_buf.b_blkno = xbdi->xbdi_next_sector;
	xbd_io->xio_buf.b_bcount = 0;
	xbd_io->xio_buf.b_data = (void *)start_offset;
	xbd_io->xio_buf.b_private = xbd_io;

	xbdi->xbdi_cont = xbdback_co_io_gotio2;
	return xbdi;
}

/* Manage fragments */
static void *
xbdback_co_io_gotio2(struct xbdback_instance *xbdi, void *obj)
{
	(void)obj;
	if (xbdi->xbdi_segno == 0 || SLIST_EMPTY(&xbdi->xbdi_io->xio_rq)) {
		/* if this is the first segment of a new request */
		/* or if it's the first segment of the io */
		xbdi->xbdi_cont = xbdback_co_io_gotfrag;
		return xbdback_pool_get(&xbdback_fragment_pool, xbdi);
	}
	xbdi->xbdi_cont = xbdback_co_io_gotfrag2;
	return xbdi;
}

/* Prepare the instance for its first fragment */
static void *
xbdback_co_io_gotfrag(struct xbdback_instance *xbdi, void *obj)
{
	struct xbdback_fragment *xbd_fr;

	xbd_fr = obj;
	xbd_fr->car = xbdi->xbdi_req;
	SLIST_INSERT_HEAD(&xbdi->xbdi_io->xio_rq, xbd_fr, cdr);
	++xbdi->xbdi_req->rq_iocount;

	xbdi->xbdi_cont = xbdback_co_io_gotfrag2;
	return xbdi;
}

/* Last routine to manage segment fragments for one I/O */
static void *
xbdback_co_io_gotfrag2(struct xbdback_instance *xbdi, void *obj)
{
	struct xbdback_io *xbd_io;
	int seg_size;
	uint8_t this_fs, this_ls;

	this_fs = xbdi->xbdi_this_fs;
	this_ls = xbdi->xbdi_this_ls;
	xbd_io = xbdi->xbdi_io;
	seg_size = this_ls - this_fs + 1;

	if (seg_size < 0) {
		printf("xbdback_io domain %d: negative-size request (%d %d)\n",
		    xbdi->xbdi_domid, this_ls, this_fs);
		xbdback_io_error(xbdi->xbdi_io, EINVAL);
		xbdi->xbdi_io = NULL;
		xbdi->xbdi_cont = xbdback_co_main_incr;
		return xbdi;
	}

	if (!xbdi->xbdi_same_page) {
		XENPRINTF(("xbdback_io domain %d: appending grant %u\n",
		    xbdi->xbdi_domid, (u_int)xbdi->xbdi_thisgrt));
		xbd_io->xio_gref[xbd_io->xio_nrma++] = xbdi->xbdi_thisgrt;
	}

	xbd_io->xio_buf.b_bcount += (daddr_t)(seg_size * VBD_BSIZE);
	XENPRINTF(("xbdback_io domain %d: start sect %d size %d\n",
	    xbdi->xbdi_domid, (int)xbdi->xbdi_next_sector, seg_size));

	/* Finally, the end of the segment loop! */
	xbdi->xbdi_next_sector += seg_size;
	++xbdi->xbdi_segno;
	xbdi->xbdi_cont = xbdback_co_io_loop;
	return xbdi;
}

/*
 * Map the different I/O requests in backend's VA space.
 */
static void *
xbdback_co_map_io(struct xbdback_instance *xbdi, void *obj)
{
	(void)obj;
	XENPRINTF(("xbdback_io domain %d: flush sect %ld size %d ptr 0x%lx\n",
	    xbdi->xbdi_domid, (long)xbdi->xbdi_io->xio_buf.b_blkno,
	    (int)xbdi->xbdi_io->xio_buf.b_bcount, (long)xbdi->xbdi_io));
	xbdi->xbdi_cont = xbdback_co_do_io;
	return xbdback_map_shm(xbdi->xbdi_io);
}

static void
xbdback_io_error(struct xbdback_io *xbd_io, int error)
{
	xbd_io->xio_buf.b_error = error;
	xbdback_iodone(&xbd_io->xio_buf);
}

/*
 * Main xbdback I/O routine. It can either perform a flush operation or
 * schedule a read/write operation.
 */
static void *
xbdback_co_do_io(struct xbdback_instance *xbdi, void *obj)
{
	struct xbdback_io *xbd_io = xbdi->xbdi_io;

	switch (xbd_io->xio_operation) {
	case BLKIF_OP_FLUSH_DISKCACHE:
	{
		int error;
		int force = 1;

		error = VOP_IOCTL(xbdi->xbdi_vp, DIOCCACHESYNC, &force, FWRITE,
		    kauth_cred_get());
		if (error) {
			aprint_error("xbdback %s: DIOCCACHESYNC returned %d\n",
			    xbdi->xbdi_xbusd->xbusd_path, error);
			if (error == EOPNOTSUPP || error == ENOTTY)
				error = BLKIF_RSP_EOPNOTSUPP;
			else
				error = BLKIF_RSP_ERROR;
		} else
			error = BLKIF_RSP_OKAY;
		xbdback_send_reply(xbdi, xbd_io->xio_flush_id,
		    xbd_io->xio_operation, error);
		xbdback_pool_put(&xbdback_io_pool, xbd_io);
		xbdi_put(xbdi);
		xbdi->xbdi_io = NULL;
		xbdi->xbdi_cont = xbdback_co_main_incr;
		return xbdi;
	}
	case BLKIF_OP_READ:
	case BLKIF_OP_WRITE:
		xbd_io->xio_buf.b_data = (void *)
		    ((vaddr_t)xbd_io->xio_buf.b_data + xbd_io->xio_vaddr);
#ifdef DIAGNOSTIC
		{
		vaddr_t bdata = (vaddr_t)xbd_io->xio_buf.b_data;
		int nsegs =
		    ((((bdata + xbd_io->xio_buf.b_bcount - 1) & ~PAGE_MASK) -
		    (bdata & ~PAGE_MASK)) >> PAGE_SHIFT) + 1;
		if ((bdata & ~PAGE_MASK) != (xbd_io->xio_vaddr & ~PAGE_MASK)) {
			printf("xbdback_co_do_io: vaddr %#" PRIxVADDR
			    " bdata %#" PRIxVADDR "\n",
			    xbd_io->xio_vaddr, bdata);
			panic("xbdback_co_do_io: bdata page change");
		}
		if (nsegs > xbd_io->xio_nrma) {
			printf("xbdback_co_do_io: vaddr %#" PRIxVADDR
			    " bcount %#x doesn't fit in %d pages\n",
			    bdata, xbd_io->xio_buf.b_bcount, xbd_io->xio_nrma);
			panic("xbdback_co_do_io: not enough pages");
		}
		}
#endif
		if ((xbd_io->xio_buf.b_flags & B_READ) == 0) {
			mutex_enter(xbd_io->xio_buf.b_vp->v_interlock);
			xbd_io->xio_buf.b_vp->v_numoutput++;
			mutex_exit(xbd_io->xio_buf.b_vp->v_interlock);
		}
		bdev_strategy(&xbd_io->xio_buf);
		/* will call xbdback_iodone() asynchronously when done */
		xbdi->xbdi_io = NULL;
		xbdi->xbdi_cont = xbdi->xbdi_cont_aux;
		return xbdi;
	default:
		/* Should never happen */
		panic("xbdback_co_do_io: unsupported operation %d",
		    xbd_io->xio_operation);
	}
}

/*
 * Called from softint(9) context when an I/O is done: for each request, send
 * back the associated reply to the domain.
 *
 * This gets reused by xbdback_io_error to report errors from other sources.
 */
static void
xbdback_iodone(struct buf *bp)
{
	struct xbdback_io *xbd_io;
	struct xbdback_instance *xbdi;
	int errp;

	xbd_io = bp->b_private;
	xbdi = xbd_io->xio_xbdi;

	XENPRINTF(("xbdback_io domain %d: iodone ptr 0x%lx\n",
	    xbdi->xbdi_domid, (long)xbd_io));

	if (xbd_io->xio_mapped == 1)
		xbdback_unmap_shm(xbd_io);

	if (bp->b_error != 0) {
		printf("xbd IO domain %d: error %d\n",
		    xbdi->xbdi_domid, bp->b_error);
		errp = 1;
	} else
		errp = 0;

	/* for each constituent xbd request */
	while(!SLIST_EMPTY(&xbd_io->xio_rq)) {
		struct xbdback_fragment *xbd_fr;
		struct xbdback_request *xbd_req;
		struct xbdback_instance *rxbdi;
		int error;

		xbd_fr = SLIST_FIRST(&xbd_io->xio_rq);
		xbd_req = xbd_fr->car;
		SLIST_REMOVE_HEAD(&xbd_io->xio_rq, cdr);
		xbdback_pool_put(&xbdback_fragment_pool, xbd_fr);

		if (errp)
			++xbd_req->rq_ioerrs;

		/* finalize it only if this was its last I/O */
		if (--xbd_req->rq_iocount > 0)
			continue;

		rxbdi = xbd_req->rq_xbdi;
		KASSERT(xbdi == rxbdi);

		error = xbd_req->rq_ioerrs > 0
		    ? BLKIF_RSP_ERROR
		    : BLKIF_RSP_OKAY;

		XENPRINTF(("xbdback_io domain %d: end request %"PRIu64
		    " error=%d\n",
		    xbdi->xbdi_domid, xbd_req->rq_id, error));
		xbdback_send_reply(xbdi, xbd_req->rq_id,
		    xbd_req->rq_operation, error);
		xbdback_pool_put(&xbdback_request_pool, xbd_req);
	}
	xbdi_put(xbdi);
	atomic_dec_uint(&xbdi->xbdi_pendingreqs);
	buf_destroy(&xbd_io->xio_buf);
	xbdback_pool_put(&xbdback_io_pool, xbd_io);

	xbdback_wakeup_thread(xbdi);
}

/*
 * Wake up the per xbdback instance thread.
 */
static void
xbdback_wakeup_thread(struct xbdback_instance *xbdi)
{

	mutex_enter(&xbdi->xbdi_lock);
	/* only set RUN state when we are WAITING for work */
	if (xbdi->xbdi_status == WAITING)
		xbdi->xbdi_status = RUN;
	mutex_exit(&xbdi->xbdi_lock);

	cv_broadcast(&xbdi->xbdi_cv);
}

/*
 * called once a request has completed. Place the reply in the ring and
 * notify the guest OS.
 */
static void
xbdback_send_reply(struct xbdback_instance *xbdi, uint64_t id,
    int op, int status)
{
	blkif_response_t *resp_n;
	blkif_x86_32_response_t *resp32;
	blkif_x86_64_response_t *resp64;
	int notify;

	/*
	 * The ring can be accessed by the xbdback thread, xbdback_iodone()
	 * handler, or any handler that triggered the shm callback. So
	 * protect ring access via the xbdi_lock mutex.
	 */
	mutex_enter(&xbdi->xbdi_lock);
	switch (xbdi->xbdi_proto) {
	case XBDIP_NATIVE:
		resp_n = RING_GET_RESPONSE(&xbdi->xbdi_ring.ring_n,
		    xbdi->xbdi_ring.ring_n.rsp_prod_pvt);
		resp_n->id = id;
		resp_n->operation = op;
		resp_n->status = status;
		break;
	case XBDIP_32:
		resp32 = RING_GET_RESPONSE(&xbdi->xbdi_ring.ring_32,
		    xbdi->xbdi_ring.ring_n.rsp_prod_pvt);
		resp32->id = id;
		resp32->operation = op;
		resp32->status = status;
		break;
	case XBDIP_64:
		resp64 = RING_GET_RESPONSE(&xbdi->xbdi_ring.ring_64,
		    xbdi->xbdi_ring.ring_n.rsp_prod_pvt);
		resp64->id = id;
		resp64->operation = op;
		resp64->status = status;
		break;
	}
	xbdi->xbdi_ring.ring_n.rsp_prod_pvt++;
	RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xbdi->xbdi_ring.ring_n, notify);
	mutex_exit(&xbdi->xbdi_lock);

	if (notify) {
		XENPRINTF(("xbdback_send_reply notify %d\n", xbdi->xbdi_domid));
		hypervisor_notify_via_evtchn(xbdi->xbdi_evtchn);
	}
}

/*
 * Map multiple entries of an I/O request into backend's VA space.
 * The xbd_io->xio_gref array has to be filled out by the caller.
 */
static void *
xbdback_map_shm(struct xbdback_io *xbd_io)
{
	struct xbdback_instance *xbdi;
	struct xbdback_request *xbd_rq;
	int error, s;

#ifdef XENDEBUG_VBD
	int i;
	printf("xbdback_map_shm map grant ");
	for (i = 0; i < xbd_io->xio_nrma; i++) {
		printf("%u ", (u_int)xbd_io->xio_gref[i]);
	}
#endif

	KASSERT(xbd_io->xio_mapped == 0);

	xbdi = xbd_io->xio_xbdi;
	xbd_rq = SLIST_FIRST(&xbd_io->xio_rq)->car;

	error = xen_shm_map(xbd_io->xio_nrma, xbdi->xbdi_domid,
	    xbd_io->xio_gref, &xbd_io->xio_vaddr, xbd_io->xio_gh,
	    (xbd_rq->rq_operation == BLKIF_OP_WRITE) ? XSHM_RO : 0);

	switch(error) {
	case 0:
#ifdef XENDEBUG_VBD
		printf("handle ");
		for (i = 0; i < xbd_io->xio_nrma; i++) {
			printf("%u ", (u_int)xbd_io->xio_gh[i]);
		}
		printf("\n");
#endif
		xbd_io->xio_mapped = 1;
		return xbdi;
	case ENOMEM:
		s = splvm();
		if (!xbdback_shmcb) {
			if (xen_shm_callback(xbdback_shm_callback, xbdi)
			    != 0) {
				splx(s);
				panic("xbdback_map_shm: "
				    "xen_shm_callback failed");
			}
			xbdback_shmcb = 1;
		}
		SIMPLEQ_INSERT_TAIL(&xbdback_shmq, xbdi, xbdi_on_hold);
		splx(s);
		/* Put the thread to sleep until the callback is called */
		xbdi->xbdi_cont = xbdback_co_wait_shm_callback;
		return NULL;
	default:
		printf("xbdback_map_shm: xen_shm error %d ", error);
		xbdback_io_error(xbdi->xbdi_io, error);
		xbdi->xbdi_io = NULL;
		xbdi->xbdi_cont = xbdi->xbdi_cont_aux;
		return xbdi;
	}
}

static int
xbdback_shm_callback(void *arg)
{
	int error, s;

	/*
	 * The shm callback may be executed at any level, including
	 * IPL_BIO and IPL_NET levels. Raise to the lowest priority level
	 * that can mask both.
	 */
	s = splvm();
	while(!SIMPLEQ_EMPTY(&xbdback_shmq)) {
		struct xbdback_instance *xbdi;
		struct xbdback_io *xbd_io;
		struct xbdback_request *xbd_rq;

		xbdi = SIMPLEQ_FIRST(&xbdback_shmq);
		xbd_io = xbdi->xbdi_io;
		xbd_rq = SLIST_FIRST(&xbd_io->xio_rq)->car;
		KASSERT(xbd_io->xio_mapped == 0);

		error = xen_shm_map(xbd_io->xio_nrma,
		    xbdi->xbdi_domid, xbd_io->xio_gref,
		    &xbd_io->xio_vaddr, xbd_io->xio_gh,
		    XSHM_CALLBACK |
		    ((xbd_rq->rq_operation == BLKIF_OP_WRITE) ? XSHM_RO: 0));
		switch(error) {
		case ENOMEM:
			splx(s);
			return -1; /* will try again later */
		case 0:
			SIMPLEQ_REMOVE_HEAD(&xbdback_shmq, xbdi_on_hold);
			xbd_io->xio_mapped = 1;
			xbdback_wakeup_thread(xbdi);
			break;
		default:
			SIMPLEQ_REMOVE_HEAD(&xbdback_shmq, xbdi_on_hold);
			printf("xbdback_shm_callback: xen_shm error %d\n",
			    error);
			xbdback_io_error(xbd_io, error);
			xbdi->xbdi_io = NULL;
			xbdback_wakeup_thread(xbdi);
			break;
		}
	}
	xbdback_shmcb = 0;
	splx(s);
	return 0;
}

/*
 * Allows waiting for the shm callback to complete.
 */
static void *
xbdback_co_wait_shm_callback(struct xbdback_instance *xbdi, void *obj)
{

	if (xbdi->xbdi_io == NULL || xbdi->xbdi_io->xio_mapped == 1) {
		/*
		 * Only proceed to next step when the callback reported
		 * success or failure.
		 */
		xbdi->xbdi_cont = xbdi->xbdi_cont_aux;
		return xbdi;
	} else {
		/* go back to sleep */
		return NULL;
	}
}

/* unmap a request from our virtual address space (request is done) */
static void
xbdback_unmap_shm(struct xbdback_io *xbd_io)
{
#ifdef XENDEBUG_VBD
	int i;
	printf("xbdback_unmap_shm handle ");
	for (i = 0; i < xbd_io->xio_nrma; i++) {
		printf("%u ", (u_int)xbd_io->xio_gh[i]);
	}
	printf("\n");
#endif

	KASSERT(xbd_io->xio_mapped == 1);
	xbd_io->xio_mapped = 0;
	xen_shm_unmap(xbd_io->xio_vaddr, xbd_io->xio_nrma,
	    xbd_io->xio_gh);
	xbd_io->xio_vaddr = -1;
}

/* Obtain memory from a pool */
static void *
xbdback_pool_get(struct xbdback_pool *pp,
    struct xbdback_instance *xbdi)
{
	return pool_cache_get(&pp->pc, PR_WAITOK);
}

/* Restore memory to a pool */
static void
xbdback_pool_put(struct xbdback_pool *pp, void *item)
{
	pool_cache_put(&pp->pc, item);
}

/*
 * Trampoline routine. Calls continuations in a loop and only exits when
 * either the returned object or the next callback is NULL.
 */
static void
xbdback_trampoline(struct xbdback_instance *xbdi, void *obj)
{
	xbdback_cont_t cont;

	while(obj != NULL && xbdi->xbdi_cont != NULL) {
		cont = xbdi->xbdi_cont;
#ifdef DIAGNOSTIC
		xbdi->xbdi_cont = (xbdback_cont_t)0xDEADBEEF;
#endif
		obj = (*cont)(xbdi, obj);
#ifdef DIAGNOSTIC
		if (xbdi->xbdi_cont == (xbdback_cont_t)0xDEADBEEF) {
			printf("xbdback_trampoline: 0x%lx didn't set "
			    "xbdi->xbdi_cont!\n", (long)cont);
			panic("xbdback_trampoline: bad continuation");
		}
#endif
	}
}