blkback.c revision 301268
1/*- 2 * Copyright (c) 2009-2012 Spectra Logic Corporation 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions, and the following disclaimer, 10 * without modification. 11 * 2. Redistributions in binary form must reproduce at minimum a disclaimer 12 * substantially similar to the "NO WARRANTY" disclaimer below 13 * ("Disclaimer") and any redistribution must be conditioned upon 14 * including a substantially similar Disclaimer requirement for further 15 * binary redistribution. 16 * 17 * NO WARRANTY 18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR 21 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 22 * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 24 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 26 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING 27 * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 28 * POSSIBILITY OF SUCH DAMAGES. 29 * 30 * Authors: Justin T. Gibbs (Spectra Logic Corporation) 31 * Ken Merry (Spectra Logic Corporation) 32 */ 33#include <sys/cdefs.h> 34__FBSDID("$FreeBSD: head/sys/dev/xen/blkback/blkback.c 301268 2016-06-03 11:38:52Z royger $"); 35 36/** 37 * \file blkback.c 38 * 39 * \brief Device driver supporting the vending of block storage from 40 * a FreeBSD domain to other domains. 41 */ 42 43#include <sys/param.h> 44#include <sys/systm.h> 45#include <sys/kernel.h> 46#include <sys/malloc.h> 47 48#include <sys/bio.h> 49#include <sys/bus.h> 50#include <sys/conf.h> 51#include <sys/devicestat.h> 52#include <sys/disk.h> 53#include <sys/fcntl.h> 54#include <sys/filedesc.h> 55#include <sys/kdb.h> 56#include <sys/module.h> 57#include <sys/namei.h> 58#include <sys/proc.h> 59#include <sys/rman.h> 60#include <sys/taskqueue.h> 61#include <sys/types.h> 62#include <sys/vnode.h> 63#include <sys/mount.h> 64#include <sys/sysctl.h> 65#include <sys/bitstring.h> 66#include <sys/sdt.h> 67 68#include <geom/geom.h> 69 70#include <machine/_inttypes.h> 71 72#include <vm/vm.h> 73#include <vm/vm_extern.h> 74#include <vm/vm_kern.h> 75 76#include <xen/xen-os.h> 77#include <xen/blkif.h> 78#include <xen/gnttab.h> 79#include <xen/xen_intr.h> 80 81#include <xen/interface/event_channel.h> 82#include <xen/interface/grant_table.h> 83 84#include <xen/xenbus/xenbusvar.h> 85 86/*--------------------------- Compile-time Tunables --------------------------*/ 87/** 88 * The maximum number of shared memory ring pages we will allow in a 89 * negotiated block-front/back communication channel. Allow enough 90 * ring space for all requests to be XBB_MAX_REQUEST_SIZE'd. 91 */ 92#define XBB_MAX_RING_PAGES 32 93 94/** 95 * The maximum number of outstanding request blocks (request headers plus 96 * additional segment blocks) we will allow in a negotiated block-front/back 97 * communication channel. 
 */
#define	XBB_MAX_REQUESTS					\
	__CONST_RING_SIZE(blkif, PAGE_SIZE * XBB_MAX_RING_PAGES)

/**
 * \brief Define to force all I/O to be performed on memory owned by the
 *        backend device, with a copy-in/out to the remote domain's memory.
 *
 * \note  This option is currently required when this driver's domain is
 *        operating in HVM mode on a system using an IOMMU.
 *
 * This driver uses Xen's grant table API to gain access to the memory of
 * the remote domains it serves.  When our domain is operating in PV mode,
 * the grant table mechanism directly updates our domain's page table entries
 * to point to the physical pages of the remote domain.  This scheme guarantees
 * that blkback and the backing devices it uses can safely perform DMA
 * operations to satisfy requests.  In HVM mode, Xen may use a HW IOMMU to
 * ensure that our domain cannot DMA to pages owned by another domain.  As
 * of Xen 4.0, IOMMU mappings for HVM guests are not updated via the grant
 * table API.  For this reason, in HVM mode, we must bounce all requests into
 * memory that is mapped into our domain at domain startup and thus has
 * valid IOMMU mappings.
 */
#define XBB_USE_BOUNCE_BUFFERS

/**
 * \brief Define to enable rudimentary request logging to the console.
 */
#undef XBB_DEBUG

/*---------------------------------- Macros ----------------------------------*/
/**
 * Custom malloc type for all driver allocations.
 */
static MALLOC_DEFINE(M_XENBLOCKBACK, "xbbd", "Xen Block Back Driver Data");

#ifdef XBB_DEBUG
#define DPRINTF(fmt, args...)					\
	printf("xbb(%s:%d): " fmt, __FUNCTION__, __LINE__, ##args)
#else
#define DPRINTF(fmt, args...) do {} while(0)
#endif

/**
 * The maximum mapped region size per request we will allow in a negotiated
 * block-front/back communication channel.
 */
#define	XBB_MAX_REQUEST_SIZE					\
	MIN(MAXPHYS, BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE)

/**
 * The maximum number of segments (within a request header and accompanying
 * segment blocks) per request we will allow in a negotiated block-front/back
 * communication channel.
 */
#define	XBB_MAX_SEGMENTS_PER_REQUEST				\
	(MIN(UIO_MAXIOV,					\
	     MIN(BLKIF_MAX_SEGMENTS_PER_REQUEST,		\
		 (XBB_MAX_REQUEST_SIZE / PAGE_SIZE) + 1)))

/**
 * The maximum number of ring pages that we can allow per request list.
 * We limit this to the maximum number of segments per request, because
 * that is already a reasonable number of segments to aggregate.  This
 * number should never be smaller than XBB_MAX_SEGMENTS_PER_REQUEST,
 * because that would leave situations where we can't dispatch even one
 * large request.
 */
#define	XBB_MAX_SEGMENTS_PER_REQLIST XBB_MAX_SEGMENTS_PER_REQUEST

/*--------------------------- Forward Declarations ---------------------------*/
struct xbb_softc;
struct xbb_xen_req;

static void xbb_attach_failed(struct xbb_softc *xbb, int err, const char *fmt,
			      ...)
__attribute__((format(printf, 3, 4))); 174static int xbb_shutdown(struct xbb_softc *xbb); 175static int xbb_detach(device_t dev); 176 177/*------------------------------ Data Structures -----------------------------*/ 178 179STAILQ_HEAD(xbb_xen_req_list, xbb_xen_req); 180 181typedef enum { 182 XBB_REQLIST_NONE = 0x00, 183 XBB_REQLIST_MAPPED = 0x01 184} xbb_reqlist_flags; 185 186struct xbb_xen_reqlist { 187 /** 188 * Back reference to the parent block back instance for this 189 * request. Used during bio_done handling. 190 */ 191 struct xbb_softc *xbb; 192 193 /** 194 * BLKIF_OP code for this request. 195 */ 196 int operation; 197 198 /** 199 * Set to BLKIF_RSP_* to indicate request status. 200 * 201 * This field allows an error status to be recorded even if the 202 * delivery of this status must be deferred. Deferred reporting 203 * is necessary, for example, when an error is detected during 204 * completion processing of one bio when other bios for this 205 * request are still outstanding. 206 */ 207 int status; 208 209 /** 210 * Number of 512 byte sectors not transferred. 211 */ 212 int residual_512b_sectors; 213 214 /** 215 * Starting sector number of the first request in the list. 216 */ 217 off_t starting_sector_number; 218 219 /** 220 * If we're going to coalesce, the next contiguous sector would be 221 * this one. 222 */ 223 off_t next_contig_sector; 224 225 /** 226 * Number of child requests in the list. 227 */ 228 int num_children; 229 230 /** 231 * Number of I/O requests still pending on the backend. 232 */ 233 int pendcnt; 234 235 /** 236 * Total number of segments for requests in the list. 237 */ 238 int nr_segments; 239 240 /** 241 * Flags for this particular request list. 242 */ 243 xbb_reqlist_flags flags; 244 245 /** 246 * Kernel virtual address space reserved for this request 247 * list structure and used to map the remote domain's pages for 248 * this I/O, into our domain's address space. 249 */ 250 uint8_t *kva; 251 252 /** 253 * Base, psuedo-physical address, corresponding to the start 254 * of this request's kva region. 255 */ 256 uint64_t gnt_base; 257 258 259#ifdef XBB_USE_BOUNCE_BUFFERS 260 /** 261 * Pre-allocated domain local memory used to proxy remote 262 * domain memory during I/O operations. 263 */ 264 uint8_t *bounce; 265#endif 266 267 /** 268 * Array of grant handles (one per page) used to map this request. 269 */ 270 grant_handle_t *gnt_handles; 271 272 /** 273 * Device statistics request ordering type (ordered or simple). 274 */ 275 devstat_tag_type ds_tag_type; 276 277 /** 278 * Device statistics request type (read, write, no_data). 279 */ 280 devstat_trans_flags ds_trans_type; 281 282 /** 283 * The start time for this request. 284 */ 285 struct bintime ds_t0; 286 287 /** 288 * Linked list of contiguous requests with the same operation type. 289 */ 290 struct xbb_xen_req_list contig_req_list; 291 292 /** 293 * Linked list links used to aggregate idle requests in the 294 * request list free pool (xbb->reqlist_free_stailq) and pending 295 * requests waiting for execution (xbb->reqlist_pending_stailq). 296 */ 297 STAILQ_ENTRY(xbb_xen_reqlist) links; 298}; 299 300STAILQ_HEAD(xbb_xen_reqlist_list, xbb_xen_reqlist); 301 302/** 303 * \brief Object tracking an in-flight I/O from a Xen VBD consumer. 304 */ 305struct xbb_xen_req { 306 /** 307 * Linked list links used to aggregate requests into a reqlist 308 * and to store them in the request free pool. 309 */ 310 STAILQ_ENTRY(xbb_xen_req) links; 311 312 /** 313 * The remote domain's identifier for this I/O request. 
	 */
	uint64_t		  id;

	/**
	 * The number of pages currently mapped for this request.
	 */
	int			  nr_pages;

	/**
	 * The number of 512 byte sectors comprising this request.
	 */
	int			  nr_512b_sectors;

	/**
	 * BLKIF_OP code for this request.
	 */
	int			  operation;

	/**
	 * Storage used for non-native ring requests.
	 */
	blkif_request_t		  ring_req_storage;

	/**
	 * Pointer to the Xen request in the ring.
	 */
	blkif_request_t		 *ring_req;

	/**
	 * Consumer index for this request.
	 */
	RING_IDX		  req_ring_idx;

	/**
	 * The start time for this request.
	 */
	struct bintime		  ds_t0;

	/**
	 * Pointer back to our parent request list.
	 */
	struct xbb_xen_reqlist	 *reqlist;
};
SLIST_HEAD(xbb_xen_req_slist, xbb_xen_req);

/**
 * \brief Configuration data for the shared memory request ring
 *        used to communicate with the front-end client of this
 *        driver.
 */
struct xbb_ring_config {
	/** KVA address where ring memory is mapped. */
	vm_offset_t	va;

	/** The pseudo-physical address where ring memory is mapped.*/
	uint64_t	gnt_addr;

	/**
	 * Grant table handles, one per-ring page, returned by the
	 * hypervisor upon mapping of the ring and required to
	 * unmap it when a connection is torn down.
	 */
	grant_handle_t	handle[XBB_MAX_RING_PAGES];

	/**
	 * The device bus address returned by the hypervisor when
	 * mapping the ring and required to unmap it when a connection
	 * is torn down.
	 */
	uint64_t	bus_addr[XBB_MAX_RING_PAGES];

	/** The number of ring pages mapped for the current connection. */
	u_int		ring_pages;

	/**
	 * The grant references, one per-ring page, supplied by the
	 * front-end, allowing us to reference the ring pages in the
	 * front-end's domain and to map these pages into our own domain.
	 */
	grant_ref_t	ring_ref[XBB_MAX_RING_PAGES];

	/** The interrupt driven event channel used to signal ring events. */
	evtchn_port_t	evtchn;
};

/**
 * Per-instance connection state flags.
 */
typedef enum
{
	/**
	 * The front-end requested a read-only mount of the
	 * back-end device/file.
	 */
	XBBF_READ_ONLY         = 0x01,

	/** Communication with the front-end has been established. */
	XBBF_RING_CONNECTED    = 0x02,

	/**
	 * Front-end requests exist in the ring and are waiting for
	 * xbb_xen_req objects to free up.
	 */
	XBBF_RESOURCE_SHORTAGE = 0x04,

	/** Connection teardown in progress. */
	XBBF_SHUTDOWN          = 0x08,

	/** A thread is already performing shutdown processing. */
	XBBF_IN_SHUTDOWN       = 0x10
} xbb_flag_t;

/** Backend device type. */
typedef enum {
	/** Backend type unknown. */
	XBB_TYPE_NONE		= 0x00,

	/**
	 * Backend type disk (access via cdev switch
	 * strategy routine).
	 */
	XBB_TYPE_DISK		= 0x01,

	/** Backend type file (access via vnode operations). */
	XBB_TYPE_FILE		= 0x02
} xbb_type;

/**
 * \brief Structure used to memoize information about a per-request
 *        scatter-gather list.
 *
 * The chief benefit of using this data structure is that it avoids having
 * to reparse the possibly discontiguous S/G list in the original
 * request.  Due to the way that the mapping of the memory backing an
 * I/O transaction is handled by Xen, a second pass is unavoidable.
 * At least this way the second walk is a simple array traversal.
450 * 451 * \note A single Scatter/Gather element in the block interface covers 452 * at most 1 machine page. In this context a sector (blkif 453 * nomenclature, not what I'd choose) is a 512b aligned unit 454 * of mapping within the machine page referenced by an S/G 455 * element. 456 */ 457struct xbb_sg { 458 /** The number of 512b data chunks mapped in this S/G element. */ 459 int16_t nsect; 460 461 /** 462 * The index (0 based) of the first 512b data chunk mapped 463 * in this S/G element. 464 */ 465 uint8_t first_sect; 466 467 /** 468 * The index (0 based) of the last 512b data chunk mapped 469 * in this S/G element. 470 */ 471 uint8_t last_sect; 472}; 473 474/** 475 * Character device backend specific configuration data. 476 */ 477struct xbb_dev_data { 478 /** Cdev used for device backend access. */ 479 struct cdev *cdev; 480 481 /** Cdev switch used for device backend access. */ 482 struct cdevsw *csw; 483 484 /** Used to hold a reference on opened cdev backend devices. */ 485 int dev_ref; 486}; 487 488/** 489 * File backend specific configuration data. 490 */ 491struct xbb_file_data { 492 /** Credentials to use for vnode backed (file based) I/O. */ 493 struct ucred *cred; 494 495 /** 496 * \brief Array of io vectors used to process file based I/O. 497 * 498 * Only a single file based request is outstanding per-xbb instance, 499 * so we only need one of these. 500 */ 501 struct iovec xiovecs[XBB_MAX_SEGMENTS_PER_REQLIST]; 502#ifdef XBB_USE_BOUNCE_BUFFERS 503 504 /** 505 * \brief Array of io vectors used to handle bouncing of file reads. 506 * 507 * Vnode operations are free to modify uio data during their 508 * exectuion. In the case of a read with bounce buffering active, 509 * we need some of the data from the original uio in order to 510 * bounce-out the read data. This array serves as the temporary 511 * storage for this saved data. 512 */ 513 struct iovec saved_xiovecs[XBB_MAX_SEGMENTS_PER_REQLIST]; 514 515 /** 516 * \brief Array of memoized bounce buffer kva offsets used 517 * in the file based backend. 518 * 519 * Due to the way that the mapping of the memory backing an 520 * I/O transaction is handled by Xen, a second pass through 521 * the request sg elements is unavoidable. We memoize the computed 522 * bounce address here to reduce the cost of the second walk. 523 */ 524 void *xiovecs_vaddr[XBB_MAX_SEGMENTS_PER_REQLIST]; 525#endif /* XBB_USE_BOUNCE_BUFFERS */ 526}; 527 528/** 529 * Collection of backend type specific data. 530 */ 531union xbb_backend_data { 532 struct xbb_dev_data dev; 533 struct xbb_file_data file; 534}; 535 536/** 537 * Function signature of backend specific I/O handlers. 538 */ 539typedef int (*xbb_dispatch_t)(struct xbb_softc *xbb, 540 struct xbb_xen_reqlist *reqlist, int operation, 541 int flags); 542 543/** 544 * Per-instance configuration data. 545 */ 546struct xbb_softc { 547 548 /** 549 * Task-queue used to process I/O requests. 550 */ 551 struct taskqueue *io_taskqueue; 552 553 /** 554 * Single "run the request queue" task enqueued 555 * on io_taskqueue. 556 */ 557 struct task io_task; 558 559 /** Device type for this instance. */ 560 xbb_type device_type; 561 562 /** NewBus device corresponding to this instance. */ 563 device_t dev; 564 565 /** Backend specific dispatch routine for this instance. */ 566 xbb_dispatch_t dispatch_io; 567 568 /** The number of requests outstanding on the backend device/file. */ 569 int active_request_count; 570 571 /** Free pool of request tracking structures. 
	 */
	struct xbb_xen_req_list	  request_free_stailq;

	/** Array, sized at connection time, of request tracking structures. */
	struct xbb_xen_req	 *requests;

	/** Free pool of request list structures. */
	struct xbb_xen_reqlist_list reqlist_free_stailq;

	/** List of pending request lists awaiting execution. */
	struct xbb_xen_reqlist_list reqlist_pending_stailq;

	/** Array, sized at connection time, of request list structures. */
	struct xbb_xen_reqlist	 *request_lists;

	/**
	 * Global pool of kva used for mapping remote domain ring
	 * and I/O transaction data.
	 */
	vm_offset_t		  kva;

	/** Pseudo-physical address corresponding to kva. */
	uint64_t		  gnt_base_addr;

	/** The size of the global kva pool. */
	int			  kva_size;

	/** The size of the KVA area used for request lists. */
	int			  reqlist_kva_size;

	/** The number of pages of KVA used for request lists. */
	int			  reqlist_kva_pages;

	/** Bitmap of free KVA pages. */
	bitstr_t		 *kva_free;

	/**
	 * \brief Cached value of the front-end's domain id.
	 *
	 * This value is used once for each mapped page in
	 * a transaction.  We cache it to avoid incurring the
	 * cost of an ivar access every time this is needed.
	 */
	domid_t			  otherend_id;

	/**
	 * \brief The blkif protocol abi in effect.
	 *
	 * There are situations where the back and front ends can
	 * have a different, native abi (e.g. Intel x86_64 and
	 * 32bit x86 domains on the same machine).  The back-end
	 * always accommodates the front-end's native abi.  That
	 * value is pulled from the XenStore and recorded here.
	 */
	int			  abi;

	/**
	 * \brief The maximum number of requests and request lists allowed
	 *	  to be in flight at a time.
	 *
	 * This value is negotiated via the XenStore.
	 */
	u_int			  max_requests;

	/**
	 * \brief The maximum number of segments (1 page per segment)
	 *	  that can be mapped by a request.
	 *
	 * This value is negotiated via the XenStore.
	 */
	u_int			  max_request_segments;

	/**
	 * \brief Maximum number of segments per request list.
	 *
	 * This value is derived from and will generally be larger than
	 * max_request_segments.
	 */
	u_int			  max_reqlist_segments;

	/**
	 * The maximum size of any request to this back-end
	 * device.
	 *
	 * This value is negotiated via the XenStore.
	 */
	u_int			  max_request_size;

	/**
	 * The maximum size of any request list.  This is derived directly
	 * from max_reqlist_segments.
	 */
	u_int			  max_reqlist_size;

	/** Various configuration and state bit flags. */
	xbb_flag_t		  flags;

	/** Ring mapping and interrupt configuration data. */
	struct xbb_ring_config	  ring_config;

	/** Runtime, cross-abi safe, structures for ring access. */
	blkif_back_rings_t	  rings;

	/** IRQ mapping for the communication ring event channel. */
	xen_intr_handle_t	  xen_intr_handle;

	/**
	 * \brief Backend access mode flags (e.g. write, or read-only).
	 *
	 * This value is passed to us by the front-end via the XenStore.
	 */
	char			 *dev_mode;

	/**
	 * \brief Backend device type (e.g. "disk", "cdrom", "floppy").
	 *
	 * This value is passed to us by the front-end via the XenStore.
	 * Currently unused.
	 */
	char			 *dev_type;

	/**
	 * \brief Backend device/file identifier.
	 *
	 * This value is passed to us by the front-end via the XenStore.
696 * We expect this to be a POSIX path indicating the file or 697 * device to open. 698 */ 699 char *dev_name; 700 701 /** 702 * Vnode corresponding to the backend device node or file 703 * we are acessing. 704 */ 705 struct vnode *vn; 706 707 union xbb_backend_data backend; 708 709 /** The native sector size of the backend. */ 710 u_int sector_size; 711 712 /** log2 of sector_size. */ 713 u_int sector_size_shift; 714 715 /** Size in bytes of the backend device or file. */ 716 off_t media_size; 717 718 /** 719 * \brief media_size expressed in terms of the backend native 720 * sector size. 721 * 722 * (e.g. xbb->media_size >> xbb->sector_size_shift). 723 */ 724 uint64_t media_num_sectors; 725 726 /** 727 * \brief Array of memoized scatter gather data computed during the 728 * conversion of blkif ring requests to internal xbb_xen_req 729 * structures. 730 * 731 * Ring processing is serialized so we only need one of these. 732 */ 733 struct xbb_sg xbb_sgs[XBB_MAX_SEGMENTS_PER_REQLIST]; 734 735 /** 736 * Temporary grant table map used in xbb_dispatch_io(). When 737 * XBB_MAX_SEGMENTS_PER_REQLIST gets large, keeping this on the 738 * stack could cause a stack overflow. 739 */ 740 struct gnttab_map_grant_ref maps[XBB_MAX_SEGMENTS_PER_REQLIST]; 741 742 /** Mutex protecting per-instance data. */ 743 struct mtx lock; 744 745 /** 746 * Resource representing allocated physical address space 747 * associated with our per-instance kva region. 748 */ 749 struct resource *pseudo_phys_res; 750 751 /** Resource id for allocated physical address space. */ 752 int pseudo_phys_res_id; 753 754 /** 755 * I/O statistics from BlockBack dispatch down. These are 756 * coalesced requests, and we start them right before execution. 757 */ 758 struct devstat *xbb_stats; 759 760 /** 761 * I/O statistics coming into BlockBack. These are the requests as 762 * we get them from BlockFront. They are started as soon as we 763 * receive a request, and completed when the I/O is complete. 764 */ 765 struct devstat *xbb_stats_in; 766 767 /** Disable sending flush to the backend */ 768 int disable_flush; 769 770 /** Send a real flush for every N flush requests */ 771 int flush_interval; 772 773 /** Count of flush requests in the interval */ 774 int flush_count; 775 776 /** Don't coalesce requests if this is set */ 777 int no_coalesce_reqs; 778 779 /** Number of requests we have received */ 780 uint64_t reqs_received; 781 782 /** Number of requests we have completed*/ 783 uint64_t reqs_completed; 784 785 /** Number of requests we queued but not pushed*/ 786 uint64_t reqs_queued_for_completion; 787 788 /** Number of requests we completed with an error status*/ 789 uint64_t reqs_completed_with_error; 790 791 /** How many forced dispatches (i.e. without coalescing) have happened */ 792 uint64_t forced_dispatch; 793 794 /** How many normal dispatches have happened */ 795 uint64_t normal_dispatch; 796 797 /** How many total dispatches have happened */ 798 uint64_t total_dispatch; 799 800 /** How many times we have run out of KVA */ 801 uint64_t kva_shortages; 802 803 /** How many times we have run out of request structures */ 804 uint64_t request_shortages; 805 806 /** Watch to wait for hotplug script execution */ 807 struct xs_watch hotplug_watch; 808}; 809 810/*---------------------------- Request Processing ----------------------------*/ 811/** 812 * Allocate an internal transaction tracking structure from the free pool. 813 * 814 * \param xbb Per-instance xbb configuration structure. 
815 * 816 * \return On success, a pointer to the allocated xbb_xen_req structure. 817 * Otherwise NULL. 818 */ 819static inline struct xbb_xen_req * 820xbb_get_req(struct xbb_softc *xbb) 821{ 822 struct xbb_xen_req *req; 823 824 req = NULL; 825 826 mtx_assert(&xbb->lock, MA_OWNED); 827 828 if ((req = STAILQ_FIRST(&xbb->request_free_stailq)) != NULL) { 829 STAILQ_REMOVE_HEAD(&xbb->request_free_stailq, links); 830 xbb->active_request_count++; 831 } 832 833 return (req); 834} 835 836/** 837 * Return an allocated transaction tracking structure to the free pool. 838 * 839 * \param xbb Per-instance xbb configuration structure. 840 * \param req The request structure to free. 841 */ 842static inline void 843xbb_release_req(struct xbb_softc *xbb, struct xbb_xen_req *req) 844{ 845 mtx_assert(&xbb->lock, MA_OWNED); 846 847 STAILQ_INSERT_HEAD(&xbb->request_free_stailq, req, links); 848 xbb->active_request_count--; 849 850 KASSERT(xbb->active_request_count >= 0, 851 ("xbb_release_req: negative active count")); 852} 853 854/** 855 * Return an xbb_xen_req_list of allocated xbb_xen_reqs to the free pool. 856 * 857 * \param xbb Per-instance xbb configuration structure. 858 * \param req_list The list of requests to free. 859 * \param nreqs The number of items in the list. 860 */ 861static inline void 862xbb_release_reqs(struct xbb_softc *xbb, struct xbb_xen_req_list *req_list, 863 int nreqs) 864{ 865 mtx_assert(&xbb->lock, MA_OWNED); 866 867 STAILQ_CONCAT(&xbb->request_free_stailq, req_list); 868 xbb->active_request_count -= nreqs; 869 870 KASSERT(xbb->active_request_count >= 0, 871 ("xbb_release_reqs: negative active count")); 872} 873 874/** 875 * Given a page index and 512b sector offset within that page, 876 * calculate an offset into a request's kva region. 877 * 878 * \param reqlist The request structure whose kva region will be accessed. 879 * \param pagenr The page index used to compute the kva offset. 880 * \param sector The 512b sector index used to compute the page relative 881 * kva offset. 882 * 883 * \return The computed global KVA offset. 884 */ 885static inline uint8_t * 886xbb_reqlist_vaddr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector) 887{ 888 return (reqlist->kva + (PAGE_SIZE * pagenr) + (sector << 9)); 889} 890 891#ifdef XBB_USE_BOUNCE_BUFFERS 892/** 893 * Given a page index and 512b sector offset within that page, 894 * calculate an offset into a request's local bounce memory region. 895 * 896 * \param reqlist The request structure whose bounce region will be accessed. 897 * \param pagenr The page index used to compute the bounce offset. 898 * \param sector The 512b sector index used to compute the page relative 899 * bounce offset. 900 * 901 * \return The computed global bounce buffer address. 902 */ 903static inline uint8_t * 904xbb_reqlist_bounce_addr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector) 905{ 906 return (reqlist->bounce + (PAGE_SIZE * pagenr) + (sector << 9)); 907} 908#endif 909 910/** 911 * Given a page number and 512b sector offset within that page, 912 * calculate an offset into the request's memory region that the 913 * underlying backend device/file should use for I/O. 914 * 915 * \param reqlist The request structure whose I/O region will be accessed. 916 * \param pagenr The page index used to compute the I/O offset. 917 * \param sector The 512b sector index used to compute the page relative 918 * I/O offset. 919 * 920 * \return The computed global I/O address. 
 *
 * Depending on configuration, this will either be a local bounce buffer
 * or a pointer to the memory mapped in from the front-end domain for
 * this request.
 */
static inline uint8_t *
xbb_reqlist_ioaddr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector)
{
#ifdef XBB_USE_BOUNCE_BUFFERS
	return (xbb_reqlist_bounce_addr(reqlist, pagenr, sector));
#else
	return (xbb_reqlist_vaddr(reqlist, pagenr, sector));
#endif
}

/**
 * Given a page index and 512b sector offset within that page, calculate
 * an offset into the local pseudo-physical address space used to map a
 * front-end's request data into a request.
 *
 * \param reqlist The request list structure whose pseudo-physical region
 *                will be accessed.
 * \param pagenr  The page index used to compute the pseudo-physical offset.
 * \param sector  The 512b sector index used to compute the page relative
 *                pseudo-physical offset.
 *
 * \return  The computed global pseudo-physical address.
 *
 * Depending on configuration, this will either be a local bounce buffer
 * or a pointer to the memory mapped in from the front-end domain for
 * this request.
 */
static inline uintptr_t
xbb_get_gntaddr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector)
{
	struct xbb_softc *xbb;

	xbb = reqlist->xbb;

	return ((uintptr_t)(xbb->gnt_base_addr +
	    (uintptr_t)(reqlist->kva - xbb->kva) +
	    (PAGE_SIZE * pagenr) + (sector << 9)));
}

/**
 * Get Kernel Virtual Address space for mapping requests.
 *
 * \param xbb       Per-instance xbb configuration structure.
 * \param nr_pages  Number of pages needed.
 *
 * \return  On success, a pointer to the allocated KVA region.  Otherwise NULL.
 *
 * Note:  This should be unnecessary once we have either chaining or
 * scatter/gather support for struct bio.  At that point we'll be able to
 * put multiple addresses and lengths in one bio/bio chain and won't need
 * to map everything into one virtual segment.
 */
static uint8_t *
xbb_get_kva(struct xbb_softc *xbb, int nr_pages)
{
	int	  first_clear;
	int	  num_clear;
	uint8_t	 *free_kva;
	int	  i;

	KASSERT(nr_pages != 0, ("xbb_get_kva of zero length"));

	first_clear = 0;
	free_kva = NULL;

	mtx_lock(&xbb->lock);

	/*
	 * Look for the first available page.  If there are none, we're done.
	 */
	bit_ffc(xbb->kva_free, xbb->reqlist_kva_pages, &first_clear);

	if (first_clear == -1)
		goto bailout;

	/*
	 * Starting at the first available page, look for consecutive free
	 * pages that will satisfy the user's request.
	 */
	for (i = first_clear, num_clear = 0; i < xbb->reqlist_kva_pages; i++) {
		/*
		 * If this is true, the page is used, so we have to reset
		 * the number of clear pages and the first clear page
		 * (since it pointed to a region with an insufficient number
		 * of clear pages).
		 */
		if (bit_test(xbb->kva_free, i)) {
			num_clear = 0;
			first_clear = -1;
			continue;
		}

		if (first_clear == -1)
			first_clear = i;

		/*
		 * If this is true, we've found a large enough free region
		 * to satisfy the request.
1026 */ 1027 if (++num_clear == nr_pages) { 1028 1029 bit_nset(xbb->kva_free, first_clear, 1030 first_clear + nr_pages - 1); 1031 1032 free_kva = xbb->kva + 1033 (uint8_t *)((intptr_t)first_clear * PAGE_SIZE); 1034 1035 KASSERT(free_kva >= (uint8_t *)xbb->kva && 1036 free_kva + (nr_pages * PAGE_SIZE) <= 1037 (uint8_t *)xbb->ring_config.va, 1038 ("Free KVA %p len %d out of range, " 1039 "kva = %#jx, ring VA = %#jx\n", free_kva, 1040 nr_pages * PAGE_SIZE, (uintmax_t)xbb->kva, 1041 (uintmax_t)xbb->ring_config.va)); 1042 break; 1043 } 1044 } 1045 1046bailout: 1047 1048 if (free_kva == NULL) { 1049 xbb->flags |= XBBF_RESOURCE_SHORTAGE; 1050 xbb->kva_shortages++; 1051 } 1052 1053 mtx_unlock(&xbb->lock); 1054 1055 return (free_kva); 1056} 1057 1058/** 1059 * Free allocated KVA. 1060 * 1061 * \param xbb Per-instance xbb configuration structure. 1062 * \param kva_ptr Pointer to allocated KVA region. 1063 * \param nr_pages Number of pages in the KVA region. 1064 */ 1065static void 1066xbb_free_kva(struct xbb_softc *xbb, uint8_t *kva_ptr, int nr_pages) 1067{ 1068 intptr_t start_page; 1069 1070 mtx_assert(&xbb->lock, MA_OWNED); 1071 1072 start_page = (intptr_t)(kva_ptr - xbb->kva) >> PAGE_SHIFT; 1073 bit_nclear(xbb->kva_free, start_page, start_page + nr_pages - 1); 1074 1075} 1076 1077/** 1078 * Unmap the front-end pages associated with this I/O request. 1079 * 1080 * \param req The request structure to unmap. 1081 */ 1082static void 1083xbb_unmap_reqlist(struct xbb_xen_reqlist *reqlist) 1084{ 1085 struct gnttab_unmap_grant_ref unmap[XBB_MAX_SEGMENTS_PER_REQLIST]; 1086 u_int i; 1087 u_int invcount; 1088 int error; 1089 1090 invcount = 0; 1091 for (i = 0; i < reqlist->nr_segments; i++) { 1092 1093 if (reqlist->gnt_handles[i] == GRANT_REF_INVALID) 1094 continue; 1095 1096 unmap[invcount].host_addr = xbb_get_gntaddr(reqlist, i, 0); 1097 unmap[invcount].dev_bus_addr = 0; 1098 unmap[invcount].handle = reqlist->gnt_handles[i]; 1099 reqlist->gnt_handles[i] = GRANT_REF_INVALID; 1100 invcount++; 1101 } 1102 1103 error = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, 1104 unmap, invcount); 1105 KASSERT(error == 0, ("Grant table operation failed")); 1106} 1107 1108/** 1109 * Allocate an internal transaction tracking structure from the free pool. 1110 * 1111 * \param xbb Per-instance xbb configuration structure. 1112 * 1113 * \return On success, a pointer to the allocated xbb_xen_reqlist structure. 1114 * Otherwise NULL. 1115 */ 1116static inline struct xbb_xen_reqlist * 1117xbb_get_reqlist(struct xbb_softc *xbb) 1118{ 1119 struct xbb_xen_reqlist *reqlist; 1120 1121 reqlist = NULL; 1122 1123 mtx_assert(&xbb->lock, MA_OWNED); 1124 1125 if ((reqlist = STAILQ_FIRST(&xbb->reqlist_free_stailq)) != NULL) { 1126 1127 STAILQ_REMOVE_HEAD(&xbb->reqlist_free_stailq, links); 1128 reqlist->flags = XBB_REQLIST_NONE; 1129 reqlist->kva = NULL; 1130 reqlist->status = BLKIF_RSP_OKAY; 1131 reqlist->residual_512b_sectors = 0; 1132 reqlist->num_children = 0; 1133 reqlist->nr_segments = 0; 1134 STAILQ_INIT(&reqlist->contig_req_list); 1135 } 1136 1137 return (reqlist); 1138} 1139 1140/** 1141 * Return an allocated transaction tracking structure to the free pool. 1142 * 1143 * \param xbb Per-instance xbb configuration structure. 1144 * \param req The request list structure to free. 1145 * \param wakeup If set, wakeup the work thread if freeing this reqlist 1146 * during a resource shortage condition. 
1147 */ 1148static inline void 1149xbb_release_reqlist(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist, 1150 int wakeup) 1151{ 1152 1153 mtx_assert(&xbb->lock, MA_OWNED); 1154 1155 if (wakeup) { 1156 wakeup = xbb->flags & XBBF_RESOURCE_SHORTAGE; 1157 xbb->flags &= ~XBBF_RESOURCE_SHORTAGE; 1158 } 1159 1160 if (reqlist->kva != NULL) 1161 xbb_free_kva(xbb, reqlist->kva, reqlist->nr_segments); 1162 1163 xbb_release_reqs(xbb, &reqlist->contig_req_list, reqlist->num_children); 1164 1165 STAILQ_INSERT_TAIL(&xbb->reqlist_free_stailq, reqlist, links); 1166 1167 if ((xbb->flags & XBBF_SHUTDOWN) != 0) { 1168 /* 1169 * Shutdown is in progress. See if we can 1170 * progress further now that one more request 1171 * has completed and been returned to the 1172 * free pool. 1173 */ 1174 xbb_shutdown(xbb); 1175 } 1176 1177 if (wakeup != 0) 1178 taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task); 1179} 1180 1181/** 1182 * Request resources and do basic request setup. 1183 * 1184 * \param xbb Per-instance xbb configuration structure. 1185 * \param reqlist Pointer to reqlist pointer. 1186 * \param ring_req Pointer to a block ring request. 1187 * \param ring_index The ring index of this request. 1188 * 1189 * \return 0 for success, non-zero for failure. 1190 */ 1191static int 1192xbb_get_resources(struct xbb_softc *xbb, struct xbb_xen_reqlist **reqlist, 1193 blkif_request_t *ring_req, RING_IDX ring_idx) 1194{ 1195 struct xbb_xen_reqlist *nreqlist; 1196 struct xbb_xen_req *nreq; 1197 1198 nreqlist = NULL; 1199 nreq = NULL; 1200 1201 mtx_lock(&xbb->lock); 1202 1203 /* 1204 * We don't allow new resources to be allocated if we're in the 1205 * process of shutting down. 1206 */ 1207 if ((xbb->flags & XBBF_SHUTDOWN) != 0) { 1208 mtx_unlock(&xbb->lock); 1209 return (1); 1210 } 1211 1212 /* 1213 * Allocate a reqlist if the caller doesn't have one already. 1214 */ 1215 if (*reqlist == NULL) { 1216 nreqlist = xbb_get_reqlist(xbb); 1217 if (nreqlist == NULL) 1218 goto bailout_error; 1219 } 1220 1221 /* We always allocate a request. */ 1222 nreq = xbb_get_req(xbb); 1223 if (nreq == NULL) 1224 goto bailout_error; 1225 1226 mtx_unlock(&xbb->lock); 1227 1228 if (*reqlist == NULL) { 1229 *reqlist = nreqlist; 1230 nreqlist->operation = ring_req->operation; 1231 nreqlist->starting_sector_number = ring_req->sector_number; 1232 STAILQ_INSERT_TAIL(&xbb->reqlist_pending_stailq, nreqlist, 1233 links); 1234 } 1235 1236 nreq->reqlist = *reqlist; 1237 nreq->req_ring_idx = ring_idx; 1238 nreq->id = ring_req->id; 1239 nreq->operation = ring_req->operation; 1240 1241 if (xbb->abi != BLKIF_PROTOCOL_NATIVE) { 1242 bcopy(ring_req, &nreq->ring_req_storage, sizeof(*ring_req)); 1243 nreq->ring_req = &nreq->ring_req_storage; 1244 } else { 1245 nreq->ring_req = ring_req; 1246 } 1247 1248 binuptime(&nreq->ds_t0); 1249 devstat_start_transaction(xbb->xbb_stats_in, &nreq->ds_t0); 1250 STAILQ_INSERT_TAIL(&(*reqlist)->contig_req_list, nreq, links); 1251 (*reqlist)->num_children++; 1252 (*reqlist)->nr_segments += ring_req->nr_segments; 1253 1254 return (0); 1255 1256bailout_error: 1257 1258 /* 1259 * We're out of resources, so set the shortage flag. The next time 1260 * a request is released, we'll try waking up the work thread to 1261 * see if we can allocate more resources. 
1262 */ 1263 xbb->flags |= XBBF_RESOURCE_SHORTAGE; 1264 xbb->request_shortages++; 1265 1266 if (nreq != NULL) 1267 xbb_release_req(xbb, nreq); 1268 1269 if (nreqlist != NULL) 1270 xbb_release_reqlist(xbb, nreqlist, /*wakeup*/ 0); 1271 1272 mtx_unlock(&xbb->lock); 1273 1274 return (1); 1275} 1276 1277/** 1278 * Create and queue a response to a blkif request. 1279 * 1280 * \param xbb Per-instance xbb configuration structure. 1281 * \param req The request structure to which to respond. 1282 * \param status The status code to report. See BLKIF_RSP_* 1283 * in sys/xen/interface/io/blkif.h. 1284 */ 1285static void 1286xbb_queue_response(struct xbb_softc *xbb, struct xbb_xen_req *req, int status) 1287{ 1288 blkif_response_t *resp; 1289 1290 /* 1291 * The mutex is required here, and should be held across this call 1292 * until after the subsequent call to xbb_push_responses(). This 1293 * is to guarantee that another context won't queue responses and 1294 * push them while we're active. 1295 * 1296 * That could lead to the other end being notified of responses 1297 * before the resources have been freed on this end. The other end 1298 * would then be able to queue additional I/O, and we may run out 1299 * of resources because we haven't freed them all yet. 1300 */ 1301 mtx_assert(&xbb->lock, MA_OWNED); 1302 1303 /* 1304 * Place on the response ring for the relevant domain. 1305 * For now, only the spacing between entries is different 1306 * in the different ABIs, not the response entry layout. 1307 */ 1308 switch (xbb->abi) { 1309 case BLKIF_PROTOCOL_NATIVE: 1310 resp = RING_GET_RESPONSE(&xbb->rings.native, 1311 xbb->rings.native.rsp_prod_pvt); 1312 break; 1313 case BLKIF_PROTOCOL_X86_32: 1314 resp = (blkif_response_t *) 1315 RING_GET_RESPONSE(&xbb->rings.x86_32, 1316 xbb->rings.x86_32.rsp_prod_pvt); 1317 break; 1318 case BLKIF_PROTOCOL_X86_64: 1319 resp = (blkif_response_t *) 1320 RING_GET_RESPONSE(&xbb->rings.x86_64, 1321 xbb->rings.x86_64.rsp_prod_pvt); 1322 break; 1323 default: 1324 panic("Unexpected blkif protocol ABI."); 1325 } 1326 1327 resp->id = req->id; 1328 resp->operation = req->operation; 1329 resp->status = status; 1330 1331 if (status != BLKIF_RSP_OKAY) 1332 xbb->reqs_completed_with_error++; 1333 1334 xbb->rings.common.rsp_prod_pvt++; 1335 1336 xbb->reqs_queued_for_completion++; 1337 1338} 1339 1340/** 1341 * Send queued responses to blkif requests. 1342 * 1343 * \param xbb Per-instance xbb configuration structure. 1344 * \param run_taskqueue Flag that is set to 1 if the taskqueue 1345 * should be run, 0 if it does not need to be run. 1346 * \param notify Flag that is set to 1 if the other end should be 1347 * notified via irq, 0 if the other end should not be 1348 * notified. 1349 */ 1350static void 1351xbb_push_responses(struct xbb_softc *xbb, int *run_taskqueue, int *notify) 1352{ 1353 int more_to_do; 1354 1355 /* 1356 * The mutex is required here. 1357 */ 1358 mtx_assert(&xbb->lock, MA_OWNED); 1359 1360 more_to_do = 0; 1361 1362 RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xbb->rings.common, *notify); 1363 1364 if (xbb->rings.common.rsp_prod_pvt == xbb->rings.common.req_cons) { 1365 1366 /* 1367 * Tail check for pending requests. Allows frontend to avoid 1368 * notifications if requests are already in flight (lower 1369 * overheads and promotes batching). 
1370 */ 1371 RING_FINAL_CHECK_FOR_REQUESTS(&xbb->rings.common, more_to_do); 1372 } else if (RING_HAS_UNCONSUMED_REQUESTS(&xbb->rings.common)) { 1373 1374 more_to_do = 1; 1375 } 1376 1377 xbb->reqs_completed += xbb->reqs_queued_for_completion; 1378 xbb->reqs_queued_for_completion = 0; 1379 1380 *run_taskqueue = more_to_do; 1381} 1382 1383/** 1384 * Complete a request list. 1385 * 1386 * \param xbb Per-instance xbb configuration structure. 1387 * \param reqlist Allocated internal request list structure. 1388 */ 1389static void 1390xbb_complete_reqlist(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist) 1391{ 1392 struct xbb_xen_req *nreq; 1393 off_t sectors_sent; 1394 int notify, run_taskqueue; 1395 1396 sectors_sent = 0; 1397 1398 if (reqlist->flags & XBB_REQLIST_MAPPED) 1399 xbb_unmap_reqlist(reqlist); 1400 1401 mtx_lock(&xbb->lock); 1402 1403 /* 1404 * All I/O is done, send the response. A lock is not necessary 1405 * to protect the request list, because all requests have 1406 * completed. Therefore this is the only context accessing this 1407 * reqlist right now. However, in order to make sure that no one 1408 * else queues responses onto the queue or pushes them to the other 1409 * side while we're active, we need to hold the lock across the 1410 * calls to xbb_queue_response() and xbb_push_responses(). 1411 */ 1412 STAILQ_FOREACH(nreq, &reqlist->contig_req_list, links) { 1413 off_t cur_sectors_sent; 1414 1415 /* Put this response on the ring, but don't push yet */ 1416 xbb_queue_response(xbb, nreq, reqlist->status); 1417 1418 /* We don't report bytes sent if there is an error. */ 1419 if (reqlist->status == BLKIF_RSP_OKAY) 1420 cur_sectors_sent = nreq->nr_512b_sectors; 1421 else 1422 cur_sectors_sent = 0; 1423 1424 sectors_sent += cur_sectors_sent; 1425 1426 devstat_end_transaction(xbb->xbb_stats_in, 1427 /*bytes*/cur_sectors_sent << 9, 1428 reqlist->ds_tag_type, 1429 reqlist->ds_trans_type, 1430 /*now*/NULL, 1431 /*then*/&nreq->ds_t0); 1432 } 1433 1434 /* 1435 * Take out any sectors not sent. If we wind up negative (which 1436 * might happen if an error is reported as well as a residual), just 1437 * report 0 sectors sent. 1438 */ 1439 sectors_sent -= reqlist->residual_512b_sectors; 1440 if (sectors_sent < 0) 1441 sectors_sent = 0; 1442 1443 devstat_end_transaction(xbb->xbb_stats, 1444 /*bytes*/ sectors_sent << 9, 1445 reqlist->ds_tag_type, 1446 reqlist->ds_trans_type, 1447 /*now*/NULL, 1448 /*then*/&reqlist->ds_t0); 1449 1450 xbb_release_reqlist(xbb, reqlist, /*wakeup*/ 1); 1451 1452 xbb_push_responses(xbb, &run_taskqueue, ¬ify); 1453 1454 mtx_unlock(&xbb->lock); 1455 1456 if (run_taskqueue) 1457 taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task); 1458 1459 if (notify) 1460 xen_intr_signal(xbb->xen_intr_handle); 1461} 1462 1463/** 1464 * Completion handler for buffer I/O requests issued by the device 1465 * backend driver. 1466 * 1467 * \param bio The buffer I/O request on which to perform completion 1468 * processing. 1469 */ 1470static void 1471xbb_bio_done(struct bio *bio) 1472{ 1473 struct xbb_softc *xbb; 1474 struct xbb_xen_reqlist *reqlist; 1475 1476 reqlist = bio->bio_caller1; 1477 xbb = reqlist->xbb; 1478 1479 reqlist->residual_512b_sectors += bio->bio_resid >> 9; 1480 1481 /* 1482 * This is a bit imprecise. With aggregated I/O a single 1483 * request list can contain multiple front-end requests and 1484 * a multiple bios may point to a single request. 
By carefully 1485 * walking the request list, we could map residuals and errors 1486 * back to the original front-end request, but the interface 1487 * isn't sufficiently rich for us to properly report the error. 1488 * So, we just treat the entire request list as having failed if an 1489 * error occurs on any part. And, if an error occurs, we treat 1490 * the amount of data transferred as 0. 1491 * 1492 * For residuals, we report it on the overall aggregated device, 1493 * but not on the individual requests, since we don't currently 1494 * do the work to determine which front-end request to which the 1495 * residual applies. 1496 */ 1497 if (bio->bio_error) { 1498 DPRINTF("BIO returned error %d for operation on device %s\n", 1499 bio->bio_error, xbb->dev_name); 1500 reqlist->status = BLKIF_RSP_ERROR; 1501 1502 if (bio->bio_error == ENXIO 1503 && xenbus_get_state(xbb->dev) == XenbusStateConnected) { 1504 1505 /* 1506 * Backend device has disappeared. Signal the 1507 * front-end that we (the device proxy) want to 1508 * go away. 1509 */ 1510 xenbus_set_state(xbb->dev, XenbusStateClosing); 1511 } 1512 } 1513 1514#ifdef XBB_USE_BOUNCE_BUFFERS 1515 if (bio->bio_cmd == BIO_READ) { 1516 vm_offset_t kva_offset; 1517 1518 kva_offset = (vm_offset_t)bio->bio_data 1519 - (vm_offset_t)reqlist->bounce; 1520 memcpy((uint8_t *)reqlist->kva + kva_offset, 1521 bio->bio_data, bio->bio_bcount); 1522 } 1523#endif /* XBB_USE_BOUNCE_BUFFERS */ 1524 1525 /* 1526 * Decrement the pending count for the request list. When we're 1527 * done with the requests, send status back for all of them. 1528 */ 1529 if (atomic_fetchadd_int(&reqlist->pendcnt, -1) == 1) 1530 xbb_complete_reqlist(xbb, reqlist); 1531 1532 g_destroy_bio(bio); 1533} 1534 1535/** 1536 * Parse a blkif request into an internal request structure and send 1537 * it to the backend for processing. 1538 * 1539 * \param xbb Per-instance xbb configuration structure. 1540 * \param reqlist Allocated internal request list structure. 1541 * 1542 * \return On success, 0. For resource shortages, non-zero. 1543 * 1544 * This routine performs the backend common aspects of request parsing 1545 * including compiling an internal request structure, parsing the S/G 1546 * list and any secondary ring requests in which they may reside, and 1547 * the mapping of front-end I/O pages into our domain. 1548 */ 1549static int 1550xbb_dispatch_io(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist) 1551{ 1552 struct xbb_sg *xbb_sg; 1553 struct gnttab_map_grant_ref *map; 1554 struct blkif_request_segment *sg; 1555 struct blkif_request_segment *last_block_sg; 1556 struct xbb_xen_req *nreq; 1557 u_int nseg; 1558 u_int seg_idx; 1559 u_int block_segs; 1560 int nr_sects; 1561 int total_sects; 1562 int operation; 1563 uint8_t bio_flags; 1564 int error; 1565 1566 reqlist->ds_tag_type = DEVSTAT_TAG_SIMPLE; 1567 bio_flags = 0; 1568 total_sects = 0; 1569 nr_sects = 0; 1570 1571 /* 1572 * First determine whether we have enough free KVA to satisfy this 1573 * request list. If not, tell xbb_run_queue() so it can go to 1574 * sleep until we have more KVA. 1575 */ 1576 reqlist->kva = NULL; 1577 if (reqlist->nr_segments != 0) { 1578 reqlist->kva = xbb_get_kva(xbb, reqlist->nr_segments); 1579 if (reqlist->kva == NULL) { 1580 /* 1581 * If we're out of KVA, return ENOMEM. 
1582 */ 1583 return (ENOMEM); 1584 } 1585 } 1586 1587 binuptime(&reqlist->ds_t0); 1588 devstat_start_transaction(xbb->xbb_stats, &reqlist->ds_t0); 1589 1590 switch (reqlist->operation) { 1591 case BLKIF_OP_WRITE_BARRIER: 1592 bio_flags |= BIO_ORDERED; 1593 reqlist->ds_tag_type = DEVSTAT_TAG_ORDERED; 1594 /* FALLTHROUGH */ 1595 case BLKIF_OP_WRITE: 1596 operation = BIO_WRITE; 1597 reqlist->ds_trans_type = DEVSTAT_WRITE; 1598 if ((xbb->flags & XBBF_READ_ONLY) != 0) { 1599 DPRINTF("Attempt to write to read only device %s\n", 1600 xbb->dev_name); 1601 reqlist->status = BLKIF_RSP_ERROR; 1602 goto send_response; 1603 } 1604 break; 1605 case BLKIF_OP_READ: 1606 operation = BIO_READ; 1607 reqlist->ds_trans_type = DEVSTAT_READ; 1608 break; 1609 case BLKIF_OP_FLUSH_DISKCACHE: 1610 /* 1611 * If this is true, the user has requested that we disable 1612 * flush support. So we just complete the requests 1613 * successfully. 1614 */ 1615 if (xbb->disable_flush != 0) { 1616 goto send_response; 1617 } 1618 1619 /* 1620 * The user has requested that we only send a real flush 1621 * for every N flush requests. So keep count, and either 1622 * complete the request immediately or queue it for the 1623 * backend. 1624 */ 1625 if (xbb->flush_interval != 0) { 1626 if (++(xbb->flush_count) < xbb->flush_interval) { 1627 goto send_response; 1628 } else 1629 xbb->flush_count = 0; 1630 } 1631 1632 operation = BIO_FLUSH; 1633 reqlist->ds_tag_type = DEVSTAT_TAG_ORDERED; 1634 reqlist->ds_trans_type = DEVSTAT_NO_DATA; 1635 goto do_dispatch; 1636 /*NOTREACHED*/ 1637 default: 1638 DPRINTF("error: unknown block io operation [%d]\n", 1639 reqlist->operation); 1640 reqlist->status = BLKIF_RSP_ERROR; 1641 goto send_response; 1642 } 1643 1644 reqlist->xbb = xbb; 1645 xbb_sg = xbb->xbb_sgs; 1646 map = xbb->maps; 1647 seg_idx = 0; 1648 1649 STAILQ_FOREACH(nreq, &reqlist->contig_req_list, links) { 1650 blkif_request_t *ring_req; 1651 RING_IDX req_ring_idx; 1652 u_int req_seg_idx; 1653 1654 ring_req = nreq->ring_req; 1655 req_ring_idx = nreq->req_ring_idx; 1656 nr_sects = 0; 1657 nseg = ring_req->nr_segments; 1658 nreq->nr_pages = nseg; 1659 nreq->nr_512b_sectors = 0; 1660 req_seg_idx = 0; 1661 sg = NULL; 1662 1663 /* Check that number of segments is sane. 
*/ 1664 if (__predict_false(nseg == 0) 1665 || __predict_false(nseg > xbb->max_request_segments)) { 1666 DPRINTF("Bad number of segments in request (%d)\n", 1667 nseg); 1668 reqlist->status = BLKIF_RSP_ERROR; 1669 goto send_response; 1670 } 1671 1672 block_segs = nseg; 1673 sg = ring_req->seg; 1674 last_block_sg = sg + block_segs; 1675 1676 while (sg < last_block_sg) { 1677 KASSERT(seg_idx < 1678 XBB_MAX_SEGMENTS_PER_REQLIST, 1679 ("seg_idx %d is too large, max " 1680 "segs %d\n", seg_idx, 1681 XBB_MAX_SEGMENTS_PER_REQLIST)); 1682 1683 xbb_sg->first_sect = sg->first_sect; 1684 xbb_sg->last_sect = sg->last_sect; 1685 xbb_sg->nsect = 1686 (int8_t)(sg->last_sect - 1687 sg->first_sect + 1); 1688 1689 if ((sg->last_sect >= (PAGE_SIZE >> 9)) 1690 || (xbb_sg->nsect <= 0)) { 1691 reqlist->status = BLKIF_RSP_ERROR; 1692 goto send_response; 1693 } 1694 1695 nr_sects += xbb_sg->nsect; 1696 map->host_addr = xbb_get_gntaddr(reqlist, 1697 seg_idx, /*sector*/0); 1698 KASSERT(map->host_addr + PAGE_SIZE <= 1699 xbb->ring_config.gnt_addr, 1700 ("Host address %#jx len %d overlaps " 1701 "ring address %#jx\n", 1702 (uintmax_t)map->host_addr, PAGE_SIZE, 1703 (uintmax_t)xbb->ring_config.gnt_addr)); 1704 1705 map->flags = GNTMAP_host_map; 1706 map->ref = sg->gref; 1707 map->dom = xbb->otherend_id; 1708 if (operation == BIO_WRITE) 1709 map->flags |= GNTMAP_readonly; 1710 sg++; 1711 map++; 1712 xbb_sg++; 1713 seg_idx++; 1714 req_seg_idx++; 1715 } 1716 1717 /* Convert to the disk's sector size */ 1718 nreq->nr_512b_sectors = nr_sects; 1719 nr_sects = (nr_sects << 9) >> xbb->sector_size_shift; 1720 total_sects += nr_sects; 1721 1722 if ((nreq->nr_512b_sectors & 1723 ((xbb->sector_size >> 9) - 1)) != 0) { 1724 device_printf(xbb->dev, "%s: I/O size (%d) is not " 1725 "a multiple of the backing store sector " 1726 "size (%d)\n", __func__, 1727 nreq->nr_512b_sectors << 9, 1728 xbb->sector_size); 1729 reqlist->status = BLKIF_RSP_ERROR; 1730 goto send_response; 1731 } 1732 } 1733 1734 error = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, 1735 xbb->maps, reqlist->nr_segments); 1736 if (error != 0) 1737 panic("Grant table operation failed (%d)", error); 1738 1739 reqlist->flags |= XBB_REQLIST_MAPPED; 1740 1741 for (seg_idx = 0, map = xbb->maps; seg_idx < reqlist->nr_segments; 1742 seg_idx++, map++){ 1743 1744 if (__predict_false(map->status != 0)) { 1745 DPRINTF("invalid buffer -- could not remap " 1746 "it (%d)\n", map->status); 1747 DPRINTF("Mapping(%d): Host Addr 0x%"PRIx64", flags " 1748 "0x%x ref 0x%x, dom %d\n", seg_idx, 1749 map->host_addr, map->flags, map->ref, 1750 map->dom); 1751 reqlist->status = BLKIF_RSP_ERROR; 1752 goto send_response; 1753 } 1754 1755 reqlist->gnt_handles[seg_idx] = map->handle; 1756 } 1757 if (reqlist->starting_sector_number + total_sects > 1758 xbb->media_num_sectors) { 1759 1760 DPRINTF("%s of [%" PRIu64 ",%" PRIu64 "] " 1761 "extends past end of device %s\n", 1762 operation == BIO_READ ? 
"read" : "write", 1763 reqlist->starting_sector_number, 1764 reqlist->starting_sector_number + total_sects, 1765 xbb->dev_name); 1766 reqlist->status = BLKIF_RSP_ERROR; 1767 goto send_response; 1768 } 1769 1770do_dispatch: 1771 1772 error = xbb->dispatch_io(xbb, 1773 reqlist, 1774 operation, 1775 bio_flags); 1776 1777 if (error != 0) { 1778 reqlist->status = BLKIF_RSP_ERROR; 1779 goto send_response; 1780 } 1781 1782 return (0); 1783 1784send_response: 1785 1786 xbb_complete_reqlist(xbb, reqlist); 1787 1788 return (0); 1789} 1790 1791static __inline int 1792xbb_count_sects(blkif_request_t *ring_req) 1793{ 1794 int i; 1795 int cur_size = 0; 1796 1797 for (i = 0; i < ring_req->nr_segments; i++) { 1798 int nsect; 1799 1800 nsect = (int8_t)(ring_req->seg[i].last_sect - 1801 ring_req->seg[i].first_sect + 1); 1802 if (nsect <= 0) 1803 break; 1804 1805 cur_size += nsect; 1806 } 1807 1808 return (cur_size); 1809} 1810 1811/** 1812 * Process incoming requests from the shared communication ring in response 1813 * to a signal on the ring's event channel. 1814 * 1815 * \param context Callback argument registerd during task initialization - 1816 * the xbb_softc for this instance. 1817 * \param pending The number of taskqueue_enqueue events that have 1818 * occurred since this handler was last run. 1819 */ 1820static void 1821xbb_run_queue(void *context, int pending) 1822{ 1823 struct xbb_softc *xbb; 1824 blkif_back_rings_t *rings; 1825 RING_IDX rp; 1826 uint64_t cur_sector; 1827 int cur_operation; 1828 struct xbb_xen_reqlist *reqlist; 1829 1830 1831 xbb = (struct xbb_softc *)context; 1832 rings = &xbb->rings; 1833 1834 /* 1835 * Work gather and dispatch loop. Note that we have a bias here 1836 * towards gathering I/O sent by blockfront. We first gather up 1837 * everything in the ring, as long as we have resources. Then we 1838 * dispatch one request, and then attempt to gather up any 1839 * additional requests that have come in while we were dispatching 1840 * the request. 1841 * 1842 * This allows us to get a clearer picture (via devstat) of how 1843 * many requests blockfront is queueing to us at any given time. 1844 */ 1845 for (;;) { 1846 int retval; 1847 1848 /* 1849 * Initialize reqlist to the last element in the pending 1850 * queue, if there is one. This allows us to add more 1851 * requests to that request list, if we have room. 1852 */ 1853 reqlist = STAILQ_LAST(&xbb->reqlist_pending_stailq, 1854 xbb_xen_reqlist, links); 1855 if (reqlist != NULL) { 1856 cur_sector = reqlist->next_contig_sector; 1857 cur_operation = reqlist->operation; 1858 } else { 1859 cur_operation = 0; 1860 cur_sector = 0; 1861 } 1862 1863 /* 1864 * Cache req_prod to avoid accessing a cache line shared 1865 * with the frontend. 1866 */ 1867 rp = rings->common.sring->req_prod; 1868 1869 /* Ensure we see queued requests up to 'rp'. */ 1870 rmb(); 1871 1872 /** 1873 * Run so long as there is work to consume and the generation 1874 * of a response will not overflow the ring. 1875 * 1876 * @note There's a 1 to 1 relationship between requests and 1877 * responses, so an overflow should never occur. This 1878 * test is to protect our domain from digesting bogus 1879 * data. Shouldn't we log this? 
1880 */ 1881 while (rings->common.req_cons != rp 1882 && RING_REQUEST_CONS_OVERFLOW(&rings->common, 1883 rings->common.req_cons) == 0){ 1884 blkif_request_t ring_req_storage; 1885 blkif_request_t *ring_req; 1886 int cur_size; 1887 1888 switch (xbb->abi) { 1889 case BLKIF_PROTOCOL_NATIVE: 1890 ring_req = RING_GET_REQUEST(&xbb->rings.native, 1891 rings->common.req_cons); 1892 break; 1893 case BLKIF_PROTOCOL_X86_32: 1894 { 1895 struct blkif_x86_32_request *ring_req32; 1896 1897 ring_req32 = RING_GET_REQUEST( 1898 &xbb->rings.x86_32, rings->common.req_cons); 1899 blkif_get_x86_32_req(&ring_req_storage, 1900 ring_req32); 1901 ring_req = &ring_req_storage; 1902 break; 1903 } 1904 case BLKIF_PROTOCOL_X86_64: 1905 { 1906 struct blkif_x86_64_request *ring_req64; 1907 1908 ring_req64 =RING_GET_REQUEST(&xbb->rings.x86_64, 1909 rings->common.req_cons); 1910 blkif_get_x86_64_req(&ring_req_storage, 1911 ring_req64); 1912 ring_req = &ring_req_storage; 1913 break; 1914 } 1915 default: 1916 panic("Unexpected blkif protocol ABI."); 1917 /* NOTREACHED */ 1918 } 1919 1920 /* 1921 * Check for situations that would require closing 1922 * off this I/O for further coalescing: 1923 * - Coalescing is turned off. 1924 * - Current I/O is out of sequence with the previous 1925 * I/O. 1926 * - Coalesced I/O would be too large. 1927 */ 1928 if ((reqlist != NULL) 1929 && ((xbb->no_coalesce_reqs != 0) 1930 || ((xbb->no_coalesce_reqs == 0) 1931 && ((ring_req->sector_number != cur_sector) 1932 || (ring_req->operation != cur_operation) 1933 || ((ring_req->nr_segments + reqlist->nr_segments) > 1934 xbb->max_reqlist_segments))))) { 1935 reqlist = NULL; 1936 } 1937 1938 /* 1939 * Grab and check for all resources in one shot. 1940 * If we can't get all of the resources we need, 1941 * the shortage is noted and the thread will get 1942 * woken up when more resources are available. 1943 */ 1944 retval = xbb_get_resources(xbb, &reqlist, ring_req, 1945 xbb->rings.common.req_cons); 1946 1947 if (retval != 0) { 1948 /* 1949 * Resource shortage has been recorded. 1950 * We'll be scheduled to run once a request 1951 * object frees up due to a completion. 1952 */ 1953 break; 1954 } 1955 1956 /* 1957 * Signify that we can overwrite this request with 1958 * a response by incrementing our consumer index. 1959 * The response won't be generated until after 1960 * we've already consumed all necessary data out 1961 * of the version of the request in the ring buffer 1962 * (for native mode). We must update the consumer 1963 * index before issuing back-end I/O so there is 1964 * no possibility that it will complete and a 1965 * response be generated before we make room in 1966 * the queue for that response. 1967 */ 1968 xbb->rings.common.req_cons++; 1969 xbb->reqs_received++; 1970 1971 cur_size = xbb_count_sects(ring_req); 1972 cur_sector = ring_req->sector_number + cur_size; 1973 reqlist->next_contig_sector = cur_sector; 1974 cur_operation = ring_req->operation; 1975 } 1976 1977 /* Check for I/O to dispatch */ 1978 reqlist = STAILQ_FIRST(&xbb->reqlist_pending_stailq); 1979 if (reqlist == NULL) { 1980 /* 1981 * We're out of work to do, put the task queue to 1982 * sleep. 1983 */ 1984 break; 1985 } 1986 1987 /* 1988 * Grab the first request off the queue and attempt 1989 * to dispatch it. 1990 */ 1991 STAILQ_REMOVE_HEAD(&xbb->reqlist_pending_stailq, links); 1992 1993 retval = xbb_dispatch_io(xbb, reqlist); 1994 if (retval != 0) { 1995 /* 1996 * xbb_dispatch_io() returns non-zero only when 1997 * there is a resource shortage. 
If that's the
1998 * case, re-queue this request on the head of the
1999 * queue, and go to sleep until we have more
2000 * resources.
2001 */
2002 STAILQ_INSERT_HEAD(&xbb->reqlist_pending_stailq,
2003 reqlist, links);
2004 break;
2005 } else {
2006 /*
2007 * If we still have anything on the queue after
2008 * removing the head entry, that is because we
2009 * met one of the criteria to create a new
2010 * request list (outlined above), and we'll call
2011 * that a forced dispatch for statistical purposes.
2012 *
2013 * Otherwise, if there is only one element on the
2014 * queue, we coalesced everything available on
2015 * the ring and we'll call that a normal dispatch.
2016 */
2017 reqlist = STAILQ_FIRST(&xbb->reqlist_pending_stailq);
2018
2019 if (reqlist != NULL)
2020 xbb->forced_dispatch++;
2021 else
2022 xbb->normal_dispatch++;
2023
2024 xbb->total_dispatch++;
2025 }
2026 }
2027}
2028
2029/**
2030 * Interrupt handler bound to the shared ring's event channel.
2031 *
2032 * \param arg Callback argument registered during event channel
2033 * binding - the xbb_softc for this instance.
2034 */
2035static int
2036xbb_filter(void *arg)
2037{
2038 struct xbb_softc *xbb;
2039
2040 /* Defer to taskqueue thread. */
2041 xbb = (struct xbb_softc *)arg;
2042 taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task);
2043
2044 return (FILTER_HANDLED);
2045}
2046
2047SDT_PROVIDER_DEFINE(xbb);
2048SDT_PROBE_DEFINE1(xbb, kernel, xbb_dispatch_dev, flush, "int");
2049SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_dev, read, "int", "uint64_t",
2050 "uint64_t");
2051SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_dev, write, "int",
2052 "uint64_t", "uint64_t");
2053
2054/*----------------------------- Backend Handlers -----------------------------*/
2055/**
2056 * Backend handler for character device access.
2057 *
2058 * \param xbb Per-instance xbb configuration structure.
2059 * \param reqlist Allocated internal request list structure.
2060 * \param operation BIO_* I/O operation code.
2061 * \param bio_flags Additional bio_flag data to pass to any generated
2062 * bios (e.g. BIO_ORDERED).
2063 *
2064 * \return 0 for success, errno codes for failure.
2065 */ 2066static int 2067xbb_dispatch_dev(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist, 2068 int operation, int bio_flags) 2069{ 2070 struct xbb_dev_data *dev_data; 2071 struct bio *bios[XBB_MAX_SEGMENTS_PER_REQLIST]; 2072 off_t bio_offset; 2073 struct bio *bio; 2074 struct xbb_sg *xbb_sg; 2075 u_int nbio; 2076 u_int bio_idx; 2077 u_int nseg; 2078 u_int seg_idx; 2079 int error; 2080 2081 dev_data = &xbb->backend.dev; 2082 bio_offset = (off_t)reqlist->starting_sector_number 2083 << xbb->sector_size_shift; 2084 error = 0; 2085 nbio = 0; 2086 bio_idx = 0; 2087 2088 if (operation == BIO_FLUSH) { 2089 bio = g_new_bio(); 2090 if (__predict_false(bio == NULL)) { 2091 DPRINTF("Unable to allocate bio for BIO_FLUSH\n"); 2092 error = ENOMEM; 2093 return (error); 2094 } 2095 2096 bio->bio_cmd = BIO_FLUSH; 2097 bio->bio_flags |= BIO_ORDERED; 2098 bio->bio_dev = dev_data->cdev; 2099 bio->bio_offset = 0; 2100 bio->bio_data = 0; 2101 bio->bio_done = xbb_bio_done; 2102 bio->bio_caller1 = reqlist; 2103 bio->bio_pblkno = 0; 2104 2105 reqlist->pendcnt = 1; 2106 2107 SDT_PROBE1(xbb, kernel, xbb_dispatch_dev, flush, 2108 device_get_unit(xbb->dev)); 2109 2110 (*dev_data->csw->d_strategy)(bio); 2111 2112 return (0); 2113 } 2114 2115 xbb_sg = xbb->xbb_sgs; 2116 bio = NULL; 2117 nseg = reqlist->nr_segments; 2118 2119 for (seg_idx = 0; seg_idx < nseg; seg_idx++, xbb_sg++) { 2120 2121 /* 2122 * KVA will not be contiguous, so any additional 2123 * I/O will need to be represented in a new bio. 2124 */ 2125 if ((bio != NULL) 2126 && (xbb_sg->first_sect != 0)) { 2127 if ((bio->bio_length & (xbb->sector_size - 1)) != 0) { 2128 printf("%s: Discontiguous I/O request " 2129 "from domain %d ends on " 2130 "non-sector boundary\n", 2131 __func__, xbb->otherend_id); 2132 error = EINVAL; 2133 goto fail_free_bios; 2134 } 2135 bio = NULL; 2136 } 2137 2138 if (bio == NULL) { 2139 /* 2140 * Make sure that the start of this bio is 2141 * aligned to a device sector. 2142 */ 2143 if ((bio_offset & (xbb->sector_size - 1)) != 0){ 2144 printf("%s: Misaligned I/O request " 2145 "from domain %d\n", __func__, 2146 xbb->otherend_id); 2147 error = EINVAL; 2148 goto fail_free_bios; 2149 } 2150 2151 bio = bios[nbio++] = g_new_bio(); 2152 if (__predict_false(bio == NULL)) { 2153 error = ENOMEM; 2154 goto fail_free_bios; 2155 } 2156 bio->bio_cmd = operation; 2157 bio->bio_flags |= bio_flags; 2158 bio->bio_dev = dev_data->cdev; 2159 bio->bio_offset = bio_offset; 2160 bio->bio_data = xbb_reqlist_ioaddr(reqlist, seg_idx, 2161 xbb_sg->first_sect); 2162 bio->bio_done = xbb_bio_done; 2163 bio->bio_caller1 = reqlist; 2164 bio->bio_pblkno = bio_offset >> xbb->sector_size_shift; 2165 } 2166 2167 bio->bio_length += xbb_sg->nsect << 9; 2168 bio->bio_bcount = bio->bio_length; 2169 bio_offset += xbb_sg->nsect << 9; 2170 2171 if (xbb_sg->last_sect != (PAGE_SIZE - 512) >> 9) { 2172 2173 if ((bio->bio_length & (xbb->sector_size - 1)) != 0) { 2174 printf("%s: Discontiguous I/O request " 2175 "from domain %d ends on " 2176 "non-sector boundary\n", 2177 __func__, xbb->otherend_id); 2178 error = EINVAL; 2179 goto fail_free_bios; 2180 } 2181 /* 2182 * KVA will not be contiguous, so any additional 2183 * I/O will need to be represented in a new bio. 
2184 */ 2185 bio = NULL; 2186 } 2187 } 2188 2189 reqlist->pendcnt = nbio; 2190 2191 for (bio_idx = 0; bio_idx < nbio; bio_idx++) 2192 { 2193#ifdef XBB_USE_BOUNCE_BUFFERS 2194 vm_offset_t kva_offset; 2195 2196 kva_offset = (vm_offset_t)bios[bio_idx]->bio_data 2197 - (vm_offset_t)reqlist->bounce; 2198 if (operation == BIO_WRITE) { 2199 memcpy(bios[bio_idx]->bio_data, 2200 (uint8_t *)reqlist->kva + kva_offset, 2201 bios[bio_idx]->bio_bcount); 2202 } 2203#endif 2204 if (operation == BIO_READ) { 2205 SDT_PROBE3(xbb, kernel, xbb_dispatch_dev, read, 2206 device_get_unit(xbb->dev), 2207 bios[bio_idx]->bio_offset, 2208 bios[bio_idx]->bio_length); 2209 } else if (operation == BIO_WRITE) { 2210 SDT_PROBE3(xbb, kernel, xbb_dispatch_dev, write, 2211 device_get_unit(xbb->dev), 2212 bios[bio_idx]->bio_offset, 2213 bios[bio_idx]->bio_length); 2214 } 2215 (*dev_data->csw->d_strategy)(bios[bio_idx]); 2216 } 2217 2218 return (error); 2219 2220fail_free_bios: 2221 for (bio_idx = 0; bio_idx < (nbio-1); bio_idx++) 2222 g_destroy_bio(bios[bio_idx]); 2223 2224 return (error); 2225} 2226 2227SDT_PROBE_DEFINE1(xbb, kernel, xbb_dispatch_file, flush, "int"); 2228SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_file, read, "int", "uint64_t", 2229 "uint64_t"); 2230SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_file, write, "int", 2231 "uint64_t", "uint64_t"); 2232 2233/** 2234 * Backend handler for file access. 2235 * 2236 * \param xbb Per-instance xbb configuration structure. 2237 * \param reqlist Allocated internal request list. 2238 * \param operation BIO_* I/O operation code. 2239 * \param flags Additional bio_flag data to pass to any generated bios 2240 * (e.g. BIO_ORDERED).. 2241 * 2242 * \return 0 for success, errno codes for failure. 2243 */ 2244static int 2245xbb_dispatch_file(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist, 2246 int operation, int flags) 2247{ 2248 struct xbb_file_data *file_data; 2249 u_int seg_idx; 2250 u_int nseg; 2251 struct uio xuio; 2252 struct xbb_sg *xbb_sg; 2253 struct iovec *xiovec; 2254#ifdef XBB_USE_BOUNCE_BUFFERS 2255 void **p_vaddr; 2256 int saved_uio_iovcnt; 2257#endif /* XBB_USE_BOUNCE_BUFFERS */ 2258 int error; 2259 2260 file_data = &xbb->backend.file; 2261 error = 0; 2262 bzero(&xuio, sizeof(xuio)); 2263 2264 switch (operation) { 2265 case BIO_READ: 2266 xuio.uio_rw = UIO_READ; 2267 break; 2268 case BIO_WRITE: 2269 xuio.uio_rw = UIO_WRITE; 2270 break; 2271 case BIO_FLUSH: { 2272 struct mount *mountpoint; 2273 2274 SDT_PROBE1(xbb, kernel, xbb_dispatch_file, flush, 2275 device_get_unit(xbb->dev)); 2276 2277 (void) vn_start_write(xbb->vn, &mountpoint, V_WAIT); 2278 2279 vn_lock(xbb->vn, LK_EXCLUSIVE | LK_RETRY); 2280 error = VOP_FSYNC(xbb->vn, MNT_WAIT, curthread); 2281 VOP_UNLOCK(xbb->vn, 0); 2282 2283 vn_finished_write(mountpoint); 2284 2285 goto bailout_send_response; 2286 /* NOTREACHED */ 2287 } 2288 default: 2289 panic("invalid operation %d", operation); 2290 /* NOTREACHED */ 2291 } 2292 xuio.uio_offset = (vm_offset_t)reqlist->starting_sector_number 2293 << xbb->sector_size_shift; 2294 xuio.uio_segflg = UIO_SYSSPACE; 2295 xuio.uio_iov = file_data->xiovecs; 2296 xuio.uio_iovcnt = 0; 2297 xbb_sg = xbb->xbb_sgs; 2298 nseg = reqlist->nr_segments; 2299 2300 for (xiovec = NULL, seg_idx = 0; seg_idx < nseg; seg_idx++, xbb_sg++) { 2301 2302 /* 2303 * If the first sector is not 0, the KVA will 2304 * not be contiguous and we'll need to go on 2305 * to another segment. 
2306 */ 2307 if (xbb_sg->first_sect != 0) 2308 xiovec = NULL; 2309 2310 if (xiovec == NULL) { 2311 xiovec = &file_data->xiovecs[xuio.uio_iovcnt]; 2312 xiovec->iov_base = xbb_reqlist_ioaddr(reqlist, 2313 seg_idx, xbb_sg->first_sect); 2314#ifdef XBB_USE_BOUNCE_BUFFERS 2315 /* 2316 * Store the address of the incoming 2317 * buffer at this particular offset 2318 * as well, so we can do the copy 2319 * later without having to do more 2320 * work to recalculate this address. 2321 */ 2322 p_vaddr = &file_data->xiovecs_vaddr[xuio.uio_iovcnt]; 2323 *p_vaddr = xbb_reqlist_vaddr(reqlist, seg_idx, 2324 xbb_sg->first_sect); 2325#endif /* XBB_USE_BOUNCE_BUFFERS */ 2326 xiovec->iov_len = 0; 2327 xuio.uio_iovcnt++; 2328 } 2329 2330 xiovec->iov_len += xbb_sg->nsect << 9; 2331 2332 xuio.uio_resid += xbb_sg->nsect << 9; 2333 2334 /* 2335 * If the last sector is not the full page 2336 * size count, the next segment will not be 2337 * contiguous in KVA and we need a new iovec. 2338 */ 2339 if (xbb_sg->last_sect != (PAGE_SIZE - 512) >> 9) 2340 xiovec = NULL; 2341 } 2342 2343 xuio.uio_td = curthread; 2344 2345#ifdef XBB_USE_BOUNCE_BUFFERS 2346 saved_uio_iovcnt = xuio.uio_iovcnt; 2347 2348 if (operation == BIO_WRITE) { 2349 /* Copy the write data to the local buffer. */ 2350 for (seg_idx = 0, p_vaddr = file_data->xiovecs_vaddr, 2351 xiovec = xuio.uio_iov; seg_idx < xuio.uio_iovcnt; 2352 seg_idx++, xiovec++, p_vaddr++) { 2353 2354 memcpy(xiovec->iov_base, *p_vaddr, xiovec->iov_len); 2355 } 2356 } else { 2357 /* 2358 * We only need to save off the iovecs in the case of a 2359 * read, because the copy for the read happens after the 2360 * VOP_READ(). (The uio will get modified in that call 2361 * sequence.) 2362 */ 2363 memcpy(file_data->saved_xiovecs, xuio.uio_iov, 2364 xuio.uio_iovcnt * sizeof(xuio.uio_iov[0])); 2365 } 2366#endif /* XBB_USE_BOUNCE_BUFFERS */ 2367 2368 switch (operation) { 2369 case BIO_READ: 2370 2371 SDT_PROBE3(xbb, kernel, xbb_dispatch_file, read, 2372 device_get_unit(xbb->dev), xuio.uio_offset, 2373 xuio.uio_resid); 2374 2375 vn_lock(xbb->vn, LK_EXCLUSIVE | LK_RETRY); 2376 2377 /* 2378 * UFS pays attention to IO_DIRECT for reads. If the 2379 * DIRECTIO option is configured into the kernel, it calls 2380 * ffs_rawread(). But that only works for single-segment 2381 * uios with user space addresses. In our case, with a 2382 * kernel uio, it still reads into the buffer cache, but it 2383 * will just try to release the buffer from the cache later 2384 * on in ffs_read(). 2385 * 2386 * ZFS does not pay attention to IO_DIRECT for reads. 2387 * 2388 * UFS does not pay attention to IO_SYNC for reads. 2389 * 2390 * ZFS pays attention to IO_SYNC (which translates into the 2391 * Solaris define FRSYNC for zfs_read()) for reads. It 2392 * attempts to sync the file before reading. 2393 * 2394 * So, to attempt to provide some barrier semantics in the 2395 * BIO_ORDERED case, set both IO_DIRECT and IO_SYNC. 2396 */ 2397 error = VOP_READ(xbb->vn, &xuio, (flags & BIO_ORDERED) ? 2398 (IO_DIRECT|IO_SYNC) : 0, file_data->cred); 2399 2400 VOP_UNLOCK(xbb->vn, 0); 2401 break; 2402 case BIO_WRITE: { 2403 struct mount *mountpoint; 2404 2405 SDT_PROBE3(xbb, kernel, xbb_dispatch_file, write, 2406 device_get_unit(xbb->dev), xuio.uio_offset, 2407 xuio.uio_resid); 2408 2409 (void)vn_start_write(xbb->vn, &mountpoint, V_WAIT); 2410 2411 vn_lock(xbb->vn, LK_EXCLUSIVE | LK_RETRY); 2412 2413 /* 2414 * UFS pays attention to IO_DIRECT for writes. The write 2415 * is done asynchronously. 
(Normally the write would just 2416 * get put into cache. 2417 * 2418 * UFS pays attention to IO_SYNC for writes. It will 2419 * attempt to write the buffer out synchronously if that 2420 * flag is set. 2421 * 2422 * ZFS does not pay attention to IO_DIRECT for writes. 2423 * 2424 * ZFS pays attention to IO_SYNC (a.k.a. FSYNC or FRSYNC) 2425 * for writes. It will flush the transaction from the 2426 * cache before returning. 2427 * 2428 * So if we've got the BIO_ORDERED flag set, we want 2429 * IO_SYNC in either the UFS or ZFS case. 2430 */ 2431 error = VOP_WRITE(xbb->vn, &xuio, (flags & BIO_ORDERED) ? 2432 IO_SYNC : 0, file_data->cred); 2433 VOP_UNLOCK(xbb->vn, 0); 2434 2435 vn_finished_write(mountpoint); 2436 2437 break; 2438 } 2439 default: 2440 panic("invalid operation %d", operation); 2441 /* NOTREACHED */ 2442 } 2443 2444#ifdef XBB_USE_BOUNCE_BUFFERS 2445 /* We only need to copy here for read operations */ 2446 if (operation == BIO_READ) { 2447 2448 for (seg_idx = 0, p_vaddr = file_data->xiovecs_vaddr, 2449 xiovec = file_data->saved_xiovecs; 2450 seg_idx < saved_uio_iovcnt; seg_idx++, 2451 xiovec++, p_vaddr++) { 2452 2453 /* 2454 * Note that we have to use the copy of the 2455 * io vector we made above. uiomove() modifies 2456 * the uio and its referenced vector as uiomove 2457 * performs the copy, so we can't rely on any 2458 * state from the original uio. 2459 */ 2460 memcpy(*p_vaddr, xiovec->iov_base, xiovec->iov_len); 2461 } 2462 } 2463#endif /* XBB_USE_BOUNCE_BUFFERS */ 2464 2465bailout_send_response: 2466 2467 if (error != 0) 2468 reqlist->status = BLKIF_RSP_ERROR; 2469 2470 xbb_complete_reqlist(xbb, reqlist); 2471 2472 return (0); 2473} 2474 2475/*--------------------------- Backend Configuration --------------------------*/ 2476/** 2477 * Close and cleanup any backend device/file specific state for this 2478 * block back instance. 2479 * 2480 * \param xbb Per-instance xbb configuration structure. 2481 */ 2482static void 2483xbb_close_backend(struct xbb_softc *xbb) 2484{ 2485 DROP_GIANT(); 2486 DPRINTF("closing dev=%s\n", xbb->dev_name); 2487 if (xbb->vn) { 2488 int flags = FREAD; 2489 2490 if ((xbb->flags & XBBF_READ_ONLY) == 0) 2491 flags |= FWRITE; 2492 2493 switch (xbb->device_type) { 2494 case XBB_TYPE_DISK: 2495 if (xbb->backend.dev.csw) { 2496 dev_relthread(xbb->backend.dev.cdev, 2497 xbb->backend.dev.dev_ref); 2498 xbb->backend.dev.csw = NULL; 2499 xbb->backend.dev.cdev = NULL; 2500 } 2501 break; 2502 case XBB_TYPE_FILE: 2503 break; 2504 case XBB_TYPE_NONE: 2505 default: 2506 panic("Unexpected backend type."); 2507 break; 2508 } 2509 2510 (void)vn_close(xbb->vn, flags, NOCRED, curthread); 2511 xbb->vn = NULL; 2512 2513 switch (xbb->device_type) { 2514 case XBB_TYPE_DISK: 2515 break; 2516 case XBB_TYPE_FILE: 2517 if (xbb->backend.file.cred != NULL) { 2518 crfree(xbb->backend.file.cred); 2519 xbb->backend.file.cred = NULL; 2520 } 2521 break; 2522 case XBB_TYPE_NONE: 2523 default: 2524 panic("Unexpected backend type."); 2525 break; 2526 } 2527 } 2528 PICKUP_GIANT(); 2529} 2530 2531/** 2532 * Open a character device to be used for backend I/O. 2533 * 2534 * \param xbb Per-instance xbb configuration structure. 2535 * 2536 * \return 0 for success, errno codes for failure. 
2537 */ 2538static int 2539xbb_open_dev(struct xbb_softc *xbb) 2540{ 2541 struct vattr vattr; 2542 struct cdev *dev; 2543 struct cdevsw *devsw; 2544 int error; 2545 2546 xbb->device_type = XBB_TYPE_DISK; 2547 xbb->dispatch_io = xbb_dispatch_dev; 2548 xbb->backend.dev.cdev = xbb->vn->v_rdev; 2549 xbb->backend.dev.csw = dev_refthread(xbb->backend.dev.cdev, 2550 &xbb->backend.dev.dev_ref); 2551 if (xbb->backend.dev.csw == NULL) 2552 panic("Unable to retrieve device switch"); 2553 2554 error = VOP_GETATTR(xbb->vn, &vattr, NOCRED); 2555 if (error) { 2556 xenbus_dev_fatal(xbb->dev, error, "error getting " 2557 "vnode attributes for device %s", 2558 xbb->dev_name); 2559 return (error); 2560 } 2561 2562 2563 dev = xbb->vn->v_rdev; 2564 devsw = dev->si_devsw; 2565 if (!devsw->d_ioctl) { 2566 xenbus_dev_fatal(xbb->dev, ENODEV, "no d_ioctl for " 2567 "device %s!", xbb->dev_name); 2568 return (ENODEV); 2569 } 2570 2571 error = devsw->d_ioctl(dev, DIOCGSECTORSIZE, 2572 (caddr_t)&xbb->sector_size, FREAD, 2573 curthread); 2574 if (error) { 2575 xenbus_dev_fatal(xbb->dev, error, 2576 "error calling ioctl DIOCGSECTORSIZE " 2577 "for device %s", xbb->dev_name); 2578 return (error); 2579 } 2580 2581 error = devsw->d_ioctl(dev, DIOCGMEDIASIZE, 2582 (caddr_t)&xbb->media_size, FREAD, 2583 curthread); 2584 if (error) { 2585 xenbus_dev_fatal(xbb->dev, error, 2586 "error calling ioctl DIOCGMEDIASIZE " 2587 "for device %s", xbb->dev_name); 2588 return (error); 2589 } 2590 2591 return (0); 2592} 2593 2594/** 2595 * Open a file to be used for backend I/O. 2596 * 2597 * \param xbb Per-instance xbb configuration structure. 2598 * 2599 * \return 0 for success, errno codes for failure. 2600 */ 2601static int 2602xbb_open_file(struct xbb_softc *xbb) 2603{ 2604 struct xbb_file_data *file_data; 2605 struct vattr vattr; 2606 int error; 2607 2608 file_data = &xbb->backend.file; 2609 xbb->device_type = XBB_TYPE_FILE; 2610 xbb->dispatch_io = xbb_dispatch_file; 2611 error = VOP_GETATTR(xbb->vn, &vattr, curthread->td_ucred); 2612 if (error != 0) { 2613 xenbus_dev_fatal(xbb->dev, error, 2614 "error calling VOP_GETATTR()" 2615 "for file %s", xbb->dev_name); 2616 return (error); 2617 } 2618 2619 /* 2620 * Verify that we have the ability to upgrade to exclusive 2621 * access on this file so we can trap errors at open instead 2622 * of reporting them during first access. 2623 */ 2624 if (VOP_ISLOCKED(xbb->vn) != LK_EXCLUSIVE) { 2625 vn_lock(xbb->vn, LK_UPGRADE | LK_RETRY); 2626 if (xbb->vn->v_iflag & VI_DOOMED) { 2627 error = EBADF; 2628 xenbus_dev_fatal(xbb->dev, error, 2629 "error locking file %s", 2630 xbb->dev_name); 2631 2632 return (error); 2633 } 2634 } 2635 2636 file_data->cred = crhold(curthread->td_ucred); 2637 xbb->media_size = vattr.va_size; 2638 2639 /* 2640 * XXX KDM vattr.va_blocksize may be larger than 512 bytes here. 2641 * With ZFS, it is 131072 bytes. Block sizes that large don't work 2642 * with disklabel and UFS on FreeBSD at least. Large block sizes 2643 * may not work with other OSes as well. So just export a sector 2644 * size of 512 bytes, which should work with any OS or 2645 * application. Since our backing is a file, any block size will 2646 * work fine for the backing store. 2647 */ 2648#if 0 2649 xbb->sector_size = vattr.va_blocksize; 2650#endif 2651 xbb->sector_size = 512; 2652 2653 /* 2654 * Sanity check. The media size has to be at least one 2655 * sector long. 
2656 */ 2657 if (xbb->media_size < xbb->sector_size) { 2658 error = EINVAL; 2659 xenbus_dev_fatal(xbb->dev, error, 2660 "file %s size %ju < block size %u", 2661 xbb->dev_name, 2662 (uintmax_t)xbb->media_size, 2663 xbb->sector_size); 2664 } 2665 return (error); 2666} 2667 2668/** 2669 * Open the backend provider for this connection. 2670 * 2671 * \param xbb Per-instance xbb configuration structure. 2672 * 2673 * \return 0 for success, errno codes for failure. 2674 */ 2675static int 2676xbb_open_backend(struct xbb_softc *xbb) 2677{ 2678 struct nameidata nd; 2679 int flags; 2680 int error; 2681 2682 flags = FREAD; 2683 error = 0; 2684 2685 DPRINTF("opening dev=%s\n", xbb->dev_name); 2686 2687 if (rootvnode == NULL) { 2688 xenbus_dev_fatal(xbb->dev, ENOENT, 2689 "Root file system not mounted"); 2690 return (ENOENT); 2691 } 2692 2693 if ((xbb->flags & XBBF_READ_ONLY) == 0) 2694 flags |= FWRITE; 2695 2696 pwd_ensure_dirs(); 2697 2698 again: 2699 NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, xbb->dev_name, curthread); 2700 error = vn_open(&nd, &flags, 0, NULL); 2701 if (error) { 2702 /* 2703 * This is the only reasonable guess we can make as far as 2704 * path if the user doesn't give us a fully qualified path. 2705 * If they want to specify a file, they need to specify the 2706 * full path. 2707 */ 2708 if (xbb->dev_name[0] != '/') { 2709 char *dev_path = "/dev/"; 2710 char *dev_name; 2711 2712 /* Try adding device path at beginning of name */ 2713 dev_name = malloc(strlen(xbb->dev_name) 2714 + strlen(dev_path) + 1, 2715 M_XENBLOCKBACK, M_NOWAIT); 2716 if (dev_name) { 2717 sprintf(dev_name, "%s%s", dev_path, 2718 xbb->dev_name); 2719 free(xbb->dev_name, M_XENBLOCKBACK); 2720 xbb->dev_name = dev_name; 2721 goto again; 2722 } 2723 } 2724 xenbus_dev_fatal(xbb->dev, error, "error opening device %s", 2725 xbb->dev_name); 2726 return (error); 2727 } 2728 2729 NDFREE(&nd, NDF_ONLY_PNBUF); 2730 2731 xbb->vn = nd.ni_vp; 2732 2733 /* We only support disks and files. */ 2734 if (vn_isdisk(xbb->vn, &error)) { 2735 error = xbb_open_dev(xbb); 2736 } else if (xbb->vn->v_type == VREG) { 2737 error = xbb_open_file(xbb); 2738 } else { 2739 error = EINVAL; 2740 xenbus_dev_fatal(xbb->dev, error, "%s is not a disk " 2741 "or file", xbb->dev_name); 2742 } 2743 VOP_UNLOCK(xbb->vn, 0); 2744 2745 if (error != 0) { 2746 xbb_close_backend(xbb); 2747 return (error); 2748 } 2749 2750 xbb->sector_size_shift = fls(xbb->sector_size) - 1; 2751 xbb->media_num_sectors = xbb->media_size >> xbb->sector_size_shift; 2752 2753 DPRINTF("opened %s=%s sector_size=%u media_size=%" PRId64 "\n", 2754 (xbb->device_type == XBB_TYPE_DISK) ? "dev" : "file", 2755 xbb->dev_name, xbb->sector_size, xbb->media_size); 2756 2757 return (0); 2758} 2759 2760/*------------------------ Inter-Domain Communication ------------------------*/ 2761/** 2762 * Free dynamically allocated KVA or pseudo-physical address allocations. 2763 * 2764 * \param xbb Per-instance xbb configuration structure. 2765 */ 2766static void 2767xbb_free_communication_mem(struct xbb_softc *xbb) 2768{ 2769 if (xbb->kva != 0) { 2770 if (xbb->pseudo_phys_res != NULL) { 2771 xenmem_free(xbb->dev, xbb->pseudo_phys_res_id, 2772 xbb->pseudo_phys_res); 2773 xbb->pseudo_phys_res = NULL; 2774 } 2775 } 2776 xbb->kva = 0; 2777 xbb->gnt_base_addr = 0; 2778 if (xbb->kva_free != NULL) { 2779 free(xbb->kva_free, M_XENBLOCKBACK); 2780 xbb->kva_free = NULL; 2781 } 2782} 2783 2784/** 2785 * Cleanup all inter-domain communication mechanisms. 2786 * 2787 * \param xbb Per-instance xbb configuration structure. 
2788 */
2789static int
2790xbb_disconnect(struct xbb_softc *xbb)
2791{
2792 struct gnttab_unmap_grant_ref ops[XBB_MAX_RING_PAGES];
2793 struct gnttab_unmap_grant_ref *op;
2794 u_int ring_idx;
2795 int error;
2796
2797 DPRINTF("\n");
2798
2799 if ((xbb->flags & XBBF_RING_CONNECTED) == 0)
2800 return (0);
2801
2802 xen_intr_unbind(&xbb->xen_intr_handle);
2803
2804 mtx_unlock(&xbb->lock);
2805 taskqueue_drain(xbb->io_taskqueue, &xbb->io_task);
2806 mtx_lock(&xbb->lock);
2807
2808 /*
2809 * No new interrupts can generate work, but we must wait
2810 * for all currently active requests to drain.
2811 */
2812 if (xbb->active_request_count != 0)
2813 return (EAGAIN);
2814
2815 for (ring_idx = 0, op = ops;
2816 ring_idx < xbb->ring_config.ring_pages;
2817 ring_idx++, op++) {
2818
2819 op->host_addr = xbb->ring_config.gnt_addr
2820 + (ring_idx * PAGE_SIZE);
2821 op->dev_bus_addr = xbb->ring_config.bus_addr[ring_idx];
2822 op->handle = xbb->ring_config.handle[ring_idx];
2823 }
2824
2825 error = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, ops,
2826 xbb->ring_config.ring_pages);
2827 if (error != 0)
2828 panic("Grant table op failed (%d)", error);
2829
2830 xbb_free_communication_mem(xbb);
2831
2832 if (xbb->requests != NULL) {
2833 free(xbb->requests, M_XENBLOCKBACK);
2834 xbb->requests = NULL;
2835 }
2836
2837 if (xbb->request_lists != NULL) {
2838 struct xbb_xen_reqlist *reqlist;
2839 int i;
2840
2841 /* There is one request list for every allocated request. */
2842 for (i = 0, reqlist = xbb->request_lists;
2843 i < xbb->max_requests; i++, reqlist++){
2844#ifdef XBB_USE_BOUNCE_BUFFERS
2845 if (reqlist->bounce != NULL) {
2846 free(reqlist->bounce, M_XENBLOCKBACK);
2847 reqlist->bounce = NULL;
2848 }
2849#endif
2850 if (reqlist->gnt_handles != NULL) {
2851 free(reqlist->gnt_handles, M_XENBLOCKBACK);
2852 reqlist->gnt_handles = NULL;
2853 }
2854 }
2855 free(xbb->request_lists, M_XENBLOCKBACK);
2856 xbb->request_lists = NULL;
2857 }
2858
2859 xbb->flags &= ~XBBF_RING_CONNECTED;
2860 return (0);
2861}
2862
2863/**
2864 * Map shared memory ring into domain local address space, initialize
2865 * ring control structures, and bind an interrupt to the event channel
2866 * used to notify us of ring changes.
2867 *
2868 * \param xbb Per-instance xbb configuration structure.
2869 */
2870static int
2871xbb_connect_ring(struct xbb_softc *xbb)
2872{
2873 struct gnttab_map_grant_ref gnts[XBB_MAX_RING_PAGES];
2874 struct gnttab_map_grant_ref *gnt;
2875 u_int ring_idx;
2876 int error;
2877
2878 if ((xbb->flags & XBBF_RING_CONNECTED) != 0)
2879 return (0);
2880
2881 /*
2882 * KVA for our ring is at the tail of the region of KVA allocated
2883 * by xbb_alloc_communication_mem().
2884 */ 2885 xbb->ring_config.va = xbb->kva 2886 + (xbb->kva_size 2887 - (xbb->ring_config.ring_pages * PAGE_SIZE)); 2888 xbb->ring_config.gnt_addr = xbb->gnt_base_addr 2889 + (xbb->kva_size 2890 - (xbb->ring_config.ring_pages * PAGE_SIZE)); 2891 2892 for (ring_idx = 0, gnt = gnts; 2893 ring_idx < xbb->ring_config.ring_pages; 2894 ring_idx++, gnt++) { 2895 2896 gnt->host_addr = xbb->ring_config.gnt_addr 2897 + (ring_idx * PAGE_SIZE); 2898 gnt->flags = GNTMAP_host_map; 2899 gnt->ref = xbb->ring_config.ring_ref[ring_idx]; 2900 gnt->dom = xbb->otherend_id; 2901 } 2902 2903 error = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, gnts, 2904 xbb->ring_config.ring_pages); 2905 if (error) 2906 panic("blkback: Ring page grant table op failed (%d)", error); 2907 2908 for (ring_idx = 0, gnt = gnts; 2909 ring_idx < xbb->ring_config.ring_pages; 2910 ring_idx++, gnt++) { 2911 if (gnt->status != 0) { 2912 xbb->ring_config.va = 0; 2913 xenbus_dev_fatal(xbb->dev, EACCES, 2914 "Ring shared page mapping failed. " 2915 "Status %d.", gnt->status); 2916 return (EACCES); 2917 } 2918 xbb->ring_config.handle[ring_idx] = gnt->handle; 2919 xbb->ring_config.bus_addr[ring_idx] = gnt->dev_bus_addr; 2920 } 2921 2922 /* Initialize the ring based on ABI. */ 2923 switch (xbb->abi) { 2924 case BLKIF_PROTOCOL_NATIVE: 2925 { 2926 blkif_sring_t *sring; 2927 sring = (blkif_sring_t *)xbb->ring_config.va; 2928 BACK_RING_INIT(&xbb->rings.native, sring, 2929 xbb->ring_config.ring_pages * PAGE_SIZE); 2930 break; 2931 } 2932 case BLKIF_PROTOCOL_X86_32: 2933 { 2934 blkif_x86_32_sring_t *sring_x86_32; 2935 sring_x86_32 = (blkif_x86_32_sring_t *)xbb->ring_config.va; 2936 BACK_RING_INIT(&xbb->rings.x86_32, sring_x86_32, 2937 xbb->ring_config.ring_pages * PAGE_SIZE); 2938 break; 2939 } 2940 case BLKIF_PROTOCOL_X86_64: 2941 { 2942 blkif_x86_64_sring_t *sring_x86_64; 2943 sring_x86_64 = (blkif_x86_64_sring_t *)xbb->ring_config.va; 2944 BACK_RING_INIT(&xbb->rings.x86_64, sring_x86_64, 2945 xbb->ring_config.ring_pages * PAGE_SIZE); 2946 break; 2947 } 2948 default: 2949 panic("Unexpected blkif protocol ABI."); 2950 } 2951 2952 xbb->flags |= XBBF_RING_CONNECTED; 2953 2954 error = xen_intr_bind_remote_port(xbb->dev, 2955 xbb->otherend_id, 2956 xbb->ring_config.evtchn, 2957 xbb_filter, 2958 /*ithread_handler*/NULL, 2959 /*arg*/xbb, 2960 INTR_TYPE_BIO | INTR_MPSAFE, 2961 &xbb->xen_intr_handle); 2962 if (error) { 2963 (void)xbb_disconnect(xbb); 2964 xenbus_dev_fatal(xbb->dev, error, "binding event channel"); 2965 return (error); 2966 } 2967 2968 DPRINTF("rings connected!\n"); 2969 2970 return 0; 2971} 2972 2973/** 2974 * Size KVA and pseudo-physical address allocations based on negotiated 2975 * values for the size and number of I/O requests, and the size of our 2976 * communication ring. 2977 * 2978 * \param xbb Per-instance xbb configuration structure. 2979 * 2980 * These address spaces are used to dynamically map pages in the 2981 * front-end's domain into our own. 
2982 */
2983static int
2984xbb_alloc_communication_mem(struct xbb_softc *xbb)
2985{
2986 xbb->reqlist_kva_pages = xbb->max_requests * xbb->max_request_segments;
2987 xbb->reqlist_kva_size = xbb->reqlist_kva_pages * PAGE_SIZE;
2988 xbb->kva_size = xbb->reqlist_kva_size +
2989 (xbb->ring_config.ring_pages * PAGE_SIZE);
2990
2991 xbb->kva_free = bit_alloc(xbb->reqlist_kva_pages, M_XENBLOCKBACK, M_NOWAIT);
2992 if (xbb->kva_free == NULL)
2993 return (ENOMEM);
2994
2995 DPRINTF("%s: kva_size = %d, reqlist_kva_size = %d\n",
2996 device_get_nameunit(xbb->dev), xbb->kva_size,
2997 xbb->reqlist_kva_size);
2998 /*
2999 * Reserve a range of pseudo-physical memory that we can map
3000 * into KVA. These pages will only be backed by machine
3001 * pages ("real memory") during the lifetime of front-end requests
3002 * via grant table operations.
3003 */
3004 xbb->pseudo_phys_res_id = 0;
3005 xbb->pseudo_phys_res = xenmem_alloc(xbb->dev, &xbb->pseudo_phys_res_id,
3006 xbb->kva_size);
3007 if (xbb->pseudo_phys_res == NULL) {
3008 xbb->kva = 0;
3009 return (ENOMEM);
3010 }
3011 xbb->kva = (vm_offset_t)rman_get_virtual(xbb->pseudo_phys_res);
3012 xbb->gnt_base_addr = rman_get_start(xbb->pseudo_phys_res);
3013
3014 DPRINTF("%s: kva: %#jx, gnt_base_addr: %#jx\n",
3015 device_get_nameunit(xbb->dev), (uintmax_t)xbb->kva,
3016 (uintmax_t)xbb->gnt_base_addr);
3017 return (0);
3018}
3019
3020/**
3021 * Collect front-end information from the XenStore.
3022 *
3023 * \param xbb Per-instance xbb configuration structure.
3024 */
3025static int
3026xbb_collect_frontend_info(struct xbb_softc *xbb)
3027{
3028 char protocol_abi[64];
3029 const char *otherend_path;
3030 int error;
3031 u_int ring_idx;
3032 u_int ring_page_order;
3033 size_t ring_size;
3034
3035 otherend_path = xenbus_get_otherend_path(xbb->dev);
3036
3037 /*
3038 * Protocol defaults valid even if all negotiation fails.
3039 */
3040 xbb->ring_config.ring_pages = 1;
3041 xbb->max_request_segments = BLKIF_MAX_SEGMENTS_PER_REQUEST;
3042 xbb->max_request_size = xbb->max_request_segments * PAGE_SIZE;
3043
3044 /*
3045 * Mandatory data (used in all versions of the protocol) first.
3046 */
3047 error = xs_scanf(XST_NIL, otherend_path,
3048 "event-channel", NULL, "%" PRIu32,
3049 &xbb->ring_config.evtchn);
3050 if (error != 0) {
3051 xenbus_dev_fatal(xbb->dev, error,
3052 "Unable to retrieve event-channel information "
3053 "from frontend %s. Unable to connect.",
3054 xenbus_get_otherend_path(xbb->dev));
3055 return (error);
3056 }
3057
3058 /*
3059 * These fields are initialized to legacy protocol defaults
3060 * so we only need to fail if reading the updated value succeeds
3061 * and the new value is outside of its allowed range.
3062 *
3063 * \note xs_gather() returns on the first encountered error, so
3064 * we must use independent calls in order to guarantee
3065 * we don't miss information in a sparsely populated front-end
3066 * tree.
3067 *
3068 * \note xs_scanf() does not update variables for unmatched
3069 * fields.
3070 */
3071 ring_page_order = 0;
3072 xbb->max_requests = 32;
3073
3074 (void)xs_scanf(XST_NIL, otherend_path,
3075 "ring-page-order", NULL, "%u",
3076 &ring_page_order);
3077 xbb->ring_config.ring_pages = 1 << ring_page_order;
3078 ring_size = PAGE_SIZE * xbb->ring_config.ring_pages;
3079 xbb->max_requests = BLKIF_MAX_RING_REQUESTS(ring_size);
3080
3081 if (xbb->ring_config.ring_pages > XBB_MAX_RING_PAGES) {
3082 xenbus_dev_fatal(xbb->dev, EINVAL,
3083 "Front-end specified ring-pages of %u "
3084 "exceeds backend limit of %u. "
3085 "Unable to connect.",
3086 xbb->ring_config.ring_pages,
3087 XBB_MAX_RING_PAGES);
3088 return (EINVAL);
3089 }
3090
3091 if (xbb->ring_config.ring_pages == 1) {
3092 error = xs_gather(XST_NIL, otherend_path,
3093 "ring-ref", "%" PRIu32,
3094 &xbb->ring_config.ring_ref[0],
3095 NULL);
3096 if (error != 0) {
3097 xenbus_dev_fatal(xbb->dev, error,
3098 "Unable to retrieve ring information "
3099 "from frontend %s. Unable to "
3100 "connect.",
3101 xenbus_get_otherend_path(xbb->dev));
3102 return (error);
3103 }
3104 } else {
3105 /* Multi-page ring format. */
3106 for (ring_idx = 0; ring_idx < xbb->ring_config.ring_pages;
3107 ring_idx++) {
3108 char ring_ref_name[]= "ring_refXX";
3109
3110 snprintf(ring_ref_name, sizeof(ring_ref_name),
3111 "ring-ref%u", ring_idx);
3112 error = xs_scanf(XST_NIL, otherend_path,
3113 ring_ref_name, NULL, "%" PRIu32,
3114 &xbb->ring_config.ring_ref[ring_idx]);
3115 if (error != 0) {
3116 xenbus_dev_fatal(xbb->dev, error,
3117 "Failed to retrieve grant "
3118 "reference for page %u of "
3119 "shared ring. Unable "
3120 "to connect.", ring_idx);
3121 return (error);
3122 }
3123 }
3124 }
3125
3126 error = xs_gather(XST_NIL, otherend_path,
3127 "protocol", "%63s", protocol_abi,
3128 NULL);
3129 if (error != 0
3130 || !strcmp(protocol_abi, XEN_IO_PROTO_ABI_NATIVE)) {
3131 /*
3132 * Assume native if the frontend has not
3133 * published ABI data or it has published and
3134 * matches our own ABI.
3135 */
3136 xbb->abi = BLKIF_PROTOCOL_NATIVE;
3137 } else if (!strcmp(protocol_abi, XEN_IO_PROTO_ABI_X86_32)) {
3138
3139 xbb->abi = BLKIF_PROTOCOL_X86_32;
3140 } else if (!strcmp(protocol_abi, XEN_IO_PROTO_ABI_X86_64)) {
3141
3142 xbb->abi = BLKIF_PROTOCOL_X86_64;
3143 } else {
3144
3145 xenbus_dev_fatal(xbb->dev, EINVAL,
3146 "Unknown protocol ABI (%s) published by "
3147 "frontend. Unable to connect.", protocol_abi);
3148 return (EINVAL);
3149 }
3150 return (0);
3151}
3152
3153/**
3154 * Allocate per-request data structures given request size and number
3155 * information negotiated with the front-end.
3156 *
3157 * \param xbb Per-instance xbb configuration structure.
3158 */
3159static int
3160xbb_alloc_requests(struct xbb_softc *xbb)
3161{
3162 struct xbb_xen_req *req;
3163 struct xbb_xen_req *last_req;
3164
3165 /*
3166 * Allocate request bookkeeping data structures.
3167 */
3168 xbb->requests = malloc(xbb->max_requests * sizeof(*xbb->requests),
3169 M_XENBLOCKBACK, M_NOWAIT|M_ZERO);
3170 if (xbb->requests == NULL) {
3171 xenbus_dev_fatal(xbb->dev, ENOMEM,
3172 "Unable to allocate request structures");
3173 return (ENOMEM);
3174 }
3175
3176 req = xbb->requests;
3177 last_req = &xbb->requests[xbb->max_requests - 1];
3178 STAILQ_INIT(&xbb->request_free_stailq);
3179 while (req <= last_req) {
3180 STAILQ_INSERT_TAIL(&xbb->request_free_stailq, req, links);
3181 req++;
3182 }
3183 return (0);
3184}
3185
3186static int
3187xbb_alloc_request_lists(struct xbb_softc *xbb)
3188{
3189 struct xbb_xen_reqlist *reqlist;
3190 int i;
3191
3192 /*
3193 * If no requests can be merged, we need 1 request list per
3194 * in-flight request.
3195 */ 3196 xbb->request_lists = malloc(xbb->max_requests * 3197 sizeof(*xbb->request_lists), M_XENBLOCKBACK, M_NOWAIT|M_ZERO); 3198 if (xbb->request_lists == NULL) { 3199 xenbus_dev_fatal(xbb->dev, ENOMEM, 3200 "Unable to allocate request list structures"); 3201 return (ENOMEM); 3202 } 3203 3204 STAILQ_INIT(&xbb->reqlist_free_stailq); 3205 STAILQ_INIT(&xbb->reqlist_pending_stailq); 3206 for (i = 0; i < xbb->max_requests; i++) { 3207 int seg; 3208 3209 reqlist = &xbb->request_lists[i]; 3210 3211 reqlist->xbb = xbb; 3212 3213#ifdef XBB_USE_BOUNCE_BUFFERS 3214 reqlist->bounce = malloc(xbb->max_reqlist_size, 3215 M_XENBLOCKBACK, M_NOWAIT); 3216 if (reqlist->bounce == NULL) { 3217 xenbus_dev_fatal(xbb->dev, ENOMEM, 3218 "Unable to allocate request " 3219 "bounce buffers"); 3220 return (ENOMEM); 3221 } 3222#endif /* XBB_USE_BOUNCE_BUFFERS */ 3223 3224 reqlist->gnt_handles = malloc(xbb->max_reqlist_segments * 3225 sizeof(*reqlist->gnt_handles), 3226 M_XENBLOCKBACK, M_NOWAIT|M_ZERO); 3227 if (reqlist->gnt_handles == NULL) { 3228 xenbus_dev_fatal(xbb->dev, ENOMEM, 3229 "Unable to allocate request " 3230 "grant references"); 3231 return (ENOMEM); 3232 } 3233 3234 for (seg = 0; seg < xbb->max_reqlist_segments; seg++) 3235 reqlist->gnt_handles[seg] = GRANT_REF_INVALID; 3236 3237 STAILQ_INSERT_TAIL(&xbb->reqlist_free_stailq, reqlist, links); 3238 } 3239 return (0); 3240} 3241 3242/** 3243 * Supply information about the physical device to the frontend 3244 * via XenBus. 3245 * 3246 * \param xbb Per-instance xbb configuration structure. 3247 */ 3248static int 3249xbb_publish_backend_info(struct xbb_softc *xbb) 3250{ 3251 struct xs_transaction xst; 3252 const char *our_path; 3253 const char *leaf; 3254 int error; 3255 3256 our_path = xenbus_get_node(xbb->dev); 3257 while (1) { 3258 error = xs_transaction_start(&xst); 3259 if (error != 0) { 3260 xenbus_dev_fatal(xbb->dev, error, 3261 "Error publishing backend info " 3262 "(start transaction)"); 3263 return (error); 3264 } 3265 3266 leaf = "sectors"; 3267 error = xs_printf(xst, our_path, leaf, 3268 "%"PRIu64, xbb->media_num_sectors); 3269 if (error != 0) 3270 break; 3271 3272 /* XXX Support all VBD attributes here. */ 3273 leaf = "info"; 3274 error = xs_printf(xst, our_path, leaf, "%u", 3275 xbb->flags & XBBF_READ_ONLY 3276 ? VDISK_READONLY : 0); 3277 if (error != 0) 3278 break; 3279 3280 leaf = "sector-size"; 3281 error = xs_printf(xst, our_path, leaf, "%u", 3282 xbb->sector_size); 3283 if (error != 0) 3284 break; 3285 3286 error = xs_transaction_end(xst, 0); 3287 if (error == 0) { 3288 return (0); 3289 } else if (error != EAGAIN) { 3290 xenbus_dev_fatal(xbb->dev, error, "ending transaction"); 3291 return (error); 3292 } 3293 } 3294 3295 xenbus_dev_fatal(xbb->dev, error, "writing %s/%s", 3296 our_path, leaf); 3297 xs_transaction_end(xst, 1); 3298 return (error); 3299} 3300 3301/** 3302 * Connect to our blkfront peer now that it has completed publishing 3303 * its configuration into the XenStore. 3304 * 3305 * \param xbb Per-instance xbb configuration structure. 3306 */ 3307static void 3308xbb_connect(struct xbb_softc *xbb) 3309{ 3310 int error; 3311 3312 if (xenbus_get_state(xbb->dev) != XenbusStateInitialised) 3313 return; 3314 3315 if (xbb_collect_frontend_info(xbb) != 0) 3316 return; 3317 3318 xbb->flags &= ~XBBF_SHUTDOWN; 3319 3320 /* 3321 * We limit the maximum number of reqlist segments to the maximum 3322 * number of segments in the ring, or our absolute maximum, 3323 * whichever is smaller. 
3324 */ 3325 xbb->max_reqlist_segments = MIN(xbb->max_request_segments * 3326 xbb->max_requests, XBB_MAX_SEGMENTS_PER_REQLIST); 3327 3328 /* 3329 * The maximum size is simply a function of the number of segments 3330 * we can handle. 3331 */ 3332 xbb->max_reqlist_size = xbb->max_reqlist_segments * PAGE_SIZE; 3333 3334 /* Allocate resources whose size depends on front-end configuration. */ 3335 error = xbb_alloc_communication_mem(xbb); 3336 if (error != 0) { 3337 xenbus_dev_fatal(xbb->dev, error, 3338 "Unable to allocate communication memory"); 3339 return; 3340 } 3341 3342 error = xbb_alloc_requests(xbb); 3343 if (error != 0) { 3344 /* Specific errors are reported by xbb_alloc_requests(). */ 3345 return; 3346 } 3347 3348 error = xbb_alloc_request_lists(xbb); 3349 if (error != 0) { 3350 /* Specific errors are reported by xbb_alloc_request_lists(). */ 3351 return; 3352 } 3353 3354 /* 3355 * Connect communication channel. 3356 */ 3357 error = xbb_connect_ring(xbb); 3358 if (error != 0) { 3359 /* Specific errors are reported by xbb_connect_ring(). */ 3360 return; 3361 } 3362 3363 if (xbb_publish_backend_info(xbb) != 0) { 3364 /* 3365 * If we can't publish our data, we cannot participate 3366 * in this connection, and waiting for a front-end state 3367 * change will not help the situation. 3368 */ 3369 (void)xbb_disconnect(xbb); 3370 return; 3371 } 3372 3373 /* Ready for I/O. */ 3374 xenbus_set_state(xbb->dev, XenbusStateConnected); 3375} 3376 3377/*-------------------------- Device Teardown Support -------------------------*/ 3378/** 3379 * Perform device shutdown functions. 3380 * 3381 * \param xbb Per-instance xbb configuration structure. 3382 * 3383 * Mark this instance as shutting down, wait for any active I/O on the 3384 * backend device/file to drain, disconnect from the front-end, and notify 3385 * any waiters (e.g. a thread invoking our detach method) that detach can 3386 * now proceed. 3387 */ 3388static int 3389xbb_shutdown(struct xbb_softc *xbb) 3390{ 3391 XenbusState frontState; 3392 int error; 3393 3394 DPRINTF("\n"); 3395 3396 /* 3397 * Due to the need to drop our mutex during some 3398 * xenbus operations, it is possible for two threads 3399 * to attempt to close out shutdown processing at 3400 * the same time. Tell the caller that hits this 3401 * race to try back later. 3402 */ 3403 if ((xbb->flags & XBBF_IN_SHUTDOWN) != 0) 3404 return (EAGAIN); 3405 3406 xbb->flags |= XBBF_IN_SHUTDOWN; 3407 mtx_unlock(&xbb->lock); 3408 3409 if (xbb->hotplug_watch.node != NULL) { 3410 xs_unregister_watch(&xbb->hotplug_watch); 3411 free(xbb->hotplug_watch.node, M_XENBLOCKBACK); 3412 xbb->hotplug_watch.node = NULL; 3413 } 3414 3415 if (xenbus_get_state(xbb->dev) < XenbusStateClosing) 3416 xenbus_set_state(xbb->dev, XenbusStateClosing); 3417 3418 frontState = xenbus_get_otherend_state(xbb->dev); 3419 mtx_lock(&xbb->lock); 3420 xbb->flags &= ~XBBF_IN_SHUTDOWN; 3421 3422 /* The front can submit I/O until entering the closed state. */ 3423 if (frontState < XenbusStateClosed) 3424 return (EAGAIN); 3425 3426 DPRINTF("\n"); 3427 3428 /* Indicate shutdown is in progress. */ 3429 xbb->flags |= XBBF_SHUTDOWN; 3430 3431 /* Disconnect from the front-end. */ 3432 error = xbb_disconnect(xbb); 3433 if (error != 0) { 3434 /* 3435 * Requests still outstanding. We'll be called again 3436 * once they complete. 
3437 */
3438 KASSERT(error == EAGAIN,
3439 ("%s: Unexpected xbb_disconnect() failure %d",
3440 __func__, error));
3441
3442 return (error);
3443 }
3444
3445 DPRINTF("\n");
3446
3447 /* Indicate to xbb_detach() that it is safe to proceed. */
3448 wakeup(xbb);
3449
3450 return (0);
3451}
3452
3453/**
3454 * Report an attach-time error to the console and Xen, and clean up
3455 * this instance by forcing immediate detach processing.
3456 *
3457 * \param xbb Per-instance xbb configuration structure.
3458 * \param err Errno describing the error.
3459 * \param fmt Printf style format and arguments.
3460 */
3461static void
3462xbb_attach_failed(struct xbb_softc *xbb, int err, const char *fmt, ...)
3463{
3464 va_list ap;
3465 va_list ap_hotplug;
3466
3467 va_start(ap, fmt);
3468 va_copy(ap_hotplug, ap);
3469 xs_vprintf(XST_NIL, xenbus_get_node(xbb->dev),
3470 "hotplug-error", fmt, ap_hotplug);
3471 va_end(ap_hotplug);
3472 xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
3473 "hotplug-status", "error");
3474
3475 xenbus_dev_vfatal(xbb->dev, err, fmt, ap);
3476 va_end(ap);
3477
3478 xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
3479 "online", "0");
3480 xbb_detach(xbb->dev);
3481}
3482
3483/*---------------------------- NewBus Entrypoints ----------------------------*/
3484/**
3485 * Inspect a XenBus device and claim it if it is of the appropriate type.
3486 *
3487 * \param dev NewBus device object representing a candidate XenBus device.
3488 *
3489 * \return 0 for success, errno codes for failure.
3490 */
3491static int
3492xbb_probe(device_t dev)
3493{
3494
3495 if (!strcmp(xenbus_get_type(dev), "vbd")) {
3496 device_set_desc(dev, "Backend Virtual Block Device");
3497 device_quiet(dev);
3498 return (0);
3499 }
3500
3501 return (ENXIO);
3502}
3503
3504/**
3505 * Set up sysctl variables to control various Block Back parameters.
3506 *
3507 * \param xbb Xen Block Back softc.
3508 * 3509 */ 3510static void 3511xbb_setup_sysctl(struct xbb_softc *xbb) 3512{ 3513 struct sysctl_ctx_list *sysctl_ctx = NULL; 3514 struct sysctl_oid *sysctl_tree = NULL; 3515 3516 sysctl_ctx = device_get_sysctl_ctx(xbb->dev); 3517 if (sysctl_ctx == NULL) 3518 return; 3519 3520 sysctl_tree = device_get_sysctl_tree(xbb->dev); 3521 if (sysctl_tree == NULL) 3522 return; 3523 3524 SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3525 "disable_flush", CTLFLAG_RW, &xbb->disable_flush, 0, 3526 "fake the flush command"); 3527 3528 SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3529 "flush_interval", CTLFLAG_RW, &xbb->flush_interval, 0, 3530 "send a real flush for N flush requests"); 3531 3532 SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3533 "no_coalesce_reqs", CTLFLAG_RW, &xbb->no_coalesce_reqs,0, 3534 "Don't coalesce contiguous requests"); 3535 3536 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3537 "reqs_received", CTLFLAG_RW, &xbb->reqs_received, 3538 "how many I/O requests we have received"); 3539 3540 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3541 "reqs_completed", CTLFLAG_RW, &xbb->reqs_completed, 3542 "how many I/O requests have been completed"); 3543 3544 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3545 "reqs_queued_for_completion", CTLFLAG_RW, 3546 &xbb->reqs_queued_for_completion, 3547 "how many I/O requests queued but not yet pushed"); 3548 3549 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3550 "reqs_completed_with_error", CTLFLAG_RW, 3551 &xbb->reqs_completed_with_error, 3552 "how many I/O requests completed with error status"); 3553 3554 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3555 "forced_dispatch", CTLFLAG_RW, &xbb->forced_dispatch, 3556 "how many I/O dispatches were forced"); 3557 3558 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3559 "normal_dispatch", CTLFLAG_RW, &xbb->normal_dispatch, 3560 "how many I/O dispatches were normal"); 3561 3562 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3563 "total_dispatch", CTLFLAG_RW, &xbb->total_dispatch, 3564 "total number of I/O dispatches"); 3565 3566 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3567 "kva_shortages", CTLFLAG_RW, &xbb->kva_shortages, 3568 "how many times we have run out of KVA"); 3569 3570 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3571 "request_shortages", CTLFLAG_RW, 3572 &xbb->request_shortages, 3573 "how many times we have run out of requests"); 3574 3575 SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3576 "max_requests", CTLFLAG_RD, &xbb->max_requests, 0, 3577 "maximum outstanding requests (negotiated)"); 3578 3579 SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3580 "max_request_segments", CTLFLAG_RD, 3581 &xbb->max_request_segments, 0, 3582 "maximum number of pages per requests (negotiated)"); 3583 3584 SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3585 "max_request_size", CTLFLAG_RD, 3586 &xbb->max_request_size, 0, 3587 "maximum size in bytes of a request (negotiated)"); 3588 3589 SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3590 "ring_pages", CTLFLAG_RD, 3591 &xbb->ring_config.ring_pages, 0, 3592 "communication channel pages (negotiated)"); 3593} 3594 3595static void 3596xbb_attach_disk(struct xs_watch *watch, const char **vec, unsigned int len) 
3597{ 3598 device_t dev; 3599 struct xbb_softc *xbb; 3600 int error; 3601 3602 dev = (device_t) watch->callback_data; 3603 xbb = device_get_softc(dev); 3604 3605 error = xs_gather(XST_NIL, xenbus_get_node(dev), "physical-device-path", 3606 NULL, &xbb->dev_name, NULL); 3607 if (error != 0) 3608 return; 3609 3610 xs_unregister_watch(watch); 3611 free(watch->node, M_XENBLOCKBACK); 3612 watch->node = NULL; 3613 3614 /* Collect physical device information. */ 3615 error = xs_gather(XST_NIL, xenbus_get_otherend_path(xbb->dev), 3616 "device-type", NULL, &xbb->dev_type, 3617 NULL); 3618 if (error != 0) 3619 xbb->dev_type = NULL; 3620 3621 error = xs_gather(XST_NIL, xenbus_get_node(dev), 3622 "mode", NULL, &xbb->dev_mode, 3623 NULL); 3624 if (error != 0) { 3625 xbb_attach_failed(xbb, error, "reading backend fields at %s", 3626 xenbus_get_node(dev)); 3627 return; 3628 } 3629 3630 /* Parse fopen style mode flags. */ 3631 if (strchr(xbb->dev_mode, 'w') == NULL) 3632 xbb->flags |= XBBF_READ_ONLY; 3633 3634 /* 3635 * Verify the physical device is present and can support 3636 * the desired I/O mode. 3637 */ 3638 error = xbb_open_backend(xbb); 3639 if (error != 0) { 3640 xbb_attach_failed(xbb, error, "Unable to open %s", 3641 xbb->dev_name); 3642 return; 3643 } 3644 3645 /* Use devstat(9) for recording statistics. */ 3646 xbb->xbb_stats = devstat_new_entry("xbb", device_get_unit(xbb->dev), 3647 xbb->sector_size, 3648 DEVSTAT_ALL_SUPPORTED, 3649 DEVSTAT_TYPE_DIRECT 3650 | DEVSTAT_TYPE_IF_OTHER, 3651 DEVSTAT_PRIORITY_OTHER); 3652 3653 xbb->xbb_stats_in = devstat_new_entry("xbbi", device_get_unit(xbb->dev), 3654 xbb->sector_size, 3655 DEVSTAT_ALL_SUPPORTED, 3656 DEVSTAT_TYPE_DIRECT 3657 | DEVSTAT_TYPE_IF_OTHER, 3658 DEVSTAT_PRIORITY_OTHER); 3659 /* 3660 * Setup sysctl variables. 3661 */ 3662 xbb_setup_sysctl(xbb); 3663 3664 /* 3665 * Create a taskqueue for doing work that must occur from a 3666 * thread context. 3667 */ 3668 xbb->io_taskqueue = taskqueue_create_fast(device_get_nameunit(dev), 3669 M_NOWAIT, 3670 taskqueue_thread_enqueue, 3671 /*contxt*/&xbb->io_taskqueue); 3672 if (xbb->io_taskqueue == NULL) { 3673 xbb_attach_failed(xbb, error, "Unable to create taskqueue"); 3674 return; 3675 } 3676 3677 taskqueue_start_threads(&xbb->io_taskqueue, 3678 /*num threads*/1, 3679 /*priority*/PWAIT, 3680 /*thread name*/ 3681 "%s taskq", device_get_nameunit(dev)); 3682 3683 /* Update hot-plug status to satisfy xend. */ 3684 error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev), 3685 "hotplug-status", "connected"); 3686 if (error) { 3687 xbb_attach_failed(xbb, error, "writing %s/hotplug-status", 3688 xenbus_get_node(xbb->dev)); 3689 return; 3690 } 3691 3692 /* Tell the front end that we are ready to connect. */ 3693 xenbus_set_state(dev, XenbusStateInitialised); 3694} 3695 3696/** 3697 * Attach to a XenBus device that has been claimed by our probe routine. 3698 * 3699 * \param dev NewBus device object representing this Xen Block Back instance. 3700 * 3701 * \return 0 for success, errno codes for failure. 3702 */ 3703static int 3704xbb_attach(device_t dev) 3705{ 3706 struct xbb_softc *xbb; 3707 int error; 3708 u_int max_ring_page_order; 3709 struct sbuf *watch_path; 3710 3711 DPRINTF("Attaching to %s\n", xenbus_get_node(dev)); 3712 3713 /* 3714 * Basic initialization. 3715 * After this block it is safe to call xbb_detach() 3716 * to clean up any allocated data for this instance. 
3717 */ 3718 xbb = device_get_softc(dev); 3719 xbb->dev = dev; 3720 xbb->otherend_id = xenbus_get_otherend_id(dev); 3721 TASK_INIT(&xbb->io_task, /*priority*/0, xbb_run_queue, xbb); 3722 mtx_init(&xbb->lock, device_get_nameunit(dev), NULL, MTX_DEF); 3723 3724 /* 3725 * Publish protocol capabilities for consumption by the 3726 * front-end. 3727 */ 3728 error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev), 3729 "feature-barrier", "1"); 3730 if (error) { 3731 xbb_attach_failed(xbb, error, "writing %s/feature-barrier", 3732 xenbus_get_node(xbb->dev)); 3733 return (error); 3734 } 3735 3736 error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev), 3737 "feature-flush-cache", "1"); 3738 if (error) { 3739 xbb_attach_failed(xbb, error, "writing %s/feature-flush-cache", 3740 xenbus_get_node(xbb->dev)); 3741 return (error); 3742 } 3743 3744 max_ring_page_order = flsl(XBB_MAX_RING_PAGES) - 1; 3745 error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev), 3746 "max-ring-page-order", "%u", max_ring_page_order); 3747 if (error) { 3748 xbb_attach_failed(xbb, error, "writing %s/max-ring-page-order", 3749 xenbus_get_node(xbb->dev)); 3750 return (error); 3751 } 3752 3753 /* 3754 * We need to wait for hotplug script execution before 3755 * moving forward. 3756 */ 3757 watch_path = xs_join(xenbus_get_node(xbb->dev), "physical-device-path"); 3758 xbb->hotplug_watch.callback_data = (uintptr_t)dev; 3759 xbb->hotplug_watch.callback = xbb_attach_disk; 3760 KASSERT(xbb->hotplug_watch.node == NULL, ("watch node already setup")); 3761 xbb->hotplug_watch.node = strdup(sbuf_data(watch_path), M_XENBLOCKBACK); 3762 sbuf_delete(watch_path); 3763 error = xs_register_watch(&xbb->hotplug_watch); 3764 if (error != 0) { 3765 xbb_attach_failed(xbb, error, "failed to create watch on %s", 3766 xbb->hotplug_watch.node); 3767 free(xbb->hotplug_watch.node, M_XENBLOCKBACK); 3768 return (error); 3769 } 3770 3771 /* Tell the toolstack blkback has attached. */ 3772 xenbus_set_state(dev, XenbusStateInitWait); 3773 3774 return (0); 3775} 3776 3777/** 3778 * Detach from a block back device instance. 3779 * 3780 * \param dev NewBus device object representing this Xen Block Back instance. 3781 * 3782 * \return 0 for success, errno codes for failure. 3783 * 3784 * \note A block back device may be detached at any time in its life-cycle, 3785 * including part way through the attach process. For this reason, 3786 * initialization order and the initialization state checks in this 3787 * routine must be carefully coupled so that attach time failures 3788 * are gracefully handled. 
3789 */ 3790static int 3791xbb_detach(device_t dev) 3792{ 3793 struct xbb_softc *xbb; 3794 3795 DPRINTF("\n"); 3796 3797 xbb = device_get_softc(dev); 3798 mtx_lock(&xbb->lock); 3799 while (xbb_shutdown(xbb) == EAGAIN) { 3800 msleep(xbb, &xbb->lock, /*wakeup prio unchanged*/0, 3801 "xbb_shutdown", 0); 3802 } 3803 mtx_unlock(&xbb->lock); 3804 3805 DPRINTF("\n"); 3806 3807 if (xbb->io_taskqueue != NULL) 3808 taskqueue_free(xbb->io_taskqueue); 3809 3810 if (xbb->xbb_stats != NULL) 3811 devstat_remove_entry(xbb->xbb_stats); 3812 3813 if (xbb->xbb_stats_in != NULL) 3814 devstat_remove_entry(xbb->xbb_stats_in); 3815 3816 xbb_close_backend(xbb); 3817 3818 if (xbb->dev_mode != NULL) { 3819 free(xbb->dev_mode, M_XENSTORE); 3820 xbb->dev_mode = NULL; 3821 } 3822 3823 if (xbb->dev_type != NULL) { 3824 free(xbb->dev_type, M_XENSTORE); 3825 xbb->dev_type = NULL; 3826 } 3827 3828 if (xbb->dev_name != NULL) { 3829 free(xbb->dev_name, M_XENSTORE); 3830 xbb->dev_name = NULL; 3831 } 3832 3833 mtx_destroy(&xbb->lock); 3834 return (0); 3835} 3836 3837/** 3838 * Prepare this block back device for suspension of this VM. 3839 * 3840 * \param dev NewBus device object representing this Xen Block Back instance. 3841 * 3842 * \return 0 for success, errno codes for failure. 3843 */ 3844static int 3845xbb_suspend(device_t dev) 3846{ 3847#ifdef NOT_YET 3848 struct xbb_softc *sc = device_get_softc(dev); 3849 3850 /* Prevent new requests being issued until we fix things up. */ 3851 mtx_lock(&sc->xb_io_lock); 3852 sc->connected = BLKIF_STATE_SUSPENDED; 3853 mtx_unlock(&sc->xb_io_lock); 3854#endif 3855 3856 return (0); 3857} 3858 3859/** 3860 * Perform any processing required to recover from a suspended state. 3861 * 3862 * \param dev NewBus device object representing this Xen Block Back instance. 3863 * 3864 * \return 0 for success, errno codes for failure. 3865 */ 3866static int 3867xbb_resume(device_t dev) 3868{ 3869 return (0); 3870} 3871 3872/** 3873 * Handle state changes expressed via the XenStore by our front-end peer. 3874 * 3875 * \param dev NewBus device object representing this Xen 3876 * Block Back instance. 3877 * \param frontend_state The new state of the front-end. 3878 * 3879 * \return 0 for success, errno codes for failure. 
3880 */ 3881static void 3882xbb_frontend_changed(device_t dev, XenbusState frontend_state) 3883{ 3884 struct xbb_softc *xbb = device_get_softc(dev); 3885 3886 DPRINTF("frontend_state=%s, xbb_state=%s\n", 3887 xenbus_strstate(frontend_state), 3888 xenbus_strstate(xenbus_get_state(xbb->dev))); 3889 3890 switch (frontend_state) { 3891 case XenbusStateInitialising: 3892 break; 3893 case XenbusStateInitialised: 3894 case XenbusStateConnected: 3895 xbb_connect(xbb); 3896 break; 3897 case XenbusStateClosing: 3898 case XenbusStateClosed: 3899 mtx_lock(&xbb->lock); 3900 xbb_shutdown(xbb); 3901 mtx_unlock(&xbb->lock); 3902 if (frontend_state == XenbusStateClosed) 3903 xenbus_set_state(xbb->dev, XenbusStateClosed); 3904 break; 3905 default: 3906 xenbus_dev_fatal(xbb->dev, EINVAL, "saw state %d at frontend", 3907 frontend_state); 3908 break; 3909 } 3910} 3911 3912/*---------------------------- NewBus Registration ---------------------------*/ 3913static device_method_t xbb_methods[] = { 3914 /* Device interface */ 3915 DEVMETHOD(device_probe, xbb_probe), 3916 DEVMETHOD(device_attach, xbb_attach), 3917 DEVMETHOD(device_detach, xbb_detach), 3918 DEVMETHOD(device_shutdown, bus_generic_shutdown), 3919 DEVMETHOD(device_suspend, xbb_suspend), 3920 DEVMETHOD(device_resume, xbb_resume), 3921 3922 /* Xenbus interface */ 3923 DEVMETHOD(xenbus_otherend_changed, xbb_frontend_changed), 3924 3925 { 0, 0 } 3926}; 3927 3928static driver_t xbb_driver = { 3929 "xbbd", 3930 xbb_methods, 3931 sizeof(struct xbb_softc), 3932}; 3933devclass_t xbb_devclass; 3934 3935DRIVER_MODULE(xbbd, xenbusb_back, xbb_driver, xbb_devclass, 0, 0); 3936