blkback.c revision 346817
1/*- 2 * Copyright (c) 2009-2012 Spectra Logic Corporation 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions, and the following disclaimer, 10 * without modification. 11 * 2. Redistributions in binary form must reproduce at minimum a disclaimer 12 * substantially similar to the "NO WARRANTY" disclaimer below 13 * ("Disclaimer") and any redistribution must be conditioned upon 14 * including a substantially similar Disclaimer requirement for further 15 * binary redistribution. 16 * 17 * NO WARRANTY 18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR 21 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 22 * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 24 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 26 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING 27 * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 28 * POSSIBILITY OF SUCH DAMAGES. 29 * 30 * Authors: Justin T. Gibbs (Spectra Logic Corporation) 31 * Ken Merry (Spectra Logic Corporation) 32 */ 33#include <sys/cdefs.h> 34__FBSDID("$FreeBSD: stable/11/sys/dev/xen/blkback/blkback.c 346817 2019-04-28 13:21:01Z dchagin $"); 35 36/** 37 * \file blkback.c 38 * 39 * \brief Device driver supporting the vending of block storage from 40 * a FreeBSD domain to other domains. 41 */ 42 43#include <sys/param.h> 44#include <sys/systm.h> 45#include <sys/kernel.h> 46#include <sys/malloc.h> 47 48#include <sys/bio.h> 49#include <sys/bus.h> 50#include <sys/conf.h> 51#include <sys/devicestat.h> 52#include <sys/disk.h> 53#include <sys/fcntl.h> 54#include <sys/filedesc.h> 55#include <sys/kdb.h> 56#include <sys/module.h> 57#include <sys/namei.h> 58#include <sys/proc.h> 59#include <sys/rman.h> 60#include <sys/taskqueue.h> 61#include <sys/types.h> 62#include <sys/vnode.h> 63#include <sys/mount.h> 64#include <sys/sysctl.h> 65#include <sys/bitstring.h> 66#include <sys/sdt.h> 67 68#include <geom/geom.h> 69 70#include <machine/_inttypes.h> 71 72#include <vm/vm.h> 73#include <vm/vm_extern.h> 74#include <vm/vm_kern.h> 75 76#include <xen/xen-os.h> 77#include <xen/blkif.h> 78#include <xen/gnttab.h> 79#include <xen/xen_intr.h> 80 81#include <xen/interface/event_channel.h> 82#include <xen/interface/grant_table.h> 83 84#include <xen/xenbus/xenbusvar.h> 85 86/*--------------------------- Compile-time Tunables --------------------------*/ 87/** 88 * The maximum number of shared memory ring pages we will allow in a 89 * negotiated block-front/back communication channel. Allow enough 90 * ring space for all requests to be XBB_MAX_REQUEST_SIZE'd. 91 */ 92#define XBB_MAX_RING_PAGES 32 93 94/** 95 * The maximum number of outstanding request blocks (request headers plus 96 * additional segment blocks) we will allow in a negotiated block-front/back 97 * communication channel. 
98 */ 99#define XBB_MAX_REQUESTS \ 100 __CONST_RING_SIZE(blkif, PAGE_SIZE * XBB_MAX_RING_PAGES) 101 102/** 103 * \brief Define to force all I/O to be performed on memory owned by the 104 * backend device, with a copy-in/out to the remote domain's memory. 105 * 106 * \note This option is currently required when this driver's domain is 107 * operating in HVM mode on a system using an IOMMU. 108 * 109 * This driver uses Xen's grant table API to gain access to the memory of 110 * the remote domains it serves. When our domain is operating in PV mode, 111 * the grant table mechanism directly updates our domain's page table entries 112 * to point to the physical pages of the remote domain. This scheme guarantees 113 * that blkback and the backing devices it uses can safely perform DMA 114 * operations to satisfy requests. In HVM mode, Xen may use a HW IOMMU to 115 * insure that our domain cannot DMA to pages owned by another domain. As 116 * of Xen 4.0, IOMMU mappings for HVM guests are not updated via the grant 117 * table API. For this reason, in HVM mode, we must bounce all requests into 118 * memory that is mapped into our domain at domain startup and thus has 119 * valid IOMMU mappings. 120 */ 121#define XBB_USE_BOUNCE_BUFFERS 122 123/** 124 * \brief Define to enable rudimentary request logging to the console. 125 */ 126#undef XBB_DEBUG 127 128/*---------------------------------- Macros ----------------------------------*/ 129/** 130 * Custom malloc type for all driver allocations. 131 */ 132static MALLOC_DEFINE(M_XENBLOCKBACK, "xbbd", "Xen Block Back Driver Data"); 133 134#ifdef XBB_DEBUG 135#define DPRINTF(fmt, args...) \ 136 printf("xbb(%s:%d): " fmt, __FUNCTION__, __LINE__, ##args) 137#else 138#define DPRINTF(fmt, args...) do {} while(0) 139#endif 140 141/** 142 * The maximum mapped region size per request we will allow in a negotiated 143 * block-front/back communication channel. 144 */ 145#define XBB_MAX_REQUEST_SIZE \ 146 MIN(MAXPHYS, BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE) 147 148/** 149 * The maximum number of segments (within a request header and accompanying 150 * segment blocks) per request we will allow in a negotiated block-front/back 151 * communication channel. 152 */ 153#define XBB_MAX_SEGMENTS_PER_REQUEST \ 154 (MIN(UIO_MAXIOV, \ 155 MIN(BLKIF_MAX_SEGMENTS_PER_REQUEST, \ 156 (XBB_MAX_REQUEST_SIZE / PAGE_SIZE) + 1))) 157 158/** 159 * The maximum number of ring pages that we can allow per request list. 160 * We limit this to the maximum number of segments per request, because 161 * that is already a reasonable number of segments to aggregate. This 162 * number should never be smaller than XBB_MAX_SEGMENTS_PER_REQUEST, 163 * because that would leave situations where we can't dispatch even one 164 * large request. 165 */ 166#define XBB_MAX_SEGMENTS_PER_REQLIST XBB_MAX_SEGMENTS_PER_REQUEST 167 168/*--------------------------- Forward Declarations ---------------------------*/ 169struct xbb_softc; 170struct xbb_xen_req; 171 172static void xbb_attach_failed(struct xbb_softc *xbb, int err, const char *fmt, 173 ...) __attribute__((format(printf, 3, 4))); 174static int xbb_shutdown(struct xbb_softc *xbb); 175 176/*------------------------------ Data Structures -----------------------------*/ 177 178STAILQ_HEAD(xbb_xen_req_list, xbb_xen_req); 179 180typedef enum { 181 XBB_REQLIST_NONE = 0x00, 182 XBB_REQLIST_MAPPED = 0x01 183} xbb_reqlist_flags; 184 185struct xbb_xen_reqlist { 186 /** 187 * Back reference to the parent block back instance for this 188 * request. 
Used during bio_done handling. 189 */ 190 struct xbb_softc *xbb; 191 192 /** 193 * BLKIF_OP code for this request. 194 */ 195 int operation; 196 197 /** 198 * Set to BLKIF_RSP_* to indicate request status. 199 * 200 * This field allows an error status to be recorded even if the 201 * delivery of this status must be deferred. Deferred reporting 202 * is necessary, for example, when an error is detected during 203 * completion processing of one bio when other bios for this 204 * request are still outstanding. 205 */ 206 int status; 207 208 /** 209 * Number of 512 byte sectors not transferred. 210 */ 211 int residual_512b_sectors; 212 213 /** 214 * Starting sector number of the first request in the list. 215 */ 216 off_t starting_sector_number; 217 218 /** 219 * If we're going to coalesce, the next contiguous sector would be 220 * this one. 221 */ 222 off_t next_contig_sector; 223 224 /** 225 * Number of child requests in the list. 226 */ 227 int num_children; 228 229 /** 230 * Number of I/O requests still pending on the backend. 231 */ 232 int pendcnt; 233 234 /** 235 * Total number of segments for requests in the list. 236 */ 237 int nr_segments; 238 239 /** 240 * Flags for this particular request list. 241 */ 242 xbb_reqlist_flags flags; 243 244 /** 245 * Kernel virtual address space reserved for this request 246 * list structure and used to map the remote domain's pages for 247 * this I/O, into our domain's address space. 248 */ 249 uint8_t *kva; 250 251 /** 252 * Base, pseudo-physical address, corresponding to the start 253 * of this request's kva region. 254 */ 255 uint64_t gnt_base; 256 257 258#ifdef XBB_USE_BOUNCE_BUFFERS 259 /** 260 * Pre-allocated domain local memory used to proxy remote 261 * domain memory during I/O operations. 262 */ 263 uint8_t *bounce; 264#endif 265 266 /** 267 * Array of grant handles (one per page) used to map this request. 268 */ 269 grant_handle_t *gnt_handles; 270 271 /** 272 * Device statistics request ordering type (ordered or simple). 273 */ 274 devstat_tag_type ds_tag_type; 275 276 /** 277 * Device statistics request type (read, write, no_data). 278 */ 279 devstat_trans_flags ds_trans_type; 280 281 /** 282 * The start time for this request. 283 */ 284 struct bintime ds_t0; 285 286 /** 287 * Linked list of contiguous requests with the same operation type. 288 */ 289 struct xbb_xen_req_list contig_req_list; 290 291 /** 292 * Linked list links used to aggregate idle requests in the 293 * request list free pool (xbb->reqlist_free_stailq) and pending 294 * requests waiting for execution (xbb->reqlist_pending_stailq). 295 */ 296 STAILQ_ENTRY(xbb_xen_reqlist) links; 297}; 298 299STAILQ_HEAD(xbb_xen_reqlist_list, xbb_xen_reqlist); 300 301/** 302 * \brief Object tracking an in-flight I/O from a Xen VBD consumer. 303 */ 304struct xbb_xen_req { 305 /** 306 * Linked list links used to aggregate requests into a reqlist 307 * and to store them in the request free pool. 308 */ 309 STAILQ_ENTRY(xbb_xen_req) links; 310 311 /** 312 * The remote domain's identifier for this I/O request. 313 */ 314 uint64_t id; 315 316 /** 317 * The number of pages currently mapped for this request. 318 */ 319 int nr_pages; 320 321 /** 322 * The number of 512 byte sectors comprising this requests. 323 */ 324 int nr_512b_sectors; 325 326 /** 327 * BLKIF_OP code for this request. 328 */ 329 int operation; 330 331 /** 332 * Storage used for non-native ring requests. 333 */ 334 blkif_request_t ring_req_storage; 335 336 /** 337 * Pointer to the Xen request in the ring. 
338 */ 339 blkif_request_t *ring_req; 340 341 /** 342 * Consumer index for this request. 343 */ 344 RING_IDX req_ring_idx; 345 346 /** 347 * The start time for this request. 348 */ 349 struct bintime ds_t0; 350 351 /** 352 * Pointer back to our parent request list. 353 */ 354 struct xbb_xen_reqlist *reqlist; 355}; 356SLIST_HEAD(xbb_xen_req_slist, xbb_xen_req); 357 358/** 359 * \brief Configuration data for the shared memory request ring 360 * used to communicate with the front-end client of this 361 * this driver. 362 */ 363struct xbb_ring_config { 364 /** KVA address where ring memory is mapped. */ 365 vm_offset_t va; 366 367 /** The pseudo-physical address where ring memory is mapped.*/ 368 uint64_t gnt_addr; 369 370 /** 371 * Grant table handles, one per-ring page, returned by the 372 * hyperpervisor upon mapping of the ring and required to 373 * unmap it when a connection is torn down. 374 */ 375 grant_handle_t handle[XBB_MAX_RING_PAGES]; 376 377 /** 378 * The device bus address returned by the hypervisor when 379 * mapping the ring and required to unmap it when a connection 380 * is torn down. 381 */ 382 uint64_t bus_addr[XBB_MAX_RING_PAGES]; 383 384 /** The number of ring pages mapped for the current connection. */ 385 u_int ring_pages; 386 387 /** 388 * The grant references, one per-ring page, supplied by the 389 * front-end, allowing us to reference the ring pages in the 390 * front-end's domain and to map these pages into our own domain. 391 */ 392 grant_ref_t ring_ref[XBB_MAX_RING_PAGES]; 393 394 /** The interrupt driven even channel used to signal ring events. */ 395 evtchn_port_t evtchn; 396}; 397 398/** 399 * Per-instance connection state flags. 400 */ 401typedef enum 402{ 403 /** 404 * The front-end requested a read-only mount of the 405 * back-end device/file. 406 */ 407 XBBF_READ_ONLY = 0x01, 408 409 /** Communication with the front-end has been established. */ 410 XBBF_RING_CONNECTED = 0x02, 411 412 /** 413 * Front-end requests exist in the ring and are waiting for 414 * xbb_xen_req objects to free up. 415 */ 416 XBBF_RESOURCE_SHORTAGE = 0x04, 417 418 /** Connection teardown in progress. */ 419 XBBF_SHUTDOWN = 0x08, 420 421 /** A thread is already performing shutdown processing. */ 422 XBBF_IN_SHUTDOWN = 0x10 423} xbb_flag_t; 424 425/** Backend device type. */ 426typedef enum { 427 /** Backend type unknown. */ 428 XBB_TYPE_NONE = 0x00, 429 430 /** 431 * Backend type disk (access via cdev switch 432 * strategy routine). 433 */ 434 XBB_TYPE_DISK = 0x01, 435 436 /** Backend type file (access vnode operations.). */ 437 XBB_TYPE_FILE = 0x02 438} xbb_type; 439 440/** 441 * \brief Structure used to memoize information about a per-request 442 * scatter-gather list. 443 * 444 * The chief benefit of using this data structure is it avoids having 445 * to reparse the possibly discontiguous S/G list in the original 446 * request. Due to the way that the mapping of the memory backing an 447 * I/O transaction is handled by Xen, a second pass is unavoidable. 448 * At least this way the second walk is a simple array traversal. 449 * 450 * \note A single Scatter/Gather element in the block interface covers 451 * at most 1 machine page. In this context a sector (blkif 452 * nomenclature, not what I'd choose) is a 512b aligned unit 453 * of mapping within the machine page referenced by an S/G 454 * element. 455 */ 456struct xbb_sg { 457 /** The number of 512b data chunks mapped in this S/G element. 
*/ 458 int16_t nsect; 459 460 /** 461 * The index (0 based) of the first 512b data chunk mapped 462 * in this S/G element. 463 */ 464 uint8_t first_sect; 465 466 /** 467 * The index (0 based) of the last 512b data chunk mapped 468 * in this S/G element. 469 */ 470 uint8_t last_sect; 471}; 472 473/** 474 * Character device backend specific configuration data. 475 */ 476struct xbb_dev_data { 477 /** Cdev used for device backend access. */ 478 struct cdev *cdev; 479 480 /** Cdev switch used for device backend access. */ 481 struct cdevsw *csw; 482 483 /** Used to hold a reference on opened cdev backend devices. */ 484 int dev_ref; 485}; 486 487/** 488 * File backend specific configuration data. 489 */ 490struct xbb_file_data { 491 /** Credentials to use for vnode backed (file based) I/O. */ 492 struct ucred *cred; 493 494 /** 495 * \brief Array of io vectors used to process file based I/O. 496 * 497 * Only a single file based request is outstanding per-xbb instance, 498 * so we only need one of these. 499 */ 500 struct iovec xiovecs[XBB_MAX_SEGMENTS_PER_REQLIST]; 501#ifdef XBB_USE_BOUNCE_BUFFERS 502 503 /** 504 * \brief Array of io vectors used to handle bouncing of file reads. 505 * 506 * Vnode operations are free to modify uio data during their 507 * exectuion. In the case of a read with bounce buffering active, 508 * we need some of the data from the original uio in order to 509 * bounce-out the read data. This array serves as the temporary 510 * storage for this saved data. 511 */ 512 struct iovec saved_xiovecs[XBB_MAX_SEGMENTS_PER_REQLIST]; 513 514 /** 515 * \brief Array of memoized bounce buffer kva offsets used 516 * in the file based backend. 517 * 518 * Due to the way that the mapping of the memory backing an 519 * I/O transaction is handled by Xen, a second pass through 520 * the request sg elements is unavoidable. We memoize the computed 521 * bounce address here to reduce the cost of the second walk. 522 */ 523 void *xiovecs_vaddr[XBB_MAX_SEGMENTS_PER_REQLIST]; 524#endif /* XBB_USE_BOUNCE_BUFFERS */ 525}; 526 527/** 528 * Collection of backend type specific data. 529 */ 530union xbb_backend_data { 531 struct xbb_dev_data dev; 532 struct xbb_file_data file; 533}; 534 535/** 536 * Function signature of backend specific I/O handlers. 537 */ 538typedef int (*xbb_dispatch_t)(struct xbb_softc *xbb, 539 struct xbb_xen_reqlist *reqlist, int operation, 540 int flags); 541 542/** 543 * Per-instance configuration data. 544 */ 545struct xbb_softc { 546 547 /** 548 * Task-queue used to process I/O requests. 549 */ 550 struct taskqueue *io_taskqueue; 551 552 /** 553 * Single "run the request queue" task enqueued 554 * on io_taskqueue. 555 */ 556 struct task io_task; 557 558 /** Device type for this instance. */ 559 xbb_type device_type; 560 561 /** NewBus device corresponding to this instance. */ 562 device_t dev; 563 564 /** Backend specific dispatch routine for this instance. */ 565 xbb_dispatch_t dispatch_io; 566 567 /** The number of requests outstanding on the backend device/file. */ 568 int active_request_count; 569 570 /** Free pool of request tracking structures. */ 571 struct xbb_xen_req_list request_free_stailq; 572 573 /** Array, sized at connection time, of request tracking structures. */ 574 struct xbb_xen_req *requests; 575 576 /** Free pool of request list structures. */ 577 struct xbb_xen_reqlist_list reqlist_free_stailq; 578 579 /** List of pending request lists awaiting execution. 
*/ 580 struct xbb_xen_reqlist_list reqlist_pending_stailq; 581 582 /** Array, sized at connection time, of request list structures. */ 583 struct xbb_xen_reqlist *request_lists; 584 585 /** 586 * Global pool of kva used for mapping remote domain ring 587 * and I/O transaction data. 588 */ 589 vm_offset_t kva; 590 591 /** Pseudo-physical address corresponding to kva. */ 592 uint64_t gnt_base_addr; 593 594 /** The size of the global kva pool. */ 595 int kva_size; 596 597 /** The size of the KVA area used for request lists. */ 598 int reqlist_kva_size; 599 600 /** The number of pages of KVA used for request lists */ 601 int reqlist_kva_pages; 602 603 /** Bitmap of free KVA pages */ 604 bitstr_t *kva_free; 605 606 /** 607 * \brief Cached value of the front-end's domain id. 608 * 609 * This value is used at once for each mapped page in 610 * a transaction. We cache it to avoid incuring the 611 * cost of an ivar access every time this is needed. 612 */ 613 domid_t otherend_id; 614 615 /** 616 * \brief The blkif protocol abi in effect. 617 * 618 * There are situations where the back and front ends can 619 * have a different, native abi (e.g. intel x86_64 and 620 * 32bit x86 domains on the same machine). The back-end 621 * always accommodates the front-end's native abi. That 622 * value is pulled from the XenStore and recorded here. 623 */ 624 int abi; 625 626 /** 627 * \brief The maximum number of requests and request lists allowed 628 * to be in flight at a time. 629 * 630 * This value is negotiated via the XenStore. 631 */ 632 u_int max_requests; 633 634 /** 635 * \brief The maximum number of segments (1 page per segment) 636 * that can be mapped by a request. 637 * 638 * This value is negotiated via the XenStore. 639 */ 640 u_int max_request_segments; 641 642 /** 643 * \brief Maximum number of segments per request list. 644 * 645 * This value is derived from and will generally be larger than 646 * max_request_segments. 647 */ 648 u_int max_reqlist_segments; 649 650 /** 651 * The maximum size of any request to this back-end 652 * device. 653 * 654 * This value is negotiated via the XenStore. 655 */ 656 u_int max_request_size; 657 658 /** 659 * The maximum size of any request list. This is derived directly 660 * from max_reqlist_segments. 661 */ 662 u_int max_reqlist_size; 663 664 /** Various configuration and state bit flags. */ 665 xbb_flag_t flags; 666 667 /** Ring mapping and interrupt configuration data. */ 668 struct xbb_ring_config ring_config; 669 670 /** Runtime, cross-abi safe, structures for ring access. */ 671 blkif_back_rings_t rings; 672 673 /** IRQ mapping for the communication ring event channel. */ 674 xen_intr_handle_t xen_intr_handle; 675 676 /** 677 * \brief Backend access mode flags (e.g. write, or read-only). 678 * 679 * This value is passed to us by the front-end via the XenStore. 680 */ 681 char *dev_mode; 682 683 /** 684 * \brief Backend device type (e.g. "disk", "cdrom", "floppy"). 685 * 686 * This value is passed to us by the front-end via the XenStore. 687 * Currently unused. 688 */ 689 char *dev_type; 690 691 /** 692 * \brief Backend device/file identifier. 693 * 694 * This value is passed to us by the front-end via the XenStore. 695 * We expect this to be a POSIX path indicating the file or 696 * device to open. 697 */ 698 char *dev_name; 699 700 /** 701 * Vnode corresponding to the backend device node or file 702 * we are acessing. 703 */ 704 struct vnode *vn; 705 706 union xbb_backend_data backend; 707 708 /** The native sector size of the backend. 
*/ 709 u_int sector_size; 710 711 /** log2 of sector_size. */ 712 u_int sector_size_shift; 713 714 /** Size in bytes of the backend device or file. */ 715 off_t media_size; 716 717 /** 718 * \brief media_size expressed in terms of the backend native 719 * sector size. 720 * 721 * (e.g. xbb->media_size >> xbb->sector_size_shift). 722 */ 723 uint64_t media_num_sectors; 724 725 /** 726 * \brief Array of memoized scatter gather data computed during the 727 * conversion of blkif ring requests to internal xbb_xen_req 728 * structures. 729 * 730 * Ring processing is serialized so we only need one of these. 731 */ 732 struct xbb_sg xbb_sgs[XBB_MAX_SEGMENTS_PER_REQLIST]; 733 734 /** 735 * Temporary grant table map used in xbb_dispatch_io(). When 736 * XBB_MAX_SEGMENTS_PER_REQLIST gets large, keeping this on the 737 * stack could cause a stack overflow. 738 */ 739 struct gnttab_map_grant_ref maps[XBB_MAX_SEGMENTS_PER_REQLIST]; 740 741 /** Mutex protecting per-instance data. */ 742 struct mtx lock; 743 744 /** 745 * Resource representing allocated physical address space 746 * associated with our per-instance kva region. 747 */ 748 struct resource *pseudo_phys_res; 749 750 /** Resource id for allocated physical address space. */ 751 int pseudo_phys_res_id; 752 753 /** 754 * I/O statistics from BlockBack dispatch down. These are 755 * coalesced requests, and we start them right before execution. 756 */ 757 struct devstat *xbb_stats; 758 759 /** 760 * I/O statistics coming into BlockBack. These are the requests as 761 * we get them from BlockFront. They are started as soon as we 762 * receive a request, and completed when the I/O is complete. 763 */ 764 struct devstat *xbb_stats_in; 765 766 /** Disable sending flush to the backend */ 767 int disable_flush; 768 769 /** Send a real flush for every N flush requests */ 770 int flush_interval; 771 772 /** Count of flush requests in the interval */ 773 int flush_count; 774 775 /** Don't coalesce requests if this is set */ 776 int no_coalesce_reqs; 777 778 /** Number of requests we have received */ 779 uint64_t reqs_received; 780 781 /** Number of requests we have completed*/ 782 uint64_t reqs_completed; 783 784 /** Number of requests we queued but not pushed*/ 785 uint64_t reqs_queued_for_completion; 786 787 /** Number of requests we completed with an error status*/ 788 uint64_t reqs_completed_with_error; 789 790 /** How many forced dispatches (i.e. without coalescing) have happened */ 791 uint64_t forced_dispatch; 792 793 /** How many normal dispatches have happened */ 794 uint64_t normal_dispatch; 795 796 /** How many total dispatches have happened */ 797 uint64_t total_dispatch; 798 799 /** How many times we have run out of KVA */ 800 uint64_t kva_shortages; 801 802 /** How many times we have run out of request structures */ 803 uint64_t request_shortages; 804 805 /** Watch to wait for hotplug script execution */ 806 struct xs_watch hotplug_watch; 807 808 /** Got the needed data from hotplug scripts? */ 809 bool hotplug_done; 810}; 811 812/*---------------------------- Request Processing ----------------------------*/ 813/** 814 * Allocate an internal transaction tracking structure from the free pool. 815 * 816 * \param xbb Per-instance xbb configuration structure. 817 * 818 * \return On success, a pointer to the allocated xbb_xen_req structure. 819 * Otherwise NULL. 
820 */ 821static inline struct xbb_xen_req * 822xbb_get_req(struct xbb_softc *xbb) 823{ 824 struct xbb_xen_req *req; 825 826 req = NULL; 827 828 mtx_assert(&xbb->lock, MA_OWNED); 829 830 if ((req = STAILQ_FIRST(&xbb->request_free_stailq)) != NULL) { 831 STAILQ_REMOVE_HEAD(&xbb->request_free_stailq, links); 832 xbb->active_request_count++; 833 } 834 835 return (req); 836} 837 838/** 839 * Return an allocated transaction tracking structure to the free pool. 840 * 841 * \param xbb Per-instance xbb configuration structure. 842 * \param req The request structure to free. 843 */ 844static inline void 845xbb_release_req(struct xbb_softc *xbb, struct xbb_xen_req *req) 846{ 847 mtx_assert(&xbb->lock, MA_OWNED); 848 849 STAILQ_INSERT_HEAD(&xbb->request_free_stailq, req, links); 850 xbb->active_request_count--; 851 852 KASSERT(xbb->active_request_count >= 0, 853 ("xbb_release_req: negative active count")); 854} 855 856/** 857 * Return an xbb_xen_req_list of allocated xbb_xen_reqs to the free pool. 858 * 859 * \param xbb Per-instance xbb configuration structure. 860 * \param req_list The list of requests to free. 861 * \param nreqs The number of items in the list. 862 */ 863static inline void 864xbb_release_reqs(struct xbb_softc *xbb, struct xbb_xen_req_list *req_list, 865 int nreqs) 866{ 867 mtx_assert(&xbb->lock, MA_OWNED); 868 869 STAILQ_CONCAT(&xbb->request_free_stailq, req_list); 870 xbb->active_request_count -= nreqs; 871 872 KASSERT(xbb->active_request_count >= 0, 873 ("xbb_release_reqs: negative active count")); 874} 875 876/** 877 * Given a page index and 512b sector offset within that page, 878 * calculate an offset into a request's kva region. 879 * 880 * \param reqlist The request structure whose kva region will be accessed. 881 * \param pagenr The page index used to compute the kva offset. 882 * \param sector The 512b sector index used to compute the page relative 883 * kva offset. 884 * 885 * \return The computed global KVA offset. 886 */ 887static inline uint8_t * 888xbb_reqlist_vaddr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector) 889{ 890 return (reqlist->kva + (PAGE_SIZE * pagenr) + (sector << 9)); 891} 892 893#ifdef XBB_USE_BOUNCE_BUFFERS 894/** 895 * Given a page index and 512b sector offset within that page, 896 * calculate an offset into a request's local bounce memory region. 897 * 898 * \param reqlist The request structure whose bounce region will be accessed. 899 * \param pagenr The page index used to compute the bounce offset. 900 * \param sector The 512b sector index used to compute the page relative 901 * bounce offset. 902 * 903 * \return The computed global bounce buffer address. 904 */ 905static inline uint8_t * 906xbb_reqlist_bounce_addr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector) 907{ 908 return (reqlist->bounce + (PAGE_SIZE * pagenr) + (sector << 9)); 909} 910#endif 911 912/** 913 * Given a page number and 512b sector offset within that page, 914 * calculate an offset into the request's memory region that the 915 * underlying backend device/file should use for I/O. 916 * 917 * \param reqlist The request structure whose I/O region will be accessed. 918 * \param pagenr The page index used to compute the I/O offset. 919 * \param sector The 512b sector index used to compute the page relative 920 * I/O offset. 921 * 922 * \return The computed global I/O address. 923 * 924 * Depending on configuration, this will either be a local bounce buffer 925 * or a pointer to the memory mapped in from the front-end domain for 926 * this request. 
927 */ 928static inline uint8_t * 929xbb_reqlist_ioaddr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector) 930{ 931#ifdef XBB_USE_BOUNCE_BUFFERS 932 return (xbb_reqlist_bounce_addr(reqlist, pagenr, sector)); 933#else 934 return (xbb_reqlist_vaddr(reqlist, pagenr, sector)); 935#endif 936} 937 938/** 939 * Given a page index and 512b sector offset within that page, calculate 940 * an offset into the local pseudo-physical address space used to map a 941 * front-end's request data into a request. 942 * 943 * \param reqlist The request list structure whose pseudo-physical region 944 * will be accessed. 945 * \param pagenr The page index used to compute the pseudo-physical offset. 946 * \param sector The 512b sector index used to compute the page relative 947 * pseudo-physical offset. 948 * 949 * \return The computed global pseudo-phsyical address. 950 * 951 * Depending on configuration, this will either be a local bounce buffer 952 * or a pointer to the memory mapped in from the front-end domain for 953 * this request. 954 */ 955static inline uintptr_t 956xbb_get_gntaddr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector) 957{ 958 struct xbb_softc *xbb; 959 960 xbb = reqlist->xbb; 961 962 return ((uintptr_t)(xbb->gnt_base_addr + 963 (uintptr_t)(reqlist->kva - xbb->kva) + 964 (PAGE_SIZE * pagenr) + (sector << 9))); 965} 966 967/** 968 * Get Kernel Virtual Address space for mapping requests. 969 * 970 * \param xbb Per-instance xbb configuration structure. 971 * \param nr_pages Number of pages needed. 972 * \param check_only If set, check for free KVA but don't allocate it. 973 * \param have_lock If set, xbb lock is already held. 974 * 975 * \return On success, a pointer to the allocated KVA region. Otherwise NULL. 976 * 977 * Note: This should be unnecessary once we have either chaining or 978 * scatter/gather support for struct bio. At that point we'll be able to 979 * put multiple addresses and lengths in one bio/bio chain and won't need 980 * to map everything into one virtual segment. 981 */ 982static uint8_t * 983xbb_get_kva(struct xbb_softc *xbb, int nr_pages) 984{ 985 int first_clear; 986 int num_clear; 987 uint8_t *free_kva; 988 int i; 989 990 KASSERT(nr_pages != 0, ("xbb_get_kva of zero length")); 991 992 first_clear = 0; 993 free_kva = NULL; 994 995 mtx_lock(&xbb->lock); 996 997 /* 998 * Look for the first available page. If there are none, we're done. 999 */ 1000 bit_ffc(xbb->kva_free, xbb->reqlist_kva_pages, &first_clear); 1001 1002 if (first_clear == -1) 1003 goto bailout; 1004 1005 /* 1006 * Starting at the first available page, look for consecutive free 1007 * pages that will satisfy the user's request. 1008 */ 1009 for (i = first_clear, num_clear = 0; i < xbb->reqlist_kva_pages; i++) { 1010 /* 1011 * If this is true, the page is used, so we have to reset 1012 * the number of clear pages and the first clear page 1013 * (since it pointed to a region with an insufficient number 1014 * of clear pages). 1015 */ 1016 if (bit_test(xbb->kva_free, i)) { 1017 num_clear = 0; 1018 first_clear = -1; 1019 continue; 1020 } 1021 1022 if (first_clear == -1) 1023 first_clear = i; 1024 1025 /* 1026 * If this is true, we've found a large enough free region 1027 * to satisfy the request. 
1028 */ 1029 if (++num_clear == nr_pages) { 1030 1031 bit_nset(xbb->kva_free, first_clear, 1032 first_clear + nr_pages - 1); 1033 1034 free_kva = xbb->kva + 1035 (uint8_t *)((intptr_t)first_clear * PAGE_SIZE); 1036 1037 KASSERT(free_kva >= (uint8_t *)xbb->kva && 1038 free_kva + (nr_pages * PAGE_SIZE) <= 1039 (uint8_t *)xbb->ring_config.va, 1040 ("Free KVA %p len %d out of range, " 1041 "kva = %#jx, ring VA = %#jx\n", free_kva, 1042 nr_pages * PAGE_SIZE, (uintmax_t)xbb->kva, 1043 (uintmax_t)xbb->ring_config.va)); 1044 break; 1045 } 1046 } 1047 1048bailout: 1049 1050 if (free_kva == NULL) { 1051 xbb->flags |= XBBF_RESOURCE_SHORTAGE; 1052 xbb->kva_shortages++; 1053 } 1054 1055 mtx_unlock(&xbb->lock); 1056 1057 return (free_kva); 1058} 1059 1060/** 1061 * Free allocated KVA. 1062 * 1063 * \param xbb Per-instance xbb configuration structure. 1064 * \param kva_ptr Pointer to allocated KVA region. 1065 * \param nr_pages Number of pages in the KVA region. 1066 */ 1067static void 1068xbb_free_kva(struct xbb_softc *xbb, uint8_t *kva_ptr, int nr_pages) 1069{ 1070 intptr_t start_page; 1071 1072 mtx_assert(&xbb->lock, MA_OWNED); 1073 1074 start_page = (intptr_t)(kva_ptr - xbb->kva) >> PAGE_SHIFT; 1075 bit_nclear(xbb->kva_free, start_page, start_page + nr_pages - 1); 1076 1077} 1078 1079/** 1080 * Unmap the front-end pages associated with this I/O request. 1081 * 1082 * \param req The request structure to unmap. 1083 */ 1084static void 1085xbb_unmap_reqlist(struct xbb_xen_reqlist *reqlist) 1086{ 1087 struct gnttab_unmap_grant_ref unmap[XBB_MAX_SEGMENTS_PER_REQLIST]; 1088 u_int i; 1089 u_int invcount; 1090 int error; 1091 1092 invcount = 0; 1093 for (i = 0; i < reqlist->nr_segments; i++) { 1094 1095 if (reqlist->gnt_handles[i] == GRANT_REF_INVALID) 1096 continue; 1097 1098 unmap[invcount].host_addr = xbb_get_gntaddr(reqlist, i, 0); 1099 unmap[invcount].dev_bus_addr = 0; 1100 unmap[invcount].handle = reqlist->gnt_handles[i]; 1101 reqlist->gnt_handles[i] = GRANT_REF_INVALID; 1102 invcount++; 1103 } 1104 1105 error = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, 1106 unmap, invcount); 1107 KASSERT(error == 0, ("Grant table operation failed")); 1108} 1109 1110/** 1111 * Allocate an internal transaction tracking structure from the free pool. 1112 * 1113 * \param xbb Per-instance xbb configuration structure. 1114 * 1115 * \return On success, a pointer to the allocated xbb_xen_reqlist structure. 1116 * Otherwise NULL. 1117 */ 1118static inline struct xbb_xen_reqlist * 1119xbb_get_reqlist(struct xbb_softc *xbb) 1120{ 1121 struct xbb_xen_reqlist *reqlist; 1122 1123 reqlist = NULL; 1124 1125 mtx_assert(&xbb->lock, MA_OWNED); 1126 1127 if ((reqlist = STAILQ_FIRST(&xbb->reqlist_free_stailq)) != NULL) { 1128 1129 STAILQ_REMOVE_HEAD(&xbb->reqlist_free_stailq, links); 1130 reqlist->flags = XBB_REQLIST_NONE; 1131 reqlist->kva = NULL; 1132 reqlist->status = BLKIF_RSP_OKAY; 1133 reqlist->residual_512b_sectors = 0; 1134 reqlist->num_children = 0; 1135 reqlist->nr_segments = 0; 1136 STAILQ_INIT(&reqlist->contig_req_list); 1137 } 1138 1139 return (reqlist); 1140} 1141 1142/** 1143 * Return an allocated transaction tracking structure to the free pool. 1144 * 1145 * \param xbb Per-instance xbb configuration structure. 1146 * \param req The request list structure to free. 1147 * \param wakeup If set, wakeup the work thread if freeing this reqlist 1148 * during a resource shortage condition. 
1149 */ 1150static inline void 1151xbb_release_reqlist(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist, 1152 int wakeup) 1153{ 1154 1155 mtx_assert(&xbb->lock, MA_OWNED); 1156 1157 if (wakeup) { 1158 wakeup = xbb->flags & XBBF_RESOURCE_SHORTAGE; 1159 xbb->flags &= ~XBBF_RESOURCE_SHORTAGE; 1160 } 1161 1162 if (reqlist->kva != NULL) 1163 xbb_free_kva(xbb, reqlist->kva, reqlist->nr_segments); 1164 1165 xbb_release_reqs(xbb, &reqlist->contig_req_list, reqlist->num_children); 1166 1167 STAILQ_INSERT_TAIL(&xbb->reqlist_free_stailq, reqlist, links); 1168 1169 if ((xbb->flags & XBBF_SHUTDOWN) != 0) { 1170 /* 1171 * Shutdown is in progress. See if we can 1172 * progress further now that one more request 1173 * has completed and been returned to the 1174 * free pool. 1175 */ 1176 xbb_shutdown(xbb); 1177 } 1178 1179 if (wakeup != 0) 1180 taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task); 1181} 1182 1183/** 1184 * Request resources and do basic request setup. 1185 * 1186 * \param xbb Per-instance xbb configuration structure. 1187 * \param reqlist Pointer to reqlist pointer. 1188 * \param ring_req Pointer to a block ring request. 1189 * \param ring_index The ring index of this request. 1190 * 1191 * \return 0 for success, non-zero for failure. 1192 */ 1193static int 1194xbb_get_resources(struct xbb_softc *xbb, struct xbb_xen_reqlist **reqlist, 1195 blkif_request_t *ring_req, RING_IDX ring_idx) 1196{ 1197 struct xbb_xen_reqlist *nreqlist; 1198 struct xbb_xen_req *nreq; 1199 1200 nreqlist = NULL; 1201 nreq = NULL; 1202 1203 mtx_lock(&xbb->lock); 1204 1205 /* 1206 * We don't allow new resources to be allocated if we're in the 1207 * process of shutting down. 1208 */ 1209 if ((xbb->flags & XBBF_SHUTDOWN) != 0) { 1210 mtx_unlock(&xbb->lock); 1211 return (1); 1212 } 1213 1214 /* 1215 * Allocate a reqlist if the caller doesn't have one already. 1216 */ 1217 if (*reqlist == NULL) { 1218 nreqlist = xbb_get_reqlist(xbb); 1219 if (nreqlist == NULL) 1220 goto bailout_error; 1221 } 1222 1223 /* We always allocate a request. */ 1224 nreq = xbb_get_req(xbb); 1225 if (nreq == NULL) 1226 goto bailout_error; 1227 1228 mtx_unlock(&xbb->lock); 1229 1230 if (*reqlist == NULL) { 1231 *reqlist = nreqlist; 1232 nreqlist->operation = ring_req->operation; 1233 nreqlist->starting_sector_number = ring_req->sector_number; 1234 STAILQ_INSERT_TAIL(&xbb->reqlist_pending_stailq, nreqlist, 1235 links); 1236 } 1237 1238 nreq->reqlist = *reqlist; 1239 nreq->req_ring_idx = ring_idx; 1240 nreq->id = ring_req->id; 1241 nreq->operation = ring_req->operation; 1242 1243 if (xbb->abi != BLKIF_PROTOCOL_NATIVE) { 1244 bcopy(ring_req, &nreq->ring_req_storage, sizeof(*ring_req)); 1245 nreq->ring_req = &nreq->ring_req_storage; 1246 } else { 1247 nreq->ring_req = ring_req; 1248 } 1249 1250 binuptime(&nreq->ds_t0); 1251 devstat_start_transaction(xbb->xbb_stats_in, &nreq->ds_t0); 1252 STAILQ_INSERT_TAIL(&(*reqlist)->contig_req_list, nreq, links); 1253 (*reqlist)->num_children++; 1254 (*reqlist)->nr_segments += ring_req->nr_segments; 1255 1256 return (0); 1257 1258bailout_error: 1259 1260 /* 1261 * We're out of resources, so set the shortage flag. The next time 1262 * a request is released, we'll try waking up the work thread to 1263 * see if we can allocate more resources. 
1264 */ 1265 xbb->flags |= XBBF_RESOURCE_SHORTAGE; 1266 xbb->request_shortages++; 1267 1268 if (nreq != NULL) 1269 xbb_release_req(xbb, nreq); 1270 1271 if (nreqlist != NULL) 1272 xbb_release_reqlist(xbb, nreqlist, /*wakeup*/ 0); 1273 1274 mtx_unlock(&xbb->lock); 1275 1276 return (1); 1277} 1278 1279/** 1280 * Create and queue a response to a blkif request. 1281 * 1282 * \param xbb Per-instance xbb configuration structure. 1283 * \param req The request structure to which to respond. 1284 * \param status The status code to report. See BLKIF_RSP_* 1285 * in sys/xen/interface/io/blkif.h. 1286 */ 1287static void 1288xbb_queue_response(struct xbb_softc *xbb, struct xbb_xen_req *req, int status) 1289{ 1290 blkif_response_t *resp; 1291 1292 /* 1293 * The mutex is required here, and should be held across this call 1294 * until after the subsequent call to xbb_push_responses(). This 1295 * is to guarantee that another context won't queue responses and 1296 * push them while we're active. 1297 * 1298 * That could lead to the other end being notified of responses 1299 * before the resources have been freed on this end. The other end 1300 * would then be able to queue additional I/O, and we may run out 1301 * of resources because we haven't freed them all yet. 1302 */ 1303 mtx_assert(&xbb->lock, MA_OWNED); 1304 1305 /* 1306 * Place on the response ring for the relevant domain. 1307 * For now, only the spacing between entries is different 1308 * in the different ABIs, not the response entry layout. 1309 */ 1310 switch (xbb->abi) { 1311 case BLKIF_PROTOCOL_NATIVE: 1312 resp = RING_GET_RESPONSE(&xbb->rings.native, 1313 xbb->rings.native.rsp_prod_pvt); 1314 break; 1315 case BLKIF_PROTOCOL_X86_32: 1316 resp = (blkif_response_t *) 1317 RING_GET_RESPONSE(&xbb->rings.x86_32, 1318 xbb->rings.x86_32.rsp_prod_pvt); 1319 break; 1320 case BLKIF_PROTOCOL_X86_64: 1321 resp = (blkif_response_t *) 1322 RING_GET_RESPONSE(&xbb->rings.x86_64, 1323 xbb->rings.x86_64.rsp_prod_pvt); 1324 break; 1325 default: 1326 panic("Unexpected blkif protocol ABI."); 1327 } 1328 1329 resp->id = req->id; 1330 resp->operation = req->operation; 1331 resp->status = status; 1332 1333 if (status != BLKIF_RSP_OKAY) 1334 xbb->reqs_completed_with_error++; 1335 1336 xbb->rings.common.rsp_prod_pvt++; 1337 1338 xbb->reqs_queued_for_completion++; 1339 1340} 1341 1342/** 1343 * Send queued responses to blkif requests. 1344 * 1345 * \param xbb Per-instance xbb configuration structure. 1346 * \param run_taskqueue Flag that is set to 1 if the taskqueue 1347 * should be run, 0 if it does not need to be run. 1348 * \param notify Flag that is set to 1 if the other end should be 1349 * notified via irq, 0 if the other end should not be 1350 * notified. 1351 */ 1352static void 1353xbb_push_responses(struct xbb_softc *xbb, int *run_taskqueue, int *notify) 1354{ 1355 int more_to_do; 1356 1357 /* 1358 * The mutex is required here. 1359 */ 1360 mtx_assert(&xbb->lock, MA_OWNED); 1361 1362 more_to_do = 0; 1363 1364 RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xbb->rings.common, *notify); 1365 1366 if (xbb->rings.common.rsp_prod_pvt == xbb->rings.common.req_cons) { 1367 1368 /* 1369 * Tail check for pending requests. Allows frontend to avoid 1370 * notifications if requests are already in flight (lower 1371 * overheads and promotes batching). 
1372 */ 1373 RING_FINAL_CHECK_FOR_REQUESTS(&xbb->rings.common, more_to_do); 1374 } else if (RING_HAS_UNCONSUMED_REQUESTS(&xbb->rings.common)) { 1375 1376 more_to_do = 1; 1377 } 1378 1379 xbb->reqs_completed += xbb->reqs_queued_for_completion; 1380 xbb->reqs_queued_for_completion = 0; 1381 1382 *run_taskqueue = more_to_do; 1383} 1384 1385/** 1386 * Complete a request list. 1387 * 1388 * \param xbb Per-instance xbb configuration structure. 1389 * \param reqlist Allocated internal request list structure. 1390 */ 1391static void 1392xbb_complete_reqlist(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist) 1393{ 1394 struct xbb_xen_req *nreq; 1395 off_t sectors_sent; 1396 int notify, run_taskqueue; 1397 1398 sectors_sent = 0; 1399 1400 if (reqlist->flags & XBB_REQLIST_MAPPED) 1401 xbb_unmap_reqlist(reqlist); 1402 1403 mtx_lock(&xbb->lock); 1404 1405 /* 1406 * All I/O is done, send the response. A lock is not necessary 1407 * to protect the request list, because all requests have 1408 * completed. Therefore this is the only context accessing this 1409 * reqlist right now. However, in order to make sure that no one 1410 * else queues responses onto the queue or pushes them to the other 1411 * side while we're active, we need to hold the lock across the 1412 * calls to xbb_queue_response() and xbb_push_responses(). 1413 */ 1414 STAILQ_FOREACH(nreq, &reqlist->contig_req_list, links) { 1415 off_t cur_sectors_sent; 1416 1417 /* Put this response on the ring, but don't push yet */ 1418 xbb_queue_response(xbb, nreq, reqlist->status); 1419 1420 /* We don't report bytes sent if there is an error. */ 1421 if (reqlist->status == BLKIF_RSP_OKAY) 1422 cur_sectors_sent = nreq->nr_512b_sectors; 1423 else 1424 cur_sectors_sent = 0; 1425 1426 sectors_sent += cur_sectors_sent; 1427 1428 devstat_end_transaction(xbb->xbb_stats_in, 1429 /*bytes*/cur_sectors_sent << 9, 1430 reqlist->ds_tag_type, 1431 reqlist->ds_trans_type, 1432 /*now*/NULL, 1433 /*then*/&nreq->ds_t0); 1434 } 1435 1436 /* 1437 * Take out any sectors not sent. If we wind up negative (which 1438 * might happen if an error is reported as well as a residual), just 1439 * report 0 sectors sent. 1440 */ 1441 sectors_sent -= reqlist->residual_512b_sectors; 1442 if (sectors_sent < 0) 1443 sectors_sent = 0; 1444 1445 devstat_end_transaction(xbb->xbb_stats, 1446 /*bytes*/ sectors_sent << 9, 1447 reqlist->ds_tag_type, 1448 reqlist->ds_trans_type, 1449 /*now*/NULL, 1450 /*then*/&reqlist->ds_t0); 1451 1452 xbb_release_reqlist(xbb, reqlist, /*wakeup*/ 1); 1453 1454 xbb_push_responses(xbb, &run_taskqueue, ¬ify); 1455 1456 mtx_unlock(&xbb->lock); 1457 1458 if (run_taskqueue) 1459 taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task); 1460 1461 if (notify) 1462 xen_intr_signal(xbb->xen_intr_handle); 1463} 1464 1465/** 1466 * Completion handler for buffer I/O requests issued by the device 1467 * backend driver. 1468 * 1469 * \param bio The buffer I/O request on which to perform completion 1470 * processing. 1471 */ 1472static void 1473xbb_bio_done(struct bio *bio) 1474{ 1475 struct xbb_softc *xbb; 1476 struct xbb_xen_reqlist *reqlist; 1477 1478 reqlist = bio->bio_caller1; 1479 xbb = reqlist->xbb; 1480 1481 reqlist->residual_512b_sectors += bio->bio_resid >> 9; 1482 1483 /* 1484 * This is a bit imprecise. With aggregated I/O a single 1485 * request list can contain multiple front-end requests and 1486 * a multiple bios may point to a single request. 
By carefully 1487 * walking the request list, we could map residuals and errors 1488 * back to the original front-end request, but the interface 1489 * isn't sufficiently rich for us to properly report the error. 1490 * So, we just treat the entire request list as having failed if an 1491 * error occurs on any part. And, if an error occurs, we treat 1492 * the amount of data transferred as 0. 1493 * 1494 * For residuals, we report it on the overall aggregated device, 1495 * but not on the individual requests, since we don't currently 1496 * do the work to determine which front-end request to which the 1497 * residual applies. 1498 */ 1499 if (bio->bio_error) { 1500 DPRINTF("BIO returned error %d for operation on device %s\n", 1501 bio->bio_error, xbb->dev_name); 1502 reqlist->status = BLKIF_RSP_ERROR; 1503 1504 if (bio->bio_error == ENXIO 1505 && xenbus_get_state(xbb->dev) == XenbusStateConnected) { 1506 1507 /* 1508 * Backend device has disappeared. Signal the 1509 * front-end that we (the device proxy) want to 1510 * go away. 1511 */ 1512 xenbus_set_state(xbb->dev, XenbusStateClosing); 1513 } 1514 } 1515 1516#ifdef XBB_USE_BOUNCE_BUFFERS 1517 if (bio->bio_cmd == BIO_READ) { 1518 vm_offset_t kva_offset; 1519 1520 kva_offset = (vm_offset_t)bio->bio_data 1521 - (vm_offset_t)reqlist->bounce; 1522 memcpy((uint8_t *)reqlist->kva + kva_offset, 1523 bio->bio_data, bio->bio_bcount); 1524 } 1525#endif /* XBB_USE_BOUNCE_BUFFERS */ 1526 1527 /* 1528 * Decrement the pending count for the request list. When we're 1529 * done with the requests, send status back for all of them. 1530 */ 1531 if (atomic_fetchadd_int(&reqlist->pendcnt, -1) == 1) 1532 xbb_complete_reqlist(xbb, reqlist); 1533 1534 g_destroy_bio(bio); 1535} 1536 1537/** 1538 * Parse a blkif request into an internal request structure and send 1539 * it to the backend for processing. 1540 * 1541 * \param xbb Per-instance xbb configuration structure. 1542 * \param reqlist Allocated internal request list structure. 1543 * 1544 * \return On success, 0. For resource shortages, non-zero. 1545 * 1546 * This routine performs the backend common aspects of request parsing 1547 * including compiling an internal request structure, parsing the S/G 1548 * list and any secondary ring requests in which they may reside, and 1549 * the mapping of front-end I/O pages into our domain. 1550 */ 1551static int 1552xbb_dispatch_io(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist) 1553{ 1554 struct xbb_sg *xbb_sg; 1555 struct gnttab_map_grant_ref *map; 1556 struct blkif_request_segment *sg; 1557 struct blkif_request_segment *last_block_sg; 1558 struct xbb_xen_req *nreq; 1559 u_int nseg; 1560 u_int seg_idx; 1561 u_int block_segs; 1562 int nr_sects; 1563 int total_sects; 1564 int operation; 1565 uint8_t bio_flags; 1566 int error; 1567 1568 reqlist->ds_tag_type = DEVSTAT_TAG_SIMPLE; 1569 bio_flags = 0; 1570 total_sects = 0; 1571 nr_sects = 0; 1572 1573 /* 1574 * First determine whether we have enough free KVA to satisfy this 1575 * request list. If not, tell xbb_run_queue() so it can go to 1576 * sleep until we have more KVA. 1577 */ 1578 reqlist->kva = NULL; 1579 if (reqlist->nr_segments != 0) { 1580 reqlist->kva = xbb_get_kva(xbb, reqlist->nr_segments); 1581 if (reqlist->kva == NULL) { 1582 /* 1583 * If we're out of KVA, return ENOMEM. 
1584 */ 1585 return (ENOMEM); 1586 } 1587 } 1588 1589 binuptime(&reqlist->ds_t0); 1590 devstat_start_transaction(xbb->xbb_stats, &reqlist->ds_t0); 1591 1592 switch (reqlist->operation) { 1593 case BLKIF_OP_WRITE_BARRIER: 1594 bio_flags |= BIO_ORDERED; 1595 reqlist->ds_tag_type = DEVSTAT_TAG_ORDERED; 1596 /* FALLTHROUGH */ 1597 case BLKIF_OP_WRITE: 1598 operation = BIO_WRITE; 1599 reqlist->ds_trans_type = DEVSTAT_WRITE; 1600 if ((xbb->flags & XBBF_READ_ONLY) != 0) { 1601 DPRINTF("Attempt to write to read only device %s\n", 1602 xbb->dev_name); 1603 reqlist->status = BLKIF_RSP_ERROR; 1604 goto send_response; 1605 } 1606 break; 1607 case BLKIF_OP_READ: 1608 operation = BIO_READ; 1609 reqlist->ds_trans_type = DEVSTAT_READ; 1610 break; 1611 case BLKIF_OP_FLUSH_DISKCACHE: 1612 /* 1613 * If this is true, the user has requested that we disable 1614 * flush support. So we just complete the requests 1615 * successfully. 1616 */ 1617 if (xbb->disable_flush != 0) { 1618 goto send_response; 1619 } 1620 1621 /* 1622 * The user has requested that we only send a real flush 1623 * for every N flush requests. So keep count, and either 1624 * complete the request immediately or queue it for the 1625 * backend. 1626 */ 1627 if (xbb->flush_interval != 0) { 1628 if (++(xbb->flush_count) < xbb->flush_interval) { 1629 goto send_response; 1630 } else 1631 xbb->flush_count = 0; 1632 } 1633 1634 operation = BIO_FLUSH; 1635 reqlist->ds_tag_type = DEVSTAT_TAG_ORDERED; 1636 reqlist->ds_trans_type = DEVSTAT_NO_DATA; 1637 goto do_dispatch; 1638 /*NOTREACHED*/ 1639 default: 1640 DPRINTF("error: unknown block io operation [%d]\n", 1641 reqlist->operation); 1642 reqlist->status = BLKIF_RSP_ERROR; 1643 goto send_response; 1644 } 1645 1646 reqlist->xbb = xbb; 1647 xbb_sg = xbb->xbb_sgs; 1648 map = xbb->maps; 1649 seg_idx = 0; 1650 1651 STAILQ_FOREACH(nreq, &reqlist->contig_req_list, links) { 1652 blkif_request_t *ring_req; 1653 RING_IDX req_ring_idx; 1654 u_int req_seg_idx; 1655 1656 ring_req = nreq->ring_req; 1657 req_ring_idx = nreq->req_ring_idx; 1658 nr_sects = 0; 1659 nseg = ring_req->nr_segments; 1660 nreq->nr_pages = nseg; 1661 nreq->nr_512b_sectors = 0; 1662 req_seg_idx = 0; 1663 sg = NULL; 1664 1665 /* Check that number of segments is sane. 
*/ 1666 if (__predict_false(nseg == 0) 1667 || __predict_false(nseg > xbb->max_request_segments)) { 1668 DPRINTF("Bad number of segments in request (%d)\n", 1669 nseg); 1670 reqlist->status = BLKIF_RSP_ERROR; 1671 goto send_response; 1672 } 1673 1674 block_segs = nseg; 1675 sg = ring_req->seg; 1676 last_block_sg = sg + block_segs; 1677 1678 while (sg < last_block_sg) { 1679 KASSERT(seg_idx < 1680 XBB_MAX_SEGMENTS_PER_REQLIST, 1681 ("seg_idx %d is too large, max " 1682 "segs %d\n", seg_idx, 1683 XBB_MAX_SEGMENTS_PER_REQLIST)); 1684 1685 xbb_sg->first_sect = sg->first_sect; 1686 xbb_sg->last_sect = sg->last_sect; 1687 xbb_sg->nsect = 1688 (int8_t)(sg->last_sect - 1689 sg->first_sect + 1); 1690 1691 if ((sg->last_sect >= (PAGE_SIZE >> 9)) 1692 || (xbb_sg->nsect <= 0)) { 1693 reqlist->status = BLKIF_RSP_ERROR; 1694 goto send_response; 1695 } 1696 1697 nr_sects += xbb_sg->nsect; 1698 map->host_addr = xbb_get_gntaddr(reqlist, 1699 seg_idx, /*sector*/0); 1700 KASSERT(map->host_addr + PAGE_SIZE <= 1701 xbb->ring_config.gnt_addr, 1702 ("Host address %#jx len %d overlaps " 1703 "ring address %#jx\n", 1704 (uintmax_t)map->host_addr, PAGE_SIZE, 1705 (uintmax_t)xbb->ring_config.gnt_addr)); 1706 1707 map->flags = GNTMAP_host_map; 1708 map->ref = sg->gref; 1709 map->dom = xbb->otherend_id; 1710 if (operation == BIO_WRITE) 1711 map->flags |= GNTMAP_readonly; 1712 sg++; 1713 map++; 1714 xbb_sg++; 1715 seg_idx++; 1716 req_seg_idx++; 1717 } 1718 1719 /* Convert to the disk's sector size */ 1720 nreq->nr_512b_sectors = nr_sects; 1721 nr_sects = (nr_sects << 9) >> xbb->sector_size_shift; 1722 total_sects += nr_sects; 1723 1724 if ((nreq->nr_512b_sectors & 1725 ((xbb->sector_size >> 9) - 1)) != 0) { 1726 device_printf(xbb->dev, "%s: I/O size (%d) is not " 1727 "a multiple of the backing store sector " 1728 "size (%d)\n", __func__, 1729 nreq->nr_512b_sectors << 9, 1730 xbb->sector_size); 1731 reqlist->status = BLKIF_RSP_ERROR; 1732 goto send_response; 1733 } 1734 } 1735 1736 error = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, 1737 xbb->maps, reqlist->nr_segments); 1738 if (error != 0) 1739 panic("Grant table operation failed (%d)", error); 1740 1741 reqlist->flags |= XBB_REQLIST_MAPPED; 1742 1743 for (seg_idx = 0, map = xbb->maps; seg_idx < reqlist->nr_segments; 1744 seg_idx++, map++){ 1745 1746 if (__predict_false(map->status != 0)) { 1747 DPRINTF("invalid buffer -- could not remap " 1748 "it (%d)\n", map->status); 1749 DPRINTF("Mapping(%d): Host Addr 0x%"PRIx64", flags " 1750 "0x%x ref 0x%x, dom %d\n", seg_idx, 1751 map->host_addr, map->flags, map->ref, 1752 map->dom); 1753 reqlist->status = BLKIF_RSP_ERROR; 1754 goto send_response; 1755 } 1756 1757 reqlist->gnt_handles[seg_idx] = map->handle; 1758 } 1759 if (reqlist->starting_sector_number + total_sects > 1760 xbb->media_num_sectors) { 1761 1762 DPRINTF("%s of [%" PRIu64 ",%" PRIu64 "] " 1763 "extends past end of device %s\n", 1764 operation == BIO_READ ? 
"read" : "write", 1765 reqlist->starting_sector_number, 1766 reqlist->starting_sector_number + total_sects, 1767 xbb->dev_name); 1768 reqlist->status = BLKIF_RSP_ERROR; 1769 goto send_response; 1770 } 1771 1772do_dispatch: 1773 1774 error = xbb->dispatch_io(xbb, 1775 reqlist, 1776 operation, 1777 bio_flags); 1778 1779 if (error != 0) { 1780 reqlist->status = BLKIF_RSP_ERROR; 1781 goto send_response; 1782 } 1783 1784 return (0); 1785 1786send_response: 1787 1788 xbb_complete_reqlist(xbb, reqlist); 1789 1790 return (0); 1791} 1792 1793static __inline int 1794xbb_count_sects(blkif_request_t *ring_req) 1795{ 1796 int i; 1797 int cur_size = 0; 1798 1799 for (i = 0; i < ring_req->nr_segments; i++) { 1800 int nsect; 1801 1802 nsect = (int8_t)(ring_req->seg[i].last_sect - 1803 ring_req->seg[i].first_sect + 1); 1804 if (nsect <= 0) 1805 break; 1806 1807 cur_size += nsect; 1808 } 1809 1810 return (cur_size); 1811} 1812 1813/** 1814 * Process incoming requests from the shared communication ring in response 1815 * to a signal on the ring's event channel. 1816 * 1817 * \param context Callback argument registerd during task initialization - 1818 * the xbb_softc for this instance. 1819 * \param pending The number of taskqueue_enqueue events that have 1820 * occurred since this handler was last run. 1821 */ 1822static void 1823xbb_run_queue(void *context, int pending) 1824{ 1825 struct xbb_softc *xbb; 1826 blkif_back_rings_t *rings; 1827 RING_IDX rp; 1828 uint64_t cur_sector; 1829 int cur_operation; 1830 struct xbb_xen_reqlist *reqlist; 1831 1832 1833 xbb = (struct xbb_softc *)context; 1834 rings = &xbb->rings; 1835 1836 /* 1837 * Work gather and dispatch loop. Note that we have a bias here 1838 * towards gathering I/O sent by blockfront. We first gather up 1839 * everything in the ring, as long as we have resources. Then we 1840 * dispatch one request, and then attempt to gather up any 1841 * additional requests that have come in while we were dispatching 1842 * the request. 1843 * 1844 * This allows us to get a clearer picture (via devstat) of how 1845 * many requests blockfront is queueing to us at any given time. 1846 */ 1847 for (;;) { 1848 int retval; 1849 1850 /* 1851 * Initialize reqlist to the last element in the pending 1852 * queue, if there is one. This allows us to add more 1853 * requests to that request list, if we have room. 1854 */ 1855 reqlist = STAILQ_LAST(&xbb->reqlist_pending_stailq, 1856 xbb_xen_reqlist, links); 1857 if (reqlist != NULL) { 1858 cur_sector = reqlist->next_contig_sector; 1859 cur_operation = reqlist->operation; 1860 } else { 1861 cur_operation = 0; 1862 cur_sector = 0; 1863 } 1864 1865 /* 1866 * Cache req_prod to avoid accessing a cache line shared 1867 * with the frontend. 1868 */ 1869 rp = rings->common.sring->req_prod; 1870 1871 /* Ensure we see queued requests up to 'rp'. */ 1872 rmb(); 1873 1874 /** 1875 * Run so long as there is work to consume and the generation 1876 * of a response will not overflow the ring. 1877 * 1878 * @note There's a 1 to 1 relationship between requests and 1879 * responses, so an overflow should never occur. This 1880 * test is to protect our domain from digesting bogus 1881 * data. Shouldn't we log this? 
1882 */ 1883 while (rings->common.req_cons != rp 1884 && RING_REQUEST_CONS_OVERFLOW(&rings->common, 1885 rings->common.req_cons) == 0){ 1886 blkif_request_t ring_req_storage; 1887 blkif_request_t *ring_req; 1888 int cur_size; 1889 1890 switch (xbb->abi) { 1891 case BLKIF_PROTOCOL_NATIVE: 1892 ring_req = RING_GET_REQUEST(&xbb->rings.native, 1893 rings->common.req_cons); 1894 break; 1895 case BLKIF_PROTOCOL_X86_32: 1896 { 1897 struct blkif_x86_32_request *ring_req32; 1898 1899 ring_req32 = RING_GET_REQUEST( 1900 &xbb->rings.x86_32, rings->common.req_cons); 1901 blkif_get_x86_32_req(&ring_req_storage, 1902 ring_req32); 1903 ring_req = &ring_req_storage; 1904 break; 1905 } 1906 case BLKIF_PROTOCOL_X86_64: 1907 { 1908 struct blkif_x86_64_request *ring_req64; 1909 1910 ring_req64 =RING_GET_REQUEST(&xbb->rings.x86_64, 1911 rings->common.req_cons); 1912 blkif_get_x86_64_req(&ring_req_storage, 1913 ring_req64); 1914 ring_req = &ring_req_storage; 1915 break; 1916 } 1917 default: 1918 panic("Unexpected blkif protocol ABI."); 1919 /* NOTREACHED */ 1920 } 1921 1922 /* 1923 * Check for situations that would require closing 1924 * off this I/O for further coalescing: 1925 * - Coalescing is turned off. 1926 * - Current I/O is out of sequence with the previous 1927 * I/O. 1928 * - Coalesced I/O would be too large. 1929 */ 1930 if ((reqlist != NULL) 1931 && ((xbb->no_coalesce_reqs != 0) 1932 || ((xbb->no_coalesce_reqs == 0) 1933 && ((ring_req->sector_number != cur_sector) 1934 || (ring_req->operation != cur_operation) 1935 || ((ring_req->nr_segments + reqlist->nr_segments) > 1936 xbb->max_reqlist_segments))))) { 1937 reqlist = NULL; 1938 } 1939 1940 /* 1941 * Grab and check for all resources in one shot. 1942 * If we can't get all of the resources we need, 1943 * the shortage is noted and the thread will get 1944 * woken up when more resources are available. 1945 */ 1946 retval = xbb_get_resources(xbb, &reqlist, ring_req, 1947 xbb->rings.common.req_cons); 1948 1949 if (retval != 0) { 1950 /* 1951 * Resource shortage has been recorded. 1952 * We'll be scheduled to run once a request 1953 * object frees up due to a completion. 1954 */ 1955 break; 1956 } 1957 1958 /* 1959 * Signify that we can overwrite this request with 1960 * a response by incrementing our consumer index. 1961 * The response won't be generated until after 1962 * we've already consumed all necessary data out 1963 * of the version of the request in the ring buffer 1964 * (for native mode). We must update the consumer 1965 * index before issuing back-end I/O so there is 1966 * no possibility that it will complete and a 1967 * response be generated before we make room in 1968 * the queue for that response. 1969 */ 1970 xbb->rings.common.req_cons++; 1971 xbb->reqs_received++; 1972 1973 cur_size = xbb_count_sects(ring_req); 1974 cur_sector = ring_req->sector_number + cur_size; 1975 reqlist->next_contig_sector = cur_sector; 1976 cur_operation = ring_req->operation; 1977 } 1978 1979 /* Check for I/O to dispatch */ 1980 reqlist = STAILQ_FIRST(&xbb->reqlist_pending_stailq); 1981 if (reqlist == NULL) { 1982 /* 1983 * We're out of work to do, put the task queue to 1984 * sleep. 1985 */ 1986 break; 1987 } 1988 1989 /* 1990 * Grab the first request off the queue and attempt 1991 * to dispatch it. 1992 */ 1993 STAILQ_REMOVE_HEAD(&xbb->reqlist_pending_stailq, links); 1994 1995 retval = xbb_dispatch_io(xbb, reqlist); 1996 if (retval != 0) { 1997 /* 1998 * xbb_dispatch_io() returns non-zero only when 1999 * there is a resource shortage. 
If that's the 2000 * case, re-queue this request on the head of the 2001 * queue, and go to sleep until we have more 2002 * resources. 2003 */ 2004 STAILQ_INSERT_HEAD(&xbb->reqlist_pending_stailq, 2005 reqlist, links); 2006 break; 2007 } else { 2008 /* 2009 * If we still have anything on the queue after 2010 * removing the head entry, that is because we 2011 * met one of the criteria to create a new 2012 * request list (outlined above), and we'll call 2013 * that a forced dispatch for statistical purposes. 2014 * 2015 * Otherwise, if there is only one element on the 2016 * queue, we coalesced everything available on 2017 * the ring and we'll call that a normal dispatch. 2018 */ 2019 reqlist = STAILQ_FIRST(&xbb->reqlist_pending_stailq); 2020 2021 if (reqlist != NULL) 2022 xbb->forced_dispatch++; 2023 else 2024 xbb->normal_dispatch++; 2025 2026 xbb->total_dispatch++; 2027 } 2028 } 2029} 2030 2031/** 2032 * Interrupt handler bound to the shared ring's event channel. 2033 * 2034 * \param arg Callback argument registered during event channel 2035 * binding - the xbb_softc for this instance. 2036 */ 2037static int 2038xbb_filter(void *arg) 2039{ 2040 struct xbb_softc *xbb; 2041 2042 /* Defer to taskqueue thread. */ 2043 xbb = (struct xbb_softc *)arg; 2044 taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task); 2045 2046 return (FILTER_HANDLED); 2047} 2048 2049SDT_PROVIDER_DEFINE(xbb); 2050SDT_PROBE_DEFINE1(xbb, kernel, xbb_dispatch_dev, flush, "int"); 2051SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_dev, read, "int", "uint64_t", 2052 "uint64_t"); 2053SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_dev, write, "int", 2054 "uint64_t", "uint64_t"); 2055 2056/*----------------------------- Backend Handlers -----------------------------*/ 2057/** 2058 * Backend handler for character device access. 2059 * 2060 * \param xbb Per-instance xbb configuration structure. 2061 * \param reqlist Allocated internal request list structure. 2062 * \param operation BIO_* I/O operation code. 2063 * \param bio_flags Additional bio_flag data to pass to any generated 2064 * bios (e.g. BIO_ORDERED). 2065 * 2066 * \return 0 for success, errno codes for failure.
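 *
 * As a rough sketch of the carving rule implemented below (assuming
 * 4KiB pages and 512-byte device sectors): segments are walked in
 * order and appended to the current bio, and a new bio is started
 * whenever a segment does not begin at sector 0 of its page, or the
 * previous segment did not end at the last sector of its page, since
 * the mapped KVA cannot be virtually contiguous across such a
 * boundary.  Any bio that would not start and end on a device sector
 * boundary causes the request to fail with EINVAL.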
2067 */ 2068static int 2069xbb_dispatch_dev(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist, 2070 int operation, int bio_flags) 2071{ 2072 struct xbb_dev_data *dev_data; 2073 struct bio *bios[XBB_MAX_SEGMENTS_PER_REQLIST]; 2074 off_t bio_offset; 2075 struct bio *bio; 2076 struct xbb_sg *xbb_sg; 2077 u_int nbio; 2078 u_int bio_idx; 2079 u_int nseg; 2080 u_int seg_idx; 2081 int error; 2082 2083 dev_data = &xbb->backend.dev; 2084 bio_offset = (off_t)reqlist->starting_sector_number 2085 << xbb->sector_size_shift; 2086 error = 0; 2087 nbio = 0; 2088 bio_idx = 0; 2089 2090 if (operation == BIO_FLUSH) { 2091 bio = g_new_bio(); 2092 if (__predict_false(bio == NULL)) { 2093 DPRINTF("Unable to allocate bio for BIO_FLUSH\n"); 2094 error = ENOMEM; 2095 return (error); 2096 } 2097 2098 bio->bio_cmd = BIO_FLUSH; 2099 bio->bio_flags |= BIO_ORDERED; 2100 bio->bio_dev = dev_data->cdev; 2101 bio->bio_offset = 0; 2102 bio->bio_data = 0; 2103 bio->bio_done = xbb_bio_done; 2104 bio->bio_caller1 = reqlist; 2105 bio->bio_pblkno = 0; 2106 2107 reqlist->pendcnt = 1; 2108 2109 SDT_PROBE1(xbb, kernel, xbb_dispatch_dev, flush, 2110 device_get_unit(xbb->dev)); 2111 2112 (*dev_data->csw->d_strategy)(bio); 2113 2114 return (0); 2115 } 2116 2117 xbb_sg = xbb->xbb_sgs; 2118 bio = NULL; 2119 nseg = reqlist->nr_segments; 2120 2121 for (seg_idx = 0; seg_idx < nseg; seg_idx++, xbb_sg++) { 2122 2123 /* 2124 * KVA will not be contiguous, so any additional 2125 * I/O will need to be represented in a new bio. 2126 */ 2127 if ((bio != NULL) 2128 && (xbb_sg->first_sect != 0)) { 2129 if ((bio->bio_length & (xbb->sector_size - 1)) != 0) { 2130 printf("%s: Discontiguous I/O request " 2131 "from domain %d ends on " 2132 "non-sector boundary\n", 2133 __func__, xbb->otherend_id); 2134 error = EINVAL; 2135 goto fail_free_bios; 2136 } 2137 bio = NULL; 2138 } 2139 2140 if (bio == NULL) { 2141 /* 2142 * Make sure that the start of this bio is 2143 * aligned to a device sector. 2144 */ 2145 if ((bio_offset & (xbb->sector_size - 1)) != 0){ 2146 printf("%s: Misaligned I/O request " 2147 "from domain %d\n", __func__, 2148 xbb->otherend_id); 2149 error = EINVAL; 2150 goto fail_free_bios; 2151 } 2152 2153 bio = bios[nbio++] = g_new_bio(); 2154 if (__predict_false(bio == NULL)) { 2155 error = ENOMEM; 2156 goto fail_free_bios; 2157 } 2158 bio->bio_cmd = operation; 2159 bio->bio_flags |= bio_flags; 2160 bio->bio_dev = dev_data->cdev; 2161 bio->bio_offset = bio_offset; 2162 bio->bio_data = xbb_reqlist_ioaddr(reqlist, seg_idx, 2163 xbb_sg->first_sect); 2164 bio->bio_done = xbb_bio_done; 2165 bio->bio_caller1 = reqlist; 2166 bio->bio_pblkno = bio_offset >> xbb->sector_size_shift; 2167 } 2168 2169 bio->bio_length += xbb_sg->nsect << 9; 2170 bio->bio_bcount = bio->bio_length; 2171 bio_offset += xbb_sg->nsect << 9; 2172 2173 if (xbb_sg->last_sect != (PAGE_SIZE - 512) >> 9) { 2174 2175 if ((bio->bio_length & (xbb->sector_size - 1)) != 0) { 2176 printf("%s: Discontiguous I/O request " 2177 "from domain %d ends on " 2178 "non-sector boundary\n", 2179 __func__, xbb->otherend_id); 2180 error = EINVAL; 2181 goto fail_free_bios; 2182 } 2183 /* 2184 * KVA will not be contiguous, so any additional 2185 * I/O will need to be represented in a new bio. 
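		 *
		 * (A segment whose last_sect stops short of the final
		 * 512-byte sector of its page ends its mapping before a
		 * page boundary, so the next segment cannot be virtually
		 * contiguous with it; compare the first_sect test at the
		 * top of this loop.)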
2186 */ 2187 bio = NULL; 2188 } 2189 } 2190 2191 reqlist->pendcnt = nbio; 2192 2193 for (bio_idx = 0; bio_idx < nbio; bio_idx++) 2194 { 2195#ifdef XBB_USE_BOUNCE_BUFFERS 2196 vm_offset_t kva_offset; 2197 2198 kva_offset = (vm_offset_t)bios[bio_idx]->bio_data 2199 - (vm_offset_t)reqlist->bounce; 2200 if (operation == BIO_WRITE) { 2201 memcpy(bios[bio_idx]->bio_data, 2202 (uint8_t *)reqlist->kva + kva_offset, 2203 bios[bio_idx]->bio_bcount); 2204 } 2205#endif 2206 if (operation == BIO_READ) { 2207 SDT_PROBE3(xbb, kernel, xbb_dispatch_dev, read, 2208 device_get_unit(xbb->dev), 2209 bios[bio_idx]->bio_offset, 2210 bios[bio_idx]->bio_length); 2211 } else if (operation == BIO_WRITE) { 2212 SDT_PROBE3(xbb, kernel, xbb_dispatch_dev, write, 2213 device_get_unit(xbb->dev), 2214 bios[bio_idx]->bio_offset, 2215 bios[bio_idx]->bio_length); 2216 } 2217 (*dev_data->csw->d_strategy)(bios[bio_idx]); 2218 } 2219 2220 return (error); 2221 2222fail_free_bios: 2223 for (bio_idx = 0; bio_idx < (nbio-1); bio_idx++) 2224 g_destroy_bio(bios[bio_idx]); 2225 2226 return (error); 2227} 2228 2229SDT_PROBE_DEFINE1(xbb, kernel, xbb_dispatch_file, flush, "int"); 2230SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_file, read, "int", "uint64_t", 2231 "uint64_t"); 2232SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_file, write, "int", 2233 "uint64_t", "uint64_t"); 2234 2235/** 2236 * Backend handler for file access. 2237 * 2238 * \param xbb Per-instance xbb configuration structure. 2239 * \param reqlist Allocated internal request list. 2240 * \param operation BIO_* I/O operation code. 2241 * \param flags Additional bio_flag data to pass to any generated bios 2242 * (e.g. BIO_ORDERED).. 2243 * 2244 * \return 0 for success, errno codes for failure. 2245 */ 2246static int 2247xbb_dispatch_file(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist, 2248 int operation, int flags) 2249{ 2250 struct xbb_file_data *file_data; 2251 u_int seg_idx; 2252 u_int nseg; 2253 struct uio xuio; 2254 struct xbb_sg *xbb_sg; 2255 struct iovec *xiovec; 2256#ifdef XBB_USE_BOUNCE_BUFFERS 2257 void **p_vaddr; 2258 int saved_uio_iovcnt; 2259#endif /* XBB_USE_BOUNCE_BUFFERS */ 2260 int error; 2261 2262 file_data = &xbb->backend.file; 2263 error = 0; 2264 bzero(&xuio, sizeof(xuio)); 2265 2266 switch (operation) { 2267 case BIO_READ: 2268 xuio.uio_rw = UIO_READ; 2269 break; 2270 case BIO_WRITE: 2271 xuio.uio_rw = UIO_WRITE; 2272 break; 2273 case BIO_FLUSH: { 2274 struct mount *mountpoint; 2275 2276 SDT_PROBE1(xbb, kernel, xbb_dispatch_file, flush, 2277 device_get_unit(xbb->dev)); 2278 2279 (void) vn_start_write(xbb->vn, &mountpoint, V_WAIT); 2280 2281 vn_lock(xbb->vn, LK_EXCLUSIVE | LK_RETRY); 2282 error = VOP_FSYNC(xbb->vn, MNT_WAIT, curthread); 2283 VOP_UNLOCK(xbb->vn, 0); 2284 2285 vn_finished_write(mountpoint); 2286 2287 goto bailout_send_response; 2288 /* NOTREACHED */ 2289 } 2290 default: 2291 panic("invalid operation %d", operation); 2292 /* NOTREACHED */ 2293 } 2294 xuio.uio_offset = (vm_offset_t)reqlist->starting_sector_number 2295 << xbb->sector_size_shift; 2296 xuio.uio_segflg = UIO_SYSSPACE; 2297 xuio.uio_iov = file_data->xiovecs; 2298 xuio.uio_iovcnt = 0; 2299 xbb_sg = xbb->xbb_sgs; 2300 nseg = reqlist->nr_segments; 2301 2302 for (xiovec = NULL, seg_idx = 0; seg_idx < nseg; seg_idx++, xbb_sg++) { 2303 2304 /* 2305 * If the first sector is not 0, the KVA will 2306 * not be contiguous and we'll need to go on 2307 * to another segment. 
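		 *
		 * For example, a segment with first_sect == 4 begins
		 * 4 * 512 == 2KiB into its mapped page, so it cannot
		 * extend an iovec that ended at the previous page
		 * boundary; a fresh iovec is started instead.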
2308 */ 2309 if (xbb_sg->first_sect != 0) 2310 xiovec = NULL; 2311 2312 if (xiovec == NULL) { 2313 xiovec = &file_data->xiovecs[xuio.uio_iovcnt]; 2314 xiovec->iov_base = xbb_reqlist_ioaddr(reqlist, 2315 seg_idx, xbb_sg->first_sect); 2316#ifdef XBB_USE_BOUNCE_BUFFERS 2317 /* 2318 * Store the address of the incoming 2319 * buffer at this particular offset 2320 * as well, so we can do the copy 2321 * later without having to do more 2322 * work to recalculate this address. 2323 */ 2324 p_vaddr = &file_data->xiovecs_vaddr[xuio.uio_iovcnt]; 2325 *p_vaddr = xbb_reqlist_vaddr(reqlist, seg_idx, 2326 xbb_sg->first_sect); 2327#endif /* XBB_USE_BOUNCE_BUFFERS */ 2328 xiovec->iov_len = 0; 2329 xuio.uio_iovcnt++; 2330 } 2331 2332 xiovec->iov_len += xbb_sg->nsect << 9; 2333 2334 xuio.uio_resid += xbb_sg->nsect << 9; 2335 2336 /* 2337 * If the last sector is not the full page 2338 * size count, the next segment will not be 2339 * contiguous in KVA and we need a new iovec. 2340 */ 2341 if (xbb_sg->last_sect != (PAGE_SIZE - 512) >> 9) 2342 xiovec = NULL; 2343 } 2344 2345 xuio.uio_td = curthread; 2346 2347#ifdef XBB_USE_BOUNCE_BUFFERS 2348 saved_uio_iovcnt = xuio.uio_iovcnt; 2349 2350 if (operation == BIO_WRITE) { 2351 /* Copy the write data to the local buffer. */ 2352 for (seg_idx = 0, p_vaddr = file_data->xiovecs_vaddr, 2353 xiovec = xuio.uio_iov; seg_idx < xuio.uio_iovcnt; 2354 seg_idx++, xiovec++, p_vaddr++) { 2355 2356 memcpy(xiovec->iov_base, *p_vaddr, xiovec->iov_len); 2357 } 2358 } else { 2359 /* 2360 * We only need to save off the iovecs in the case of a 2361 * read, because the copy for the read happens after the 2362 * VOP_READ(). (The uio will get modified in that call 2363 * sequence.) 2364 */ 2365 memcpy(file_data->saved_xiovecs, xuio.uio_iov, 2366 xuio.uio_iovcnt * sizeof(xuio.uio_iov[0])); 2367 } 2368#endif /* XBB_USE_BOUNCE_BUFFERS */ 2369 2370 switch (operation) { 2371 case BIO_READ: 2372 2373 SDT_PROBE3(xbb, kernel, xbb_dispatch_file, read, 2374 device_get_unit(xbb->dev), xuio.uio_offset, 2375 xuio.uio_resid); 2376 2377 vn_lock(xbb->vn, LK_EXCLUSIVE | LK_RETRY); 2378 2379 /* 2380 * UFS pays attention to IO_DIRECT for reads. If the 2381 * DIRECTIO option is configured into the kernel, it calls 2382 * ffs_rawread(). But that only works for single-segment 2383 * uios with user space addresses. In our case, with a 2384 * kernel uio, it still reads into the buffer cache, but it 2385 * will just try to release the buffer from the cache later 2386 * on in ffs_read(). 2387 * 2388 * ZFS does not pay attention to IO_DIRECT for reads. 2389 * 2390 * UFS does not pay attention to IO_SYNC for reads. 2391 * 2392 * ZFS pays attention to IO_SYNC (which translates into the 2393 * Solaris define FRSYNC for zfs_read()) for reads. It 2394 * attempts to sync the file before reading. 2395 * 2396 * So, to attempt to provide some barrier semantics in the 2397 * BIO_ORDERED case, set both IO_DIRECT and IO_SYNC. 2398 */ 2399 error = VOP_READ(xbb->vn, &xuio, (flags & BIO_ORDERED) ? 2400 (IO_DIRECT|IO_SYNC) : 0, file_data->cred); 2401 2402 VOP_UNLOCK(xbb->vn, 0); 2403 break; 2404 case BIO_WRITE: { 2405 struct mount *mountpoint; 2406 2407 SDT_PROBE3(xbb, kernel, xbb_dispatch_file, write, 2408 device_get_unit(xbb->dev), xuio.uio_offset, 2409 xuio.uio_resid); 2410 2411 (void)vn_start_write(xbb->vn, &mountpoint, V_WAIT); 2412 2413 vn_lock(xbb->vn, LK_EXCLUSIVE | LK_RETRY); 2414 2415 /* 2416 * UFS pays attention to IO_DIRECT for writes. The write 2417 * is done asynchronously. 
(Normally the write would just 2418 * get put into cache. 2419 * 2420 * UFS pays attention to IO_SYNC for writes. It will 2421 * attempt to write the buffer out synchronously if that 2422 * flag is set. 2423 * 2424 * ZFS does not pay attention to IO_DIRECT for writes. 2425 * 2426 * ZFS pays attention to IO_SYNC (a.k.a. FSYNC or FRSYNC) 2427 * for writes. It will flush the transaction from the 2428 * cache before returning. 2429 * 2430 * So if we've got the BIO_ORDERED flag set, we want 2431 * IO_SYNC in either the UFS or ZFS case. 2432 */ 2433 error = VOP_WRITE(xbb->vn, &xuio, (flags & BIO_ORDERED) ? 2434 IO_SYNC : 0, file_data->cred); 2435 VOP_UNLOCK(xbb->vn, 0); 2436 2437 vn_finished_write(mountpoint); 2438 2439 break; 2440 } 2441 default: 2442 panic("invalid operation %d", operation); 2443 /* NOTREACHED */ 2444 } 2445 2446#ifdef XBB_USE_BOUNCE_BUFFERS 2447 /* We only need to copy here for read operations */ 2448 if (operation == BIO_READ) { 2449 2450 for (seg_idx = 0, p_vaddr = file_data->xiovecs_vaddr, 2451 xiovec = file_data->saved_xiovecs; 2452 seg_idx < saved_uio_iovcnt; seg_idx++, 2453 xiovec++, p_vaddr++) { 2454 2455 /* 2456 * Note that we have to use the copy of the 2457 * io vector we made above. uiomove() modifies 2458 * the uio and its referenced vector as uiomove 2459 * performs the copy, so we can't rely on any 2460 * state from the original uio. 2461 */ 2462 memcpy(*p_vaddr, xiovec->iov_base, xiovec->iov_len); 2463 } 2464 } 2465#endif /* XBB_USE_BOUNCE_BUFFERS */ 2466 2467bailout_send_response: 2468 2469 if (error != 0) 2470 reqlist->status = BLKIF_RSP_ERROR; 2471 2472 xbb_complete_reqlist(xbb, reqlist); 2473 2474 return (0); 2475} 2476 2477/*--------------------------- Backend Configuration --------------------------*/ 2478/** 2479 * Close and cleanup any backend device/file specific state for this 2480 * block back instance. 2481 * 2482 * \param xbb Per-instance xbb configuration structure. 2483 */ 2484static void 2485xbb_close_backend(struct xbb_softc *xbb) 2486{ 2487 DROP_GIANT(); 2488 DPRINTF("closing dev=%s\n", xbb->dev_name); 2489 if (xbb->vn) { 2490 int flags = FREAD; 2491 2492 if ((xbb->flags & XBBF_READ_ONLY) == 0) 2493 flags |= FWRITE; 2494 2495 switch (xbb->device_type) { 2496 case XBB_TYPE_DISK: 2497 if (xbb->backend.dev.csw) { 2498 dev_relthread(xbb->backend.dev.cdev, 2499 xbb->backend.dev.dev_ref); 2500 xbb->backend.dev.csw = NULL; 2501 xbb->backend.dev.cdev = NULL; 2502 } 2503 break; 2504 case XBB_TYPE_FILE: 2505 break; 2506 case XBB_TYPE_NONE: 2507 default: 2508 panic("Unexpected backend type."); 2509 break; 2510 } 2511 2512 (void)vn_close(xbb->vn, flags, NOCRED, curthread); 2513 xbb->vn = NULL; 2514 2515 switch (xbb->device_type) { 2516 case XBB_TYPE_DISK: 2517 break; 2518 case XBB_TYPE_FILE: 2519 if (xbb->backend.file.cred != NULL) { 2520 crfree(xbb->backend.file.cred); 2521 xbb->backend.file.cred = NULL; 2522 } 2523 break; 2524 case XBB_TYPE_NONE: 2525 default: 2526 panic("Unexpected backend type."); 2527 break; 2528 } 2529 } 2530 PICKUP_GIANT(); 2531} 2532 2533/** 2534 * Open a character device to be used for backend I/O. 2535 * 2536 * \param xbb Per-instance xbb configuration structure. 2537 * 2538 * \return 0 for success, errno codes for failure. 
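 *
 * The sector size and media size of the backing cdev are obtained
 * through its d_ioctl entry point (DIOCGSECTORSIZE and
 * DIOCGMEDIASIZE), so a device without d_ioctl support cannot be
 * exported and is rejected with ENODEV.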
2539 */ 2540static int 2541xbb_open_dev(struct xbb_softc *xbb) 2542{ 2543 struct vattr vattr; 2544 struct cdev *dev; 2545 struct cdevsw *devsw; 2546 int error; 2547 2548 xbb->device_type = XBB_TYPE_DISK; 2549 xbb->dispatch_io = xbb_dispatch_dev; 2550 xbb->backend.dev.cdev = xbb->vn->v_rdev; 2551 xbb->backend.dev.csw = dev_refthread(xbb->backend.dev.cdev, 2552 &xbb->backend.dev.dev_ref); 2553 if (xbb->backend.dev.csw == NULL) 2554 panic("Unable to retrieve device switch"); 2555 2556 error = VOP_GETATTR(xbb->vn, &vattr, NOCRED); 2557 if (error) { 2558 xenbus_dev_fatal(xbb->dev, error, "error getting " 2559 "vnode attributes for device %s", 2560 xbb->dev_name); 2561 return (error); 2562 } 2563 2564 2565 dev = xbb->vn->v_rdev; 2566 devsw = dev->si_devsw; 2567 if (!devsw->d_ioctl) { 2568 xenbus_dev_fatal(xbb->dev, ENODEV, "no d_ioctl for " 2569 "device %s!", xbb->dev_name); 2570 return (ENODEV); 2571 } 2572 2573 error = devsw->d_ioctl(dev, DIOCGSECTORSIZE, 2574 (caddr_t)&xbb->sector_size, FREAD, 2575 curthread); 2576 if (error) { 2577 xenbus_dev_fatal(xbb->dev, error, 2578 "error calling ioctl DIOCGSECTORSIZE " 2579 "for device %s", xbb->dev_name); 2580 return (error); 2581 } 2582 2583 error = devsw->d_ioctl(dev, DIOCGMEDIASIZE, 2584 (caddr_t)&xbb->media_size, FREAD, 2585 curthread); 2586 if (error) { 2587 xenbus_dev_fatal(xbb->dev, error, 2588 "error calling ioctl DIOCGMEDIASIZE " 2589 "for device %s", xbb->dev_name); 2590 return (error); 2591 } 2592 2593 return (0); 2594} 2595 2596/** 2597 * Open a file to be used for backend I/O. 2598 * 2599 * \param xbb Per-instance xbb configuration structure. 2600 * 2601 * \return 0 for success, errno codes for failure. 2602 */ 2603static int 2604xbb_open_file(struct xbb_softc *xbb) 2605{ 2606 struct xbb_file_data *file_data; 2607 struct vattr vattr; 2608 int error; 2609 2610 file_data = &xbb->backend.file; 2611 xbb->device_type = XBB_TYPE_FILE; 2612 xbb->dispatch_io = xbb_dispatch_file; 2613 error = VOP_GETATTR(xbb->vn, &vattr, curthread->td_ucred); 2614 if (error != 0) { 2615 xenbus_dev_fatal(xbb->dev, error, 2616 "error calling VOP_GETATTR()" 2617 "for file %s", xbb->dev_name); 2618 return (error); 2619 } 2620 2621 /* 2622 * Verify that we have the ability to upgrade to exclusive 2623 * access on this file so we can trap errors at open instead 2624 * of reporting them during first access. 2625 */ 2626 if (VOP_ISLOCKED(xbb->vn) != LK_EXCLUSIVE) { 2627 vn_lock(xbb->vn, LK_UPGRADE | LK_RETRY); 2628 if (xbb->vn->v_iflag & VI_DOOMED) { 2629 error = EBADF; 2630 xenbus_dev_fatal(xbb->dev, error, 2631 "error locking file %s", 2632 xbb->dev_name); 2633 2634 return (error); 2635 } 2636 } 2637 2638 file_data->cred = crhold(curthread->td_ucred); 2639 xbb->media_size = vattr.va_size; 2640 2641 /* 2642 * XXX KDM vattr.va_blocksize may be larger than 512 bytes here. 2643 * With ZFS, it is 131072 bytes. Block sizes that large don't work 2644 * with disklabel and UFS on FreeBSD at least. Large block sizes 2645 * may not work with other OSes as well. So just export a sector 2646 * size of 512 bytes, which should work with any OS or 2647 * application. Since our backing is a file, any block size will 2648 * work fine for the backing store. 2649 */ 2650#if 0 2651 xbb->sector_size = vattr.va_blocksize; 2652#endif 2653 xbb->sector_size = 512; 2654 2655 /* 2656 * Sanity check. The media size has to be at least one 2657 * sector long. 
2658 */ 2659 if (xbb->media_size < xbb->sector_size) { 2660 error = EINVAL; 2661 xenbus_dev_fatal(xbb->dev, error, 2662 "file %s size %ju < block size %u", 2663 xbb->dev_name, 2664 (uintmax_t)xbb->media_size, 2665 xbb->sector_size); 2666 } 2667 return (error); 2668} 2669 2670/** 2671 * Open the backend provider for this connection. 2672 * 2673 * \param xbb Per-instance xbb configuration structure. 2674 * 2675 * \return 0 for success, errno codes for failure. 2676 */ 2677static int 2678xbb_open_backend(struct xbb_softc *xbb) 2679{ 2680 struct nameidata nd; 2681 int flags; 2682 int error; 2683 2684 flags = FREAD; 2685 error = 0; 2686 2687 DPRINTF("opening dev=%s\n", xbb->dev_name); 2688 2689 if (rootvnode == NULL) { 2690 xenbus_dev_fatal(xbb->dev, ENOENT, 2691 "Root file system not mounted"); 2692 return (ENOENT); 2693 } 2694 2695 if ((xbb->flags & XBBF_READ_ONLY) == 0) 2696 flags |= FWRITE; 2697 2698 pwd_ensure_dirs(); 2699 2700 again: 2701 NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, xbb->dev_name, curthread); 2702 error = vn_open(&nd, &flags, 0, NULL); 2703 if (error) { 2704 /* 2705 * This is the only reasonable guess we can make as far as 2706 * path if the user doesn't give us a fully qualified path. 2707 * If they want to specify a file, they need to specify the 2708 * full path. 2709 */ 2710 if (xbb->dev_name[0] != '/') { 2711 char *dev_path = "/dev/"; 2712 char *dev_name; 2713 2714 /* Try adding device path at beginning of name */ 2715 dev_name = malloc(strlen(xbb->dev_name) 2716 + strlen(dev_path) + 1, 2717 M_XENBLOCKBACK, M_NOWAIT); 2718 if (dev_name) { 2719 sprintf(dev_name, "%s%s", dev_path, 2720 xbb->dev_name); 2721 free(xbb->dev_name, M_XENBLOCKBACK); 2722 xbb->dev_name = dev_name; 2723 goto again; 2724 } 2725 } 2726 xenbus_dev_fatal(xbb->dev, error, "error opening device %s", 2727 xbb->dev_name); 2728 return (error); 2729 } 2730 2731 NDFREE(&nd, NDF_ONLY_PNBUF); 2732 2733 xbb->vn = nd.ni_vp; 2734 2735 /* We only support disks and files. */ 2736 if (vn_isdisk(xbb->vn, &error)) { 2737 error = xbb_open_dev(xbb); 2738 } else if (xbb->vn->v_type == VREG) { 2739 error = xbb_open_file(xbb); 2740 } else { 2741 error = EINVAL; 2742 xenbus_dev_fatal(xbb->dev, error, "%s is not a disk " 2743 "or file", xbb->dev_name); 2744 } 2745 VOP_UNLOCK(xbb->vn, 0); 2746 2747 if (error != 0) { 2748 xbb_close_backend(xbb); 2749 return (error); 2750 } 2751 2752 xbb->sector_size_shift = fls(xbb->sector_size) - 1; 2753 xbb->media_num_sectors = xbb->media_size >> xbb->sector_size_shift; 2754 2755 DPRINTF("opened %s=%s sector_size=%u media_size=%" PRId64 "\n", 2756 (xbb->device_type == XBB_TYPE_DISK) ? "dev" : "file", 2757 xbb->dev_name, xbb->sector_size, xbb->media_size); 2758 2759 return (0); 2760} 2761 2762/*------------------------ Inter-Domain Communication ------------------------*/ 2763/** 2764 * Free dynamically allocated KVA or pseudo-physical address allocations. 2765 * 2766 * \param xbb Per-instance xbb configuration structure. 2767 */ 2768static void 2769xbb_free_communication_mem(struct xbb_softc *xbb) 2770{ 2771 if (xbb->kva != 0) { 2772 if (xbb->pseudo_phys_res != NULL) { 2773 xenmem_free(xbb->dev, xbb->pseudo_phys_res_id, 2774 xbb->pseudo_phys_res); 2775 xbb->pseudo_phys_res = NULL; 2776 } 2777 } 2778 xbb->kva = 0; 2779 xbb->gnt_base_addr = 0; 2780 if (xbb->kva_free != NULL) { 2781 free(xbb->kva_free, M_XENBLOCKBACK); 2782 xbb->kva_free = NULL; 2783 } 2784} 2785 2786/** 2787 * Cleanup all inter-domain communication mechanisms. 2788 * 2789 * \param xbb Per-instance xbb configuration structure. 
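 *
 * \return 0 on success.  EAGAIN if I/O is still in flight; in that
 *         case the teardown must be retried once the outstanding
 *         requests have drained.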
2790 */ 2791static int 2792xbb_disconnect(struct xbb_softc *xbb) 2793{ 2794 struct gnttab_unmap_grant_ref ops[XBB_MAX_RING_PAGES]; 2795 struct gnttab_unmap_grant_ref *op; 2796 u_int ring_idx; 2797 int error; 2798 2799 DPRINTF("\n"); 2800 2801 if ((xbb->flags & XBBF_RING_CONNECTED) == 0) 2802 return (0); 2803 2804 xen_intr_unbind(&xbb->xen_intr_handle); 2805 2806 mtx_unlock(&xbb->lock); 2807 taskqueue_drain(xbb->io_taskqueue, &xbb->io_task); 2808 mtx_lock(&xbb->lock); 2809 2810 /* 2811 * No new interrupts can generate work, but we must wait 2812 * for all currently active requests to drain. 2813 */ 2814 if (xbb->active_request_count != 0) 2815 return (EAGAIN); 2816 2817 for (ring_idx = 0, op = ops; 2818 ring_idx < xbb->ring_config.ring_pages; 2819 ring_idx++, op++) { 2820 2821 op->host_addr = xbb->ring_config.gnt_addr 2822 + (ring_idx * PAGE_SIZE); 2823 op->dev_bus_addr = xbb->ring_config.bus_addr[ring_idx]; 2824 op->handle = xbb->ring_config.handle[ring_idx]; 2825 } 2826 2827 error = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, ops, 2828 xbb->ring_config.ring_pages); 2829 if (error != 0) 2830 panic("Grant table op failed (%d)", error); 2831 2832 xbb_free_communication_mem(xbb); 2833 2834 if (xbb->requests != NULL) { 2835 free(xbb->requests, M_XENBLOCKBACK); 2836 xbb->requests = NULL; 2837 } 2838 2839 if (xbb->request_lists != NULL) { 2840 struct xbb_xen_reqlist *reqlist; 2841 int i; 2842 2843 /* There is one request list for every allocated request. */ 2844 for (i = 0, reqlist = xbb->request_lists; 2845 i < xbb->max_requests; i++, reqlist++){ 2846#ifdef XBB_USE_BOUNCE_BUFFERS 2847 if (reqlist->bounce != NULL) { 2848 free(reqlist->bounce, M_XENBLOCKBACK); 2849 reqlist->bounce = NULL; 2850 } 2851#endif 2852 if (reqlist->gnt_handles != NULL) { 2853 free(reqlist->gnt_handles, M_XENBLOCKBACK); 2854 reqlist->gnt_handles = NULL; 2855 } 2856 } 2857 free(xbb->request_lists, M_XENBLOCKBACK); 2858 xbb->request_lists = NULL; 2859 } 2860 2861 xbb->flags &= ~XBBF_RING_CONNECTED; 2862 return (0); 2863} 2864 2865/** 2866 * Map shared memory ring into domain local address space, initialize 2867 * ring control structures, and bind an interrupt to the event channel 2868 * used to notify us of ring changes. 2869 * 2870 * \param xbb Per-instance xbb configuration structure. 2871 */ 2872static int 2873xbb_connect_ring(struct xbb_softc *xbb) 2874{ 2875 struct gnttab_map_grant_ref gnts[XBB_MAX_RING_PAGES]; 2876 struct gnttab_map_grant_ref *gnt; 2877 u_int ring_idx; 2878 int error; 2879 2880 if ((xbb->flags & XBBF_RING_CONNECTED) != 0) 2881 return (0); 2882 2883 /* 2884 * Kva for our ring is at the tail of the region of kva allocated 2885 * by xbb_alloc_communication_mem().
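	 *
	 * Roughly:
	 *
	 * xbb->kva                                     xbb->kva + kva_size
	 *  |<------ reqlist_kva_size ------>|<- ring_pages * PAGE_SIZE ->|
	 *  [ per-request segment mappings   ][ shared ring page mappings ]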
2886 */ 2887 xbb->ring_config.va = xbb->kva 2888 + (xbb->kva_size 2889 - (xbb->ring_config.ring_pages * PAGE_SIZE)); 2890 xbb->ring_config.gnt_addr = xbb->gnt_base_addr 2891 + (xbb->kva_size 2892 - (xbb->ring_config.ring_pages * PAGE_SIZE)); 2893 2894 for (ring_idx = 0, gnt = gnts; 2895 ring_idx < xbb->ring_config.ring_pages; 2896 ring_idx++, gnt++) { 2897 2898 gnt->host_addr = xbb->ring_config.gnt_addr 2899 + (ring_idx * PAGE_SIZE); 2900 gnt->flags = GNTMAP_host_map; 2901 gnt->ref = xbb->ring_config.ring_ref[ring_idx]; 2902 gnt->dom = xbb->otherend_id; 2903 } 2904 2905 error = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, gnts, 2906 xbb->ring_config.ring_pages); 2907 if (error) 2908 panic("blkback: Ring page grant table op failed (%d)", error); 2909 2910 for (ring_idx = 0, gnt = gnts; 2911 ring_idx < xbb->ring_config.ring_pages; 2912 ring_idx++, gnt++) { 2913 if (gnt->status != 0) { 2914 xbb->ring_config.va = 0; 2915 xenbus_dev_fatal(xbb->dev, EACCES, 2916 "Ring shared page mapping failed. " 2917 "Status %d.", gnt->status); 2918 return (EACCES); 2919 } 2920 xbb->ring_config.handle[ring_idx] = gnt->handle; 2921 xbb->ring_config.bus_addr[ring_idx] = gnt->dev_bus_addr; 2922 } 2923 2924 /* Initialize the ring based on ABI. */ 2925 switch (xbb->abi) { 2926 case BLKIF_PROTOCOL_NATIVE: 2927 { 2928 blkif_sring_t *sring; 2929 sring = (blkif_sring_t *)xbb->ring_config.va; 2930 BACK_RING_INIT(&xbb->rings.native, sring, 2931 xbb->ring_config.ring_pages * PAGE_SIZE); 2932 break; 2933 } 2934 case BLKIF_PROTOCOL_X86_32: 2935 { 2936 blkif_x86_32_sring_t *sring_x86_32; 2937 sring_x86_32 = (blkif_x86_32_sring_t *)xbb->ring_config.va; 2938 BACK_RING_INIT(&xbb->rings.x86_32, sring_x86_32, 2939 xbb->ring_config.ring_pages * PAGE_SIZE); 2940 break; 2941 } 2942 case BLKIF_PROTOCOL_X86_64: 2943 { 2944 blkif_x86_64_sring_t *sring_x86_64; 2945 sring_x86_64 = (blkif_x86_64_sring_t *)xbb->ring_config.va; 2946 BACK_RING_INIT(&xbb->rings.x86_64, sring_x86_64, 2947 xbb->ring_config.ring_pages * PAGE_SIZE); 2948 break; 2949 } 2950 default: 2951 panic("Unexpected blkif protocol ABI."); 2952 } 2953 2954 xbb->flags |= XBBF_RING_CONNECTED; 2955 2956 error = xen_intr_bind_remote_port(xbb->dev, 2957 xbb->otherend_id, 2958 xbb->ring_config.evtchn, 2959 xbb_filter, 2960 /*ithread_handler*/NULL, 2961 /*arg*/xbb, 2962 INTR_TYPE_BIO | INTR_MPSAFE, 2963 &xbb->xen_intr_handle); 2964 if (error) { 2965 (void)xbb_disconnect(xbb); 2966 xenbus_dev_fatal(xbb->dev, error, "binding event channel"); 2967 return (error); 2968 } 2969 2970 DPRINTF("rings connected!\n"); 2971 2972 return 0; 2973} 2974 2975/** 2976 * Size KVA and pseudo-physical address allocations based on negotiated 2977 * values for the size and number of I/O requests, and the size of our 2978 * communication ring. 2979 * 2980 * \param xbb Per-instance xbb configuration structure. 2981 * 2982 * These address spaces are used to dynamically map pages in the 2983 * front-end's domain into our own. 
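 *
 * The reservation is sized as
 *
 *     max_requests * max_request_segments * PAGE_SIZE
 *         + ring_config.ring_pages * PAGE_SIZE
 *
 * For example, the legacy defaults of 32 requests, 11 segments per
 * request, and a single 4KiB ring page work out to roughly 1.4MiB of
 * KVA; the exact figure depends on the values negotiated with the
 * front-end.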
2984 */ 2985static int 2986xbb_alloc_communication_mem(struct xbb_softc *xbb) 2987{ 2988 xbb->reqlist_kva_pages = xbb->max_requests * xbb->max_request_segments; 2989 xbb->reqlist_kva_size = xbb->reqlist_kva_pages * PAGE_SIZE; 2990 xbb->kva_size = xbb->reqlist_kva_size + 2991 (xbb->ring_config.ring_pages * PAGE_SIZE); 2992 2993 xbb->kva_free = bit_alloc(xbb->reqlist_kva_pages, M_XENBLOCKBACK, M_NOWAIT); 2994 if (xbb->kva_free == NULL) 2995 return (ENOMEM); 2996 2997 DPRINTF("%s: kva_size = %d, reqlist_kva_size = %d\n", 2998 device_get_nameunit(xbb->dev), xbb->kva_size, 2999 xbb->reqlist_kva_size); 3000 /* 3001 * Reserve a range of pseudo physical memory that we can map 3002 * into kva. These pages will only be backed by machine 3003 * pages ("real memory") during the lifetime of front-end requests 3004 * via grant table operations. 3005 */ 3006 xbb->pseudo_phys_res_id = 0; 3007 xbb->pseudo_phys_res = xenmem_alloc(xbb->dev, &xbb->pseudo_phys_res_id, 3008 xbb->kva_size); 3009 if (xbb->pseudo_phys_res == NULL) { 3010 xbb->kva = 0; 3011 return (ENOMEM); 3012 } 3013 xbb->kva = (vm_offset_t)rman_get_virtual(xbb->pseudo_phys_res); 3014 xbb->gnt_base_addr = rman_get_start(xbb->pseudo_phys_res); 3015 3016 DPRINTF("%s: kva: %#jx, gnt_base_addr: %#jx\n", 3017 device_get_nameunit(xbb->dev), (uintmax_t)xbb->kva, 3018 (uintmax_t)xbb->gnt_base_addr); 3019 return (0); 3020} 3021 3022/** 3023 * Collect front-end information from the XenStore. 3024 * 3025 * \param xbb Per-instance xbb configuration structure. 3026 */ 3027static int 3028xbb_collect_frontend_info(struct xbb_softc *xbb) 3029{ 3030 char protocol_abi[64]; 3031 const char *otherend_path; 3032 int error; 3033 u_int ring_idx; 3034 u_int ring_page_order; 3035 size_t ring_size; 3036 3037 otherend_path = xenbus_get_otherend_path(xbb->dev); 3038 3039 /* 3040 * Protocol defaults valid even if all negotiation fails. 3041 */ 3042 xbb->ring_config.ring_pages = 1; 3043 xbb->max_request_segments = BLKIF_MAX_SEGMENTS_PER_REQUEST; 3044 xbb->max_request_size = xbb->max_request_segments * PAGE_SIZE; 3045 3046 /* 3047 * Mandatory data (used in all versions of the protocol) first. 3048 */ 3049 error = xs_scanf(XST_NIL, otherend_path, 3050 "event-channel", NULL, "%" PRIu32, 3051 &xbb->ring_config.evtchn); 3052 if (error != 0) { 3053 xenbus_dev_fatal(xbb->dev, error, 3054 "Unable to retrieve event-channel information " 3055 "from frontend %s. Unable to connect.", 3056 xenbus_get_otherend_path(xbb->dev)); 3057 return (error); 3058 } 3059 3060 /* 3061 * These fields are initialized to legacy protocol defaults 3062 * so we only need to fail if reading the updated value succeeds 3063 * and the new value is outside of its allowed range. 3064 * 3065 * \note xs_gather() returns on the first encountered error, so 3066 * we must use independent calls in order to guarantee 3067 * we don't miss information in a sparsly populated front-end 3068 * tree. 3069 * 3070 * \note xs_scanf() does not update variables for unmatched 3071 * fields. 3072 */ 3073 ring_page_order = 0; 3074 xbb->max_requests = 32; 3075 3076 (void)xs_scanf(XST_NIL, otherend_path, 3077 "ring-page-order", NULL, "%u", 3078 &ring_page_order); 3079 xbb->ring_config.ring_pages = 1 << ring_page_order; 3080 ring_size = PAGE_SIZE * xbb->ring_config.ring_pages; 3081 xbb->max_requests = BLKIF_MAX_RING_REQUESTS(ring_size); 3082 3083 if (xbb->ring_config.ring_pages > XBB_MAX_RING_PAGES) { 3084 xenbus_dev_fatal(xbb->dev, EINVAL, 3085 "Front-end specified ring-pages of %u " 3086 "exceeds backend limit of %u. 
" 3087 "Unable to connect.", 3088 xbb->ring_config.ring_pages, 3089 XBB_MAX_RING_PAGES); 3090 return (EINVAL); 3091 } 3092 3093 if (xbb->ring_config.ring_pages == 1) { 3094 error = xs_gather(XST_NIL, otherend_path, 3095 "ring-ref", "%" PRIu32, 3096 &xbb->ring_config.ring_ref[0], 3097 NULL); 3098 if (error != 0) { 3099 xenbus_dev_fatal(xbb->dev, error, 3100 "Unable to retrieve ring information " 3101 "from frontend %s. Unable to " 3102 "connect.", 3103 xenbus_get_otherend_path(xbb->dev)); 3104 return (error); 3105 } 3106 } else { 3107 /* Multi-page ring format. */ 3108 for (ring_idx = 0; ring_idx < xbb->ring_config.ring_pages; 3109 ring_idx++) { 3110 char ring_ref_name[]= "ring_refXX"; 3111 3112 snprintf(ring_ref_name, sizeof(ring_ref_name), 3113 "ring-ref%u", ring_idx); 3114 error = xs_scanf(XST_NIL, otherend_path, 3115 ring_ref_name, NULL, "%" PRIu32, 3116 &xbb->ring_config.ring_ref[ring_idx]); 3117 if (error != 0) { 3118 xenbus_dev_fatal(xbb->dev, error, 3119 "Failed to retriev grant " 3120 "reference for page %u of " 3121 "shared ring. Unable " 3122 "to connect.", ring_idx); 3123 return (error); 3124 } 3125 } 3126 } 3127 3128 error = xs_gather(XST_NIL, otherend_path, 3129 "protocol", "%63s", protocol_abi, 3130 NULL); 3131 if (error != 0 3132 || !strcmp(protocol_abi, XEN_IO_PROTO_ABI_NATIVE)) { 3133 /* 3134 * Assume native if the frontend has not 3135 * published ABI data or it has published and 3136 * matches our own ABI. 3137 */ 3138 xbb->abi = BLKIF_PROTOCOL_NATIVE; 3139 } else if (!strcmp(protocol_abi, XEN_IO_PROTO_ABI_X86_32)) { 3140 3141 xbb->abi = BLKIF_PROTOCOL_X86_32; 3142 } else if (!strcmp(protocol_abi, XEN_IO_PROTO_ABI_X86_64)) { 3143 3144 xbb->abi = BLKIF_PROTOCOL_X86_64; 3145 } else { 3146 3147 xenbus_dev_fatal(xbb->dev, EINVAL, 3148 "Unknown protocol ABI (%s) published by " 3149 "frontend. Unable to connect.", protocol_abi); 3150 return (EINVAL); 3151 } 3152 return (0); 3153} 3154 3155/** 3156 * Allocate per-request data structures given request size and number 3157 * information negotiated with the front-end. 3158 * 3159 * \param xbb Per-instance xbb configuration structure. 3160 */ 3161static int 3162xbb_alloc_requests(struct xbb_softc *xbb) 3163{ 3164 struct xbb_xen_req *req; 3165 struct xbb_xen_req *last_req; 3166 3167 /* 3168 * Allocate request book keeping datastructures. 3169 */ 3170 xbb->requests = malloc(xbb->max_requests * sizeof(*xbb->requests), 3171 M_XENBLOCKBACK, M_NOWAIT|M_ZERO); 3172 if (xbb->requests == NULL) { 3173 xenbus_dev_fatal(xbb->dev, ENOMEM, 3174 "Unable to allocate request structures"); 3175 return (ENOMEM); 3176 } 3177 3178 req = xbb->requests; 3179 last_req = &xbb->requests[xbb->max_requests - 1]; 3180 STAILQ_INIT(&xbb->request_free_stailq); 3181 while (req <= last_req) { 3182 STAILQ_INSERT_TAIL(&xbb->request_free_stailq, req, links); 3183 req++; 3184 } 3185 return (0); 3186} 3187 3188static int 3189xbb_alloc_request_lists(struct xbb_softc *xbb) 3190{ 3191 struct xbb_xen_reqlist *reqlist; 3192 int i; 3193 3194 /* 3195 * If no requests can be merged, we need 1 request list per 3196 * in flight request. 
3197 */ 3198 xbb->request_lists = malloc(xbb->max_requests * 3199 sizeof(*xbb->request_lists), M_XENBLOCKBACK, M_NOWAIT|M_ZERO); 3200 if (xbb->request_lists == NULL) { 3201 xenbus_dev_fatal(xbb->dev, ENOMEM, 3202 "Unable to allocate request list structures"); 3203 return (ENOMEM); 3204 } 3205 3206 STAILQ_INIT(&xbb->reqlist_free_stailq); 3207 STAILQ_INIT(&xbb->reqlist_pending_stailq); 3208 for (i = 0; i < xbb->max_requests; i++) { 3209 int seg; 3210 3211 reqlist = &xbb->request_lists[i]; 3212 3213 reqlist->xbb = xbb; 3214 3215#ifdef XBB_USE_BOUNCE_BUFFERS 3216 reqlist->bounce = malloc(xbb->max_reqlist_size, 3217 M_XENBLOCKBACK, M_NOWAIT); 3218 if (reqlist->bounce == NULL) { 3219 xenbus_dev_fatal(xbb->dev, ENOMEM, 3220 "Unable to allocate request " 3221 "bounce buffers"); 3222 return (ENOMEM); 3223 } 3224#endif /* XBB_USE_BOUNCE_BUFFERS */ 3225 3226 reqlist->gnt_handles = malloc(xbb->max_reqlist_segments * 3227 sizeof(*reqlist->gnt_handles), 3228 M_XENBLOCKBACK, M_NOWAIT|M_ZERO); 3229 if (reqlist->gnt_handles == NULL) { 3230 xenbus_dev_fatal(xbb->dev, ENOMEM, 3231 "Unable to allocate request " 3232 "grant references"); 3233 return (ENOMEM); 3234 } 3235 3236 for (seg = 0; seg < xbb->max_reqlist_segments; seg++) 3237 reqlist->gnt_handles[seg] = GRANT_REF_INVALID; 3238 3239 STAILQ_INSERT_TAIL(&xbb->reqlist_free_stailq, reqlist, links); 3240 } 3241 return (0); 3242} 3243 3244/** 3245 * Supply information about the physical device to the frontend 3246 * via XenBus. 3247 * 3248 * \param xbb Per-instance xbb configuration structure. 3249 */ 3250static int 3251xbb_publish_backend_info(struct xbb_softc *xbb) 3252{ 3253 struct xs_transaction xst; 3254 const char *our_path; 3255 const char *leaf; 3256 int error; 3257 3258 our_path = xenbus_get_node(xbb->dev); 3259 while (1) { 3260 error = xs_transaction_start(&xst); 3261 if (error != 0) { 3262 xenbus_dev_fatal(xbb->dev, error, 3263 "Error publishing backend info " 3264 "(start transaction)"); 3265 return (error); 3266 } 3267 3268 leaf = "sectors"; 3269 error = xs_printf(xst, our_path, leaf, 3270 "%"PRIu64, xbb->media_num_sectors); 3271 if (error != 0) 3272 break; 3273 3274 /* XXX Support all VBD attributes here. */ 3275 leaf = "info"; 3276 error = xs_printf(xst, our_path, leaf, "%u", 3277 xbb->flags & XBBF_READ_ONLY 3278 ? VDISK_READONLY : 0); 3279 if (error != 0) 3280 break; 3281 3282 leaf = "sector-size"; 3283 error = xs_printf(xst, our_path, leaf, "%u", 3284 xbb->sector_size); 3285 if (error != 0) 3286 break; 3287 3288 error = xs_transaction_end(xst, 0); 3289 if (error == 0) { 3290 return (0); 3291 } else if (error != EAGAIN) { 3292 xenbus_dev_fatal(xbb->dev, error, "ending transaction"); 3293 return (error); 3294 } 3295 } 3296 3297 xenbus_dev_fatal(xbb->dev, error, "writing %s/%s", 3298 our_path, leaf); 3299 xs_transaction_end(xst, 1); 3300 return (error); 3301} 3302 3303/** 3304 * Connect to our blkfront peer now that it has completed publishing 3305 * its configuration into the XenStore. 3306 * 3307 * \param xbb Per-instance xbb configuration structure. 3308 */ 3309static void 3310xbb_connect(struct xbb_softc *xbb) 3311{ 3312 int error; 3313 3314 if (!xbb->hotplug_done || 3315 (xenbus_get_state(xbb->dev) != XenbusStateInitWait) || 3316 (xbb_collect_frontend_info(xbb) != 0)) 3317 return; 3318 3319 xbb->flags &= ~XBBF_SHUTDOWN; 3320 3321 /* 3322 * We limit the maximum number of reqlist segments to the maximum 3323 * number of segments in the ring, or our absolute maximum, 3324 * whichever is smaller. 
3325 */ 3326 xbb->max_reqlist_segments = MIN(xbb->max_request_segments * 3327 xbb->max_requests, XBB_MAX_SEGMENTS_PER_REQLIST); 3328 3329 /* 3330 * The maximum size is simply a function of the number of segments 3331 * we can handle. 3332 */ 3333 xbb->max_reqlist_size = xbb->max_reqlist_segments * PAGE_SIZE; 3334 3335 /* Allocate resources whose size depends on front-end configuration. */ 3336 error = xbb_alloc_communication_mem(xbb); 3337 if (error != 0) { 3338 xenbus_dev_fatal(xbb->dev, error, 3339 "Unable to allocate communication memory"); 3340 return; 3341 } 3342 3343 error = xbb_alloc_requests(xbb); 3344 if (error != 0) { 3345 /* Specific errors are reported by xbb_alloc_requests(). */ 3346 return; 3347 } 3348 3349 error = xbb_alloc_request_lists(xbb); 3350 if (error != 0) { 3351 /* Specific errors are reported by xbb_alloc_request_lists(). */ 3352 return; 3353 } 3354 3355 /* 3356 * Connect communication channel. 3357 */ 3358 error = xbb_connect_ring(xbb); 3359 if (error != 0) { 3360 /* Specific errors are reported by xbb_connect_ring(). */ 3361 return; 3362 } 3363 3364 if (xbb_publish_backend_info(xbb) != 0) { 3365 /* 3366 * If we can't publish our data, we cannot participate 3367 * in this connection, and waiting for a front-end state 3368 * change will not help the situation. 3369 */ 3370 (void)xbb_disconnect(xbb); 3371 return; 3372 } 3373 3374 /* Ready for I/O. */ 3375 xenbus_set_state(xbb->dev, XenbusStateConnected); 3376} 3377 3378/*-------------------------- Device Teardown Support -------------------------*/ 3379/** 3380 * Perform device shutdown functions. 3381 * 3382 * \param xbb Per-instance xbb configuration structure. 3383 * 3384 * Mark this instance as shutting down, wait for any active I/O on the 3385 * backend device/file to drain, disconnect from the front-end, and notify 3386 * any waiters (e.g. a thread invoking our detach method) that detach can 3387 * now proceed. 3388 */ 3389static int 3390xbb_shutdown(struct xbb_softc *xbb) 3391{ 3392 XenbusState frontState; 3393 int error; 3394 3395 DPRINTF("\n"); 3396 3397 /* 3398 * Due to the need to drop our mutex during some 3399 * xenbus operations, it is possible for two threads 3400 * to attempt to close out shutdown processing at 3401 * the same time. Tell the caller that hits this 3402 * race to try back later. 3403 */ 3404 if ((xbb->flags & XBBF_IN_SHUTDOWN) != 0) 3405 return (EAGAIN); 3406 3407 xbb->flags |= XBBF_IN_SHUTDOWN; 3408 mtx_unlock(&xbb->lock); 3409 3410 if (xbb->hotplug_watch.node != NULL) { 3411 xs_unregister_watch(&xbb->hotplug_watch); 3412 free(xbb->hotplug_watch.node, M_XENBLOCKBACK); 3413 xbb->hotplug_watch.node = NULL; 3414 } 3415 xbb->hotplug_done = false; 3416 3417 if (xenbus_get_state(xbb->dev) < XenbusStateClosing) 3418 xenbus_set_state(xbb->dev, XenbusStateClosing); 3419 3420 frontState = xenbus_get_otherend_state(xbb->dev); 3421 mtx_lock(&xbb->lock); 3422 xbb->flags &= ~XBBF_IN_SHUTDOWN; 3423 3424 /* Wait for the frontend to disconnect (if it's connected). */ 3425 if (frontState == XenbusStateConnected) 3426 return (EAGAIN); 3427 3428 DPRINTF("\n"); 3429 3430 /* Indicate shutdown is in progress. */ 3431 xbb->flags |= XBBF_SHUTDOWN; 3432 3433 /* Disconnect from the front-end. */ 3434 error = xbb_disconnect(xbb); 3435 if (error != 0) { 3436 /* 3437 * Requests still outstanding. We'll be called again 3438 * once they complete. 
3439 */ 3440 KASSERT(error == EAGAIN, 3441 ("%s: Unexpected xbb_disconnect() failure %d", 3442 __func__, error)); 3443 3444 return (error); 3445 } 3446 3447 DPRINTF("\n"); 3448 3449 /* Indicate to xbb_detach() that it is safe to proceed. */ 3450 wakeup(xbb); 3451 3452 return (0); 3453} 3454 3455/** 3456 * Report an attach time error to the console and Xen, and cleanup 3457 * this instance by forcing immediate detach processing. 3458 * 3459 * \param xbb Per-instance xbb configuration structure. 3460 * \param err Errno describing the error. 3461 * \param fmt Printf style format and arguments 3462 */ 3463static void 3464xbb_attach_failed(struct xbb_softc *xbb, int err, const char *fmt, ...) 3465{ 3466 va_list ap; 3467 va_list ap_hotplug; 3468 3469 va_start(ap, fmt); 3470 va_copy(ap_hotplug, ap); 3471 xs_vprintf(XST_NIL, xenbus_get_node(xbb->dev), 3472 "hotplug-error", fmt, ap_hotplug); 3473 va_end(ap_hotplug); 3474 xs_printf(XST_NIL, xenbus_get_node(xbb->dev), 3475 "hotplug-status", "error"); 3476 3477 xenbus_dev_vfatal(xbb->dev, err, fmt, ap); 3478 va_end(ap); 3479 3480 xs_printf(XST_NIL, xenbus_get_node(xbb->dev), 3481 "online", "0"); 3482 mtx_lock(&xbb->lock); 3483 xbb_shutdown(xbb); 3484 mtx_unlock(&xbb->lock); 3485} 3486 3487/*---------------------------- NewBus Entrypoints ----------------------------*/ 3488/** 3489 * Inspect a XenBus device and claim it if it is of the appropriate type. 3490 * 3491 * \param dev NewBus device object representing a candidate XenBus device. 3492 * 3493 * \return 0 for success, errno codes for failure. 3494 */ 3495static int 3496xbb_probe(device_t dev) 3497{ 3498 3499 if (!strcmp(xenbus_get_type(dev), "vbd")) { 3500 device_set_desc(dev, "Backend Virtual Block Device"); 3501 device_quiet(dev); 3502 return (0); 3503 } 3504 3505 return (ENXIO); 3506} 3507 3508/** 3509 * Setup sysctl variables to control various Block Back parameters. 3510 * 3511 * \param xbb Xen Block Back softc.
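 *
 * The knobs and counters appear under the device's sysctl tree, so
 * (assuming the first instance attaches as xbbd0) they can be
 * inspected and tuned from the backend domain with something like:
 *
 *     sysctl dev.xbbd.0.no_coalesce_reqs=1
 *     sysctl dev.xbbd.0.total_dispatch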
3512 * 3513 */ 3514static void 3515xbb_setup_sysctl(struct xbb_softc *xbb) 3516{ 3517 struct sysctl_ctx_list *sysctl_ctx = NULL; 3518 struct sysctl_oid *sysctl_tree = NULL; 3519 3520 sysctl_ctx = device_get_sysctl_ctx(xbb->dev); 3521 if (sysctl_ctx == NULL) 3522 return; 3523 3524 sysctl_tree = device_get_sysctl_tree(xbb->dev); 3525 if (sysctl_tree == NULL) 3526 return; 3527 3528 SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3529 "disable_flush", CTLFLAG_RW, &xbb->disable_flush, 0, 3530 "fake the flush command"); 3531 3532 SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3533 "flush_interval", CTLFLAG_RW, &xbb->flush_interval, 0, 3534 "send a real flush for N flush requests"); 3535 3536 SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3537 "no_coalesce_reqs", CTLFLAG_RW, &xbb->no_coalesce_reqs,0, 3538 "Don't coalesce contiguous requests"); 3539 3540 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3541 "reqs_received", CTLFLAG_RW, &xbb->reqs_received, 3542 "how many I/O requests we have received"); 3543 3544 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3545 "reqs_completed", CTLFLAG_RW, &xbb->reqs_completed, 3546 "how many I/O requests have been completed"); 3547 3548 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3549 "reqs_queued_for_completion", CTLFLAG_RW, 3550 &xbb->reqs_queued_for_completion, 3551 "how many I/O requests queued but not yet pushed"); 3552 3553 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3554 "reqs_completed_with_error", CTLFLAG_RW, 3555 &xbb->reqs_completed_with_error, 3556 "how many I/O requests completed with error status"); 3557 3558 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3559 "forced_dispatch", CTLFLAG_RW, &xbb->forced_dispatch, 3560 "how many I/O dispatches were forced"); 3561 3562 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3563 "normal_dispatch", CTLFLAG_RW, &xbb->normal_dispatch, 3564 "how many I/O dispatches were normal"); 3565 3566 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3567 "total_dispatch", CTLFLAG_RW, &xbb->total_dispatch, 3568 "total number of I/O dispatches"); 3569 3570 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3571 "kva_shortages", CTLFLAG_RW, &xbb->kva_shortages, 3572 "how many times we have run out of KVA"); 3573 3574 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3575 "request_shortages", CTLFLAG_RW, 3576 &xbb->request_shortages, 3577 "how many times we have run out of requests"); 3578 3579 SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3580 "max_requests", CTLFLAG_RD, &xbb->max_requests, 0, 3581 "maximum outstanding requests (negotiated)"); 3582 3583 SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3584 "max_request_segments", CTLFLAG_RD, 3585 &xbb->max_request_segments, 0, 3586 "maximum number of pages per requests (negotiated)"); 3587 3588 SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3589 "max_request_size", CTLFLAG_RD, 3590 &xbb->max_request_size, 0, 3591 "maximum size in bytes of a request (negotiated)"); 3592 3593 SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3594 "ring_pages", CTLFLAG_RD, 3595 &xbb->ring_config.ring_pages, 0, 3596 "communication channel pages (negotiated)"); 3597} 3598 3599static void 3600xbb_attach_disk(struct xs_watch *watch, const char **vec, unsigned int len) 
3601{ 3602 device_t dev; 3603 struct xbb_softc *xbb; 3604 int error; 3605 3606 dev = (device_t) watch->callback_data; 3607 xbb = device_get_softc(dev); 3608 3609 error = xs_gather(XST_NIL, xenbus_get_node(dev), "physical-device-path", 3610 NULL, &xbb->dev_name, NULL); 3611 if (error != 0) 3612 return; 3613 3614 xs_unregister_watch(watch); 3615 free(watch->node, M_XENBLOCKBACK); 3616 watch->node = NULL; 3617 3618 /* Collect physical device information. */ 3619 error = xs_gather(XST_NIL, xenbus_get_otherend_path(xbb->dev), 3620 "device-type", NULL, &xbb->dev_type, 3621 NULL); 3622 if (error != 0) 3623 xbb->dev_type = NULL; 3624 3625 error = xs_gather(XST_NIL, xenbus_get_node(dev), 3626 "mode", NULL, &xbb->dev_mode, 3627 NULL); 3628 if (error != 0) { 3629 xbb_attach_failed(xbb, error, "reading backend fields at %s", 3630 xenbus_get_node(dev)); 3631 return; 3632 } 3633 3634 /* Parse fopen style mode flags. */ 3635 if (strchr(xbb->dev_mode, 'w') == NULL) 3636 xbb->flags |= XBBF_READ_ONLY; 3637 3638 /* 3639 * Verify the physical device is present and can support 3640 * the desired I/O mode. 3641 */ 3642 error = xbb_open_backend(xbb); 3643 if (error != 0) { 3644 xbb_attach_failed(xbb, error, "Unable to open %s", 3645 xbb->dev_name); 3646 return; 3647 } 3648 3649 /* Use devstat(9) for recording statistics. */ 3650 xbb->xbb_stats = devstat_new_entry("xbb", device_get_unit(xbb->dev), 3651 xbb->sector_size, 3652 DEVSTAT_ALL_SUPPORTED, 3653 DEVSTAT_TYPE_DIRECT 3654 | DEVSTAT_TYPE_IF_OTHER, 3655 DEVSTAT_PRIORITY_OTHER); 3656 3657 xbb->xbb_stats_in = devstat_new_entry("xbbi", device_get_unit(xbb->dev), 3658 xbb->sector_size, 3659 DEVSTAT_ALL_SUPPORTED, 3660 DEVSTAT_TYPE_DIRECT 3661 | DEVSTAT_TYPE_IF_OTHER, 3662 DEVSTAT_PRIORITY_OTHER); 3663 /* 3664 * Setup sysctl variables. 3665 */ 3666 xbb_setup_sysctl(xbb); 3667 3668 /* 3669 * Create a taskqueue for doing work that must occur from a 3670 * thread context. 3671 */ 3672 xbb->io_taskqueue = taskqueue_create_fast(device_get_nameunit(dev), 3673 M_NOWAIT, 3674 taskqueue_thread_enqueue, 3675 /*contxt*/&xbb->io_taskqueue); 3676 if (xbb->io_taskqueue == NULL) { 3677 xbb_attach_failed(xbb, error, "Unable to create taskqueue"); 3678 return; 3679 } 3680 3681 taskqueue_start_threads(&xbb->io_taskqueue, 3682 /*num threads*/1, 3683 /*priority*/PWAIT, 3684 /*thread name*/ 3685 "%s taskq", device_get_nameunit(dev)); 3686 3687 /* Update hot-plug status to satisfy xend. */ 3688 error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev), 3689 "hotplug-status", "connected"); 3690 if (error) { 3691 xbb_attach_failed(xbb, error, "writing %s/hotplug-status", 3692 xenbus_get_node(xbb->dev)); 3693 return; 3694 } 3695 3696 xbb->hotplug_done = true; 3697 3698 /* The front end might be waiting for the backend, attach if so. */ 3699 if (xenbus_get_otherend_state(xbb->dev) == XenbusStateInitialised) 3700 xbb_connect(xbb); 3701} 3702 3703/** 3704 * Attach to a XenBus device that has been claimed by our probe routine. 3705 * 3706 * \param dev NewBus device object representing this Xen Block Back instance. 3707 * 3708 * \return 0 for success, errno codes for failure. 3709 */ 3710static int 3711xbb_attach(device_t dev) 3712{ 3713 struct xbb_softc *xbb; 3714 int error; 3715 u_int max_ring_page_order; 3716 struct sbuf *watch_path; 3717 3718 DPRINTF("Attaching to %s\n", xenbus_get_node(dev)); 3719 3720 /* 3721 * Basic initialization. 3722 * After this block it is safe to call xbb_detach() 3723 * to clean up any allocated data for this instance. 
3724 */ 3725 xbb = device_get_softc(dev); 3726 xbb->dev = dev; 3727 xbb->otherend_id = xenbus_get_otherend_id(dev); 3728 TASK_INIT(&xbb->io_task, /*priority*/0, xbb_run_queue, xbb); 3729 mtx_init(&xbb->lock, device_get_nameunit(dev), NULL, MTX_DEF); 3730 3731 /* 3732 * Publish protocol capabilities for consumption by the 3733 * front-end. 3734 */ 3735 error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev), 3736 "feature-barrier", "1"); 3737 if (error) { 3738 xbb_attach_failed(xbb, error, "writing %s/feature-barrier", 3739 xenbus_get_node(xbb->dev)); 3740 return (error); 3741 } 3742 3743 error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev), 3744 "feature-flush-cache", "1"); 3745 if (error) { 3746 xbb_attach_failed(xbb, error, "writing %s/feature-flush-cache", 3747 xenbus_get_node(xbb->dev)); 3748 return (error); 3749 } 3750 3751 max_ring_page_order = flsl(XBB_MAX_RING_PAGES) - 1; 3752 error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev), 3753 "max-ring-page-order", "%u", max_ring_page_order); 3754 if (error) { 3755 xbb_attach_failed(xbb, error, "writing %s/max-ring-page-order", 3756 xenbus_get_node(xbb->dev)); 3757 return (error); 3758 } 3759 3760 /* 3761 * We need to wait for hotplug script execution before 3762 * moving forward. 3763 */ 3764 KASSERT(!xbb->hotplug_done, ("Hotplug scripts already executed")); 3765 watch_path = xs_join(xenbus_get_node(xbb->dev), "physical-device-path"); 3766 xbb->hotplug_watch.callback_data = (uintptr_t)dev; 3767 xbb->hotplug_watch.callback = xbb_attach_disk; 3768 KASSERT(xbb->hotplug_watch.node == NULL, ("watch node already setup")); 3769 xbb->hotplug_watch.node = strdup(sbuf_data(watch_path), M_XENBLOCKBACK); 3770 sbuf_delete(watch_path); 3771 error = xs_register_watch(&xbb->hotplug_watch); 3772 if (error != 0) { 3773 xbb_attach_failed(xbb, error, "failed to create watch on %s", 3774 xbb->hotplug_watch.node); 3775 free(xbb->hotplug_watch.node, M_XENBLOCKBACK); 3776 return (error); 3777 } 3778 3779 /* Tell the toolstack blkback has attached. */ 3780 xenbus_set_state(dev, XenbusStateInitWait); 3781 3782 return (0); 3783} 3784 3785/** 3786 * Detach from a block back device instance. 3787 * 3788 * \param dev NewBus device object representing this Xen Block Back instance. 3789 * 3790 * \return 0 for success, errno codes for failure. 3791 * 3792 * \note A block back device may be detached at any time in its life-cycle, 3793 * including part way through the attach process. For this reason, 3794 * initialization order and the initialization state checks in this 3795 * routine must be carefully coupled so that attach time failures 3796 * are gracefully handled. 
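 *
 * Detach loops on xbb_shutdown(), sleeping on the softc while it
 * returns EAGAIN, and only then releases the taskqueue, devstat
 * entries, backing store, and XenStore-allocated strings.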
3797 */ 3798static int 3799xbb_detach(device_t dev) 3800{ 3801 struct xbb_softc *xbb; 3802 3803 DPRINTF("\n"); 3804 3805 xbb = device_get_softc(dev); 3806 mtx_lock(&xbb->lock); 3807 while (xbb_shutdown(xbb) == EAGAIN) { 3808 msleep(xbb, &xbb->lock, /*wakeup prio unchanged*/0, 3809 "xbb_shutdown", 0); 3810 } 3811 mtx_unlock(&xbb->lock); 3812 3813 DPRINTF("\n"); 3814 3815 if (xbb->io_taskqueue != NULL) 3816 taskqueue_free(xbb->io_taskqueue); 3817 3818 if (xbb->xbb_stats != NULL) 3819 devstat_remove_entry(xbb->xbb_stats); 3820 3821 if (xbb->xbb_stats_in != NULL) 3822 devstat_remove_entry(xbb->xbb_stats_in); 3823 3824 xbb_close_backend(xbb); 3825 3826 if (xbb->dev_mode != NULL) { 3827 free(xbb->dev_mode, M_XENSTORE); 3828 xbb->dev_mode = NULL; 3829 } 3830 3831 if (xbb->dev_type != NULL) { 3832 free(xbb->dev_type, M_XENSTORE); 3833 xbb->dev_type = NULL; 3834 } 3835 3836 if (xbb->dev_name != NULL) { 3837 free(xbb->dev_name, M_XENSTORE); 3838 xbb->dev_name = NULL; 3839 } 3840 3841 mtx_destroy(&xbb->lock); 3842 return (0); 3843} 3844 3845/** 3846 * Prepare this block back device for suspension of this VM. 3847 * 3848 * \param dev NewBus device object representing this Xen Block Back instance. 3849 * 3850 * \return 0 for success, errno codes for failure. 3851 */ 3852static int 3853xbb_suspend(device_t dev) 3854{ 3855#ifdef NOT_YET 3856 struct xbb_softc *sc = device_get_softc(dev); 3857 3858 /* Prevent new requests being issued until we fix things up. */ 3859 mtx_lock(&sc->xb_io_lock); 3860 sc->connected = BLKIF_STATE_SUSPENDED; 3861 mtx_unlock(&sc->xb_io_lock); 3862#endif 3863 3864 return (0); 3865} 3866 3867/** 3868 * Perform any processing required to recover from a suspended state. 3869 * 3870 * \param dev NewBus device object representing this Xen Block Back instance. 3871 * 3872 * \return 0 for success, errno codes for failure. 3873 */ 3874static int 3875xbb_resume(device_t dev) 3876{ 3877 return (0); 3878} 3879 3880/** 3881 * Handle state changes expressed via the XenStore by our front-end peer. 3882 * 3883 * \param dev NewBus device object representing this Xen 3884 * Block Back instance. 3885 * \param frontend_state The new state of the front-end. 3886 * 3887 * \return 0 for success, errno codes for failure. 
3888 */ 3889static void 3890xbb_frontend_changed(device_t dev, XenbusState frontend_state) 3891{ 3892 struct xbb_softc *xbb = device_get_softc(dev); 3893 3894 DPRINTF("frontend_state=%s, xbb_state=%s\n", 3895 xenbus_strstate(frontend_state), 3896 xenbus_strstate(xenbus_get_state(xbb->dev))); 3897 3898 switch (frontend_state) { 3899 case XenbusStateInitialising: 3900 break; 3901 case XenbusStateInitialised: 3902 case XenbusStateConnected: 3903 xbb_connect(xbb); 3904 break; 3905 case XenbusStateClosing: 3906 case XenbusStateClosed: 3907 mtx_lock(&xbb->lock); 3908 xbb_shutdown(xbb); 3909 mtx_unlock(&xbb->lock); 3910 if (frontend_state == XenbusStateClosed) 3911 xenbus_set_state(xbb->dev, XenbusStateClosed); 3912 break; 3913 default: 3914 xenbus_dev_fatal(xbb->dev, EINVAL, "saw state %d at frontend", 3915 frontend_state); 3916 break; 3917 } 3918} 3919 3920/*---------------------------- NewBus Registration ---------------------------*/ 3921static device_method_t xbb_methods[] = { 3922 /* Device interface */ 3923 DEVMETHOD(device_probe, xbb_probe), 3924 DEVMETHOD(device_attach, xbb_attach), 3925 DEVMETHOD(device_detach, xbb_detach), 3926 DEVMETHOD(device_shutdown, bus_generic_shutdown), 3927 DEVMETHOD(device_suspend, xbb_suspend), 3928 DEVMETHOD(device_resume, xbb_resume), 3929 3930 /* Xenbus interface */ 3931 DEVMETHOD(xenbus_otherend_changed, xbb_frontend_changed), 3932 3933 { 0, 0 } 3934}; 3935 3936static driver_t xbb_driver = { 3937 "xbbd", 3938 xbb_methods, 3939 sizeof(struct xbb_softc), 3940}; 3941devclass_t xbb_devclass; 3942 3943DRIVER_MODULE(xbbd, xenbusb_back, xbb_driver, xbb_devclass, 0, 0); 3944
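
/*
 * Usage sketch (illustrative only, not part of the driver): with this module
 * loaded in a FreeBSD backend domain, the Xen toolstack connects a guest to
 * it through an ordinary disk stanza, e.g. something along the lines of
 *
 *	disk = [ 'backend=<backend-domain>, vdev=xvda, access=rw,
 *	         target=/dev/ada1p3' ]
 *
 * which causes a new "vbd" node to appear on the xenbusb_back bus and be
 * claimed by xbb_probe()/xbb_attach() above.  The domain, device, and path
 * names here are placeholders.
 */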