blkback.c revision 282274
1/*- 2 * Copyright (c) 2009-2012 Spectra Logic Corporation 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions, and the following disclaimer, 10 * without modification. 11 * 2. Redistributions in binary form must reproduce at minimum a disclaimer 12 * substantially similar to the "NO WARRANTY" disclaimer below 13 * ("Disclaimer") and any redistribution must be conditioned upon 14 * including a substantially similar Disclaimer requirement for further 15 * binary redistribution. 16 * 17 * NO WARRANTY 18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR 21 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 22 * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 24 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 26 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING 27 * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 28 * POSSIBILITY OF SUCH DAMAGES. 29 * 30 * Authors: Justin T. Gibbs (Spectra Logic Corporation) 31 * Ken Merry (Spectra Logic Corporation) 32 */ 33#include <sys/cdefs.h> 34__FBSDID("$FreeBSD: head/sys/dev/xen/blkback/blkback.c 282274 2015-04-30 15:48:48Z jhb $"); 35 36/** 37 * \file blkback.c 38 * 39 * \brief Device driver supporting the vending of block storage from 40 * a FreeBSD domain to other domains. 41 */ 42 43#include <sys/param.h> 44#include <sys/systm.h> 45#include <sys/kernel.h> 46#include <sys/malloc.h> 47 48#include <sys/bio.h> 49#include <sys/bus.h> 50#include <sys/conf.h> 51#include <sys/devicestat.h> 52#include <sys/disk.h> 53#include <sys/fcntl.h> 54#include <sys/filedesc.h> 55#include <sys/kdb.h> 56#include <sys/module.h> 57#include <sys/namei.h> 58#include <sys/proc.h> 59#include <sys/rman.h> 60#include <sys/taskqueue.h> 61#include <sys/types.h> 62#include <sys/vnode.h> 63#include <sys/mount.h> 64#include <sys/sysctl.h> 65#include <sys/bitstring.h> 66#include <sys/sdt.h> 67 68#include <geom/geom.h> 69 70#include <machine/_inttypes.h> 71 72#include <vm/vm.h> 73#include <vm/vm_extern.h> 74#include <vm/vm_kern.h> 75 76#include <xen/xen-os.h> 77#include <xen/blkif.h> 78#include <xen/gnttab.h> 79#include <xen/xen_intr.h> 80 81#include <xen/interface/event_channel.h> 82#include <xen/interface/grant_table.h> 83 84#include <xen/xenbus/xenbusvar.h> 85 86/*--------------------------- Compile-time Tunables --------------------------*/ 87/** 88 * The maximum number of outstanding request blocks (request headers plus 89 * additional segment blocks) we will allow in a negotiated block-front/back 90 * communication channel. 91 */ 92#define XBB_MAX_REQUESTS 256 93 94/** 95 * \brief Define to force all I/O to be performed on memory owned by the 96 * backend device, with a copy-in/out to the remote domain's memory. 97 * 98 * \note This option is currently required when this driver's domain is 99 * operating in HVM mode on a system using an IOMMU. 
100 * 101 * This driver uses Xen's grant table API to gain access to the memory of 102 * the remote domains it serves. When our domain is operating in PV mode, 103 * the grant table mechanism directly updates our domain's page table entries 104 * to point to the physical pages of the remote domain. This scheme guarantees 105 * that blkback and the backing devices it uses can safely perform DMA 106 * operations to satisfy requests. In HVM mode, Xen may use a HW IOMMU to 107 * insure that our domain cannot DMA to pages owned by another domain. As 108 * of Xen 4.0, IOMMU mappings for HVM guests are not updated via the grant 109 * table API. For this reason, in HVM mode, we must bounce all requests into 110 * memory that is mapped into our domain at domain startup and thus has 111 * valid IOMMU mappings. 112 */ 113#define XBB_USE_BOUNCE_BUFFERS 114 115/** 116 * \brief Define to enable rudimentary request logging to the console. 117 */ 118#undef XBB_DEBUG 119 120/*---------------------------------- Macros ----------------------------------*/ 121/** 122 * Custom malloc type for all driver allocations. 123 */ 124static MALLOC_DEFINE(M_XENBLOCKBACK, "xbbd", "Xen Block Back Driver Data"); 125 126#ifdef XBB_DEBUG 127#define DPRINTF(fmt, args...) \ 128 printf("xbb(%s:%d): " fmt, __FUNCTION__, __LINE__, ##args) 129#else 130#define DPRINTF(fmt, args...) do {} while(0) 131#endif 132 133/** 134 * The maximum mapped region size per request we will allow in a negotiated 135 * block-front/back communication channel. 136 */ 137#define XBB_MAX_REQUEST_SIZE \ 138 MIN(MAXPHYS, BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE) 139 140/** 141 * The maximum number of segments (within a request header and accompanying 142 * segment blocks) per request we will allow in a negotiated block-front/back 143 * communication channel. 144 */ 145#define XBB_MAX_SEGMENTS_PER_REQUEST \ 146 (MIN(UIO_MAXIOV, \ 147 MIN(BLKIF_MAX_SEGMENTS_PER_REQUEST, \ 148 (XBB_MAX_REQUEST_SIZE / PAGE_SIZE) + 1))) 149 150/** 151 * The maximum number of shared memory ring pages we will allow in a 152 * negotiated block-front/back communication channel. Allow enough 153 * ring space for all requests to be XBB_MAX_REQUEST_SIZE'd. 154 */ 155#define XBB_MAX_RING_PAGES \ 156 BLKIF_RING_PAGES(BLKIF_SEGS_TO_BLOCKS(XBB_MAX_SEGMENTS_PER_REQUEST) \ 157 * XBB_MAX_REQUESTS) 158/** 159 * The maximum number of ring pages that we can allow per request list. 160 * We limit this to the maximum number of segments per request, because 161 * that is already a reasonable number of segments to aggregate. This 162 * number should never be smaller than XBB_MAX_SEGMENTS_PER_REQUEST, 163 * because that would leave situations where we can't dispatch even one 164 * large request. 165 */ 166#define XBB_MAX_SEGMENTS_PER_REQLIST XBB_MAX_SEGMENTS_PER_REQUEST 167 168/*--------------------------- Forward Declarations ---------------------------*/ 169struct xbb_softc; 170struct xbb_xen_req; 171 172static void xbb_attach_failed(struct xbb_softc *xbb, int err, const char *fmt, 173 ...) __attribute__((format(printf, 3, 4))); 174static int xbb_shutdown(struct xbb_softc *xbb); 175static int xbb_detach(device_t dev); 176 177/*------------------------------ Data Structures -----------------------------*/ 178 179STAILQ_HEAD(xbb_xen_req_list, xbb_xen_req); 180 181typedef enum { 182 XBB_REQLIST_NONE = 0x00, 183 XBB_REQLIST_MAPPED = 0x01 184} xbb_reqlist_flags; 185 186struct xbb_xen_reqlist { 187 /** 188 * Back reference to the parent block back instance for this 189 * request. 
Used during bio_done handling. 190 */ 191 struct xbb_softc *xbb; 192 193 /** 194 * BLKIF_OP code for this request. 195 */ 196 int operation; 197 198 /** 199 * Set to BLKIF_RSP_* to indicate request status. 200 * 201 * This field allows an error status to be recorded even if the 202 * delivery of this status must be deferred. Deferred reporting 203 * is necessary, for example, when an error is detected during 204 * completion processing of one bio when other bios for this 205 * request are still outstanding. 206 */ 207 int status; 208 209 /** 210 * Number of 512 byte sectors not transferred. 211 */ 212 int residual_512b_sectors; 213 214 /** 215 * Starting sector number of the first request in the list. 216 */ 217 off_t starting_sector_number; 218 219 /** 220 * If we're going to coalesce, the next contiguous sector would be 221 * this one. 222 */ 223 off_t next_contig_sector; 224 225 /** 226 * Number of child requests in the list. 227 */ 228 int num_children; 229 230 /** 231 * Number of I/O requests still pending on the backend. 232 */ 233 int pendcnt; 234 235 /** 236 * Total number of segments for requests in the list. 237 */ 238 int nr_segments; 239 240 /** 241 * Flags for this particular request list. 242 */ 243 xbb_reqlist_flags flags; 244 245 /** 246 * Kernel virtual address space reserved for this request 247 * list structure and used to map the remote domain's pages for 248 * this I/O, into our domain's address space. 249 */ 250 uint8_t *kva; 251 252 /** 253 * Base, psuedo-physical address, corresponding to the start 254 * of this request's kva region. 255 */ 256 uint64_t gnt_base; 257 258 259#ifdef XBB_USE_BOUNCE_BUFFERS 260 /** 261 * Pre-allocated domain local memory used to proxy remote 262 * domain memory during I/O operations. 263 */ 264 uint8_t *bounce; 265#endif 266 267 /** 268 * Array of grant handles (one per page) used to map this request. 269 */ 270 grant_handle_t *gnt_handles; 271 272 /** 273 * Device statistics request ordering type (ordered or simple). 274 */ 275 devstat_tag_type ds_tag_type; 276 277 /** 278 * Device statistics request type (read, write, no_data). 279 */ 280 devstat_trans_flags ds_trans_type; 281 282 /** 283 * The start time for this request. 284 */ 285 struct bintime ds_t0; 286 287 /** 288 * Linked list of contiguous requests with the same operation type. 289 */ 290 struct xbb_xen_req_list contig_req_list; 291 292 /** 293 * Linked list links used to aggregate idle requests in the 294 * request list free pool (xbb->reqlist_free_stailq) and pending 295 * requests waiting for execution (xbb->reqlist_pending_stailq). 296 */ 297 STAILQ_ENTRY(xbb_xen_reqlist) links; 298}; 299 300STAILQ_HEAD(xbb_xen_reqlist_list, xbb_xen_reqlist); 301 302/** 303 * \brief Object tracking an in-flight I/O from a Xen VBD consumer. 304 */ 305struct xbb_xen_req { 306 /** 307 * Linked list links used to aggregate requests into a reqlist 308 * and to store them in the request free pool. 309 */ 310 STAILQ_ENTRY(xbb_xen_req) links; 311 312 /** 313 * The remote domain's identifier for this I/O request. 314 */ 315 uint64_t id; 316 317 /** 318 * The number of pages currently mapped for this request. 319 */ 320 int nr_pages; 321 322 /** 323 * The number of 512 byte sectors comprising this requests. 324 */ 325 int nr_512b_sectors; 326 327 /** 328 * BLKIF_OP code for this request. 329 */ 330 int operation; 331 332 /** 333 * Storage used for non-native ring requests. 334 */ 335 blkif_request_t ring_req_storage; 336 337 /** 338 * Pointer to the Xen request in the ring. 
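 *
 * For the native ABI this points directly at the request in the
 * shared ring; for the x86_32/x86_64 ABIs the request is first
 * copied into ring_req_storage above and this points at that copy
 * (see xbb_get_resources()).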
339 */ 340 blkif_request_t *ring_req; 341 342 /** 343 * Consumer index for this request. 344 */ 345 RING_IDX req_ring_idx; 346 347 /** 348 * The start time for this request. 349 */ 350 struct bintime ds_t0; 351 352 /** 353 * Pointer back to our parent request list. 354 */ 355 struct xbb_xen_reqlist *reqlist; 356}; 357SLIST_HEAD(xbb_xen_req_slist, xbb_xen_req); 358 359/** 360 * \brief Configuration data for the shared memory request ring 361 * used to communicate with the front-end client of this 362 * this driver. 363 */ 364struct xbb_ring_config { 365 /** KVA address where ring memory is mapped. */ 366 vm_offset_t va; 367 368 /** The pseudo-physical address where ring memory is mapped.*/ 369 uint64_t gnt_addr; 370 371 /** 372 * Grant table handles, one per-ring page, returned by the 373 * hyperpervisor upon mapping of the ring and required to 374 * unmap it when a connection is torn down. 375 */ 376 grant_handle_t handle[XBB_MAX_RING_PAGES]; 377 378 /** 379 * The device bus address returned by the hypervisor when 380 * mapping the ring and required to unmap it when a connection 381 * is torn down. 382 */ 383 uint64_t bus_addr[XBB_MAX_RING_PAGES]; 384 385 /** The number of ring pages mapped for the current connection. */ 386 u_int ring_pages; 387 388 /** 389 * The grant references, one per-ring page, supplied by the 390 * front-end, allowing us to reference the ring pages in the 391 * front-end's domain and to map these pages into our own domain. 392 */ 393 grant_ref_t ring_ref[XBB_MAX_RING_PAGES]; 394 395 /** The interrupt driven even channel used to signal ring events. */ 396 evtchn_port_t evtchn; 397}; 398 399/** 400 * Per-instance connection state flags. 401 */ 402typedef enum 403{ 404 /** 405 * The front-end requested a read-only mount of the 406 * back-end device/file. 407 */ 408 XBBF_READ_ONLY = 0x01, 409 410 /** Communication with the front-end has been established. */ 411 XBBF_RING_CONNECTED = 0x02, 412 413 /** 414 * Front-end requests exist in the ring and are waiting for 415 * xbb_xen_req objects to free up. 416 */ 417 XBBF_RESOURCE_SHORTAGE = 0x04, 418 419 /** Connection teardown in progress. */ 420 XBBF_SHUTDOWN = 0x08, 421 422 /** A thread is already performing shutdown processing. */ 423 XBBF_IN_SHUTDOWN = 0x10 424} xbb_flag_t; 425 426/** Backend device type. */ 427typedef enum { 428 /** Backend type unknown. */ 429 XBB_TYPE_NONE = 0x00, 430 431 /** 432 * Backend type disk (access via cdev switch 433 * strategy routine). 434 */ 435 XBB_TYPE_DISK = 0x01, 436 437 /** Backend type file (access vnode operations.). */ 438 XBB_TYPE_FILE = 0x02 439} xbb_type; 440 441/** 442 * \brief Structure used to memoize information about a per-request 443 * scatter-gather list. 444 * 445 * The chief benefit of using this data structure is it avoids having 446 * to reparse the possibly discontiguous S/G list in the original 447 * request. Due to the way that the mapping of the memory backing an 448 * I/O transaction is handled by Xen, a second pass is unavoidable. 449 * At least this way the second walk is a simple array traversal. 450 * 451 * \note A single Scatter/Gather element in the block interface covers 452 * at most 1 machine page. In this context a sector (blkif 453 * nomenclature, not what I'd choose) is a 512b aligned unit 454 * of mapping within the machine page referenced by an S/G 455 * element. 456 */ 457struct xbb_sg { 458 /** The number of 512b data chunks mapped in this S/G element. 
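 *
 * For example (illustrative values): an element with first_sect = 2
 * and last_sect = 5 maps nsect = 5 - 2 + 1 = 4 chunks, i.e. bytes
 * 1024 through 3071 of the referenced machine page. With 4KB pages
 * a single element can map at most 8 such chunks.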
*/ 459 int16_t nsect; 460 461 /** 462 * The index (0 based) of the first 512b data chunk mapped 463 * in this S/G element. 464 */ 465 uint8_t first_sect; 466 467 /** 468 * The index (0 based) of the last 512b data chunk mapped 469 * in this S/G element. 470 */ 471 uint8_t last_sect; 472}; 473 474/** 475 * Character device backend specific configuration data. 476 */ 477struct xbb_dev_data { 478 /** Cdev used for device backend access. */ 479 struct cdev *cdev; 480 481 /** Cdev switch used for device backend access. */ 482 struct cdevsw *csw; 483 484 /** Used to hold a reference on opened cdev backend devices. */ 485 int dev_ref; 486}; 487 488/** 489 * File backend specific configuration data. 490 */ 491struct xbb_file_data { 492 /** Credentials to use for vnode backed (file based) I/O. */ 493 struct ucred *cred; 494 495 /** 496 * \brief Array of io vectors used to process file based I/O. 497 * 498 * Only a single file based request is outstanding per-xbb instance, 499 * so we only need one of these. 500 */ 501 struct iovec xiovecs[XBB_MAX_SEGMENTS_PER_REQLIST]; 502#ifdef XBB_USE_BOUNCE_BUFFERS 503 504 /** 505 * \brief Array of io vectors used to handle bouncing of file reads. 506 * 507 * Vnode operations are free to modify uio data during their 508 * exectuion. In the case of a read with bounce buffering active, 509 * we need some of the data from the original uio in order to 510 * bounce-out the read data. This array serves as the temporary 511 * storage for this saved data. 512 */ 513 struct iovec saved_xiovecs[XBB_MAX_SEGMENTS_PER_REQLIST]; 514 515 /** 516 * \brief Array of memoized bounce buffer kva offsets used 517 * in the file based backend. 518 * 519 * Due to the way that the mapping of the memory backing an 520 * I/O transaction is handled by Xen, a second pass through 521 * the request sg elements is unavoidable. We memoize the computed 522 * bounce address here to reduce the cost of the second walk. 523 */ 524 void *xiovecs_vaddr[XBB_MAX_SEGMENTS_PER_REQLIST]; 525#endif /* XBB_USE_BOUNCE_BUFFERS */ 526}; 527 528/** 529 * Collection of backend type specific data. 530 */ 531union xbb_backend_data { 532 struct xbb_dev_data dev; 533 struct xbb_file_data file; 534}; 535 536/** 537 * Function signature of backend specific I/O handlers. 538 */ 539typedef int (*xbb_dispatch_t)(struct xbb_softc *xbb, 540 struct xbb_xen_reqlist *reqlist, int operation, 541 int flags); 542 543/** 544 * Per-instance configuration data. 545 */ 546struct xbb_softc { 547 548 /** 549 * Task-queue used to process I/O requests. 550 */ 551 struct taskqueue *io_taskqueue; 552 553 /** 554 * Single "run the request queue" task enqueued 555 * on io_taskqueue. 556 */ 557 struct task io_task; 558 559 /** Device type for this instance. */ 560 xbb_type device_type; 561 562 /** NewBus device corresponding to this instance. */ 563 device_t dev; 564 565 /** Backend specific dispatch routine for this instance. */ 566 xbb_dispatch_t dispatch_io; 567 568 /** The number of requests outstanding on the backend device/file. */ 569 int active_request_count; 570 571 /** Free pool of request tracking structures. */ 572 struct xbb_xen_req_list request_free_stailq; 573 574 /** Array, sized at connection time, of request tracking structures. */ 575 struct xbb_xen_req *requests; 576 577 /** Free pool of request list structures. */ 578 struct xbb_xen_reqlist_list reqlist_free_stailq; 579 580 /** List of pending request lists awaiting execution. 
*/ 581 struct xbb_xen_reqlist_list reqlist_pending_stailq; 582 583 /** Array, sized at connection time, of request list structures. */ 584 struct xbb_xen_reqlist *request_lists; 585 586 /** 587 * Global pool of kva used for mapping remote domain ring 588 * and I/O transaction data. 589 */ 590 vm_offset_t kva; 591 592 /** Psuedo-physical address corresponding to kva. */ 593 uint64_t gnt_base_addr; 594 595 /** The size of the global kva pool. */ 596 int kva_size; 597 598 /** The size of the KVA area used for request lists. */ 599 int reqlist_kva_size; 600 601 /** The number of pages of KVA used for request lists */ 602 int reqlist_kva_pages; 603 604 /** Bitmap of free KVA pages */ 605 bitstr_t *kva_free; 606 607 /** 608 * \brief Cached value of the front-end's domain id. 609 * 610 * This value is used at once for each mapped page in 611 * a transaction. We cache it to avoid incuring the 612 * cost of an ivar access every time this is needed. 613 */ 614 domid_t otherend_id; 615 616 /** 617 * \brief The blkif protocol abi in effect. 618 * 619 * There are situations where the back and front ends can 620 * have a different, native abi (e.g. intel x86_64 and 621 * 32bit x86 domains on the same machine). The back-end 622 * always accomodates the front-end's native abi. That 623 * value is pulled from the XenStore and recorded here. 624 */ 625 int abi; 626 627 /** 628 * \brief The maximum number of requests and request lists allowed 629 * to be in flight at a time. 630 * 631 * This value is negotiated via the XenStore. 632 */ 633 u_int max_requests; 634 635 /** 636 * \brief The maximum number of segments (1 page per segment) 637 * that can be mapped by a request. 638 * 639 * This value is negotiated via the XenStore. 640 */ 641 u_int max_request_segments; 642 643 /** 644 * \brief Maximum number of segments per request list. 645 * 646 * This value is derived from and will generally be larger than 647 * max_request_segments. 648 */ 649 u_int max_reqlist_segments; 650 651 /** 652 * The maximum size of any request to this back-end 653 * device. 654 * 655 * This value is negotiated via the XenStore. 656 */ 657 u_int max_request_size; 658 659 /** 660 * The maximum size of any request list. This is derived directly 661 * from max_reqlist_segments. 662 */ 663 u_int max_reqlist_size; 664 665 /** Various configuration and state bit flags. */ 666 xbb_flag_t flags; 667 668 /** Ring mapping and interrupt configuration data. */ 669 struct xbb_ring_config ring_config; 670 671 /** Runtime, cross-abi safe, structures for ring access. */ 672 blkif_back_rings_t rings; 673 674 /** IRQ mapping for the communication ring event channel. */ 675 xen_intr_handle_t xen_intr_handle; 676 677 /** 678 * \brief Backend access mode flags (e.g. write, or read-only). 679 * 680 * This value is passed to us by the front-end via the XenStore. 681 */ 682 char *dev_mode; 683 684 /** 685 * \brief Backend device type (e.g. "disk", "cdrom", "floppy"). 686 * 687 * This value is passed to us by the front-end via the XenStore. 688 * Currently unused. 689 */ 690 char *dev_type; 691 692 /** 693 * \brief Backend device/file identifier. 694 * 695 * This value is passed to us by the front-end via the XenStore. 696 * We expect this to be a POSIX path indicating the file or 697 * device to open. 698 */ 699 char *dev_name; 700 701 /** 702 * Vnode corresponding to the backend device node or file 703 * we are acessing. 704 */ 705 struct vnode *vn; 706 707 union xbb_backend_data backend; 708 709 /** The native sector size of the backend. 
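 *
 * For example (illustrative values): a 512-byte-sector backend has
 * sector_size_shift = 9 and a 4096-byte-sector backend has
 * sector_size_shift = 12; media_num_sectors below is then
 * media_size >> sector_size_shift.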
*/ 710 u_int sector_size; 711 712 /** log2 of sector_size. */ 713 u_int sector_size_shift; 714 715 /** Size in bytes of the backend device or file. */ 716 off_t media_size; 717 718 /** 719 * \brief media_size expressed in terms of the backend native 720 * sector size. 721 * 722 * (e.g. xbb->media_size >> xbb->sector_size_shift). 723 */ 724 uint64_t media_num_sectors; 725 726 /** 727 * \brief Array of memoized scatter gather data computed during the 728 * conversion of blkif ring requests to internal xbb_xen_req 729 * structures. 730 * 731 * Ring processing is serialized so we only need one of these. 732 */ 733 struct xbb_sg xbb_sgs[XBB_MAX_SEGMENTS_PER_REQLIST]; 734 735 /** 736 * Temporary grant table map used in xbb_dispatch_io(). When 737 * XBB_MAX_SEGMENTS_PER_REQLIST gets large, keeping this on the 738 * stack could cause a stack overflow. 739 */ 740 struct gnttab_map_grant_ref maps[XBB_MAX_SEGMENTS_PER_REQLIST]; 741 742 /** Mutex protecting per-instance data. */ 743 struct mtx lock; 744 745 /** 746 * Resource representing allocated physical address space 747 * associated with our per-instance kva region. 748 */ 749 struct resource *pseudo_phys_res; 750 751 /** Resource id for allocated physical address space. */ 752 int pseudo_phys_res_id; 753 754 /** 755 * I/O statistics from BlockBack dispatch down. These are 756 * coalesced requests, and we start them right before execution. 757 */ 758 struct devstat *xbb_stats; 759 760 /** 761 * I/O statistics coming into BlockBack. These are the requests as 762 * we get them from BlockFront. They are started as soon as we 763 * receive a request, and completed when the I/O is complete. 764 */ 765 struct devstat *xbb_stats_in; 766 767 /** Disable sending flush to the backend */ 768 int disable_flush; 769 770 /** Send a real flush for every N flush requests */ 771 int flush_interval; 772 773 /** Count of flush requests in the interval */ 774 int flush_count; 775 776 /** Don't coalesce requests if this is set */ 777 int no_coalesce_reqs; 778 779 /** Number of requests we have received */ 780 uint64_t reqs_received; 781 782 /** Number of requests we have completed*/ 783 uint64_t reqs_completed; 784 785 /** Number of requests we queued but not pushed*/ 786 uint64_t reqs_queued_for_completion; 787 788 /** Number of requests we completed with an error status*/ 789 uint64_t reqs_completed_with_error; 790 791 /** How many forced dispatches (i.e. without coalescing) have happend */ 792 uint64_t forced_dispatch; 793 794 /** How many normal dispatches have happend */ 795 uint64_t normal_dispatch; 796 797 /** How many total dispatches have happend */ 798 uint64_t total_dispatch; 799 800 /** How many times we have run out of KVA */ 801 uint64_t kva_shortages; 802 803 /** How many times we have run out of request structures */ 804 uint64_t request_shortages; 805}; 806 807/*---------------------------- Request Processing ----------------------------*/ 808/** 809 * Allocate an internal transaction tracking structure from the free pool. 810 * 811 * \param xbb Per-instance xbb configuration structure. 812 * 813 * \return On success, a pointer to the allocated xbb_xen_req structure. 814 * Otherwise NULL. 
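 *
 * Call pattern, as used by xbb_get_resources() later in this file
 * (the instance lock must be held across the call):
 *
 *	mtx_lock(&xbb->lock);
 *	nreq = xbb_get_req(xbb);
 *	if (nreq == NULL)
 *		... note the shortage and release any partial resources ...
 *	mtx_unlock(&xbb->lock);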
815 */ 816static inline struct xbb_xen_req * 817xbb_get_req(struct xbb_softc *xbb) 818{ 819 struct xbb_xen_req *req; 820 821 req = NULL; 822 823 mtx_assert(&xbb->lock, MA_OWNED); 824 825 if ((req = STAILQ_FIRST(&xbb->request_free_stailq)) != NULL) { 826 STAILQ_REMOVE_HEAD(&xbb->request_free_stailq, links); 827 xbb->active_request_count++; 828 } 829 830 return (req); 831} 832 833/** 834 * Return an allocated transaction tracking structure to the free pool. 835 * 836 * \param xbb Per-instance xbb configuration structure. 837 * \param req The request structure to free. 838 */ 839static inline void 840xbb_release_req(struct xbb_softc *xbb, struct xbb_xen_req *req) 841{ 842 mtx_assert(&xbb->lock, MA_OWNED); 843 844 STAILQ_INSERT_HEAD(&xbb->request_free_stailq, req, links); 845 xbb->active_request_count--; 846 847 KASSERT(xbb->active_request_count >= 0, 848 ("xbb_release_req: negative active count")); 849} 850 851/** 852 * Return an xbb_xen_req_list of allocated xbb_xen_reqs to the free pool. 853 * 854 * \param xbb Per-instance xbb configuration structure. 855 * \param req_list The list of requests to free. 856 * \param nreqs The number of items in the list. 857 */ 858static inline void 859xbb_release_reqs(struct xbb_softc *xbb, struct xbb_xen_req_list *req_list, 860 int nreqs) 861{ 862 mtx_assert(&xbb->lock, MA_OWNED); 863 864 STAILQ_CONCAT(&xbb->request_free_stailq, req_list); 865 xbb->active_request_count -= nreqs; 866 867 KASSERT(xbb->active_request_count >= 0, 868 ("xbb_release_reqs: negative active count")); 869} 870 871/** 872 * Given a page index and 512b sector offset within that page, 873 * calculate an offset into a request's kva region. 874 * 875 * \param reqlist The request structure whose kva region will be accessed. 876 * \param pagenr The page index used to compute the kva offset. 877 * \param sector The 512b sector index used to compute the page relative 878 * kva offset. 879 * 880 * \return The computed global KVA offset. 881 */ 882static inline uint8_t * 883xbb_reqlist_vaddr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector) 884{ 885 return (reqlist->kva + (PAGE_SIZE * pagenr) + (sector << 9)); 886} 887 888#ifdef XBB_USE_BOUNCE_BUFFERS 889/** 890 * Given a page index and 512b sector offset within that page, 891 * calculate an offset into a request's local bounce memory region. 892 * 893 * \param reqlist The request structure whose bounce region will be accessed. 894 * \param pagenr The page index used to compute the bounce offset. 895 * \param sector The 512b sector index used to compute the page relative 896 * bounce offset. 897 * 898 * \return The computed global bounce buffer address. 899 */ 900static inline uint8_t * 901xbb_reqlist_bounce_addr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector) 902{ 903 return (reqlist->bounce + (PAGE_SIZE * pagenr) + (sector << 9)); 904} 905#endif 906 907/** 908 * Given a page number and 512b sector offset within that page, 909 * calculate an offset into the request's memory region that the 910 * underlying backend device/file should use for I/O. 911 * 912 * \param reqlist The request structure whose I/O region will be accessed. 913 * \param pagenr The page index used to compute the I/O offset. 914 * \param sector The 512b sector index used to compute the page relative 915 * I/O offset. 916 * 917 * \return The computed global I/O address. 918 * 919 * Depending on configuration, this will either be a local bounce buffer 920 * or a pointer to the memory mapped in from the front-end domain for 921 * this request. 
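 *
 * For example (illustrative values, assuming 4KB pages): pagenr = 2
 * and sector = 3 resolve to an offset of 2 * PAGE_SIZE + (3 << 9) =
 * 8192 + 1536 = 9728 bytes from the start of the request list's
 * bounce or kva region.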
922 */ 923static inline uint8_t * 924xbb_reqlist_ioaddr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector) 925{ 926#ifdef XBB_USE_BOUNCE_BUFFERS 927 return (xbb_reqlist_bounce_addr(reqlist, pagenr, sector)); 928#else 929 return (xbb_reqlist_vaddr(reqlist, pagenr, sector)); 930#endif 931} 932 933/** 934 * Given a page index and 512b sector offset within that page, calculate 935 * an offset into the local psuedo-physical address space used to map a 936 * front-end's request data into a request. 937 * 938 * \param reqlist The request list structure whose pseudo-physical region 939 * will be accessed. 940 * \param pagenr The page index used to compute the pseudo-physical offset. 941 * \param sector The 512b sector index used to compute the page relative 942 * pseudo-physical offset. 943 * 944 * \return The computed global pseudo-phsyical address. 945 * 946 * Depending on configuration, this will either be a local bounce buffer 947 * or a pointer to the memory mapped in from the front-end domain for 948 * this request. 949 */ 950static inline uintptr_t 951xbb_get_gntaddr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector) 952{ 953 struct xbb_softc *xbb; 954 955 xbb = reqlist->xbb; 956 957 return ((uintptr_t)(xbb->gnt_base_addr + 958 (uintptr_t)(reqlist->kva - xbb->kva) + 959 (PAGE_SIZE * pagenr) + (sector << 9))); 960} 961 962/** 963 * Get Kernel Virtual Address space for mapping requests. 964 * 965 * \param xbb Per-instance xbb configuration structure. 966 * \param nr_pages Number of pages needed. 967 * \param check_only If set, check for free KVA but don't allocate it. 968 * \param have_lock If set, xbb lock is already held. 969 * 970 * \return On success, a pointer to the allocated KVA region. Otherwise NULL. 971 * 972 * Note: This should be unnecessary once we have either chaining or 973 * scatter/gather support for struct bio. At that point we'll be able to 974 * put multiple addresses and lengths in one bio/bio chain and won't need 975 * to map everything into one virtual segment. 976 */ 977static uint8_t * 978xbb_get_kva(struct xbb_softc *xbb, int nr_pages) 979{ 980 intptr_t first_clear; 981 intptr_t num_clear; 982 uint8_t *free_kva; 983 int i; 984 985 KASSERT(nr_pages != 0, ("xbb_get_kva of zero length")); 986 987 first_clear = 0; 988 free_kva = NULL; 989 990 mtx_lock(&xbb->lock); 991 992 /* 993 * Look for the first available page. If there are none, we're done. 994 */ 995 bit_ffc(xbb->kva_free, xbb->reqlist_kva_pages, &first_clear); 996 997 if (first_clear == -1) 998 goto bailout; 999 1000 /* 1001 * Starting at the first available page, look for consecutive free 1002 * pages that will satisfy the user's request. 1003 */ 1004 for (i = first_clear, num_clear = 0; i < xbb->reqlist_kva_pages; i++) { 1005 /* 1006 * If this is true, the page is used, so we have to reset 1007 * the number of clear pages and the first clear page 1008 * (since it pointed to a region with an insufficient number 1009 * of clear pages). 1010 */ 1011 if (bit_test(xbb->kva_free, i)) { 1012 num_clear = 0; 1013 first_clear = -1; 1014 continue; 1015 } 1016 1017 if (first_clear == -1) 1018 first_clear = i; 1019 1020 /* 1021 * If this is true, we've found a large enough free region 1022 * to satisfy the request. 
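 *
 * Illustrative example (assumed bitmap state): with nr_pages = 3 and
 * kva_free = 1 1 0 1 0 0 0 ..., the lone clear page at index 2 is
 * abandoned once the set bit at index 3 is seen, the run at indices
 * 4-6 then satisfies the request, those bits are set, and the KVA of
 * page 4 is returned.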
1023 */ 1024 if (++num_clear == nr_pages) { 1025 1026 bit_nset(xbb->kva_free, first_clear, 1027 first_clear + nr_pages - 1); 1028 1029 free_kva = xbb->kva + 1030 (uint8_t *)(first_clear * PAGE_SIZE); 1031 1032 KASSERT(free_kva >= (uint8_t *)xbb->kva && 1033 free_kva + (nr_pages * PAGE_SIZE) <= 1034 (uint8_t *)xbb->ring_config.va, 1035 ("Free KVA %p len %d out of range, " 1036 "kva = %#jx, ring VA = %#jx\n", free_kva, 1037 nr_pages * PAGE_SIZE, (uintmax_t)xbb->kva, 1038 (uintmax_t)xbb->ring_config.va)); 1039 break; 1040 } 1041 } 1042 1043bailout: 1044 1045 if (free_kva == NULL) { 1046 xbb->flags |= XBBF_RESOURCE_SHORTAGE; 1047 xbb->kva_shortages++; 1048 } 1049 1050 mtx_unlock(&xbb->lock); 1051 1052 return (free_kva); 1053} 1054 1055/** 1056 * Free allocated KVA. 1057 * 1058 * \param xbb Per-instance xbb configuration structure. 1059 * \param kva_ptr Pointer to allocated KVA region. 1060 * \param nr_pages Number of pages in the KVA region. 1061 */ 1062static void 1063xbb_free_kva(struct xbb_softc *xbb, uint8_t *kva_ptr, int nr_pages) 1064{ 1065 intptr_t start_page; 1066 1067 mtx_assert(&xbb->lock, MA_OWNED); 1068 1069 start_page = (intptr_t)(kva_ptr - xbb->kva) >> PAGE_SHIFT; 1070 bit_nclear(xbb->kva_free, start_page, start_page + nr_pages - 1); 1071 1072} 1073 1074/** 1075 * Unmap the front-end pages associated with this I/O request. 1076 * 1077 * \param req The request structure to unmap. 1078 */ 1079static void 1080xbb_unmap_reqlist(struct xbb_xen_reqlist *reqlist) 1081{ 1082 struct gnttab_unmap_grant_ref unmap[XBB_MAX_SEGMENTS_PER_REQLIST]; 1083 u_int i; 1084 u_int invcount; 1085 int error; 1086 1087 invcount = 0; 1088 for (i = 0; i < reqlist->nr_segments; i++) { 1089 1090 if (reqlist->gnt_handles[i] == GRANT_REF_INVALID) 1091 continue; 1092 1093 unmap[invcount].host_addr = xbb_get_gntaddr(reqlist, i, 0); 1094 unmap[invcount].dev_bus_addr = 0; 1095 unmap[invcount].handle = reqlist->gnt_handles[i]; 1096 reqlist->gnt_handles[i] = GRANT_REF_INVALID; 1097 invcount++; 1098 } 1099 1100 error = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, 1101 unmap, invcount); 1102 KASSERT(error == 0, ("Grant table operation failed")); 1103} 1104 1105/** 1106 * Allocate an internal transaction tracking structure from the free pool. 1107 * 1108 * \param xbb Per-instance xbb configuration structure. 1109 * 1110 * \return On success, a pointer to the allocated xbb_xen_reqlist structure. 1111 * Otherwise NULL. 1112 */ 1113static inline struct xbb_xen_reqlist * 1114xbb_get_reqlist(struct xbb_softc *xbb) 1115{ 1116 struct xbb_xen_reqlist *reqlist; 1117 1118 reqlist = NULL; 1119 1120 mtx_assert(&xbb->lock, MA_OWNED); 1121 1122 if ((reqlist = STAILQ_FIRST(&xbb->reqlist_free_stailq)) != NULL) { 1123 1124 STAILQ_REMOVE_HEAD(&xbb->reqlist_free_stailq, links); 1125 reqlist->flags = XBB_REQLIST_NONE; 1126 reqlist->kva = NULL; 1127 reqlist->status = BLKIF_RSP_OKAY; 1128 reqlist->residual_512b_sectors = 0; 1129 reqlist->num_children = 0; 1130 reqlist->nr_segments = 0; 1131 STAILQ_INIT(&reqlist->contig_req_list); 1132 } 1133 1134 return (reqlist); 1135} 1136 1137/** 1138 * Return an allocated transaction tracking structure to the free pool. 1139 * 1140 * \param xbb Per-instance xbb configuration structure. 1141 * \param req The request list structure to free. 1142 * \param wakeup If set, wakeup the work thread if freeing this reqlist 1143 * during a resource shortage condition. 
1144 */ 1145static inline void 1146xbb_release_reqlist(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist, 1147 int wakeup) 1148{ 1149 1150 mtx_assert(&xbb->lock, MA_OWNED); 1151 1152 if (wakeup) { 1153 wakeup = xbb->flags & XBBF_RESOURCE_SHORTAGE; 1154 xbb->flags &= ~XBBF_RESOURCE_SHORTAGE; 1155 } 1156 1157 if (reqlist->kva != NULL) 1158 xbb_free_kva(xbb, reqlist->kva, reqlist->nr_segments); 1159 1160 xbb_release_reqs(xbb, &reqlist->contig_req_list, reqlist->num_children); 1161 1162 STAILQ_INSERT_TAIL(&xbb->reqlist_free_stailq, reqlist, links); 1163 1164 if ((xbb->flags & XBBF_SHUTDOWN) != 0) { 1165 /* 1166 * Shutdown is in progress. See if we can 1167 * progress further now that one more request 1168 * has completed and been returned to the 1169 * free pool. 1170 */ 1171 xbb_shutdown(xbb); 1172 } 1173 1174 if (wakeup != 0) 1175 taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task); 1176} 1177 1178/** 1179 * Request resources and do basic request setup. 1180 * 1181 * \param xbb Per-instance xbb configuration structure. 1182 * \param reqlist Pointer to reqlist pointer. 1183 * \param ring_req Pointer to a block ring request. 1184 * \param ring_index The ring index of this request. 1185 * 1186 * \return 0 for success, non-zero for failure. 1187 */ 1188static int 1189xbb_get_resources(struct xbb_softc *xbb, struct xbb_xen_reqlist **reqlist, 1190 blkif_request_t *ring_req, RING_IDX ring_idx) 1191{ 1192 struct xbb_xen_reqlist *nreqlist; 1193 struct xbb_xen_req *nreq; 1194 1195 nreqlist = NULL; 1196 nreq = NULL; 1197 1198 mtx_lock(&xbb->lock); 1199 1200 /* 1201 * We don't allow new resources to be allocated if we're in the 1202 * process of shutting down. 1203 */ 1204 if ((xbb->flags & XBBF_SHUTDOWN) != 0) { 1205 mtx_unlock(&xbb->lock); 1206 return (1); 1207 } 1208 1209 /* 1210 * Allocate a reqlist if the caller doesn't have one already. 1211 */ 1212 if (*reqlist == NULL) { 1213 nreqlist = xbb_get_reqlist(xbb); 1214 if (nreqlist == NULL) 1215 goto bailout_error; 1216 } 1217 1218 /* We always allocate a request. */ 1219 nreq = xbb_get_req(xbb); 1220 if (nreq == NULL) 1221 goto bailout_error; 1222 1223 mtx_unlock(&xbb->lock); 1224 1225 if (*reqlist == NULL) { 1226 *reqlist = nreqlist; 1227 nreqlist->operation = ring_req->operation; 1228 nreqlist->starting_sector_number = ring_req->sector_number; 1229 STAILQ_INSERT_TAIL(&xbb->reqlist_pending_stailq, nreqlist, 1230 links); 1231 } 1232 1233 nreq->reqlist = *reqlist; 1234 nreq->req_ring_idx = ring_idx; 1235 nreq->id = ring_req->id; 1236 nreq->operation = ring_req->operation; 1237 1238 if (xbb->abi != BLKIF_PROTOCOL_NATIVE) { 1239 bcopy(ring_req, &nreq->ring_req_storage, sizeof(*ring_req)); 1240 nreq->ring_req = &nreq->ring_req_storage; 1241 } else { 1242 nreq->ring_req = ring_req; 1243 } 1244 1245 binuptime(&nreq->ds_t0); 1246 devstat_start_transaction(xbb->xbb_stats_in, &nreq->ds_t0); 1247 STAILQ_INSERT_TAIL(&(*reqlist)->contig_req_list, nreq, links); 1248 (*reqlist)->num_children++; 1249 (*reqlist)->nr_segments += ring_req->nr_segments; 1250 1251 return (0); 1252 1253bailout_error: 1254 1255 /* 1256 * We're out of resources, so set the shortage flag. The next time 1257 * a request is released, we'll try waking up the work thread to 1258 * see if we can allocate more resources. 
1259 */ 1260 xbb->flags |= XBBF_RESOURCE_SHORTAGE; 1261 xbb->request_shortages++; 1262 1263 if (nreq != NULL) 1264 xbb_release_req(xbb, nreq); 1265 1266 if (nreqlist != NULL) 1267 xbb_release_reqlist(xbb, nreqlist, /*wakeup*/ 0); 1268 1269 mtx_unlock(&xbb->lock); 1270 1271 return (1); 1272} 1273 1274/** 1275 * Create and queue a response to a blkif request. 1276 * 1277 * \param xbb Per-instance xbb configuration structure. 1278 * \param req The request structure to which to respond. 1279 * \param status The status code to report. See BLKIF_RSP_* 1280 * in sys/xen/interface/io/blkif.h. 1281 */ 1282static void 1283xbb_queue_response(struct xbb_softc *xbb, struct xbb_xen_req *req, int status) 1284{ 1285 blkif_response_t *resp; 1286 1287 /* 1288 * The mutex is required here, and should be held across this call 1289 * until after the subsequent call to xbb_push_responses(). This 1290 * is to guarantee that another context won't queue responses and 1291 * push them while we're active. 1292 * 1293 * That could lead to the other end being notified of responses 1294 * before the resources have been freed on this end. The other end 1295 * would then be able to queue additional I/O, and we may run out 1296 * of resources because we haven't freed them all yet. 1297 */ 1298 mtx_assert(&xbb->lock, MA_OWNED); 1299 1300 /* 1301 * Place on the response ring for the relevant domain. 1302 * For now, only the spacing between entries is different 1303 * in the different ABIs, not the response entry layout. 1304 */ 1305 switch (xbb->abi) { 1306 case BLKIF_PROTOCOL_NATIVE: 1307 resp = RING_GET_RESPONSE(&xbb->rings.native, 1308 xbb->rings.native.rsp_prod_pvt); 1309 break; 1310 case BLKIF_PROTOCOL_X86_32: 1311 resp = (blkif_response_t *) 1312 RING_GET_RESPONSE(&xbb->rings.x86_32, 1313 xbb->rings.x86_32.rsp_prod_pvt); 1314 break; 1315 case BLKIF_PROTOCOL_X86_64: 1316 resp = (blkif_response_t *) 1317 RING_GET_RESPONSE(&xbb->rings.x86_64, 1318 xbb->rings.x86_64.rsp_prod_pvt); 1319 break; 1320 default: 1321 panic("Unexpected blkif protocol ABI."); 1322 } 1323 1324 resp->id = req->id; 1325 resp->operation = req->operation; 1326 resp->status = status; 1327 1328 if (status != BLKIF_RSP_OKAY) 1329 xbb->reqs_completed_with_error++; 1330 1331 xbb->rings.common.rsp_prod_pvt += BLKIF_SEGS_TO_BLOCKS(req->nr_pages); 1332 1333 xbb->reqs_queued_for_completion++; 1334 1335} 1336 1337/** 1338 * Send queued responses to blkif requests. 1339 * 1340 * \param xbb Per-instance xbb configuration structure. 1341 * \param run_taskqueue Flag that is set to 1 if the taskqueue 1342 * should be run, 0 if it does not need to be run. 1343 * \param notify Flag that is set to 1 if the other end should be 1344 * notified via irq, 0 if the other end should not be 1345 * notified. 1346 */ 1347static void 1348xbb_push_responses(struct xbb_softc *xbb, int *run_taskqueue, int *notify) 1349{ 1350 int more_to_do; 1351 1352 /* 1353 * The mutex is required here. 1354 */ 1355 mtx_assert(&xbb->lock, MA_OWNED); 1356 1357 more_to_do = 0; 1358 1359 RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xbb->rings.common, *notify); 1360 1361 if (xbb->rings.common.rsp_prod_pvt == xbb->rings.common.req_cons) { 1362 1363 /* 1364 * Tail check for pending requests. Allows frontend to avoid 1365 * notifications if requests are already in flight (lower 1366 * overheads and promotes batching). 
1367 */ 1368 RING_FINAL_CHECK_FOR_REQUESTS(&xbb->rings.common, more_to_do); 1369 } else if (RING_HAS_UNCONSUMED_REQUESTS(&xbb->rings.common)) { 1370 1371 more_to_do = 1; 1372 } 1373 1374 xbb->reqs_completed += xbb->reqs_queued_for_completion; 1375 xbb->reqs_queued_for_completion = 0; 1376 1377 *run_taskqueue = more_to_do; 1378} 1379 1380/** 1381 * Complete a request list. 1382 * 1383 * \param xbb Per-instance xbb configuration structure. 1384 * \param reqlist Allocated internal request list structure. 1385 */ 1386static void 1387xbb_complete_reqlist(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist) 1388{ 1389 struct xbb_xen_req *nreq; 1390 off_t sectors_sent; 1391 int notify, run_taskqueue; 1392 1393 sectors_sent = 0; 1394 1395 if (reqlist->flags & XBB_REQLIST_MAPPED) 1396 xbb_unmap_reqlist(reqlist); 1397 1398 mtx_lock(&xbb->lock); 1399 1400 /* 1401 * All I/O is done, send the response. A lock is not necessary 1402 * to protect the request list, because all requests have 1403 * completed. Therefore this is the only context accessing this 1404 * reqlist right now. However, in order to make sure that no one 1405 * else queues responses onto the queue or pushes them to the other 1406 * side while we're active, we need to hold the lock across the 1407 * calls to xbb_queue_response() and xbb_push_responses(). 1408 */ 1409 STAILQ_FOREACH(nreq, &reqlist->contig_req_list, links) { 1410 off_t cur_sectors_sent; 1411 1412 /* Put this response on the ring, but don't push yet */ 1413 xbb_queue_response(xbb, nreq, reqlist->status); 1414 1415 /* We don't report bytes sent if there is an error. */ 1416 if (reqlist->status == BLKIF_RSP_OKAY) 1417 cur_sectors_sent = nreq->nr_512b_sectors; 1418 else 1419 cur_sectors_sent = 0; 1420 1421 sectors_sent += cur_sectors_sent; 1422 1423 devstat_end_transaction(xbb->xbb_stats_in, 1424 /*bytes*/cur_sectors_sent << 9, 1425 reqlist->ds_tag_type, 1426 reqlist->ds_trans_type, 1427 /*now*/NULL, 1428 /*then*/&nreq->ds_t0); 1429 } 1430 1431 /* 1432 * Take out any sectors not sent. If we wind up negative (which 1433 * might happen if an error is reported as well as a residual), just 1434 * report 0 sectors sent. 1435 */ 1436 sectors_sent -= reqlist->residual_512b_sectors; 1437 if (sectors_sent < 0) 1438 sectors_sent = 0; 1439 1440 devstat_end_transaction(xbb->xbb_stats, 1441 /*bytes*/ sectors_sent << 9, 1442 reqlist->ds_tag_type, 1443 reqlist->ds_trans_type, 1444 /*now*/NULL, 1445 /*then*/&reqlist->ds_t0); 1446 1447 xbb_release_reqlist(xbb, reqlist, /*wakeup*/ 1); 1448 1449 xbb_push_responses(xbb, &run_taskqueue, ¬ify); 1450 1451 mtx_unlock(&xbb->lock); 1452 1453 if (run_taskqueue) 1454 taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task); 1455 1456 if (notify) 1457 xen_intr_signal(xbb->xen_intr_handle); 1458} 1459 1460/** 1461 * Completion handler for buffer I/O requests issued by the device 1462 * backend driver. 1463 * 1464 * \param bio The buffer I/O request on which to perform completion 1465 * processing. 1466 */ 1467static void 1468xbb_bio_done(struct bio *bio) 1469{ 1470 struct xbb_softc *xbb; 1471 struct xbb_xen_reqlist *reqlist; 1472 1473 reqlist = bio->bio_caller1; 1474 xbb = reqlist->xbb; 1475 1476 reqlist->residual_512b_sectors += bio->bio_resid >> 9; 1477 1478 /* 1479 * This is a bit imprecise. With aggregated I/O a single 1480 * request list can contain multiple front-end requests and 1481 * a multiple bios may point to a single request. 
By carefully 1482 * walking the request list, we could map residuals and errors 1483 * back to the original front-end request, but the interface 1484 * isn't sufficiently rich for us to properly report the error. 1485 * So, we just treat the entire request list as having failed if an 1486 * error occurs on any part. And, if an error occurs, we treat 1487 * the amount of data transferred as 0. 1488 * 1489 * For residuals, we report it on the overall aggregated device, 1490 * but not on the individual requests, since we don't currently 1491 * do the work to determine which front-end request to which the 1492 * residual applies. 1493 */ 1494 if (bio->bio_error) { 1495 DPRINTF("BIO returned error %d for operation on device %s\n", 1496 bio->bio_error, xbb->dev_name); 1497 reqlist->status = BLKIF_RSP_ERROR; 1498 1499 if (bio->bio_error == ENXIO 1500 && xenbus_get_state(xbb->dev) == XenbusStateConnected) { 1501 1502 /* 1503 * Backend device has disappeared. Signal the 1504 * front-end that we (the device proxy) want to 1505 * go away. 1506 */ 1507 xenbus_set_state(xbb->dev, XenbusStateClosing); 1508 } 1509 } 1510 1511#ifdef XBB_USE_BOUNCE_BUFFERS 1512 if (bio->bio_cmd == BIO_READ) { 1513 vm_offset_t kva_offset; 1514 1515 kva_offset = (vm_offset_t)bio->bio_data 1516 - (vm_offset_t)reqlist->bounce; 1517 memcpy((uint8_t *)reqlist->kva + kva_offset, 1518 bio->bio_data, bio->bio_bcount); 1519 } 1520#endif /* XBB_USE_BOUNCE_BUFFERS */ 1521 1522 /* 1523 * Decrement the pending count for the request list. When we're 1524 * done with the requests, send status back for all of them. 1525 */ 1526 if (atomic_fetchadd_int(&reqlist->pendcnt, -1) == 1) 1527 xbb_complete_reqlist(xbb, reqlist); 1528 1529 g_destroy_bio(bio); 1530} 1531 1532/** 1533 * Parse a blkif request into an internal request structure and send 1534 * it to the backend for processing. 1535 * 1536 * \param xbb Per-instance xbb configuration structure. 1537 * \param reqlist Allocated internal request list structure. 1538 * 1539 * \return On success, 0. For resource shortages, non-zero. 1540 * 1541 * This routine performs the backend common aspects of request parsing 1542 * including compiling an internal request structure, parsing the S/G 1543 * list and any secondary ring requests in which they may reside, and 1544 * the mapping of front-end I/O pages into our domain. 1545 */ 1546static int 1547xbb_dispatch_io(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist) 1548{ 1549 struct xbb_sg *xbb_sg; 1550 struct gnttab_map_grant_ref *map; 1551 struct blkif_request_segment *sg; 1552 struct blkif_request_segment *last_block_sg; 1553 struct xbb_xen_req *nreq; 1554 u_int nseg; 1555 u_int seg_idx; 1556 u_int block_segs; 1557 int nr_sects; 1558 int total_sects; 1559 int operation; 1560 uint8_t bio_flags; 1561 int error; 1562 1563 reqlist->ds_tag_type = DEVSTAT_TAG_SIMPLE; 1564 bio_flags = 0; 1565 total_sects = 0; 1566 nr_sects = 0; 1567 1568 /* 1569 * First determine whether we have enough free KVA to satisfy this 1570 * request list. If not, tell xbb_run_queue() so it can go to 1571 * sleep until we have more KVA. 1572 */ 1573 reqlist->kva = NULL; 1574 if (reqlist->nr_segments != 0) { 1575 reqlist->kva = xbb_get_kva(xbb, reqlist->nr_segments); 1576 if (reqlist->kva == NULL) { 1577 /* 1578 * If we're out of KVA, return ENOMEM. 
1579 */ 1580 return (ENOMEM); 1581 } 1582 } 1583 1584 binuptime(&reqlist->ds_t0); 1585 devstat_start_transaction(xbb->xbb_stats, &reqlist->ds_t0); 1586 1587 switch (reqlist->operation) { 1588 case BLKIF_OP_WRITE_BARRIER: 1589 bio_flags |= BIO_ORDERED; 1590 reqlist->ds_tag_type = DEVSTAT_TAG_ORDERED; 1591 /* FALLTHROUGH */ 1592 case BLKIF_OP_WRITE: 1593 operation = BIO_WRITE; 1594 reqlist->ds_trans_type = DEVSTAT_WRITE; 1595 if ((xbb->flags & XBBF_READ_ONLY) != 0) { 1596 DPRINTF("Attempt to write to read only device %s\n", 1597 xbb->dev_name); 1598 reqlist->status = BLKIF_RSP_ERROR; 1599 goto send_response; 1600 } 1601 break; 1602 case BLKIF_OP_READ: 1603 operation = BIO_READ; 1604 reqlist->ds_trans_type = DEVSTAT_READ; 1605 break; 1606 case BLKIF_OP_FLUSH_DISKCACHE: 1607 /* 1608 * If this is true, the user has requested that we disable 1609 * flush support. So we just complete the requests 1610 * successfully. 1611 */ 1612 if (xbb->disable_flush != 0) { 1613 goto send_response; 1614 } 1615 1616 /* 1617 * The user has requested that we only send a real flush 1618 * for every N flush requests. So keep count, and either 1619 * complete the request immediately or queue it for the 1620 * backend. 1621 */ 1622 if (xbb->flush_interval != 0) { 1623 if (++(xbb->flush_count) < xbb->flush_interval) { 1624 goto send_response; 1625 } else 1626 xbb->flush_count = 0; 1627 } 1628 1629 operation = BIO_FLUSH; 1630 reqlist->ds_tag_type = DEVSTAT_TAG_ORDERED; 1631 reqlist->ds_trans_type = DEVSTAT_NO_DATA; 1632 goto do_dispatch; 1633 /*NOTREACHED*/ 1634 default: 1635 DPRINTF("error: unknown block io operation [%d]\n", 1636 reqlist->operation); 1637 reqlist->status = BLKIF_RSP_ERROR; 1638 goto send_response; 1639 } 1640 1641 reqlist->xbb = xbb; 1642 xbb_sg = xbb->xbb_sgs; 1643 map = xbb->maps; 1644 seg_idx = 0; 1645 1646 STAILQ_FOREACH(nreq, &reqlist->contig_req_list, links) { 1647 blkif_request_t *ring_req; 1648 RING_IDX req_ring_idx; 1649 u_int req_seg_idx; 1650 1651 ring_req = nreq->ring_req; 1652 req_ring_idx = nreq->req_ring_idx; 1653 nr_sects = 0; 1654 nseg = ring_req->nr_segments; 1655 nreq->nr_pages = nseg; 1656 nreq->nr_512b_sectors = 0; 1657 req_seg_idx = 0; 1658 sg = NULL; 1659 1660 /* Check that number of segments is sane. 
*/ 1661 if (__predict_false(nseg == 0) 1662 || __predict_false(nseg > xbb->max_request_segments)) { 1663 DPRINTF("Bad number of segments in request (%d)\n", 1664 nseg); 1665 reqlist->status = BLKIF_RSP_ERROR; 1666 goto send_response; 1667 } 1668 1669 block_segs = MIN(nreq->nr_pages, 1670 BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK); 1671 sg = ring_req->seg; 1672 last_block_sg = sg + block_segs; 1673 while (1) { 1674 1675 while (sg < last_block_sg) { 1676 KASSERT(seg_idx < 1677 XBB_MAX_SEGMENTS_PER_REQLIST, 1678 ("seg_idx %d is too large, max " 1679 "segs %d\n", seg_idx, 1680 XBB_MAX_SEGMENTS_PER_REQLIST)); 1681 1682 xbb_sg->first_sect = sg->first_sect; 1683 xbb_sg->last_sect = sg->last_sect; 1684 xbb_sg->nsect = 1685 (int8_t)(sg->last_sect - 1686 sg->first_sect + 1); 1687 1688 if ((sg->last_sect >= (PAGE_SIZE >> 9)) 1689 || (xbb_sg->nsect <= 0)) { 1690 reqlist->status = BLKIF_RSP_ERROR; 1691 goto send_response; 1692 } 1693 1694 nr_sects += xbb_sg->nsect; 1695 map->host_addr = xbb_get_gntaddr(reqlist, 1696 seg_idx, /*sector*/0); 1697 KASSERT(map->host_addr + PAGE_SIZE <= 1698 xbb->ring_config.gnt_addr, 1699 ("Host address %#jx len %d overlaps " 1700 "ring address %#jx\n", 1701 (uintmax_t)map->host_addr, PAGE_SIZE, 1702 (uintmax_t)xbb->ring_config.gnt_addr)); 1703 1704 map->flags = GNTMAP_host_map; 1705 map->ref = sg->gref; 1706 map->dom = xbb->otherend_id; 1707 if (operation == BIO_WRITE) 1708 map->flags |= GNTMAP_readonly; 1709 sg++; 1710 map++; 1711 xbb_sg++; 1712 seg_idx++; 1713 req_seg_idx++; 1714 } 1715 1716 block_segs = MIN(nseg - req_seg_idx, 1717 BLKIF_MAX_SEGMENTS_PER_SEGMENT_BLOCK); 1718 if (block_segs == 0) 1719 break; 1720 1721 /* 1722 * Fetch the next request block full of SG elements. 1723 * For now, only the spacing between entries is 1724 * different in the different ABIs, not the sg entry 1725 * layout. 
1726 */ 1727 req_ring_idx++; 1728 switch (xbb->abi) { 1729 case BLKIF_PROTOCOL_NATIVE: 1730 sg = BLKRING_GET_SEG_BLOCK(&xbb->rings.native, 1731 req_ring_idx); 1732 break; 1733 case BLKIF_PROTOCOL_X86_32: 1734 { 1735 sg = BLKRING_GET_SEG_BLOCK(&xbb->rings.x86_32, 1736 req_ring_idx); 1737 break; 1738 } 1739 case BLKIF_PROTOCOL_X86_64: 1740 { 1741 sg = BLKRING_GET_SEG_BLOCK(&xbb->rings.x86_64, 1742 req_ring_idx); 1743 break; 1744 } 1745 default: 1746 panic("Unexpected blkif protocol ABI."); 1747 /* NOTREACHED */ 1748 } 1749 last_block_sg = sg + block_segs; 1750 } 1751 1752 /* Convert to the disk's sector size */ 1753 nreq->nr_512b_sectors = nr_sects; 1754 nr_sects = (nr_sects << 9) >> xbb->sector_size_shift; 1755 total_sects += nr_sects; 1756 1757 if ((nreq->nr_512b_sectors & 1758 ((xbb->sector_size >> 9) - 1)) != 0) { 1759 device_printf(xbb->dev, "%s: I/O size (%d) is not " 1760 "a multiple of the backing store sector " 1761 "size (%d)\n", __func__, 1762 nreq->nr_512b_sectors << 9, 1763 xbb->sector_size); 1764 reqlist->status = BLKIF_RSP_ERROR; 1765 goto send_response; 1766 } 1767 } 1768 1769 error = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, 1770 xbb->maps, reqlist->nr_segments); 1771 if (error != 0) 1772 panic("Grant table operation failed (%d)", error); 1773 1774 reqlist->flags |= XBB_REQLIST_MAPPED; 1775 1776 for (seg_idx = 0, map = xbb->maps; seg_idx < reqlist->nr_segments; 1777 seg_idx++, map++){ 1778 1779 if (__predict_false(map->status != 0)) { 1780 DPRINTF("invalid buffer -- could not remap " 1781 "it (%d)\n", map->status); 1782 DPRINTF("Mapping(%d): Host Addr 0x%lx, flags " 1783 "0x%x ref 0x%x, dom %d\n", seg_idx, 1784 map->host_addr, map->flags, map->ref, 1785 map->dom); 1786 reqlist->status = BLKIF_RSP_ERROR; 1787 goto send_response; 1788 } 1789 1790 reqlist->gnt_handles[seg_idx] = map->handle; 1791 } 1792 if (reqlist->starting_sector_number + total_sects > 1793 xbb->media_num_sectors) { 1794 1795 DPRINTF("%s of [%" PRIu64 ",%" PRIu64 "] " 1796 "extends past end of device %s\n", 1797 operation == BIO_READ ? "read" : "write", 1798 reqlist->starting_sector_number, 1799 reqlist->starting_sector_number + total_sects, 1800 xbb->dev_name); 1801 reqlist->status = BLKIF_RSP_ERROR; 1802 goto send_response; 1803 } 1804 1805do_dispatch: 1806 1807 error = xbb->dispatch_io(xbb, 1808 reqlist, 1809 operation, 1810 bio_flags); 1811 1812 if (error != 0) { 1813 reqlist->status = BLKIF_RSP_ERROR; 1814 goto send_response; 1815 } 1816 1817 return (0); 1818 1819send_response: 1820 1821 xbb_complete_reqlist(xbb, reqlist); 1822 1823 return (0); 1824} 1825 1826static __inline int 1827xbb_count_sects(blkif_request_t *ring_req) 1828{ 1829 int i; 1830 int cur_size = 0; 1831 1832 for (i = 0; i < ring_req->nr_segments; i++) { 1833 int nsect; 1834 1835 nsect = (int8_t)(ring_req->seg[i].last_sect - 1836 ring_req->seg[i].first_sect + 1); 1837 if (nsect <= 0) 1838 break; 1839 1840 cur_size += nsect; 1841 } 1842 1843 return (cur_size); 1844} 1845 1846/** 1847 * Process incoming requests from the shared communication ring in response 1848 * to a signal on the ring's event channel. 1849 * 1850 * \param context Callback argument registerd during task initialization - 1851 * the xbb_softc for this instance. 1852 * \param pending The number of taskqueue_enqueue events that have 1853 * occurred since this handler was last run. 
1854 */ 1855static void 1856xbb_run_queue(void *context, int pending) 1857{ 1858 struct xbb_softc *xbb; 1859 blkif_back_rings_t *rings; 1860 RING_IDX rp; 1861 uint64_t cur_sector; 1862 int cur_operation; 1863 struct xbb_xen_reqlist *reqlist; 1864 1865 1866 xbb = (struct xbb_softc *)context; 1867 rings = &xbb->rings; 1868 1869 /* 1870 * Work gather and dispatch loop. Note that we have a bias here 1871 * towards gathering I/O sent by blockfront. We first gather up 1872 * everything in the ring, as long as we have resources. Then we 1873 * dispatch one request, and then attempt to gather up any 1874 * additional requests that have come in while we were dispatching 1875 * the request. 1876 * 1877 * This allows us to get a clearer picture (via devstat) of how 1878 * many requests blockfront is queueing to us at any given time. 1879 */ 1880 for (;;) { 1881 int retval; 1882 1883 /* 1884 * Initialize reqlist to the last element in the pending 1885 * queue, if there is one. This allows us to add more 1886 * requests to that request list, if we have room. 1887 */ 1888 reqlist = STAILQ_LAST(&xbb->reqlist_pending_stailq, 1889 xbb_xen_reqlist, links); 1890 if (reqlist != NULL) { 1891 cur_sector = reqlist->next_contig_sector; 1892 cur_operation = reqlist->operation; 1893 } else { 1894 cur_operation = 0; 1895 cur_sector = 0; 1896 } 1897 1898 /* 1899 * Cache req_prod to avoid accessing a cache line shared 1900 * with the frontend. 1901 */ 1902 rp = rings->common.sring->req_prod; 1903 1904 /* Ensure we see queued requests up to 'rp'. */ 1905 rmb(); 1906 1907 /** 1908 * Run so long as there is work to consume and the generation 1909 * of a response will not overflow the ring. 1910 * 1911 * @note There's a 1 to 1 relationship between requests and 1912 * responses, so an overflow should never occur. This 1913 * test is to protect our domain from digesting bogus 1914 * data. Shouldn't we log this? 1915 */ 1916 while (rings->common.req_cons != rp 1917 && RING_REQUEST_CONS_OVERFLOW(&rings->common, 1918 rings->common.req_cons) == 0){ 1919 blkif_request_t ring_req_storage; 1920 blkif_request_t *ring_req; 1921 int cur_size; 1922 1923 switch (xbb->abi) { 1924 case BLKIF_PROTOCOL_NATIVE: 1925 ring_req = RING_GET_REQUEST(&xbb->rings.native, 1926 rings->common.req_cons); 1927 break; 1928 case BLKIF_PROTOCOL_X86_32: 1929 { 1930 struct blkif_x86_32_request *ring_req32; 1931 1932 ring_req32 = RING_GET_REQUEST( 1933 &xbb->rings.x86_32, rings->common.req_cons); 1934 blkif_get_x86_32_req(&ring_req_storage, 1935 ring_req32); 1936 ring_req = &ring_req_storage; 1937 break; 1938 } 1939 case BLKIF_PROTOCOL_X86_64: 1940 { 1941 struct blkif_x86_64_request *ring_req64; 1942 1943 ring_req64 =RING_GET_REQUEST(&xbb->rings.x86_64, 1944 rings->common.req_cons); 1945 blkif_get_x86_64_req(&ring_req_storage, 1946 ring_req64); 1947 ring_req = &ring_req_storage; 1948 break; 1949 } 1950 default: 1951 panic("Unexpected blkif protocol ABI."); 1952 /* NOTREACHED */ 1953 } 1954 1955 /* 1956 * Check for situations that would require closing 1957 * off this I/O for further coalescing: 1958 * - Coalescing is turned off. 1959 * - Current I/O is out of sequence with the previous 1960 * I/O. 1961 * - Coalesced I/O would be too large. 
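 *
 * For example (illustrative sector numbers): an 8-sector read at
 * sector 100 leaves next_contig_sector = 108, so a following
 * 8-sector read at sector 108 of the same operation is appended to
 * the same request list, while a read at sector 300, a write, or a
 * request that would push the list past max_reqlist_segments starts
 * a new list.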
1962 */ 1963 if ((reqlist != NULL) 1964 && ((xbb->no_coalesce_reqs != 0) 1965 || ((xbb->no_coalesce_reqs == 0) 1966 && ((ring_req->sector_number != cur_sector) 1967 || (ring_req->operation != cur_operation) 1968 || ((ring_req->nr_segments + reqlist->nr_segments) > 1969 xbb->max_reqlist_segments))))) { 1970 reqlist = NULL; 1971 } 1972 1973 /* 1974 * Grab and check for all resources in one shot. 1975 * If we can't get all of the resources we need, 1976 * the shortage is noted and the thread will get 1977 * woken up when more resources are available. 1978 */ 1979 retval = xbb_get_resources(xbb, &reqlist, ring_req, 1980 xbb->rings.common.req_cons); 1981 1982 if (retval != 0) { 1983 /* 1984 * Resource shortage has been recorded. 1985 * We'll be scheduled to run once a request 1986 * object frees up due to a completion. 1987 */ 1988 break; 1989 } 1990 1991 /* 1992 * Signify that we can overwrite this request with 1993 * a response by incrementing our consumer index. 1994 * The response won't be generated until after 1995 * we've already consumed all necessary data out 1996 * of the version of the request in the ring buffer 1997 * (for native mode). We must update the consumer 1998 * index before issueing back-end I/O so there is 1999 * no possibility that it will complete and a 2000 * response be generated before we make room in 2001 * the queue for that response. 2002 */ 2003 xbb->rings.common.req_cons += 2004 BLKIF_SEGS_TO_BLOCKS(ring_req->nr_segments); 2005 xbb->reqs_received++; 2006 2007 cur_size = xbb_count_sects(ring_req); 2008 cur_sector = ring_req->sector_number + cur_size; 2009 reqlist->next_contig_sector = cur_sector; 2010 cur_operation = ring_req->operation; 2011 } 2012 2013 /* Check for I/O to dispatch */ 2014 reqlist = STAILQ_FIRST(&xbb->reqlist_pending_stailq); 2015 if (reqlist == NULL) { 2016 /* 2017 * We're out of work to do, put the task queue to 2018 * sleep. 2019 */ 2020 break; 2021 } 2022 2023 /* 2024 * Grab the first request off the queue and attempt 2025 * to dispatch it. 2026 */ 2027 STAILQ_REMOVE_HEAD(&xbb->reqlist_pending_stailq, links); 2028 2029 retval = xbb_dispatch_io(xbb, reqlist); 2030 if (retval != 0) { 2031 /* 2032 * xbb_dispatch_io() returns non-zero only when 2033 * there is a resource shortage. If that's the 2034 * case, re-queue this request on the head of the 2035 * queue, and go to sleep until we have more 2036 * resources. 2037 */ 2038 STAILQ_INSERT_HEAD(&xbb->reqlist_pending_stailq, 2039 reqlist, links); 2040 break; 2041 } else { 2042 /* 2043 * If we still have anything on the queue after 2044 * removing the head entry, that is because we 2045 * met one of the criteria to create a new 2046 * request list (outlined above), and we'll call 2047 * that a forced dispatch for statistical purposes. 2048 * 2049 * Otherwise, if there is only one element on the 2050 * queue, we coalesced everything available on 2051 * the ring and we'll call that a normal dispatch. 2052 */ 2053 reqlist = STAILQ_FIRST(&xbb->reqlist_pending_stailq); 2054 2055 if (reqlist != NULL) 2056 xbb->forced_dispatch++; 2057 else 2058 xbb->normal_dispatch++; 2059 2060 xbb->total_dispatch++; 2061 } 2062 } 2063} 2064 2065/** 2066 * Interrupt handler bound to the shared ring's event channel. 2067 * 2068 * \param arg Callback argument registerd during event channel 2069 * binding - the xbb_softc for this instance. 2070 */ 2071static int 2072xbb_filter(void *arg) 2073{ 2074 struct xbb_softc *xbb; 2075 2076 /* Defer to taskqueue thread. 
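	 * Filter (interrupt) routines must not sleep, so no request
	 * processing or backend I/O is attempted here; xbb_run_queue()
	 * does the real work from the io_taskqueue thread.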
*/ 2077 xbb = (struct xbb_softc *)arg; 2078 taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task); 2079 2080 return (FILTER_HANDLED); 2081} 2082 2083SDT_PROVIDER_DEFINE(xbb); 2084SDT_PROBE_DEFINE1(xbb, kernel, xbb_dispatch_dev, flush, "int"); 2085SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_dev, read, "int", "uint64_t", 2086 "uint64_t"); 2087SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_dev, write, "int", 2088 "uint64_t", "uint64_t"); 2089 2090/*----------------------------- Backend Handlers -----------------------------*/ 2091/** 2092 * Backend handler for character device access. 2093 * 2094 * \param xbb Per-instance xbb configuration structure. 2095 * \param reqlist Allocated internal request list structure. 2096 * \param operation BIO_* I/O operation code. 2097 * \param bio_flags Additional bio_flag data to pass to any generated 2098 * bios (e.g. BIO_ORDERED).. 2099 * 2100 * \return 0 for success, errno codes for failure. 2101 */ 2102static int 2103xbb_dispatch_dev(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist, 2104 int operation, int bio_flags) 2105{ 2106 struct xbb_dev_data *dev_data; 2107 struct bio *bios[XBB_MAX_SEGMENTS_PER_REQLIST]; 2108 off_t bio_offset; 2109 struct bio *bio; 2110 struct xbb_sg *xbb_sg; 2111 u_int nbio; 2112 u_int bio_idx; 2113 u_int nseg; 2114 u_int seg_idx; 2115 int error; 2116 2117 dev_data = &xbb->backend.dev; 2118 bio_offset = (off_t)reqlist->starting_sector_number 2119 << xbb->sector_size_shift; 2120 error = 0; 2121 nbio = 0; 2122 bio_idx = 0; 2123 2124 if (operation == BIO_FLUSH) { 2125 bio = g_new_bio(); 2126 if (__predict_false(bio == NULL)) { 2127 DPRINTF("Unable to allocate bio for BIO_FLUSH\n"); 2128 error = ENOMEM; 2129 return (error); 2130 } 2131 2132 bio->bio_cmd = BIO_FLUSH; 2133 bio->bio_flags |= BIO_ORDERED; 2134 bio->bio_dev = dev_data->cdev; 2135 bio->bio_offset = 0; 2136 bio->bio_data = 0; 2137 bio->bio_done = xbb_bio_done; 2138 bio->bio_caller1 = reqlist; 2139 bio->bio_pblkno = 0; 2140 2141 reqlist->pendcnt = 1; 2142 2143 SDT_PROBE1(xbb, kernel, xbb_dispatch_dev, flush, 2144 device_get_unit(xbb->dev)); 2145 2146 (*dev_data->csw->d_strategy)(bio); 2147 2148 return (0); 2149 } 2150 2151 xbb_sg = xbb->xbb_sgs; 2152 bio = NULL; 2153 nseg = reqlist->nr_segments; 2154 2155 for (seg_idx = 0; seg_idx < nseg; seg_idx++, xbb_sg++) { 2156 2157 /* 2158 * KVA will not be contiguous, so any additional 2159 * I/O will need to be represented in a new bio. 2160 */ 2161 if ((bio != NULL) 2162 && (xbb_sg->first_sect != 0)) { 2163 if ((bio->bio_length & (xbb->sector_size - 1)) != 0) { 2164 printf("%s: Discontiguous I/O request " 2165 "from domain %d ends on " 2166 "non-sector boundary\n", 2167 __func__, xbb->otherend_id); 2168 error = EINVAL; 2169 goto fail_free_bios; 2170 } 2171 bio = NULL; 2172 } 2173 2174 if (bio == NULL) { 2175 /* 2176 * Make sure that the start of this bio is 2177 * aligned to a device sector. 
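			 * For example, with a 4096-byte sector size any
			 * bio_offset that is not a multiple of 4096 (low 12
			 * bits non-zero) is rejected with EINVAL below.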
2178 */ 2179 if ((bio_offset & (xbb->sector_size - 1)) != 0){ 2180 printf("%s: Misaligned I/O request " 2181 "from domain %d\n", __func__, 2182 xbb->otherend_id); 2183 error = EINVAL; 2184 goto fail_free_bios; 2185 } 2186 2187 bio = bios[nbio++] = g_new_bio(); 2188 if (__predict_false(bio == NULL)) { 2189 error = ENOMEM; 2190 goto fail_free_bios; 2191 } 2192 bio->bio_cmd = operation; 2193 bio->bio_flags |= bio_flags; 2194 bio->bio_dev = dev_data->cdev; 2195 bio->bio_offset = bio_offset; 2196 bio->bio_data = xbb_reqlist_ioaddr(reqlist, seg_idx, 2197 xbb_sg->first_sect); 2198 bio->bio_done = xbb_bio_done; 2199 bio->bio_caller1 = reqlist; 2200 bio->bio_pblkno = bio_offset >> xbb->sector_size_shift; 2201 } 2202 2203 bio->bio_length += xbb_sg->nsect << 9; 2204 bio->bio_bcount = bio->bio_length; 2205 bio_offset += xbb_sg->nsect << 9; 2206 2207 if (xbb_sg->last_sect != (PAGE_SIZE - 512) >> 9) { 2208 2209 if ((bio->bio_length & (xbb->sector_size - 1)) != 0) { 2210 printf("%s: Discontiguous I/O request " 2211 "from domain %d ends on " 2212 "non-sector boundary\n", 2213 __func__, xbb->otherend_id); 2214 error = EINVAL; 2215 goto fail_free_bios; 2216 } 2217 /* 2218 * KVA will not be contiguous, so any additional 2219 * I/O will need to be represented in a new bio. 2220 */ 2221 bio = NULL; 2222 } 2223 } 2224 2225 reqlist->pendcnt = nbio; 2226 2227 for (bio_idx = 0; bio_idx < nbio; bio_idx++) 2228 { 2229#ifdef XBB_USE_BOUNCE_BUFFERS 2230 vm_offset_t kva_offset; 2231 2232 kva_offset = (vm_offset_t)bios[bio_idx]->bio_data 2233 - (vm_offset_t)reqlist->bounce; 2234 if (operation == BIO_WRITE) { 2235 memcpy(bios[bio_idx]->bio_data, 2236 (uint8_t *)reqlist->kva + kva_offset, 2237 bios[bio_idx]->bio_bcount); 2238 } 2239#endif 2240 if (operation == BIO_READ) { 2241 SDT_PROBE3(xbb, kernel, xbb_dispatch_dev, read, 2242 device_get_unit(xbb->dev), 2243 bios[bio_idx]->bio_offset, 2244 bios[bio_idx]->bio_length); 2245 } else if (operation == BIO_WRITE) { 2246 SDT_PROBE3(xbb, kernel, xbb_dispatch_dev, write, 2247 device_get_unit(xbb->dev), 2248 bios[bio_idx]->bio_offset, 2249 bios[bio_idx]->bio_length); 2250 } 2251 (*dev_data->csw->d_strategy)(bios[bio_idx]); 2252 } 2253 2254 return (error); 2255 2256fail_free_bios: 2257 for (bio_idx = 0; bio_idx < (nbio-1); bio_idx++) 2258 g_destroy_bio(bios[bio_idx]); 2259 2260 return (error); 2261} 2262 2263SDT_PROBE_DEFINE1(xbb, kernel, xbb_dispatch_file, flush, "int"); 2264SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_file, read, "int", "uint64_t", 2265 "uint64_t"); 2266SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_file, write, "int", 2267 "uint64_t", "uint64_t"); 2268 2269/** 2270 * Backend handler for file access. 2271 * 2272 * \param xbb Per-instance xbb configuration structure. 2273 * \param reqlist Allocated internal request list. 2274 * \param operation BIO_* I/O operation code. 2275 * \param flags Additional bio_flag data to pass to any generated bios 2276 * (e.g. BIO_ORDERED).. 2277 * 2278 * \return 0 for success, errno codes for failure. 
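 *
 * \note The request list segments are collapsed into a kernel uio
 *       (UIO_SYSSPACE) whose iovecs point into the reqlist KVA and are
 *       then handed to the backing vnode, conceptually:
 *
 *           xuio.uio_iov    = file_data->xiovecs;
 *           xuio.uio_segflg = UIO_SYSSPACE;
 *           error = VOP_READ(xbb->vn, &xuio, ioflags, file_data->cred);
 *
 *       This is a condensed sketch of the code below; "ioflags" merely
 *       stands in for the IO_DIRECT/IO_SYNC selection made there.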
2279 */ 2280static int 2281xbb_dispatch_file(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist, 2282 int operation, int flags) 2283{ 2284 struct xbb_file_data *file_data; 2285 u_int seg_idx; 2286 u_int nseg; 2287 off_t sectors_sent; 2288 struct uio xuio; 2289 struct xbb_sg *xbb_sg; 2290 struct iovec *xiovec; 2291#ifdef XBB_USE_BOUNCE_BUFFERS 2292 void **p_vaddr; 2293 int saved_uio_iovcnt; 2294#endif /* XBB_USE_BOUNCE_BUFFERS */ 2295 int error; 2296 2297 file_data = &xbb->backend.file; 2298 sectors_sent = 0; 2299 error = 0; 2300 bzero(&xuio, sizeof(xuio)); 2301 2302 switch (operation) { 2303 case BIO_READ: 2304 xuio.uio_rw = UIO_READ; 2305 break; 2306 case BIO_WRITE: 2307 xuio.uio_rw = UIO_WRITE; 2308 break; 2309 case BIO_FLUSH: { 2310 struct mount *mountpoint; 2311 2312 SDT_PROBE1(xbb, kernel, xbb_dispatch_file, flush, 2313 device_get_unit(xbb->dev)); 2314 2315 (void) vn_start_write(xbb->vn, &mountpoint, V_WAIT); 2316 2317 vn_lock(xbb->vn, LK_EXCLUSIVE | LK_RETRY); 2318 error = VOP_FSYNC(xbb->vn, MNT_WAIT, curthread); 2319 VOP_UNLOCK(xbb->vn, 0); 2320 2321 vn_finished_write(mountpoint); 2322 2323 goto bailout_send_response; 2324 /* NOTREACHED */ 2325 } 2326 default: 2327 panic("invalid operation %d", operation); 2328 /* NOTREACHED */ 2329 } 2330 xuio.uio_offset = (vm_offset_t)reqlist->starting_sector_number 2331 << xbb->sector_size_shift; 2332 xuio.uio_segflg = UIO_SYSSPACE; 2333 xuio.uio_iov = file_data->xiovecs; 2334 xuio.uio_iovcnt = 0; 2335 xbb_sg = xbb->xbb_sgs; 2336 nseg = reqlist->nr_segments; 2337 2338 for (xiovec = NULL, seg_idx = 0; seg_idx < nseg; seg_idx++, xbb_sg++) { 2339 2340 /* 2341 * If the first sector is not 0, the KVA will 2342 * not be contiguous and we'll need to go on 2343 * to another segment. 2344 */ 2345 if (xbb_sg->first_sect != 0) 2346 xiovec = NULL; 2347 2348 if (xiovec == NULL) { 2349 xiovec = &file_data->xiovecs[xuio.uio_iovcnt]; 2350 xiovec->iov_base = xbb_reqlist_ioaddr(reqlist, 2351 seg_idx, xbb_sg->first_sect); 2352#ifdef XBB_USE_BOUNCE_BUFFERS 2353 /* 2354 * Store the address of the incoming 2355 * buffer at this particular offset 2356 * as well, so we can do the copy 2357 * later without having to do more 2358 * work to recalculate this address. 2359 */ 2360 p_vaddr = &file_data->xiovecs_vaddr[xuio.uio_iovcnt]; 2361 *p_vaddr = xbb_reqlist_vaddr(reqlist, seg_idx, 2362 xbb_sg->first_sect); 2363#endif /* XBB_USE_BOUNCE_BUFFERS */ 2364 xiovec->iov_len = 0; 2365 xuio.uio_iovcnt++; 2366 } 2367 2368 xiovec->iov_len += xbb_sg->nsect << 9; 2369 2370 xuio.uio_resid += xbb_sg->nsect << 9; 2371 2372 /* 2373 * If the last sector is not the full page 2374 * size count, the next segment will not be 2375 * contiguous in KVA and we need a new iovec. 2376 */ 2377 if (xbb_sg->last_sect != (PAGE_SIZE - 512) >> 9) 2378 xiovec = NULL; 2379 } 2380 2381 xuio.uio_td = curthread; 2382 2383#ifdef XBB_USE_BOUNCE_BUFFERS 2384 saved_uio_iovcnt = xuio.uio_iovcnt; 2385 2386 if (operation == BIO_WRITE) { 2387 /* Copy the write data to the local buffer. */ 2388 for (seg_idx = 0, p_vaddr = file_data->xiovecs_vaddr, 2389 xiovec = xuio.uio_iov; seg_idx < xuio.uio_iovcnt; 2390 seg_idx++, xiovec++, p_vaddr++) { 2391 2392 memcpy(xiovec->iov_base, *p_vaddr, xiovec->iov_len); 2393 } 2394 } else { 2395 /* 2396 * We only need to save off the iovecs in the case of a 2397 * read, because the copy for the read happens after the 2398 * VOP_READ(). (The uio will get modified in that call 2399 * sequence.) 
2400 */ 2401 memcpy(file_data->saved_xiovecs, xuio.uio_iov, 2402 xuio.uio_iovcnt * sizeof(xuio.uio_iov[0])); 2403 } 2404#endif /* XBB_USE_BOUNCE_BUFFERS */ 2405 2406 switch (operation) { 2407 case BIO_READ: 2408 2409 SDT_PROBE3(xbb, kernel, xbb_dispatch_file, read, 2410 device_get_unit(xbb->dev), xuio.uio_offset, 2411 xuio.uio_resid); 2412 2413 vn_lock(xbb->vn, LK_EXCLUSIVE | LK_RETRY); 2414 2415 /* 2416 * UFS pays attention to IO_DIRECT for reads. If the 2417 * DIRECTIO option is configured into the kernel, it calls 2418 * ffs_rawread(). But that only works for single-segment 2419 * uios with user space addresses. In our case, with a 2420 * kernel uio, it still reads into the buffer cache, but it 2421 * will just try to release the buffer from the cache later 2422 * on in ffs_read(). 2423 * 2424 * ZFS does not pay attention to IO_DIRECT for reads. 2425 * 2426 * UFS does not pay attention to IO_SYNC for reads. 2427 * 2428 * ZFS pays attention to IO_SYNC (which translates into the 2429 * Solaris define FRSYNC for zfs_read()) for reads. It 2430 * attempts to sync the file before reading. 2431 * 2432 * So, to attempt to provide some barrier semantics in the 2433 * BIO_ORDERED case, set both IO_DIRECT and IO_SYNC. 2434 */ 2435 error = VOP_READ(xbb->vn, &xuio, (flags & BIO_ORDERED) ? 2436 (IO_DIRECT|IO_SYNC) : 0, file_data->cred); 2437 2438 VOP_UNLOCK(xbb->vn, 0); 2439 break; 2440 case BIO_WRITE: { 2441 struct mount *mountpoint; 2442 2443 SDT_PROBE3(xbb, kernel, xbb_dispatch_file, write, 2444 device_get_unit(xbb->dev), xuio.uio_offset, 2445 xuio.uio_resid); 2446 2447 (void)vn_start_write(xbb->vn, &mountpoint, V_WAIT); 2448 2449 vn_lock(xbb->vn, LK_EXCLUSIVE | LK_RETRY); 2450 2451 /* 2452 * UFS pays attention to IO_DIRECT for writes. The write 2453 * is done asynchronously. (Normally the write would just 2454 * get put into cache. 2455 * 2456 * UFS pays attention to IO_SYNC for writes. It will 2457 * attempt to write the buffer out synchronously if that 2458 * flag is set. 2459 * 2460 * ZFS does not pay attention to IO_DIRECT for writes. 2461 * 2462 * ZFS pays attention to IO_SYNC (a.k.a. FSYNC or FRSYNC) 2463 * for writes. It will flush the transaction from the 2464 * cache before returning. 2465 * 2466 * So if we've got the BIO_ORDERED flag set, we want 2467 * IO_SYNC in either the UFS or ZFS case. 2468 */ 2469 error = VOP_WRITE(xbb->vn, &xuio, (flags & BIO_ORDERED) ? 2470 IO_SYNC : 0, file_data->cred); 2471 VOP_UNLOCK(xbb->vn, 0); 2472 2473 vn_finished_write(mountpoint); 2474 2475 break; 2476 } 2477 default: 2478 panic("invalid operation %d", operation); 2479 /* NOTREACHED */ 2480 } 2481 2482#ifdef XBB_USE_BOUNCE_BUFFERS 2483 /* We only need to copy here for read operations */ 2484 if (operation == BIO_READ) { 2485 2486 for (seg_idx = 0, p_vaddr = file_data->xiovecs_vaddr, 2487 xiovec = file_data->saved_xiovecs; 2488 seg_idx < saved_uio_iovcnt; seg_idx++, 2489 xiovec++, p_vaddr++) { 2490 2491 /* 2492 * Note that we have to use the copy of the 2493 * io vector we made above. uiomove() modifies 2494 * the uio and its referenced vector as uiomove 2495 * performs the copy, so we can't rely on any 2496 * state from the original uio. 
2497 */ 2498 memcpy(*p_vaddr, xiovec->iov_base, xiovec->iov_len); 2499 } 2500 } 2501#endif /* XBB_USE_BOUNCE_BUFFERS */ 2502 2503bailout_send_response: 2504 2505 if (error != 0) 2506 reqlist->status = BLKIF_RSP_ERROR; 2507 2508 xbb_complete_reqlist(xbb, reqlist); 2509 2510 return (0); 2511} 2512 2513/*--------------------------- Backend Configuration --------------------------*/ 2514/** 2515 * Close and cleanup any backend device/file specific state for this 2516 * block back instance. 2517 * 2518 * \param xbb Per-instance xbb configuration structure. 2519 */ 2520static void 2521xbb_close_backend(struct xbb_softc *xbb) 2522{ 2523 DROP_GIANT(); 2524 DPRINTF("closing dev=%s\n", xbb->dev_name); 2525 if (xbb->vn) { 2526 int flags = FREAD; 2527 2528 if ((xbb->flags & XBBF_READ_ONLY) == 0) 2529 flags |= FWRITE; 2530 2531 switch (xbb->device_type) { 2532 case XBB_TYPE_DISK: 2533 if (xbb->backend.dev.csw) { 2534 dev_relthread(xbb->backend.dev.cdev, 2535 xbb->backend.dev.dev_ref); 2536 xbb->backend.dev.csw = NULL; 2537 xbb->backend.dev.cdev = NULL; 2538 } 2539 break; 2540 case XBB_TYPE_FILE: 2541 break; 2542 case XBB_TYPE_NONE: 2543 default: 2544 panic("Unexpected backend type."); 2545 break; 2546 } 2547 2548 (void)vn_close(xbb->vn, flags, NOCRED, curthread); 2549 xbb->vn = NULL; 2550 2551 switch (xbb->device_type) { 2552 case XBB_TYPE_DISK: 2553 break; 2554 case XBB_TYPE_FILE: 2555 if (xbb->backend.file.cred != NULL) { 2556 crfree(xbb->backend.file.cred); 2557 xbb->backend.file.cred = NULL; 2558 } 2559 break; 2560 case XBB_TYPE_NONE: 2561 default: 2562 panic("Unexpected backend type."); 2563 break; 2564 } 2565 } 2566 PICKUP_GIANT(); 2567} 2568 2569/** 2570 * Open a character device to be used for backend I/O. 2571 * 2572 * \param xbb Per-instance xbb configuration structure. 2573 * 2574 * \return 0 for success, errno codes for failure. 2575 */ 2576static int 2577xbb_open_dev(struct xbb_softc *xbb) 2578{ 2579 struct vattr vattr; 2580 struct cdev *dev; 2581 struct cdevsw *devsw; 2582 int error; 2583 2584 xbb->device_type = XBB_TYPE_DISK; 2585 xbb->dispatch_io = xbb_dispatch_dev; 2586 xbb->backend.dev.cdev = xbb->vn->v_rdev; 2587 xbb->backend.dev.csw = dev_refthread(xbb->backend.dev.cdev, 2588 &xbb->backend.dev.dev_ref); 2589 if (xbb->backend.dev.csw == NULL) 2590 panic("Unable to retrieve device switch"); 2591 2592 error = VOP_GETATTR(xbb->vn, &vattr, NOCRED); 2593 if (error) { 2594 xenbus_dev_fatal(xbb->dev, error, "error getting " 2595 "vnode attributes for device %s", 2596 xbb->dev_name); 2597 return (error); 2598 } 2599 2600 2601 dev = xbb->vn->v_rdev; 2602 devsw = dev->si_devsw; 2603 if (!devsw->d_ioctl) { 2604 xenbus_dev_fatal(xbb->dev, ENODEV, "no d_ioctl for " 2605 "device %s!", xbb->dev_name); 2606 return (ENODEV); 2607 } 2608 2609 error = devsw->d_ioctl(dev, DIOCGSECTORSIZE, 2610 (caddr_t)&xbb->sector_size, FREAD, 2611 curthread); 2612 if (error) { 2613 xenbus_dev_fatal(xbb->dev, error, 2614 "error calling ioctl DIOCGSECTORSIZE " 2615 "for device %s", xbb->dev_name); 2616 return (error); 2617 } 2618 2619 error = devsw->d_ioctl(dev, DIOCGMEDIASIZE, 2620 (caddr_t)&xbb->media_size, FREAD, 2621 curthread); 2622 if (error) { 2623 xenbus_dev_fatal(xbb->dev, error, 2624 "error calling ioctl DIOCGMEDIASIZE " 2625 "for device %s", xbb->dev_name); 2626 return (error); 2627 } 2628 2629 return (0); 2630} 2631 2632/** 2633 * Open a file to be used for backend I/O. 2634 * 2635 * \param xbb Per-instance xbb configuration structure. 2636 * 2637 * \return 0 for success, errno codes for failure. 
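 *
 * \note For file backends the media size is taken from the vnode's
 *       va_size and the exported sector size is forced to 512 bytes,
 *       regardless of the file system's va_blocksize (see the comment
 *       in the function body).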
2638 */ 2639static int 2640xbb_open_file(struct xbb_softc *xbb) 2641{ 2642 struct xbb_file_data *file_data; 2643 struct vattr vattr; 2644 int error; 2645 2646 file_data = &xbb->backend.file; 2647 xbb->device_type = XBB_TYPE_FILE; 2648 xbb->dispatch_io = xbb_dispatch_file; 2649 error = VOP_GETATTR(xbb->vn, &vattr, curthread->td_ucred); 2650 if (error != 0) { 2651 xenbus_dev_fatal(xbb->dev, error, 2652 "error calling VOP_GETATTR()" 2653 "for file %s", xbb->dev_name); 2654 return (error); 2655 } 2656 2657 /* 2658 * Verify that we have the ability to upgrade to exclusive 2659 * access on this file so we can trap errors at open instead 2660 * of reporting them during first access. 2661 */ 2662 if (VOP_ISLOCKED(xbb->vn) != LK_EXCLUSIVE) { 2663 vn_lock(xbb->vn, LK_UPGRADE | LK_RETRY); 2664 if (xbb->vn->v_iflag & VI_DOOMED) { 2665 error = EBADF; 2666 xenbus_dev_fatal(xbb->dev, error, 2667 "error locking file %s", 2668 xbb->dev_name); 2669 2670 return (error); 2671 } 2672 } 2673 2674 file_data->cred = crhold(curthread->td_ucred); 2675 xbb->media_size = vattr.va_size; 2676 2677 /* 2678 * XXX KDM vattr.va_blocksize may be larger than 512 bytes here. 2679 * With ZFS, it is 131072 bytes. Block sizes that large don't work 2680 * with disklabel and UFS on FreeBSD at least. Large block sizes 2681 * may not work with other OSes as well. So just export a sector 2682 * size of 512 bytes, which should work with any OS or 2683 * application. Since our backing is a file, any block size will 2684 * work fine for the backing store. 2685 */ 2686#if 0 2687 xbb->sector_size = vattr.va_blocksize; 2688#endif 2689 xbb->sector_size = 512; 2690 2691 /* 2692 * Sanity check. The media size has to be at least one 2693 * sector long. 2694 */ 2695 if (xbb->media_size < xbb->sector_size) { 2696 error = EINVAL; 2697 xenbus_dev_fatal(xbb->dev, error, 2698 "file %s size %ju < block size %u", 2699 xbb->dev_name, 2700 (uintmax_t)xbb->media_size, 2701 xbb->sector_size); 2702 } 2703 return (error); 2704} 2705 2706/** 2707 * Open the backend provider for this connection. 2708 * 2709 * \param xbb Per-instance xbb configuration structure. 2710 * 2711 * \return 0 for success, errno codes for failure. 2712 */ 2713static int 2714xbb_open_backend(struct xbb_softc *xbb) 2715{ 2716 struct nameidata nd; 2717 int flags; 2718 int error; 2719 2720 flags = FREAD; 2721 error = 0; 2722 2723 DPRINTF("opening dev=%s\n", xbb->dev_name); 2724 2725 if (rootvnode == NULL) { 2726 xenbus_dev_fatal(xbb->dev, ENOENT, 2727 "Root file system not mounted"); 2728 return (ENOENT); 2729 } 2730 2731 if ((xbb->flags & XBBF_READ_ONLY) == 0) 2732 flags |= FWRITE; 2733 2734 if (!curthread->td_proc->p_fd->fd_cdir) { 2735 curthread->td_proc->p_fd->fd_cdir = rootvnode; 2736 VREF(rootvnode); 2737 } 2738 if (!curthread->td_proc->p_fd->fd_rdir) { 2739 curthread->td_proc->p_fd->fd_rdir = rootvnode; 2740 VREF(rootvnode); 2741 } 2742 if (!curthread->td_proc->p_fd->fd_jdir) { 2743 curthread->td_proc->p_fd->fd_jdir = rootvnode; 2744 VREF(rootvnode); 2745 } 2746 2747 again: 2748 NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, xbb->dev_name, curthread); 2749 error = vn_open(&nd, &flags, 0, NULL); 2750 if (error) { 2751 /* 2752 * This is the only reasonable guess we can make as far as 2753 * path if the user doesn't give us a fully qualified path. 2754 * If they want to specify a file, they need to specify the 2755 * full path. 
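		 * For example, a "params" value of "ada0" is retried as
		 * "/dev/ada0"; a file backend must be given as an absolute
		 * path such as "/images/disk0" (illustrative name only).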
2756 */ 2757 if (xbb->dev_name[0] != '/') { 2758 char *dev_path = "/dev/"; 2759 char *dev_name; 2760 2761 /* Try adding device path at beginning of name */ 2762 dev_name = malloc(strlen(xbb->dev_name) 2763 + strlen(dev_path) + 1, 2764 M_XENBLOCKBACK, M_NOWAIT); 2765 if (dev_name) { 2766 sprintf(dev_name, "%s%s", dev_path, 2767 xbb->dev_name); 2768 free(xbb->dev_name, M_XENBLOCKBACK); 2769 xbb->dev_name = dev_name; 2770 goto again; 2771 } 2772 } 2773 xenbus_dev_fatal(xbb->dev, error, "error opening device %s", 2774 xbb->dev_name); 2775 return (error); 2776 } 2777 2778 NDFREE(&nd, NDF_ONLY_PNBUF); 2779 2780 xbb->vn = nd.ni_vp; 2781 2782 /* We only support disks and files. */ 2783 if (vn_isdisk(xbb->vn, &error)) { 2784 error = xbb_open_dev(xbb); 2785 } else if (xbb->vn->v_type == VREG) { 2786 error = xbb_open_file(xbb); 2787 } else { 2788 error = EINVAL; 2789 xenbus_dev_fatal(xbb->dev, error, "%s is not a disk " 2790 "or file", xbb->dev_name); 2791 } 2792 VOP_UNLOCK(xbb->vn, 0); 2793 2794 if (error != 0) { 2795 xbb_close_backend(xbb); 2796 return (error); 2797 } 2798 2799 xbb->sector_size_shift = fls(xbb->sector_size) - 1; 2800 xbb->media_num_sectors = xbb->media_size >> xbb->sector_size_shift; 2801 2802 DPRINTF("opened %s=%s sector_size=%u media_size=%" PRId64 "\n", 2803 (xbb->device_type == XBB_TYPE_DISK) ? "dev" : "file", 2804 xbb->dev_name, xbb->sector_size, xbb->media_size); 2805 2806 return (0); 2807} 2808 2809/*------------------------ Inter-Domain Communication ------------------------*/ 2810/** 2811 * Free dynamically allocated KVA or pseudo-physical address allocations. 2812 * 2813 * \param xbb Per-instance xbb configuration structure. 2814 */ 2815static void 2816xbb_free_communication_mem(struct xbb_softc *xbb) 2817{ 2818 if (xbb->kva != 0) { 2819 if (xbb->pseudo_phys_res != NULL) { 2820 bus_release_resource(xbb->dev, SYS_RES_MEMORY, 2821 xbb->pseudo_phys_res_id, 2822 xbb->pseudo_phys_res); 2823 xbb->pseudo_phys_res = NULL; 2824 } 2825 } 2826 xbb->kva = 0; 2827 xbb->gnt_base_addr = 0; 2828 if (xbb->kva_free != NULL) { 2829 free(xbb->kva_free, M_XENBLOCKBACK); 2830 xbb->kva_free = NULL; 2831 } 2832} 2833 2834/** 2835 * Cleanup all inter-domain communication mechanisms. 2836 * 2837 * \param xbb Per-instance xbb configuration structure. 2838 */ 2839static int 2840xbb_disconnect(struct xbb_softc *xbb) 2841{ 2842 struct gnttab_unmap_grant_ref ops[XBB_MAX_RING_PAGES]; 2843 struct gnttab_unmap_grant_ref *op; 2844 u_int ring_idx; 2845 int error; 2846 2847 DPRINTF("\n"); 2848 2849 if ((xbb->flags & XBBF_RING_CONNECTED) == 0) 2850 return (0); 2851 2852 xen_intr_unbind(&xbb->xen_intr_handle); 2853 2854 mtx_unlock(&xbb->lock); 2855 taskqueue_drain(xbb->io_taskqueue, &xbb->io_task); 2856 mtx_lock(&xbb->lock); 2857 2858 /* 2859 * No new interrupts can generate work, but we must wait 2860 * for all currently active requests to drain. 
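	 * If any are still outstanding we bail out with EAGAIN below;
	 * xbb_shutdown() tolerates that, and completion of the last
	 * request re-drives the shutdown/disconnect path.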
2861 */ 2862 if (xbb->active_request_count != 0) 2863 return (EAGAIN); 2864 2865 for (ring_idx = 0, op = ops; 2866 ring_idx < xbb->ring_config.ring_pages; 2867 ring_idx++, op++) { 2868 2869 op->host_addr = xbb->ring_config.gnt_addr 2870 + (ring_idx * PAGE_SIZE); 2871 op->dev_bus_addr = xbb->ring_config.bus_addr[ring_idx]; 2872 op->handle = xbb->ring_config.handle[ring_idx]; 2873 } 2874 2875 error = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, ops, 2876 xbb->ring_config.ring_pages); 2877 if (error != 0) 2878 panic("Grant table op failed (%d)", error); 2879 2880 xbb_free_communication_mem(xbb); 2881 2882 if (xbb->requests != NULL) { 2883 free(xbb->requests, M_XENBLOCKBACK); 2884 xbb->requests = NULL; 2885 } 2886 2887 if (xbb->request_lists != NULL) { 2888 struct xbb_xen_reqlist *reqlist; 2889 int i; 2890 2891 /* There is one request list for ever allocated request. */ 2892 for (i = 0, reqlist = xbb->request_lists; 2893 i < xbb->max_requests; i++, reqlist++){ 2894#ifdef XBB_USE_BOUNCE_BUFFERS 2895 if (reqlist->bounce != NULL) { 2896 free(reqlist->bounce, M_XENBLOCKBACK); 2897 reqlist->bounce = NULL; 2898 } 2899#endif 2900 if (reqlist->gnt_handles != NULL) { 2901 free(reqlist->gnt_handles, M_XENBLOCKBACK); 2902 reqlist->gnt_handles = NULL; 2903 } 2904 } 2905 free(xbb->request_lists, M_XENBLOCKBACK); 2906 xbb->request_lists = NULL; 2907 } 2908 2909 xbb->flags &= ~XBBF_RING_CONNECTED; 2910 return (0); 2911} 2912 2913/** 2914 * Map shared memory ring into domain local address space, initialize 2915 * ring control structures, and bind an interrupt to the event channel 2916 * used to notify us of ring changes. 2917 * 2918 * \param xbb Per-instance xbb configuration structure. 2919 */ 2920static int 2921xbb_connect_ring(struct xbb_softc *xbb) 2922{ 2923 struct gnttab_map_grant_ref gnts[XBB_MAX_RING_PAGES]; 2924 struct gnttab_map_grant_ref *gnt; 2925 u_int ring_idx; 2926 int error; 2927 2928 if ((xbb->flags & XBBF_RING_CONNECTED) != 0) 2929 return (0); 2930 2931 /* 2932 * Kva for our ring is at the tail of the region of kva allocated 2933 * by xbb_alloc_communication_mem(). 2934 */ 2935 xbb->ring_config.va = xbb->kva 2936 + (xbb->kva_size 2937 - (xbb->ring_config.ring_pages * PAGE_SIZE)); 2938 xbb->ring_config.gnt_addr = xbb->gnt_base_addr 2939 + (xbb->kva_size 2940 - (xbb->ring_config.ring_pages * PAGE_SIZE)); 2941 2942 for (ring_idx = 0, gnt = gnts; 2943 ring_idx < xbb->ring_config.ring_pages; 2944 ring_idx++, gnt++) { 2945 2946 gnt->host_addr = xbb->ring_config.gnt_addr 2947 + (ring_idx * PAGE_SIZE); 2948 gnt->flags = GNTMAP_host_map; 2949 gnt->ref = xbb->ring_config.ring_ref[ring_idx]; 2950 gnt->dom = xbb->otherend_id; 2951 } 2952 2953 error = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, gnts, 2954 xbb->ring_config.ring_pages); 2955 if (error) 2956 panic("blkback: Ring page grant table op failed (%d)", error); 2957 2958 for (ring_idx = 0, gnt = gnts; 2959 ring_idx < xbb->ring_config.ring_pages; 2960 ring_idx++, gnt++) { 2961 if (gnt->status != 0) { 2962 xbb->ring_config.va = 0; 2963 xenbus_dev_fatal(xbb->dev, EACCES, 2964 "Ring shared page mapping failed. " 2965 "Status %d.", gnt->status); 2966 return (EACCES); 2967 } 2968 xbb->ring_config.handle[ring_idx] = gnt->handle; 2969 xbb->ring_config.bus_addr[ring_idx] = gnt->dev_bus_addr; 2970 } 2971 2972 /* Initialize the ring based on ABI. 
*/ 2973 switch (xbb->abi) { 2974 case BLKIF_PROTOCOL_NATIVE: 2975 { 2976 blkif_sring_t *sring; 2977 sring = (blkif_sring_t *)xbb->ring_config.va; 2978 BACK_RING_INIT(&xbb->rings.native, sring, 2979 xbb->ring_config.ring_pages * PAGE_SIZE); 2980 break; 2981 } 2982 case BLKIF_PROTOCOL_X86_32: 2983 { 2984 blkif_x86_32_sring_t *sring_x86_32; 2985 sring_x86_32 = (blkif_x86_32_sring_t *)xbb->ring_config.va; 2986 BACK_RING_INIT(&xbb->rings.x86_32, sring_x86_32, 2987 xbb->ring_config.ring_pages * PAGE_SIZE); 2988 break; 2989 } 2990 case BLKIF_PROTOCOL_X86_64: 2991 { 2992 blkif_x86_64_sring_t *sring_x86_64; 2993 sring_x86_64 = (blkif_x86_64_sring_t *)xbb->ring_config.va; 2994 BACK_RING_INIT(&xbb->rings.x86_64, sring_x86_64, 2995 xbb->ring_config.ring_pages * PAGE_SIZE); 2996 break; 2997 } 2998 default: 2999 panic("Unexpected blkif protocol ABI."); 3000 } 3001 3002 xbb->flags |= XBBF_RING_CONNECTED; 3003 3004 error = xen_intr_bind_remote_port(xbb->dev, 3005 xbb->otherend_id, 3006 xbb->ring_config.evtchn, 3007 xbb_filter, 3008 /*ithread_handler*/NULL, 3009 /*arg*/xbb, 3010 INTR_TYPE_BIO | INTR_MPSAFE, 3011 &xbb->xen_intr_handle); 3012 if (error) { 3013 (void)xbb_disconnect(xbb); 3014 xenbus_dev_fatal(xbb->dev, error, "binding event channel"); 3015 return (error); 3016 } 3017 3018 DPRINTF("rings connected!\n"); 3019 3020 return 0; 3021} 3022 3023/* Needed to make bit_alloc() macro work */ 3024#define calloc(count, size) malloc((count)*(size), M_XENBLOCKBACK, \ 3025 M_NOWAIT|M_ZERO); 3026 3027/** 3028 * Size KVA and pseudo-physical address allocations based on negotiated 3029 * values for the size and number of I/O requests, and the size of our 3030 * communication ring. 3031 * 3032 * \param xbb Per-instance xbb configuration structure. 3033 * 3034 * These address spaces are used to dynamically map pages in the 3035 * front-end's domain into our own. 3036 */ 3037static int 3038xbb_alloc_communication_mem(struct xbb_softc *xbb) 3039{ 3040 xbb->reqlist_kva_pages = xbb->max_requests * xbb->max_request_segments; 3041 xbb->reqlist_kva_size = xbb->reqlist_kva_pages * PAGE_SIZE; 3042 xbb->kva_size = xbb->reqlist_kva_size + 3043 (xbb->ring_config.ring_pages * PAGE_SIZE); 3044 3045 xbb->kva_free = bit_alloc(xbb->reqlist_kva_pages); 3046 if (xbb->kva_free == NULL) 3047 return (ENOMEM); 3048 3049 DPRINTF("%s: kva_size = %d, reqlist_kva_size = %d\n", 3050 device_get_nameunit(xbb->dev), xbb->kva_size, 3051 xbb->reqlist_kva_size); 3052 /* 3053 * Reserve a range of pseudo physical memory that we can map 3054 * into kva. These pages will only be backed by machine 3055 * pages ("real memory") during the lifetime of front-end requests 3056 * via grant table operations. 3057 */ 3058 xbb->pseudo_phys_res_id = 0; 3059 xbb->pseudo_phys_res = bus_alloc_resource(xbb->dev, SYS_RES_MEMORY, 3060 &xbb->pseudo_phys_res_id, 3061 0, ~0, xbb->kva_size, 3062 RF_ACTIVE); 3063 if (xbb->pseudo_phys_res == NULL) { 3064 xbb->kva = 0; 3065 return (ENOMEM); 3066 } 3067 xbb->kva = (vm_offset_t)rman_get_virtual(xbb->pseudo_phys_res); 3068 xbb->gnt_base_addr = rman_get_start(xbb->pseudo_phys_res); 3069 3070 DPRINTF("%s: kva: %#jx, gnt_base_addr: %#jx\n", 3071 device_get_nameunit(xbb->dev), (uintmax_t)xbb->kva, 3072 (uintmax_t)xbb->gnt_base_addr); 3073 return (0); 3074} 3075 3076/** 3077 * Collect front-end information from the XenStore. 3078 * 3079 * \param xbb Per-instance xbb configuration structure. 
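 *
 * \note The mandatory XenStore nodes in the front-end tree are
 *       "event-channel" and the ring reference(s) ("ring-ref" or
 *       "ring-ref%u"); "ring-page-order", "num-ring-pages",
 *       "max-requests", "max-request-segments", "max-request-size",
 *       and "protocol" are optional and fall back to the legacy
 *       defaults established in the function body.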
3080 */ 3081static int 3082xbb_collect_frontend_info(struct xbb_softc *xbb) 3083{ 3084 char protocol_abi[64]; 3085 const char *otherend_path; 3086 int error; 3087 u_int ring_idx; 3088 u_int ring_page_order; 3089 size_t ring_size; 3090 3091 otherend_path = xenbus_get_otherend_path(xbb->dev); 3092 3093 /* 3094 * Protocol defaults valid even if all negotiation fails. 3095 */ 3096 xbb->ring_config.ring_pages = 1; 3097 xbb->max_request_segments = BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK; 3098 xbb->max_request_size = xbb->max_request_segments * PAGE_SIZE; 3099 3100 /* 3101 * Mandatory data (used in all versions of the protocol) first. 3102 */ 3103 error = xs_scanf(XST_NIL, otherend_path, 3104 "event-channel", NULL, "%" PRIu32, 3105 &xbb->ring_config.evtchn); 3106 if (error != 0) { 3107 xenbus_dev_fatal(xbb->dev, error, 3108 "Unable to retrieve event-channel information " 3109 "from frontend %s. Unable to connect.", 3110 xenbus_get_otherend_path(xbb->dev)); 3111 return (error); 3112 } 3113 3114 /* 3115 * These fields are initialized to legacy protocol defaults 3116 * so we only need to fail if reading the updated value succeeds 3117 * and the new value is outside of its allowed range. 3118 * 3119 * \note xs_gather() returns on the first encountered error, so 3120 * we must use independant calls in order to guarantee 3121 * we don't miss information in a sparsly populated front-end 3122 * tree. 3123 * 3124 * \note xs_scanf() does not update variables for unmatched 3125 * fields. 3126 */ 3127 ring_page_order = 0; 3128 (void)xs_scanf(XST_NIL, otherend_path, 3129 "ring-page-order", NULL, "%u", 3130 &ring_page_order); 3131 xbb->ring_config.ring_pages = 1 << ring_page_order; 3132 (void)xs_scanf(XST_NIL, otherend_path, 3133 "num-ring-pages", NULL, "%u", 3134 &xbb->ring_config.ring_pages); 3135 ring_size = PAGE_SIZE * xbb->ring_config.ring_pages; 3136 xbb->max_requests = BLKIF_MAX_RING_REQUESTS(ring_size); 3137 3138 (void)xs_scanf(XST_NIL, otherend_path, 3139 "max-requests", NULL, "%u", 3140 &xbb->max_requests); 3141 3142 (void)xs_scanf(XST_NIL, otherend_path, 3143 "max-request-segments", NULL, "%u", 3144 &xbb->max_request_segments); 3145 3146 (void)xs_scanf(XST_NIL, otherend_path, 3147 "max-request-size", NULL, "%u", 3148 &xbb->max_request_size); 3149 3150 if (xbb->ring_config.ring_pages > XBB_MAX_RING_PAGES) { 3151 xenbus_dev_fatal(xbb->dev, EINVAL, 3152 "Front-end specified ring-pages of %u " 3153 "exceeds backend limit of %zu. " 3154 "Unable to connect.", 3155 xbb->ring_config.ring_pages, 3156 XBB_MAX_RING_PAGES); 3157 return (EINVAL); 3158 } else if (xbb->max_requests > XBB_MAX_REQUESTS) { 3159 xenbus_dev_fatal(xbb->dev, EINVAL, 3160 "Front-end specified max_requests of %u " 3161 "exceeds backend limit of %u. " 3162 "Unable to connect.", 3163 xbb->max_requests, 3164 XBB_MAX_REQUESTS); 3165 return (EINVAL); 3166 } else if (xbb->max_request_segments > XBB_MAX_SEGMENTS_PER_REQUEST) { 3167 xenbus_dev_fatal(xbb->dev, EINVAL, 3168 "Front-end specified max_requests_segments " 3169 "of %u exceeds backend limit of %u. " 3170 "Unable to connect.", 3171 xbb->max_request_segments, 3172 XBB_MAX_SEGMENTS_PER_REQUEST); 3173 return (EINVAL); 3174 } else if (xbb->max_request_size > XBB_MAX_REQUEST_SIZE) { 3175 xenbus_dev_fatal(xbb->dev, EINVAL, 3176 "Front-end specified max_request_size " 3177 "of %u exceeds backend limit of %u. 
" 3178 "Unable to connect.", 3179 xbb->max_request_size, 3180 XBB_MAX_REQUEST_SIZE); 3181 return (EINVAL); 3182 } 3183 3184 if (xbb->ring_config.ring_pages == 1) { 3185 error = xs_gather(XST_NIL, otherend_path, 3186 "ring-ref", "%" PRIu32, 3187 &xbb->ring_config.ring_ref[0], 3188 NULL); 3189 if (error != 0) { 3190 xenbus_dev_fatal(xbb->dev, error, 3191 "Unable to retrieve ring information " 3192 "from frontend %s. Unable to " 3193 "connect.", 3194 xenbus_get_otherend_path(xbb->dev)); 3195 return (error); 3196 } 3197 } else { 3198 /* Multi-page ring format. */ 3199 for (ring_idx = 0; ring_idx < xbb->ring_config.ring_pages; 3200 ring_idx++) { 3201 char ring_ref_name[]= "ring_refXX"; 3202 3203 snprintf(ring_ref_name, sizeof(ring_ref_name), 3204 "ring-ref%u", ring_idx); 3205 error = xs_scanf(XST_NIL, otherend_path, 3206 ring_ref_name, NULL, "%" PRIu32, 3207 &xbb->ring_config.ring_ref[ring_idx]); 3208 if (error != 0) { 3209 xenbus_dev_fatal(xbb->dev, error, 3210 "Failed to retriev grant " 3211 "reference for page %u of " 3212 "shared ring. Unable " 3213 "to connect.", ring_idx); 3214 return (error); 3215 } 3216 } 3217 } 3218 3219 error = xs_gather(XST_NIL, otherend_path, 3220 "protocol", "%63s", protocol_abi, 3221 NULL); 3222 if (error != 0 3223 || !strcmp(protocol_abi, XEN_IO_PROTO_ABI_NATIVE)) { 3224 /* 3225 * Assume native if the frontend has not 3226 * published ABI data or it has published and 3227 * matches our own ABI. 3228 */ 3229 xbb->abi = BLKIF_PROTOCOL_NATIVE; 3230 } else if (!strcmp(protocol_abi, XEN_IO_PROTO_ABI_X86_32)) { 3231 3232 xbb->abi = BLKIF_PROTOCOL_X86_32; 3233 } else if (!strcmp(protocol_abi, XEN_IO_PROTO_ABI_X86_64)) { 3234 3235 xbb->abi = BLKIF_PROTOCOL_X86_64; 3236 } else { 3237 3238 xenbus_dev_fatal(xbb->dev, EINVAL, 3239 "Unknown protocol ABI (%s) published by " 3240 "frontend. Unable to connect.", protocol_abi); 3241 return (EINVAL); 3242 } 3243 return (0); 3244} 3245 3246/** 3247 * Allocate per-request data structures given request size and number 3248 * information negotiated with the front-end. 3249 * 3250 * \param xbb Per-instance xbb configuration structure. 3251 */ 3252static int 3253xbb_alloc_requests(struct xbb_softc *xbb) 3254{ 3255 struct xbb_xen_req *req; 3256 struct xbb_xen_req *last_req; 3257 3258 /* 3259 * Allocate request book keeping datastructures. 3260 */ 3261 xbb->requests = malloc(xbb->max_requests * sizeof(*xbb->requests), 3262 M_XENBLOCKBACK, M_NOWAIT|M_ZERO); 3263 if (xbb->requests == NULL) { 3264 xenbus_dev_fatal(xbb->dev, ENOMEM, 3265 "Unable to allocate request structures"); 3266 return (ENOMEM); 3267 } 3268 3269 req = xbb->requests; 3270 last_req = &xbb->requests[xbb->max_requests - 1]; 3271 STAILQ_INIT(&xbb->request_free_stailq); 3272 while (req <= last_req) { 3273 STAILQ_INSERT_TAIL(&xbb->request_free_stailq, req, links); 3274 req++; 3275 } 3276 return (0); 3277} 3278 3279static int 3280xbb_alloc_request_lists(struct xbb_softc *xbb) 3281{ 3282 struct xbb_xen_reqlist *reqlist; 3283 int i; 3284 3285 /* 3286 * If no requests can be merged, we need 1 request list per 3287 * in flight request. 
3288 */ 3289 xbb->request_lists = malloc(xbb->max_requests * 3290 sizeof(*xbb->request_lists), M_XENBLOCKBACK, M_NOWAIT|M_ZERO); 3291 if (xbb->request_lists == NULL) { 3292 xenbus_dev_fatal(xbb->dev, ENOMEM, 3293 "Unable to allocate request list structures"); 3294 return (ENOMEM); 3295 } 3296 3297 STAILQ_INIT(&xbb->reqlist_free_stailq); 3298 STAILQ_INIT(&xbb->reqlist_pending_stailq); 3299 for (i = 0; i < xbb->max_requests; i++) { 3300 int seg; 3301 3302 reqlist = &xbb->request_lists[i]; 3303 3304 reqlist->xbb = xbb; 3305 3306#ifdef XBB_USE_BOUNCE_BUFFERS 3307 reqlist->bounce = malloc(xbb->max_reqlist_size, 3308 M_XENBLOCKBACK, M_NOWAIT); 3309 if (reqlist->bounce == NULL) { 3310 xenbus_dev_fatal(xbb->dev, ENOMEM, 3311 "Unable to allocate request " 3312 "bounce buffers"); 3313 return (ENOMEM); 3314 } 3315#endif /* XBB_USE_BOUNCE_BUFFERS */ 3316 3317 reqlist->gnt_handles = malloc(xbb->max_reqlist_segments * 3318 sizeof(*reqlist->gnt_handles), 3319 M_XENBLOCKBACK, M_NOWAIT|M_ZERO); 3320 if (reqlist->gnt_handles == NULL) { 3321 xenbus_dev_fatal(xbb->dev, ENOMEM, 3322 "Unable to allocate request " 3323 "grant references"); 3324 return (ENOMEM); 3325 } 3326 3327 for (seg = 0; seg < xbb->max_reqlist_segments; seg++) 3328 reqlist->gnt_handles[seg] = GRANT_REF_INVALID; 3329 3330 STAILQ_INSERT_TAIL(&xbb->reqlist_free_stailq, reqlist, links); 3331 } 3332 return (0); 3333} 3334 3335/** 3336 * Supply information about the physical device to the frontend 3337 * via XenBus. 3338 * 3339 * \param xbb Per-instance xbb configuration structure. 3340 */ 3341static int 3342xbb_publish_backend_info(struct xbb_softc *xbb) 3343{ 3344 struct xs_transaction xst; 3345 const char *our_path; 3346 const char *leaf; 3347 int error; 3348 3349 our_path = xenbus_get_node(xbb->dev); 3350 while (1) { 3351 error = xs_transaction_start(&xst); 3352 if (error != 0) { 3353 xenbus_dev_fatal(xbb->dev, error, 3354 "Error publishing backend info " 3355 "(start transaction)"); 3356 return (error); 3357 } 3358 3359 leaf = "sectors"; 3360 error = xs_printf(xst, our_path, leaf, 3361 "%"PRIu64, xbb->media_num_sectors); 3362 if (error != 0) 3363 break; 3364 3365 /* XXX Support all VBD attributes here. */ 3366 leaf = "info"; 3367 error = xs_printf(xst, our_path, leaf, "%u", 3368 xbb->flags & XBBF_READ_ONLY 3369 ? VDISK_READONLY : 0); 3370 if (error != 0) 3371 break; 3372 3373 leaf = "sector-size"; 3374 error = xs_printf(xst, our_path, leaf, "%u", 3375 xbb->sector_size); 3376 if (error != 0) 3377 break; 3378 3379 error = xs_transaction_end(xst, 0); 3380 if (error == 0) { 3381 return (0); 3382 } else if (error != EAGAIN) { 3383 xenbus_dev_fatal(xbb->dev, error, "ending transaction"); 3384 return (error); 3385 } 3386 } 3387 3388 xenbus_dev_fatal(xbb->dev, error, "writing %s/%s", 3389 our_path, leaf); 3390 xs_transaction_end(xst, 1); 3391 return (error); 3392} 3393 3394/** 3395 * Connect to our blkfront peer now that it has completed publishing 3396 * its configuration into the XenStore. 3397 * 3398 * \param xbb Per-instance xbb configuration structure. 3399 */ 3400static void 3401xbb_connect(struct xbb_softc *xbb) 3402{ 3403 int error; 3404 3405 if (xenbus_get_state(xbb->dev) == XenbusStateConnected) 3406 return; 3407 3408 if (xbb_collect_frontend_info(xbb) != 0) 3409 return; 3410 3411 xbb->flags &= ~XBBF_SHUTDOWN; 3412 3413 /* 3414 * We limit the maximum number of reqlist segments to the maximum 3415 * number of segments in the ring, or our absolute maximum, 3416 * whichever is smaller. 
3417 */ 3418 xbb->max_reqlist_segments = MIN(xbb->max_request_segments * 3419 xbb->max_requests, XBB_MAX_SEGMENTS_PER_REQLIST); 3420 3421 /* 3422 * The maximum size is simply a function of the number of segments 3423 * we can handle. 3424 */ 3425 xbb->max_reqlist_size = xbb->max_reqlist_segments * PAGE_SIZE; 3426 3427 /* Allocate resources whose size depends on front-end configuration. */ 3428 error = xbb_alloc_communication_mem(xbb); 3429 if (error != 0) { 3430 xenbus_dev_fatal(xbb->dev, error, 3431 "Unable to allocate communication memory"); 3432 return; 3433 } 3434 3435 error = xbb_alloc_requests(xbb); 3436 if (error != 0) { 3437 /* Specific errors are reported by xbb_alloc_requests(). */ 3438 return; 3439 } 3440 3441 error = xbb_alloc_request_lists(xbb); 3442 if (error != 0) { 3443 /* Specific errors are reported by xbb_alloc_request_lists(). */ 3444 return; 3445 } 3446 3447 /* 3448 * Connect communication channel. 3449 */ 3450 error = xbb_connect_ring(xbb); 3451 if (error != 0) { 3452 /* Specific errors are reported by xbb_connect_ring(). */ 3453 return; 3454 } 3455 3456 if (xbb_publish_backend_info(xbb) != 0) { 3457 /* 3458 * If we can't publish our data, we cannot participate 3459 * in this connection, and waiting for a front-end state 3460 * change will not help the situation. 3461 */ 3462 (void)xbb_disconnect(xbb); 3463 return; 3464 } 3465 3466 /* Ready for I/O. */ 3467 xenbus_set_state(xbb->dev, XenbusStateConnected); 3468} 3469 3470/*-------------------------- Device Teardown Support -------------------------*/ 3471/** 3472 * Perform device shutdown functions. 3473 * 3474 * \param xbb Per-instance xbb configuration structure. 3475 * 3476 * Mark this instance as shutting down, wait for any active I/O on the 3477 * backend device/file to drain, disconnect from the front-end, and notify 3478 * any waiters (e.g. a thread invoking our detach method) that detach can 3479 * now proceed. 3480 */ 3481static int 3482xbb_shutdown(struct xbb_softc *xbb) 3483{ 3484 XenbusState frontState; 3485 int error; 3486 3487 DPRINTF("\n"); 3488 3489 /* 3490 * Due to the need to drop our mutex during some 3491 * xenbus operations, it is possible for two threads 3492 * to attempt to close out shutdown processing at 3493 * the same time. Tell the caller that hits this 3494 * race to try back later. 3495 */ 3496 if ((xbb->flags & XBBF_IN_SHUTDOWN) != 0) 3497 return (EAGAIN); 3498 3499 xbb->flags |= XBBF_IN_SHUTDOWN; 3500 mtx_unlock(&xbb->lock); 3501 3502 if (xenbus_get_state(xbb->dev) < XenbusStateClosing) 3503 xenbus_set_state(xbb->dev, XenbusStateClosing); 3504 3505 frontState = xenbus_get_otherend_state(xbb->dev); 3506 mtx_lock(&xbb->lock); 3507 xbb->flags &= ~XBBF_IN_SHUTDOWN; 3508 3509 /* The front can submit I/O until entering the closed state. */ 3510 if (frontState < XenbusStateClosed) 3511 return (EAGAIN); 3512 3513 DPRINTF("\n"); 3514 3515 /* Indicate shutdown is in progress. */ 3516 xbb->flags |= XBBF_SHUTDOWN; 3517 3518 /* Disconnect from the front-end. */ 3519 error = xbb_disconnect(xbb); 3520 if (error != 0) { 3521 /* 3522 * Requests still outstanding. We'll be called again 3523 * once they complete. 3524 */ 3525 KASSERT(error == EAGAIN, 3526 ("%s: Unexpected xbb_disconnect() failure %d", 3527 __func__, error)); 3528 3529 return (error); 3530 } 3531 3532 DPRINTF("\n"); 3533 3534 /* Indicate to xbb_detach() that is it safe to proceed. 
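	 * xbb_detach() msleep()s on the softc address while xbb_shutdown()
	 * keeps returning EAGAIN; this wakeup() lets it finish tearing the
	 * device down.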
*/ 3535 wakeup(xbb); 3536 3537 return (0); 3538} 3539 3540/** 3541 * Report an attach time error to the console and Xen, and cleanup 3542 * this instance by forcing immediate detach processing. 3543 * 3544 * \param xbb Per-instance xbb configuration structure. 3545 * \param err Errno describing the error. 3546 * \param fmt Printf style format and arguments 3547 */ 3548static void 3549xbb_attach_failed(struct xbb_softc *xbb, int err, const char *fmt, ...) 3550{ 3551 va_list ap; 3552 va_list ap_hotplug; 3553 3554 va_start(ap, fmt); 3555 va_copy(ap_hotplug, ap); 3556 xs_vprintf(XST_NIL, xenbus_get_node(xbb->dev), 3557 "hotplug-error", fmt, ap_hotplug); 3558 va_end(ap_hotplug); 3559 xs_printf(XST_NIL, xenbus_get_node(xbb->dev), 3560 "hotplug-status", "error"); 3561 3562 xenbus_dev_vfatal(xbb->dev, err, fmt, ap); 3563 va_end(ap); 3564 3565 xs_printf(XST_NIL, xenbus_get_node(xbb->dev), 3566 "online", "0"); 3567 xbb_detach(xbb->dev); 3568} 3569 3570/*---------------------------- NewBus Entrypoints ----------------------------*/ 3571/** 3572 * Inspect a XenBus device and claim it if is of the appropriate type. 3573 * 3574 * \param dev NewBus device object representing a candidate XenBus device. 3575 * 3576 * \return 0 for success, errno codes for failure. 3577 */ 3578static int 3579xbb_probe(device_t dev) 3580{ 3581 3582 if (!strcmp(xenbus_get_type(dev), "vbd")) { 3583 device_set_desc(dev, "Backend Virtual Block Device"); 3584 device_quiet(dev); 3585 return (0); 3586 } 3587 3588 return (ENXIO); 3589} 3590 3591/** 3592 * Setup sysctl variables to control various Block Back parameters. 3593 * 3594 * \param xbb Xen Block Back softc. 3595 * 3596 */ 3597static void 3598xbb_setup_sysctl(struct xbb_softc *xbb) 3599{ 3600 struct sysctl_ctx_list *sysctl_ctx = NULL; 3601 struct sysctl_oid *sysctl_tree = NULL; 3602 3603 sysctl_ctx = device_get_sysctl_ctx(xbb->dev); 3604 if (sysctl_ctx == NULL) 3605 return; 3606 3607 sysctl_tree = device_get_sysctl_tree(xbb->dev); 3608 if (sysctl_tree == NULL) 3609 return; 3610 3611 SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3612 "disable_flush", CTLFLAG_RW, &xbb->disable_flush, 0, 3613 "fake the flush command"); 3614 3615 SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3616 "flush_interval", CTLFLAG_RW, &xbb->flush_interval, 0, 3617 "send a real flush for N flush requests"); 3618 3619 SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3620 "no_coalesce_reqs", CTLFLAG_RW, &xbb->no_coalesce_reqs,0, 3621 "Don't coalesce contiguous requests"); 3622 3623 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3624 "reqs_received", CTLFLAG_RW, &xbb->reqs_received, 3625 "how many I/O requests we have received"); 3626 3627 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3628 "reqs_completed", CTLFLAG_RW, &xbb->reqs_completed, 3629 "how many I/O requests have been completed"); 3630 3631 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3632 "reqs_queued_for_completion", CTLFLAG_RW, 3633 &xbb->reqs_queued_for_completion, 3634 "how many I/O requests queued but not yet pushed"); 3635 3636 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3637 "reqs_completed_with_error", CTLFLAG_RW, 3638 &xbb->reqs_completed_with_error, 3639 "how many I/O requests completed with error status"); 3640 3641 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3642 "forced_dispatch", CTLFLAG_RW, &xbb->forced_dispatch, 3643 "how many I/O dispatches were forced"); 
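	/*
	 * The dispatch and shortage counters above and below sit under the
	 * device's sysctl tree; on a typical system they can be read with,
	 * e.g., "sysctl dev.xbbd.0.total_dispatch" (the unit number varies
	 * per instance).
	 */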
3644 3645 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3646 "normal_dispatch", CTLFLAG_RW, &xbb->normal_dispatch, 3647 "how many I/O dispatches were normal"); 3648 3649 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3650 "total_dispatch", CTLFLAG_RW, &xbb->total_dispatch, 3651 "total number of I/O dispatches"); 3652 3653 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3654 "kva_shortages", CTLFLAG_RW, &xbb->kva_shortages, 3655 "how many times we have run out of KVA"); 3656 3657 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3658 "request_shortages", CTLFLAG_RW, 3659 &xbb->request_shortages, 3660 "how many times we have run out of requests"); 3661 3662 SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3663 "max_requests", CTLFLAG_RD, &xbb->max_requests, 0, 3664 "maximum outstanding requests (negotiated)"); 3665 3666 SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3667 "max_request_segments", CTLFLAG_RD, 3668 &xbb->max_request_segments, 0, 3669 "maximum number of pages per requests (negotiated)"); 3670 3671 SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3672 "max_request_size", CTLFLAG_RD, 3673 &xbb->max_request_size, 0, 3674 "maximum size in bytes of a request (negotiated)"); 3675 3676 SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3677 "ring_pages", CTLFLAG_RD, 3678 &xbb->ring_config.ring_pages, 0, 3679 "communication channel pages (negotiated)"); 3680} 3681 3682/** 3683 * Attach to a XenBus device that has been claimed by our probe routine. 3684 * 3685 * \param dev NewBus device object representing this Xen Block Back instance. 3686 * 3687 * \return 0 for success, errno codes for failure. 3688 */ 3689static int 3690xbb_attach(device_t dev) 3691{ 3692 struct xbb_softc *xbb; 3693 int error; 3694 u_int max_ring_page_order; 3695 3696 DPRINTF("Attaching to %s\n", xenbus_get_node(dev)); 3697 3698 /* 3699 * Basic initialization. 3700 * After this block it is safe to call xbb_detach() 3701 * to clean up any allocated data for this instance. 3702 */ 3703 xbb = device_get_softc(dev); 3704 xbb->dev = dev; 3705 xbb->otherend_id = xenbus_get_otherend_id(dev); 3706 TASK_INIT(&xbb->io_task, /*priority*/0, xbb_run_queue, xbb); 3707 mtx_init(&xbb->lock, device_get_nameunit(dev), NULL, MTX_DEF); 3708 3709 /* 3710 * Publish protocol capabilities for consumption by the 3711 * front-end. 3712 */ 3713 error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev), 3714 "feature-barrier", "1"); 3715 if (error) { 3716 xbb_attach_failed(xbb, error, "writing %s/feature-barrier", 3717 xenbus_get_node(xbb->dev)); 3718 return (error); 3719 } 3720 3721 error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev), 3722 "feature-flush-cache", "1"); 3723 if (error) { 3724 xbb_attach_failed(xbb, error, "writing %s/feature-flush-cache", 3725 xenbus_get_node(xbb->dev)); 3726 return (error); 3727 } 3728 3729 /* 3730 * Amazon EC2 client compatility. They refer to max-ring-pages 3731 * instead of to max-ring-page-order. 
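	 * Both nodes are published so either style of front-end can
	 * negotiate; the two values are related by
	 * max-ring-pages == 1 << max-ring-page-order (e.g. order 4
	 * corresponds to 16 ring pages).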
3732 */ 3733 error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev), 3734 "max-ring-pages", "%zu", XBB_MAX_RING_PAGES); 3735 if (error) { 3736 xbb_attach_failed(xbb, error, "writing %s/max-ring-pages", 3737 xenbus_get_node(xbb->dev)); 3738 return (error); 3739 } 3740 3741 max_ring_page_order = flsl(XBB_MAX_RING_PAGES) - 1; 3742 error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev), 3743 "max-ring-page-order", "%u", max_ring_page_order); 3744 if (error) { 3745 xbb_attach_failed(xbb, error, "writing %s/max-ring-page-order", 3746 xenbus_get_node(xbb->dev)); 3747 return (error); 3748 } 3749 3750 error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev), 3751 "max-requests", "%u", XBB_MAX_REQUESTS); 3752 if (error) { 3753 xbb_attach_failed(xbb, error, "writing %s/max-requests", 3754 xenbus_get_node(xbb->dev)); 3755 return (error); 3756 } 3757 3758 error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev), 3759 "max-request-segments", "%u", 3760 XBB_MAX_SEGMENTS_PER_REQUEST); 3761 if (error) { 3762 xbb_attach_failed(xbb, error, "writing %s/max-request-segments", 3763 xenbus_get_node(xbb->dev)); 3764 return (error); 3765 } 3766 3767 error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev), 3768 "max-request-size", "%u", 3769 XBB_MAX_REQUEST_SIZE); 3770 if (error) { 3771 xbb_attach_failed(xbb, error, "writing %s/max-request-size", 3772 xenbus_get_node(xbb->dev)); 3773 return (error); 3774 } 3775 3776 /* Collect physical device information. */ 3777 error = xs_gather(XST_NIL, xenbus_get_otherend_path(xbb->dev), 3778 "device-type", NULL, &xbb->dev_type, 3779 NULL); 3780 if (error != 0) 3781 xbb->dev_type = NULL; 3782 3783 error = xs_gather(XST_NIL, xenbus_get_node(dev), 3784 "mode", NULL, &xbb->dev_mode, 3785 "params", NULL, &xbb->dev_name, 3786 NULL); 3787 if (error != 0) { 3788 xbb_attach_failed(xbb, error, "reading backend fields at %s", 3789 xenbus_get_node(dev)); 3790 return (ENXIO); 3791 } 3792 3793 /* Parse fopen style mode flags. */ 3794 if (strchr(xbb->dev_mode, 'w') == NULL) 3795 xbb->flags |= XBBF_READ_ONLY; 3796 3797 /* 3798 * Verify the physical device is present and can support 3799 * the desired I/O mode. 3800 */ 3801 DROP_GIANT(); 3802 error = xbb_open_backend(xbb); 3803 PICKUP_GIANT(); 3804 if (error != 0) { 3805 xbb_attach_failed(xbb, error, "Unable to open %s", 3806 xbb->dev_name); 3807 return (ENXIO); 3808 } 3809 3810 /* Use devstat(9) for recording statistics. */ 3811 xbb->xbb_stats = devstat_new_entry("xbb", device_get_unit(xbb->dev), 3812 xbb->sector_size, 3813 DEVSTAT_ALL_SUPPORTED, 3814 DEVSTAT_TYPE_DIRECT 3815 | DEVSTAT_TYPE_IF_OTHER, 3816 DEVSTAT_PRIORITY_OTHER); 3817 3818 xbb->xbb_stats_in = devstat_new_entry("xbbi", device_get_unit(xbb->dev), 3819 xbb->sector_size, 3820 DEVSTAT_ALL_SUPPORTED, 3821 DEVSTAT_TYPE_DIRECT 3822 | DEVSTAT_TYPE_IF_OTHER, 3823 DEVSTAT_PRIORITY_OTHER); 3824 /* 3825 * Setup sysctl variables. 3826 */ 3827 xbb_setup_sysctl(xbb); 3828 3829 /* 3830 * Create a taskqueue for doing work that must occur from a 3831 * thread context. 3832 */ 3833 xbb->io_taskqueue = taskqueue_create_fast(device_get_nameunit(dev), 3834 M_NOWAIT, 3835 taskqueue_thread_enqueue, 3836 /*contxt*/&xbb->io_taskqueue); 3837 if (xbb->io_taskqueue == NULL) { 3838 xbb_attach_failed(xbb, error, "Unable to create taskqueue"); 3839 return (ENOMEM); 3840 } 3841 3842 taskqueue_start_threads(&xbb->io_taskqueue, 3843 /*num threads*/1, 3844 /*priority*/PWAIT, 3845 /*thread name*/ 3846 "%s taskq", device_get_nameunit(dev)); 3847 3848 /* Update hot-plug status to satisfy xend. 
*/ 3849 error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev), 3850 "hotplug-status", "connected"); 3851 if (error) { 3852 xbb_attach_failed(xbb, error, "writing %s/hotplug-status", 3853 xenbus_get_node(xbb->dev)); 3854 return (error); 3855 } 3856 3857 /* Tell the front end that we are ready to connect. */ 3858 xenbus_set_state(dev, XenbusStateInitWait); 3859 3860 return (0); 3861} 3862 3863/** 3864 * Detach from a block back device instance. 3865 * 3866 * \param dev NewBus device object representing this Xen Block Back instance. 3867 * 3868 * \return 0 for success, errno codes for failure. 3869 * 3870 * \note A block back device may be detached at any time in its life-cycle, 3871 * including part way through the attach process. For this reason, 3872 * initialization order and the intialization state checks in this 3873 * routine must be carefully coupled so that attach time failures 3874 * are gracefully handled. 3875 */ 3876static int 3877xbb_detach(device_t dev) 3878{ 3879 struct xbb_softc *xbb; 3880 3881 DPRINTF("\n"); 3882 3883 xbb = device_get_softc(dev); 3884 mtx_lock(&xbb->lock); 3885 while (xbb_shutdown(xbb) == EAGAIN) { 3886 msleep(xbb, &xbb->lock, /*wakeup prio unchanged*/0, 3887 "xbb_shutdown", 0); 3888 } 3889 mtx_unlock(&xbb->lock); 3890 3891 DPRINTF("\n"); 3892 3893 if (xbb->io_taskqueue != NULL) 3894 taskqueue_free(xbb->io_taskqueue); 3895 3896 if (xbb->xbb_stats != NULL) 3897 devstat_remove_entry(xbb->xbb_stats); 3898 3899 if (xbb->xbb_stats_in != NULL) 3900 devstat_remove_entry(xbb->xbb_stats_in); 3901 3902 xbb_close_backend(xbb); 3903 3904 if (xbb->dev_mode != NULL) { 3905 free(xbb->dev_mode, M_XENSTORE); 3906 xbb->dev_mode = NULL; 3907 } 3908 3909 if (xbb->dev_type != NULL) { 3910 free(xbb->dev_type, M_XENSTORE); 3911 xbb->dev_type = NULL; 3912 } 3913 3914 if (xbb->dev_name != NULL) { 3915 free(xbb->dev_name, M_XENSTORE); 3916 xbb->dev_name = NULL; 3917 } 3918 3919 mtx_destroy(&xbb->lock); 3920 return (0); 3921} 3922 3923/** 3924 * Prepare this block back device for suspension of this VM. 3925 * 3926 * \param dev NewBus device object representing this Xen Block Back instance. 3927 * 3928 * \return 0 for success, errno codes for failure. 3929 */ 3930static int 3931xbb_suspend(device_t dev) 3932{ 3933#ifdef NOT_YET 3934 struct xbb_softc *sc = device_get_softc(dev); 3935 3936 /* Prevent new requests being issued until we fix things up. */ 3937 mtx_lock(&sc->xb_io_lock); 3938 sc->connected = BLKIF_STATE_SUSPENDED; 3939 mtx_unlock(&sc->xb_io_lock); 3940#endif 3941 3942 return (0); 3943} 3944 3945/** 3946 * Perform any processing required to recover from a suspended state. 3947 * 3948 * \param dev NewBus device object representing this Xen Block Back instance. 3949 * 3950 * \return 0 for success, errno codes for failure. 3951 */ 3952static int 3953xbb_resume(device_t dev) 3954{ 3955 return (0); 3956} 3957 3958/** 3959 * Handle state changes expressed via the XenStore by our front-end peer. 3960 * 3961 * \param dev NewBus device object representing this Xen 3962 * Block Back instance. 3963 * \param frontend_state The new state of the front-end. 3964 * 3965 * \return 0 for success, errno codes for failure. 
3966 */ 3967static void 3968xbb_frontend_changed(device_t dev, XenbusState frontend_state) 3969{ 3970 struct xbb_softc *xbb = device_get_softc(dev); 3971 3972 DPRINTF("frontend_state=%s, xbb_state=%s\n", 3973 xenbus_strstate(frontend_state), 3974 xenbus_strstate(xenbus_get_state(xbb->dev))); 3975 3976 switch (frontend_state) { 3977 case XenbusStateInitialising: 3978 break; 3979 case XenbusStateInitialised: 3980 case XenbusStateConnected: 3981 xbb_connect(xbb); 3982 break; 3983 case XenbusStateClosing: 3984 case XenbusStateClosed: 3985 mtx_lock(&xbb->lock); 3986 xbb_shutdown(xbb); 3987 mtx_unlock(&xbb->lock); 3988 if (frontend_state == XenbusStateClosed) 3989 xenbus_set_state(xbb->dev, XenbusStateClosed); 3990 break; 3991 default: 3992 xenbus_dev_fatal(xbb->dev, EINVAL, "saw state %d at frontend", 3993 frontend_state); 3994 break; 3995 } 3996} 3997 3998/*---------------------------- NewBus Registration ---------------------------*/ 3999static device_method_t xbb_methods[] = { 4000 /* Device interface */ 4001 DEVMETHOD(device_probe, xbb_probe), 4002 DEVMETHOD(device_attach, xbb_attach), 4003 DEVMETHOD(device_detach, xbb_detach), 4004 DEVMETHOD(device_shutdown, bus_generic_shutdown), 4005 DEVMETHOD(device_suspend, xbb_suspend), 4006 DEVMETHOD(device_resume, xbb_resume), 4007 4008 /* Xenbus interface */ 4009 DEVMETHOD(xenbus_otherend_changed, xbb_frontend_changed), 4010 4011 { 0, 0 } 4012}; 4013 4014static driver_t xbb_driver = { 4015 "xbbd", 4016 xbb_methods, 4017 sizeof(struct xbb_softc), 4018}; 4019devclass_t xbb_devclass; 4020 4021DRIVER_MODULE(xbbd, xenbusb_back, xbb_driver, xbb_devclass, 0, 0); 4022