1/*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2009-2012 Spectra Logic Corporation 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions, and the following disclaimer, 12 * without modification. 13 * 2. Redistributions in binary form must reproduce at minimum a disclaimer 14 * substantially similar to the "NO WARRANTY" disclaimer below 15 * ("Disclaimer") and any redistribution must be conditioned upon 16 * including a substantially similar Disclaimer requirement for further 17 * binary redistribution. 18 * 19 * NO WARRANTY 20 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 21 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 22 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR 23 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 24 * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 28 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING 29 * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 30 * POSSIBILITY OF SUCH DAMAGES. 31 * 32 * Authors: Justin T. Gibbs (Spectra Logic Corporation) 33 * Ken Merry (Spectra Logic Corporation) 34 */ 35#include <sys/cdefs.h> 36__FBSDID("$FreeBSD$"); 37 38/** 39 * \file blkback.c 40 * 41 * \brief Device driver supporting the vending of block storage from 42 * a FreeBSD domain to other domains. 
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/malloc.h>

#include <sys/bio.h>
#include <sys/bus.h>
#include <sys/conf.h>
#include <sys/devicestat.h>
#include <sys/disk.h>
#include <sys/fcntl.h>
#include <sys/filedesc.h>
#include <sys/kdb.h>
#include <sys/module.h>
#include <sys/namei.h>
#include <sys/proc.h>
#include <sys/rman.h>
#include <sys/taskqueue.h>
#include <sys/types.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/sysctl.h>
#include <sys/bitstring.h>
#include <sys/sdt.h>

#include <geom/geom.h>

#include <machine/_inttypes.h>

#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_kern.h>

#include <xen/xen-os.h>
#include <xen/blkif.h>
#include <xen/gnttab.h>
#include <xen/xen_intr.h>

#include <xen/interface/event_channel.h>
#include <xen/interface/grant_table.h>

#include <xen/xenbus/xenbusvar.h>

/*--------------------------- Compile-time Tunables --------------------------*/
/**
 * The maximum number of shared memory ring pages we will allow in a
 * negotiated block-front/back communication channel.  Allow enough
 * ring space for all requests to be XBB_MAX_REQUEST_SIZE'd.
 */
#define	XBB_MAX_RING_PAGES		32

/**
 * The maximum number of outstanding request blocks (request headers plus
 * additional segment blocks) we will allow in a negotiated block-front/back
 * communication channel.
 */
#define	XBB_MAX_REQUESTS					\
	__CONST_RING_SIZE(blkif, PAGE_SIZE * XBB_MAX_RING_PAGES)

/**
 * \brief Define to force all I/O to be performed on memory owned by the
 *        backend device, with a copy-in/out to the remote domain's memory.
 *
 * \note  This option is currently required when this driver's domain is
 *        operating in HVM mode on a system using an IOMMU.
 *
 * This driver uses Xen's grant table API to gain access to the memory of
 * the remote domains it serves.  When our domain is operating in PV mode,
 * the grant table mechanism directly updates our domain's page table entries
 * to point to the physical pages of the remote domain.  This scheme guarantees
 * that blkback and the backing devices it uses can safely perform DMA
 * operations to satisfy requests.  In HVM mode, Xen may use a HW IOMMU to
 * ensure that our domain cannot DMA to pages owned by another domain.  As
 * of Xen 4.0, IOMMU mappings for HVM guests are not updated via the grant
 * table API.  For this reason, in HVM mode, we must bounce all requests into
 * memory that is mapped into our domain at domain startup and thus has
 * valid IOMMU mappings.
 */
#define XBB_USE_BOUNCE_BUFFERS

/**
 * \brief Define to enable rudimentary request logging to the console.
 */
#undef XBB_DEBUG

/*---------------------------------- Macros ----------------------------------*/
/**
 * Custom malloc type for all driver allocations.
 */
static MALLOC_DEFINE(M_XENBLOCKBACK, "xbbd", "Xen Block Back Driver Data");

#ifdef XBB_DEBUG
#define DPRINTF(fmt, args...)					\
    printf("xbb(%s:%d): " fmt, __FUNCTION__, __LINE__, ##args)
#else
#define DPRINTF(fmt, args...) do {} while(0)
#endif

/**
 * The maximum mapped region size per request we will allow in a negotiated
 * block-front/back communication channel.
 */
#define	XBB_MAX_REQUEST_SIZE					\
	MIN(MAXPHYS, BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE)

/**
 * The maximum number of segments (within a request header and accompanying
 * segment blocks) per request we will allow in a negotiated block-front/back
 * communication channel.
 */
#define	XBB_MAX_SEGMENTS_PER_REQUEST				\
	(MIN(UIO_MAXIOV,					\
	     MIN(BLKIF_MAX_SEGMENTS_PER_REQUEST,		\
		 (XBB_MAX_REQUEST_SIZE / PAGE_SIZE) + 1)))

/**
 * The maximum number of ring pages that we can allow per request list.
 * We limit this to the maximum number of segments per request, because
 * that is already a reasonable number of segments to aggregate.  This
 * number should never be smaller than XBB_MAX_SEGMENTS_PER_REQUEST,
 * because that would leave situations where we can't dispatch even one
 * large request.
 */
#define	XBB_MAX_SEGMENTS_PER_REQLIST XBB_MAX_SEGMENTS_PER_REQUEST

/*--------------------------- Forward Declarations ---------------------------*/
struct xbb_softc;
struct xbb_xen_req;

static void xbb_attach_failed(struct xbb_softc *xbb, int err, const char *fmt,
			      ...) __attribute__((format(printf, 3, 4)));
static int  xbb_shutdown(struct xbb_softc *xbb);

/*------------------------------ Data Structures -----------------------------*/

STAILQ_HEAD(xbb_xen_req_list, xbb_xen_req);

typedef enum {
	XBB_REQLIST_NONE	= 0x00,
	XBB_REQLIST_MAPPED	= 0x01
} xbb_reqlist_flags;

struct xbb_xen_reqlist {
	/**
	 * Back reference to the parent block back instance for this
	 * request.  Used during bio_done handling.
	 */
	struct xbb_softc        *xbb;

	/**
	 * BLKIF_OP code for this request.
	 */
	int			 operation;

	/**
	 * Set to BLKIF_RSP_* to indicate request status.
	 *
	 * This field allows an error status to be recorded even if the
	 * delivery of this status must be deferred.  Deferred reporting
	 * is necessary, for example, when an error is detected during
	 * completion processing of one bio when other bios for this
	 * request are still outstanding.
	 */
	int			 status;

	/**
	 * Number of 512 byte sectors not transferred.
	 */
	int			 residual_512b_sectors;

	/**
	 * Starting sector number of the first request in the list.
	 */
	off_t			 starting_sector_number;

	/**
	 * If we're going to coalesce, the next contiguous sector would be
	 * this one.
	 */
	off_t			 next_contig_sector;

	/**
	 * Number of child requests in the list.
	 */
	int			 num_children;

	/**
	 * Number of I/O requests still pending on the backend.
	 */
	int			 pendcnt;

	/**
	 * Total number of segments for requests in the list.
	 */
	int			 nr_segments;

	/**
	 * Flags for this particular request list.
	 */
	xbb_reqlist_flags	 flags;

	/**
	 * Kernel virtual address space reserved for this request
	 * list structure and used to map the remote domain's pages for
	 * this I/O, into our domain's address space.
	 */
	uint8_t			*kva;

	/**
	 * Base, pseudo-physical address, corresponding to the start
	 * of this request's kva region.
	 */
	uint64_t		 gnt_base;


#ifdef XBB_USE_BOUNCE_BUFFERS
	/**
	 * Pre-allocated domain local memory used to proxy remote
	 * domain memory during I/O operations.
	 */
	uint8_t			*bounce;
#endif

	/**
	 * Array of grant handles (one per page) used to map this request.
	 */
	grant_handle_t		*gnt_handles;

	/**
	 * Device statistics request ordering type (ordered or simple).
	 */
	devstat_tag_type	 ds_tag_type;

	/**
	 * Device statistics request type (read, write, no_data).
	 */
	devstat_trans_flags	 ds_trans_type;

	/**
	 * The start time for this request.
	 */
	struct bintime		 ds_t0;

	/**
	 * Linked list of contiguous requests with the same operation type.
	 */
	struct xbb_xen_req_list	 contig_req_list;

	/**
	 * Linked list links used to aggregate idle requests in the
	 * request list free pool (xbb->reqlist_free_stailq) and pending
	 * requests waiting for execution (xbb->reqlist_pending_stailq).
	 */
	STAILQ_ENTRY(xbb_xen_reqlist) links;
};

STAILQ_HEAD(xbb_xen_reqlist_list, xbb_xen_reqlist);

/**
 * \brief Object tracking an in-flight I/O from a Xen VBD consumer.
 */
struct xbb_xen_req {
	/**
	 * Linked list links used to aggregate requests into a reqlist
	 * and to store them in the request free pool.
	 */
	STAILQ_ENTRY(xbb_xen_req) links;

	/**
	 * The remote domain's identifier for this I/O request.
	 */
	uint64_t		  id;

	/**
	 * The number of pages currently mapped for this request.
	 */
	int			  nr_pages;

	/**
	 * The number of 512 byte sectors comprising this request.
	 */
	int			  nr_512b_sectors;

	/**
	 * BLKIF_OP code for this request.
	 */
	int			  operation;

	/**
	 * Storage used for non-native ring requests.
	 */
	blkif_request_t		  ring_req_storage;

	/**
	 * Pointer to the Xen request in the ring.
	 */
	blkif_request_t		 *ring_req;

	/**
	 * Consumer index for this request.
	 */
	RING_IDX		  req_ring_idx;

	/**
	 * The start time for this request.
	 */
	struct bintime		  ds_t0;

	/**
	 * Pointer back to our parent request list.
	 */
	struct xbb_xen_reqlist	 *reqlist;
};
SLIST_HEAD(xbb_xen_req_slist, xbb_xen_req);

/**
 * \brief Configuration data for the shared memory request ring
 *        used to communicate with the front-end client of
 *        this driver.
 */
struct xbb_ring_config {
	/** KVA address where ring memory is mapped. */
	vm_offset_t	va;

	/** The pseudo-physical address where ring memory is mapped.*/
	uint64_t	gnt_addr;

	/**
	 * Grant table handles, one per-ring page, returned by the
	 * hypervisor upon mapping of the ring and required to
	 * unmap it when a connection is torn down.
	 */
	grant_handle_t	handle[XBB_MAX_RING_PAGES];

	/**
	 * The device bus address returned by the hypervisor when
	 * mapping the ring and required to unmap it when a connection
	 * is torn down.
	 */
	uint64_t	bus_addr[XBB_MAX_RING_PAGES];

	/** The number of ring pages mapped for the current connection. */
	u_int		ring_pages;

	/**
	 * The grant references, one per-ring page, supplied by the
	 * front-end, allowing us to reference the ring pages in the
	 * front-end's domain and to map these pages into our own domain.
	 */
	grant_ref_t	ring_ref[XBB_MAX_RING_PAGES];

	/** The interrupt driven event channel used to signal ring events. */
	evtchn_port_t   evtchn;
};

/**
 * Per-instance connection state flags.
 */
typedef enum
{
	/**
	 * The front-end requested a read-only mount of the
	 * back-end device/file.
	 */
	XBBF_READ_ONLY         = 0x01,

	/** Communication with the front-end has been established. */
	XBBF_RING_CONNECTED    = 0x02,

	/**
	 * Front-end requests exist in the ring and are waiting for
	 * xbb_xen_req objects to free up.
	 */
	XBBF_RESOURCE_SHORTAGE = 0x04,

	/** Connection teardown in progress. */
	XBBF_SHUTDOWN          = 0x08,

	/** A thread is already performing shutdown processing. */
	XBBF_IN_SHUTDOWN       = 0x10
} xbb_flag_t;

/** Backend device type. */
typedef enum {
	/** Backend type unknown. */
	XBB_TYPE_NONE		= 0x00,

	/**
	 * Backend type disk (access via cdev switch
	 * strategy routine).
	 */
	XBB_TYPE_DISK		= 0x01,

	/** Backend type file (access via vnode operations). */
	XBB_TYPE_FILE		= 0x02
} xbb_type;

/**
 * \brief Structure used to memoize information about a per-request
 *        scatter-gather list.
 *
 * The chief benefit of using this data structure is it avoids having
 * to reparse the possibly discontiguous S/G list in the original
 * request.  Due to the way that the mapping of the memory backing an
 * I/O transaction is handled by Xen, a second pass is unavoidable.
 * At least this way the second walk is a simple array traversal.
 *
 * \note A single Scatter/Gather element in the block interface covers
 *       at most 1 machine page.  In this context a sector (blkif
 *       nomenclature, not what I'd choose) is a 512b aligned unit
 *       of mapping within the machine page referenced by an S/G
 *       element.
 */
struct xbb_sg {
	/** The number of 512b data chunks mapped in this S/G element. */
	int16_t nsect;

	/**
	 * The index (0 based) of the first 512b data chunk mapped
	 * in this S/G element.
	 */
	uint8_t first_sect;

	/**
	 * The index (0 based) of the last 512b data chunk mapped
	 * in this S/G element.
	 */
	uint8_t last_sect;
};

/**
 * Character device backend specific configuration data.
 */
struct xbb_dev_data {
	/** Cdev used for device backend access.  */
	struct cdev   *cdev;

	/** Cdev switch used for device backend access.  */
	struct cdevsw *csw;

	/** Used to hold a reference on opened cdev backend devices. */
	int	       dev_ref;
};

/**
 * File backend specific configuration data.
 */
struct xbb_file_data {
	/** Credentials to use for vnode backed (file based) I/O. */
	struct ucred   *cred;

	/**
	 * \brief Array of io vectors used to process file based I/O.
	 *
	 * Only a single file based request is outstanding per-xbb instance,
	 * so we only need one of these.
	 */
	struct iovec	xiovecs[XBB_MAX_SEGMENTS_PER_REQLIST];
#ifdef XBB_USE_BOUNCE_BUFFERS

	/**
	 * \brief Array of io vectors used to handle bouncing of file reads.
	 *
	 * Vnode operations are free to modify uio data during their
	 * execution.  In the case of a read with bounce buffering active,
	 * we need some of the data from the original uio in order to
	 * bounce-out the read data.  This array serves as the temporary
	 * storage for this saved data.
	 */
	struct iovec	saved_xiovecs[XBB_MAX_SEGMENTS_PER_REQLIST];

	/**
	 * \brief Array of memoized bounce buffer kva offsets used
	 *        in the file based backend.
	 *
	 * Due to the way that the mapping of the memory backing an
	 * I/O transaction is handled by Xen, a second pass through
	 * the request sg elements is unavoidable.  We memoize the computed
	 * bounce address here to reduce the cost of the second walk.
	 */
	void	       *xiovecs_vaddr[XBB_MAX_SEGMENTS_PER_REQLIST];
#endif /* XBB_USE_BOUNCE_BUFFERS */
};

/**
 * Collection of backend type specific data.
 */
union xbb_backend_data {
	struct xbb_dev_data  dev;
	struct xbb_file_data file;
};

/**
 * Function signature of backend specific I/O handlers.
 */
typedef int (*xbb_dispatch_t)(struct xbb_softc *xbb,
			      struct xbb_xen_reqlist *reqlist, int operation,
			      int flags);

/**
 * Per-instance configuration data.
 */
struct xbb_softc {

	/**
	 * Task-queue used to process I/O requests.
	 */
	struct taskqueue	 *io_taskqueue;

	/**
	 * Single "run the request queue" task enqueued
	 * on io_taskqueue.
	 */
	struct task		  io_task;

	/** Device type for this instance. */
	xbb_type		  device_type;

	/** NewBus device corresponding to this instance. */
	device_t		  dev;

	/** Backend specific dispatch routine for this instance. */
	xbb_dispatch_t		  dispatch_io;

	/** The number of requests outstanding on the backend device/file. */
	int			  active_request_count;

	/** Free pool of request tracking structures. */
	struct xbb_xen_req_list   request_free_stailq;

	/** Array, sized at connection time, of request tracking structures.
	 */
	struct xbb_xen_req	 *requests;

	/** Free pool of request list structures. */
	struct xbb_xen_reqlist_list reqlist_free_stailq;

	/** List of pending request lists awaiting execution. */
	struct xbb_xen_reqlist_list reqlist_pending_stailq;

	/** Array, sized at connection time, of request list structures. */
	struct xbb_xen_reqlist	 *request_lists;

	/**
	 * Global pool of kva used for mapping remote domain ring
	 * and I/O transaction data.
	 */
	vm_offset_t		  kva;

	/** Pseudo-physical address corresponding to kva. */
	uint64_t		  gnt_base_addr;

	/** The size of the global kva pool. */
	int			  kva_size;

	/** The size of the KVA area used for request lists. */
	int			  reqlist_kva_size;

	/** The number of pages of KVA used for request lists. */
	int			  reqlist_kva_pages;

	/** Bitmap of free KVA pages. */
	bitstr_t		 *kva_free;

	/**
	 * \brief Cached value of the front-end's domain id.
	 *
	 * This value is used at once for each mapped page in
	 * a transaction.  We cache it to avoid incurring the
	 * cost of an ivar access every time this is needed.
	 */
	domid_t			  otherend_id;

	/**
	 * \brief The blkif protocol abi in effect.
	 *
	 * There are situations where the back and front ends can
	 * have a different, native abi (e.g. intel x86_64 and
	 * 32bit x86 domains on the same machine).  The back-end
	 * always accommodates the front-end's native abi.  That
	 * value is pulled from the XenStore and recorded here.
	 */
	int			  abi;

	/**
	 * \brief The maximum number of requests and request lists allowed
	 *        to be in flight at a time.
	 *
	 * This value is negotiated via the XenStore.
	 */
	u_int			  max_requests;

	/**
	 * \brief The maximum number of segments (1 page per segment)
	 *        that can be mapped by a request.
	 *
	 * This value is negotiated via the XenStore.
	 */
	u_int			  max_request_segments;

	/**
	 * \brief Maximum number of segments per request list.
	 *
	 * This value is derived from and will generally be larger than
	 * max_request_segments.
	 */
	u_int			  max_reqlist_segments;

	/**
	 * The maximum size of any request to this back-end
	 * device.
	 *
	 * This value is negotiated via the XenStore.
	 */
	u_int			  max_request_size;

	/**
	 * The maximum size of any request list.  This is derived directly
	 * from max_reqlist_segments.
	 */
	u_int			  max_reqlist_size;

	/** Various configuration and state bit flags. */
	xbb_flag_t		  flags;

	/** Ring mapping and interrupt configuration data. */
	struct xbb_ring_config	  ring_config;

	/** Runtime, cross-abi safe, structures for ring access. */
	blkif_back_rings_t	  rings;

	/** IRQ mapping for the communication ring event channel. */
	xen_intr_handle_t	  xen_intr_handle;

	/**
	 * \brief Backend access mode flags (e.g. write, or read-only).
	 *
	 * This value is passed to us by the front-end via the XenStore.
	 */
	char			 *dev_mode;

	/**
	 * \brief Backend device type (e.g. "disk", "cdrom", "floppy").
	 *
	 * This value is passed to us by the front-end via the XenStore.
	 * Currently unused.
	 */
	char			 *dev_type;

	/**
	 * \brief Backend device/file identifier.
	 *
	 * This value is passed to us by the front-end via the XenStore.
	 * We expect this to be a POSIX path indicating the file or
	 * device to open.
	 */
	char			 *dev_name;

	/**
	 * Vnode corresponding to the backend device node or file
	 * we are accessing.
	 */
	struct vnode		 *vn;

	union xbb_backend_data	  backend;

	/** The native sector size of the backend. */
	u_int			  sector_size;

	/** log2 of sector_size. */
	u_int			  sector_size_shift;

	/** Size in bytes of the backend device or file. */
	off_t			  media_size;

	/**
	 * \brief media_size expressed in terms of the backend native
	 *        sector size.
	 *
	 * (e.g. xbb->media_size >> xbb->sector_size_shift).
	 */
	uint64_t		  media_num_sectors;

	/**
	 * \brief Array of memoized scatter gather data computed during the
	 *        conversion of blkif ring requests to internal xbb_xen_req
	 *        structures.
	 *
	 * Ring processing is serialized so we only need one of these.
	 */
	struct xbb_sg		  xbb_sgs[XBB_MAX_SEGMENTS_PER_REQLIST];

	/**
	 * Temporary grant table map used in xbb_dispatch_io().  When
	 * XBB_MAX_SEGMENTS_PER_REQLIST gets large, keeping this on the
	 * stack could cause a stack overflow.
	 */
	struct gnttab_map_grant_ref maps[XBB_MAX_SEGMENTS_PER_REQLIST];

	/** Mutex protecting per-instance data. */
	struct mtx		  lock;

	/**
	 * Resource representing allocated physical address space
	 * associated with our per-instance kva region.
	 */
	struct resource		 *pseudo_phys_res;

	/** Resource id for allocated physical address space. */
	int			  pseudo_phys_res_id;

	/**
	 * I/O statistics from BlockBack dispatch down.  These are
	 * coalesced requests, and we start them right before execution.
	 */
	struct devstat		 *xbb_stats;

	/**
	 * I/O statistics coming into BlockBack.  These are the requests as
	 * we get them from BlockFront.  They are started as soon as we
	 * receive a request, and completed when the I/O is complete.
	 */
	struct devstat		 *xbb_stats_in;

	/** Disable sending flush to the backend. */
	int			  disable_flush;

	/** Send a real flush for every N flush requests. */
	int			  flush_interval;

	/** Count of flush requests in the interval. */
	int			  flush_count;

	/** Don't coalesce requests if this is set. */
	int			  no_coalesce_reqs;

	/** Number of requests we have received. */
	uint64_t		  reqs_received;

	/** Number of requests we have completed. */
	uint64_t		  reqs_completed;

	/** Number of requests we queued but not pushed. */
	uint64_t		  reqs_queued_for_completion;

	/** Number of requests we completed with an error status. */
	uint64_t		  reqs_completed_with_error;

	/** How many forced dispatches (i.e. without coalescing) have happened. */
	uint64_t		  forced_dispatch;

	/** How many normal dispatches have happened. */
	uint64_t		  normal_dispatch;

	/** How many total dispatches have happened. */
	uint64_t		  total_dispatch;

	/** How many times we have run out of KVA. */
	uint64_t		  kva_shortages;

	/** How many times we have run out of request structures. */
	uint64_t		  request_shortages;

	/** Watch to wait for hotplug script execution. */
	struct xs_watch		  hotplug_watch;

	/** Got the needed data from hotplug scripts? */
	bool			  hotplug_done;
};

/*---------------------------- Request Processing ----------------------------*/
/**
 * Allocate an internal transaction tracking structure from the free pool.
 *
 * \param xbb  Per-instance xbb configuration structure.
 *
 * \note  The caller must hold xbb->lock.
 *
 * \return  On success, a pointer to the allocated xbb_xen_req structure.
 *          Otherwise NULL.
822 */ 823static inline struct xbb_xen_req * 824xbb_get_req(struct xbb_softc *xbb) 825{ 826 struct xbb_xen_req *req; 827 828 req = NULL; 829 830 mtx_assert(&xbb->lock, MA_OWNED); 831 832 if ((req = STAILQ_FIRST(&xbb->request_free_stailq)) != NULL) { 833 STAILQ_REMOVE_HEAD(&xbb->request_free_stailq, links); 834 xbb->active_request_count++; 835 } 836 837 return (req); 838} 839 840/** 841 * Return an allocated transaction tracking structure to the free pool. 842 * 843 * \param xbb Per-instance xbb configuration structure. 844 * \param req The request structure to free. 845 */ 846static inline void 847xbb_release_req(struct xbb_softc *xbb, struct xbb_xen_req *req) 848{ 849 mtx_assert(&xbb->lock, MA_OWNED); 850 851 STAILQ_INSERT_HEAD(&xbb->request_free_stailq, req, links); 852 xbb->active_request_count--; 853 854 KASSERT(xbb->active_request_count >= 0, 855 ("xbb_release_req: negative active count")); 856} 857 858/** 859 * Return an xbb_xen_req_list of allocated xbb_xen_reqs to the free pool. 860 * 861 * \param xbb Per-instance xbb configuration structure. 862 * \param req_list The list of requests to free. 863 * \param nreqs The number of items in the list. 864 */ 865static inline void 866xbb_release_reqs(struct xbb_softc *xbb, struct xbb_xen_req_list *req_list, 867 int nreqs) 868{ 869 mtx_assert(&xbb->lock, MA_OWNED); 870 871 STAILQ_CONCAT(&xbb->request_free_stailq, req_list); 872 xbb->active_request_count -= nreqs; 873 874 KASSERT(xbb->active_request_count >= 0, 875 ("xbb_release_reqs: negative active count")); 876} 877 878/** 879 * Given a page index and 512b sector offset within that page, 880 * calculate an offset into a request's kva region. 881 * 882 * \param reqlist The request structure whose kva region will be accessed. 883 * \param pagenr The page index used to compute the kva offset. 884 * \param sector The 512b sector index used to compute the page relative 885 * kva offset. 886 * 887 * \return The computed global KVA offset. 
 */
static inline uint8_t *
xbb_reqlist_vaddr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector)
{
	/* (sector << 9) converts a 512b sector index to a byte offset. */
	return (reqlist->kva + (PAGE_SIZE * pagenr) + (sector << 9));
}

#ifdef XBB_USE_BOUNCE_BUFFERS
/**
 * Given a page index and 512b sector offset within that page,
 * calculate an offset into a request's local bounce memory region.
 *
 * \param reqlist  The request structure whose bounce region will be accessed.
 * \param pagenr   The page index used to compute the bounce offset.
 * \param sector   The 512b sector index used to compute the page relative
 *                 bounce offset.
 *
 * \return  The computed global bounce buffer address.
 */
static inline uint8_t *
xbb_reqlist_bounce_addr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector)
{
	return (reqlist->bounce + (PAGE_SIZE * pagenr) + (sector << 9));
}
#endif

/**
 * Given a page number and 512b sector offset within that page,
 * calculate an offset into the request's memory region that the
 * underlying backend device/file should use for I/O.
 *
 * \param reqlist  The request structure whose I/O region will be accessed.
 * \param pagenr   The page index used to compute the I/O offset.
 * \param sector   The 512b sector index used to compute the page relative
 *                 I/O offset.
 *
 * \return  The computed global I/O address.
 *
 * Depending on configuration, this will either be a local bounce buffer
 * or a pointer to the memory mapped in from the front-end domain for
 * this request.
 */
static inline uint8_t *
xbb_reqlist_ioaddr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector)
{
#ifdef XBB_USE_BOUNCE_BUFFERS
	return (xbb_reqlist_bounce_addr(reqlist, pagenr, sector));
#else
	return (xbb_reqlist_vaddr(reqlist, pagenr, sector));
#endif
}

/**
 * Given a page index and 512b sector offset within that page, calculate
 * an offset into the local pseudo-physical address space used to map a
 * front-end's request data into a request.
 *
 * \param reqlist  The request list structure whose pseudo-physical region
 *                 will be accessed.
 * \param pagenr   The page index used to compute the pseudo-physical offset.
 * \param sector   The 512b sector index used to compute the page relative
 *                 pseudo-physical offset.
 *
 * \return  The computed global pseudo-physical address.
 */
static inline uintptr_t
xbb_get_gntaddr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector)
{
	struct xbb_softc *xbb;

	xbb = reqlist->xbb;

	/*
	 * The reqlist's offset within the global kva pool also gives its
	 * offset within the pseudo-physical region based at gnt_base_addr.
	 */
	return ((uintptr_t)(xbb->gnt_base_addr +
		(uintptr_t)(reqlist->kva - xbb->kva) +
		(PAGE_SIZE * pagenr) + (sector << 9)));
}

/**
 * Get Kernel Virtual Address space for mapping requests.
 *
 * \param xbb       Per-instance xbb configuration structure.
 * \param nr_pages  Number of pages needed.
 *
 * \return  On success, a pointer to the allocated KVA region.  Otherwise NULL.
 *
 * Note: This should be unnecessary once we have either chaining or
 * scatter/gather support for struct bio.
 * At that point we'll be able to
 * put multiple addresses and lengths in one bio/bio chain and won't need
 * to map everything into one virtual segment.
 */
static uint8_t *
xbb_get_kva(struct xbb_softc *xbb, int nr_pages)
{
	int	 first_clear;
	int	 num_clear;
	uint8_t *free_kva;
	int      i;

	KASSERT(nr_pages != 0, ("xbb_get_kva of zero length"));

	first_clear = 0;
	free_kva    = NULL;

	/* Unlike xbb_free_kva(), this routine acquires the lock itself. */
	mtx_lock(&xbb->lock);

	/*
	 * Look for the first available page.  If there are none, we're done.
	 */
	bit_ffc(xbb->kva_free, xbb->reqlist_kva_pages, &first_clear);

	if (first_clear == -1)
		goto bailout;

	/*
	 * Starting at the first available page, look for consecutive free
	 * pages that will satisfy the user's request.
	 */
	for (i = first_clear, num_clear = 0; i < xbb->reqlist_kva_pages; i++) {
		/*
		 * If this is true, the page is used, so we have to reset
		 * the number of clear pages and the first clear page
		 * (since it pointed to a region with an insufficient number
		 * of clear pages).
		 */
		if (bit_test(xbb->kva_free, i)) {
			num_clear = 0;
			first_clear = -1;
			continue;
		}

		if (first_clear == -1)
			first_clear = i;

		/*
		 * If this is true, we've found a large enough free region
		 * to satisfy the request.
		 */
		if (++num_clear == nr_pages) {

			/* Mark the run allocated before dropping the lock. */
			bit_nset(xbb->kva_free, first_clear,
				 first_clear + nr_pages - 1);

			free_kva = xbb->kva +
				(uint8_t *)((intptr_t)first_clear * PAGE_SIZE);

			KASSERT(free_kva >= (uint8_t *)xbb->kva &&
				free_kva + (nr_pages * PAGE_SIZE) <=
				(uint8_t *)xbb->ring_config.va,
				("Free KVA %p len %d out of range, "
				 "kva = %#jx, ring VA = %#jx\n", free_kva,
				 nr_pages * PAGE_SIZE, (uintmax_t)xbb->kva,
				 (uintmax_t)xbb->ring_config.va));
			break;
		}
	}

bailout:

	if (free_kva == NULL) {
		/*
		 * Record the shortage; the flag lets the completion path
		 * know a retry is wanted once KVA frees up.
		 */
		xbb->flags |= XBBF_RESOURCE_SHORTAGE;
		xbb->kva_shortages++;
	}

	mtx_unlock(&xbb->lock);

	return (free_kva);
}

/**
 * Free allocated KVA.
 *
 * \param xbb       Per-instance xbb configuration structure.
 * \param kva_ptr   Pointer to allocated KVA region.
 * \param nr_pages  Number of pages in the KVA region.
 *
 * \note  The caller must hold xbb->lock; note the asymmetry with
 *        xbb_get_kva(), which acquires the lock itself.
 */
static void
xbb_free_kva(struct xbb_softc *xbb, uint8_t *kva_ptr, int nr_pages)
{
	intptr_t start_page;

	mtx_assert(&xbb->lock, MA_OWNED);

	start_page = (intptr_t)(kva_ptr - xbb->kva) >> PAGE_SHIFT;
	bit_nclear(xbb->kva_free, start_page, start_page + nr_pages - 1);

}

/**
 * Unmap the front-end pages associated with this I/O request.
 *
 * \param reqlist  The request list structure to unmap.
 */
static void
xbb_unmap_reqlist(struct xbb_xen_reqlist *reqlist)
{
	struct gnttab_unmap_grant_ref unmap[XBB_MAX_SEGMENTS_PER_REQLIST];
	u_int i;
	u_int invcount;
	int error;

	/*
	 * Batch all valid grant handles into a single unmap hypercall.
	 * Handles are invalidated as we go so a second unmap is a no-op.
	 */
	invcount = 0;
	for (i = 0; i < reqlist->nr_segments; i++) {

		if (reqlist->gnt_handles[i] == GRANT_REF_INVALID)
			continue;

		unmap[invcount].host_addr = xbb_get_gntaddr(reqlist, i, 0);
		unmap[invcount].dev_bus_addr = 0;
		unmap[invcount].handle = reqlist->gnt_handles[i];
		reqlist->gnt_handles[i] = GRANT_REF_INVALID;
		invcount++;
	}

	/* error is only consumed by the KASSERT under INVARIANTS. */
	error = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
					  unmap, invcount);
	KASSERT(error == 0, ("Grant table operation failed"));
}

/**
 * Allocate an internal transaction tracking structure from the free pool.
 *
 * \param xbb  Per-instance xbb configuration structure.
 *
 * \return  On success, a pointer to the allocated xbb_xen_reqlist structure.
 *          Otherwise NULL.
 */
static inline struct xbb_xen_reqlist *
xbb_get_reqlist(struct xbb_softc *xbb)
{
	struct xbb_xen_reqlist *reqlist;

	reqlist = NULL;

	mtx_assert(&xbb->lock, MA_OWNED);

	if ((reqlist = STAILQ_FIRST(&xbb->reqlist_free_stailq)) != NULL) {

		/* Reset all per-transaction state before handing it out. */
		STAILQ_REMOVE_HEAD(&xbb->reqlist_free_stailq, links);
		reqlist->flags = XBB_REQLIST_NONE;
		reqlist->kva = NULL;
		reqlist->status = BLKIF_RSP_OKAY;
		reqlist->residual_512b_sectors = 0;
		reqlist->num_children = 0;
		reqlist->nr_segments = 0;
		STAILQ_INIT(&reqlist->contig_req_list);
	}

	return (reqlist);
}

/**
 * Return an allocated transaction tracking structure to the free pool.
 *
 * \param xbb     Per-instance xbb configuration structure.
 * \param req     The request list structure to free.
 * \param wakeup  If set, wakeup the work thread if freeing this reqlist
 *                during a resource shortage condition.
 */
static inline void
xbb_release_reqlist(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist,
		    int wakeup)
{

	mtx_assert(&xbb->lock, MA_OWNED);

	/*
	 * Repurpose 'wakeup': only wake the work thread if a shortage was
	 * actually pending, and clear the shortage flag now that resources
	 * are being returned.
	 */
	if (wakeup) {
		wakeup = xbb->flags & XBBF_RESOURCE_SHORTAGE;
		xbb->flags &= ~XBBF_RESOURCE_SHORTAGE;
	}

	if (reqlist->kva != NULL)
		xbb_free_kva(xbb, reqlist->kva, reqlist->nr_segments);

	xbb_release_reqs(xbb, &reqlist->contig_req_list, reqlist->num_children);

	STAILQ_INSERT_TAIL(&xbb->reqlist_free_stailq, reqlist, links);

	if ((xbb->flags & XBBF_SHUTDOWN) != 0) {
		/*
		 * Shutdown is in progress.  See if we can
		 * progress further now that one more request
		 * has completed and been returned to the
		 * free pool.
		 */
		xbb_shutdown(xbb);
	}

	if (wakeup != 0)
		taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task);
}

/**
 * Request resources and do basic request setup.
 *
 * \param xbb         Per-instance xbb configuration structure.
 * \param reqlist     Pointer to reqlist pointer.
 * \param ring_req    Pointer to a block ring request.
 * \param ring_index  The ring index of this request.
 *
 * \return  0 for success, non-zero for failure.
 */
static int
xbb_get_resources(struct xbb_softc *xbb, struct xbb_xen_reqlist **reqlist,
		  blkif_request_t *ring_req, RING_IDX ring_idx)
{
	struct xbb_xen_reqlist *nreqlist;
	struct xbb_xen_req *nreq;

	nreqlist = NULL;
	nreq = NULL;

	mtx_lock(&xbb->lock);

	/*
	 * We don't allow new resources to be allocated if we're in the
	 * process of shutting down.
	 */
	if ((xbb->flags & XBBF_SHUTDOWN) != 0) {
		mtx_unlock(&xbb->lock);
		return (1);
	}

	/*
	 * Allocate a reqlist if the caller doesn't have one already.
	 */
	if (*reqlist == NULL) {
		nreqlist = xbb_get_reqlist(xbb);
		if (nreqlist == NULL)
			goto bailout_error;
	}

	/* We always allocate a request. */
	nreq = xbb_get_req(xbb);
	if (nreq == NULL)
		goto bailout_error;

	mtx_unlock(&xbb->lock);

	if (*reqlist == NULL) {
		*reqlist = nreqlist;
		nreqlist->operation = ring_req->operation;
		nreqlist->starting_sector_number = ring_req->sector_number;
		STAILQ_INSERT_TAIL(&xbb->reqlist_pending_stailq, nreqlist,
				   links);
	}

	nreq->reqlist = *reqlist;
	nreq->req_ring_idx = ring_idx;
	nreq->id = ring_req->id;
	nreq->operation = ring_req->operation;

	/*
	 * For non-native ABIs keep a private copy of the request; the
	 * in-ring version has a different layout and may be overwritten.
	 */
	if (xbb->abi != BLKIF_PROTOCOL_NATIVE) {
		bcopy(ring_req, &nreq->ring_req_storage, sizeof(*ring_req));
		nreq->ring_req = &nreq->ring_req_storage;
	} else {
		nreq->ring_req = ring_req;
	}

	binuptime(&nreq->ds_t0);
	devstat_start_transaction(xbb->xbb_stats_in, &nreq->ds_t0);
	STAILQ_INSERT_TAIL(&(*reqlist)->contig_req_list, nreq, links);
	(*reqlist)->num_children++;
	(*reqlist)->nr_segments += ring_req->nr_segments;

	return (0);

bailout_error:

	/*
	 * We're out of resources, so set the shortage flag.  The next time
	 * a request is released, we'll try waking up the work thread to
	 * see if we can allocate more resources.
	 */
	xbb->flags |= XBBF_RESOURCE_SHORTAGE;
	xbb->request_shortages++;

	if (nreq != NULL)
		xbb_release_req(xbb, nreq);

	if (nreqlist != NULL)
		xbb_release_reqlist(xbb, nreqlist, /*wakeup*/ 0);

	mtx_unlock(&xbb->lock);

	return (1);
}

/**
 * Create and queue a response to a blkif request.
 *
 * \param xbb     Per-instance xbb configuration structure.
 * \param req     The request structure to which to respond.
 * \param status  The status code to report.
 *                See BLKIF_RSP_*
 *                in sys/xen/interface/io/blkif.h.
 */
static void
xbb_queue_response(struct xbb_softc *xbb, struct xbb_xen_req *req, int status)
{
	blkif_response_t *resp;

	/*
	 * The mutex is required here, and should be held across this call
	 * until after the subsequent call to xbb_push_responses().  This
	 * is to guarantee that another context won't queue responses and
	 * push them while we're active.
	 *
	 * That could lead to the other end being notified of responses
	 * before the resources have been freed on this end.  The other end
	 * would then be able to queue additional I/O, and we may run out
	 * of resources because we haven't freed them all yet.
	 */
	mtx_assert(&xbb->lock, MA_OWNED);

	/*
	 * Place on the response ring for the relevant domain.
	 * For now, only the spacing between entries is different
	 * in the different ABIs, not the response entry layout.
	 */
	switch (xbb->abi) {
	case BLKIF_PROTOCOL_NATIVE:
		resp = RING_GET_RESPONSE(&xbb->rings.native,
					 xbb->rings.native.rsp_prod_pvt);
		break;
	case BLKIF_PROTOCOL_X86_32:
		resp = (blkif_response_t *)
		    RING_GET_RESPONSE(&xbb->rings.x86_32,
				      xbb->rings.x86_32.rsp_prod_pvt);
		break;
	case BLKIF_PROTOCOL_X86_64:
		resp = (blkif_response_t *)
		    RING_GET_RESPONSE(&xbb->rings.x86_64,
				      xbb->rings.x86_64.rsp_prod_pvt);
		break;
	default:
		panic("Unexpected blkif protocol ABI.");
	}

	resp->id = req->id;
	resp->operation = req->operation;
	resp->status = status;

	if (status != BLKIF_RSP_OKAY)
		xbb->reqs_completed_with_error++;

	/*
	 * Advance only the private producer index; the shared index is
	 * published later by xbb_push_responses().
	 */
	xbb->rings.common.rsp_prod_pvt++;

	xbb->reqs_queued_for_completion++;

}

/**
 * Send queued responses to blkif requests.
 *
 * \param xbb  Per-instance xbb configuration structure.
 * \param run_taskqueue  Flag that is set to 1 if the taskqueue
 *                       should be run, 0 if it does not need to be run.
 * \param notify         Flag that is set to 1 if the other end should be
 *                       notified via irq, 0 if the other end should not be
 *                       notified.
 */
static void
xbb_push_responses(struct xbb_softc *xbb, int *run_taskqueue, int *notify)
{
	int more_to_do;

	/*
	 * The mutex is required here.
	 */
	mtx_assert(&xbb->lock, MA_OWNED);

	more_to_do = 0;

	/* Publish queued responses and decide whether to interrupt the peer. */
	RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xbb->rings.common, *notify);

	if (xbb->rings.common.rsp_prod_pvt == xbb->rings.common.req_cons) {

		/*
		 * Tail check for pending requests. Allows frontend to avoid
		 * notifications if requests are already in flight (lower
		 * overheads and promotes batching).
		 */
		RING_FINAL_CHECK_FOR_REQUESTS(&xbb->rings.common, more_to_do);
	} else if (RING_HAS_UNCONSUMED_REQUESTS(&xbb->rings.common)) {

		more_to_do = 1;
	}

	xbb->reqs_completed += xbb->reqs_queued_for_completion;
	xbb->reqs_queued_for_completion = 0;

	*run_taskqueue = more_to_do;
}

/**
 * Complete a request list.
 *
 * \param xbb      Per-instance xbb configuration structure.
 * \param reqlist  Allocated internal request list structure.
 */
static void
xbb_complete_reqlist(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist)
{
	struct xbb_xen_req *nreq;
	off_t sectors_sent;
	int notify, run_taskqueue;

	sectors_sent = 0;

	/* Unmap front-end grant pages before responding. */
	if (reqlist->flags & XBB_REQLIST_MAPPED)
		xbb_unmap_reqlist(reqlist);

	mtx_lock(&xbb->lock);

	/*
	 * All I/O is done, send the response. A lock is not necessary
	 * to protect the request list, because all requests have
	 * completed.  Therefore this is the only context accessing this
	 * reqlist right now.  However, in order to make sure that no one
	 * else queues responses onto the queue or pushes them to the other
	 * side while we're active, we need to hold the lock across the
	 * calls to xbb_queue_response() and xbb_push_responses().
	 */
	STAILQ_FOREACH(nreq, &reqlist->contig_req_list, links) {
		off_t cur_sectors_sent;

		/* Put this response on the ring, but don't push yet */
		xbb_queue_response(xbb, nreq, reqlist->status);

		/* We don't report bytes sent if there is an error. */
		if (reqlist->status == BLKIF_RSP_OKAY)
			cur_sectors_sent = nreq->nr_512b_sectors;
		else
			cur_sectors_sent = 0;

		sectors_sent += cur_sectors_sent;

		devstat_end_transaction(xbb->xbb_stats_in,
					/*bytes*/cur_sectors_sent << 9,
					reqlist->ds_tag_type,
					reqlist->ds_trans_type,
					/*now*/NULL,
					/*then*/&nreq->ds_t0);
	}

	/*
	 * Take out any sectors not sent.  If we wind up negative (which
	 * might happen if an error is reported as well as a residual), just
	 * report 0 sectors sent.
	 */
	sectors_sent -= reqlist->residual_512b_sectors;
	if (sectors_sent < 0)
		sectors_sent = 0;

	devstat_end_transaction(xbb->xbb_stats,
				/*bytes*/ sectors_sent << 9,
				reqlist->ds_tag_type,
				reqlist->ds_trans_type,
				/*now*/NULL,
				/*then*/&reqlist->ds_t0);

	xbb_release_reqlist(xbb, reqlist, /*wakeup*/ 1);

	xbb_push_responses(xbb, &run_taskqueue, &notify);

	mtx_unlock(&xbb->lock);

	/* Defer the taskqueue kick and peer notification until unlocked. */
	if (run_taskqueue)
		taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task);

	if (notify)
		xen_intr_signal(xbb->xen_intr_handle);
}

/**
 * Completion handler for buffer I/O requests issued by the device
 * backend driver.
 *
 * \param bio  The buffer I/O request on which to perform completion
 *             processing.
 */
static void
xbb_bio_done(struct bio *bio)
{
	struct xbb_softc *xbb;
	struct xbb_xen_reqlist *reqlist;

	reqlist = bio->bio_caller1;
	xbb = reqlist->xbb;

	/* Accumulate untransferred data in 512-byte sector units. */
	reqlist->residual_512b_sectors += bio->bio_resid >> 9;

	/*
	 * This is a bit imprecise.  With aggregated I/O a single
	 * request list can contain multiple front-end requests and
	 * a multiple bios may point to a single request.  By carefully
	 * walking the request list, we could map residuals and errors
	 * back to the original front-end request, but the interface
	 * isn't sufficiently rich for us to properly report the error.
	 * So, we just treat the entire request list as having failed if an
	 * error occurs on any part.  And, if an error occurs, we treat
	 * the amount of data transferred as 0.
	 *
	 * For residuals, we report it on the overall aggregated device,
	 * but not on the individual requests, since we don't currently
	 * do the work to determine which front-end request to which the
	 * residual applies.
	 */
	if (bio->bio_error) {
		DPRINTF("BIO returned error %d for operation on device %s\n",
			bio->bio_error, xbb->dev_name);
		reqlist->status = BLKIF_RSP_ERROR;

		if (bio->bio_error == ENXIO
		 && xenbus_get_state(xbb->dev) == XenbusStateConnected) {

			/*
			 * Backend device has disappeared.  Signal the
			 * front-end that we (the device proxy) want to
			 * go away.
			 */
			xenbus_set_state(xbb->dev, XenbusStateClosing);
		}
	}

#ifdef XBB_USE_BOUNCE_BUFFERS
	/* Copy read data from the bounce buffer back into mapped KVA. */
	if (bio->bio_cmd == BIO_READ) {
		vm_offset_t kva_offset;

		kva_offset = (vm_offset_t)bio->bio_data
			   - (vm_offset_t)reqlist->bounce;
		memcpy((uint8_t *)reqlist->kva + kva_offset,
		       bio->bio_data, bio->bio_bcount);
	}
#endif /* XBB_USE_BOUNCE_BUFFERS */

	/*
	 * Decrement the pending count for the request list.  When we're
	 * done with the requests, send status back for all of them.
	 */
	if (atomic_fetchadd_int(&reqlist->pendcnt, -1) == 1)
		xbb_complete_reqlist(xbb, reqlist);

	g_destroy_bio(bio);
}

/**
 * Parse a blkif request into an internal request structure and send
 * it to the backend for processing.
 *
 * \param xbb      Per-instance xbb configuration structure.
 * \param reqlist  Allocated internal request list structure.
 *
 * \return  On success, 0.  For resource shortages, non-zero.
 *
 * This routine performs the backend common aspects of request parsing
 * including compiling an internal request structure, parsing the S/G
 * list and any secondary ring requests in which they may reside, and
 * the mapping of front-end I/O pages into our domain.
 */
static int
xbb_dispatch_io(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist)
{
	struct xbb_sg *xbb_sg;
	struct gnttab_map_grant_ref *map;
	struct blkif_request_segment *sg;
	struct blkif_request_segment *last_block_sg;
	struct xbb_xen_req *nreq;
	u_int nseg;
	u_int seg_idx;
	u_int block_segs;
	int nr_sects;
	int total_sects;
	int operation;
	uint8_t bio_flags;
	int error;

	reqlist->ds_tag_type = DEVSTAT_TAG_SIMPLE;
	bio_flags = 0;
	total_sects = 0;
	nr_sects = 0;

	/*
	 * First determine whether we have enough free KVA to satisfy this
	 * request list.  If not, tell xbb_run_queue() so it can go to
	 * sleep until we have more KVA.
	 */
	reqlist->kva = NULL;
	if (reqlist->nr_segments != 0) {
		reqlist->kva = xbb_get_kva(xbb, reqlist->nr_segments);
		if (reqlist->kva == NULL) {
			/*
			 * If we're out of KVA, return ENOMEM.
			 */
			return (ENOMEM);
		}
	}

	binuptime(&reqlist->ds_t0);
	devstat_start_transaction(xbb->xbb_stats, &reqlist->ds_t0);

	/* Translate the blkif operation into a BIO command. */
	switch (reqlist->operation) {
	case BLKIF_OP_WRITE_BARRIER:
		bio_flags |= BIO_ORDERED;
		reqlist->ds_tag_type = DEVSTAT_TAG_ORDERED;
		/* FALLTHROUGH */
	case BLKIF_OP_WRITE:
		operation = BIO_WRITE;
		reqlist->ds_trans_type = DEVSTAT_WRITE;
		if ((xbb->flags & XBBF_READ_ONLY) != 0) {
			DPRINTF("Attempt to write to read only device %s\n",
				xbb->dev_name);
			reqlist->status = BLKIF_RSP_ERROR;
			goto send_response;
		}
		break;
	case BLKIF_OP_READ:
		operation = BIO_READ;
		reqlist->ds_trans_type = DEVSTAT_READ;
		break;
	case BLKIF_OP_FLUSH_DISKCACHE:
		/*
		 * If this is true, the user has requested that we disable
		 * flush support.  So we just complete the requests
		 * successfully.
		 */
		if (xbb->disable_flush != 0) {
			goto send_response;
		}

		/*
		 * The user has requested that we only send a real flush
		 * for every N flush requests.  So keep count, and either
		 * complete the request immediately or queue it for the
		 * backend.
		 */
		if (xbb->flush_interval != 0) {
			if (++(xbb->flush_count) < xbb->flush_interval) {
				goto send_response;
			} else
				xbb->flush_count = 0;
		}

		operation = BIO_FLUSH;
		reqlist->ds_tag_type = DEVSTAT_TAG_ORDERED;
		reqlist->ds_trans_type = DEVSTAT_NO_DATA;
		/* Flush has no S/G list; skip segment parsing and mapping. */
		goto do_dispatch;
		/*NOTREACHED*/
	default:
		DPRINTF("error: unknown block io operation [%d]\n",
			reqlist->operation);
		reqlist->status = BLKIF_RSP_ERROR;
		goto send_response;
	}

	reqlist->xbb = xbb;
	xbb_sg = xbb->xbb_sgs;
	map = xbb->maps;
	seg_idx = 0;

	/*
	 * Walk every front-end request in the list, building the softc-wide
	 * S/G shadow array and one grant-map entry per segment.
	 */
	STAILQ_FOREACH(nreq, &reqlist->contig_req_list, links) {
		blkif_request_t *ring_req;
		RING_IDX req_ring_idx;
		u_int req_seg_idx;

		ring_req = nreq->ring_req;
		req_ring_idx = nreq->req_ring_idx;
		nr_sects = 0;
		nseg = ring_req->nr_segments;
		nreq->nr_pages = nseg;
		nreq->nr_512b_sectors = 0;
		req_seg_idx = 0;
		sg = NULL;

		/* Check that number of segments is sane. */
		if (__predict_false(nseg == 0)
		 || __predict_false(nseg > xbb->max_request_segments)) {
			DPRINTF("Bad number of segments in request (%d)\n",
				nseg);
			reqlist->status = BLKIF_RSP_ERROR;
			goto send_response;
		}

		block_segs = nseg;
		sg = ring_req->seg;
		last_block_sg = sg + block_segs;

		while (sg < last_block_sg) {
			KASSERT(seg_idx <
				XBB_MAX_SEGMENTS_PER_REQLIST,
				("seg_idx %d is too large, max "
				 "segs %d\n", seg_idx,
				 XBB_MAX_SEGMENTS_PER_REQLIST));

			xbb_sg->first_sect = sg->first_sect;
			xbb_sg->last_sect = sg->last_sect;
			xbb_sg->nsect =
			    (int8_t)(sg->last_sect -
			    sg->first_sect + 1);

			/* Reject segments that exceed a page or are inverted. */
			if ((sg->last_sect >= (PAGE_SIZE >> 9))
			 || (xbb_sg->nsect <= 0)) {
				reqlist->status = BLKIF_RSP_ERROR;
				goto send_response;
			}

			nr_sects += xbb_sg->nsect;
			map->host_addr = xbb_get_gntaddr(reqlist,
							 seg_idx, /*sector*/0);
			KASSERT(map->host_addr + PAGE_SIZE <=
				xbb->ring_config.gnt_addr,
				("Host address %#jx len %d overlaps "
				 "ring address %#jx\n",
				 (uintmax_t)map->host_addr, PAGE_SIZE,
				 (uintmax_t)xbb->ring_config.gnt_addr));

			map->flags = GNTMAP_host_map;
			map->ref = sg->gref;
			map->dom = xbb->otherend_id;
			/* Writes to disk only read guest memory: map read-only. */
			if (operation == BIO_WRITE)
				map->flags |= GNTMAP_readonly;
			sg++;
			map++;
			xbb_sg++;
			seg_idx++;
			req_seg_idx++;
		}

		/* Convert to the disk's sector size */
		nreq->nr_512b_sectors = nr_sects;
		nr_sects = (nr_sects << 9) >> xbb->sector_size_shift;
		total_sects += nr_sects;

		if ((nreq->nr_512b_sectors &
		    ((xbb->sector_size >> 9) - 1)) != 0) {
			device_printf(xbb->dev, "%s: I/O size (%d) is not "
				      "a multiple of the backing store sector "
				      "size (%d)\n", __func__,
				      nreq->nr_512b_sectors << 9,
				      xbb->sector_size);
			reqlist->status = BLKIF_RSP_ERROR;
			goto send_response;
		}
	}

	/* Map all front-end pages into our domain in one hypercall. */
	error = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
					  xbb->maps, reqlist->nr_segments);
	if (error != 0)
		panic("Grant table operation failed (%d)", error);

	reqlist->flags |= XBB_REQLIST_MAPPED;

	for (seg_idx = 0, map = xbb->maps; seg_idx < reqlist->nr_segments;
	     seg_idx++, map++){

		if (__predict_false(map->status != 0)) {
			DPRINTF("invalid buffer -- could not remap "
				"it (%d)\n", map->status);
			DPRINTF("Mapping(%d): Host Addr 0x%"PRIx64", flags "
				"0x%x ref 0x%x, dom %d\n", seg_idx,
				map->host_addr, map->flags, map->ref,
				map->dom);
			reqlist->status = BLKIF_RSP_ERROR;
			goto send_response;
		}

		reqlist->gnt_handles[seg_idx] = map->handle;
	}
	/* Bounds-check the aggregate transfer against the media size. */
	if (reqlist->starting_sector_number + total_sects >
	    xbb->media_num_sectors) {

		DPRINTF("%s of [%" PRIu64 ",%" PRIu64 "] "
			"extends past end of device %s\n",
			operation == BIO_READ ? "read" : "write",
			reqlist->starting_sector_number,
			reqlist->starting_sector_number + total_sects,
			xbb->dev_name);
		reqlist->status = BLKIF_RSP_ERROR;
		goto send_response;
	}

do_dispatch:

	/* Hand off to the backend-specific handler (device or file). */
	error = xbb->dispatch_io(xbb,
				 reqlist,
				 operation,
				 bio_flags);

	if (error != 0) {
		reqlist->status = BLKIF_RSP_ERROR;
		goto send_response;
	}

	return (0);

send_response:

	xbb_complete_reqlist(xbb, reqlist);

	return (0);
}

/**
 * Count the 512-byte sectors spanned by a blkif request's segment list.
 *
 * \param ring_req  The blkif request whose segments are summed.
 *
 * \return  The total sector count; stops early at the first
 *          invalid (non-positive length) segment.
 */
static __inline int
xbb_count_sects(blkif_request_t *ring_req)
{
	int i;
	int cur_size = 0;

	for (i = 0; i < ring_req->nr_segments; i++) {
		int nsect;

		nsect = (int8_t)(ring_req->seg[i].last_sect -
			ring_req->seg[i].first_sect + 1);
		if (nsect <= 0)
			break;

		cur_size += nsect;
	}

	return (cur_size);
}

/**
 * Process incoming requests from the shared communication ring in response
 * to a signal on the ring's event
 * channel.
 *
 * \param context  Callback argument registerd during task initialization -
 *                 the xbb_softc for this instance.
 * \param pending  The number of taskqueue_enqueue events that have
 *                 occurred since this handler was last run.
 */
static void
xbb_run_queue(void *context, int pending)
{
	struct xbb_softc *xbb;
	blkif_back_rings_t *rings;
	RING_IDX rp;
	uint64_t cur_sector;
	int cur_operation;
	struct xbb_xen_reqlist *reqlist;


	xbb = (struct xbb_softc *)context;
	rings = &xbb->rings;

	/*
	 * Work gather and dispatch loop.  Note that we have a bias here
	 * towards gathering I/O sent by blockfront.  We first gather up
	 * everything in the ring, as long as we have resources.  Then we
	 * dispatch one request, and then attempt to gather up any
	 * additional requests that have come in while we were dispatching
	 * the request.
	 *
	 * This allows us to get a clearer picture (via devstat) of how
	 * many requests blockfront is queueing to us at any given time.
	 */
	for (;;) {
		int retval;

		/*
		 * Initialize reqlist to the last element in the pending
		 * queue, if there is one.  This allows us to add more
		 * requests to that request list, if we have room.
		 */
		reqlist = STAILQ_LAST(&xbb->reqlist_pending_stailq,
				      xbb_xen_reqlist, links);
		if (reqlist != NULL) {
			cur_sector = reqlist->next_contig_sector;
			cur_operation = reqlist->operation;
		} else {
			cur_operation = 0;
			cur_sector = 0;
		}

		/*
		 * Cache req_prod to avoid accessing a cache line shared
		 * with the frontend.
		 */
		rp = rings->common.sring->req_prod;

		/* Ensure we see queued requests up to 'rp'. */
		rmb();

		/**
		 * Run so long as there is work to consume and the generation
		 * of a response will not overflow the ring.
		 *
		 * @note There's a 1 to 1 relationship between requests and
		 *       responses, so an overflow should never occur.  This
		 *       test is to protect our domain from digesting bogus
		 *       data.  Shouldn't we log this?
		 */
		while (rings->common.req_cons != rp
		    && RING_REQUEST_CONS_OVERFLOW(&rings->common,
						  rings->common.req_cons) == 0){
			blkif_request_t ring_req_storage;
			blkif_request_t *ring_req;
			int cur_size;

			/*
			 * Fetch the next request, converting foreign ABI
			 * layouts into the native layout in local storage.
			 */
			switch (xbb->abi) {
			case BLKIF_PROTOCOL_NATIVE:
				ring_req = RING_GET_REQUEST(&xbb->rings.native,
				    rings->common.req_cons);
				break;
			case BLKIF_PROTOCOL_X86_32:
			{
				struct blkif_x86_32_request *ring_req32;

				ring_req32 = RING_GET_REQUEST(
				    &xbb->rings.x86_32, rings->common.req_cons);
				blkif_get_x86_32_req(&ring_req_storage,
						     ring_req32);
				ring_req = &ring_req_storage;
				break;
			}
			case BLKIF_PROTOCOL_X86_64:
			{
				struct blkif_x86_64_request *ring_req64;

				ring_req64 =RING_GET_REQUEST(&xbb->rings.x86_64,
				    rings->common.req_cons);
				blkif_get_x86_64_req(&ring_req_storage,
						     ring_req64);
				ring_req = &ring_req_storage;
				break;
			}
			default:
				panic("Unexpected blkif protocol ABI.");
				/* NOTREACHED */
			}

			/*
			 * Check for situations that would require closing
			 * off this I/O for further coalescing:
			 *  - Coalescing is turned off.
			 *  - Current I/O is out of sequence with the previous
			 *    I/O.
			 *  - Coalesced I/O would be too large.
			 */
			if ((reqlist != NULL)
			 && ((xbb->no_coalesce_reqs != 0)
			  || ((xbb->no_coalesce_reqs == 0)
			   && ((ring_req->sector_number != cur_sector)
			    || (ring_req->operation != cur_operation)
			    || ((ring_req->nr_segments + reqlist->nr_segments) >
				 xbb->max_reqlist_segments))))) {
				reqlist = NULL;
			}

			/*
			 * Grab and check for all resources in one shot.
			 * If we can't get all of the resources we need,
			 * the shortage is noted and the thread will get
			 * woken up when more resources are available.
			 */
			retval = xbb_get_resources(xbb, &reqlist, ring_req,
						   xbb->rings.common.req_cons);

			if (retval != 0) {
				/*
				 * Resource shortage has been recorded.
				 * We'll be scheduled to run once a request
				 * object frees up due to a completion.
				 */
				break;
			}

			/*
			 * Signify that we can overwrite this request with
			 * a response by incrementing our consumer index.
			 * The response won't be generated until after
			 * we've already consumed all necessary data out
			 * of the version of the request in the ring buffer
			 * (for native mode).  We must update the consumer
			 * index before issuing back-end I/O so there is
			 * no possibility that it will complete and a
			 * response be generated before we make room in
			 * the queue for that response.
			 */
			xbb->rings.common.req_cons++;
			xbb->reqs_received++;

			cur_size = xbb_count_sects(ring_req);
			cur_sector = ring_req->sector_number + cur_size;
			reqlist->next_contig_sector = cur_sector;
			cur_operation = ring_req->operation;
		}

		/* Check for I/O to dispatch */
		reqlist = STAILQ_FIRST(&xbb->reqlist_pending_stailq);
		if (reqlist == NULL) {
			/*
			 * We're out of work to do, put the task queue to
			 * sleep.
			 */
			break;
		}

		/*
		 * Grab the first request off the queue and attempt
		 * to dispatch it.
		 */
		STAILQ_REMOVE_HEAD(&xbb->reqlist_pending_stailq, links);

		retval = xbb_dispatch_io(xbb, reqlist);
		if (retval != 0) {
			/*
			 * xbb_dispatch_io() returns non-zero only when
			 * there is a resource shortage.  If that's the
			 * case, re-queue this request on the head of the
			 * queue, and go to sleep until we have more
			 * resources.
			 */
			STAILQ_INSERT_HEAD(&xbb->reqlist_pending_stailq,
					   reqlist, links);
			break;
		} else {
			/*
			 * If we still have anything on the queue after
			 * removing the head entry, that is because we
			 * met one of the criteria to create a new
			 * request list (outlined above), and we'll call
			 * that a forced dispatch for statistical purposes.
			 *
			 * Otherwise, if there is only one element on the
			 * queue, we coalesced everything available on
			 * the ring and we'll call that a normal dispatch.
			 */
			reqlist = STAILQ_FIRST(&xbb->reqlist_pending_stailq);

			if (reqlist != NULL)
				xbb->forced_dispatch++;
			else
				xbb->normal_dispatch++;

			xbb->total_dispatch++;
		}
	}
}

/**
 * Interrupt handler bound to the shared ring's event channel.
 *
 * \param arg  Callback argument registerd during event channel
 *             binding - the xbb_softc for this instance.
 */
static int
xbb_filter(void *arg)
{
	struct xbb_softc *xbb;

	/* Defer to taskqueue thread. */
	xbb = (struct xbb_softc *)arg;
	taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task);

	return (FILTER_HANDLED);
}

SDT_PROVIDER_DEFINE(xbb);
SDT_PROBE_DEFINE1(xbb, kernel, xbb_dispatch_dev, flush, "int");
SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_dev, read, "int", "uint64_t",
		  "uint64_t");
SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_dev, write, "int",
		  "uint64_t", "uint64_t");

/*----------------------------- Backend Handlers -----------------------------*/
/**
 * Backend handler for character device access.
 *
 * \param xbb        Per-instance xbb configuration structure.
 * \param reqlist    Allocated internal request list structure.
 * \param operation  BIO_* I/O operation code.
 * \param bio_flags  Additional bio_flag data to pass to any generated
 *                   bios (e.g. BIO_ORDERED)..
2067 * 2068 * \return 0 for success, errno codes for failure. 2069 */ 2070static int 2071xbb_dispatch_dev(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist, 2072 int operation, int bio_flags) 2073{ 2074 struct xbb_dev_data *dev_data; 2075 struct bio *bios[XBB_MAX_SEGMENTS_PER_REQLIST]; 2076 off_t bio_offset; 2077 struct bio *bio; 2078 struct xbb_sg *xbb_sg; 2079 u_int nbio; 2080 u_int bio_idx; 2081 u_int nseg; 2082 u_int seg_idx; 2083 int error; 2084 2085 dev_data = &xbb->backend.dev; 2086 bio_offset = (off_t)reqlist->starting_sector_number 2087 << xbb->sector_size_shift; 2088 error = 0; 2089 nbio = 0; 2090 bio_idx = 0; 2091 2092 if (operation == BIO_FLUSH) { 2093 bio = g_new_bio(); 2094 if (__predict_false(bio == NULL)) { 2095 DPRINTF("Unable to allocate bio for BIO_FLUSH\n"); 2096 error = ENOMEM; 2097 return (error); 2098 } 2099 2100 bio->bio_cmd = BIO_FLUSH; 2101 bio->bio_flags |= BIO_ORDERED; 2102 bio->bio_dev = dev_data->cdev; 2103 bio->bio_offset = 0; 2104 bio->bio_data = 0; 2105 bio->bio_done = xbb_bio_done; 2106 bio->bio_caller1 = reqlist; 2107 bio->bio_pblkno = 0; 2108 2109 reqlist->pendcnt = 1; 2110 2111 SDT_PROBE1(xbb, kernel, xbb_dispatch_dev, flush, 2112 device_get_unit(xbb->dev)); 2113 2114 (*dev_data->csw->d_strategy)(bio); 2115 2116 return (0); 2117 } 2118 2119 xbb_sg = xbb->xbb_sgs; 2120 bio = NULL; 2121 nseg = reqlist->nr_segments; 2122 2123 for (seg_idx = 0; seg_idx < nseg; seg_idx++, xbb_sg++) { 2124 2125 /* 2126 * KVA will not be contiguous, so any additional 2127 * I/O will need to be represented in a new bio. 
2128 */ 2129 if ((bio != NULL) 2130 && (xbb_sg->first_sect != 0)) { 2131 if ((bio->bio_length & (xbb->sector_size - 1)) != 0) { 2132 printf("%s: Discontiguous I/O request " 2133 "from domain %d ends on " 2134 "non-sector boundary\n", 2135 __func__, xbb->otherend_id); 2136 error = EINVAL; 2137 goto fail_free_bios; 2138 } 2139 bio = NULL; 2140 } 2141 2142 if (bio == NULL) { 2143 /* 2144 * Make sure that the start of this bio is 2145 * aligned to a device sector. 2146 */ 2147 if ((bio_offset & (xbb->sector_size - 1)) != 0){ 2148 printf("%s: Misaligned I/O request " 2149 "from domain %d\n", __func__, 2150 xbb->otherend_id); 2151 error = EINVAL; 2152 goto fail_free_bios; 2153 } 2154 2155 bio = bios[nbio++] = g_new_bio(); 2156 if (__predict_false(bio == NULL)) { 2157 error = ENOMEM; 2158 goto fail_free_bios; 2159 } 2160 bio->bio_cmd = operation; 2161 bio->bio_flags |= bio_flags; 2162 bio->bio_dev = dev_data->cdev; 2163 bio->bio_offset = bio_offset; 2164 bio->bio_data = xbb_reqlist_ioaddr(reqlist, seg_idx, 2165 xbb_sg->first_sect); 2166 bio->bio_done = xbb_bio_done; 2167 bio->bio_caller1 = reqlist; 2168 bio->bio_pblkno = bio_offset >> xbb->sector_size_shift; 2169 } 2170 2171 bio->bio_length += xbb_sg->nsect << 9; 2172 bio->bio_bcount = bio->bio_length; 2173 bio_offset += xbb_sg->nsect << 9; 2174 2175 if (xbb_sg->last_sect != (PAGE_SIZE - 512) >> 9) { 2176 2177 if ((bio->bio_length & (xbb->sector_size - 1)) != 0) { 2178 printf("%s: Discontiguous I/O request " 2179 "from domain %d ends on " 2180 "non-sector boundary\n", 2181 __func__, xbb->otherend_id); 2182 error = EINVAL; 2183 goto fail_free_bios; 2184 } 2185 /* 2186 * KVA will not be contiguous, so any additional 2187 * I/O will need to be represented in a new bio. 
2188 */ 2189 bio = NULL; 2190 } 2191 } 2192 2193 reqlist->pendcnt = nbio; 2194 2195 for (bio_idx = 0; bio_idx < nbio; bio_idx++) 2196 { 2197#ifdef XBB_USE_BOUNCE_BUFFERS 2198 vm_offset_t kva_offset; 2199 2200 kva_offset = (vm_offset_t)bios[bio_idx]->bio_data 2201 - (vm_offset_t)reqlist->bounce; 2202 if (operation == BIO_WRITE) { 2203 memcpy(bios[bio_idx]->bio_data, 2204 (uint8_t *)reqlist->kva + kva_offset, 2205 bios[bio_idx]->bio_bcount); 2206 } 2207#endif 2208 if (operation == BIO_READ) { 2209 SDT_PROBE3(xbb, kernel, xbb_dispatch_dev, read, 2210 device_get_unit(xbb->dev), 2211 bios[bio_idx]->bio_offset, 2212 bios[bio_idx]->bio_length); 2213 } else if (operation == BIO_WRITE) { 2214 SDT_PROBE3(xbb, kernel, xbb_dispatch_dev, write, 2215 device_get_unit(xbb->dev), 2216 bios[bio_idx]->bio_offset, 2217 bios[bio_idx]->bio_length); 2218 } 2219 (*dev_data->csw->d_strategy)(bios[bio_idx]); 2220 } 2221 2222 return (error); 2223 2224fail_free_bios: 2225 for (bio_idx = 0; bio_idx < (nbio-1); bio_idx++) 2226 g_destroy_bio(bios[bio_idx]); 2227 2228 return (error); 2229} 2230 2231SDT_PROBE_DEFINE1(xbb, kernel, xbb_dispatch_file, flush, "int"); 2232SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_file, read, "int", "uint64_t", 2233 "uint64_t"); 2234SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_file, write, "int", 2235 "uint64_t", "uint64_t"); 2236 2237/** 2238 * Backend handler for file access. 2239 * 2240 * \param xbb Per-instance xbb configuration structure. 2241 * \param reqlist Allocated internal request list. 2242 * \param operation BIO_* I/O operation code. 2243 * \param flags Additional bio_flag data to pass to any generated bios 2244 * (e.g. BIO_ORDERED).. 2245 * 2246 * \return 0 for success, errno codes for failure. 
 */
static int
xbb_dispatch_file(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist,
		  int operation, int flags)
{
	struct xbb_file_data *file_data;
	u_int                 seg_idx;
	u_int                 nseg;
	struct uio            xuio;
	struct xbb_sg        *xbb_sg;
	struct iovec         *xiovec;
#ifdef XBB_USE_BOUNCE_BUFFERS
	void                **p_vaddr;
	int                   saved_uio_iovcnt;
#endif /* XBB_USE_BOUNCE_BUFFERS */
	int                   error;

	file_data = &xbb->backend.file;
	error = 0;
	bzero(&xuio, sizeof(xuio));

	switch (operation) {
	case BIO_READ:
		xuio.uio_rw = UIO_READ;
		break;
	case BIO_WRITE:
		xuio.uio_rw = UIO_WRITE;
		break;
	case BIO_FLUSH: {
		/*
		 * Flushes carry no data: sync the backing vnode and
		 * jump straight to response delivery.
		 */
		struct mount *mountpoint;

		SDT_PROBE1(xbb, kernel, xbb_dispatch_file, flush,
			   device_get_unit(xbb->dev));

		/*
		 * vn_start_write()/vn_finished_write() bracket the
		 * write so the filesystem cannot be suspended or
		 * unmounted underneath the VOP_FSYNC().
		 */
		(void) vn_start_write(xbb->vn, &mountpoint, V_WAIT);

		vn_lock(xbb->vn, LK_EXCLUSIVE | LK_RETRY);
		error = VOP_FSYNC(xbb->vn, MNT_WAIT, curthread);
		VOP_UNLOCK(xbb->vn, 0);

		vn_finished_write(mountpoint);

		goto bailout_send_response;
		/* NOTREACHED */
	}
	default:
		panic("invalid operation %d", operation);
		/* NOTREACHED */
	}
	xuio.uio_offset = (vm_offset_t)reqlist->starting_sector_number
			<< xbb->sector_size_shift;
	xuio.uio_segflg = UIO_SYSSPACE;
	xuio.uio_iov = file_data->xiovecs;
	xuio.uio_iovcnt = 0;
	xbb_sg = xbb->xbb_sgs;
	nseg = reqlist->nr_segments;

	/*
	 * Build an iovec list for the request, coalescing segments
	 * that are contiguous in KVA into a single iovec.
	 */
	for (xiovec = NULL, seg_idx = 0; seg_idx < nseg; seg_idx++, xbb_sg++) {

		/*
		 * If the first sector is not 0, the KVA will
		 * not be contiguous and we'll need to go on
		 * to another segment.
		 */
		if (xbb_sg->first_sect != 0)
			xiovec = NULL;

		if (xiovec == NULL) {
			xiovec = &file_data->xiovecs[xuio.uio_iovcnt];
			xiovec->iov_base = xbb_reqlist_ioaddr(reqlist,
			    seg_idx, xbb_sg->first_sect);
#ifdef XBB_USE_BOUNCE_BUFFERS
			/*
			 * Store the address of the incoming
			 * buffer at this particular offset
			 * as well, so we can do the copy
			 * later without having to do more
			 * work to recalculate this address.
			 */
			p_vaddr = &file_data->xiovecs_vaddr[xuio.uio_iovcnt];
			*p_vaddr = xbb_reqlist_vaddr(reqlist, seg_idx,
			    xbb_sg->first_sect);
#endif /* XBB_USE_BOUNCE_BUFFERS */
			xiovec->iov_len = 0;
			xuio.uio_iovcnt++;
		}

		/* Sector counts are in units of 512 bytes (blkif ABI). */
		xiovec->iov_len += xbb_sg->nsect << 9;

		xuio.uio_resid += xbb_sg->nsect << 9;

		/*
		 * If the last sector is not the full page
		 * size count, the next segment will not be
		 * contiguous in KVA and we need a new iovec.
		 */
		if (xbb_sg->last_sect != (PAGE_SIZE - 512) >> 9)
			xiovec = NULL;
	}

	xuio.uio_td = curthread;

#ifdef XBB_USE_BOUNCE_BUFFERS
	saved_uio_iovcnt = xuio.uio_iovcnt;

	if (operation == BIO_WRITE) {
		/* Copy the write data to the local buffer. */
		for (seg_idx = 0, p_vaddr = file_data->xiovecs_vaddr,
		     xiovec = xuio.uio_iov; seg_idx < xuio.uio_iovcnt;
		     seg_idx++, xiovec++, p_vaddr++) {

			memcpy(xiovec->iov_base, *p_vaddr, xiovec->iov_len);
		}
	} else {
		/*
		 * We only need to save off the iovecs in the case of a
		 * read, because the copy for the read happens after the
		 * VOP_READ().  (The uio will get modified in that call
		 * sequence.)
		 */
		memcpy(file_data->saved_xiovecs, xuio.uio_iov,
		       xuio.uio_iovcnt * sizeof(xuio.uio_iov[0]));
	}
#endif /* XBB_USE_BOUNCE_BUFFERS */

	switch (operation) {
	case BIO_READ:

		SDT_PROBE3(xbb, kernel, xbb_dispatch_file, read,
			   device_get_unit(xbb->dev), xuio.uio_offset,
			   xuio.uio_resid);

		vn_lock(xbb->vn, LK_EXCLUSIVE | LK_RETRY);

		/*
		 * UFS pays attention to IO_DIRECT for reads.  If the
		 * DIRECTIO option is configured into the kernel, it calls
		 * ffs_rawread().  But that only works for single-segment
		 * uios with user space addresses.  In our case, with a
		 * kernel uio, it still reads into the buffer cache, but it
		 * will just try to release the buffer from the cache later
		 * on in ffs_read().
		 *
		 * ZFS does not pay attention to IO_DIRECT for reads.
		 *
		 * UFS does not pay attention to IO_SYNC for reads.
		 *
		 * ZFS pays attention to IO_SYNC (which translates into the
		 * Solaris define FRSYNC for zfs_read()) for reads.  It
		 * attempts to sync the file before reading.
		 *
		 * So, to attempt to provide some barrier semantics in the
		 * BIO_ORDERED case, set both IO_DIRECT and IO_SYNC.
		 */
		error = VOP_READ(xbb->vn, &xuio, (flags & BIO_ORDERED) ?
				 (IO_DIRECT|IO_SYNC) : 0, file_data->cred);

		VOP_UNLOCK(xbb->vn, 0);
		break;
	case BIO_WRITE: {
		struct mount *mountpoint;

		SDT_PROBE3(xbb, kernel, xbb_dispatch_file, write,
			   device_get_unit(xbb->dev), xuio.uio_offset,
			   xuio.uio_resid);

		(void)vn_start_write(xbb->vn, &mountpoint, V_WAIT);

		vn_lock(xbb->vn, LK_EXCLUSIVE | LK_RETRY);

		/*
		 * UFS pays attention to IO_DIRECT for writes.  The write
		 * is done asynchronously.  (Normally the write would just
		 * get put into cache.
		 *
		 * UFS pays attention to IO_SYNC for writes.  It will
		 * attempt to write the buffer out synchronously if that
		 * flag is set.
		 *
		 * ZFS does not pay attention to IO_DIRECT for writes.
		 *
		 * ZFS pays attention to IO_SYNC (a.k.a. FSYNC or FRSYNC)
		 * for writes.  It will flush the transaction from the
		 * cache before returning.
		 *
		 * So if we've got the BIO_ORDERED flag set, we want
		 * IO_SYNC in either the UFS or ZFS case.
		 */
		error = VOP_WRITE(xbb->vn, &xuio, (flags & BIO_ORDERED) ?
				  IO_SYNC : 0, file_data->cred);
		VOP_UNLOCK(xbb->vn, 0);

		vn_finished_write(mountpoint);

		break;
	}
	default:
		panic("invalid operation %d", operation);
		/* NOTREACHED */
	}

#ifdef XBB_USE_BOUNCE_BUFFERS
	/* We only need to copy here for read operations */
	if (operation == BIO_READ) {

		for (seg_idx = 0, p_vaddr = file_data->xiovecs_vaddr,
		     xiovec = file_data->saved_xiovecs;
		     seg_idx < saved_uio_iovcnt; seg_idx++,
		     xiovec++, p_vaddr++) {

			/*
			 * Note that we have to use the copy of the
			 * io vector we made above.  uiomove() modifies
			 * the uio and its referenced vector as uiomove
			 * performs the copy, so we can't rely on any
			 * state from the original uio.
			 */
			memcpy(*p_vaddr, xiovec->iov_base, xiovec->iov_len);
		}
	}
#endif /* XBB_USE_BOUNCE_BUFFERS */

bailout_send_response:

	/*
	 * All paths funnel through here: record any error in the
	 * request list status and complete it.  Errors are reported
	 * to the front-end via BLKIF_RSP_ERROR, so this function
	 * itself always returns 0.
	 */
	if (error != 0)
		reqlist->status = BLKIF_RSP_ERROR;

	xbb_complete_reqlist(xbb, reqlist);

	return (0);
}

/*--------------------------- Backend Configuration --------------------------*/
/**
 * Close and cleanup any backend device/file specific state for this
 * block back instance.
 *
 * \param xbb  Per-instance xbb configuration structure.
 */
static void
xbb_close_backend(struct xbb_softc *xbb)
{
	/* Giant is dropped around vnode operations, matching vn_close(). */
	DROP_GIANT();
	DPRINTF("closing dev=%s\n", xbb->dev_name);
	if (xbb->vn) {
		/* Reconstruct the flags the backend was opened with. */
		int flags = FREAD;

		if ((xbb->flags & XBBF_READ_ONLY) == 0)
			flags |= FWRITE;

		switch (xbb->device_type) {
		case XBB_TYPE_DISK:
			/*
			 * Release the cdevsw reference taken by
			 * dev_refthread() in xbb_open_dev() before the
			 * vnode goes away.
			 */
			if (xbb->backend.dev.csw) {
				dev_relthread(xbb->backend.dev.cdev,
					      xbb->backend.dev.dev_ref);
				xbb->backend.dev.csw  = NULL;
				xbb->backend.dev.cdev = NULL;
			}
			break;
		case XBB_TYPE_FILE:
			break;
		case XBB_TYPE_NONE:
		default:
			panic("Unexpected backend type.");
			break;
		}

		(void)vn_close(xbb->vn, flags, NOCRED, curthread);
		xbb->vn = NULL;

		switch (xbb->device_type) {
		case XBB_TYPE_DISK:
			break;
		case XBB_TYPE_FILE:
			/*
			 * Drop the credential reference held by
			 * xbb_open_file() for file-backed I/O.
			 */
			if (xbb->backend.file.cred != NULL) {
				crfree(xbb->backend.file.cred);
				xbb->backend.file.cred = NULL;
			}
			break;
		case XBB_TYPE_NONE:
		default:
			panic("Unexpected backend type.");
			break;
		}
	}
	PICKUP_GIANT();
}

/**
 * Open a character device to be used for backend I/O.
 *
 * \param xbb  Per-instance xbb configuration structure.
 *
 * \return  0 for success, errno codes for failure.
2541 */ 2542static int 2543xbb_open_dev(struct xbb_softc *xbb) 2544{ 2545 struct vattr vattr; 2546 struct cdev *dev; 2547 struct cdevsw *devsw; 2548 int error; 2549 2550 xbb->device_type = XBB_TYPE_DISK; 2551 xbb->dispatch_io = xbb_dispatch_dev; 2552 xbb->backend.dev.cdev = xbb->vn->v_rdev; 2553 xbb->backend.dev.csw = dev_refthread(xbb->backend.dev.cdev, 2554 &xbb->backend.dev.dev_ref); 2555 if (xbb->backend.dev.csw == NULL) 2556 panic("Unable to retrieve device switch"); 2557 2558 error = VOP_GETATTR(xbb->vn, &vattr, NOCRED); 2559 if (error) { 2560 xenbus_dev_fatal(xbb->dev, error, "error getting " 2561 "vnode attributes for device %s", 2562 xbb->dev_name); 2563 return (error); 2564 } 2565 2566 2567 dev = xbb->vn->v_rdev; 2568 devsw = dev->si_devsw; 2569 if (!devsw->d_ioctl) { 2570 xenbus_dev_fatal(xbb->dev, ENODEV, "no d_ioctl for " 2571 "device %s!", xbb->dev_name); 2572 return (ENODEV); 2573 } 2574 2575 error = devsw->d_ioctl(dev, DIOCGSECTORSIZE, 2576 (caddr_t)&xbb->sector_size, FREAD, 2577 curthread); 2578 if (error) { 2579 xenbus_dev_fatal(xbb->dev, error, 2580 "error calling ioctl DIOCGSECTORSIZE " 2581 "for device %s", xbb->dev_name); 2582 return (error); 2583 } 2584 2585 error = devsw->d_ioctl(dev, DIOCGMEDIASIZE, 2586 (caddr_t)&xbb->media_size, FREAD, 2587 curthread); 2588 if (error) { 2589 xenbus_dev_fatal(xbb->dev, error, 2590 "error calling ioctl DIOCGMEDIASIZE " 2591 "for device %s", xbb->dev_name); 2592 return (error); 2593 } 2594 2595 return (0); 2596} 2597 2598/** 2599 * Open a file to be used for backend I/O. 2600 * 2601 * \param xbb Per-instance xbb configuration structure. 2602 * 2603 * \return 0 for success, errno codes for failure. 
 */
static int
xbb_open_file(struct xbb_softc *xbb)
{
	struct xbb_file_data *file_data;
	struct vattr          vattr;
	int                   error;

	file_data = &xbb->backend.file;
	xbb->device_type = XBB_TYPE_FILE;
	xbb->dispatch_io = xbb_dispatch_file;
	error = VOP_GETATTR(xbb->vn, &vattr, curthread->td_ucred);
	if (error != 0) {
		/*
		 * NOTE(review): the two string fragments below
		 * concatenate without a space ("...VOP_GETATTR()for
		 * file %s") — cosmetic message defect, left untouched
		 * here.
		 */
		xenbus_dev_fatal(xbb->dev, error,
				 "error calling VOP_GETATTR()"
				 "for file %s", xbb->dev_name);
		return (error);
	}

	/*
	 * Verify that we have the ability to upgrade to exclusive
	 * access on this file so we can trap errors at open instead
	 * of reporting them during first access.
	 */
	if (VOP_ISLOCKED(xbb->vn) != LK_EXCLUSIVE) {
		vn_lock(xbb->vn, LK_UPGRADE | LK_RETRY);
		/* The vnode may have been reclaimed during the upgrade. */
		if (xbb->vn->v_iflag & VI_DOOMED) {
			error = EBADF;
			xenbus_dev_fatal(xbb->dev, error,
					 "error locking file %s",
					 xbb->dev_name);

			return (error);
		}
	}

	/* Held for VOP_READ/VOP_WRITE; released in xbb_close_backend(). */
	file_data->cred = crhold(curthread->td_ucred);
	xbb->media_size = vattr.va_size;

	/*
	 * XXX KDM vattr.va_blocksize may be larger than 512 bytes here.
	 * With ZFS, it is 131072 bytes.  Block sizes that large don't work
	 * with disklabel and UFS on FreeBSD at least.  Large block sizes
	 * may not work with other OSes as well.  So just export a sector
	 * size of 512 bytes, which should work with any OS or
	 * application.  Since our backing is a file, any block size will
	 * work fine for the backing store.
	 */
#if 0
	xbb->sector_size = vattr.va_blocksize;
#endif
	xbb->sector_size = 512;

	/*
	 * Sanity check.  The media size has to be at least one
	 * sector long.
	 */
	if (xbb->media_size < xbb->sector_size) {
		error = EINVAL;
		xenbus_dev_fatal(xbb->dev, error,
				 "file %s size %ju < block size %u",
				 xbb->dev_name,
				 (uintmax_t)xbb->media_size,
				 xbb->sector_size);
	}
	return (error);
}

/**
 * Open the backend provider for this connection.
 *
 * \param xbb  Per-instance xbb configuration structure.
 *
 * \return  0 for success, errno codes for failure.
 */
static int
xbb_open_backend(struct xbb_softc *xbb)
{
	struct nameidata nd;
	int		 flags;
	int		 error;

	flags = FREAD;
	error = 0;

	DPRINTF("opening dev=%s\n", xbb->dev_name);

	/* Path lookup requires a mounted root file system. */
	if (rootvnode == NULL) {
		xenbus_dev_fatal(xbb->dev, ENOENT,
				 "Root file system not mounted");
		return (ENOENT);
	}

	if ((xbb->flags & XBBF_READ_ONLY) == 0)
		flags |= FWRITE;

	/*
	 * This thread services XenStore events and may not have
	 * cwd/root directories set; ensure they are before vn_open().
	 */
	pwd_ensure_dirs();

 again:
	NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, xbb->dev_name, curthread);
	error = vn_open(&nd, &flags, 0, NULL);
	if (error) {
		/*
		 * This is the only reasonable guess we can make as far as
		 * path if the user doesn't give us a fully qualified path.
		 * If they want to specify a file, they need to specify the
		 * full path.
		 */
		if (xbb->dev_name[0] != '/') {
			char *dev_path = "/dev/";
			char *dev_name;

			/* Try adding device path at beginning of name */
			dev_name = malloc(strlen(xbb->dev_name)
					+ strlen(dev_path) + 1,
					  M_XENBLOCKBACK, M_NOWAIT);
			if (dev_name) {
				sprintf(dev_name, "%s%s", dev_path,
					xbb->dev_name);
				free(xbb->dev_name, M_XENBLOCKBACK);
				xbb->dev_name = dev_name;
				/* Retry the lookup with the new path. */
				goto again;
			}
		}
		xenbus_dev_fatal(xbb->dev, error, "error opening device %s",
				 xbb->dev_name);
		return (error);
	}

	NDFREE(&nd, NDF_ONLY_PNBUF);

	xbb->vn = nd.ni_vp;

	/* We only support disks and files. */
	if (vn_isdisk(xbb->vn, &error)) {
		error = xbb_open_dev(xbb);
	} else if (xbb->vn->v_type == VREG) {
		error = xbb_open_file(xbb);
	} else {
		error = EINVAL;
		xenbus_dev_fatal(xbb->dev, error, "%s is not a disk "
				 "or file", xbb->dev_name);
	}
	/* vn_open() returned the vnode locked; drop the lock here. */
	VOP_UNLOCK(xbb->vn, 0);

	if (error != 0) {
		xbb_close_backend(xbb);
		return (error);
	}

	/* sector_size is a power of two, so fls() yields the exact shift. */
	xbb->sector_size_shift = fls(xbb->sector_size) - 1;
	xbb->media_num_sectors = xbb->media_size >> xbb->sector_size_shift;

	DPRINTF("opened %s=%s sector_size=%u media_size=%" PRId64 "\n",
		(xbb->device_type == XBB_TYPE_DISK) ? "dev" : "file",
		xbb->dev_name, xbb->sector_size, xbb->media_size);

	return (0);
}

/*------------------------ Inter-Domain Communication ------------------------*/
/**
 * Free dynamically allocated KVA or pseudo-physical address allocations.
 *
 * \param xbb  Per-instance xbb configuration structure.
 */
static void
xbb_free_communication_mem(struct xbb_softc *xbb)
{
	if (xbb->kva != 0) {
		if (xbb->pseudo_phys_res != NULL) {
			xenmem_free(xbb->dev, xbb->pseudo_phys_res_id,
				    xbb->pseudo_phys_res);
			xbb->pseudo_phys_res = NULL;
		}
	}
	xbb->kva = 0;
	xbb->gnt_base_addr = 0;
	if (xbb->kva_free != NULL) {
		free(xbb->kva_free, M_XENBLOCKBACK);
		xbb->kva_free = NULL;
	}
}

/**
 * Cleanup all inter-domain communication mechanisms.
 *
 * \param xbb  Per-instance xbb configuration structure.
 */
static int
xbb_disconnect(struct xbb_softc *xbb)
{
	struct gnttab_unmap_grant_ref  ops[XBB_MAX_RING_PAGES];
	struct gnttab_unmap_grant_ref *op;
	u_int			       ring_idx;
	int			       error;

	DPRINTF("\n");

	if ((xbb->flags & XBBF_RING_CONNECTED) == 0)
		return (0);

	/*
	 * The interrupt unbind and taskqueue drain can sleep, so the
	 * instance mutex (held by the caller) must be dropped around
	 * them and reacquired afterwards.
	 */
	mtx_unlock(&xbb->lock);
	xen_intr_unbind(&xbb->xen_intr_handle);
	taskqueue_drain(xbb->io_taskqueue, &xbb->io_task);
	mtx_lock(&xbb->lock);

	/*
	 * No new interrupts can generate work, but we must wait
	 * for all currently active requests to drain.
	 */
	if (xbb->active_request_count != 0)
		return (EAGAIN);

	/* Build one unmap op per mapped shared-ring page. */
	for (ring_idx = 0, op = ops;
	     ring_idx < xbb->ring_config.ring_pages;
	     ring_idx++, op++) {

		op->host_addr    = xbb->ring_config.gnt_addr
			         + (ring_idx * PAGE_SIZE);
		op->dev_bus_addr = xbb->ring_config.bus_addr[ring_idx];
		op->handle       = xbb->ring_config.handle[ring_idx];
	}

	error = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, ops,
					  xbb->ring_config.ring_pages);
	if (error != 0)
		panic("Grant table op failed (%d)", error);

	xbb_free_communication_mem(xbb);

	if (xbb->requests != NULL) {
		free(xbb->requests, M_XENBLOCKBACK);
		xbb->requests = NULL;
	}

	if (xbb->request_lists != NULL) {
		struct xbb_xen_reqlist *reqlist;
		int i;

		/* There is one request list for every allocated request. */
		for (i = 0, reqlist = xbb->request_lists;
		     i < xbb->max_requests; i++, reqlist++){
#ifdef XBB_USE_BOUNCE_BUFFERS
			if (reqlist->bounce != NULL) {
				free(reqlist->bounce, M_XENBLOCKBACK);
				reqlist->bounce = NULL;
			}
#endif
			if (reqlist->gnt_handles != NULL) {
				free(reqlist->gnt_handles, M_XENBLOCKBACK);
				reqlist->gnt_handles = NULL;
			}
		}
		free(xbb->request_lists, M_XENBLOCKBACK);
		xbb->request_lists = NULL;
	}

	xbb->flags &= ~XBBF_RING_CONNECTED;
	return (0);
}

/**
 * Map shared memory ring into domain local address space, initialize
 * ring control structures, and bind an interrupt to the event channel
 * used to notify us of ring changes.
 *
 * \param xbb  Per-instance xbb configuration structure.
 */
static int
xbb_connect_ring(struct xbb_softc *xbb)
{
	struct gnttab_map_grant_ref  gnts[XBB_MAX_RING_PAGES];
	struct gnttab_map_grant_ref *gnt;
	u_int			     ring_idx;
	int			     error;

	/* Already connected; nothing to do. */
	if ((xbb->flags & XBBF_RING_CONNECTED) != 0)
		return (0);

	/*
	 * Kva for our ring is at the tail of the region of kva allocated
	 * by xbb_alloc_communication_mem().
	 */
	xbb->ring_config.va = xbb->kva
			    + (xbb->kva_size
			     - (xbb->ring_config.ring_pages * PAGE_SIZE));
	xbb->ring_config.gnt_addr = xbb->gnt_base_addr
				  + (xbb->kva_size
				   - (xbb->ring_config.ring_pages * PAGE_SIZE));

	/* One map op per shared-ring page granted by the front-end. */
	for (ring_idx = 0, gnt = gnts;
	     ring_idx < xbb->ring_config.ring_pages;
	     ring_idx++, gnt++) {

		gnt->host_addr = xbb->ring_config.gnt_addr
			       + (ring_idx * PAGE_SIZE);
		gnt->flags     = GNTMAP_host_map;
		gnt->ref       = xbb->ring_config.ring_ref[ring_idx];
		gnt->dom       = xbb->otherend_id;
	}

	error = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, gnts,
					  xbb->ring_config.ring_pages);
	if (error)
		panic("blkback: Ring page grant table op failed (%d)", error);

	for (ring_idx = 0, gnt = gnts;
	     ring_idx < xbb->ring_config.ring_pages;
	     ring_idx++, gnt++) {
		if (gnt->status != 0) {
			struct gnttab_unmap_grant_ref unmap[XBB_MAX_RING_PAGES];
			unsigned int i, j;

			xbb->ring_config.va = 0;
			xenbus_dev_fatal(xbb->dev, EACCES,
					 "Ring shared page mapping failed. "
					 "Status %d.", gnt->status);

			/* Unmap everything to avoid leaking grant table maps */
			for (i = 0, j = 0; i < xbb->ring_config.ring_pages;
			    i++) {
				if (gnts[i].status != GNTST_okay)
					continue;

				unmap[j].host_addr = gnts[i].host_addr;
				unmap[j].dev_bus_addr = gnts[i].dev_bus_addr;
				unmap[j++].handle = gnts[i].handle;
			}
			if (j != 0) {
				error = HYPERVISOR_grant_table_op(
				    GNTTABOP_unmap_grant_ref, unmap, j);
				if (error != 0)
					panic("Unable to unmap grants (%d)",
					    error);
			}
			return (EACCES);
		}
		xbb->ring_config.handle[ring_idx]   = gnt->handle;
		xbb->ring_config.bus_addr[ring_idx] = gnt->dev_bus_addr;
	}

	/* Initialize the ring based on ABI. */
	switch (xbb->abi) {
	case BLKIF_PROTOCOL_NATIVE:
	{
		blkif_sring_t *sring;
		sring = (blkif_sring_t *)xbb->ring_config.va;
		BACK_RING_INIT(&xbb->rings.native, sring,
			       xbb->ring_config.ring_pages * PAGE_SIZE);
		break;
	}
	case BLKIF_PROTOCOL_X86_32:
	{
		blkif_x86_32_sring_t *sring_x86_32;
		sring_x86_32 = (blkif_x86_32_sring_t *)xbb->ring_config.va;
		BACK_RING_INIT(&xbb->rings.x86_32, sring_x86_32,
			       xbb->ring_config.ring_pages * PAGE_SIZE);
		break;
	}
	case BLKIF_PROTOCOL_X86_64:
	{
		blkif_x86_64_sring_t *sring_x86_64;
		sring_x86_64 = (blkif_x86_64_sring_t *)xbb->ring_config.va;
		BACK_RING_INIT(&xbb->rings.x86_64, sring_x86_64,
			       xbb->ring_config.ring_pages * PAGE_SIZE);
		break;
	}
	default:
		panic("Unexpected blkif protocol ABI.");
	}

	/* Set before binding so xbb_disconnect() will clean up on failure. */
	xbb->flags |= XBBF_RING_CONNECTED;

	error = xen_intr_bind_remote_port(xbb->dev,
					  xbb->otherend_id,
					  xbb->ring_config.evtchn,
					  xbb_filter,
					  /*ithread_handler*/NULL,
					  /*arg*/xbb,
					  INTR_TYPE_BIO | INTR_MPSAFE,
					  &xbb->xen_intr_handle);
	if (error) {
		(void)xbb_disconnect(xbb);
		xenbus_dev_fatal(xbb->dev, error, "binding event channel");
		return (error);
	}

	DPRINTF("rings connected!\n");

	return 0;
}

/**
 * Size KVA and pseudo-physical address allocations based on negotiated
 * values for the size and number of I/O requests, and the size of our
 * communication ring.
 *
 * \param xbb  Per-instance xbb configuration structure.
 *
 * These address spaces are used to dynamically map pages in the
 * front-end's domain into our own.
3006 */ 3007static int 3008xbb_alloc_communication_mem(struct xbb_softc *xbb) 3009{ 3010 xbb->reqlist_kva_pages = xbb->max_requests * xbb->max_request_segments; 3011 xbb->reqlist_kva_size = xbb->reqlist_kva_pages * PAGE_SIZE; 3012 xbb->kva_size = xbb->reqlist_kva_size + 3013 (xbb->ring_config.ring_pages * PAGE_SIZE); 3014 3015 xbb->kva_free = bit_alloc(xbb->reqlist_kva_pages, M_XENBLOCKBACK, M_NOWAIT); 3016 if (xbb->kva_free == NULL) 3017 return (ENOMEM); 3018 3019 DPRINTF("%s: kva_size = %d, reqlist_kva_size = %d\n", 3020 device_get_nameunit(xbb->dev), xbb->kva_size, 3021 xbb->reqlist_kva_size); 3022 /* 3023 * Reserve a range of pseudo physical memory that we can map 3024 * into kva. These pages will only be backed by machine 3025 * pages ("real memory") during the lifetime of front-end requests 3026 * via grant table operations. 3027 */ 3028 xbb->pseudo_phys_res_id = 0; 3029 xbb->pseudo_phys_res = xenmem_alloc(xbb->dev, &xbb->pseudo_phys_res_id, 3030 xbb->kva_size); 3031 if (xbb->pseudo_phys_res == NULL) { 3032 xbb->kva = 0; 3033 return (ENOMEM); 3034 } 3035 xbb->kva = (vm_offset_t)rman_get_virtual(xbb->pseudo_phys_res); 3036 xbb->gnt_base_addr = rman_get_start(xbb->pseudo_phys_res); 3037 3038 DPRINTF("%s: kva: %#jx, gnt_base_addr: %#jx\n", 3039 device_get_nameunit(xbb->dev), (uintmax_t)xbb->kva, 3040 (uintmax_t)xbb->gnt_base_addr); 3041 return (0); 3042} 3043 3044/** 3045 * Collect front-end information from the XenStore. 3046 * 3047 * \param xbb Per-instance xbb configuration structure. 3048 */ 3049static int 3050xbb_collect_frontend_info(struct xbb_softc *xbb) 3051{ 3052 char protocol_abi[64]; 3053 const char *otherend_path; 3054 int error; 3055 u_int ring_idx; 3056 u_int ring_page_order; 3057 size_t ring_size; 3058 3059 otherend_path = xenbus_get_otherend_path(xbb->dev); 3060 3061 /* 3062 * Protocol defaults valid even if all negotiation fails. 
3063 */ 3064 xbb->ring_config.ring_pages = 1; 3065 xbb->max_request_segments = BLKIF_MAX_SEGMENTS_PER_REQUEST; 3066 xbb->max_request_size = xbb->max_request_segments * PAGE_SIZE; 3067 3068 /* 3069 * Mandatory data (used in all versions of the protocol) first. 3070 */ 3071 error = xs_scanf(XST_NIL, otherend_path, 3072 "event-channel", NULL, "%" PRIu32, 3073 &xbb->ring_config.evtchn); 3074 if (error != 0) { 3075 xenbus_dev_fatal(xbb->dev, error, 3076 "Unable to retrieve event-channel information " 3077 "from frontend %s. Unable to connect.", 3078 xenbus_get_otherend_path(xbb->dev)); 3079 return (error); 3080 } 3081 3082 /* 3083 * These fields are initialized to legacy protocol defaults 3084 * so we only need to fail if reading the updated value succeeds 3085 * and the new value is outside of its allowed range. 3086 * 3087 * \note xs_gather() returns on the first encountered error, so 3088 * we must use independent calls in order to guarantee 3089 * we don't miss information in a sparsly populated front-end 3090 * tree. 3091 * 3092 * \note xs_scanf() does not update variables for unmatched 3093 * fields. 3094 */ 3095 ring_page_order = 0; 3096 xbb->max_requests = 32; 3097 3098 (void)xs_scanf(XST_NIL, otherend_path, 3099 "ring-page-order", NULL, "%u", 3100 &ring_page_order); 3101 xbb->ring_config.ring_pages = 1 << ring_page_order; 3102 ring_size = PAGE_SIZE * xbb->ring_config.ring_pages; 3103 xbb->max_requests = BLKIF_MAX_RING_REQUESTS(ring_size); 3104 3105 if (xbb->ring_config.ring_pages > XBB_MAX_RING_PAGES) { 3106 xenbus_dev_fatal(xbb->dev, EINVAL, 3107 "Front-end specified ring-pages of %u " 3108 "exceeds backend limit of %u. 
" 3109 "Unable to connect.", 3110 xbb->ring_config.ring_pages, 3111 XBB_MAX_RING_PAGES); 3112 return (EINVAL); 3113 } 3114 3115 if (xbb->ring_config.ring_pages == 1) { 3116 error = xs_gather(XST_NIL, otherend_path, 3117 "ring-ref", "%" PRIu32, 3118 &xbb->ring_config.ring_ref[0], 3119 NULL); 3120 if (error != 0) { 3121 xenbus_dev_fatal(xbb->dev, error, 3122 "Unable to retrieve ring information " 3123 "from frontend %s. Unable to " 3124 "connect.", 3125 xenbus_get_otherend_path(xbb->dev)); 3126 return (error); 3127 } 3128 } else { 3129 /* Multi-page ring format. */ 3130 for (ring_idx = 0; ring_idx < xbb->ring_config.ring_pages; 3131 ring_idx++) { 3132 char ring_ref_name[]= "ring_refXX"; 3133 3134 snprintf(ring_ref_name, sizeof(ring_ref_name), 3135 "ring-ref%u", ring_idx); 3136 error = xs_scanf(XST_NIL, otherend_path, 3137 ring_ref_name, NULL, "%" PRIu32, 3138 &xbb->ring_config.ring_ref[ring_idx]); 3139 if (error != 0) { 3140 xenbus_dev_fatal(xbb->dev, error, 3141 "Failed to retriev grant " 3142 "reference for page %u of " 3143 "shared ring. Unable " 3144 "to connect.", ring_idx); 3145 return (error); 3146 } 3147 } 3148 } 3149 3150 error = xs_gather(XST_NIL, otherend_path, 3151 "protocol", "%63s", protocol_abi, 3152 NULL); 3153 if (error != 0 3154 || !strcmp(protocol_abi, XEN_IO_PROTO_ABI_NATIVE)) { 3155 /* 3156 * Assume native if the frontend has not 3157 * published ABI data or it has published and 3158 * matches our own ABI. 3159 */ 3160 xbb->abi = BLKIF_PROTOCOL_NATIVE; 3161 } else if (!strcmp(protocol_abi, XEN_IO_PROTO_ABI_X86_32)) { 3162 3163 xbb->abi = BLKIF_PROTOCOL_X86_32; 3164 } else if (!strcmp(protocol_abi, XEN_IO_PROTO_ABI_X86_64)) { 3165 3166 xbb->abi = BLKIF_PROTOCOL_X86_64; 3167 } else { 3168 3169 xenbus_dev_fatal(xbb->dev, EINVAL, 3170 "Unknown protocol ABI (%s) published by " 3171 "frontend. 
Unable to connect.", protocol_abi); 3172 return (EINVAL); 3173 } 3174 return (0); 3175} 3176 3177/** 3178 * Allocate per-request data structures given request size and number 3179 * information negotiated with the front-end. 3180 * 3181 * \param xbb Per-instance xbb configuration structure. 3182 */ 3183static int 3184xbb_alloc_requests(struct xbb_softc *xbb) 3185{ 3186 struct xbb_xen_req *req; 3187 struct xbb_xen_req *last_req; 3188 3189 /* 3190 * Allocate request book keeping datastructures. 3191 */ 3192 xbb->requests = malloc(xbb->max_requests * sizeof(*xbb->requests), 3193 M_XENBLOCKBACK, M_NOWAIT|M_ZERO); 3194 if (xbb->requests == NULL) { 3195 xenbus_dev_fatal(xbb->dev, ENOMEM, 3196 "Unable to allocate request structures"); 3197 return (ENOMEM); 3198 } 3199 3200 req = xbb->requests; 3201 last_req = &xbb->requests[xbb->max_requests - 1]; 3202 STAILQ_INIT(&xbb->request_free_stailq); 3203 while (req <= last_req) { 3204 STAILQ_INSERT_TAIL(&xbb->request_free_stailq, req, links); 3205 req++; 3206 } 3207 return (0); 3208} 3209 3210static int 3211xbb_alloc_request_lists(struct xbb_softc *xbb) 3212{ 3213 struct xbb_xen_reqlist *reqlist; 3214 int i; 3215 3216 /* 3217 * If no requests can be merged, we need 1 request list per 3218 * in flight request. 
3219 */ 3220 xbb->request_lists = malloc(xbb->max_requests * 3221 sizeof(*xbb->request_lists), M_XENBLOCKBACK, M_NOWAIT|M_ZERO); 3222 if (xbb->request_lists == NULL) { 3223 xenbus_dev_fatal(xbb->dev, ENOMEM, 3224 "Unable to allocate request list structures"); 3225 return (ENOMEM); 3226 } 3227 3228 STAILQ_INIT(&xbb->reqlist_free_stailq); 3229 STAILQ_INIT(&xbb->reqlist_pending_stailq); 3230 for (i = 0; i < xbb->max_requests; i++) { 3231 int seg; 3232 3233 reqlist = &xbb->request_lists[i]; 3234 3235 reqlist->xbb = xbb; 3236 3237#ifdef XBB_USE_BOUNCE_BUFFERS 3238 reqlist->bounce = malloc(xbb->max_reqlist_size, 3239 M_XENBLOCKBACK, M_NOWAIT); 3240 if (reqlist->bounce == NULL) { 3241 xenbus_dev_fatal(xbb->dev, ENOMEM, 3242 "Unable to allocate request " 3243 "bounce buffers"); 3244 return (ENOMEM); 3245 } 3246#endif /* XBB_USE_BOUNCE_BUFFERS */ 3247 3248 reqlist->gnt_handles = malloc(xbb->max_reqlist_segments * 3249 sizeof(*reqlist->gnt_handles), 3250 M_XENBLOCKBACK, M_NOWAIT|M_ZERO); 3251 if (reqlist->gnt_handles == NULL) { 3252 xenbus_dev_fatal(xbb->dev, ENOMEM, 3253 "Unable to allocate request " 3254 "grant references"); 3255 return (ENOMEM); 3256 } 3257 3258 for (seg = 0; seg < xbb->max_reqlist_segments; seg++) 3259 reqlist->gnt_handles[seg] = GRANT_REF_INVALID; 3260 3261 STAILQ_INSERT_TAIL(&xbb->reqlist_free_stailq, reqlist, links); 3262 } 3263 return (0); 3264} 3265 3266/** 3267 * Supply information about the physical device to the frontend 3268 * via XenBus. 3269 * 3270 * \param xbb Per-instance xbb configuration structure. 
 */
static int
xbb_publish_backend_info(struct xbb_softc *xbb)
{
	struct xs_transaction xst;
	const char	     *our_path;
	const char	     *leaf;
	int		      error;

	our_path = xenbus_get_node(xbb->dev);
	/*
	 * All leaves are written inside one XenStore transaction;
	 * EAGAIN from xs_transaction_end() means the transaction
	 * raced with another writer and the whole set is retried.
	 */
	while (1) {
		error = xs_transaction_start(&xst);
		if (error != 0) {
			xenbus_dev_fatal(xbb->dev, error,
					 "Error publishing backend info "
					 "(start transaction)");
			return (error);
		}

		/* leaf tracks the node being written for error reporting. */
		leaf = "sectors";
		error = xs_printf(xst, our_path, leaf,
				  "%"PRIu64, xbb->media_num_sectors);
		if (error != 0)
			break;

		/* XXX Support all VBD attributes here. */
		leaf = "info";
		error = xs_printf(xst, our_path, leaf, "%u",
				  xbb->flags & XBBF_READ_ONLY
				? VDISK_READONLY : 0);
		if (error != 0)
			break;

		leaf = "sector-size";
		error = xs_printf(xst, our_path, leaf, "%u",
				  xbb->sector_size);
		if (error != 0)
			break;

		error = xs_transaction_end(xst, 0);
		if (error == 0) {
			return (0);
		} else if (error != EAGAIN) {
			xenbus_dev_fatal(xbb->dev, error, "ending transaction");
			return (error);
		}
	}

	/* A leaf write failed; abort the open transaction and report. */
	xenbus_dev_fatal(xbb->dev, error, "writing %s/%s",
			 our_path, leaf);
	xs_transaction_end(xst, 1);
	return (error);
}

/**
 * Connect to our blkfront peer now that it has completed publishing
 * its configuration into the XenStore.
 *
 * \param xbb  Per-instance xbb configuration structure.
 */
static void
xbb_connect(struct xbb_softc *xbb)
{
	int error;

	/*
	 * Bail unless hotplug setup is complete, we are in the
	 * InitWait handshake state, and the front-end's parameters
	 * were collected successfully.
	 */
	if (!xbb->hotplug_done ||
	    (xenbus_get_state(xbb->dev) != XenbusStateInitWait) ||
	    (xbb_collect_frontend_info(xbb) != 0))
		return;

	xbb->flags &= ~XBBF_SHUTDOWN;

	/*
	 * We limit the maximum number of reqlist segments to the maximum
	 * number of segments in the ring, or our absolute maximum,
	 * whichever is smaller.
	 */
	xbb->max_reqlist_segments = MIN(xbb->max_request_segments *
		xbb->max_requests, XBB_MAX_SEGMENTS_PER_REQLIST);

	/*
	 * The maximum size is simply a function of the number of segments
	 * we can handle.
	 */
	xbb->max_reqlist_size = xbb->max_reqlist_segments * PAGE_SIZE;

	/* Allocate resources whose size depends on front-end configuration. */
	error = xbb_alloc_communication_mem(xbb);
	if (error != 0) {
		xenbus_dev_fatal(xbb->dev, error,
				 "Unable to allocate communication memory");
		return;
	}

	error = xbb_alloc_requests(xbb);
	if (error != 0) {
		/* Specific errors are reported by xbb_alloc_requests(). */
		return;
	}

	error = xbb_alloc_request_lists(xbb);
	if (error != 0) {
		/* Specific errors are reported by xbb_alloc_request_lists(). */
		return;
	}

	/*
	 * Connect communication channel.
	 */
	error = xbb_connect_ring(xbb);
	if (error != 0) {
		/* Specific errors are reported by xbb_connect_ring(). */
		return;
	}

	if (xbb_publish_backend_info(xbb) != 0) {
		/*
		 * If we can't publish our data, we cannot participate
		 * in this connection, and waiting for a front-end state
		 * change will not help the situation.
		 */
		(void)xbb_disconnect(xbb);
		return;
	}

	/* Ready for I/O. */
	xenbus_set_state(xbb->dev, XenbusStateConnected);
}

/*-------------------------- Device Teardown Support -------------------------*/
/**
 * Perform device shutdown functions.
 *
 * \param xbb  Per-instance xbb configuration structure.
 *
 * Mark this instance as shutting down, wait for any active I/O on the
 * backend device/file to drain, disconnect from the front-end, and notify
 * any waiters (e.g. a thread invoking our detach method) that detach can
 * now proceed.
 */
static int
xbb_shutdown(struct xbb_softc *xbb)
{
	XenbusState frontState;
	int	    error;

	DPRINTF("\n");

	/*
	 * Due to the need to drop our mutex during some
	 * xenbus operations, it is possible for two threads
	 * to attempt to close out shutdown processing at
	 * the same time.  Tell the caller that hits this
	 * race to try back later.
	 */
	if ((xbb->flags & XBBF_IN_SHUTDOWN) != 0)
		return (EAGAIN);

	xbb->flags |= XBBF_IN_SHUTDOWN;
	/*
	 * Entered with xbb->lock held; drop it around the XenStore
	 * operations below, which may sleep.  XBBF_IN_SHUTDOWN keeps a
	 * second thread from racing through this unlocked window.
	 */
	mtx_unlock(&xbb->lock);

	if (xbb->hotplug_watch.node != NULL) {
		xs_unregister_watch(&xbb->hotplug_watch);
		free(xbb->hotplug_watch.node, M_XENBLOCKBACK);
		xbb->hotplug_watch.node = NULL;
	}
	xbb->hotplug_done = false;

	if (xenbus_get_state(xbb->dev) < XenbusStateClosing)
		xenbus_set_state(xbb->dev, XenbusStateClosing);

	frontState = xenbus_get_otherend_state(xbb->dev);
	mtx_lock(&xbb->lock);
	xbb->flags &= ~XBBF_IN_SHUTDOWN;

	/* Wait for the frontend to disconnect (if it's connected). */
	if (frontState == XenbusStateConnected)
		return (EAGAIN);

	DPRINTF("\n");

	/* Indicate shutdown is in progress. */
	xbb->flags |= XBBF_SHUTDOWN;

	/* Disconnect from the front-end. */
	error = xbb_disconnect(xbb);
	if (error != 0) {
		/*
		 * Requests still outstanding.  We'll be called again
		 * once they complete.
		 */
		KASSERT(error == EAGAIN,
			("%s: Unexpected xbb_disconnect() failure %d",
			 __func__, error));

		return (error);
	}

	DPRINTF("\n");

	/* Indicate to xbb_detach() that is it safe to proceed. */
	wakeup(xbb);

	return (0);
}

/**
 * Report an attach time error to the console and Xen, and cleanup
 * this instance by forcing immediate detach processing.
 *
 * \param xbb  Per-instance xbb configuration structure.
 * \param err  Errno describing the error.
 * \param fmt  Printf style format and arguments
 */
static void
xbb_attach_failed(struct xbb_softc *xbb, int err, const char *fmt, ...)
{
	va_list ap;
	va_list ap_hotplug;

	va_start(ap, fmt);
	/*
	 * A va_list may be traversed only once, so take a copy for the
	 * hotplug-error write before handing "ap" to xenbus_dev_vfatal().
	 */
	va_copy(ap_hotplug, ap);
	xs_vprintf(XST_NIL, xenbus_get_node(xbb->dev),
		  "hotplug-error", fmt, ap_hotplug);
	va_end(ap_hotplug);
	xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
		  "hotplug-status", "error");

	xenbus_dev_vfatal(xbb->dev, err, fmt, ap);
	va_end(ap);

	/* Mark the device offline so the toolstack gives up on it. */
	xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
		  "online", "0");
	/* xbb_shutdown() expects to be entered with the lock held. */
	mtx_lock(&xbb->lock);
	xbb_shutdown(xbb);
	mtx_unlock(&xbb->lock);
}

/*---------------------------- NewBus Entrypoints ----------------------------*/
/**
 * Inspect a XenBus device and claim it if is of the appropriate type.
 *
 * \param dev  NewBus device object representing a candidate XenBus device.
 *
 * \return  0 for success, errno codes for failure.
 */
static int
xbb_probe(device_t dev)
{

	/* Only virtual block devices ("vbd") are handled by this backend. */
	if (!strcmp(xenbus_get_type(dev), "vbd")) {
		device_set_desc(dev, "Backend Virtual Block Device");
		device_quiet(dev);
		return (0);
	}

	return (ENXIO);
}

/**
 * Setup sysctl variables to control various Block Back parameters.
 *
 * \param xbb  Xen Block Back softc.
3534 * 3535 */ 3536static void 3537xbb_setup_sysctl(struct xbb_softc *xbb) 3538{ 3539 struct sysctl_ctx_list *sysctl_ctx = NULL; 3540 struct sysctl_oid *sysctl_tree = NULL; 3541 3542 sysctl_ctx = device_get_sysctl_ctx(xbb->dev); 3543 if (sysctl_ctx == NULL) 3544 return; 3545 3546 sysctl_tree = device_get_sysctl_tree(xbb->dev); 3547 if (sysctl_tree == NULL) 3548 return; 3549 3550 SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3551 "disable_flush", CTLFLAG_RW, &xbb->disable_flush, 0, 3552 "fake the flush command"); 3553 3554 SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3555 "flush_interval", CTLFLAG_RW, &xbb->flush_interval, 0, 3556 "send a real flush for N flush requests"); 3557 3558 SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3559 "no_coalesce_reqs", CTLFLAG_RW, &xbb->no_coalesce_reqs,0, 3560 "Don't coalesce contiguous requests"); 3561 3562 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3563 "reqs_received", CTLFLAG_RW, &xbb->reqs_received, 3564 "how many I/O requests we have received"); 3565 3566 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3567 "reqs_completed", CTLFLAG_RW, &xbb->reqs_completed, 3568 "how many I/O requests have been completed"); 3569 3570 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3571 "reqs_queued_for_completion", CTLFLAG_RW, 3572 &xbb->reqs_queued_for_completion, 3573 "how many I/O requests queued but not yet pushed"); 3574 3575 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3576 "reqs_completed_with_error", CTLFLAG_RW, 3577 &xbb->reqs_completed_with_error, 3578 "how many I/O requests completed with error status"); 3579 3580 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3581 "forced_dispatch", CTLFLAG_RW, &xbb->forced_dispatch, 3582 "how many I/O dispatches were forced"); 3583 3584 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3585 
"normal_dispatch", CTLFLAG_RW, &xbb->normal_dispatch, 3586 "how many I/O dispatches were normal"); 3587 3588 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3589 "total_dispatch", CTLFLAG_RW, &xbb->total_dispatch, 3590 "total number of I/O dispatches"); 3591 3592 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3593 "kva_shortages", CTLFLAG_RW, &xbb->kva_shortages, 3594 "how many times we have run out of KVA"); 3595 3596 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3597 "request_shortages", CTLFLAG_RW, 3598 &xbb->request_shortages, 3599 "how many times we have run out of requests"); 3600 3601 SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3602 "max_requests", CTLFLAG_RD, &xbb->max_requests, 0, 3603 "maximum outstanding requests (negotiated)"); 3604 3605 SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3606 "max_request_segments", CTLFLAG_RD, 3607 &xbb->max_request_segments, 0, 3608 "maximum number of pages per requests (negotiated)"); 3609 3610 SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3611 "max_request_size", CTLFLAG_RD, 3612 &xbb->max_request_size, 0, 3613 "maximum size in bytes of a request (negotiated)"); 3614 3615 SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, 3616 "ring_pages", CTLFLAG_RD, 3617 &xbb->ring_config.ring_pages, 0, 3618 "communication channel pages (negotiated)"); 3619} 3620 3621static void 3622xbb_attach_disk(struct xs_watch *watch, const char **vec, unsigned int len) 3623{ 3624 device_t dev; 3625 struct xbb_softc *xbb; 3626 int error; 3627 3628 dev = (device_t) watch->callback_data; 3629 xbb = device_get_softc(dev); 3630 3631 error = xs_gather(XST_NIL, xenbus_get_node(dev), "physical-device-path", 3632 NULL, &xbb->dev_name, NULL); 3633 if (error != 0) 3634 return; 3635 3636 xs_unregister_watch(watch); 3637 free(watch->node, M_XENBLOCKBACK); 3638 watch->node = NULL; 3639 3640 /* Collect physical 
device information. */ 3641 error = xs_gather(XST_NIL, xenbus_get_otherend_path(xbb->dev), 3642 "device-type", NULL, &xbb->dev_type, 3643 NULL); 3644 if (error != 0) 3645 xbb->dev_type = NULL; 3646 3647 error = xs_gather(XST_NIL, xenbus_get_node(dev), 3648 "mode", NULL, &xbb->dev_mode, 3649 NULL); 3650 if (error != 0) { 3651 xbb_attach_failed(xbb, error, "reading backend fields at %s", 3652 xenbus_get_node(dev)); 3653 return; 3654 } 3655 3656 /* Parse fopen style mode flags. */ 3657 if (strchr(xbb->dev_mode, 'w') == NULL) 3658 xbb->flags |= XBBF_READ_ONLY; 3659 3660 /* 3661 * Verify the physical device is present and can support 3662 * the desired I/O mode. 3663 */ 3664 error = xbb_open_backend(xbb); 3665 if (error != 0) { 3666 xbb_attach_failed(xbb, error, "Unable to open %s", 3667 xbb->dev_name); 3668 return; 3669 } 3670 3671 /* Use devstat(9) for recording statistics. */ 3672 xbb->xbb_stats = devstat_new_entry("xbb", device_get_unit(xbb->dev), 3673 xbb->sector_size, 3674 DEVSTAT_ALL_SUPPORTED, 3675 DEVSTAT_TYPE_DIRECT 3676 | DEVSTAT_TYPE_IF_OTHER, 3677 DEVSTAT_PRIORITY_OTHER); 3678 3679 xbb->xbb_stats_in = devstat_new_entry("xbbi", device_get_unit(xbb->dev), 3680 xbb->sector_size, 3681 DEVSTAT_ALL_SUPPORTED, 3682 DEVSTAT_TYPE_DIRECT 3683 | DEVSTAT_TYPE_IF_OTHER, 3684 DEVSTAT_PRIORITY_OTHER); 3685 /* 3686 * Setup sysctl variables. 3687 */ 3688 xbb_setup_sysctl(xbb); 3689 3690 /* 3691 * Create a taskqueue for doing work that must occur from a 3692 * thread context. 3693 */ 3694 xbb->io_taskqueue = taskqueue_create_fast(device_get_nameunit(dev), 3695 M_NOWAIT, 3696 taskqueue_thread_enqueue, 3697 /*contxt*/&xbb->io_taskqueue); 3698 if (xbb->io_taskqueue == NULL) { 3699 xbb_attach_failed(xbb, error, "Unable to create taskqueue"); 3700 return; 3701 } 3702 3703 taskqueue_start_threads(&xbb->io_taskqueue, 3704 /*num threads*/1, 3705 /*priority*/PWAIT, 3706 /*thread name*/ 3707 "%s taskq", device_get_nameunit(dev)); 3708 3709 /* Update hot-plug status to satisfy xend. 
*/ 3710 error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev), 3711 "hotplug-status", "connected"); 3712 if (error) { 3713 xbb_attach_failed(xbb, error, "writing %s/hotplug-status", 3714 xenbus_get_node(xbb->dev)); 3715 return; 3716 } 3717 3718 xbb->hotplug_done = true; 3719 3720 /* The front end might be waiting for the backend, attach if so. */ 3721 if (xenbus_get_otherend_state(xbb->dev) == XenbusStateInitialised) 3722 xbb_connect(xbb); 3723} 3724 3725/** 3726 * Attach to a XenBus device that has been claimed by our probe routine. 3727 * 3728 * \param dev NewBus device object representing this Xen Block Back instance. 3729 * 3730 * \return 0 for success, errno codes for failure. 3731 */ 3732static int 3733xbb_attach(device_t dev) 3734{ 3735 struct xbb_softc *xbb; 3736 int error; 3737 u_int max_ring_page_order; 3738 struct sbuf *watch_path; 3739 3740 DPRINTF("Attaching to %s\n", xenbus_get_node(dev)); 3741 3742 /* 3743 * Basic initialization. 3744 * After this block it is safe to call xbb_detach() 3745 * to clean up any allocated data for this instance. 3746 */ 3747 xbb = device_get_softc(dev); 3748 xbb->dev = dev; 3749 xbb->otherend_id = xenbus_get_otherend_id(dev); 3750 TASK_INIT(&xbb->io_task, /*priority*/0, xbb_run_queue, xbb); 3751 mtx_init(&xbb->lock, device_get_nameunit(dev), NULL, MTX_DEF); 3752 3753 /* 3754 * Publish protocol capabilities for consumption by the 3755 * front-end. 
3756 */ 3757 error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev), 3758 "feature-barrier", "1"); 3759 if (error) { 3760 xbb_attach_failed(xbb, error, "writing %s/feature-barrier", 3761 xenbus_get_node(xbb->dev)); 3762 return (error); 3763 } 3764 3765 error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev), 3766 "feature-flush-cache", "1"); 3767 if (error) { 3768 xbb_attach_failed(xbb, error, "writing %s/feature-flush-cache", 3769 xenbus_get_node(xbb->dev)); 3770 return (error); 3771 } 3772 3773 max_ring_page_order = flsl(XBB_MAX_RING_PAGES) - 1; 3774 error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev), 3775 "max-ring-page-order", "%u", max_ring_page_order); 3776 if (error) { 3777 xbb_attach_failed(xbb, error, "writing %s/max-ring-page-order", 3778 xenbus_get_node(xbb->dev)); 3779 return (error); 3780 } 3781 3782 /* 3783 * We need to wait for hotplug script execution before 3784 * moving forward. 3785 */ 3786 KASSERT(!xbb->hotplug_done, ("Hotplug scripts already executed")); 3787 watch_path = xs_join(xenbus_get_node(xbb->dev), "physical-device-path"); 3788 xbb->hotplug_watch.callback_data = (uintptr_t)dev; 3789 xbb->hotplug_watch.callback = xbb_attach_disk; 3790 KASSERT(xbb->hotplug_watch.node == NULL, ("watch node already setup")); 3791 xbb->hotplug_watch.node = strdup(sbuf_data(watch_path), M_XENBLOCKBACK); 3792 /* 3793 * We don't care about the path updated, just about the value changes 3794 * on that single node, hence there's no need to queue more that one 3795 * event. 3796 */ 3797 xbb->hotplug_watch.max_pending = 1; 3798 sbuf_delete(watch_path); 3799 error = xs_register_watch(&xbb->hotplug_watch); 3800 if (error != 0) { 3801 xbb_attach_failed(xbb, error, "failed to create watch on %s", 3802 xbb->hotplug_watch.node); 3803 free(xbb->hotplug_watch.node, M_XENBLOCKBACK); 3804 return (error); 3805 } 3806 3807 /* Tell the toolstack blkback has attached. 
*/ 3808 xenbus_set_state(dev, XenbusStateInitWait); 3809 3810 return (0); 3811} 3812 3813/** 3814 * Detach from a block back device instance. 3815 * 3816 * \param dev NewBus device object representing this Xen Block Back instance. 3817 * 3818 * \return 0 for success, errno codes for failure. 3819 * 3820 * \note A block back device may be detached at any time in its life-cycle, 3821 * including part way through the attach process. For this reason, 3822 * initialization order and the initialization state checks in this 3823 * routine must be carefully coupled so that attach time failures 3824 * are gracefully handled. 3825 */ 3826static int 3827xbb_detach(device_t dev) 3828{ 3829 struct xbb_softc *xbb; 3830 3831 DPRINTF("\n"); 3832 3833 xbb = device_get_softc(dev); 3834 mtx_lock(&xbb->lock); 3835 while (xbb_shutdown(xbb) == EAGAIN) { 3836 msleep(xbb, &xbb->lock, /*wakeup prio unchanged*/0, 3837 "xbb_shutdown", 0); 3838 } 3839 mtx_unlock(&xbb->lock); 3840 3841 DPRINTF("\n"); 3842 3843 if (xbb->io_taskqueue != NULL) 3844 taskqueue_free(xbb->io_taskqueue); 3845 3846 if (xbb->xbb_stats != NULL) 3847 devstat_remove_entry(xbb->xbb_stats); 3848 3849 if (xbb->xbb_stats_in != NULL) 3850 devstat_remove_entry(xbb->xbb_stats_in); 3851 3852 xbb_close_backend(xbb); 3853 3854 if (xbb->dev_mode != NULL) { 3855 free(xbb->dev_mode, M_XENSTORE); 3856 xbb->dev_mode = NULL; 3857 } 3858 3859 if (xbb->dev_type != NULL) { 3860 free(xbb->dev_type, M_XENSTORE); 3861 xbb->dev_type = NULL; 3862 } 3863 3864 if (xbb->dev_name != NULL) { 3865 free(xbb->dev_name, M_XENSTORE); 3866 xbb->dev_name = NULL; 3867 } 3868 3869 mtx_destroy(&xbb->lock); 3870 return (0); 3871} 3872 3873/** 3874 * Prepare this block back device for suspension of this VM. 3875 * 3876 * \param dev NewBus device object representing this Xen Block Back instance. 3877 * 3878 * \return 0 for success, errno codes for failure. 
 */
static int
xbb_suspend(device_t dev)
{
#ifdef NOT_YET
	struct xbb_softc *sc = device_get_softc(dev);

	/* Prevent new requests being issued until we fix things up. */
	mtx_lock(&sc->xb_io_lock);
	sc->connected = BLKIF_STATE_SUSPENDED;
	mtx_unlock(&sc->xb_io_lock);
#endif

	return (0);
}

/**
 * Perform any processing required to recover from a suspended state.
 *
 * \param dev  NewBus device object representing this Xen Block Back instance.
 *
 * \return  0 for success, errno codes for failure.
 */
static int
xbb_resume(device_t dev)
{
	return (0);
}

/**
 * Handle state changes expressed via the XenStore by our front-end peer.
 *
 * \param dev             NewBus device object representing this Xen
 *                        Block Back instance.
 * \param frontend_state  The new state of the front-end.
 */
static void
xbb_frontend_changed(device_t dev, XenbusState frontend_state)
{
	struct xbb_softc *xbb = device_get_softc(dev);

	DPRINTF("frontend_state=%s, xbb_state=%s\n",
	        xenbus_strstate(frontend_state),
		xenbus_strstate(xenbus_get_state(xbb->dev)));

	switch (frontend_state) {
	case XenbusStateInitialising:
		break;
	case XenbusStateInitialised:
	case XenbusStateConnected:
		xbb_connect(xbb);
		break;
	case XenbusStateClosing:
	case XenbusStateClosed:
		/* xbb_shutdown() expects to be entered with the lock held. */
		mtx_lock(&xbb->lock);
		xbb_shutdown(xbb);
		mtx_unlock(&xbb->lock);
		if (frontend_state == XenbusStateClosed)
			xenbus_set_state(xbb->dev, XenbusStateClosed);
		break;
	default:
		xenbus_dev_fatal(xbb->dev, EINVAL, "saw state %d at frontend",
				 frontend_state);
		break;
	}
}

/*---------------------------- NewBus Registration ---------------------------*/
static device_method_t xbb_methods[] = {
	/* Device interface */
	DEVMETHOD(device_probe,		xbb_probe),
	DEVMETHOD(device_attach,	xbb_attach),
	DEVMETHOD(device_detach,	xbb_detach),
	DEVMETHOD(device_shutdown,	bus_generic_shutdown),
	DEVMETHOD(device_suspend,	xbb_suspend),
	DEVMETHOD(device_resume,	xbb_resume),

	/* Xenbus interface */
	DEVMETHOD(xenbus_otherend_changed, xbb_frontend_changed),

	{ 0, 0 }
};

static driver_t xbb_driver = {
	"xbbd",
	xbb_methods,
	sizeof(struct xbb_softc),
};
devclass_t xbb_devclass;

DRIVER_MODULE(xbbd, xenbusb_back, xbb_driver, xbb_devclass, 0, 0);