blkback.c (231883) blkback.c (241896)
1/*-
2 * Copyright (c) 2009-2011 Spectra Logic Corporation
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions, and the following disclaimer,
10 * without modification.
11 * 2. Redistributions in binary form must reproduce at minimum a disclaimer
12 * substantially similar to the "NO WARRANTY" disclaimer below
13 * ("Disclaimer") and any redistribution must be conditioned upon
14 * including a substantially similar Disclaimer requirement for further
15 * binary redistribution.
16 *
17 * NO WARRANTY
18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR
21 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
26 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
27 * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28 * POSSIBILITY OF SUCH DAMAGES.
29 *
30 * Authors: Justin T. Gibbs (Spectra Logic Corporation)
31 * Ken Merry (Spectra Logic Corporation)
32 */
33#include <sys/cdefs.h>
34__FBSDID("$FreeBSD: head/sys/dev/xen/blkback/blkback.c 231883 2012-02-17 22:33:46Z gibbs $");
34__FBSDID("$FreeBSD: head/sys/dev/xen/blkback/blkback.c 241896 2012-10-22 17:50:54Z kib $");
35
36/**
37 * \file blkback.c
38 *
39 * \brief Device driver supporting the vending of block storage from
40 * a FreeBSD domain to other domains.
41 */
42
43#include "opt_kdtrace.h"
44
45#include <sys/param.h>
46#include <sys/systm.h>
47#include <sys/kernel.h>
48#include <sys/malloc.h>
49
50#include <sys/bio.h>
51#include <sys/bus.h>
52#include <sys/conf.h>
53#include <sys/devicestat.h>
54#include <sys/disk.h>
55#include <sys/fcntl.h>
56#include <sys/filedesc.h>
57#include <sys/kdb.h>
58#include <sys/module.h>
59#include <sys/namei.h>
60#include <sys/proc.h>
61#include <sys/rman.h>
62#include <sys/taskqueue.h>
63#include <sys/types.h>
64#include <sys/vnode.h>
65#include <sys/mount.h>
66#include <sys/sysctl.h>
67#include <sys/bitstring.h>
68#include <sys/sdt.h>
69
70#include <geom/geom.h>
71
72#include <machine/_inttypes.h>
73#include <machine/xen/xen-os.h>
74
75#include <vm/vm.h>
76#include <vm/vm_extern.h>
77#include <vm/vm_kern.h>
78
79#include <xen/blkif.h>
80#include <xen/evtchn.h>
81#include <xen/gnttab.h>
82#include <xen/xen_intr.h>
83
84#include <xen/interface/event_channel.h>
85#include <xen/interface/grant_table.h>
86
87#include <xen/xenbus/xenbusvar.h>
88
89/*--------------------------- Compile-time Tunables --------------------------*/
90/**
91 * The maximum number of outstanding request blocks (request headers plus
92 * additional segment blocks) we will allow in a negotiated block-front/back
93 * communication channel.
94 */
95#define XBB_MAX_REQUESTS 256
96
97/**
98 * \brief Define to force all I/O to be performed on memory owned by the
99 * backend device, with a copy-in/out to the remote domain's memory.
100 *
101 * \note This option is currently required when this driver's domain is
102 * operating in HVM mode on a system using an IOMMU.
103 *
104 * This driver uses Xen's grant table API to gain access to the memory of
105 * the remote domains it serves. When our domain is operating in PV mode,
106 * the grant table mechanism directly updates our domain's page table entries
107 * to point to the physical pages of the remote domain. This scheme guarantees
108 * that blkback and the backing devices it uses can safely perform DMA
109 * operations to satisfy requests. In HVM mode, Xen may use a HW IOMMU to
 110 * ensure that our domain cannot DMA to pages owned by another domain. As
111 * of Xen 4.0, IOMMU mappings for HVM guests are not updated via the grant
112 * table API. For this reason, in HVM mode, we must bounce all requests into
113 * memory that is mapped into our domain at domain startup and thus has
114 * valid IOMMU mappings.
115 */
116#define XBB_USE_BOUNCE_BUFFERS
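/**
 * \note Editor's illustrative sketch, not part of the driver.  With
 *       XBB_USE_BOUNCE_BUFFERS defined, write data must be copied from
 *       the grant-mapped front-end pages into backend-owned memory
 *       before it is handed to the storage stack, and read data is
 *       copied back the other way on completion (see xbb_bio_done()).
 *       Conceptually, the write-side copy for one request list is:
 *
 * \code
 *	// Hypothetical helper; the driver's real bounce logic is part of
 *	// its dispatch path and may be structured differently.
 *	static void
 *	example_bounce_out(struct xbb_xen_reqlist *reqlist, size_t length)
 *	{
 *		// reqlist->kva:    front-end pages mapped into our domain
 *		// reqlist->bounce: pre-allocated local memory of equal size
 *		memcpy(reqlist->bounce, reqlist->kva, length);
 *	}
 * \endcode
 */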
117
118/**
119 * \brief Define to enable rudimentary request logging to the console.
120 */
121#undef XBB_DEBUG
122
123/*---------------------------------- Macros ----------------------------------*/
124/**
125 * Custom malloc type for all driver allocations.
126 */
127static MALLOC_DEFINE(M_XENBLOCKBACK, "xbbd", "Xen Block Back Driver Data");
128
129#ifdef XBB_DEBUG
130#define DPRINTF(fmt, args...) \
131 printf("xbb(%s:%d): " fmt, __FUNCTION__, __LINE__, ##args)
132#else
133#define DPRINTF(fmt, args...) do {} while(0)
134#endif
135
136/**
137 * The maximum mapped region size per request we will allow in a negotiated
138 * block-front/back communication channel.
139 */
140#define XBB_MAX_REQUEST_SIZE \
141 MIN(MAXPHYS, BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE)
142
143/**
144 * The maximum number of segments (within a request header and accompanying
145 * segment blocks) per request we will allow in a negotiated block-front/back
146 * communication channel.
147 */
148#define XBB_MAX_SEGMENTS_PER_REQUEST \
149 (MIN(UIO_MAXIOV, \
150 MIN(BLKIF_MAX_SEGMENTS_PER_REQUEST, \
151 (XBB_MAX_REQUEST_SIZE / PAGE_SIZE) + 1)))
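/**
 * \note Worked example (editor's addition), assuming illustrative values
 *       of PAGE_SIZE = 4096, MAXPHYS = 128 KiB, UIO_MAXIOV = 1024, and
 *       BLKIF_MAX_SEGMENTS_PER_REQUEST = 32; the real values depend on
 *       the kernel configuration and the blkif ABI in use:
 *
 * \code
 *	XBB_MAX_REQUEST_SIZE         = MIN(131072, 32 * 4096)
 *	                             = 131072 bytes (128 KiB)
 *	XBB_MAX_SEGMENTS_PER_REQUEST = MIN(1024, MIN(32, 131072 / 4096 + 1))
 *	                             = MIN(1024, MIN(32, 33)) = 32
 * \endcode
 *
 *       The "+ 1" term presumably allows for a maximum-sized transfer
 *       whose buffer is not page aligned and therefore touches one
 *       additional page.
 */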
152
153/**
154 * The maximum number of shared memory ring pages we will allow in a
155 * negotiated block-front/back communication channel. Allow enough
156 * ring space for all requests to be XBB_MAX_REQUEST_SIZE'd.
157 */
158#define XBB_MAX_RING_PAGES \
159 BLKIF_RING_PAGES(BLKIF_SEGS_TO_BLOCKS(XBB_MAX_SEGMENTS_PER_REQUEST) \
160 * XBB_MAX_REQUESTS)
161/**
162 * The maximum number of ring pages that we can allow per request list.
163 * We limit this to the maximum number of segments per request, because
164 * that is already a reasonable number of segments to aggregate. This
165 * number should never be smaller than XBB_MAX_SEGMENTS_PER_REQUEST,
166 * because that would leave situations where we can't dispatch even one
167 * large request.
168 */
169#define XBB_MAX_SEGMENTS_PER_REQLIST XBB_MAX_SEGMENTS_PER_REQUEST
170
171/*--------------------------- Forward Declarations ---------------------------*/
172struct xbb_softc;
173struct xbb_xen_req;
174
175static void xbb_attach_failed(struct xbb_softc *xbb, int err, const char *fmt,
176 ...) __attribute__((format(printf, 3, 4)));
177static int xbb_shutdown(struct xbb_softc *xbb);
178static int xbb_detach(device_t dev);
179
180/*------------------------------ Data Structures -----------------------------*/
181
182STAILQ_HEAD(xbb_xen_req_list, xbb_xen_req);
183
184typedef enum {
185 XBB_REQLIST_NONE = 0x00,
186 XBB_REQLIST_MAPPED = 0x01
187} xbb_reqlist_flags;
188
189struct xbb_xen_reqlist {
190 /**
191 * Back reference to the parent block back instance for this
192 * request. Used during bio_done handling.
193 */
194 struct xbb_softc *xbb;
195
196 /**
197 * BLKIF_OP code for this request.
198 */
199 int operation;
200
201 /**
202 * Set to BLKIF_RSP_* to indicate request status.
203 *
204 * This field allows an error status to be recorded even if the
205 * delivery of this status must be deferred. Deferred reporting
206 * is necessary, for example, when an error is detected during
207 * completion processing of one bio when other bios for this
208 * request are still outstanding.
209 */
210 int status;
211
212 /**
213 * Number of 512 byte sectors not transferred.
214 */
215 int residual_512b_sectors;
216
217 /**
218 * Starting sector number of the first request in the list.
219 */
220 off_t starting_sector_number;
221
222 /**
223 * If we're going to coalesce, the next contiguous sector would be
224 * this one.
225 */
226 off_t next_contig_sector;
227
228 /**
229 * Number of child requests in the list.
230 */
231 int num_children;
232
233 /**
234 * Number of I/O requests dispatched to the backend.
235 */
236 int pendcnt;
237
238 /**
239 * Total number of segments for requests in the list.
240 */
241 int nr_segments;
242
243 /**
244 * Flags for this particular request list.
245 */
246 xbb_reqlist_flags flags;
247
248 /**
249 * Kernel virtual address space reserved for this request
250 * list structure and used to map the remote domain's pages for
251 * this I/O, into our domain's address space.
252 */
253 uint8_t *kva;
254
255 /**
 256 * Base pseudo-physical address corresponding to the start
257 * of this request's kva region.
258 */
259 uint64_t gnt_base;
260
261
262#ifdef XBB_USE_BOUNCE_BUFFERS
263 /**
264 * Pre-allocated domain local memory used to proxy remote
265 * domain memory during I/O operations.
266 */
267 uint8_t *bounce;
268#endif
269
270 /**
271 * Array of grant handles (one per page) used to map this request.
272 */
273 grant_handle_t *gnt_handles;
274
275 /**
276 * Device statistics request ordering type (ordered or simple).
277 */
278 devstat_tag_type ds_tag_type;
279
280 /**
281 * Device statistics request type (read, write, no_data).
282 */
283 devstat_trans_flags ds_trans_type;
284
285 /**
286 * The start time for this request.
287 */
288 struct bintime ds_t0;
289
290 /**
291 * Linked list of contiguous requests with the same operation type.
292 */
293 struct xbb_xen_req_list contig_req_list;
294
295 /**
296 * Linked list links used to aggregate idle requests in the
297 * request list free pool (xbb->reqlist_free_stailq) and pending
298 * requests waiting for execution (xbb->reqlist_pending_stailq).
299 */
300 STAILQ_ENTRY(xbb_xen_reqlist) links;
301};
302
303STAILQ_HEAD(xbb_xen_reqlist_list, xbb_xen_reqlist);
304
305/**
306 * \brief Object tracking an in-flight I/O from a Xen VBD consumer.
307 */
308struct xbb_xen_req {
309 /**
310 * Linked list links used to aggregate requests into a reqlist
311 * and to store them in the request free pool.
312 */
313 STAILQ_ENTRY(xbb_xen_req) links;
314
315 /**
316 * The remote domain's identifier for this I/O request.
317 */
318 uint64_t id;
319
320 /**
321 * The number of pages currently mapped for this request.
322 */
323 int nr_pages;
324
325 /**
 326 * The number of 512 byte sectors comprising this request.
327 */
328 int nr_512b_sectors;
329
330 /**
331 * The number of struct bio requests still outstanding for this
332 * request on the backend device. This field is only used for
333 * device (rather than file) backed I/O.
334 */
335 int pendcnt;
336
337 /**
338 * BLKIF_OP code for this request.
339 */
340 int operation;
341
342 /**
343 * Storage used for non-native ring requests.
344 */
345 blkif_request_t ring_req_storage;
346
347 /**
348 * Pointer to the Xen request in the ring.
349 */
350 blkif_request_t *ring_req;
351
352 /**
353 * Consumer index for this request.
354 */
355 RING_IDX req_ring_idx;
356
357 /**
358 * The start time for this request.
359 */
360 struct bintime ds_t0;
361
362 /**
363 * Pointer back to our parent request list.
364 */
365 struct xbb_xen_reqlist *reqlist;
366};
367SLIST_HEAD(xbb_xen_req_slist, xbb_xen_req);
368
369/**
370 * \brief Configuration data for the shared memory request ring
371 * used to communicate with the front-end client of this
 372 * driver.
373 */
374struct xbb_ring_config {
375 /** KVA address where ring memory is mapped. */
376 vm_offset_t va;
377
378 /** The pseudo-physical address where ring memory is mapped.*/
379 uint64_t gnt_addr;
380
381 /**
382 * Grant table handles, one per-ring page, returned by the
 383 * hypervisor upon mapping of the ring and required to
384 * unmap it when a connection is torn down.
385 */
386 grant_handle_t handle[XBB_MAX_RING_PAGES];
387
388 /**
389 * The device bus address returned by the hypervisor when
390 * mapping the ring and required to unmap it when a connection
391 * is torn down.
392 */
393 uint64_t bus_addr[XBB_MAX_RING_PAGES];
394
395 /** The number of ring pages mapped for the current connection. */
396 u_int ring_pages;
397
398 /**
399 * The grant references, one per-ring page, supplied by the
400 * front-end, allowing us to reference the ring pages in the
401 * front-end's domain and to map these pages into our own domain.
402 */
403 grant_ref_t ring_ref[XBB_MAX_RING_PAGES];
404
 405 /** The interrupt driven event channel used to signal ring events. */
406 evtchn_port_t evtchn;
407};
408
409/**
410 * Per-instance connection state flags.
411 */
412typedef enum
413{
414 /**
415 * The front-end requested a read-only mount of the
416 * back-end device/file.
417 */
418 XBBF_READ_ONLY = 0x01,
419
420 /** Communication with the front-end has been established. */
421 XBBF_RING_CONNECTED = 0x02,
422
423 /**
424 * Front-end requests exist in the ring and are waiting for
425 * xbb_xen_req objects to free up.
426 */
427 XBBF_RESOURCE_SHORTAGE = 0x04,
428
429 /** Connection teardown in progress. */
430 XBBF_SHUTDOWN = 0x08,
431
432 /** A thread is already performing shutdown processing. */
433 XBBF_IN_SHUTDOWN = 0x10
434} xbb_flag_t;
435
436/** Backend device type. */
437typedef enum {
438 /** Backend type unknown. */
439 XBB_TYPE_NONE = 0x00,
440
441 /**
442 * Backend type disk (access via cdev switch
443 * strategy routine).
444 */
445 XBB_TYPE_DISK = 0x01,
446
 447 /** Backend type file (access via vnode operations). */
448 XBB_TYPE_FILE = 0x02
449} xbb_type;
450
451/**
452 * \brief Structure used to memoize information about a per-request
453 * scatter-gather list.
454 *
455 * The chief benefit of using this data structure is it avoids having
456 * to reparse the possibly discontiguous S/G list in the original
457 * request. Due to the way that the mapping of the memory backing an
458 * I/O transaction is handled by Xen, a second pass is unavoidable.
459 * At least this way the second walk is a simple array traversal.
460 *
461 * \note A single Scatter/Gather element in the block interface covers
462 * at most 1 machine page. In this context a sector (blkif
463 * nomenclature, not what I'd choose) is a 512b aligned unit
464 * of mapping within the machine page referenced by an S/G
465 * element.
466 */
467struct xbb_sg {
468 /** The number of 512b data chunks mapped in this S/G element. */
469 int16_t nsect;
470
471 /**
472 * The index (0 based) of the first 512b data chunk mapped
473 * in this S/G element.
474 */
475 uint8_t first_sect;
476
477 /**
478 * The index (0 based) of the last 512b data chunk mapped
479 * in this S/G element.
480 */
481 uint8_t last_sect;
482};
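/**
 * \note Worked example (editor's addition).  An S/G element with
 *       first_sect = 2 and last_sect = 5 covers nsect = 5 - 2 + 1 = 4
 *       512b chunks, i.e. byte offsets [1024, 3072) within the machine
 *       page it references (assuming a 4096 byte page).
 */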
483
484/**
485 * Character device backend specific configuration data.
486 */
487struct xbb_dev_data {
488 /** Cdev used for device backend access. */
489 struct cdev *cdev;
490
491 /** Cdev switch used for device backend access. */
492 struct cdevsw *csw;
493
494 /** Used to hold a reference on opened cdev backend devices. */
495 int dev_ref;
496};
497
498/**
499 * File backend specific configuration data.
500 */
501struct xbb_file_data {
502 /** Credentials to use for vnode backed (file based) I/O. */
503 struct ucred *cred;
504
505 /**
506 * \brief Array of io vectors used to process file based I/O.
507 *
508 * Only a single file based request is outstanding per-xbb instance,
509 * so we only need one of these.
510 */
511 struct iovec xiovecs[XBB_MAX_SEGMENTS_PER_REQLIST];
512#ifdef XBB_USE_BOUNCE_BUFFERS
513
514 /**
515 * \brief Array of io vectors used to handle bouncing of file reads.
516 *
517 * Vnode operations are free to modify uio data during their
 518 * execution. In the case of a read with bounce buffering active,
519 * we need some of the data from the original uio in order to
520 * bounce-out the read data. This array serves as the temporary
521 * storage for this saved data.
522 */
523 struct iovec saved_xiovecs[XBB_MAX_SEGMENTS_PER_REQLIST];
524
525 /**
526 * \brief Array of memoized bounce buffer kva offsets used
527 * in the file based backend.
528 *
529 * Due to the way that the mapping of the memory backing an
530 * I/O transaction is handled by Xen, a second pass through
531 * the request sg elements is unavoidable. We memoize the computed
532 * bounce address here to reduce the cost of the second walk.
533 */
534 void *xiovecs_vaddr[XBB_MAX_SEGMENTS_PER_REQLIST];
535#endif /* XBB_USE_BOUNCE_BUFFERS */
536};
537
538/**
539 * Collection of backend type specific data.
540 */
541union xbb_backend_data {
542 struct xbb_dev_data dev;
543 struct xbb_file_data file;
544};
545
546/**
547 * Function signature of backend specific I/O handlers.
548 */
549typedef int (*xbb_dispatch_t)(struct xbb_softc *xbb,
550 struct xbb_xen_reqlist *reqlist, int operation,
551 int flags);
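/**
 * \note Editor's illustrative sketch, not part of the driver.  A backend
 *       handler stored in xbb->dispatch_io must match xbb_dispatch_t.
 *       A hypothetical handler that rejects every request would be:
 *
 * \code
 *	static int
 *	example_dispatch(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist,
 *			 int operation, int flags)
 *	{
 *		// A real handler issues struct bio requests (device backend)
 *		// or vnode operations (file backend) here and returns 0 once
 *		// the I/O has been queued.  A non-zero return causes
 *		// xbb_dispatch_io() to fail the list with BLKIF_RSP_ERROR.
 *		return (ENOSYS);
 *	}
 * \endcode
 */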
552
553/**
554 * Per-instance configuration data.
555 */
556struct xbb_softc {
557
558 /**
559 * Task-queue used to process I/O requests.
560 */
561 struct taskqueue *io_taskqueue;
562
563 /**
564 * Single "run the request queue" task enqueued
565 * on io_taskqueue.
566 */
567 struct task io_task;
568
569 /** Device type for this instance. */
570 xbb_type device_type;
571
572 /** NewBus device corresponding to this instance. */
573 device_t dev;
574
575 /** Backend specific dispatch routine for this instance. */
576 xbb_dispatch_t dispatch_io;
577
578 /** The number of requests outstanding on the backend device/file. */
579 int active_request_count;
580
581 /** Free pool of request tracking structures. */
582 struct xbb_xen_req_list request_free_stailq;
583
584 /** Array, sized at connection time, of request tracking structures. */
585 struct xbb_xen_req *requests;
586
587 /** Free pool of request list structures. */
588 struct xbb_xen_reqlist_list reqlist_free_stailq;
589
590 /** List of pending request lists awaiting execution. */
591 struct xbb_xen_reqlist_list reqlist_pending_stailq;
592
593 /** Array, sized at connection time, of request list structures. */
594 struct xbb_xen_reqlist *request_lists;
595
596 /**
597 * Global pool of kva used for mapping remote domain ring
598 * and I/O transaction data.
599 */
600 vm_offset_t kva;
601
 602 /** Pseudo-physical address corresponding to kva. */
603 uint64_t gnt_base_addr;
604
605 /** The size of the global kva pool. */
606 int kva_size;
607
608 /** The size of the KVA area used for request lists. */
609 int reqlist_kva_size;
610
611 /** The number of pages of KVA used for request lists */
612 int reqlist_kva_pages;
613
614 /** Bitmap of free KVA pages */
615 bitstr_t *kva_free;
616
617 /**
618 * \brief Cached value of the front-end's domain id.
619 *
 620 * This value is used once for each mapped page in
 621 * a transaction. We cache it to avoid incurring the
622 * cost of an ivar access every time this is needed.
623 */
624 domid_t otherend_id;
625
626 /**
627 * \brief The blkif protocol abi in effect.
628 *
629 * There are situations where the back and front ends can
630 * have a different, native abi (e.g. intel x86_64 and
631 * 32bit x86 domains on the same machine). The back-end
 632 * always accommodates the front-end's native abi. That
633 * value is pulled from the XenStore and recorded here.
634 */
635 int abi;
636
637 /**
638 * \brief The maximum number of requests and request lists allowed
639 * to be in flight at a time.
640 *
641 * This value is negotiated via the XenStore.
642 */
643 u_int max_requests;
644
645 /**
646 * \brief The maximum number of segments (1 page per segment)
647 * that can be mapped by a request.
648 *
649 * This value is negotiated via the XenStore.
650 */
651 u_int max_request_segments;
652
653 /**
654 * \brief Maximum number of segments per request list.
655 *
656 * This value is derived from and will generally be larger than
657 * max_request_segments.
658 */
659 u_int max_reqlist_segments;
660
661 /**
662 * The maximum size of any request to this back-end
663 * device.
664 *
665 * This value is negotiated via the XenStore.
666 */
667 u_int max_request_size;
668
669 /**
670 * The maximum size of any request list. This is derived directly
671 * from max_reqlist_segments.
672 */
673 u_int max_reqlist_size;
674
675 /** Various configuration and state bit flags. */
676 xbb_flag_t flags;
677
678 /** Ring mapping and interrupt configuration data. */
679 struct xbb_ring_config ring_config;
680
681 /** Runtime, cross-abi safe, structures for ring access. */
682 blkif_back_rings_t rings;
683
684 /** IRQ mapping for the communication ring event channel. */
685 int irq;
686
687 /**
688 * \brief Backend access mode flags (e.g. write, or read-only).
689 *
690 * This value is passed to us by the front-end via the XenStore.
691 */
692 char *dev_mode;
693
694 /**
695 * \brief Backend device type (e.g. "disk", "cdrom", "floppy").
696 *
697 * This value is passed to us by the front-end via the XenStore.
698 * Currently unused.
699 */
700 char *dev_type;
701
702 /**
703 * \brief Backend device/file identifier.
704 *
705 * This value is passed to us by the front-end via the XenStore.
706 * We expect this to be a POSIX path indicating the file or
707 * device to open.
708 */
709 char *dev_name;
710
711 /**
712 * Vnode corresponding to the backend device node or file
 713 * we are accessing.
714 */
715 struct vnode *vn;
716
717 union xbb_backend_data backend;
718
719 /** The native sector size of the backend. */
720 u_int sector_size;
721
722 /** log2 of sector_size. */
723 u_int sector_size_shift;
724
725 /** Size in bytes of the backend device or file. */
726 off_t media_size;
727
728 /**
729 * \brief media_size expressed in terms of the backend native
730 * sector size.
731 *
732 * (e.g. xbb->media_size >> xbb->sector_size_shift).
733 */
734 uint64_t media_num_sectors;
735
736 /**
737 * \brief Array of memoized scatter gather data computed during the
738 * conversion of blkif ring requests to internal xbb_xen_req
739 * structures.
740 *
741 * Ring processing is serialized so we only need one of these.
742 */
743 struct xbb_sg xbb_sgs[XBB_MAX_SEGMENTS_PER_REQLIST];
744
745 /**
746 * Temporary grant table map used in xbb_dispatch_io(). When
747 * XBB_MAX_SEGMENTS_PER_REQLIST gets large, keeping this on the
748 * stack could cause a stack overflow.
749 */
750 struct gnttab_map_grant_ref maps[XBB_MAX_SEGMENTS_PER_REQLIST];
751
752 /** Mutex protecting per-instance data. */
753 struct mtx lock;
754
755#ifdef XENHVM
756 /**
757 * Resource representing allocated physical address space
758 * associated with our per-instance kva region.
759 */
760 struct resource *pseudo_phys_res;
761
762 /** Resource id for allocated physical address space. */
763 int pseudo_phys_res_id;
764#endif
765
766 /**
767 * I/O statistics from BlockBack dispatch down. These are
768 * coalesced requests, and we start them right before execution.
769 */
770 struct devstat *xbb_stats;
771
772 /**
773 * I/O statistics coming into BlockBack. These are the requests as
774 * we get them from BlockFront. They are started as soon as we
775 * receive a request, and completed when the I/O is complete.
776 */
777 struct devstat *xbb_stats_in;
778
779 /** Disable sending flush to the backend */
780 int disable_flush;
781
782 /** Send a real flush for every N flush requests */
783 int flush_interval;
784
785 /** Count of flush requests in the interval */
786 int flush_count;
787
788 /** Don't coalesce requests if this is set */
789 int no_coalesce_reqs;
790
791 /** Number of requests we have received */
792 uint64_t reqs_received;
793
 794 /** Number of requests we have completed */
795 uint64_t reqs_completed;
796
 797 /** How many forced dispatches (i.e. without coalescing) have happened */
798 uint64_t forced_dispatch;
799
 800 /** How many normal dispatches have happened */
801 uint64_t normal_dispatch;
802
 803 /** How many total dispatches have happened */
804 uint64_t total_dispatch;
805
806 /** How many times we have run out of KVA */
807 uint64_t kva_shortages;
808
809 /** How many times we have run out of request structures */
810 uint64_t request_shortages;
811};
812
813/*---------------------------- Request Processing ----------------------------*/
814/**
815 * Allocate an internal transaction tracking structure from the free pool.
816 *
817 * \param xbb Per-instance xbb configuration structure.
818 *
819 * \return On success, a pointer to the allocated xbb_xen_req structure.
820 * Otherwise NULL.
821 */
822static inline struct xbb_xen_req *
823xbb_get_req(struct xbb_softc *xbb)
824{
825 struct xbb_xen_req *req;
826
827 req = NULL;
828
829 mtx_assert(&xbb->lock, MA_OWNED);
830
831 if ((req = STAILQ_FIRST(&xbb->request_free_stailq)) != NULL) {
832 STAILQ_REMOVE_HEAD(&xbb->request_free_stailq, links);
833 xbb->active_request_count++;
834 }
835
836 return (req);
837}
838
839/**
840 * Return an allocated transaction tracking structure to the free pool.
841 *
842 * \param xbb Per-instance xbb configuration structure.
843 * \param req The request structure to free.
844 */
845static inline void
846xbb_release_req(struct xbb_softc *xbb, struct xbb_xen_req *req)
847{
848 mtx_assert(&xbb->lock, MA_OWNED);
849
850 STAILQ_INSERT_HEAD(&xbb->request_free_stailq, req, links);
851 xbb->active_request_count--;
852
853 KASSERT(xbb->active_request_count >= 0,
854 ("xbb_release_req: negative active count"));
855}
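/**
 * \note Editor's illustrative sketch, not part of the driver.  Both
 *       xbb_get_req() and xbb_release_req() assert that the per-instance
 *       lock is held, so callers bracket them explicitly, e.g.:
 *
 * \code
 *	mtx_lock(&xbb->lock);
 *	req = xbb_get_req(xbb);
 *	mtx_unlock(&xbb->lock);
 *	if (req == NULL) {
 *		// Free pool exhausted; the shortage is noted and the work
 *		// is retried once a completion returns a request (see
 *		// xbb_get_resources()).
 *	}
 * \endcode
 */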
856
857/**
858 * Return an xbb_xen_req_list of allocated xbb_xen_reqs to the free pool.
859 *
860 * \param xbb Per-instance xbb configuration structure.
861 * \param req_list The list of requests to free.
862 * \param nreqs The number of items in the list.
863 */
864static inline void
865xbb_release_reqs(struct xbb_softc *xbb, struct xbb_xen_req_list *req_list,
866 int nreqs)
867{
868 mtx_assert(&xbb->lock, MA_OWNED);
869
870 STAILQ_CONCAT(&xbb->request_free_stailq, req_list);
871 xbb->active_request_count -= nreqs;
872
873 KASSERT(xbb->active_request_count >= 0,
874 ("xbb_release_reqs: negative active count"));
875}
876
877/**
878 * Given a page index and 512b sector offset within that page,
879 * calculate an offset into a request's kva region.
880 *
881 * \param reqlist The request structure whose kva region will be accessed.
882 * \param pagenr The page index used to compute the kva offset.
883 * \param sector The 512b sector index used to compute the page relative
884 * kva offset.
885 *
886 * \return The computed global KVA offset.
887 */
888static inline uint8_t *
889xbb_reqlist_vaddr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector)
890{
891 return (reqlist->kva + (PAGE_SIZE * pagenr) + (sector << 9));
892}
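/**
 * \note Worked example (editor's addition).  With a 4096 byte page,
 *       xbb_reqlist_vaddr(reqlist, 2, 3) returns
 *       reqlist->kva + 2 * 4096 + (3 << 9) = reqlist->kva + 9728,
 *       i.e. the fourth 512b chunk of the third mapped page.
 */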
893
894#ifdef XBB_USE_BOUNCE_BUFFERS
895/**
896 * Given a page index and 512b sector offset within that page,
897 * calculate an offset into a request's local bounce memory region.
898 *
899 * \param reqlist The request structure whose bounce region will be accessed.
900 * \param pagenr The page index used to compute the bounce offset.
901 * \param sector The 512b sector index used to compute the page relative
902 * bounce offset.
903 *
904 * \return The computed global bounce buffer address.
905 */
906static inline uint8_t *
907xbb_reqlist_bounce_addr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector)
908{
909 return (reqlist->bounce + (PAGE_SIZE * pagenr) + (sector << 9));
910}
911#endif
912
913/**
914 * Given a page number and 512b sector offset within that page,
915 * calculate an offset into the request's memory region that the
916 * underlying backend device/file should use for I/O.
917 *
918 * \param reqlist The request structure whose I/O region will be accessed.
919 * \param pagenr The page index used to compute the I/O offset.
920 * \param sector The 512b sector index used to compute the page relative
921 * I/O offset.
922 *
923 * \return The computed global I/O address.
924 *
925 * Depending on configuration, this will either be a local bounce buffer
926 * or a pointer to the memory mapped in from the front-end domain for
927 * this request.
928 */
929static inline uint8_t *
930xbb_reqlist_ioaddr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector)
931{
932#ifdef XBB_USE_BOUNCE_BUFFERS
933 return (xbb_reqlist_bounce_addr(reqlist, pagenr, sector));
934#else
935 return (xbb_reqlist_vaddr(reqlist, pagenr, sector));
936#endif
937}
938
939/**
940 * Given a page index and 512b sector offset within that page, calculate
 941 * an offset into the local pseudo-physical address space used to map a
942 * front-end's request data into a request.
943 *
944 * \param reqlist The request list structure whose pseudo-physical region
945 * will be accessed.
946 * \param pagenr The page index used to compute the pseudo-physical offset.
947 * \param sector The 512b sector index used to compute the page relative
948 * pseudo-physical offset.
949 *
 950 * \return The computed global pseudo-physical address.
951 *
952 * Depending on configuration, this will either be a local bounce buffer
953 * or a pointer to the memory mapped in from the front-end domain for
954 * this request.
955 */
956static inline uintptr_t
957xbb_get_gntaddr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector)
958{
959 struct xbb_softc *xbb;
960
961 xbb = reqlist->xbb;
962
963 return ((uintptr_t)(xbb->gnt_base_addr +
964 (uintptr_t)(reqlist->kva - xbb->kva) +
965 (PAGE_SIZE * pagenr) + (sector << 9)));
966}
967
968/**
969 * Get Kernel Virtual Address space for mapping requests.
970 *
971 * \param xbb Per-instance xbb configuration structure.
972 * \param nr_pages Number of pages needed.
975 *
976 * \return On success, a pointer to the allocated KVA region. Otherwise NULL.
977 *
978 * Note: This should be unnecessary once we have either chaining or
979 * scatter/gather support for struct bio. At that point we'll be able to
980 * put multiple addresses and lengths in one bio/bio chain and won't need
981 * to map everything into one virtual segment.
982 */
983static uint8_t *
984xbb_get_kva(struct xbb_softc *xbb, int nr_pages)
985{
986 intptr_t first_clear;
987 intptr_t num_clear;
988 uint8_t *free_kva;
989 int i;
990
991 KASSERT(nr_pages != 0, ("xbb_get_kva of zero length"));
992
993 first_clear = 0;
994 free_kva = NULL;
995
996 mtx_lock(&xbb->lock);
997
998 /*
999 * Look for the first available page. If there are none, we're done.
1000 */
1001 bit_ffc(xbb->kva_free, xbb->reqlist_kva_pages, &first_clear);
1002
1003 if (first_clear == -1)
1004 goto bailout;
1005
1006 /*
1007 * Starting at the first available page, look for consecutive free
1008 * pages that will satisfy the user's request.
1009 */
1010 for (i = first_clear, num_clear = 0; i < xbb->reqlist_kva_pages; i++) {
1011 /*
1012 * If this is true, the page is used, so we have to reset
1013 * the number of clear pages and the first clear page
1014 * (since it pointed to a region with an insufficient number
1015 * of clear pages).
1016 */
1017 if (bit_test(xbb->kva_free, i)) {
1018 num_clear = 0;
1019 first_clear = -1;
1020 continue;
1021 }
1022
1023 if (first_clear == -1)
1024 first_clear = i;
1025
1026 /*
1027 * If this is true, we've found a large enough free region
1028 * to satisfy the request.
1029 */
1030 if (++num_clear == nr_pages) {
1031
1032 bit_nset(xbb->kva_free, first_clear,
1033 first_clear + nr_pages - 1);
1034
1035 free_kva = xbb->kva +
1036 (uint8_t *)(first_clear * PAGE_SIZE);
1037
1038 KASSERT(free_kva >= (uint8_t *)xbb->kva &&
1039 free_kva + (nr_pages * PAGE_SIZE) <=
1040 (uint8_t *)xbb->ring_config.va,
1041 ("Free KVA %p len %d out of range, "
1042 "kva = %#jx, ring VA = %#jx\n", free_kva,
1043 nr_pages * PAGE_SIZE, (uintmax_t)xbb->kva,
1044 (uintmax_t)xbb->ring_config.va));
1045 break;
1046 }
1047 }
1048
1049bailout:
1050
1051 if (free_kva == NULL) {
1052 xbb->flags |= XBBF_RESOURCE_SHORTAGE;
1053 xbb->kva_shortages++;
1054 }
1055
1056 mtx_unlock(&xbb->lock);
1057
1058 return (free_kva);
1059}
1060
1061/**
1062 * Free allocated KVA.
1063 *
1064 * \param xbb Per-instance xbb configuration structure.
1065 * \param kva_ptr Pointer to allocated KVA region.
1066 * \param nr_pages Number of pages in the KVA region.
1067 */
1068static void
1069xbb_free_kva(struct xbb_softc *xbb, uint8_t *kva_ptr, int nr_pages)
1070{
1071 intptr_t start_page;
1072
1073 mtx_assert(&xbb->lock, MA_OWNED);
1074
1075 start_page = (intptr_t)(kva_ptr - xbb->kva) >> PAGE_SHIFT;
1076 bit_nclear(xbb->kva_free, start_page, start_page + nr_pages - 1);
1077
1078}
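/**
 * \note Editor's illustrative sketch, not part of the driver.  The locking
 *       is asymmetric: xbb_get_kva() takes the instance lock itself, while
 *       xbb_free_kva() asserts that the caller already holds it (as
 *       xbb_release_reqlist() does).  A typical pairing is:
 *
 * \code
 *	kva = xbb_get_kva(xbb, nr_pages);
 *	if (kva == NULL)
 *		return (ENOMEM);	// shortage recorded; retried later
 *	// ... map grants into the region and perform the I/O ...
 *	mtx_lock(&xbb->lock);
 *	xbb_free_kva(xbb, kva, nr_pages);
 *	mtx_unlock(&xbb->lock);
 * \endcode
 */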
1079
1080/**
1081 * Unmap the front-end pages associated with this I/O request.
1082 *
 1083 * \param reqlist The request list structure to unmap.
1084 */
1085static void
1086xbb_unmap_reqlist(struct xbb_xen_reqlist *reqlist)
1087{
1088 struct gnttab_unmap_grant_ref unmap[XBB_MAX_SEGMENTS_PER_REQLIST];
1089 u_int i;
1090 u_int invcount;
1091 int error;
1092
1093 invcount = 0;
1094 for (i = 0; i < reqlist->nr_segments; i++) {
1095
1096 if (reqlist->gnt_handles[i] == GRANT_REF_INVALID)
1097 continue;
1098
1099 unmap[invcount].host_addr = xbb_get_gntaddr(reqlist, i, 0);
1100 unmap[invcount].dev_bus_addr = 0;
1101 unmap[invcount].handle = reqlist->gnt_handles[i];
1102 reqlist->gnt_handles[i] = GRANT_REF_INVALID;
1103 invcount++;
1104 }
1105
1106 error = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
1107 unmap, invcount);
1108 KASSERT(error == 0, ("Grant table operation failed"));
1109}
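/**
 * \note Editor's illustrative sketch, not part of the driver.  Each entry
 *       unmapped above was created in xbb_dispatch_io() with a matching
 *       GNTTABOP_map_grant_ref operation.  In outline, the round trip for
 *       a single page looks like:
 *
 * \code
 *	struct gnttab_map_grant_ref	map;
 *	struct gnttab_unmap_grant_ref	unmap;
 *
 *	map.host_addr = gnt_addr;	// pseudo-physical address to map at
 *	map.flags     = GNTMAP_host_map;
 *	map.ref       = gref;		// grant reference from the front-end
 *	map.dom       = otherend_id;
 *	HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &map, 1);
 *	// On success map.status == 0 and map.handle records the mapping.
 *
 *	unmap.host_addr    = gnt_addr;
 *	unmap.dev_bus_addr = 0;
 *	unmap.handle       = map.handle;
 *	HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &unmap, 1);
 * \endcode
 */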
1110
1111/**
1112 * Allocate an internal transaction tracking structure from the free pool.
1113 *
1114 * \param xbb Per-instance xbb configuration structure.
1115 *
1116 * \return On success, a pointer to the allocated xbb_xen_reqlist structure.
1117 * Otherwise NULL.
1118 */
1119static inline struct xbb_xen_reqlist *
1120xbb_get_reqlist(struct xbb_softc *xbb)
1121{
1122 struct xbb_xen_reqlist *reqlist;
1123
1124 reqlist = NULL;
1125
1126 mtx_assert(&xbb->lock, MA_OWNED);
1127
1128 if ((reqlist = STAILQ_FIRST(&xbb->reqlist_free_stailq)) != NULL) {
1129
1130 STAILQ_REMOVE_HEAD(&xbb->reqlist_free_stailq, links);
1131 reqlist->flags = XBB_REQLIST_NONE;
1132 reqlist->kva = NULL;
1133 reqlist->status = BLKIF_RSP_OKAY;
1134 reqlist->residual_512b_sectors = 0;
1135 reqlist->num_children = 0;
1136 reqlist->nr_segments = 0;
1137 STAILQ_INIT(&reqlist->contig_req_list);
1138 }
1139
1140 return (reqlist);
1141}
1142
1143/**
1144 * Return an allocated transaction tracking structure to the free pool.
1145 *
1146 * \param xbb Per-instance xbb configuration structure.
 1147 * \param reqlist The request list structure to free.
1148 * \param wakeup If set, wakeup the work thread if freeing this reqlist
1149 * during a resource shortage condition.
1150 */
1151static inline void
1152xbb_release_reqlist(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist,
1153 int wakeup)
1154{
1155
1156 mtx_lock(&xbb->lock);
1157
1158 if (wakeup) {
1159 wakeup = xbb->flags & XBBF_RESOURCE_SHORTAGE;
1160 xbb->flags &= ~XBBF_RESOURCE_SHORTAGE;
1161 }
1162
1163 if (reqlist->kva != NULL)
1164 xbb_free_kva(xbb, reqlist->kva, reqlist->nr_segments);
1165
1166 xbb_release_reqs(xbb, &reqlist->contig_req_list, reqlist->num_children);
1167
1168 STAILQ_INSERT_TAIL(&xbb->reqlist_free_stailq, reqlist, links);
1169
1170 if ((xbb->flags & XBBF_SHUTDOWN) != 0) {
1171 /*
1172 * Shutdown is in progress. See if we can
1173 * progress further now that one more request
1174 * has completed and been returned to the
1175 * free pool.
1176 */
1177 xbb_shutdown(xbb);
1178 }
1179
1180 mtx_unlock(&xbb->lock);
1181
1182 if (wakeup != 0)
1183 taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task);
1184}
1185
1186/**
1187 * Request resources and do basic request setup.
1188 *
1189 * \param xbb Per-instance xbb configuration structure.
1190 * \param reqlist Pointer to reqlist pointer.
1191 * \param ring_req Pointer to a block ring request.
 1192 * \param ring_idx The ring index of this request.
1193 *
1194 * \return 0 for success, non-zero for failure.
1195 */
1196static int
1197xbb_get_resources(struct xbb_softc *xbb, struct xbb_xen_reqlist **reqlist,
1198 blkif_request_t *ring_req, RING_IDX ring_idx)
1199{
1200 struct xbb_xen_reqlist *nreqlist;
1201 struct xbb_xen_req *nreq;
1202
1203 nreqlist = NULL;
1204 nreq = NULL;
1205
1206 mtx_lock(&xbb->lock);
1207
1208 /*
1209 * We don't allow new resources to be allocated if we're in the
1210 * process of shutting down.
1211 */
1212 if ((xbb->flags & XBBF_SHUTDOWN) != 0) {
1213 mtx_unlock(&xbb->lock);
1214 return (1);
1215 }
1216
1217 /*
1218 * Allocate a reqlist if the caller doesn't have one already.
1219 */
1220 if (*reqlist == NULL) {
1221 nreqlist = xbb_get_reqlist(xbb);
1222 if (nreqlist == NULL)
1223 goto bailout_error;
1224 }
1225
1226 /* We always allocate a request. */
1227 nreq = xbb_get_req(xbb);
1228 if (nreq == NULL)
1229 goto bailout_error;
1230
1231 mtx_unlock(&xbb->lock);
1232
1233 if (*reqlist == NULL) {
1234 *reqlist = nreqlist;
1235 nreqlist->operation = ring_req->operation;
1236 nreqlist->starting_sector_number = ring_req->sector_number;
1237 STAILQ_INSERT_TAIL(&xbb->reqlist_pending_stailq, nreqlist,
1238 links);
1239 }
1240
1241 nreq->reqlist = *reqlist;
1242 nreq->req_ring_idx = ring_idx;
1243
1244 if (xbb->abi != BLKIF_PROTOCOL_NATIVE) {
1245 bcopy(ring_req, &nreq->ring_req_storage, sizeof(*ring_req));
1246 nreq->ring_req = &nreq->ring_req_storage;
1247 } else {
1248 nreq->ring_req = ring_req;
1249 }
1250
1251 binuptime(&nreq->ds_t0);
1252 devstat_start_transaction(xbb->xbb_stats_in, &nreq->ds_t0);
1253 STAILQ_INSERT_TAIL(&(*reqlist)->contig_req_list, nreq, links);
1254 (*reqlist)->num_children++;
1255 (*reqlist)->nr_segments += ring_req->nr_segments;
1256
1257 return (0);
1258
1259bailout_error:
1260
1261 /*
1262 * We're out of resources, so set the shortage flag. The next time
1263 * a request is released, we'll try waking up the work thread to
1264 * see if we can allocate more resources.
1265 */
1266 xbb->flags |= XBBF_RESOURCE_SHORTAGE;
1267 xbb->request_shortages++;
1268
1269 if (nreq != NULL)
1270 xbb_release_req(xbb, nreq);
1271
1272 mtx_unlock(&xbb->lock);
1273
1274 if (nreqlist != NULL)
1275 xbb_release_reqlist(xbb, nreqlist, /*wakeup*/ 0);
1276
1277 return (1);
1278}
1279
1280/**
1281 * Create and transmit a response to a blkif request.
1282 *
1283 * \param xbb Per-instance xbb configuration structure.
1284 * \param req The request structure to which to respond.
1285 * \param status The status code to report. See BLKIF_RSP_*
1286 * in sys/xen/interface/io/blkif.h.
1287 */
1288static void
1289xbb_send_response(struct xbb_softc *xbb, struct xbb_xen_req *req, int status)
1290{
1291 blkif_response_t *resp;
1292 int more_to_do;
1293 int notify;
1294
1295 more_to_do = 0;
1296
1297 /*
1298 * Place on the response ring for the relevant domain.
1299 * For now, only the spacing between entries is different
1300 * in the different ABIs, not the response entry layout.
1301 */
1302 mtx_lock(&xbb->lock);
1303 switch (xbb->abi) {
1304 case BLKIF_PROTOCOL_NATIVE:
1305 resp = RING_GET_RESPONSE(&xbb->rings.native,
1306 xbb->rings.native.rsp_prod_pvt);
1307 break;
1308 case BLKIF_PROTOCOL_X86_32:
1309 resp = (blkif_response_t *)
1310 RING_GET_RESPONSE(&xbb->rings.x86_32,
1311 xbb->rings.x86_32.rsp_prod_pvt);
1312 break;
1313 case BLKIF_PROTOCOL_X86_64:
1314 resp = (blkif_response_t *)
1315 RING_GET_RESPONSE(&xbb->rings.x86_64,
1316 xbb->rings.x86_64.rsp_prod_pvt);
1317 break;
1318 default:
1319 panic("Unexpected blkif protocol ABI.");
1320 }
1321
1322 resp->id = req->id;
1323 resp->operation = req->operation;
1324 resp->status = status;
1325
1326 xbb->rings.common.rsp_prod_pvt += BLKIF_SEGS_TO_BLOCKS(req->nr_pages);
1327 RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xbb->rings.common, notify);
1328
1329 if (xbb->rings.common.rsp_prod_pvt == xbb->rings.common.req_cons) {
1330
1331 /*
1332 * Tail check for pending requests. Allows frontend to avoid
1333 * notifications if requests are already in flight (lower
1334 * overheads and promotes batching).
1335 */
1336 RING_FINAL_CHECK_FOR_REQUESTS(&xbb->rings.common, more_to_do);
1337 } else if (RING_HAS_UNCONSUMED_REQUESTS(&xbb->rings.common)) {
1338
1339 more_to_do = 1;
1340 }
1341
1342 xbb->reqs_completed++;
1343
1344 mtx_unlock(&xbb->lock);
1345
1346 if (more_to_do)
1347 taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task);
1348
1349 if (notify)
1350 notify_remote_via_irq(xbb->irq);
1351}
1352
1353/**
1354 * Complete a request list.
1355 *
1356 * \param xbb Per-instance xbb configuration structure.
1357 * \param reqlist Allocated internal request list structure.
1358 */
1359static void
1360xbb_complete_reqlist(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist)
1361{
1362 struct xbb_xen_req *nreq;
1363 off_t sectors_sent;
1364
1365 sectors_sent = 0;
1366
1367 if (reqlist->flags & XBB_REQLIST_MAPPED)
1368 xbb_unmap_reqlist(reqlist);
1369
1370 /*
1371 * All I/O is done, send the response. A lock should not be
1372 * necessary here because the request list is complete, and
1373 * therefore this is the only context accessing this request
1374 * right now. The functions we call do their own locking if
1375 * necessary.
1376 */
1377 STAILQ_FOREACH(nreq, &reqlist->contig_req_list, links) {
1378 off_t cur_sectors_sent;
1379
1380 xbb_send_response(xbb, nreq, reqlist->status);
1381
1382 /* We don't report bytes sent if there is an error. */
1383 if (reqlist->status == BLKIF_RSP_OKAY)
1384 cur_sectors_sent = nreq->nr_512b_sectors;
1385 else
1386 cur_sectors_sent = 0;
1387
1388 sectors_sent += cur_sectors_sent;
1389
1390 devstat_end_transaction(xbb->xbb_stats_in,
1391 /*bytes*/cur_sectors_sent << 9,
1392 reqlist->ds_tag_type,
1393 reqlist->ds_trans_type,
1394 /*now*/NULL,
1395 /*then*/&nreq->ds_t0);
1396 }
1397
1398 /*
1399 * Take out any sectors not sent. If we wind up negative (which
1400 * might happen if an error is reported as well as a residual), just
1401 * report 0 sectors sent.
1402 */
1403 sectors_sent -= reqlist->residual_512b_sectors;
1404 if (sectors_sent < 0)
1405 sectors_sent = 0;
1406
1407 devstat_end_transaction(xbb->xbb_stats,
1408 /*bytes*/ sectors_sent << 9,
1409 reqlist->ds_tag_type,
1410 reqlist->ds_trans_type,
1411 /*now*/NULL,
1412 /*then*/&reqlist->ds_t0);
1413
1414 xbb_release_reqlist(xbb, reqlist, /*wakeup*/ 1);
1415}
1416
1417/**
1418 * Completion handler for buffer I/O requests issued by the device
1419 * backend driver.
1420 *
1421 * \param bio The buffer I/O request on which to perform completion
1422 * processing.
1423 */
1424static void
1425xbb_bio_done(struct bio *bio)
1426{
1427 struct xbb_softc *xbb;
1428 struct xbb_xen_reqlist *reqlist;
1429
1430 reqlist = bio->bio_caller1;
1431 xbb = reqlist->xbb;
1432
1433 reqlist->residual_512b_sectors += bio->bio_resid >> 9;
1434
1435 /*
1436 * This is a bit imprecise. With aggregated I/O a single
1437 * request list can contain multiple front-end requests and
 1438 * multiple bios may point to a single request. By carefully
1439 * walking the request list, we could map residuals and errors
1440 * back to the original front-end request, but the interface
1441 * isn't sufficiently rich for us to properly report the error.
1442 * So, we just treat the entire request list as having failed if an
1443 * error occurs on any part. And, if an error occurs, we treat
1444 * the amount of data transferred as 0.
1445 *
1446 * For residuals, we report it on the overall aggregated device,
1447 * but not on the individual requests, since we don't currently
1448 * do the work to determine which front-end request to which the
1449 * residual applies.
1450 */
1451 if (bio->bio_error) {
1452 DPRINTF("BIO returned error %d for operation on device %s\n",
1453 bio->bio_error, xbb->dev_name);
1454 reqlist->status = BLKIF_RSP_ERROR;
1455
1456 if (bio->bio_error == ENXIO
1457 && xenbus_get_state(xbb->dev) == XenbusStateConnected) {
1458
1459 /*
1460 * Backend device has disappeared. Signal the
1461 * front-end that we (the device proxy) want to
1462 * go away.
1463 */
1464 xenbus_set_state(xbb->dev, XenbusStateClosing);
1465 }
1466 }
1467
1468#ifdef XBB_USE_BOUNCE_BUFFERS
1469 if (bio->bio_cmd == BIO_READ) {
1470 vm_offset_t kva_offset;
1471
1472 kva_offset = (vm_offset_t)bio->bio_data
1473 - (vm_offset_t)reqlist->bounce;
1474 memcpy((uint8_t *)reqlist->kva + kva_offset,
1475 bio->bio_data, bio->bio_bcount);
1476 }
1477#endif /* XBB_USE_BOUNCE_BUFFERS */
1478
1479 /*
1480 * Decrement the pending count for the request list. When we're
1481 * done with the requests, send status back for all of them.
1482 */
1483 if (atomic_fetchadd_int(&reqlist->pendcnt, -1) == 1)
1484 xbb_complete_reqlist(xbb, reqlist);
1485
1486 g_destroy_bio(bio);
1487}
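/**
 * \note Editor's illustrative sketch, not part of the driver.  The
 *       completion test above relies on the dispatch side setting
 *       reqlist->pendcnt to the number of bios before any of them are
 *       issued, so that only the final xbb_bio_done() call (the one for
 *       which atomic_fetchadd_int() returns 1) completes the list:
 *
 * \code
 *	reqlist->pendcnt = nbio;		// set before issuing anything
 *	for (i = 0; i < nbio; i++) {
 *		bios[i]->bio_caller1 = reqlist;
 *		bios[i]->bio_done    = xbb_bio_done;
 *		// hand bios[i] to the backing device
 *	}
 * \endcode
 */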
1488
1489/**
1490 * Parse a blkif request into an internal request structure and send
1491 * it to the backend for processing.
1492 *
1493 * \param xbb Per-instance xbb configuration structure.
1494 * \param reqlist Allocated internal request list structure.
1495 *
1496 * \return On success, 0. For resource shortages, non-zero.
1497 *
1498 * This routine performs the backend common aspects of request parsing
1499 * including compiling an internal request structure, parsing the S/G
1500 * list and any secondary ring requests in which they may reside, and
1501 * the mapping of front-end I/O pages into our domain.
1502 */
1503static int
1504xbb_dispatch_io(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist)
1505{
1506 struct xbb_sg *xbb_sg;
1507 struct gnttab_map_grant_ref *map;
1508 struct blkif_request_segment *sg;
1509 struct blkif_request_segment *last_block_sg;
1510 struct xbb_xen_req *nreq;
1511 u_int nseg;
1512 u_int seg_idx;
1513 u_int block_segs;
1514 int nr_sects;
1515 int total_sects;
1516 int operation;
1517 uint8_t bio_flags;
1518 int error;
1519
1520 reqlist->ds_tag_type = DEVSTAT_TAG_SIMPLE;
1521 bio_flags = 0;
1522 total_sects = 0;
1523 nr_sects = 0;
1524
1525 /*
1526 * First determine whether we have enough free KVA to satisfy this
1527 * request list. If not, tell xbb_run_queue() so it can go to
1528 * sleep until we have more KVA.
1529 */
1530 reqlist->kva = NULL;
1531 if (reqlist->nr_segments != 0) {
1532 reqlist->kva = xbb_get_kva(xbb, reqlist->nr_segments);
1533 if (reqlist->kva == NULL) {
1534 /*
1535 * If we're out of KVA, return ENOMEM.
1536 */
1537 return (ENOMEM);
1538 }
1539 }
1540
1541 binuptime(&reqlist->ds_t0);
1542 devstat_start_transaction(xbb->xbb_stats, &reqlist->ds_t0);
1543
1544 switch (reqlist->operation) {
1545 case BLKIF_OP_WRITE_BARRIER:
1546 bio_flags |= BIO_ORDERED;
1547 reqlist->ds_tag_type = DEVSTAT_TAG_ORDERED;
1548 /* FALLTHROUGH */
1549 case BLKIF_OP_WRITE:
1550 operation = BIO_WRITE;
1551 reqlist->ds_trans_type = DEVSTAT_WRITE;
1552 if ((xbb->flags & XBBF_READ_ONLY) != 0) {
1553 DPRINTF("Attempt to write to read only device %s\n",
1554 xbb->dev_name);
1555 reqlist->status = BLKIF_RSP_ERROR;
1556 goto send_response;
1557 }
1558 break;
1559 case BLKIF_OP_READ:
1560 operation = BIO_READ;
1561 reqlist->ds_trans_type = DEVSTAT_READ;
1562 break;
1563 case BLKIF_OP_FLUSH_DISKCACHE:
1564 /*
1565 * If this is true, the user has requested that we disable
1566 * flush support. So we just complete the requests
1567 * successfully.
1568 */
1569 if (xbb->disable_flush != 0) {
1570 goto send_response;
1571 }
1572
1573 /*
1574 * The user has requested that we only send a real flush
1575 * for every N flush requests. So keep count, and either
1576 * complete the request immediately or queue it for the
1577 * backend.
1578 */
1579 if (xbb->flush_interval != 0) {
1580 if (++(xbb->flush_count) < xbb->flush_interval) {
1581 goto send_response;
1582 } else
1583 xbb->flush_count = 0;
1584 }
1585
1586 operation = BIO_FLUSH;
1587 reqlist->ds_tag_type = DEVSTAT_TAG_ORDERED;
1588 reqlist->ds_trans_type = DEVSTAT_NO_DATA;
1589 goto do_dispatch;
1590 /*NOTREACHED*/
1591 default:
1592 DPRINTF("error: unknown block io operation [%d]\n",
1593 reqlist->operation);
1594 reqlist->status = BLKIF_RSP_ERROR;
1595 goto send_response;
1596 }
1597
1598 reqlist->xbb = xbb;
1599 xbb_sg = xbb->xbb_sgs;
1600 map = xbb->maps;
1601 seg_idx = 0;
1602
1603 STAILQ_FOREACH(nreq, &reqlist->contig_req_list, links) {
1604 blkif_request_t *ring_req;
1605 RING_IDX req_ring_idx;
1606 u_int req_seg_idx;
1607
1608 ring_req = nreq->ring_req;
1609 req_ring_idx = nreq->req_ring_idx;
1610 nr_sects = 0;
1611 nseg = ring_req->nr_segments;
1612 nreq->id = ring_req->id;
1613 nreq->nr_pages = nseg;
1614 nreq->nr_512b_sectors = 0;
1615 req_seg_idx = 0;
1616 sg = NULL;
1617
1618 /* Check that number of segments is sane. */
1619 if (unlikely(nseg == 0)
1620 || unlikely(nseg > xbb->max_request_segments)) {
1621 DPRINTF("Bad number of segments in request (%d)\n",
1622 nseg);
1623 reqlist->status = BLKIF_RSP_ERROR;
1624 goto send_response;
1625 }
1626
1627 block_segs = MIN(nreq->nr_pages,
1628 BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK);
1629 sg = ring_req->seg;
1630 last_block_sg = sg + block_segs;
1631 while (1) {
1632
1633 while (sg < last_block_sg) {
1634 KASSERT(seg_idx <
1635 XBB_MAX_SEGMENTS_PER_REQLIST,
1636 ("seg_idx %d is too large, max "
1637 "segs %d\n", seg_idx,
1638 XBB_MAX_SEGMENTS_PER_REQLIST));
1639
1640 xbb_sg->first_sect = sg->first_sect;
1641 xbb_sg->last_sect = sg->last_sect;
1642 xbb_sg->nsect =
1643 (int8_t)(sg->last_sect -
1644 sg->first_sect + 1);
1645
1646 if ((sg->last_sect >= (PAGE_SIZE >> 9))
1647 || (xbb_sg->nsect <= 0)) {
1648 reqlist->status = BLKIF_RSP_ERROR;
1649 goto send_response;
1650 }
1651
1652 nr_sects += xbb_sg->nsect;
1653 map->host_addr = xbb_get_gntaddr(reqlist,
1654 seg_idx, /*sector*/0);
1655 KASSERT(map->host_addr + PAGE_SIZE <=
1656 xbb->ring_config.gnt_addr,
1657 ("Host address %#jx len %d overlaps "
1658 "ring address %#jx\n",
1659 (uintmax_t)map->host_addr, PAGE_SIZE,
1660 (uintmax_t)xbb->ring_config.gnt_addr));
1661
1662 map->flags = GNTMAP_host_map;
1663 map->ref = sg->gref;
1664 map->dom = xbb->otherend_id;
1665 if (operation == BIO_WRITE)
1666 map->flags |= GNTMAP_readonly;
1667 sg++;
1668 map++;
1669 xbb_sg++;
1670 seg_idx++;
1671 req_seg_idx++;
1672 }
1673
1674 block_segs = MIN(nseg - req_seg_idx,
1675 BLKIF_MAX_SEGMENTS_PER_SEGMENT_BLOCK);
1676 if (block_segs == 0)
1677 break;
1678
1679 /*
1680 * Fetch the next request block full of SG elements.
1681 * For now, only the spacing between entries is
1682 * different in the different ABIs, not the sg entry
1683 * layout.
1684 */
1685 req_ring_idx++;
1686 switch (xbb->abi) {
1687 case BLKIF_PROTOCOL_NATIVE:
1688 sg = BLKRING_GET_SEG_BLOCK(&xbb->rings.native,
1689 req_ring_idx);
1690 break;
1691 case BLKIF_PROTOCOL_X86_32:
1692 {
1693 sg = BLKRING_GET_SEG_BLOCK(&xbb->rings.x86_32,
1694 req_ring_idx);
1695 break;
1696 }
1697 case BLKIF_PROTOCOL_X86_64:
1698 {
1699 sg = BLKRING_GET_SEG_BLOCK(&xbb->rings.x86_64,
1700 req_ring_idx);
1701 break;
1702 }
1703 default:
1704 panic("Unexpected blkif protocol ABI.");
1705 /* NOTREACHED */
1706 }
1707 last_block_sg = sg + block_segs;
1708 }
1709
1710 /* Convert to the disk's sector size */
1711 nreq->nr_512b_sectors = nr_sects;
1712 nr_sects = (nr_sects << 9) >> xbb->sector_size_shift;
1713 total_sects += nr_sects;
1714
1715 if ((nreq->nr_512b_sectors &
1716 ((xbb->sector_size >> 9) - 1)) != 0) {
1717 device_printf(xbb->dev, "%s: I/O size (%d) is not "
1718 "a multiple of the backing store sector "
1719 "size (%d)\n", __func__,
1720 nreq->nr_512b_sectors << 9,
1721 xbb->sector_size);
1722 reqlist->status = BLKIF_RSP_ERROR;
1723 goto send_response;
1724 }
1725 }
1726
1727 error = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
1728 xbb->maps, reqlist->nr_segments);
1729 if (error != 0)
1730 panic("Grant table operation failed (%d)", error);
1731
1732 reqlist->flags |= XBB_REQLIST_MAPPED;
1733
1734 for (seg_idx = 0, map = xbb->maps; seg_idx < reqlist->nr_segments;
1735 seg_idx++, map++){
1736
1737 if (unlikely(map->status != 0)) {
1738 DPRINTF("invalid buffer -- could not remap "
1739 "it (%d)\n", map->status);
1740 DPRINTF("Mapping(%d): Host Addr 0x%lx, flags "
1741 "0x%x ref 0x%x, dom %d\n", seg_idx,
1742 map->host_addr, map->flags, map->ref,
1743 map->dom);
1744 reqlist->status = BLKIF_RSP_ERROR;
1745 goto send_response;
1746 }
1747
1748 reqlist->gnt_handles[seg_idx] = map->handle;
1749 }
1750 if (reqlist->starting_sector_number + total_sects >
1751 xbb->media_num_sectors) {
1752
1753 DPRINTF("%s of [%" PRIu64 ",%" PRIu64 "] "
1754 "extends past end of device %s\n",
1755 operation == BIO_READ ? "read" : "write",
1756 reqlist->starting_sector_number,
1757 reqlist->starting_sector_number + total_sects,
1758 xbb->dev_name);
1759 reqlist->status = BLKIF_RSP_ERROR;
1760 goto send_response;
1761 }
1762
1763do_dispatch:
1764
1765 error = xbb->dispatch_io(xbb,
1766 reqlist,
1767 operation,
1768 bio_flags);
1769
1770 if (error != 0) {
1771 reqlist->status = BLKIF_RSP_ERROR;
1772 goto send_response;
1773 }
1774
1775 return (0);
1776
1777send_response:
1778
1779 xbb_complete_reqlist(xbb, reqlist);
1780
1781 return (0);
1782}
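/**
 * \note Worked example (editor's addition).  For a backing store with
 *       4096 byte sectors (sector_size_shift = 12), a front-end request
 *       covering 24 512b sectors converts to (24 << 9) >> 12 = 3 device
 *       sectors.  A request of only 3 512b sectors (1536 bytes) fails
 *       the multiple-of-sector-size check, since 3 & ((4096 >> 9) - 1)
 *       = 3 & 7 != 0, and is completed with BLKIF_RSP_ERROR.
 */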
1783
1784static __inline int
1785xbb_count_sects(blkif_request_t *ring_req)
1786{
1787 int i;
1788 int cur_size = 0;
1789
1790 for (i = 0; i < ring_req->nr_segments; i++) {
1791 int nsect;
1792
1793 nsect = (int8_t)(ring_req->seg[i].last_sect -
1794 ring_req->seg[i].first_sect + 1);
1795 if (nsect <= 0)
1796 break;
1797
1798 cur_size += nsect;
1799 }
1800
1801 return (cur_size);
1802}
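/**
 * \note Worked example (editor's addition).  A two segment request whose
 *       segments cover sectors 0-7 and 0-3 of their respective pages
 *       yields cur_size = (7 - 0 + 1) + (3 - 0 + 1) = 12, i.e. 12 512b
 *       sectors (6144 bytes).
 */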
1803
1804/**
1805 * Process incoming requests from the shared communication ring in response
1806 * to a signal on the ring's event channel.
1807 *
 1808 * \param context Callback argument registered during task initialization -
1809 * the xbb_softc for this instance.
1810 * \param pending The number of taskqueue_enqueue events that have
1811 * occurred since this handler was last run.
1812 */
1813static void
1814xbb_run_queue(void *context, int pending)
1815{
1816 struct xbb_softc *xbb;
1817 blkif_back_rings_t *rings;
1818 RING_IDX rp;
1819 uint64_t cur_sector;
1820 int cur_operation;
1821 struct xbb_xen_reqlist *reqlist;
1822
1823
1824 xbb = (struct xbb_softc *)context;
1825 rings = &xbb->rings;
1826
1827 /*
1828 * Work gather and dispatch loop. Note that we have a bias here
1829 * towards gathering I/O sent by blockfront. We first gather up
1830 * everything in the ring, as long as we have resources. Then we
1831 * dispatch one request, and then attempt to gather up any
1832 * additional requests that have come in while we were dispatching
1833 * the request.
1834 *
1835 * This allows us to get a clearer picture (via devstat) of how
1836 * many requests blockfront is queueing to us at any given time.
1837 */
1838 for (;;) {
1839 int retval;
1840
1841 /*
1842 * Initialize reqlist to the last element in the pending
1843 * queue, if there is one. This allows us to add more
1844 * requests to that request list, if we have room.
1845 */
1846 reqlist = STAILQ_LAST(&xbb->reqlist_pending_stailq,
1847 xbb_xen_reqlist, links);
1848 if (reqlist != NULL) {
1849 cur_sector = reqlist->next_contig_sector;
1850 cur_operation = reqlist->operation;
1851 } else {
1852 cur_operation = 0;
1853 cur_sector = 0;
1854 }
1855
1856 /*
1857 * Cache req_prod to avoid accessing a cache line shared
1858 * with the frontend.
1859 */
1860 rp = rings->common.sring->req_prod;
1861
1862 /* Ensure we see queued requests up to 'rp'. */
1863 rmb();
1864
1865 /**
1866 * Run so long as there is work to consume and the generation
1867 * of a response will not overflow the ring.
1868 *
1869 * @note There's a 1 to 1 relationship between requests and
1870 * responses, so an overflow should never occur. This
1871 * test is to protect our domain from digesting bogus
1872 * data. Shouldn't we log this?
1873 */
1874 while (rings->common.req_cons != rp
1875 && RING_REQUEST_CONS_OVERFLOW(&rings->common,
1876 rings->common.req_cons) == 0){
1877 blkif_request_t ring_req_storage;
1878 blkif_request_t *ring_req;
1879 int cur_size;
1880
1881 switch (xbb->abi) {
1882 case BLKIF_PROTOCOL_NATIVE:
1883 ring_req = RING_GET_REQUEST(&xbb->rings.native,
1884 rings->common.req_cons);
1885 break;
1886 case BLKIF_PROTOCOL_X86_32:
1887 {
1888 struct blkif_x86_32_request *ring_req32;
1889
1890 ring_req32 = RING_GET_REQUEST(
1891 &xbb->rings.x86_32, rings->common.req_cons);
1892 blkif_get_x86_32_req(&ring_req_storage,
1893 ring_req32);
1894 ring_req = &ring_req_storage;
1895 break;
1896 }
1897 case BLKIF_PROTOCOL_X86_64:
1898 {
1899 struct blkif_x86_64_request *ring_req64;
1900
1901 ring_req64 =RING_GET_REQUEST(&xbb->rings.x86_64,
1902 rings->common.req_cons);
1903 blkif_get_x86_64_req(&ring_req_storage,
1904 ring_req64);
1905 ring_req = &ring_req_storage;
1906 break;
1907 }
1908 default:
1909 panic("Unexpected blkif protocol ABI.");
1910 /* NOTREACHED */
1911 }
1912
1913 /*
1914 * Check for situations that would require closing
1915 * off this I/O for further coalescing:
1916 * - Coalescing is turned off.
1917 * - Current I/O is out of sequence with the previous
1918 * I/O.
1919 * - Coalesced I/O would be too large.
1920 */
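			/*
			 * For example, two sequential same-size reads (the
			 * second starting at the sector just past the first)
			 * are typically merged into one request list, while a
			 * read followed by a write, or a request that would
			 * push the list past max_reqlist_segments, starts a
			 * new list.
			 */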
1921 if ((reqlist != NULL)
1922 && ((xbb->no_coalesce_reqs != 0)
1923 || ((xbb->no_coalesce_reqs == 0)
1924 && ((ring_req->sector_number != cur_sector)
1925 || (ring_req->operation != cur_operation)
1926 || ((ring_req->nr_segments + reqlist->nr_segments) >
1927 xbb->max_reqlist_segments))))) {
1928 reqlist = NULL;
1929 }
1930
1931 /*
1932 * Grab and check for all resources in one shot.
1933 * If we can't get all of the resources we need,
1934 * the shortage is noted and the thread will get
1935 * woken up when more resources are available.
1936 */
1937 retval = xbb_get_resources(xbb, &reqlist, ring_req,
1938 xbb->rings.common.req_cons);
1939
1940 if (retval != 0) {
1941 /*
1942 * Resource shortage has been recorded.
1943 * We'll be scheduled to run once a request
1944 * object frees up due to a completion.
1945 */
1946 break;
1947 }
1948
1949 /*
1950 * Signify that we can overwrite this request with
1951 * a response by incrementing our consumer index.
1952 * The response won't be generated until after
1953 * we've already consumed all necessary data out
1954 * of the version of the request in the ring buffer
1955 * (for native mode). We must update the consumer
1956			 * index before issuing back-end I/O so there is
1957 * no possibility that it will complete and a
1958 * response be generated before we make room in
1959 * the queue for that response.
1960 */
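			/*
			 * Note: a request whose segment list spills into
			 * additional ring slots (segment blocks) occupies
			 * more than one slot, so the consumer index advances
			 * by the number of slots the request consumes, not
			 * by one.
			 */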
1961 xbb->rings.common.req_cons +=
1962 BLKIF_SEGS_TO_BLOCKS(ring_req->nr_segments);
1963 xbb->reqs_received++;
1964
1965 cur_size = xbb_count_sects(ring_req);
1966 cur_sector = ring_req->sector_number + cur_size;
1967 reqlist->next_contig_sector = cur_sector;
1968 cur_operation = ring_req->operation;
1969 }
1970
1971 /* Check for I/O to dispatch */
1972 reqlist = STAILQ_FIRST(&xbb->reqlist_pending_stailq);
1973 if (reqlist == NULL) {
1974 /*
1975 * We're out of work to do, put the task queue to
1976 * sleep.
1977 */
1978 break;
1979 }
1980
1981 /*
1982 * Grab the first request off the queue and attempt
1983 * to dispatch it.
1984 */
1985 STAILQ_REMOVE_HEAD(&xbb->reqlist_pending_stailq, links);
1986
1987 retval = xbb_dispatch_io(xbb, reqlist);
1988 if (retval != 0) {
1989 /*
1990 * xbb_dispatch_io() returns non-zero only when
1991 * there is a resource shortage. If that's the
1992 * case, re-queue this request on the head of the
1993 * queue, and go to sleep until we have more
1994 * resources.
1995 */
1996 STAILQ_INSERT_HEAD(&xbb->reqlist_pending_stailq,
1997 reqlist, links);
1998 break;
1999 } else {
2000 /*
2001 * If we still have anything on the queue after
2002 * removing the head entry, that is because we
2003 * met one of the criteria to create a new
2004 * request list (outlined above), and we'll call
2005 * that a forced dispatch for statistical purposes.
2006 *
2007 * Otherwise, if there is only one element on the
2008 * queue, we coalesced everything available on
2009 * the ring and we'll call that a normal dispatch.
2010 */
2011 reqlist = STAILQ_FIRST(&xbb->reqlist_pending_stailq);
2012
2013 if (reqlist != NULL)
2014 xbb->forced_dispatch++;
2015 else
2016 xbb->normal_dispatch++;
2017
2018 xbb->total_dispatch++;
2019 }
2020 }
2021}
2022
2023/**
2024 * Interrupt handler bound to the shared ring's event channel.
2025 *
2026 * \param arg  Callback argument registered during event channel
2027 * binding - the xbb_softc for this instance.
2028 */
2029static void
2030xbb_intr(void *arg)
2031{
2032 struct xbb_softc *xbb;
2033
2034 /* Defer to kernel thread. */
2035 xbb = (struct xbb_softc *)arg;
2036 taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task);
2037}
2038
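/*
 * Statically defined trace (SDT/DTrace) probes fired by the device-backed
 * dispatch routine below.  The flush probe carries the device unit number;
 * the read and write probes carry the unit number, byte offset, and length
 * of the I/O being issued.
 */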
2039SDT_PROVIDER_DEFINE(xbb);
2040SDT_PROBE_DEFINE1(xbb, kernel, xbb_dispatch_dev, flush, flush, "int");
2041SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_dev, read, read, "int", "uint64_t",
2042 "uint64_t");
2043SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_dev, write, write, "int",
2044 "uint64_t", "uint64_t");
2045
2046/*----------------------------- Backend Handlers -----------------------------*/
2047/**
2048 * Backend handler for character device access.
2049 *
2050 * \param xbb Per-instance xbb configuration structure.
2051 * \param reqlist Allocated internal request list structure.
2052 * \param operation BIO_* I/O operation code.
2053 * \param bio_flags Additional bio_flag data to pass to any generated
2054 *			bios (e.g. BIO_ORDERED).
2055 *
2056 * \return 0 for success, errno codes for failure.
2057 */
2058static int
2059xbb_dispatch_dev(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist,
2060 int operation, int bio_flags)
2061{
2062 struct xbb_dev_data *dev_data;
2063 struct bio *bios[XBB_MAX_SEGMENTS_PER_REQLIST];
2064 struct xbb_xen_req *nreq;
2065 off_t bio_offset;
2066 struct bio *bio;
2067 struct xbb_sg *xbb_sg;
2068 u_int nbio;
2069 u_int bio_idx;
2070 u_int nseg;
2071 u_int seg_idx;
2072 int error;
2073
2074 dev_data = &xbb->backend.dev;
2075 bio_offset = (off_t)reqlist->starting_sector_number
2076 << xbb->sector_size_shift;
2077 error = 0;
2078 nbio = 0;
2079 bio_idx = 0;
2080
2081 if (operation == BIO_FLUSH) {
2082 nreq = STAILQ_FIRST(&reqlist->contig_req_list);
2083 bio = g_new_bio();
2084 if (unlikely(bio == NULL)) {
2085 DPRINTF("Unable to allocate bio for BIO_FLUSH\n");
2086 error = ENOMEM;
2087 return (error);
2088 }
2089
2090 bio->bio_cmd = BIO_FLUSH;
2091 bio->bio_flags |= BIO_ORDERED;
2092 bio->bio_dev = dev_data->cdev;
2093 bio->bio_offset = 0;
2094 bio->bio_data = 0;
2095 bio->bio_done = xbb_bio_done;
2096 bio->bio_caller1 = nreq;
2097 bio->bio_pblkno = 0;
2098
2099 nreq->pendcnt = 1;
2100
2101 SDT_PROBE1(xbb, kernel, xbb_dispatch_dev, flush,
2102 device_get_unit(xbb->dev));
2103
2104 (*dev_data->csw->d_strategy)(bio);
2105
2106 return (0);
2107 }
2108
2109 xbb_sg = xbb->xbb_sgs;
2110 bio = NULL;
2111 nseg = reqlist->nr_segments;
2112
2113 for (seg_idx = 0; seg_idx < nseg; seg_idx++, xbb_sg++) {
2114
2115 /*
2116 * KVA will not be contiguous, so any additional
2117 * I/O will need to be represented in a new bio.
2118 */
2119 if ((bio != NULL)
2120 && (xbb_sg->first_sect != 0)) {
2121 if ((bio->bio_length & (xbb->sector_size - 1)) != 0) {
2122 printf("%s: Discontiguous I/O request "
2123 "from domain %d ends on "
2124 "non-sector boundary\n",
2125 __func__, xbb->otherend_id);
2126 error = EINVAL;
2127 goto fail_free_bios;
2128 }
2129 bio = NULL;
2130 }
2131
2132 if (bio == NULL) {
2133 /*
2134 * Make sure that the start of this bio is
2135 * aligned to a device sector.
2136 */
2137 if ((bio_offset & (xbb->sector_size - 1)) != 0){
2138 printf("%s: Misaligned I/O request "
2139 "from domain %d\n", __func__,
2140 xbb->otherend_id);
2141 error = EINVAL;
2142 goto fail_free_bios;
2143 }
2144
2145 bio = bios[nbio++] = g_new_bio();
2146 if (unlikely(bio == NULL)) {
2147 error = ENOMEM;
2148 goto fail_free_bios;
2149 }
2150 bio->bio_cmd = operation;
2151 bio->bio_flags |= bio_flags;
2152 bio->bio_dev = dev_data->cdev;
2153 bio->bio_offset = bio_offset;
2154 bio->bio_data = xbb_reqlist_ioaddr(reqlist, seg_idx,
2155 xbb_sg->first_sect);
2156 bio->bio_done = xbb_bio_done;
2157 bio->bio_caller1 = reqlist;
2158 bio->bio_pblkno = bio_offset >> xbb->sector_size_shift;
2159 }
2160
2161 bio->bio_length += xbb_sg->nsect << 9;
2162 bio->bio_bcount = bio->bio_length;
2163 bio_offset += xbb_sg->nsect << 9;
2164
2165 if (xbb_sg->last_sect != (PAGE_SIZE - 512) >> 9) {
2166
2167 if ((bio->bio_length & (xbb->sector_size - 1)) != 0) {
2168 printf("%s: Discontiguous I/O request "
2169 "from domain %d ends on "
2170 "non-sector boundary\n",
2171 __func__, xbb->otherend_id);
2172 error = EINVAL;
2173 goto fail_free_bios;
2174 }
2175 /*
2176 * KVA will not be contiguous, so any additional
2177 * I/O will need to be represented in a new bio.
2178 */
2179 bio = NULL;
2180 }
2181 }
2182
2183 reqlist->pendcnt = nbio;
2184
2185 for (bio_idx = 0; bio_idx < nbio; bio_idx++)
2186 {
2187#ifdef XBB_USE_BOUNCE_BUFFERS
2188 vm_offset_t kva_offset;
2189
2190 kva_offset = (vm_offset_t)bios[bio_idx]->bio_data
2191 - (vm_offset_t)reqlist->bounce;
2192 if (operation == BIO_WRITE) {
2193 memcpy(bios[bio_idx]->bio_data,
2194 (uint8_t *)reqlist->kva + kva_offset,
2195 bios[bio_idx]->bio_bcount);
2196 }
2197#endif
2198 if (operation == BIO_READ) {
2199 SDT_PROBE3(xbb, kernel, xbb_dispatch_dev, read,
2200 device_get_unit(xbb->dev),
2201 bios[bio_idx]->bio_offset,
2202 bios[bio_idx]->bio_length);
2203 } else if (operation == BIO_WRITE) {
2204 SDT_PROBE3(xbb, kernel, xbb_dispatch_dev, write,
2205 device_get_unit(xbb->dev),
2206 bios[bio_idx]->bio_offset,
2207 bios[bio_idx]->bio_length);
2208 }
2209 (*dev_data->csw->d_strategy)(bios[bio_idx]);
2210 }
2211
2212 return (error);
2213
2214fail_free_bios:
2215 for (bio_idx = 0; bio_idx < (nbio-1); bio_idx++)
2216 g_destroy_bio(bios[bio_idx]);
2217
2218 return (error);
2219}
2220
2221SDT_PROBE_DEFINE1(xbb, kernel, xbb_dispatch_file, flush, flush, "int");
2222SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_file, read, read, "int", "uint64_t",
2223 "uint64_t");
2224SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_file, write, write, "int",
2225 "uint64_t", "uint64_t");
2226
2227/**
2228 * Backend handler for file access.
2229 *
2230 * \param xbb Per-instance xbb configuration structure.
2231 * \param reqlist Allocated internal request list.
2232 * \param operation BIO_* I/O operation code.
2233 * \param flags Additional bio_flag data to pass to any generated bios
2234 *			(e.g. BIO_ORDERED).
2235 *
2236 * \return 0 for success, errno codes for failure.
2237 */
2238static int
2239xbb_dispatch_file(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist,
2240 int operation, int flags)
2241{
2242 struct xbb_file_data *file_data;
2243 u_int seg_idx;
2244 u_int nseg;
2245 off_t sectors_sent;
2246 struct uio xuio;
2247 struct xbb_sg *xbb_sg;
2248 struct iovec *xiovec;
2249#ifdef XBB_USE_BOUNCE_BUFFERS
2250 void **p_vaddr;
2251 int saved_uio_iovcnt;
2252#endif /* XBB_USE_BOUNCE_BUFFERS */
35
36/**
37 * \file blkback.c
38 *
39 * \brief Device driver supporting the vending of block storage from
40 * a FreeBSD domain to other domains.
41 */
42
43#include "opt_kdtrace.h"
44
45#include <sys/param.h>
46#include <sys/systm.h>
47#include <sys/kernel.h>
48#include <sys/malloc.h>
49
50#include <sys/bio.h>
51#include <sys/bus.h>
52#include <sys/conf.h>
53#include <sys/devicestat.h>
54#include <sys/disk.h>
55#include <sys/fcntl.h>
56#include <sys/filedesc.h>
57#include <sys/kdb.h>
58#include <sys/module.h>
59#include <sys/namei.h>
60#include <sys/proc.h>
61#include <sys/rman.h>
62#include <sys/taskqueue.h>
63#include <sys/types.h>
64#include <sys/vnode.h>
65#include <sys/mount.h>
66#include <sys/sysctl.h>
67#include <sys/bitstring.h>
68#include <sys/sdt.h>
69
70#include <geom/geom.h>
71
72#include <machine/_inttypes.h>
73#include <machine/xen/xen-os.h>
74
75#include <vm/vm.h>
76#include <vm/vm_extern.h>
77#include <vm/vm_kern.h>
78
79#include <xen/blkif.h>
80#include <xen/evtchn.h>
81#include <xen/gnttab.h>
82#include <xen/xen_intr.h>
83
84#include <xen/interface/event_channel.h>
85#include <xen/interface/grant_table.h>
86
87#include <xen/xenbus/xenbusvar.h>
88
89/*--------------------------- Compile-time Tunables --------------------------*/
90/**
91 * The maximum number of outstanding request blocks (request headers plus
92 * additional segment blocks) we will allow in a negotiated block-front/back
93 * communication channel.
94 */
95#define XBB_MAX_REQUESTS 256
96
97/**
98 * \brief Define to force all I/O to be performed on memory owned by the
99 * backend device, with a copy-in/out to the remote domain's memory.
100 *
101 * \note This option is currently required when this driver's domain is
102 * operating in HVM mode on a system using an IOMMU.
103 *
104 * This driver uses Xen's grant table API to gain access to the memory of
105 * the remote domains it serves. When our domain is operating in PV mode,
106 * the grant table mechanism directly updates our domain's page table entries
107 * to point to the physical pages of the remote domain. This scheme guarantees
108 * that blkback and the backing devices it uses can safely perform DMA
109 * operations to satisfy requests. In HVM mode, Xen may use a HW IOMMU to
110 * ensure that our domain cannot DMA to pages owned by another domain.  As
111 * of Xen 4.0, IOMMU mappings for HVM guests are not updated via the grant
112 * table API. For this reason, in HVM mode, we must bounce all requests into
113 * memory that is mapped into our domain at domain startup and thus has
114 * valid IOMMU mappings.
115 */
116#define XBB_USE_BOUNCE_BUFFERS
117
118/**
119 * \brief Define to enable rudimentary request logging to the console.
120 */
121#undef XBB_DEBUG
122
123/*---------------------------------- Macros ----------------------------------*/
124/**
125 * Custom malloc type for all driver allocations.
126 */
127static MALLOC_DEFINE(M_XENBLOCKBACK, "xbbd", "Xen Block Back Driver Data");
128
129#ifdef XBB_DEBUG
130#define DPRINTF(fmt, args...) \
131 printf("xbb(%s:%d): " fmt, __FUNCTION__, __LINE__, ##args)
132#else
133#define DPRINTF(fmt, args...) do {} while(0)
134#endif
135
136/**
137 * The maximum mapped region size per request we will allow in a negotiated
138 * block-front/back communication channel.
139 */
140#define XBB_MAX_REQUEST_SIZE \
141 MIN(MAXPHYS, BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE)
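/*
 * For example, with 4KB pages and the common MAXPHYS of 128KB, this
 * evaluates to 128KB whenever the blkif segment limit allows at least
 * 32 segments per request; otherwise the segment limit governs.
 */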
142
143/**
144 * The maximum number of segments (within a request header and accompanying
145 * segment blocks) per request we will allow in a negotiated block-front/back
146 * communication channel.
147 */
148#define XBB_MAX_SEGMENTS_PER_REQUEST \
149 (MIN(UIO_MAXIOV, \
150 MIN(BLKIF_MAX_SEGMENTS_PER_REQUEST, \
151 (XBB_MAX_REQUEST_SIZE / PAGE_SIZE) + 1)))
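/*
 * Continuing the example above (4KB pages, 128KB maximum request size),
 * XBB_MAX_REQUEST_SIZE / PAGE_SIZE + 1 is 33, so the segment count is
 * capped at 33 unless UIO_MAXIOV or the blkif segment limit is smaller.
 */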
152
153/**
154 * The maximum number of shared memory ring pages we will allow in a
155 * negotiated block-front/back communication channel. Allow enough
156 * ring space for all requests to be XBB_MAX_REQUEST_SIZE'd.
157 */
158#define XBB_MAX_RING_PAGES \
159 BLKIF_RING_PAGES(BLKIF_SEGS_TO_BLOCKS(XBB_MAX_SEGMENTS_PER_REQUEST) \
160 * XBB_MAX_REQUESTS)
161/**
162 * The maximum number of segments that we can allow per request list.
163 * We limit this to the maximum number of segments per request, because
164 * that is already a reasonable number of segments to aggregate. This
165 * number should never be smaller than XBB_MAX_SEGMENTS_PER_REQUEST,
166 * because that would leave situations where we can't dispatch even one
167 * large request.
168 */
169#define XBB_MAX_SEGMENTS_PER_REQLIST XBB_MAX_SEGMENTS_PER_REQUEST
170
171/*--------------------------- Forward Declarations ---------------------------*/
172struct xbb_softc;
173struct xbb_xen_req;
174
175static void xbb_attach_failed(struct xbb_softc *xbb, int err, const char *fmt,
176 ...) __attribute__((format(printf, 3, 4)));
177static int xbb_shutdown(struct xbb_softc *xbb);
178static int xbb_detach(device_t dev);
179
180/*------------------------------ Data Structures -----------------------------*/
181
182STAILQ_HEAD(xbb_xen_req_list, xbb_xen_req);
183
184typedef enum {
185 XBB_REQLIST_NONE = 0x00,
186 XBB_REQLIST_MAPPED = 0x01
187} xbb_reqlist_flags;
188
189struct xbb_xen_reqlist {
190 /**
191 * Back reference to the parent block back instance for this
192 * request. Used during bio_done handling.
193 */
194 struct xbb_softc *xbb;
195
196 /**
197 * BLKIF_OP code for this request.
198 */
199 int operation;
200
201 /**
202 * Set to BLKIF_RSP_* to indicate request status.
203 *
204 * This field allows an error status to be recorded even if the
205 * delivery of this status must be deferred. Deferred reporting
206 * is necessary, for example, when an error is detected during
207 * completion processing of one bio when other bios for this
208 * request are still outstanding.
209 */
210 int status;
211
212 /**
213 * Number of 512 byte sectors not transferred.
214 */
215 int residual_512b_sectors;
216
217 /**
218 * Starting sector number of the first request in the list.
219 */
220 off_t starting_sector_number;
221
222 /**
223 * If we're going to coalesce, the next contiguous sector would be
224 * this one.
225 */
226 off_t next_contig_sector;
227
228 /**
229 * Number of child requests in the list.
230 */
231 int num_children;
232
233 /**
234 * Number of I/O requests dispatched to the backend.
235 */
236 int pendcnt;
237
238 /**
239 * Total number of segments for requests in the list.
240 */
241 int nr_segments;
242
243 /**
244 * Flags for this particular request list.
245 */
246 xbb_reqlist_flags flags;
247
248 /**
249 * Kernel virtual address space reserved for this request
250 * list structure and used to map the remote domain's pages for
251	 * this I/O into our domain's address space.
252 */
253 uint8_t *kva;
254
255 /**
256	 * Base pseudo-physical address corresponding to the start
257 * of this request's kva region.
258 */
259 uint64_t gnt_base;
260
261
262#ifdef XBB_USE_BOUNCE_BUFFERS
263 /**
264 * Pre-allocated domain local memory used to proxy remote
265 * domain memory during I/O operations.
266 */
267 uint8_t *bounce;
268#endif
269
270 /**
271 * Array of grant handles (one per page) used to map this request.
272 */
273 grant_handle_t *gnt_handles;
274
275 /**
276 * Device statistics request ordering type (ordered or simple).
277 */
278 devstat_tag_type ds_tag_type;
279
280 /**
281 * Device statistics request type (read, write, no_data).
282 */
283 devstat_trans_flags ds_trans_type;
284
285 /**
286 * The start time for this request.
287 */
288 struct bintime ds_t0;
289
290 /**
291 * Linked list of contiguous requests with the same operation type.
292 */
293 struct xbb_xen_req_list contig_req_list;
294
295 /**
296 * Linked list links used to aggregate idle requests in the
297 * request list free pool (xbb->reqlist_free_stailq) and pending
298 * requests waiting for execution (xbb->reqlist_pending_stailq).
299 */
300 STAILQ_ENTRY(xbb_xen_reqlist) links;
301};
302
303STAILQ_HEAD(xbb_xen_reqlist_list, xbb_xen_reqlist);
304
305/**
306 * \brief Object tracking an in-flight I/O from a Xen VBD consumer.
307 */
308struct xbb_xen_req {
309 /**
310 * Linked list links used to aggregate requests into a reqlist
311 * and to store them in the request free pool.
312 */
313 STAILQ_ENTRY(xbb_xen_req) links;
314
315 /**
316 * The remote domain's identifier for this I/O request.
317 */
318 uint64_t id;
319
320 /**
321 * The number of pages currently mapped for this request.
322 */
323 int nr_pages;
324
325 /**
326	 * The number of 512 byte sectors comprising this request.
327 */
328 int nr_512b_sectors;
329
330 /**
331 * The number of struct bio requests still outstanding for this
332 * request on the backend device. This field is only used for
333 * device (rather than file) backed I/O.
334 */
335 int pendcnt;
336
337 /**
338 * BLKIF_OP code for this request.
339 */
340 int operation;
341
342 /**
343 * Storage used for non-native ring requests.
344 */
345 blkif_request_t ring_req_storage;
346
347 /**
348 * Pointer to the Xen request in the ring.
349 */
350 blkif_request_t *ring_req;
351
352 /**
353 * Consumer index for this request.
354 */
355 RING_IDX req_ring_idx;
356
357 /**
358 * The start time for this request.
359 */
360 struct bintime ds_t0;
361
362 /**
363 * Pointer back to our parent request list.
364 */
365 struct xbb_xen_reqlist *reqlist;
366};
367SLIST_HEAD(xbb_xen_req_slist, xbb_xen_req);
368
369/**
370 * \brief Configuration data for the shared memory request ring
371 *        used to communicate with the front-end client of this
372 *        driver.
373 */
374struct xbb_ring_config {
375 /** KVA address where ring memory is mapped. */
376 vm_offset_t va;
377
378 /** The pseudo-physical address where ring memory is mapped.*/
379 uint64_t gnt_addr;
380
381 /**
382 * Grant table handles, one per-ring page, returned by the
383	 * hypervisor upon mapping of the ring and required to
384 * unmap it when a connection is torn down.
385 */
386 grant_handle_t handle[XBB_MAX_RING_PAGES];
387
388 /**
389 * The device bus address returned by the hypervisor when
390 * mapping the ring and required to unmap it when a connection
391 * is torn down.
392 */
393 uint64_t bus_addr[XBB_MAX_RING_PAGES];
394
395 /** The number of ring pages mapped for the current connection. */
396 u_int ring_pages;
397
398 /**
399 * The grant references, one per-ring page, supplied by the
400 * front-end, allowing us to reference the ring pages in the
401 * front-end's domain and to map these pages into our own domain.
402 */
403 grant_ref_t ring_ref[XBB_MAX_RING_PAGES];
404
405	/** The interrupt driven event channel used to signal ring events. */
406 evtchn_port_t evtchn;
407};
408
409/**
410 * Per-instance connection state flags.
411 */
412typedef enum
413{
414 /**
415 * The front-end requested a read-only mount of the
416 * back-end device/file.
417 */
418 XBBF_READ_ONLY = 0x01,
419
420 /** Communication with the front-end has been established. */
421 XBBF_RING_CONNECTED = 0x02,
422
423 /**
424 * Front-end requests exist in the ring and are waiting for
425 * xbb_xen_req objects to free up.
426 */
427 XBBF_RESOURCE_SHORTAGE = 0x04,
428
429 /** Connection teardown in progress. */
430 XBBF_SHUTDOWN = 0x08,
431
432 /** A thread is already performing shutdown processing. */
433 XBBF_IN_SHUTDOWN = 0x10
434} xbb_flag_t;
435
436/** Backend device type. */
437typedef enum {
438 /** Backend type unknown. */
439 XBB_TYPE_NONE = 0x00,
440
441 /**
442 * Backend type disk (access via cdev switch
443 * strategy routine).
444 */
445 XBB_TYPE_DISK = 0x01,
446
447	/** Backend type file (access via vnode operations). */
448 XBB_TYPE_FILE = 0x02
449} xbb_type;
450
451/**
452 * \brief Structure used to memoize information about a per-request
453 * scatter-gather list.
454 *
455 * The chief benefit of using this data structure is that it avoids having
456 * to reparse the possibly discontiguous S/G list in the original
457 * request. Due to the way that the mapping of the memory backing an
458 * I/O transaction is handled by Xen, a second pass is unavoidable.
459 * At least this way the second walk is a simple array traversal.
460 *
461 * \note A single Scatter/Gather element in the block interface covers
462 * at most 1 machine page. In this context a sector (blkif
463 * nomenclature, not what I'd choose) is a 512b aligned unit
464 * of mapping within the machine page referenced by an S/G
465 * element.
466 */
467struct xbb_sg {
468 /** The number of 512b data chunks mapped in this S/G element. */
469 int16_t nsect;
470
471 /**
472 * The index (0 based) of the first 512b data chunk mapped
473 * in this S/G element.
474 */
475 uint8_t first_sect;
476
477 /**
478 * The index (0 based) of the last 512b data chunk mapped
479 * in this S/G element.
480 */
481 uint8_t last_sect;
482};
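/*
 * Example: an S/G element that maps the 512b chunks 2 through 7 of its
 * page would be recorded as first_sect = 2, last_sect = 7, and
 * nsect = 6.
 */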
483
484/**
485 * Character device backend specific configuration data.
486 */
487struct xbb_dev_data {
488 /** Cdev used for device backend access. */
489 struct cdev *cdev;
490
491 /** Cdev switch used for device backend access. */
492 struct cdevsw *csw;
493
494 /** Used to hold a reference on opened cdev backend devices. */
495 int dev_ref;
496};
497
498/**
499 * File backend specific configuration data.
500 */
501struct xbb_file_data {
502 /** Credentials to use for vnode backed (file based) I/O. */
503 struct ucred *cred;
504
505 /**
506 * \brief Array of io vectors used to process file based I/O.
507 *
508 * Only a single file based request is outstanding per-xbb instance,
509 * so we only need one of these.
510 */
511 struct iovec xiovecs[XBB_MAX_SEGMENTS_PER_REQLIST];
512#ifdef XBB_USE_BOUNCE_BUFFERS
513
514 /**
515 * \brief Array of io vectors used to handle bouncing of file reads.
516 *
517 * Vnode operations are free to modify uio data during their
518	 * execution.  In the case of a read with bounce buffering active,
519 * we need some of the data from the original uio in order to
520 * bounce-out the read data. This array serves as the temporary
521 * storage for this saved data.
522 */
523 struct iovec saved_xiovecs[XBB_MAX_SEGMENTS_PER_REQLIST];
524
525 /**
526 * \brief Array of memoized bounce buffer kva offsets used
527 * in the file based backend.
528 *
529 * Due to the way that the mapping of the memory backing an
530 * I/O transaction is handled by Xen, a second pass through
531 * the request sg elements is unavoidable. We memoize the computed
532 * bounce address here to reduce the cost of the second walk.
533 */
534 void *xiovecs_vaddr[XBB_MAX_SEGMENTS_PER_REQLIST];
535#endif /* XBB_USE_BOUNCE_BUFFERS */
536};
537
538/**
539 * Collection of backend type specific data.
540 */
541union xbb_backend_data {
542 struct xbb_dev_data dev;
543 struct xbb_file_data file;
544};
545
546/**
547 * Function signature of backend specific I/O handlers.
548 */
549typedef int (*xbb_dispatch_t)(struct xbb_softc *xbb,
550 struct xbb_xen_reqlist *reqlist, int operation,
551 int flags);
552
553/**
554 * Per-instance configuration data.
555 */
556struct xbb_softc {
557
558 /**
559 * Task-queue used to process I/O requests.
560 */
561 struct taskqueue *io_taskqueue;
562
563 /**
564 * Single "run the request queue" task enqueued
565 * on io_taskqueue.
566 */
567 struct task io_task;
568
569 /** Device type for this instance. */
570 xbb_type device_type;
571
572 /** NewBus device corresponding to this instance. */
573 device_t dev;
574
575 /** Backend specific dispatch routine for this instance. */
576 xbb_dispatch_t dispatch_io;
577
578 /** The number of requests outstanding on the backend device/file. */
579 int active_request_count;
580
581 /** Free pool of request tracking structures. */
582 struct xbb_xen_req_list request_free_stailq;
583
584 /** Array, sized at connection time, of request tracking structures. */
585 struct xbb_xen_req *requests;
586
587 /** Free pool of request list structures. */
588 struct xbb_xen_reqlist_list reqlist_free_stailq;
589
590 /** List of pending request lists awaiting execution. */
591 struct xbb_xen_reqlist_list reqlist_pending_stailq;
592
593 /** Array, sized at connection time, of request list structures. */
594 struct xbb_xen_reqlist *request_lists;
595
596 /**
597 * Global pool of kva used for mapping remote domain ring
598 * and I/O transaction data.
599 */
600 vm_offset_t kva;
601
602	/** Pseudo-physical address corresponding to kva. */
603 uint64_t gnt_base_addr;
604
605 /** The size of the global kva pool. */
606 int kva_size;
607
608 /** The size of the KVA area used for request lists. */
609 int reqlist_kva_size;
610
611 /** The number of pages of KVA used for request lists */
612 int reqlist_kva_pages;
613
614 /** Bitmap of free KVA pages */
615 bitstr_t *kva_free;
616
617 /**
618 * \brief Cached value of the front-end's domain id.
619 *
620	 * This value is used once for each mapped page in
621	 * a transaction.  We cache it to avoid incurring the
622 * cost of an ivar access every time this is needed.
623 */
624 domid_t otherend_id;
625
626 /**
627 * \brief The blkif protocol abi in effect.
628 *
629 * There are situations where the back and front ends can
630	 * have different native abis (e.g. intel x86_64 and
631 * 32bit x86 domains on the same machine). The back-end
632	 * always accommodates the front-end's native abi.  That
633 * value is pulled from the XenStore and recorded here.
634 */
635 int abi;
636
637 /**
638 * \brief The maximum number of requests and request lists allowed
639 * to be in flight at a time.
640 *
641 * This value is negotiated via the XenStore.
642 */
643 u_int max_requests;
644
645 /**
646 * \brief The maximum number of segments (1 page per segment)
647 * that can be mapped by a request.
648 *
649 * This value is negotiated via the XenStore.
650 */
651 u_int max_request_segments;
652
653 /**
654 * \brief Maximum number of segments per request list.
655 *
656 * This value is derived from and will generally be larger than
657 * max_request_segments.
658 */
659 u_int max_reqlist_segments;
660
661 /**
662 * The maximum size of any request to this back-end
663 * device.
664 *
665 * This value is negotiated via the XenStore.
666 */
667 u_int max_request_size;
668
669 /**
670 * The maximum size of any request list. This is derived directly
671 * from max_reqlist_segments.
672 */
673 u_int max_reqlist_size;
674
675 /** Various configuration and state bit flags. */
676 xbb_flag_t flags;
677
678 /** Ring mapping and interrupt configuration data. */
679 struct xbb_ring_config ring_config;
680
681 /** Runtime, cross-abi safe, structures for ring access. */
682 blkif_back_rings_t rings;
683
684 /** IRQ mapping for the communication ring event channel. */
685 int irq;
686
687 /**
688 * \brief Backend access mode flags (e.g. write, or read-only).
689 *
690 * This value is passed to us by the front-end via the XenStore.
691 */
692 char *dev_mode;
693
694 /**
695 * \brief Backend device type (e.g. "disk", "cdrom", "floppy").
696 *
697 * This value is passed to us by the front-end via the XenStore.
698 * Currently unused.
699 */
700 char *dev_type;
701
702 /**
703 * \brief Backend device/file identifier.
704 *
705 * This value is passed to us by the front-end via the XenStore.
706 * We expect this to be a POSIX path indicating the file or
707 * device to open.
708 */
709 char *dev_name;
710
711 /**
712 * Vnode corresponding to the backend device node or file
713	 * we are accessing.
714 */
715 struct vnode *vn;
716
717 union xbb_backend_data backend;
718
719 /** The native sector size of the backend. */
720 u_int sector_size;
721
722 /** log2 of sector_size. */
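	/*
	 * For example, a backend with 512 byte sectors has a shift of 9,
	 * and one with 4KB sectors has a shift of 12.
	 */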
723 u_int sector_size_shift;
724
725 /** Size in bytes of the backend device or file. */
726 off_t media_size;
727
728 /**
729 * \brief media_size expressed in terms of the backend native
730 * sector size.
731 *
732 * (e.g. xbb->media_size >> xbb->sector_size_shift).
733 */
734 uint64_t media_num_sectors;
735
736 /**
737 * \brief Array of memoized scatter gather data computed during the
738 * conversion of blkif ring requests to internal xbb_xen_req
739 * structures.
740 *
741 * Ring processing is serialized so we only need one of these.
742 */
743 struct xbb_sg xbb_sgs[XBB_MAX_SEGMENTS_PER_REQLIST];
744
745 /**
746 * Temporary grant table map used in xbb_dispatch_io(). When
747 * XBB_MAX_SEGMENTS_PER_REQLIST gets large, keeping this on the
748 * stack could cause a stack overflow.
749 */
750 struct gnttab_map_grant_ref maps[XBB_MAX_SEGMENTS_PER_REQLIST];
751
752 /** Mutex protecting per-instance data. */
753 struct mtx lock;
754
755#ifdef XENHVM
756 /**
757 * Resource representing allocated physical address space
758 * associated with our per-instance kva region.
759 */
760 struct resource *pseudo_phys_res;
761
762 /** Resource id for allocated physical address space. */
763 int pseudo_phys_res_id;
764#endif
765
766 /**
767 * I/O statistics from BlockBack dispatch down. These are
768 * coalesced requests, and we start them right before execution.
769 */
770 struct devstat *xbb_stats;
771
772 /**
773 * I/O statistics coming into BlockBack. These are the requests as
774 * we get them from BlockFront. They are started as soon as we
775 * receive a request, and completed when the I/O is complete.
776 */
777 struct devstat *xbb_stats_in;
778
779 /** Disable sending flush to the backend */
780 int disable_flush;
781
782 /** Send a real flush for every N flush requests */
783 int flush_interval;
784
785 /** Count of flush requests in the interval */
786 int flush_count;
787
788 /** Don't coalesce requests if this is set */
789 int no_coalesce_reqs;
790
791 /** Number of requests we have received */
792 uint64_t reqs_received;
793
794	/** Number of requests we have completed */
795 uint64_t reqs_completed;
796
797	/** How many forced dispatches (i.e. without coalescing) have happened */
798 uint64_t forced_dispatch;
799
800	/** How many normal dispatches have happened */
801 uint64_t normal_dispatch;
802
803	/** How many total dispatches have happened */
804 uint64_t total_dispatch;
805
806 /** How many times we have run out of KVA */
807 uint64_t kva_shortages;
808
809 /** How many times we have run out of request structures */
810 uint64_t request_shortages;
811};
812
813/*---------------------------- Request Processing ----------------------------*/
814/**
815 * Allocate an internal transaction tracking structure from the free pool.
816 *
817 * \param xbb Per-instance xbb configuration structure.
818 *
819 * \return On success, a pointer to the allocated xbb_xen_req structure.
820 * Otherwise NULL.
821 */
822static inline struct xbb_xen_req *
823xbb_get_req(struct xbb_softc *xbb)
824{
825 struct xbb_xen_req *req;
826
827 req = NULL;
828
829 mtx_assert(&xbb->lock, MA_OWNED);
830
831 if ((req = STAILQ_FIRST(&xbb->request_free_stailq)) != NULL) {
832 STAILQ_REMOVE_HEAD(&xbb->request_free_stailq, links);
833 xbb->active_request_count++;
834 }
835
836 return (req);
837}
838
839/**
840 * Return an allocated transaction tracking structure to the free pool.
841 *
842 * \param xbb Per-instance xbb configuration structure.
843 * \param req The request structure to free.
844 */
845static inline void
846xbb_release_req(struct xbb_softc *xbb, struct xbb_xen_req *req)
847{
848 mtx_assert(&xbb->lock, MA_OWNED);
849
850 STAILQ_INSERT_HEAD(&xbb->request_free_stailq, req, links);
851 xbb->active_request_count--;
852
853 KASSERT(xbb->active_request_count >= 0,
854 ("xbb_release_req: negative active count"));
855}
856
857/**
858 * Return an xbb_xen_req_list of allocated xbb_xen_reqs to the free pool.
859 *
860 * \param xbb Per-instance xbb configuration structure.
861 * \param req_list The list of requests to free.
862 * \param nreqs The number of items in the list.
863 */
864static inline void
865xbb_release_reqs(struct xbb_softc *xbb, struct xbb_xen_req_list *req_list,
866 int nreqs)
867{
868 mtx_assert(&xbb->lock, MA_OWNED);
869
870 STAILQ_CONCAT(&xbb->request_free_stailq, req_list);
871 xbb->active_request_count -= nreqs;
872
873 KASSERT(xbb->active_request_count >= 0,
874 ("xbb_release_reqs: negative active count"));
875}
876
877/**
878 * Given a page index and 512b sector offset within that page,
879 * calculate an offset into a request's kva region.
880 *
881 * \param reqlist The request structure whose kva region will be accessed.
882 * \param pagenr The page index used to compute the kva offset.
883 * \param sector The 512b sector index used to compute the page relative
884 * kva offset.
885 *
886 * \return The computed global KVA offset.
887 */
888static inline uint8_t *
889xbb_reqlist_vaddr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector)
890{
891 return (reqlist->kva + (PAGE_SIZE * pagenr) + (sector << 9));
892}
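/*
 * For example, with 4KB pages, xbb_reqlist_vaddr(reqlist, 1, 3) refers
 * to the address reqlist->kva + 4096 + (3 * 512).
 */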
893
894#ifdef XBB_USE_BOUNCE_BUFFERS
895/**
896 * Given a page index and 512b sector offset within that page,
897 * calculate an offset into a request's local bounce memory region.
898 *
899 * \param reqlist The request structure whose bounce region will be accessed.
900 * \param pagenr The page index used to compute the bounce offset.
901 * \param sector The 512b sector index used to compute the page relative
902 * bounce offset.
903 *
904 * \return The computed global bounce buffer address.
905 */
906static inline uint8_t *
907xbb_reqlist_bounce_addr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector)
908{
909 return (reqlist->bounce + (PAGE_SIZE * pagenr) + (sector << 9));
910}
911#endif
912
913/**
914 * Given a page number and 512b sector offset within that page,
915 * calculate an offset into the request's memory region that the
916 * underlying backend device/file should use for I/O.
917 *
918 * \param reqlist The request structure whose I/O region will be accessed.
919 * \param pagenr The page index used to compute the I/O offset.
920 * \param sector The 512b sector index used to compute the page relative
921 * I/O offset.
922 *
923 * \return The computed global I/O address.
924 *
925 * Depending on configuration, this will either be a local bounce buffer
926 * or a pointer to the memory mapped in from the front-end domain for
927 * this request.
928 */
929static inline uint8_t *
930xbb_reqlist_ioaddr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector)
931{
932#ifdef XBB_USE_BOUNCE_BUFFERS
933 return (xbb_reqlist_bounce_addr(reqlist, pagenr, sector));
934#else
935 return (xbb_reqlist_vaddr(reqlist, pagenr, sector));
936#endif
937}
938
939/**
940 * Given a page index and 512b sector offset within that page, calculate
941 * an offset into the local pseudo-physical address space used to map a
942 * front-end's request data into a request.
943 *
944 * \param reqlist The request list structure whose pseudo-physical region
945 * will be accessed.
946 * \param pagenr The page index used to compute the pseudo-physical offset.
947 * \param sector The 512b sector index used to compute the page relative
948 * pseudo-physical offset.
949 *
950 * \return  The computed global pseudo-physical address.
951 *
952 * Unlike xbb_reqlist_ioaddr(), this address always refers to the
953 * request list's pseudo-physical (grant) region; it is unaffected
954 * by the bounce buffer configuration.
955 */
956static inline uintptr_t
957xbb_get_gntaddr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector)
958{
959 struct xbb_softc *xbb;
960
961 xbb = reqlist->xbb;
962
963 return ((uintptr_t)(xbb->gnt_base_addr +
964 (uintptr_t)(reqlist->kva - xbb->kva) +
965 (PAGE_SIZE * pagenr) + (sector << 9)));
966}
967
968/**
969 * Get Kernel Virtual Address space for mapping requests.
970 *
971 * \param xbb Per-instance xbb configuration structure.
972 * \param nr_pages Number of pages needed.
973 * \param check_only If set, check for free KVA but don't allocate it.
974 * \param have_lock If set, xbb lock is already held.
975 *
976 * \return On success, a pointer to the allocated KVA region. Otherwise NULL.
977 *
978 * Note: This should be unnecessary once we have either chaining or
979 * scatter/gather support for struct bio. At that point we'll be able to
980 * put multiple addresses and lengths in one bio/bio chain and won't need
981 * to map everything into one virtual segment.
982 */
983static uint8_t *
984xbb_get_kva(struct xbb_softc *xbb, int nr_pages)
985{
986 intptr_t first_clear;
987 intptr_t num_clear;
988 uint8_t *free_kva;
989 int i;
990
991 KASSERT(nr_pages != 0, ("xbb_get_kva of zero length"));
992
993 first_clear = 0;
994 free_kva = NULL;
995
996 mtx_lock(&xbb->lock);
997
998 /*
999 * Look for the first available page. If there are none, we're done.
1000 */
1001 bit_ffc(xbb->kva_free, xbb->reqlist_kva_pages, &first_clear);
1002
1003 if (first_clear == -1)
1004 goto bailout;
1005
1006 /*
1007 * Starting at the first available page, look for consecutive free
1008 * pages that will satisfy the user's request.
1009 */
1010 for (i = first_clear, num_clear = 0; i < xbb->reqlist_kva_pages; i++) {
1011 /*
1012 * If this is true, the page is used, so we have to reset
1013 * the number of clear pages and the first clear page
1014 * (since it pointed to a region with an insufficient number
1015 * of clear pages).
1016 */
1017 if (bit_test(xbb->kva_free, i)) {
1018 num_clear = 0;
1019 first_clear = -1;
1020 continue;
1021 }
1022
1023 if (first_clear == -1)
1024 first_clear = i;
1025
1026 /*
1027 * If this is true, we've found a large enough free region
1028 * to satisfy the request.
1029 */
1030 if (++num_clear == nr_pages) {
1031
1032 bit_nset(xbb->kva_free, first_clear,
1033 first_clear + nr_pages - 1);
1034
1035 free_kva = xbb->kva +
1036 (uint8_t *)(first_clear * PAGE_SIZE);
1037
1038 KASSERT(free_kva >= (uint8_t *)xbb->kva &&
1039 free_kva + (nr_pages * PAGE_SIZE) <=
1040 (uint8_t *)xbb->ring_config.va,
1041 ("Free KVA %p len %d out of range, "
1042 "kva = %#jx, ring VA = %#jx\n", free_kva,
1043 nr_pages * PAGE_SIZE, (uintmax_t)xbb->kva,
1044 (uintmax_t)xbb->ring_config.va));
1045 break;
1046 }
1047 }
1048
1049bailout:
1050
1051 if (free_kva == NULL) {
1052 xbb->flags |= XBBF_RESOURCE_SHORTAGE;
1053 xbb->kva_shortages++;
1054 }
1055
1056 mtx_unlock(&xbb->lock);
1057
1058 return (free_kva);
1059}
1060
1061/**
1062 * Free allocated KVA.
1063 *
1064 * \param xbb Per-instance xbb configuration structure.
1065 * \param kva_ptr Pointer to allocated KVA region.
1066 * \param nr_pages Number of pages in the KVA region.
1067 */
1068static void
1069xbb_free_kva(struct xbb_softc *xbb, uint8_t *kva_ptr, int nr_pages)
1070{
1071 intptr_t start_page;
1072
1073 mtx_assert(&xbb->lock, MA_OWNED);
1074
1075 start_page = (intptr_t)(kva_ptr - xbb->kva) >> PAGE_SHIFT;
1076 bit_nclear(xbb->kva_free, start_page, start_page + nr_pages - 1);
1077
1078}
1079
1080/**
1081 * Unmap the front-end pages associated with this I/O request.
1082 *
1083 * \param reqlist  The request list structure to unmap.
1084 */
1085static void
1086xbb_unmap_reqlist(struct xbb_xen_reqlist *reqlist)
1087{
1088 struct gnttab_unmap_grant_ref unmap[XBB_MAX_SEGMENTS_PER_REQLIST];
1089 u_int i;
1090 u_int invcount;
1091 int error;
1092
1093 invcount = 0;
1094 for (i = 0; i < reqlist->nr_segments; i++) {
1095
1096 if (reqlist->gnt_handles[i] == GRANT_REF_INVALID)
1097 continue;
1098
1099 unmap[invcount].host_addr = xbb_get_gntaddr(reqlist, i, 0);
1100 unmap[invcount].dev_bus_addr = 0;
1101 unmap[invcount].handle = reqlist->gnt_handles[i];
1102 reqlist->gnt_handles[i] = GRANT_REF_INVALID;
1103 invcount++;
1104 }
1105
1106 error = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
1107 unmap, invcount);
1108 KASSERT(error == 0, ("Grant table operation failed"));
1109}
1110
1111/**
1112 * Allocate an internal transaction tracking structure from the free pool.
1113 *
1114 * \param xbb Per-instance xbb configuration structure.
1115 *
1116 * \return On success, a pointer to the allocated xbb_xen_reqlist structure.
1117 * Otherwise NULL.
1118 */
1119static inline struct xbb_xen_reqlist *
1120xbb_get_reqlist(struct xbb_softc *xbb)
1121{
1122 struct xbb_xen_reqlist *reqlist;
1123
1124 reqlist = NULL;
1125
1126 mtx_assert(&xbb->lock, MA_OWNED);
1127
1128 if ((reqlist = STAILQ_FIRST(&xbb->reqlist_free_stailq)) != NULL) {
1129
1130 STAILQ_REMOVE_HEAD(&xbb->reqlist_free_stailq, links);
1131 reqlist->flags = XBB_REQLIST_NONE;
1132 reqlist->kva = NULL;
1133 reqlist->status = BLKIF_RSP_OKAY;
1134 reqlist->residual_512b_sectors = 0;
1135 reqlist->num_children = 0;
1136 reqlist->nr_segments = 0;
1137 STAILQ_INIT(&reqlist->contig_req_list);
1138 }
1139
1140 return (reqlist);
1141}
1142
1143/**
1144 * Return an allocated transaction tracking structure to the free pool.
1145 *
1146 * \param xbb Per-instance xbb configuration structure.
1147 * \param reqlist  The request list structure to free.
1148 * \param wakeup If set, wakeup the work thread if freeing this reqlist
1149 * during a resource shortage condition.
1150 */
1151static inline void
1152xbb_release_reqlist(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist,
1153 int wakeup)
1154{
1155
1156 mtx_lock(&xbb->lock);
1157
1158 if (wakeup) {
1159 wakeup = xbb->flags & XBBF_RESOURCE_SHORTAGE;
1160 xbb->flags &= ~XBBF_RESOURCE_SHORTAGE;
1161 }
1162
1163 if (reqlist->kva != NULL)
1164 xbb_free_kva(xbb, reqlist->kva, reqlist->nr_segments);
1165
1166 xbb_release_reqs(xbb, &reqlist->contig_req_list, reqlist->num_children);
1167
1168 STAILQ_INSERT_TAIL(&xbb->reqlist_free_stailq, reqlist, links);
1169
1170 if ((xbb->flags & XBBF_SHUTDOWN) != 0) {
1171 /*
1172 * Shutdown is in progress. See if we can
1173 * progress further now that one more request
1174 * has completed and been returned to the
1175 * free pool.
1176 */
1177 xbb_shutdown(xbb);
1178 }
1179
1180 mtx_unlock(&xbb->lock);
1181
1182 if (wakeup != 0)
1183 taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task);
1184}
1185
1186/**
1187 * Request resources and do basic request setup.
1188 *
1189 * \param xbb Per-instance xbb configuration structure.
1190 * \param reqlist Pointer to reqlist pointer.
1191 * \param ring_req Pointer to a block ring request.
1192 * \param ring_index The ring index of this request.
1193 *
1194 * \return 0 for success, non-zero for failure.
1195 */
1196static int
1197xbb_get_resources(struct xbb_softc *xbb, struct xbb_xen_reqlist **reqlist,
1198 blkif_request_t *ring_req, RING_IDX ring_idx)
1199{
1200 struct xbb_xen_reqlist *nreqlist;
1201 struct xbb_xen_req *nreq;
1202
1203 nreqlist = NULL;
1204 nreq = NULL;
1205
1206 mtx_lock(&xbb->lock);
1207
1208 /*
1209 * We don't allow new resources to be allocated if we're in the
1210 * process of shutting down.
1211 */
1212 if ((xbb->flags & XBBF_SHUTDOWN) != 0) {
1213 mtx_unlock(&xbb->lock);
1214 return (1);
1215 }
1216
1217 /*
1218 * Allocate a reqlist if the caller doesn't have one already.
1219 */
1220 if (*reqlist == NULL) {
1221 nreqlist = xbb_get_reqlist(xbb);
1222 if (nreqlist == NULL)
1223 goto bailout_error;
1224 }
1225
1226 /* We always allocate a request. */
1227 nreq = xbb_get_req(xbb);
1228 if (nreq == NULL)
1229 goto bailout_error;
1230
1231 mtx_unlock(&xbb->lock);
1232
1233 if (*reqlist == NULL) {
1234 *reqlist = nreqlist;
1235 nreqlist->operation = ring_req->operation;
1236 nreqlist->starting_sector_number = ring_req->sector_number;
1237 STAILQ_INSERT_TAIL(&xbb->reqlist_pending_stailq, nreqlist,
1238 links);
1239 }
1240
1241 nreq->reqlist = *reqlist;
1242 nreq->req_ring_idx = ring_idx;
1243
1244 if (xbb->abi != BLKIF_PROTOCOL_NATIVE) {
1245 bcopy(ring_req, &nreq->ring_req_storage, sizeof(*ring_req));
1246 nreq->ring_req = &nreq->ring_req_storage;
1247 } else {
1248 nreq->ring_req = ring_req;
1249 }
1250
1251 binuptime(&nreq->ds_t0);
1252 devstat_start_transaction(xbb->xbb_stats_in, &nreq->ds_t0);
1253 STAILQ_INSERT_TAIL(&(*reqlist)->contig_req_list, nreq, links);
1254 (*reqlist)->num_children++;
1255 (*reqlist)->nr_segments += ring_req->nr_segments;
1256
1257 return (0);
1258
1259bailout_error:
1260
1261 /*
1262 * We're out of resources, so set the shortage flag. The next time
1263 * a request is released, we'll try waking up the work thread to
1264 * see if we can allocate more resources.
1265 */
1266 xbb->flags |= XBBF_RESOURCE_SHORTAGE;
1267 xbb->request_shortages++;
1268
1269 if (nreq != NULL)
1270 xbb_release_req(xbb, nreq);
1271
1272 mtx_unlock(&xbb->lock);
1273
1274 if (nreqlist != NULL)
1275 xbb_release_reqlist(xbb, nreqlist, /*wakeup*/ 0);
1276
1277 return (1);
1278}
1279
1280/**
1281 * Create and transmit a response to a blkif request.
1282 *
1283 * \param xbb Per-instance xbb configuration structure.
1284 * \param req The request structure to which to respond.
1285 * \param status The status code to report. See BLKIF_RSP_*
1286 * in sys/xen/interface/io/blkif.h.
1287 */
1288static void
1289xbb_send_response(struct xbb_softc *xbb, struct xbb_xen_req *req, int status)
1290{
1291 blkif_response_t *resp;
1292 int more_to_do;
1293 int notify;
1294
1295 more_to_do = 0;
1296
1297 /*
1298 * Place on the response ring for the relevant domain.
1299 * For now, only the spacing between entries is different
1300 * in the different ABIs, not the response entry layout.
1301 */
1302 mtx_lock(&xbb->lock);
1303 switch (xbb->abi) {
1304 case BLKIF_PROTOCOL_NATIVE:
1305 resp = RING_GET_RESPONSE(&xbb->rings.native,
1306 xbb->rings.native.rsp_prod_pvt);
1307 break;
1308 case BLKIF_PROTOCOL_X86_32:
1309 resp = (blkif_response_t *)
1310 RING_GET_RESPONSE(&xbb->rings.x86_32,
1311 xbb->rings.x86_32.rsp_prod_pvt);
1312 break;
1313 case BLKIF_PROTOCOL_X86_64:
1314 resp = (blkif_response_t *)
1315 RING_GET_RESPONSE(&xbb->rings.x86_64,
1316 xbb->rings.x86_64.rsp_prod_pvt);
1317 break;
1318 default:
1319 panic("Unexpected blkif protocol ABI.");
1320 }
1321
1322 resp->id = req->id;
1323 resp->operation = req->operation;
1324 resp->status = status;
1325
1326 xbb->rings.common.rsp_prod_pvt += BLKIF_SEGS_TO_BLOCKS(req->nr_pages);
1327 RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xbb->rings.common, notify);
1328
1329 if (xbb->rings.common.rsp_prod_pvt == xbb->rings.common.req_cons) {
1330
1331 /*
1332 * Tail check for pending requests. Allows frontend to avoid
1333 * notifications if requests are already in flight (lower
1334 * overheads and promotes batching).
1335 */
1336 RING_FINAL_CHECK_FOR_REQUESTS(&xbb->rings.common, more_to_do);
1337 } else if (RING_HAS_UNCONSUMED_REQUESTS(&xbb->rings.common)) {
1338
1339 more_to_do = 1;
1340 }
1341
1342 xbb->reqs_completed++;
1343
1344 mtx_unlock(&xbb->lock);
1345
1346 if (more_to_do)
1347 taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task);
1348
1349 if (notify)
1350 notify_remote_via_irq(xbb->irq);
1351}
1352
1353/**
1354 * Complete a request list.
1355 *
1356 * \param xbb Per-instance xbb configuration structure.
1357 * \param reqlist Allocated internal request list structure.
1358 */
1359static void
1360xbb_complete_reqlist(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist)
1361{
1362 struct xbb_xen_req *nreq;
1363 off_t sectors_sent;
1364
1365 sectors_sent = 0;
1366
1367 if (reqlist->flags & XBB_REQLIST_MAPPED)
1368 xbb_unmap_reqlist(reqlist);
1369
1370 /*
1371 * All I/O is done, send the response. A lock should not be
1372 * necessary here because the request list is complete, and
1373 * therefore this is the only context accessing this request
1374 * right now. The functions we call do their own locking if
1375 * necessary.
1376 */
1377 STAILQ_FOREACH(nreq, &reqlist->contig_req_list, links) {
1378 off_t cur_sectors_sent;
1379
1380 xbb_send_response(xbb, nreq, reqlist->status);
1381
1382 /* We don't report bytes sent if there is an error. */
1383 if (reqlist->status == BLKIF_RSP_OKAY)
1384 cur_sectors_sent = nreq->nr_512b_sectors;
1385 else
1386 cur_sectors_sent = 0;
1387
1388 sectors_sent += cur_sectors_sent;
1389
1390 devstat_end_transaction(xbb->xbb_stats_in,
1391 /*bytes*/cur_sectors_sent << 9,
1392 reqlist->ds_tag_type,
1393 reqlist->ds_trans_type,
1394 /*now*/NULL,
1395 /*then*/&nreq->ds_t0);
1396 }
1397
1398 /*
1399 * Take out any sectors not sent. If we wind up negative (which
1400 * might happen if an error is reported as well as a residual), just
1401 * report 0 sectors sent.
1402 */
1403 sectors_sent -= reqlist->residual_512b_sectors;
1404 if (sectors_sent < 0)
1405 sectors_sent = 0;
1406
1407 devstat_end_transaction(xbb->xbb_stats,
1408 /*bytes*/ sectors_sent << 9,
1409 reqlist->ds_tag_type,
1410 reqlist->ds_trans_type,
1411 /*now*/NULL,
1412 /*then*/&reqlist->ds_t0);
1413
1414 xbb_release_reqlist(xbb, reqlist, /*wakeup*/ 1);
1415}
1416
1417/**
1418 * Completion handler for buffer I/O requests issued by the device
1419 * backend driver.
1420 *
1421 * \param bio The buffer I/O request on which to perform completion
1422 * processing.
1423 */
1424static void
1425xbb_bio_done(struct bio *bio)
1426{
1427 struct xbb_softc *xbb;
1428 struct xbb_xen_reqlist *reqlist;
1429
1430 reqlist = bio->bio_caller1;
1431 xbb = reqlist->xbb;
1432
1433 reqlist->residual_512b_sectors += bio->bio_resid >> 9;
1434
1435 /*
1436 * This is a bit imprecise. With aggregated I/O a single
1437 * request list can contain multiple front-end requests and
1438 * a multiple bios may point to a single request. By carefully
1439	 * multiple bios may point to a single request.  By carefully
1440 * back to the original front-end request, but the interface
1441 * isn't sufficiently rich for us to properly report the error.
1442 * So, we just treat the entire request list as having failed if an
1443 * error occurs on any part. And, if an error occurs, we treat
1444 * the amount of data transferred as 0.
1445 *
1446 * For residuals, we report it on the overall aggregated device,
1447 * but not on the individual requests, since we don't currently
1448 * do the work to determine which front-end request to which the
1449 * residual applies.
1450 */
1451 if (bio->bio_error) {
1452 DPRINTF("BIO returned error %d for operation on device %s\n",
1453 bio->bio_error, xbb->dev_name);
1454 reqlist->status = BLKIF_RSP_ERROR;
1455
1456 if (bio->bio_error == ENXIO
1457 && xenbus_get_state(xbb->dev) == XenbusStateConnected) {
1458
1459 /*
1460 * Backend device has disappeared. Signal the
1461 * front-end that we (the device proxy) want to
1462 * go away.
1463 */
1464 xenbus_set_state(xbb->dev, XenbusStateClosing);
1465 }
1466 }
1467
1468#ifdef XBB_USE_BOUNCE_BUFFERS
1469 if (bio->bio_cmd == BIO_READ) {
1470 vm_offset_t kva_offset;
1471
1472 kva_offset = (vm_offset_t)bio->bio_data
1473 - (vm_offset_t)reqlist->bounce;
1474 memcpy((uint8_t *)reqlist->kva + kva_offset,
1475 bio->bio_data, bio->bio_bcount);
1476 }
1477#endif /* XBB_USE_BOUNCE_BUFFERS */
1478
1479 /*
1480 * Decrement the pending count for the request list. When we're
1481 * done with the requests, send status back for all of them.
1482 */
1483 if (atomic_fetchadd_int(&reqlist->pendcnt, -1) == 1)
1484 xbb_complete_reqlist(xbb, reqlist);
1485
1486 g_destroy_bio(bio);
1487}
1488
1489/**
1490 * Parse a blkif request into an internal request structure and send
1491 * it to the backend for processing.
1492 *
1493 * \param xbb Per-instance xbb configuration structure.
1494 * \param reqlist Allocated internal request list structure.
1495 *
1496 * \return On success, 0. For resource shortages, non-zero.
1497 *
1498 * This routine performs the backend common aspects of request parsing
1499 * including compiling an internal request structure, parsing the S/G
1500 * list and any secondary ring requests in which they may reside, and
1501 * the mapping of front-end I/O pages into our domain.
1502 */
1503static int
1504xbb_dispatch_io(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist)
1505{
1506 struct xbb_sg *xbb_sg;
1507 struct gnttab_map_grant_ref *map;
1508 struct blkif_request_segment *sg;
1509 struct blkif_request_segment *last_block_sg;
1510 struct xbb_xen_req *nreq;
1511 u_int nseg;
1512 u_int seg_idx;
1513 u_int block_segs;
1514 int nr_sects;
1515 int total_sects;
1516 int operation;
1517 uint8_t bio_flags;
1518 int error;
1519
1520 reqlist->ds_tag_type = DEVSTAT_TAG_SIMPLE;
1521 bio_flags = 0;
1522 total_sects = 0;
1523 nr_sects = 0;
1524
1525 /*
1526 * First determine whether we have enough free KVA to satisfy this
1527 * request list. If not, tell xbb_run_queue() so it can go to
1528 * sleep until we have more KVA.
1529 */
1530 reqlist->kva = NULL;
1531 if (reqlist->nr_segments != 0) {
1532 reqlist->kva = xbb_get_kva(xbb, reqlist->nr_segments);
1533 if (reqlist->kva == NULL) {
1534 /*
1535 * If we're out of KVA, return ENOMEM.
1536 */
1537 return (ENOMEM);
1538 }
1539 }
1540
1541 binuptime(&reqlist->ds_t0);
1542 devstat_start_transaction(xbb->xbb_stats, &reqlist->ds_t0);
1543
1544 switch (reqlist->operation) {
1545 case BLKIF_OP_WRITE_BARRIER:
1546 bio_flags |= BIO_ORDERED;
1547 reqlist->ds_tag_type = DEVSTAT_TAG_ORDERED;
1548 /* FALLTHROUGH */
1549 case BLKIF_OP_WRITE:
1550 operation = BIO_WRITE;
1551 reqlist->ds_trans_type = DEVSTAT_WRITE;
1552 if ((xbb->flags & XBBF_READ_ONLY) != 0) {
1553 DPRINTF("Attempt to write to read only device %s\n",
1554 xbb->dev_name);
1555 reqlist->status = BLKIF_RSP_ERROR;
1556 goto send_response;
1557 }
1558 break;
1559 case BLKIF_OP_READ:
1560 operation = BIO_READ;
1561 reqlist->ds_trans_type = DEVSTAT_READ;
1562 break;
1563 case BLKIF_OP_FLUSH_DISKCACHE:
1564 /*
1565 * If this is true, the user has requested that we disable
1566 * flush support. So we just complete the requests
1567 * successfully.
1568 */
1569 if (xbb->disable_flush != 0) {
1570 goto send_response;
1571 }
1572
1573 /*
1574 * The user has requested that we only send a real flush
1575 * for every N flush requests. So keep count, and either
1576 * complete the request immediately or queue it for the
1577 * backend.
1578 */
1579 if (xbb->flush_interval != 0) {
1580 if (++(xbb->flush_count) < xbb->flush_interval) {
1581 goto send_response;
1582 } else
1583 xbb->flush_count = 0;
1584 }
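		/*
		 * Worked example (illustrative): with flush_interval set to
		 * 4, the first three BLKIF_OP_FLUSH_DISKCACHE requests are
		 * completed above without touching the backend (flush_count
		 * becomes 1, 2, 3), and the fourth resets flush_count to 0
		 * and falls through to a real BIO_FLUSH dispatch.
		 */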
1585
1586 operation = BIO_FLUSH;
1587 reqlist->ds_tag_type = DEVSTAT_TAG_ORDERED;
1588 reqlist->ds_trans_type = DEVSTAT_NO_DATA;
1589 goto do_dispatch;
1590 /*NOTREACHED*/
1591 default:
1592 DPRINTF("error: unknown block io operation [%d]\n",
1593 reqlist->operation);
1594 reqlist->status = BLKIF_RSP_ERROR;
1595 goto send_response;
1596 }
1597
1598 reqlist->xbb = xbb;
1599 xbb_sg = xbb->xbb_sgs;
1600 map = xbb->maps;
1601 seg_idx = 0;
1602
1603 STAILQ_FOREACH(nreq, &reqlist->contig_req_list, links) {
1604 blkif_request_t *ring_req;
1605 RING_IDX req_ring_idx;
1606 u_int req_seg_idx;
1607
1608 ring_req = nreq->ring_req;
1609 req_ring_idx = nreq->req_ring_idx;
1610 nr_sects = 0;
1611 nseg = ring_req->nr_segments;
1612 nreq->id = ring_req->id;
1613 nreq->nr_pages = nseg;
1614 nreq->nr_512b_sectors = 0;
1615 req_seg_idx = 0;
1616 sg = NULL;
1617
1618 /* Check that number of segments is sane. */
1619 if (unlikely(nseg == 0)
1620 || unlikely(nseg > xbb->max_request_segments)) {
1621 DPRINTF("Bad number of segments in request (%d)\n",
1622 nseg);
1623 reqlist->status = BLKIF_RSP_ERROR;
1624 goto send_response;
1625 }
1626
1627 block_segs = MIN(nreq->nr_pages,
1628 BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK);
1629 sg = ring_req->seg;
1630 last_block_sg = sg + block_segs;
1631 while (1) {
1632
1633 while (sg < last_block_sg) {
1634 KASSERT(seg_idx <
1635 XBB_MAX_SEGMENTS_PER_REQLIST,
1636 ("seg_idx %d is too large, max "
1637 "segs %d\n", seg_idx,
1638 XBB_MAX_SEGMENTS_PER_REQLIST));
1639
1640 xbb_sg->first_sect = sg->first_sect;
1641 xbb_sg->last_sect = sg->last_sect;
1642 xbb_sg->nsect =
1643 (int8_t)(sg->last_sect -
1644 sg->first_sect + 1);
1645
1646 if ((sg->last_sect >= (PAGE_SIZE >> 9))
1647 || (xbb_sg->nsect <= 0)) {
1648 reqlist->status = BLKIF_RSP_ERROR;
1649 goto send_response;
1650 }
1651
1652 nr_sects += xbb_sg->nsect;
1653 map->host_addr = xbb_get_gntaddr(reqlist,
1654 seg_idx, /*sector*/0);
1655 KASSERT(map->host_addr + PAGE_SIZE <=
1656 xbb->ring_config.gnt_addr,
1657 ("Host address %#jx len %d overlaps "
1658 "ring address %#jx\n",
1659 (uintmax_t)map->host_addr, PAGE_SIZE,
1660 (uintmax_t)xbb->ring_config.gnt_addr));
1661
1662 map->flags = GNTMAP_host_map;
1663 map->ref = sg->gref;
1664 map->dom = xbb->otherend_id;
1665 if (operation == BIO_WRITE)
1666 map->flags |= GNTMAP_readonly;
1667 sg++;
1668 map++;
1669 xbb_sg++;
1670 seg_idx++;
1671 req_seg_idx++;
1672 }
1673
1674 block_segs = MIN(nseg - req_seg_idx,
1675 BLKIF_MAX_SEGMENTS_PER_SEGMENT_BLOCK);
1676 if (block_segs == 0)
1677 break;
1678
1679 /*
1680 * Fetch the next request block full of SG elements.
1681 * For now, only the spacing between entries is
1682 * different in the different ABIs, not the sg entry
1683 * layout.
1684 */
1685 req_ring_idx++;
1686 switch (xbb->abi) {
1687 case BLKIF_PROTOCOL_NATIVE:
1688 sg = BLKRING_GET_SEG_BLOCK(&xbb->rings.native,
1689 req_ring_idx);
1690 break;
1691 case BLKIF_PROTOCOL_X86_32:
1692 {
1693 sg = BLKRING_GET_SEG_BLOCK(&xbb->rings.x86_32,
1694 req_ring_idx);
1695 break;
1696 }
1697 case BLKIF_PROTOCOL_X86_64:
1698 {
1699 sg = BLKRING_GET_SEG_BLOCK(&xbb->rings.x86_64,
1700 req_ring_idx);
1701 break;
1702 }
1703 default:
1704 panic("Unexpected blkif protocol ABI.");
1705 /* NOTREACHED */
1706 }
1707 last_block_sg = sg + block_segs;
1708 }
1709
1710 /* Convert to the disk's sector size */
1711 nreq->nr_512b_sectors = nr_sects;
1712 nr_sects = (nr_sects << 9) >> xbb->sector_size_shift;
1713 total_sects += nr_sects;
1714
1715 if ((nreq->nr_512b_sectors &
1716 ((xbb->sector_size >> 9) - 1)) != 0) {
1717 device_printf(xbb->dev, "%s: I/O size (%d) is not "
1718 "a multiple of the backing store sector "
1719 "size (%d)\n", __func__,
1720 nreq->nr_512b_sectors << 9,
1721 xbb->sector_size);
1722 reqlist->status = BLKIF_RSP_ERROR;
1723 goto send_response;
1724 }
1725 }
1726
1727 error = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
1728 xbb->maps, reqlist->nr_segments);
1729 if (error != 0)
1730 panic("Grant table operation failed (%d)", error);
1731
1732 reqlist->flags |= XBB_REQLIST_MAPPED;
1733
1734 for (seg_idx = 0, map = xbb->maps; seg_idx < reqlist->nr_segments;
1735 seg_idx++, map++){
1736
1737 if (unlikely(map->status != 0)) {
1738 DPRINTF("invalid buffer -- could not remap "
1739 "it (%d)\n", map->status);
1740 DPRINTF("Mapping(%d): Host Addr 0x%lx, flags "
1741 "0x%x ref 0x%x, dom %d\n", seg_idx,
1742 map->host_addr, map->flags, map->ref,
1743 map->dom);
1744 reqlist->status = BLKIF_RSP_ERROR;
1745 goto send_response;
1746 }
1747
1748 reqlist->gnt_handles[seg_idx] = map->handle;
1749 }
1750 if (reqlist->starting_sector_number + total_sects >
1751 xbb->media_num_sectors) {
1752
1753 DPRINTF("%s of [%" PRIu64 ",%" PRIu64 "] "
1754 "extends past end of device %s\n",
1755 operation == BIO_READ ? "read" : "write",
1756 reqlist->starting_sector_number,
1757 reqlist->starting_sector_number + total_sects,
1758 xbb->dev_name);
1759 reqlist->status = BLKIF_RSP_ERROR;
1760 goto send_response;
1761 }
1762
1763do_dispatch:
1764
1765 error = xbb->dispatch_io(xbb,
1766 reqlist,
1767 operation,
1768 bio_flags);
1769
1770 if (error != 0) {
1771 reqlist->status = BLKIF_RSP_ERROR;
1772 goto send_response;
1773 }
1774
1775 return (0);
1776
1777send_response:
1778
1779 xbb_complete_reqlist(xbb, reqlist);
1780
1781 return (0);
1782}
1783
1784static __inline int
1785xbb_count_sects(blkif_request_t *ring_req)
1786{
1787 int i;
1788 int cur_size = 0;
1789
1790 for (i = 0; i < ring_req->nr_segments; i++) {
1791 int nsect;
1792
1793 nsect = (int8_t)(ring_req->seg[i].last_sect -
1794 ring_req->seg[i].first_sect + 1);
1795 if (nsect <= 0)
1796 break;
1797
1798 cur_size += nsect;
1799 }
1800
1801 return (cur_size);
1802}
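/*
 * Example (illustrative): a ring request with two segments, each spanning
 * first_sect 0 through last_sect 7, yields nsect = 8 per segment, so
 * xbb_count_sects() returns 16 512-byte sectors.
 */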
1803
1804/**
1805 * Process incoming requests from the shared communication ring in response
1806 * to a signal on the ring's event channel.
1807 *
 1808 * \param context Callback argument registered during task initialization -
1809 * the xbb_softc for this instance.
1810 * \param pending The number of taskqueue_enqueue events that have
1811 * occurred since this handler was last run.
1812 */
1813static void
1814xbb_run_queue(void *context, int pending)
1815{
1816 struct xbb_softc *xbb;
1817 blkif_back_rings_t *rings;
1818 RING_IDX rp;
1819 uint64_t cur_sector;
1820 int cur_operation;
1821 struct xbb_xen_reqlist *reqlist;
1822
1823
1824 xbb = (struct xbb_softc *)context;
1825 rings = &xbb->rings;
1826
1827 /*
1828 * Work gather and dispatch loop. Note that we have a bias here
1829 * towards gathering I/O sent by blockfront. We first gather up
1830 * everything in the ring, as long as we have resources. Then we
1831 * dispatch one request, and then attempt to gather up any
1832 * additional requests that have come in while we were dispatching
1833 * the request.
1834 *
1835 * This allows us to get a clearer picture (via devstat) of how
1836 * many requests blockfront is queueing to us at any given time.
1837 */
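	/*
	 * In outline (illustrative paraphrase of the loop below):
	 *
	 *	for (;;) {
	 *		while (ring has requests and resources are free)
	 *			coalesce the next request into a request list;
	 *		if (no pending request lists)
	 *			break;
	 *		dispatch the request list at the head of the queue;
	 *	}
	 */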
1838 for (;;) {
1839 int retval;
1840
1841 /*
1842 * Initialize reqlist to the last element in the pending
1843 * queue, if there is one. This allows us to add more
1844 * requests to that request list, if we have room.
1845 */
1846 reqlist = STAILQ_LAST(&xbb->reqlist_pending_stailq,
1847 xbb_xen_reqlist, links);
1848 if (reqlist != NULL) {
1849 cur_sector = reqlist->next_contig_sector;
1850 cur_operation = reqlist->operation;
1851 } else {
1852 cur_operation = 0;
1853 cur_sector = 0;
1854 }
1855
1856 /*
1857 * Cache req_prod to avoid accessing a cache line shared
1858 * with the frontend.
1859 */
1860 rp = rings->common.sring->req_prod;
1861
1862 /* Ensure we see queued requests up to 'rp'. */
1863 rmb();
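		/*
		 * (Editor's note, illustrative: this read barrier pairs with
		 * the write barrier the frontend issues before advancing
		 * req_prod, e.g. in RING_PUSH_REQUESTS().  Without it, a
		 * request body could be read before the producer's stores
		 * to that ring slot become visible to us.)
		 */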
1864
1865 /**
1866 * Run so long as there is work to consume and the generation
1867 * of a response will not overflow the ring.
1868 *
1869 * @note There's a 1 to 1 relationship between requests and
1870 * responses, so an overflow should never occur. This
1871 * test is to protect our domain from digesting bogus
1872 * data. Shouldn't we log this?
1873 */
1874 while (rings->common.req_cons != rp
1875 && RING_REQUEST_CONS_OVERFLOW(&rings->common,
1876 rings->common.req_cons) == 0){
1877 blkif_request_t ring_req_storage;
1878 blkif_request_t *ring_req;
1879 int cur_size;
1880
1881 switch (xbb->abi) {
1882 case BLKIF_PROTOCOL_NATIVE:
1883 ring_req = RING_GET_REQUEST(&xbb->rings.native,
1884 rings->common.req_cons);
1885 break;
1886 case BLKIF_PROTOCOL_X86_32:
1887 {
1888 struct blkif_x86_32_request *ring_req32;
1889
1890 ring_req32 = RING_GET_REQUEST(
1891 &xbb->rings.x86_32, rings->common.req_cons);
1892 blkif_get_x86_32_req(&ring_req_storage,
1893 ring_req32);
1894 ring_req = &ring_req_storage;
1895 break;
1896 }
1897 case BLKIF_PROTOCOL_X86_64:
1898 {
1899 struct blkif_x86_64_request *ring_req64;
1900
 1901 ring_req64 = RING_GET_REQUEST(&xbb->rings.x86_64,
1902 rings->common.req_cons);
1903 blkif_get_x86_64_req(&ring_req_storage,
1904 ring_req64);
1905 ring_req = &ring_req_storage;
1906 break;
1907 }
1908 default:
1909 panic("Unexpected blkif protocol ABI.");
1910 /* NOTREACHED */
1911 }
1912
1913 /*
1914 * Check for situations that would require closing
1915 * off this I/O for further coalescing:
1916 * - Coalescing is turned off.
1917 * - Current I/O is out of sequence with the previous
1918 * I/O.
1919 * - Coalesced I/O would be too large.
1920 */
1921 if ((reqlist != NULL)
1922 && ((xbb->no_coalesce_reqs != 0)
1923 || ((xbb->no_coalesce_reqs == 0)
1924 && ((ring_req->sector_number != cur_sector)
1925 || (ring_req->operation != cur_operation)
1926 || ((ring_req->nr_segments + reqlist->nr_segments) >
1927 xbb->max_reqlist_segments))))) {
1928 reqlist = NULL;
1929 }
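			/*
			 * Illustrative example: two back-to-back 4KiB writes
			 * covering sectors 0-7 and 8-15 trip none of the
			 * conditions above (same operation, sector 8 equals
			 * next_contig_sector, combined segments within
			 * max_reqlist_segments), so the second request is
			 * coalesced into the same request list instead of
			 * starting a new one.
			 */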
1930
1931 /*
1932 * Grab and check for all resources in one shot.
1933 * If we can't get all of the resources we need,
1934 * the shortage is noted and the thread will get
1935 * woken up when more resources are available.
1936 */
1937 retval = xbb_get_resources(xbb, &reqlist, ring_req,
1938 xbb->rings.common.req_cons);
1939
1940 if (retval != 0) {
1941 /*
1942 * Resource shortage has been recorded.
1943 * We'll be scheduled to run once a request
1944 * object frees up due to a completion.
1945 */
1946 break;
1947 }
1948
1949 /*
1950 * Signify that we can overwrite this request with
1951 * a response by incrementing our consumer index.
1952 * The response won't be generated until after
1953 * we've already consumed all necessary data out
1954 * of the version of the request in the ring buffer
1955 * (for native mode). We must update the consumer
 1956 * index before issuing back-end I/O so there is
1957 * no possibility that it will complete and a
1958 * response be generated before we make room in
1959 * the queue for that response.
1960 */
1961 xbb->rings.common.req_cons +=
1962 BLKIF_SEGS_TO_BLOCKS(ring_req->nr_segments);
1963 xbb->reqs_received++;
1964
1965 cur_size = xbb_count_sects(ring_req);
1966 cur_sector = ring_req->sector_number + cur_size;
1967 reqlist->next_contig_sector = cur_sector;
1968 cur_operation = ring_req->operation;
1969 }
1970
1971 /* Check for I/O to dispatch */
1972 reqlist = STAILQ_FIRST(&xbb->reqlist_pending_stailq);
1973 if (reqlist == NULL) {
1974 /*
1975 * We're out of work to do, put the task queue to
1976 * sleep.
1977 */
1978 break;
1979 }
1980
1981 /*
1982 * Grab the first request off the queue and attempt
1983 * to dispatch it.
1984 */
1985 STAILQ_REMOVE_HEAD(&xbb->reqlist_pending_stailq, links);
1986
1987 retval = xbb_dispatch_io(xbb, reqlist);
1988 if (retval != 0) {
1989 /*
1990 * xbb_dispatch_io() returns non-zero only when
1991 * there is a resource shortage. If that's the
1992 * case, re-queue this request on the head of the
1993 * queue, and go to sleep until we have more
1994 * resources.
1995 */
1996 STAILQ_INSERT_HEAD(&xbb->reqlist_pending_stailq,
1997 reqlist, links);
1998 break;
1999 } else {
2000 /*
2001 * If we still have anything on the queue after
2002 * removing the head entry, that is because we
2003 * met one of the criteria to create a new
2004 * request list (outlined above), and we'll call
2005 * that a forced dispatch for statistical purposes.
2006 *
2007 * Otherwise, if there is only one element on the
2008 * queue, we coalesced everything available on
2009 * the ring and we'll call that a normal dispatch.
2010 */
2011 reqlist = STAILQ_FIRST(&xbb->reqlist_pending_stailq);
2012
2013 if (reqlist != NULL)
2014 xbb->forced_dispatch++;
2015 else
2016 xbb->normal_dispatch++;
2017
2018 xbb->total_dispatch++;
2019 }
2020 }
2021}
2022
2023/**
2024 * Interrupt handler bound to the shared ring's event channel.
2025 *
 2026 * \param arg Callback argument registered during event channel
2027 * binding - the xbb_softc for this instance.
2028 */
2029static void
2030xbb_intr(void *arg)
2031{
2032 struct xbb_softc *xbb;
2033
2034 /* Defer to kernel thread. */
2035 xbb = (struct xbb_softc *)arg;
2036 taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task);
2037}
2038
2039SDT_PROVIDER_DEFINE(xbb);
2040SDT_PROBE_DEFINE1(xbb, kernel, xbb_dispatch_dev, flush, flush, "int");
2041SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_dev, read, read, "int", "uint64_t",
2042 "uint64_t");
2043SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_dev, write, write, "int",
2044 "uint64_t", "uint64_t");
2045
2046/*----------------------------- Backend Handlers -----------------------------*/
2047/**
2048 * Backend handler for character device access.
2049 *
2050 * \param xbb Per-instance xbb configuration structure.
2051 * \param reqlist Allocated internal request list structure.
2052 * \param operation BIO_* I/O operation code.
2053 * \param bio_flags Additional bio_flag data to pass to any generated
 2054 * bios (e.g. BIO_ORDERED).
2055 *
2056 * \return 0 for success, errno codes for failure.
2057 */
2058static int
2059xbb_dispatch_dev(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist,
2060 int operation, int bio_flags)
2061{
2062 struct xbb_dev_data *dev_data;
2063 struct bio *bios[XBB_MAX_SEGMENTS_PER_REQLIST];
2064 struct xbb_xen_req *nreq;
2065 off_t bio_offset;
2066 struct bio *bio;
2067 struct xbb_sg *xbb_sg;
2068 u_int nbio;
2069 u_int bio_idx;
2070 u_int nseg;
2071 u_int seg_idx;
2072 int error;
2073
2074 dev_data = &xbb->backend.dev;
2075 bio_offset = (off_t)reqlist->starting_sector_number
2076 << xbb->sector_size_shift;
2077 error = 0;
2078 nbio = 0;
2079 bio_idx = 0;
2080
2081 if (operation == BIO_FLUSH) {
2082 nreq = STAILQ_FIRST(&reqlist->contig_req_list);
2083 bio = g_new_bio();
2084 if (unlikely(bio == NULL)) {
2085 DPRINTF("Unable to allocate bio for BIO_FLUSH\n");
2086 error = ENOMEM;
2087 return (error);
2088 }
2089
2090 bio->bio_cmd = BIO_FLUSH;
2091 bio->bio_flags |= BIO_ORDERED;
2092 bio->bio_dev = dev_data->cdev;
2093 bio->bio_offset = 0;
2094 bio->bio_data = 0;
2095 bio->bio_done = xbb_bio_done;
2096 bio->bio_caller1 = nreq;
2097 bio->bio_pblkno = 0;
2098
2099 nreq->pendcnt = 1;
2100
2101 SDT_PROBE1(xbb, kernel, xbb_dispatch_dev, flush,
2102 device_get_unit(xbb->dev));
2103
2104 (*dev_data->csw->d_strategy)(bio);
2105
2106 return (0);
2107 }
2108
2109 xbb_sg = xbb->xbb_sgs;
2110 bio = NULL;
2111 nseg = reqlist->nr_segments;
2112
2113 for (seg_idx = 0; seg_idx < nseg; seg_idx++, xbb_sg++) {
2114
2115 /*
2116 * KVA will not be contiguous, so any additional
2117 * I/O will need to be represented in a new bio.
2118 */
2119 if ((bio != NULL)
2120 && (xbb_sg->first_sect != 0)) {
2121 if ((bio->bio_length & (xbb->sector_size - 1)) != 0) {
2122 printf("%s: Discontiguous I/O request "
2123 "from domain %d ends on "
2124 "non-sector boundary\n",
2125 __func__, xbb->otherend_id);
2126 error = EINVAL;
2127 goto fail_free_bios;
2128 }
2129 bio = NULL;
2130 }
2131
2132 if (bio == NULL) {
2133 /*
2134 * Make sure that the start of this bio is
2135 * aligned to a device sector.
2136 */
2137 if ((bio_offset & (xbb->sector_size - 1)) != 0){
2138 printf("%s: Misaligned I/O request "
2139 "from domain %d\n", __func__,
2140 xbb->otherend_id);
2141 error = EINVAL;
2142 goto fail_free_bios;
2143 }
2144
2145 bio = bios[nbio++] = g_new_bio();
2146 if (unlikely(bio == NULL)) {
2147 error = ENOMEM;
2148 goto fail_free_bios;
2149 }
2150 bio->bio_cmd = operation;
2151 bio->bio_flags |= bio_flags;
2152 bio->bio_dev = dev_data->cdev;
2153 bio->bio_offset = bio_offset;
2154 bio->bio_data = xbb_reqlist_ioaddr(reqlist, seg_idx,
2155 xbb_sg->first_sect);
2156 bio->bio_done = xbb_bio_done;
2157 bio->bio_caller1 = reqlist;
2158 bio->bio_pblkno = bio_offset >> xbb->sector_size_shift;
2159 }
2160
2161 bio->bio_length += xbb_sg->nsect << 9;
2162 bio->bio_bcount = bio->bio_length;
2163 bio_offset += xbb_sg->nsect << 9;
2164
2165 if (xbb_sg->last_sect != (PAGE_SIZE - 512) >> 9) {
2166
2167 if ((bio->bio_length & (xbb->sector_size - 1)) != 0) {
2168 printf("%s: Discontiguous I/O request "
2169 "from domain %d ends on "
2170 "non-sector boundary\n",
2171 __func__, xbb->otherend_id);
2172 error = EINVAL;
2173 goto fail_free_bios;
2174 }
2175 /*
2176 * KVA will not be contiguous, so any additional
2177 * I/O will need to be represented in a new bio.
2178 */
2179 bio = NULL;
2180 }
2181 }
2182
2183 reqlist->pendcnt = nbio;
2184
2185 for (bio_idx = 0; bio_idx < nbio; bio_idx++)
2186 {
2187#ifdef XBB_USE_BOUNCE_BUFFERS
2188 vm_offset_t kva_offset;
2189
2190 kva_offset = (vm_offset_t)bios[bio_idx]->bio_data
2191 - (vm_offset_t)reqlist->bounce;
2192 if (operation == BIO_WRITE) {
2193 memcpy(bios[bio_idx]->bio_data,
2194 (uint8_t *)reqlist->kva + kva_offset,
2195 bios[bio_idx]->bio_bcount);
2196 }
2197#endif
2198 if (operation == BIO_READ) {
2199 SDT_PROBE3(xbb, kernel, xbb_dispatch_dev, read,
2200 device_get_unit(xbb->dev),
2201 bios[bio_idx]->bio_offset,
2202 bios[bio_idx]->bio_length);
2203 } else if (operation == BIO_WRITE) {
2204 SDT_PROBE3(xbb, kernel, xbb_dispatch_dev, write,
2205 device_get_unit(xbb->dev),
2206 bios[bio_idx]->bio_offset,
2207 bios[bio_idx]->bio_length);
2208 }
2209 (*dev_data->csw->d_strategy)(bios[bio_idx]);
2210 }
2211
2212 return (error);
2213
2214fail_free_bios:
2215 for (bio_idx = 0; bio_idx < (nbio-1); bio_idx++)
2216 g_destroy_bio(bios[bio_idx]);
2217
2218 return (error);
2219}
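/*
 * Worked example for the bio construction above (illustrative, assuming
 * 4KiB pages): three fully populated segments (first_sect 0, last_sect 7
 * each) share contiguous KVA and collapse into a single 12KiB bio.  If the
 * second segment instead began at first_sect 4, the loop would close the
 * first bio at the page boundary and start a second bio at that segment's
 * offset.
 */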
2220
2221SDT_PROBE_DEFINE1(xbb, kernel, xbb_dispatch_file, flush, flush, "int");
2222SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_file, read, read, "int", "uint64_t",
2223 "uint64_t");
2224SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_file, write, write, "int",
2225 "uint64_t", "uint64_t");
2226
2227/**
2228 * Backend handler for file access.
2229 *
2230 * \param xbb Per-instance xbb configuration structure.
2231 * \param reqlist Allocated internal request list.
2232 * \param operation BIO_* I/O operation code.
2233 * \param flags Additional bio_flag data to pass to any generated bios
 2234 * (e.g. BIO_ORDERED).
2235 *
2236 * \return 0 for success, errno codes for failure.
2237 */
2238static int
2239xbb_dispatch_file(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist,
2240 int operation, int flags)
2241{
2242 struct xbb_file_data *file_data;
2243 u_int seg_idx;
2244 u_int nseg;
2245 off_t sectors_sent;
2246 struct uio xuio;
2247 struct xbb_sg *xbb_sg;
2248 struct iovec *xiovec;
2249#ifdef XBB_USE_BOUNCE_BUFFERS
2250 void **p_vaddr;
2251 int saved_uio_iovcnt;
2252#endif /* XBB_USE_BOUNCE_BUFFERS */
2253 int error;
2254
2255 file_data = &xbb->backend.file;
2256 sectors_sent = 0;
2257 error = 0;
2258 bzero(&xuio, sizeof(xuio));
2259
2260 switch (operation) {
2261 case BIO_READ:
2262 xuio.uio_rw = UIO_READ;
2263 break;
2264 case BIO_WRITE:
2265 xuio.uio_rw = UIO_WRITE;
2266 break;
2267 case BIO_FLUSH: {
2268 struct mount *mountpoint;
2269
2270 SDT_PROBE1(xbb, kernel, xbb_dispatch_file, flush,
2271 device_get_unit(xbb->dev));
2272
2273 (void) vn_start_write(xbb->vn, &mountpoint, V_WAIT);
2274
2275 vn_lock(xbb->vn, LK_EXCLUSIVE | LK_RETRY);
2276 error = VOP_FSYNC(xbb->vn, MNT_WAIT, curthread);
2277 VOP_UNLOCK(xbb->vn, 0);
2278
2279 vn_finished_write(mountpoint);
2280
2281 goto bailout_send_response;
2282 /* NOTREACHED */
2283 }
2284 default:
2285 panic("invalid operation %d", operation);
2286 /* NOTREACHED */
2287 }
2288 xuio.uio_offset = (vm_offset_t)reqlist->starting_sector_number
2289 << xbb->sector_size_shift;
2290 xuio.uio_segflg = UIO_SYSSPACE;
2291 xuio.uio_iov = file_data->xiovecs;
2292 xuio.uio_iovcnt = 0;
2293 xbb_sg = xbb->xbb_sgs;
2294 nseg = reqlist->nr_segments;
2295
2296 for (xiovec = NULL, seg_idx = 0; seg_idx < nseg; seg_idx++, xbb_sg++) {
2297
2298 /*
2299 * If the first sector is not 0, the KVA will
2300 * not be contiguous and we'll need to go on
2301 * to another segment.
2302 */
2303 if (xbb_sg->first_sect != 0)
2304 xiovec = NULL;
2305
2306 if (xiovec == NULL) {
2307 xiovec = &file_data->xiovecs[xuio.uio_iovcnt];
2308 xiovec->iov_base = xbb_reqlist_ioaddr(reqlist,
2309 seg_idx, xbb_sg->first_sect);
2310#ifdef XBB_USE_BOUNCE_BUFFERS
2311 /*
2312 * Store the address of the incoming
2313 * buffer at this particular offset
2314 * as well, so we can do the copy
2315 * later without having to do more
2316 * work to recalculate this address.
2317 */
2318 p_vaddr = &file_data->xiovecs_vaddr[xuio.uio_iovcnt];
2319 *p_vaddr = xbb_reqlist_vaddr(reqlist, seg_idx,
2320 xbb_sg->first_sect);
2321#endif /* XBB_USE_BOUNCE_BUFFERS */
2322 xiovec->iov_len = 0;
2323 xuio.uio_iovcnt++;
2324 }
2325
2326 xiovec->iov_len += xbb_sg->nsect << 9;
2327
2328 xuio.uio_resid += xbb_sg->nsect << 9;
2329
2330 /*
2331 * If the last sector is not the full page
2332 * size count, the next segment will not be
2333 * contiguous in KVA and we need a new iovec.
2334 */
2335 if (xbb_sg->last_sect != (PAGE_SIZE - 512) >> 9)
2336 xiovec = NULL;
2337 }
2338
2339 xuio.uio_td = curthread;
2340
2341#ifdef XBB_USE_BOUNCE_BUFFERS
2342 saved_uio_iovcnt = xuio.uio_iovcnt;
2343
2344 if (operation == BIO_WRITE) {
2345 /* Copy the write data to the local buffer. */
2346 for (seg_idx = 0, p_vaddr = file_data->xiovecs_vaddr,
2347 xiovec = xuio.uio_iov; seg_idx < xuio.uio_iovcnt;
2348 seg_idx++, xiovec++, p_vaddr++) {
2349
2350 memcpy(xiovec->iov_base, *p_vaddr, xiovec->iov_len);
2351 }
2352 } else {
2353 /*
2354 * We only need to save off the iovecs in the case of a
2355 * read, because the copy for the read happens after the
2356 * VOP_READ(). (The uio will get modified in that call
2357 * sequence.)
2358 */
2359 memcpy(file_data->saved_xiovecs, xuio.uio_iov,
2360 xuio.uio_iovcnt * sizeof(xuio.uio_iov[0]));
2361 }
2362#endif /* XBB_USE_BOUNCE_BUFFERS */
2363
2364 switch (operation) {
2365 case BIO_READ:
2366
2367 SDT_PROBE3(xbb, kernel, xbb_dispatch_file, read,
2368 device_get_unit(xbb->dev), xuio.uio_offset,
2369 xuio.uio_resid);
2370
2371 vn_lock(xbb->vn, LK_EXCLUSIVE | LK_RETRY);
2372
2373 /*
2374 * UFS pays attention to IO_DIRECT for reads. If the
2375 * DIRECTIO option is configured into the kernel, it calls
2376 * ffs_rawread(). But that only works for single-segment
2377 * uios with user space addresses. In our case, with a
2378 * kernel uio, it still reads into the buffer cache, but it
2379 * will just try to release the buffer from the cache later
2380 * on in ffs_read().
2381 *
2382 * ZFS does not pay attention to IO_DIRECT for reads.
2383 *
2384 * UFS does not pay attention to IO_SYNC for reads.
2385 *
2386 * ZFS pays attention to IO_SYNC (which translates into the
2387 * Solaris define FRSYNC for zfs_read()) for reads. It
2388 * attempts to sync the file before reading.
2389 *
2390 * So, to attempt to provide some barrier semantics in the
2391 * BIO_ORDERED case, set both IO_DIRECT and IO_SYNC.
2392 */
2393 error = VOP_READ(xbb->vn, &xuio, (flags & BIO_ORDERED) ?
2394 (IO_DIRECT|IO_SYNC) : 0, file_data->cred);
2395
2396 VOP_UNLOCK(xbb->vn, 0);
2397 break;
2398 case BIO_WRITE: {
2399 struct mount *mountpoint;
2400
2401 SDT_PROBE3(xbb, kernel, xbb_dispatch_file, write,
2402 device_get_unit(xbb->dev), xuio.uio_offset,
2403 xuio.uio_resid);
2404
2405 (void)vn_start_write(xbb->vn, &mountpoint, V_WAIT);
2406
2407 vn_lock(xbb->vn, LK_EXCLUSIVE | LK_RETRY);
2408
2409 /*
2410 * UFS pays attention to IO_DIRECT for writes. The write
 2411 * is done asynchronously. (Normally the write would just
 2412 * get put into the cache.)
2413 *
2414 * UFS pays attention to IO_SYNC for writes. It will
2415 * attempt to write the buffer out synchronously if that
2416 * flag is set.
2417 *
2418 * ZFS does not pay attention to IO_DIRECT for writes.
2419 *
2420 * ZFS pays attention to IO_SYNC (a.k.a. FSYNC or FRSYNC)
2421 * for writes. It will flush the transaction from the
2422 * cache before returning.
2423 *
2424 * So if we've got the BIO_ORDERED flag set, we want
2425 * IO_SYNC in either the UFS or ZFS case.
2426 */
2427 error = VOP_WRITE(xbb->vn, &xuio, (flags & BIO_ORDERED) ?
2428 IO_SYNC : 0, file_data->cred);
2429 VOP_UNLOCK(xbb->vn, 0);
2430
2431 vn_finished_write(mountpoint);
2432
2433 break;
2434 }
2435 default:
2436 panic("invalid operation %d", operation);
2437 /* NOTREACHED */
2438 }
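	/*
	 * Summary of the ioflag mapping used above (editor's note):
	 *
	 *	operation	BIO_ORDERED set		BIO_ORDERED clear
	 *	BIO_READ	IO_DIRECT | IO_SYNC	0
	 *	BIO_WRITE	IO_SYNC			0
	 */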
2439
2440#ifdef XBB_USE_BOUNCE_BUFFERS
2441 /* We only need to copy here for read operations */
2442 if (operation == BIO_READ) {
2443
2444 for (seg_idx = 0, p_vaddr = file_data->xiovecs_vaddr,
2445 xiovec = file_data->saved_xiovecs;
2446 seg_idx < saved_uio_iovcnt; seg_idx++,
2447 xiovec++, p_vaddr++) {
2448
2449 /*
2450 * Note that we have to use the copy of the
2451 * io vector we made above. uiomove() modifies
2452 * the uio and its referenced vector as uiomove
2453 * performs the copy, so we can't rely on any
2454 * state from the original uio.
2455 */
2456 memcpy(*p_vaddr, xiovec->iov_base, xiovec->iov_len);
2457 }
2458 }
2459#endif /* XBB_USE_BOUNCE_BUFFERS */
2460
2461bailout_send_response:
2462
2463 if (error != 0)
2464 reqlist->status = BLKIF_RSP_ERROR;
2465
2466 xbb_complete_reqlist(xbb, reqlist);
2467
2468 return (0);
2469}
2470
2471/*--------------------------- Backend Configuration --------------------------*/
2472/**
2473 * Close and cleanup any backend device/file specific state for this
2474 * block back instance.
2475 *
2476 * \param xbb Per-instance xbb configuration structure.
2477 */
2478static void
2479xbb_close_backend(struct xbb_softc *xbb)
2480{
2481 DROP_GIANT();
2482 DPRINTF("closing dev=%s\n", xbb->dev_name);
2483 if (xbb->vn) {
2484 int flags = FREAD;
2485
2486 if ((xbb->flags & XBBF_READ_ONLY) == 0)
2487 flags |= FWRITE;
2488
2489 switch (xbb->device_type) {
2490 case XBB_TYPE_DISK:
2491 if (xbb->backend.dev.csw) {
2492 dev_relthread(xbb->backend.dev.cdev,
2493 xbb->backend.dev.dev_ref);
2494 xbb->backend.dev.csw = NULL;
2495 xbb->backend.dev.cdev = NULL;
2496 }
2497 break;
2498 case XBB_TYPE_FILE:
2499 break;
2500 case XBB_TYPE_NONE:
2501 default:
2502 panic("Unexpected backend type.");
2503 break;
2504 }
2505
2506 (void)vn_close(xbb->vn, flags, NOCRED, curthread);
2507 xbb->vn = NULL;
2508
2509 switch (xbb->device_type) {
2510 case XBB_TYPE_DISK:
2511 break;
2512 case XBB_TYPE_FILE:
2513 if (xbb->backend.file.cred != NULL) {
2514 crfree(xbb->backend.file.cred);
2515 xbb->backend.file.cred = NULL;
2516 }
2517 break;
2518 case XBB_TYPE_NONE:
2519 default:
2520 panic("Unexpected backend type.");
2521 break;
2522 }
2523 }
2524 PICKUP_GIANT();
2525}
2526
2527/**
2528 * Open a character device to be used for backend I/O.
2529 *
2530 * \param xbb Per-instance xbb configuration structure.
2531 *
2532 * \return 0 for success, errno codes for failure.
2533 */
2534static int
2535xbb_open_dev(struct xbb_softc *xbb)
2536{
2537 struct vattr vattr;
2538 struct cdev *dev;
2539 struct cdevsw *devsw;
2540 int error;
2541
2542 xbb->device_type = XBB_TYPE_DISK;
2543 xbb->dispatch_io = xbb_dispatch_dev;
2544 xbb->backend.dev.cdev = xbb->vn->v_rdev;
2545 xbb->backend.dev.csw = dev_refthread(xbb->backend.dev.cdev,
2546 &xbb->backend.dev.dev_ref);
2547 if (xbb->backend.dev.csw == NULL)
2548 panic("Unable to retrieve device switch");
2549
2550 error = VOP_GETATTR(xbb->vn, &vattr, NOCRED);
2551 if (error) {
2552 xenbus_dev_fatal(xbb->dev, error, "error getting "
2553 "vnode attributes for device %s",
2554 xbb->dev_name);
2555 return (error);
2556 }
2557
2558
2559 dev = xbb->vn->v_rdev;
2560 devsw = dev->si_devsw;
2561 if (!devsw->d_ioctl) {
2562 xenbus_dev_fatal(xbb->dev, ENODEV, "no d_ioctl for "
2563 "device %s!", xbb->dev_name);
2564 return (ENODEV);
2565 }
2566
2567 error = devsw->d_ioctl(dev, DIOCGSECTORSIZE,
2568 (caddr_t)&xbb->sector_size, FREAD,
2569 curthread);
2570 if (error) {
2571 xenbus_dev_fatal(xbb->dev, error,
2572 "error calling ioctl DIOCGSECTORSIZE "
2573 "for device %s", xbb->dev_name);
2574 return (error);
2575 }
2576
2577 error = devsw->d_ioctl(dev, DIOCGMEDIASIZE,
2578 (caddr_t)&xbb->media_size, FREAD,
2579 curthread);
2580 if (error) {
2581 xenbus_dev_fatal(xbb->dev, error,
2582 "error calling ioctl DIOCGMEDIASIZE "
2583 "for device %s", xbb->dev_name);
2584 return (error);
2585 }
2586
2587 return (0);
2588}
2589
2590/**
2591 * Open a file to be used for backend I/O.
2592 *
2593 * \param xbb Per-instance xbb configuration structure.
2594 *
2595 * \return 0 for success, errno codes for failure.
2596 */
2597static int
2598xbb_open_file(struct xbb_softc *xbb)
2599{
2600 struct xbb_file_data *file_data;
2601 struct vattr vattr;
2602 int error;
2603
2604 file_data = &xbb->backend.file;
2605 xbb->device_type = XBB_TYPE_FILE;
2606 xbb->dispatch_io = xbb_dispatch_file;
2607 error = VOP_GETATTR(xbb->vn, &vattr, curthread->td_ucred);
2608 if (error != 0) {
2609 xenbus_dev_fatal(xbb->dev, error,
 2610 "error calling VOP_GETATTR() "
2611 "for file %s", xbb->dev_name);
2612 return (error);
2613 }
2614
2615 /*
2616 * Verify that we have the ability to upgrade to exclusive
2617 * access on this file so we can trap errors at open instead
2618 * of reporting them during first access.
2619 */
2620 if (VOP_ISLOCKED(xbb->vn) != LK_EXCLUSIVE) {
2621 vn_lock(xbb->vn, LK_UPGRADE | LK_RETRY);
2622 if (xbb->vn->v_iflag & VI_DOOMED) {
2623 error = EBADF;
2624 xenbus_dev_fatal(xbb->dev, error,
2625 "error locking file %s",
2626 xbb->dev_name);
2627
2628 return (error);
2629 }
2630 }
2631
2632 file_data->cred = crhold(curthread->td_ucred);
2633 xbb->media_size = vattr.va_size;
2634
2635 /*
2636 * XXX KDM vattr.va_blocksize may be larger than 512 bytes here.
2637 * With ZFS, it is 131072 bytes. Block sizes that large don't work
2638 * with disklabel and UFS on FreeBSD at least. Large block sizes
 2639 * may not work with other OSes either. So just export a sector
2640 * size of 512 bytes, which should work with any OS or
2641 * application. Since our backing is a file, any block size will
2642 * work fine for the backing store.
2643 */
2644#if 0
2645 xbb->sector_size = vattr.va_blocksize;
2646#endif
2647 xbb->sector_size = 512;
2648
2649 /*
2650 * Sanity check. The media size has to be at least one
2651 * sector long.
2652 */
2653 if (xbb->media_size < xbb->sector_size) {
2654 error = EINVAL;
2655 xenbus_dev_fatal(xbb->dev, error,
2656 "file %s size %ju < block size %u",
2657 xbb->dev_name,
2658 (uintmax_t)xbb->media_size,
2659 xbb->sector_size);
2660 }
2661 return (error);
2662}
2663
2664/**
2665 * Open the backend provider for this connection.
2666 *
2667 * \param xbb Per-instance xbb configuration structure.
2668 *
2669 * \return 0 for success, errno codes for failure.
2670 */
2671static int
2672xbb_open_backend(struct xbb_softc *xbb)
2673{
2674 struct nameidata nd;
2675 int flags;
2676 int error;
2677
2678 flags = FREAD;
2679 error = 0;
2680
2681 DPRINTF("opening dev=%s\n", xbb->dev_name);
2682
2683 if (rootvnode == NULL) {
2684 xenbus_dev_fatal(xbb->dev, ENOENT,
2685 "Root file system not mounted");
2686 return (ENOENT);
2687 }
2688
2689 if ((xbb->flags & XBBF_READ_ONLY) == 0)
2690 flags |= FWRITE;
2691
2692 if (!curthread->td_proc->p_fd->fd_cdir) {
2693 curthread->td_proc->p_fd->fd_cdir = rootvnode;
2694 VREF(rootvnode);
2695 }
2696 if (!curthread->td_proc->p_fd->fd_rdir) {
2697 curthread->td_proc->p_fd->fd_rdir = rootvnode;
2698 VREF(rootvnode);
2699 }
2700 if (!curthread->td_proc->p_fd->fd_jdir) {
2701 curthread->td_proc->p_fd->fd_jdir = rootvnode;
2702 VREF(rootvnode);
2703 }
2704
2705 again:
2706 NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, xbb->dev_name, curthread);
2707 error = vn_open(&nd, &flags, 0, NULL);
2708 if (error) {
2709 /*
2710 * This is the only reasonable guess we can make as far as
2711 * path if the user doesn't give us a fully qualified path.
2712 * If they want to specify a file, they need to specify the
2713 * full path.
2714 */
2715 if (xbb->dev_name[0] != '/') {
2716 char *dev_path = "/dev/";
2717 char *dev_name;
2718
2719 /* Try adding device path at beginning of name */
2720 dev_name = malloc(strlen(xbb->dev_name)
2721 + strlen(dev_path) + 1,
2722 M_XENBLOCKBACK, M_NOWAIT);
2723 if (dev_name) {
2724 sprintf(dev_name, "%s%s", dev_path,
2725 xbb->dev_name);
2726 free(xbb->dev_name, M_XENBLOCKBACK);
2727 xbb->dev_name = dev_name;
2728 goto again;
2729 }
2730 }
2731 xenbus_dev_fatal(xbb->dev, error, "error opening device %s",
2732 xbb->dev_name);
2733 return (error);
2734 }
2735
2736 NDFREE(&nd, NDF_ONLY_PNBUF);
2737
2738 xbb->vn = nd.ni_vp;
2739
2740 /* We only support disks and files. */
2741 if (vn_isdisk(xbb->vn, &error)) {
2742 error = xbb_open_dev(xbb);
2743 } else if (xbb->vn->v_type == VREG) {
2744 error = xbb_open_file(xbb);
2745 } else {
2746 error = EINVAL;
2747 xenbus_dev_fatal(xbb->dev, error, "%s is not a disk "
2748 "or file", xbb->dev_name);
2749 }
2750 VOP_UNLOCK(xbb->vn, 0);
2765
2766 if (error != 0) {
2767 xbb_close_backend(xbb);
2768 return (error);
2769 }
2770
2771 xbb->sector_size_shift = fls(xbb->sector_size) - 1;
2772 xbb->media_num_sectors = xbb->media_size >> xbb->sector_size_shift;
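	/*
	 * Worked example (illustrative): for a 512-byte-sector backend,
	 * fls(512) - 1 = 9, so a 1GiB backing store gives
	 * media_num_sectors = (1 << 30) >> 9 = 2097152.
	 */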
2773
2774 DPRINTF("opened %s=%s sector_size=%u media_size=%" PRId64 "\n",
2775 (xbb->device_type == XBB_TYPE_DISK) ? "dev" : "file",
2776 xbb->dev_name, xbb->sector_size, xbb->media_size);
2777
2778 return (0);
2779}
2780
2781/*------------------------ Inter-Domain Communication ------------------------*/
2782/**
2783 * Free dynamically allocated KVA or pseudo-physical address allocations.
2784 *
2785 * \param xbb Per-instance xbb configuration structure.
2786 */
2787static void
2788xbb_free_communication_mem(struct xbb_softc *xbb)
2789{
2790 if (xbb->kva != 0) {
2791#ifndef XENHVM
2792 kmem_free(kernel_map, xbb->kva, xbb->kva_size);
2793#else
2794 if (xbb->pseudo_phys_res != NULL) {
2795 bus_release_resource(xbb->dev, SYS_RES_MEMORY,
2796 xbb->pseudo_phys_res_id,
2797 xbb->pseudo_phys_res);
2798 xbb->pseudo_phys_res = NULL;
2799 }
2800#endif
2801 }
2802 xbb->kva = 0;
2803 xbb->gnt_base_addr = 0;
2804 if (xbb->kva_free != NULL) {
2805 free(xbb->kva_free, M_XENBLOCKBACK);
2806 xbb->kva_free = NULL;
2807 }
2808}
2809
2810/**
2811 * Cleanup all inter-domain communication mechanisms.
2812 *
2813 * \param xbb Per-instance xbb configuration structure.
2814 */
2815static int
2816xbb_disconnect(struct xbb_softc *xbb)
2817{
2818 struct gnttab_unmap_grant_ref ops[XBB_MAX_RING_PAGES];
2819 struct gnttab_unmap_grant_ref *op;
2820 u_int ring_idx;
2821 int error;
2822
2823 DPRINTF("\n");
2824
2825 if ((xbb->flags & XBBF_RING_CONNECTED) == 0)
2826 return (0);
2827
2828 if (xbb->irq != 0) {
2829 unbind_from_irqhandler(xbb->irq);
2830 xbb->irq = 0;
2831 }
2832
2833 mtx_unlock(&xbb->lock);
2834 taskqueue_drain(xbb->io_taskqueue, &xbb->io_task);
2835 mtx_lock(&xbb->lock);
2836
2837 /*
2838 * No new interrupts can generate work, but we must wait
2839 * for all currently active requests to drain.
2840 */
2841 if (xbb->active_request_count != 0)
2842 return (EAGAIN);
2843
2844 for (ring_idx = 0, op = ops;
2845 ring_idx < xbb->ring_config.ring_pages;
2846 ring_idx++, op++) {
2847
2848 op->host_addr = xbb->ring_config.gnt_addr
2849 + (ring_idx * PAGE_SIZE);
2850 op->dev_bus_addr = xbb->ring_config.bus_addr[ring_idx];
2851 op->handle = xbb->ring_config.handle[ring_idx];
2852 }
2853
2854 error = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, ops,
2855 xbb->ring_config.ring_pages);
2856 if (error != 0)
2857 panic("Grant table op failed (%d)", error);
2858
2859 xbb_free_communication_mem(xbb);
2860
2861 if (xbb->requests != NULL) {
2862 free(xbb->requests, M_XENBLOCKBACK);
2863 xbb->requests = NULL;
2864 }
2865
2866 if (xbb->request_lists != NULL) {
2867 struct xbb_xen_reqlist *reqlist;
2868 int i;
2869
 2870 /* There is one request list for every allocated request. */
2871 for (i = 0, reqlist = xbb->request_lists;
2872 i < xbb->max_requests; i++, reqlist++){
2873#ifdef XBB_USE_BOUNCE_BUFFERS
2874 if (reqlist->bounce != NULL) {
2875 free(reqlist->bounce, M_XENBLOCKBACK);
2876 reqlist->bounce = NULL;
2877 }
2878#endif
2879 if (reqlist->gnt_handles != NULL) {
2880 free(reqlist->gnt_handles, M_XENBLOCKBACK);
2881 reqlist->gnt_handles = NULL;
2882 }
2883 }
2884 free(xbb->request_lists, M_XENBLOCKBACK);
2885 xbb->request_lists = NULL;
2886 }
2887
2888 xbb->flags &= ~XBBF_RING_CONNECTED;
2889 return (0);
2890}
2891
2892/**
2893 * Map shared memory ring into domain local address space, initialize
2894 * ring control structures, and bind an interrupt to the event channel
2895 * used to notify us of ring changes.
2896 *
2897 * \param xbb Per-instance xbb configuration structure.
2898 */
2899static int
2900xbb_connect_ring(struct xbb_softc *xbb)
2901{
2902 struct gnttab_map_grant_ref gnts[XBB_MAX_RING_PAGES];
2903 struct gnttab_map_grant_ref *gnt;
2904 u_int ring_idx;
2905 int error;
2906
2907 if ((xbb->flags & XBBF_RING_CONNECTED) != 0)
2908 return (0);
2909
2910 /*
2911 * Kva for our ring is at the tail of the region of kva allocated
2912 * by xbb_alloc_communication_mem().
2913 */
2914 xbb->ring_config.va = xbb->kva
2915 + (xbb->kva_size
2916 - (xbb->ring_config.ring_pages * PAGE_SIZE));
2917 xbb->ring_config.gnt_addr = xbb->gnt_base_addr
2918 + (xbb->kva_size
2919 - (xbb->ring_config.ring_pages * PAGE_SIZE));
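	/*
	 * Illustrative layout of the communication region (request segment
	 * KVA first, ring pages at the tail); ring_config.gnt_addr sits at
	 * the same offset from gnt_base_addr:
	 *
	 *	kva                                         kva + kva_size
	 *	|<---- reqlist_kva_size ---->|<- ring_pages * PAGE_SIZE ->|
	 *	                             ^
	 *	                             ring_config.va
	 */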
2920
2921 for (ring_idx = 0, gnt = gnts;
2922 ring_idx < xbb->ring_config.ring_pages;
2923 ring_idx++, gnt++) {
2924
2925 gnt->host_addr = xbb->ring_config.gnt_addr
2926 + (ring_idx * PAGE_SIZE);
2927 gnt->flags = GNTMAP_host_map;
2928 gnt->ref = xbb->ring_config.ring_ref[ring_idx];
2929 gnt->dom = xbb->otherend_id;
2930 }
2931
2932 error = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, gnts,
2933 xbb->ring_config.ring_pages);
2934 if (error)
2935 panic("blkback: Ring page grant table op failed (%d)", error);
2936
2937 for (ring_idx = 0, gnt = gnts;
2938 ring_idx < xbb->ring_config.ring_pages;
2939 ring_idx++, gnt++) {
2940 if (gnt->status != 0) {
2941 xbb->ring_config.va = 0;
2942 xenbus_dev_fatal(xbb->dev, EACCES,
2943 "Ring shared page mapping failed. "
2944 "Status %d.", gnt->status);
2945 return (EACCES);
2946 }
2947 xbb->ring_config.handle[ring_idx] = gnt->handle;
2948 xbb->ring_config.bus_addr[ring_idx] = gnt->dev_bus_addr;
2949 }
2950
2951 /* Initialize the ring based on ABI. */
2952 switch (xbb->abi) {
2953 case BLKIF_PROTOCOL_NATIVE:
2954 {
2955 blkif_sring_t *sring;
2956 sring = (blkif_sring_t *)xbb->ring_config.va;
2957 BACK_RING_INIT(&xbb->rings.native, sring,
2958 xbb->ring_config.ring_pages * PAGE_SIZE);
2959 break;
2960 }
2961 case BLKIF_PROTOCOL_X86_32:
2962 {
2963 blkif_x86_32_sring_t *sring_x86_32;
2964 sring_x86_32 = (blkif_x86_32_sring_t *)xbb->ring_config.va;
2965 BACK_RING_INIT(&xbb->rings.x86_32, sring_x86_32,
2966 xbb->ring_config.ring_pages * PAGE_SIZE);
2967 break;
2968 }
2969 case BLKIF_PROTOCOL_X86_64:
2970 {
2971 blkif_x86_64_sring_t *sring_x86_64;
2972 sring_x86_64 = (blkif_x86_64_sring_t *)xbb->ring_config.va;
2973 BACK_RING_INIT(&xbb->rings.x86_64, sring_x86_64,
2974 xbb->ring_config.ring_pages * PAGE_SIZE);
2975 break;
2976 }
2977 default:
2978 panic("Unexpected blkif protocol ABI.");
2979 }
2980
2981 xbb->flags |= XBBF_RING_CONNECTED;
2982
2983 error =
2984 bind_interdomain_evtchn_to_irqhandler(xbb->otherend_id,
2985 xbb->ring_config.evtchn,
2986 device_get_nameunit(xbb->dev),
2987 xbb_intr, /*arg*/xbb,
2988 INTR_TYPE_BIO | INTR_MPSAFE,
2989 &xbb->irq);
2990 if (error) {
2991 (void)xbb_disconnect(xbb);
2992 xenbus_dev_fatal(xbb->dev, error, "binding event channel");
2993 return (error);
2994 }
2995
2996 DPRINTF("rings connected!\n");
2997
2998 return 0;
2999}
3000
3001/* Needed to make bit_alloc() macro work */
3002#define calloc(count, size) malloc((count)*(size), M_XENBLOCKBACK, \
3003 M_NOWAIT|M_ZERO);
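/*
 * Illustrative use of the bitstring(3) interface the calloc() shim above
 * enables (hypothetical bitmap of "nbits" entries, not code from this
 * driver):
 *
 *	bitstr_t *map = bit_alloc(nbits);
 *	int idx;
 *
 *	bit_ffc(map, nbits, &idx);
 *	if (idx != -1)
 *		bit_set(map, idx);
 */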
3004
3005/**
3006 * Size KVA and pseudo-physical address allocations based on negotiated
3007 * values for the size and number of I/O requests, and the size of our
3008 * communication ring.
3009 *
3010 * \param xbb Per-instance xbb configuration structure.
3011 *
3012 * These address spaces are used to dynamically map pages in the
3013 * front-end's domain into our own.
3014 */
3015static int
3016xbb_alloc_communication_mem(struct xbb_softc *xbb)
3017{
3018 xbb->reqlist_kva_pages = xbb->max_requests * xbb->max_request_segments;
3019 xbb->reqlist_kva_size = xbb->reqlist_kva_pages * PAGE_SIZE;
3020 xbb->kva_size = xbb->reqlist_kva_size +
3021 (xbb->ring_config.ring_pages * PAGE_SIZE);
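	/*
	 * Sizing example (illustrative, assuming 4KiB pages): with 128
	 * negotiated requests of 8 segments each and a single ring page,
	 * reqlist_kva_pages = 1024, reqlist_kva_size = 4MiB, and
	 * kva_size = 4MiB + 4KiB.
	 */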
3022
3023 xbb->kva_free = bit_alloc(xbb->reqlist_kva_pages);
3024 if (xbb->kva_free == NULL)
3025 return (ENOMEM);
3026
3027 DPRINTF("%s: kva_size = %d, reqlist_kva_size = %d\n",
3028 device_get_nameunit(xbb->dev), xbb->kva_size,
3029 xbb->reqlist_kva_size);
3030#ifndef XENHVM
3031 xbb->kva = kmem_alloc_nofault(kernel_map, xbb->kva_size);
3032 if (xbb->kva == 0)
3033 return (ENOMEM);
3034 xbb->gnt_base_addr = xbb->kva;
3035#else /* XENHVM */
3036 /*
3037 * Reserve a range of pseudo physical memory that we can map
3038 * into kva. These pages will only be backed by machine
3039 * pages ("real memory") during the lifetime of front-end requests
3040 * via grant table operations.
3041 */
3042 xbb->pseudo_phys_res_id = 0;
3043 xbb->pseudo_phys_res = bus_alloc_resource(xbb->dev, SYS_RES_MEMORY,
3044 &xbb->pseudo_phys_res_id,
3045 0, ~0, xbb->kva_size,
3046 RF_ACTIVE);
3047 if (xbb->pseudo_phys_res == NULL) {
3048 xbb->kva = 0;
3049 return (ENOMEM);
3050 }
3051 xbb->kva = (vm_offset_t)rman_get_virtual(xbb->pseudo_phys_res);
3052 xbb->gnt_base_addr = rman_get_start(xbb->pseudo_phys_res);
3053#endif /* XENHVM */
3054
3055 DPRINTF("%s: kva: %#jx, gnt_base_addr: %#jx\n",
3056 device_get_nameunit(xbb->dev), (uintmax_t)xbb->kva,
3057 (uintmax_t)xbb->gnt_base_addr);
3058 return (0);
3059}
3060
3061/**
3062 * Collect front-end information from the XenStore.
3063 *
3064 * \param xbb Per-instance xbb configuration structure.
3065 */
3066static int
3067xbb_collect_frontend_info(struct xbb_softc *xbb)
3068{
3069 char protocol_abi[64];
3070 const char *otherend_path;
3071 int error;
3072 u_int ring_idx;
3073 u_int ring_page_order;
3074 size_t ring_size;
3075
3076 otherend_path = xenbus_get_otherend_path(xbb->dev);
3077
3078 /*
3079 * Protocol defaults valid even if all negotiation fails.
3080 */
3081 xbb->ring_config.ring_pages = 1;
3082 xbb->max_request_segments = BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK;
3083 xbb->max_request_size = xbb->max_request_segments * PAGE_SIZE;
3084
3085 /*
3086 * Mandatory data (used in all versions of the protocol) first.
3087 */
3088 error = xs_scanf(XST_NIL, otherend_path,
3089 "event-channel", NULL, "%" PRIu32,
3090 &xbb->ring_config.evtchn);
3091 if (error != 0) {
3092 xenbus_dev_fatal(xbb->dev, error,
3093 "Unable to retrieve event-channel information "
3094 "from frontend %s. Unable to connect.",
3095 xenbus_get_otherend_path(xbb->dev));
3096 return (error);
3097 }
3098
3099 /*
3100 * These fields are initialized to legacy protocol defaults
3101 * so we only need to fail if reading the updated value succeeds
3102 * and the new value is outside of its allowed range.
3103 *
3104 * \note xs_gather() returns on the first encountered error, so
3105	 *       we must use independent calls in order to guarantee
3106	 *       we don't miss information in a sparsely populated front-end
3107 * tree.
3108 *
3109 * \note xs_scanf() does not update variables for unmatched
3110 * fields.
3111 */
3112 ring_page_order = 0;
3113 (void)xs_scanf(XST_NIL, otherend_path,
3114 "ring-page-order", NULL, "%u",
3115 &ring_page_order);
3116 xbb->ring_config.ring_pages = 1 << ring_page_order;
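	/*
	 * Some front-ends publish "num-ring-pages" (a direct page count)
	 * rather than "ring-page-order"; when present it overrides the
	 * value computed from the order above.
	 */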
3117 (void)xs_scanf(XST_NIL, otherend_path,
3118 "num-ring-pages", NULL, "%u",
3119 &xbb->ring_config.ring_pages);
3120 ring_size = PAGE_SIZE * xbb->ring_config.ring_pages;
3121 xbb->max_requests = BLKIF_MAX_RING_REQUESTS(ring_size);
3122
3123 (void)xs_scanf(XST_NIL, otherend_path,
3124 "max-requests", NULL, "%u",
3125 &xbb->max_requests);
3126
3127 (void)xs_scanf(XST_NIL, otherend_path,
3128 "max-request-segments", NULL, "%u",
3129 &xbb->max_request_segments);
3130
3131 (void)xs_scanf(XST_NIL, otherend_path,
3132 "max-request-size", NULL, "%u",
3133 &xbb->max_request_size);
3134
3135 if (xbb->ring_config.ring_pages > XBB_MAX_RING_PAGES) {
3136 xenbus_dev_fatal(xbb->dev, EINVAL,
3137 "Front-end specified ring-pages of %u "
3138 "exceeds backend limit of %zu. "
3139 "Unable to connect.",
3140 xbb->ring_config.ring_pages,
3141 XBB_MAX_RING_PAGES);
3142 return (EINVAL);
3143 } else if (xbb->max_requests > XBB_MAX_REQUESTS) {
3144 xenbus_dev_fatal(xbb->dev, EINVAL,
3145 "Front-end specified max_requests of %u "
3146 "exceeds backend limit of %u. "
3147 "Unable to connect.",
3148 xbb->max_requests,
3149 XBB_MAX_REQUESTS);
3150 return (EINVAL);
3151 } else if (xbb->max_request_segments > XBB_MAX_SEGMENTS_PER_REQUEST) {
3152 xenbus_dev_fatal(xbb->dev, EINVAL,
3153				  "Front-end specified max_request_segments "
3154 "of %u exceeds backend limit of %u. "
3155 "Unable to connect.",
3156 xbb->max_request_segments,
3157 XBB_MAX_SEGMENTS_PER_REQUEST);
3158 return (EINVAL);
3159 } else if (xbb->max_request_size > XBB_MAX_REQUEST_SIZE) {
3160 xenbus_dev_fatal(xbb->dev, EINVAL,
3161 "Front-end specified max_request_size "
3162 "of %u exceeds backend limit of %u. "
3163 "Unable to connect.",
3164 xbb->max_request_size,
3165 XBB_MAX_REQUEST_SIZE);
3166 return (EINVAL);
3167 }
3168
3169 if (xbb->ring_config.ring_pages == 1) {
3170 error = xs_gather(XST_NIL, otherend_path,
3171 "ring-ref", "%" PRIu32,
3172 &xbb->ring_config.ring_ref[0],
3173 NULL);
3174 if (error != 0) {
3175 xenbus_dev_fatal(xbb->dev, error,
3176 "Unable to retrieve ring information "
3177 "from frontend %s. Unable to "
3178 "connect.",
3179 xenbus_get_otherend_path(xbb->dev));
3180 return (error);
3181 }
3182 } else {
3183 /* Multi-page ring format. */
3184 for (ring_idx = 0; ring_idx < xbb->ring_config.ring_pages;
3185 ring_idx++) {
3186 char ring_ref_name[]= "ring_refXX";
3187
3188 snprintf(ring_ref_name, sizeof(ring_ref_name),
3189 "ring-ref%u", ring_idx);
3190 error = xs_scanf(XST_NIL, otherend_path,
3191 ring_ref_name, NULL, "%" PRIu32,
3192 &xbb->ring_config.ring_ref[ring_idx]);
3193 if (error != 0) {
3194 xenbus_dev_fatal(xbb->dev, error,
3195						 "Failed to retrieve grant "
3196 "reference for page %u of "
3197 "shared ring. Unable "
3198 "to connect.", ring_idx);
3199 return (error);
3200 }
3201 }
3202 }
3203
3204 error = xs_gather(XST_NIL, otherend_path,
3205 "protocol", "%63s", protocol_abi,
3206 NULL);
3207 if (error != 0
3208 || !strcmp(protocol_abi, XEN_IO_PROTO_ABI_NATIVE)) {
3209 /*
3210 * Assume native if the frontend has not
3211 * published ABI data or it has published and
3212 * matches our own ABI.
3213 */
3214 xbb->abi = BLKIF_PROTOCOL_NATIVE;
3215 } else if (!strcmp(protocol_abi, XEN_IO_PROTO_ABI_X86_32)) {
3216
3217 xbb->abi = BLKIF_PROTOCOL_X86_32;
3218 } else if (!strcmp(protocol_abi, XEN_IO_PROTO_ABI_X86_64)) {
3219
3220 xbb->abi = BLKIF_PROTOCOL_X86_64;
3221 } else {
3222
3223 xenbus_dev_fatal(xbb->dev, EINVAL,
3224 "Unknown protocol ABI (%s) published by "
3225 "frontend. Unable to connect.", protocol_abi);
3226 return (EINVAL);
3227 }
3228 return (0);
3229}
3230
3231/**
3232 * Allocate per-request data structures given request size and number
3233 * information negotiated with the front-end.
3234 *
3235 * \param xbb Per-instance xbb configuration structure.
3236 */
3237static int
3238xbb_alloc_requests(struct xbb_softc *xbb)
3239{
3240 struct xbb_xen_req *req;
3241 struct xbb_xen_req *last_req;
3242
3243 /*
3244	 * Allocate request bookkeeping data structures.
3245 */
3246 xbb->requests = malloc(xbb->max_requests * sizeof(*xbb->requests),
3247 M_XENBLOCKBACK, M_NOWAIT|M_ZERO);
3248 if (xbb->requests == NULL) {
3249 xenbus_dev_fatal(xbb->dev, ENOMEM,
3250 "Unable to allocate request structures");
3251 return (ENOMEM);
3252 }
3253
3254 req = xbb->requests;
3255 last_req = &xbb->requests[xbb->max_requests - 1];
3256 STAILQ_INIT(&xbb->request_free_stailq);
3257 while (req <= last_req) {
3258 STAILQ_INSERT_TAIL(&xbb->request_free_stailq, req, links);
3259 req++;
3260 }
3261 return (0);
3262}
3263
3264static int
3265xbb_alloc_request_lists(struct xbb_softc *xbb)
3266{
3267 struct xbb_xen_reqlist *reqlist;
3268 int i;
3269
3270 /*
3271	 * If no requests can be merged, we need one request list per
3272	 * in-flight request.
3273 */
3274 xbb->request_lists = malloc(xbb->max_requests *
3275 sizeof(*xbb->request_lists), M_XENBLOCKBACK, M_NOWAIT|M_ZERO);
3276 if (xbb->request_lists == NULL) {
3277 xenbus_dev_fatal(xbb->dev, ENOMEM,
3278 "Unable to allocate request list structures");
3279 return (ENOMEM);
3280 }
3281
3282 STAILQ_INIT(&xbb->reqlist_free_stailq);
3283 STAILQ_INIT(&xbb->reqlist_pending_stailq);
3284 for (i = 0; i < xbb->max_requests; i++) {
3285 int seg;
3286
3287 reqlist = &xbb->request_lists[i];
3288
3289 reqlist->xbb = xbb;
3290
3291#ifdef XBB_USE_BOUNCE_BUFFERS
3292 reqlist->bounce = malloc(xbb->max_reqlist_size,
3293 M_XENBLOCKBACK, M_NOWAIT);
3294 if (reqlist->bounce == NULL) {
3295 xenbus_dev_fatal(xbb->dev, ENOMEM,
3296 "Unable to allocate request "
3297 "bounce buffers");
3298 return (ENOMEM);
3299 }
3300#endif /* XBB_USE_BOUNCE_BUFFERS */
3301
3302 reqlist->gnt_handles = malloc(xbb->max_reqlist_segments *
3303 sizeof(*reqlist->gnt_handles),
3304 M_XENBLOCKBACK, M_NOWAIT|M_ZERO);
3305 if (reqlist->gnt_handles == NULL) {
3306 xenbus_dev_fatal(xbb->dev, ENOMEM,
3307 "Unable to allocate request "
3308 "grant references");
3309 return (ENOMEM);
3310 }
3311
3312 for (seg = 0; seg < xbb->max_reqlist_segments; seg++)
3313 reqlist->gnt_handles[seg] = GRANT_REF_INVALID;
3314
3315 STAILQ_INSERT_TAIL(&xbb->reqlist_free_stailq, reqlist, links);
3316 }
3317 return (0);
3318}
3319
3320/**
3321 * Supply information about the physical device to the frontend
3322 * via XenBus.
3323 *
3324 * \param xbb Per-instance xbb configuration structure.
3325 */
3326static int
3327xbb_publish_backend_info(struct xbb_softc *xbb)
3328{
3329 struct xs_transaction xst;
3330 const char *our_path;
3331 const char *leaf;
3332 int error;
3333
3334 our_path = xenbus_get_node(xbb->dev);
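	/*
	 * Publish the nodes inside a single XenStore transaction so the
	 * front-end never observes a partially written set; if the
	 * transaction ends with EAGAIN it raced with another update and
	 * is simply retried.
	 */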
3335 while (1) {
3336 error = xs_transaction_start(&xst);
3337 if (error != 0) {
3338 xenbus_dev_fatal(xbb->dev, error,
3339 "Error publishing backend info "
3340 "(start transaction)");
3341 return (error);
3342 }
3343
3344 leaf = "sectors";
3345 error = xs_printf(xst, our_path, leaf,
3346 "%"PRIu64, xbb->media_num_sectors);
3347 if (error != 0)
3348 break;
3349
3350 /* XXX Support all VBD attributes here. */
3351 leaf = "info";
3352 error = xs_printf(xst, our_path, leaf, "%u",
3353 xbb->flags & XBBF_READ_ONLY
3354 ? VDISK_READONLY : 0);
3355 if (error != 0)
3356 break;
3357
3358 leaf = "sector-size";
3359 error = xs_printf(xst, our_path, leaf, "%u",
3360 xbb->sector_size);
3361 if (error != 0)
3362 break;
3363
3364 error = xs_transaction_end(xst, 0);
3365 if (error == 0) {
3366 return (0);
3367 } else if (error != EAGAIN) {
3368 xenbus_dev_fatal(xbb->dev, error, "ending transaction");
3369 return (error);
3370 }
3371 }
3372
3373 xenbus_dev_fatal(xbb->dev, error, "writing %s/%s",
3374 our_path, leaf);
3375 xs_transaction_end(xst, 1);
3376 return (error);
3377}
3378
3379/**
3380 * Connect to our blkfront peer now that it has completed publishing
3381 * its configuration into the XenStore.
3382 *
3383 * \param xbb Per-instance xbb configuration structure.
3384 */
3385static void
3386xbb_connect(struct xbb_softc *xbb)
3387{
3388 int error;
3389
3390 if (xenbus_get_state(xbb->dev) == XenbusStateConnected)
3391 return;
3392
3393 if (xbb_collect_frontend_info(xbb) != 0)
3394 return;
3395
3396 xbb->flags &= ~XBBF_SHUTDOWN;
3397
3398 /*
3399 * We limit the maximum number of reqlist segments to the maximum
3400 * number of segments in the ring, or our absolute maximum,
3401 * whichever is smaller.
3402 */
3403 xbb->max_reqlist_segments = MIN(xbb->max_request_segments *
3404 xbb->max_requests, XBB_MAX_SEGMENTS_PER_REQLIST);
3405
3406 /*
3407 * The maximum size is simply a function of the number of segments
3408 * we can handle.
3409 */
3410 xbb->max_reqlist_size = xbb->max_reqlist_segments * PAGE_SIZE;
3411
3412 /* Allocate resources whose size depends on front-end configuration. */
3413 error = xbb_alloc_communication_mem(xbb);
3414 if (error != 0) {
3415 xenbus_dev_fatal(xbb->dev, error,
3416 "Unable to allocate communication memory");
3417 return;
3418 }
3419
3420 error = xbb_alloc_requests(xbb);
3421 if (error != 0) {
3422 /* Specific errors are reported by xbb_alloc_requests(). */
3423 return;
3424 }
3425
3426 error = xbb_alloc_request_lists(xbb);
3427 if (error != 0) {
3428 /* Specific errors are reported by xbb_alloc_request_lists(). */
3429 return;
3430 }
3431
3432 /*
3433 * Connect communication channel.
3434 */
3435 error = xbb_connect_ring(xbb);
3436 if (error != 0) {
3437 /* Specific errors are reported by xbb_connect_ring(). */
3438 return;
3439 }
3440
3441 if (xbb_publish_backend_info(xbb) != 0) {
3442 /*
3443 * If we can't publish our data, we cannot participate
3444 * in this connection, and waiting for a front-end state
3445 * change will not help the situation.
3446 */
3447 (void)xbb_disconnect(xbb);
3448 return;
3449 }
3450
3451 /* Ready for I/O. */
3452 xenbus_set_state(xbb->dev, XenbusStateConnected);
3453}
3454
3455/*-------------------------- Device Teardown Support -------------------------*/
3456/**
3457 * Perform device shutdown functions.
3458 *
3459 * \param xbb Per-instance xbb configuration structure.
3460 *
3461 * Mark this instance as shutting down, wait for any active I/O on the
3462 * backend device/file to drain, disconnect from the front-end, and notify
3463 * any waiters (e.g. a thread invoking our detach method) that detach can
3464 * now proceed.
3465 */
3466static int
3467xbb_shutdown(struct xbb_softc *xbb)
3468{
3469 XenbusState frontState;
3470 int error;
3471
3472 DPRINTF("\n");
3473
3474 /*
3475 * Due to the need to drop our mutex during some
3476 * xenbus operations, it is possible for two threads
3477 * to attempt to close out shutdown processing at
3478	 * the same time.  Tell the caller that hits this
3479	 * race to try again later.
3480 */
3481 if ((xbb->flags & XBBF_IN_SHUTDOWN) != 0)
3482 return (EAGAIN);
3483
3484 xbb->flags |= XBBF_IN_SHUTDOWN;
3485 mtx_unlock(&xbb->lock);
3486
3487 if (xenbus_get_state(xbb->dev) < XenbusStateClosing)
3488 xenbus_set_state(xbb->dev, XenbusStateClosing);
3489
3490 frontState = xenbus_get_otherend_state(xbb->dev);
3491 mtx_lock(&xbb->lock);
3492 xbb->flags &= ~XBBF_IN_SHUTDOWN;
3493
3494 /* The front can submit I/O until entering the closed state. */
3495 if (frontState < XenbusStateClosed)
3496 return (EAGAIN);
3497
3498 DPRINTF("\n");
3499
3500 /* Indicate shutdown is in progress. */
3501 xbb->flags |= XBBF_SHUTDOWN;
3502
3503 /* Disconnect from the front-end. */
3504 error = xbb_disconnect(xbb);
3505 if (error != 0) {
3506 /*
3507 * Requests still outstanding. We'll be called again
3508 * once they complete.
3509 */
3510 KASSERT(error == EAGAIN,
3511 ("%s: Unexpected xbb_disconnect() failure %d",
3512 __func__, error));
3513
3514 return (error);
3515 }
3516
3517 DPRINTF("\n");
3518
3519	/* Indicate to xbb_detach() that it is safe to proceed. */
3520 wakeup(xbb);
3521
3522 return (0);
3523}
3524
3525/**
3526 * Report an attach time error to the console and Xen, and cleanup
3527 * this instance by forcing immediate detach processing.
3528 *
3529 * \param xbb Per-instance xbb configuration structure.
3530 * \param err Errno describing the error.
3531 * \param fmt Printf style format and arguments
3532 */
3533static void
3534xbb_attach_failed(struct xbb_softc *xbb, int err, const char *fmt, ...)
3535{
3536 va_list ap;
3537 va_list ap_hotplug;
3538
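	/*
	 * The argument list is consumed twice, once for the XenStore
	 * "hotplug-error" node and again by xenbus_dev_vfatal(), so the
	 * first pass works on a copy.
	 */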
3539 va_start(ap, fmt);
3540 va_copy(ap_hotplug, ap);
3541 xs_vprintf(XST_NIL, xenbus_get_node(xbb->dev),
3542 "hotplug-error", fmt, ap_hotplug);
3543 va_end(ap_hotplug);
3544 xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
3545 "hotplug-status", "error");
3546
3547 xenbus_dev_vfatal(xbb->dev, err, fmt, ap);
3548 va_end(ap);
3549
3550 xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
3551 "online", "0");
3552 xbb_detach(xbb->dev);
3553}
3554
3555/*---------------------------- NewBus Entrypoints ----------------------------*/
3556/**
3557 * Inspect a XenBus device and claim it if it is of the appropriate type.
3558 *
3559 * \param dev NewBus device object representing a candidate XenBus device.
3560 *
3561 * \return 0 for success, errno codes for failure.
3562 */
3563static int
3564xbb_probe(device_t dev)
3565{
3566
3567 if (!strcmp(xenbus_get_type(dev), "vbd")) {
3568 device_set_desc(dev, "Backend Virtual Block Device");
3569 device_quiet(dev);
3570 return (0);
3571 }
3572
3573 return (ENXIO);
3574}
3575
3576/**
3577 * Setup sysctl variables to control various Block Back parameters.
3578 *
3579 * \param xbb Xen Block Back softc.
3580 *
3581 */
3582static void
3583xbb_setup_sysctl(struct xbb_softc *xbb)
3584{
3585 struct sysctl_ctx_list *sysctl_ctx = NULL;
3586 struct sysctl_oid *sysctl_tree = NULL;
3587
3588 sysctl_ctx = device_get_sysctl_ctx(xbb->dev);
3589 if (sysctl_ctx == NULL)
3590 return;
3591
3592 sysctl_tree = device_get_sysctl_tree(xbb->dev);
3593 if (sysctl_tree == NULL)
3594 return;
3595
3596 SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3597 "disable_flush", CTLFLAG_RW, &xbb->disable_flush, 0,
3598 "fake the flush command");
3599
3600 SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3601 "flush_interval", CTLFLAG_RW, &xbb->flush_interval, 0,
3602		       "send a real flush for every N flush requests");
3603
3604 SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3605 "no_coalesce_reqs", CTLFLAG_RW, &xbb->no_coalesce_reqs,0,
3606 "Don't coalesce contiguous requests");
3607
3608 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3609 "reqs_received", CTLFLAG_RW, &xbb->reqs_received,
3610 "how many I/O requests we have received");
3611
3612 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3613 "reqs_completed", CTLFLAG_RW, &xbb->reqs_completed,
3614 "how many I/O requests have been completed");
3615
3616 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3617 "forced_dispatch", CTLFLAG_RW, &xbb->forced_dispatch,
3618 "how many I/O dispatches were forced");
3619
3620 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3621 "normal_dispatch", CTLFLAG_RW, &xbb->normal_dispatch,
3622 "how many I/O dispatches were normal");
3623
3624 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3625 "total_dispatch", CTLFLAG_RW, &xbb->total_dispatch,
3626 "total number of I/O dispatches");
3627
3628 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3629 "kva_shortages", CTLFLAG_RW, &xbb->kva_shortages,
3630 "how many times we have run out of KVA");
3631
3632 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3633 "request_shortages", CTLFLAG_RW,
3634 &xbb->request_shortages,
3635 "how many times we have run out of requests");
3636
3637 SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3638 "max_requests", CTLFLAG_RD, &xbb->max_requests, 0,
3639 "maximum outstanding requests (negotiated)");
3640
3641 SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3642 "max_request_segments", CTLFLAG_RD,
3643 &xbb->max_request_segments, 0,
3644			"maximum number of pages per request (negotiated)");
3645
3646 SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3647 "max_request_size", CTLFLAG_RD,
3648 &xbb->max_request_size, 0,
3649 "maximum size in bytes of a request (negotiated)");
3650
3651 SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3652 "ring_pages", CTLFLAG_RD,
3653 &xbb->ring_config.ring_pages, 0,
3654 "communication channel pages (negotiated)");
3655}
3656
3657/**
3658 * Attach to a XenBus device that has been claimed by our probe routine.
3659 *
3660 * \param dev NewBus device object representing this Xen Block Back instance.
3661 *
3662 * \return 0 for success, errno codes for failure.
3663 */
3664static int
3665xbb_attach(device_t dev)
3666{
3667 struct xbb_softc *xbb;
3668 int error;
3669 u_int max_ring_page_order;
3670
3671 DPRINTF("Attaching to %s\n", xenbus_get_node(dev));
3672
3673 /*
3674 * Basic initialization.
3675 * After this block it is safe to call xbb_detach()
3676 * to clean up any allocated data for this instance.
3677 */
3678 xbb = device_get_softc(dev);
3679 xbb->dev = dev;
3680 xbb->otherend_id = xenbus_get_otherend_id(dev);
3681 TASK_INIT(&xbb->io_task, /*priority*/0, xbb_run_queue, xbb);
3682 mtx_init(&xbb->lock, device_get_nameunit(dev), NULL, MTX_DEF);
3683
3684 /*
3685 * Publish protocol capabilities for consumption by the
3686 * front-end.
3687 */
3688 error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
3689 "feature-barrier", "1");
3690 if (error) {
3691 xbb_attach_failed(xbb, error, "writing %s/feature-barrier",
3692 xenbus_get_node(xbb->dev));
3693 return (error);
3694 }
3695
3696 error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
3697 "feature-flush-cache", "1");
3698 if (error) {
3699 xbb_attach_failed(xbb, error, "writing %s/feature-flush-cache",
3700 xenbus_get_node(xbb->dev));
3701 return (error);
3702 }
3703
3704 /*
3705	 * Amazon EC2 client compatibility.  They refer to max-ring-pages
3706	 * instead of max-ring-page-order.
3707 */
3708 error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
3709 "max-ring-pages", "%zu", XBB_MAX_RING_PAGES);
3710 if (error) {
3711 xbb_attach_failed(xbb, error, "writing %s/max-ring-pages",
3712 xenbus_get_node(xbb->dev));
3713 return (error);
3714 }
3715
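	/*
	 * flsl() returns the 1-based index of the highest bit set, so for a
	 * power-of-two XBB_MAX_RING_PAGES this yields log2 of the page count.
	 */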
3716 max_ring_page_order = flsl(XBB_MAX_RING_PAGES) - 1;
3717 error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
3718 "max-ring-page-order", "%u", max_ring_page_order);
3719 if (error) {
3720 xbb_attach_failed(xbb, error, "writing %s/max-ring-page-order",
3721 xenbus_get_node(xbb->dev));
3722 return (error);
3723 }
3724
3725 error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
3726 "max-requests", "%u", XBB_MAX_REQUESTS);
3727 if (error) {
3728 xbb_attach_failed(xbb, error, "writing %s/max-requests",
3729 xenbus_get_node(xbb->dev));
3730 return (error);
3731 }
3732
3733 error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
3734 "max-request-segments", "%u",
3735 XBB_MAX_SEGMENTS_PER_REQUEST);
3736 if (error) {
3737 xbb_attach_failed(xbb, error, "writing %s/max-request-segments",
3738 xenbus_get_node(xbb->dev));
3739 return (error);
3740 }
3741
3742 error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
3743 "max-request-size", "%u",
3744 XBB_MAX_REQUEST_SIZE);
3745 if (error) {
3746 xbb_attach_failed(xbb, error, "writing %s/max-request-size",
3747 xenbus_get_node(xbb->dev));
3748 return (error);
3749 }
3750
3751 /* Collect physical device information. */
3752 error = xs_gather(XST_NIL, xenbus_get_otherend_path(xbb->dev),
3753 "device-type", NULL, &xbb->dev_type,
3754 NULL);
3755 if (error != 0)
3756 xbb->dev_type = NULL;
3757
3758 error = xs_gather(XST_NIL, xenbus_get_node(dev),
3759 "mode", NULL, &xbb->dev_mode,
3760 "params", NULL, &xbb->dev_name,
3761 NULL);
3762 if (error != 0) {
3763 xbb_attach_failed(xbb, error, "reading backend fields at %s",
3764 xenbus_get_node(dev));
3765 return (ENXIO);
3766 }
3767
3768 /* Parse fopen style mode flags. */
3769 if (strchr(xbb->dev_mode, 'w') == NULL)
3770 xbb->flags |= XBBF_READ_ONLY;
3771
3772 /*
3773 * Verify the physical device is present and can support
3774 * the desired I/O mode.
3775 */
3776 DROP_GIANT();
3777 error = xbb_open_backend(xbb);
3778 PICKUP_GIANT();
3779 if (error != 0) {
3780 xbb_attach_failed(xbb, error, "Unable to open %s",
3781 xbb->dev_name);
3782 return (ENXIO);
3783 }
3784
3785 /* Use devstat(9) for recording statistics. */
3786 xbb->xbb_stats = devstat_new_entry("xbb", device_get_unit(xbb->dev),
3787 xbb->sector_size,
3788 DEVSTAT_ALL_SUPPORTED,
3789 DEVSTAT_TYPE_DIRECT
3790 | DEVSTAT_TYPE_IF_OTHER,
3791 DEVSTAT_PRIORITY_OTHER);
3792
3793 xbb->xbb_stats_in = devstat_new_entry("xbbi", device_get_unit(xbb->dev),
3794 xbb->sector_size,
3795 DEVSTAT_ALL_SUPPORTED,
3796 DEVSTAT_TYPE_DIRECT
3797 | DEVSTAT_TYPE_IF_OTHER,
3798 DEVSTAT_PRIORITY_OTHER);
3799 /*
3800 * Setup sysctl variables.
3801 */
3802 xbb_setup_sysctl(xbb);
3803
3804 /*
3805 * Create a taskqueue for doing work that must occur from a
3806 * thread context.
3807 */
3808 xbb->io_taskqueue = taskqueue_create(device_get_nameunit(dev), M_NOWAIT,
3809 taskqueue_thread_enqueue,
3810 /*context*/&xbb->io_taskqueue);
3811 if (xbb->io_taskqueue == NULL) {
3812		xbb_attach_failed(xbb, ENOMEM, "Unable to create taskqueue");
3813 return (ENOMEM);
3814 }
3815
3816 taskqueue_start_threads(&xbb->io_taskqueue,
3817 /*num threads*/1,
3818 /*priority*/PWAIT,
3819 /*thread name*/
3820 "%s taskq", device_get_nameunit(dev));
3821
3822 /* Update hot-plug status to satisfy xend. */
3823 error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
3824 "hotplug-status", "connected");
3825 if (error) {
3826 xbb_attach_failed(xbb, error, "writing %s/hotplug-status",
3827 xenbus_get_node(xbb->dev));
3828 return (error);
3829 }
3830
3831 /* Tell the front end that we are ready to connect. */
3832 xenbus_set_state(dev, XenbusStateInitWait);
3833
3834 return (0);
3835}
3836
3837/**
3838 * Detach from a block back device instance.
3839 *
3840 * \param dev NewBus device object representing this Xen Block Back instance.
3841 *
3842 * \return 0 for success, errno codes for failure.
3843 *
3844 * \note A block back device may be detached at any time in its life-cycle,
3845 *       including partway through the attach process.  For this reason,
3846 *       initialization order and the initialization state checks in this
3847 * routine must be carefully coupled so that attach time failures
3848 * are gracefully handled.
3849 */
3850static int
3851xbb_detach(device_t dev)
3852{
3853 struct xbb_softc *xbb;
3854
3855 DPRINTF("\n");
3856
3857 xbb = device_get_softc(dev);
3858 mtx_lock(&xbb->lock);
3859 while (xbb_shutdown(xbb) == EAGAIN) {
3860 msleep(xbb, &xbb->lock, /*wakeup prio unchanged*/0,
3861 "xbb_shutdown", 0);
3862 }
3863 mtx_unlock(&xbb->lock);
3864
3865 DPRINTF("\n");
3866
3867 if (xbb->io_taskqueue != NULL)
3868 taskqueue_free(xbb->io_taskqueue);
3869
3870 if (xbb->xbb_stats != NULL)
3871 devstat_remove_entry(xbb->xbb_stats);
3872
3873 if (xbb->xbb_stats_in != NULL)
3874 devstat_remove_entry(xbb->xbb_stats_in);
3875
3876 xbb_close_backend(xbb);
3877
3878 if (xbb->dev_mode != NULL) {
3879 free(xbb->dev_mode, M_XENBUS);
3880 xbb->dev_mode = NULL;
3881 }
3882
3883 if (xbb->dev_type != NULL) {
3884 free(xbb->dev_type, M_XENBUS);
3885 xbb->dev_type = NULL;
3886 }
3887
3888 if (xbb->dev_name != NULL) {
3889 free(xbb->dev_name, M_XENBUS);
3890 xbb->dev_name = NULL;
3891 }
3892
3893 mtx_destroy(&xbb->lock);
3894 return (0);
3895}
3896
3897/**
3898 * Prepare this block back device for suspension of this VM.
3899 *
3900 * \param dev NewBus device object representing this Xen Block Back instance.
3901 *
3902 * \return 0 for success, errno codes for failure.
3903 */
3904static int
3905xbb_suspend(device_t dev)
3906{
3907#ifdef NOT_YET
3908 struct xbb_softc *sc = device_get_softc(dev);
3909
3910 /* Prevent new requests being issued until we fix things up. */
3911 mtx_lock(&sc->xb_io_lock);
3912 sc->connected = BLKIF_STATE_SUSPENDED;
3913 mtx_unlock(&sc->xb_io_lock);
3914#endif
3915
3916 return (0);
3917}
3918
3919/**
3920 * Perform any processing required to recover from a suspended state.
3921 *
3922 * \param dev NewBus device object representing this Xen Block Back instance.
3923 *
3924 * \return 0 for success, errno codes for failure.
3925 */
3926static int
3927xbb_resume(device_t dev)
3928{
3929 return (0);
3930}
3931
3932/**
3933 * Handle state changes expressed via the XenStore by our front-end peer.
3934 *
3935 * \param dev NewBus device object representing this Xen
3936 * Block Back instance.
3937 * \param frontend_state The new state of the front-end.
3940 */
3941static void
3942xbb_frontend_changed(device_t dev, XenbusState frontend_state)
3943{
3944 struct xbb_softc *xbb = device_get_softc(dev);
3945
3946 DPRINTF("frontend_state=%s, xbb_state=%s\n",
3947 xenbus_strstate(frontend_state),
3948 xenbus_strstate(xenbus_get_state(xbb->dev)));
3949
3950 switch (frontend_state) {
3951 case XenbusStateInitialising:
3952 break;
3953 case XenbusStateInitialised:
3954 case XenbusStateConnected:
3955 xbb_connect(xbb);
3956 break;
3957 case XenbusStateClosing:
3958 case XenbusStateClosed:
3959 mtx_lock(&xbb->lock);
3960 xbb_shutdown(xbb);
3961 mtx_unlock(&xbb->lock);
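		/*
		 * Only move to Closed once the front-end itself has reached
		 * Closed; a front-end in Closing may still be finishing up.
		 */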
3962 if (frontend_state == XenbusStateClosed)
3963 xenbus_set_state(xbb->dev, XenbusStateClosed);
3964 break;
3965 default:
3966 xenbus_dev_fatal(xbb->dev, EINVAL, "saw state %d at frontend",
3967 frontend_state);
3968 break;
3969 }
3970}
3971
3972/*---------------------------- NewBus Registration ---------------------------*/
3973static device_method_t xbb_methods[] = {
3974 /* Device interface */
3975 DEVMETHOD(device_probe, xbb_probe),
3976 DEVMETHOD(device_attach, xbb_attach),
3977 DEVMETHOD(device_detach, xbb_detach),
3978 DEVMETHOD(device_shutdown, bus_generic_shutdown),
3979 DEVMETHOD(device_suspend, xbb_suspend),
3980 DEVMETHOD(device_resume, xbb_resume),
3981
3982 /* Xenbus interface */
3983 DEVMETHOD(xenbus_otherend_changed, xbb_frontend_changed),
3984
3985 { 0, 0 }
3986};
3987
3988static driver_t xbb_driver = {
3989 "xbbd",
3990 xbb_methods,
3991 sizeof(struct xbb_softc),
3992};
3993devclass_t xbb_devclass;
3994
3995DRIVER_MODULE(xbbd, xenbusb_back, xbb_driver, xbb_devclass, 0, 0);
2751
2752 if (error != 0) {
2753 xbb_close_backend(xbb);
2754 return (error);
2755 }
2756
2757 xbb->sector_size_shift = fls(xbb->sector_size) - 1;
2758 xbb->media_num_sectors = xbb->media_size >> xbb->sector_size_shift;
2759
2760 DPRINTF("opened %s=%s sector_size=%u media_size=%" PRId64 "\n",
2761 (xbb->device_type == XBB_TYPE_DISK) ? "dev" : "file",
2762 xbb->dev_name, xbb->sector_size, xbb->media_size);
2763
2764 return (0);
2765}
2766
2767/*------------------------ Inter-Domain Communication ------------------------*/
2768/**
2769 * Free dynamically allocated KVA or pseudo-physical address allocations.
2770 *
2771 * \param xbb Per-instance xbb configuration structure.
2772 */
2773static void
2774xbb_free_communication_mem(struct xbb_softc *xbb)
2775{
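	/*
	 * In the XENHVM case the KVA is backed by the pseudo-physical
	 * resource, so releasing that resource frees it; in the PV case
	 * the region was allocated directly from kernel_map.
	 */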
2776 if (xbb->kva != 0) {
2777#ifndef XENHVM
2778 kmem_free(kernel_map, xbb->kva, xbb->kva_size);
2779#else
2780 if (xbb->pseudo_phys_res != NULL) {
2781 bus_release_resource(xbb->dev, SYS_RES_MEMORY,
2782 xbb->pseudo_phys_res_id,
2783 xbb->pseudo_phys_res);
2784 xbb->pseudo_phys_res = NULL;
2785 }
2786#endif
2787 }
2788 xbb->kva = 0;
2789 xbb->gnt_base_addr = 0;
2790 if (xbb->kva_free != NULL) {
2791 free(xbb->kva_free, M_XENBLOCKBACK);
2792 xbb->kva_free = NULL;
2793 }
2794}
2795
2796/**
2797 * Cleanup all inter-domain communication mechanisms.
2798 *
2799 * \param xbb Per-instance xbb configuration structure.
2800 */
2801static int
2802xbb_disconnect(struct xbb_softc *xbb)
2803{
2804 struct gnttab_unmap_grant_ref ops[XBB_MAX_RING_PAGES];
2805 struct gnttab_unmap_grant_ref *op;
2806 u_int ring_idx;
2807 int error;
2808
2809 DPRINTF("\n");
2810
2811 if ((xbb->flags & XBBF_RING_CONNECTED) == 0)
2812 return (0);
2813
2814 if (xbb->irq != 0) {
2815 unbind_from_irqhandler(xbb->irq);
2816 xbb->irq = 0;
2817 }
2818
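	/*
	 * taskqueue_drain() may sleep waiting for the I/O task to finish,
	 * so the softc mutex cannot be held across it.
	 */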
2819 mtx_unlock(&xbb->lock);
2820 taskqueue_drain(xbb->io_taskqueue, &xbb->io_task);
2821 mtx_lock(&xbb->lock);
2822
2823 /*
2824 * No new interrupts can generate work, but we must wait
2825 * for all currently active requests to drain.
2826 */
2827 if (xbb->active_request_count != 0)
2828 return (EAGAIN);
2829
2830 for (ring_idx = 0, op = ops;
2831 ring_idx < xbb->ring_config.ring_pages;
2832 ring_idx++, op++) {
2833
2834 op->host_addr = xbb->ring_config.gnt_addr
2835 + (ring_idx * PAGE_SIZE);
2836 op->dev_bus_addr = xbb->ring_config.bus_addr[ring_idx];
2837 op->handle = xbb->ring_config.handle[ring_idx];
2838 }
2839
2840 error = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, ops,
2841 xbb->ring_config.ring_pages);
2842 if (error != 0)
2843 panic("Grant table op failed (%d)", error);
2844
2845 xbb_free_communication_mem(xbb);
2846
2847 if (xbb->requests != NULL) {
2848 free(xbb->requests, M_XENBLOCKBACK);
2849 xbb->requests = NULL;
2850 }
2851
2852 if (xbb->request_lists != NULL) {
2853 struct xbb_xen_reqlist *reqlist;
2854 int i;
2855
2856		/* There is one request list for every allocated request. */
2857 for (i = 0, reqlist = xbb->request_lists;
2858 i < xbb->max_requests; i++, reqlist++){
2859#ifdef XBB_USE_BOUNCE_BUFFERS
2860 if (reqlist->bounce != NULL) {
2861 free(reqlist->bounce, M_XENBLOCKBACK);
2862 reqlist->bounce = NULL;
2863 }
2864#endif
2865 if (reqlist->gnt_handles != NULL) {
2866 free(reqlist->gnt_handles, M_XENBLOCKBACK);
2867 reqlist->gnt_handles = NULL;
2868 }
2869 }
2870 free(xbb->request_lists, M_XENBLOCKBACK);
2871 xbb->request_lists = NULL;
2872 }
2873
2874 xbb->flags &= ~XBBF_RING_CONNECTED;
2875 return (0);
2876}
2877
2878/**
2879 * Map shared memory ring into domain local address space, initialize
2880 * ring control structures, and bind an interrupt to the event channel
2881 * used to notify us of ring changes.
2882 *
2883 * \param xbb Per-instance xbb configuration structure.
2884 */
2885static int
2886xbb_connect_ring(struct xbb_softc *xbb)
2887{
2888 struct gnttab_map_grant_ref gnts[XBB_MAX_RING_PAGES];
2889 struct gnttab_map_grant_ref *gnt;
2890 u_int ring_idx;
2891 int error;
2892
2893 if ((xbb->flags & XBBF_RING_CONNECTED) != 0)
2894 return (0);
2895
2896 /*
2897 * Kva for our ring is at the tail of the region of kva allocated
2898 * by xbb_alloc_communication_mem().
2899 */
2900 xbb->ring_config.va = xbb->kva
2901 + (xbb->kva_size
2902 - (xbb->ring_config.ring_pages * PAGE_SIZE));
2903 xbb->ring_config.gnt_addr = xbb->gnt_base_addr
2904 + (xbb->kva_size
2905 - (xbb->ring_config.ring_pages * PAGE_SIZE));
2906
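	/*
	 * Build one grant-map operation per shared ring page, mapping the
	 * front-end's grant references into the KVA reserved at the tail
	 * of the communication region.
	 */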
2907 for (ring_idx = 0, gnt = gnts;
2908 ring_idx < xbb->ring_config.ring_pages;
2909 ring_idx++, gnt++) {
2910
2911 gnt->host_addr = xbb->ring_config.gnt_addr
2912 + (ring_idx * PAGE_SIZE);
2913 gnt->flags = GNTMAP_host_map;
2914 gnt->ref = xbb->ring_config.ring_ref[ring_idx];
2915 gnt->dom = xbb->otherend_id;
2916 }
2917
2918 error = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, gnts,
2919 xbb->ring_config.ring_pages);
2920 if (error)
2921 panic("blkback: Ring page grant table op failed (%d)", error);
2922
2923 for (ring_idx = 0, gnt = gnts;
2924 ring_idx < xbb->ring_config.ring_pages;
2925 ring_idx++, gnt++) {
2926 if (gnt->status != 0) {
2927 xbb->ring_config.va = 0;
2928 xenbus_dev_fatal(xbb->dev, EACCES,
2929 "Ring shared page mapping failed. "
2930 "Status %d.", gnt->status);
2931 return (EACCES);
2932 }
2933 xbb->ring_config.handle[ring_idx] = gnt->handle;
2934 xbb->ring_config.bus_addr[ring_idx] = gnt->dev_bus_addr;
2935 }
2936
2937 /* Initialize the ring based on ABI. */
2938 switch (xbb->abi) {
2939 case BLKIF_PROTOCOL_NATIVE:
2940 {
2941 blkif_sring_t *sring;
2942 sring = (blkif_sring_t *)xbb->ring_config.va;
2943 BACK_RING_INIT(&xbb->rings.native, sring,
2944 xbb->ring_config.ring_pages * PAGE_SIZE);
2945 break;
2946 }
2947 case BLKIF_PROTOCOL_X86_32:
2948 {
2949 blkif_x86_32_sring_t *sring_x86_32;
2950 sring_x86_32 = (blkif_x86_32_sring_t *)xbb->ring_config.va;
2951 BACK_RING_INIT(&xbb->rings.x86_32, sring_x86_32,
2952 xbb->ring_config.ring_pages * PAGE_SIZE);
2953 break;
2954 }
2955 case BLKIF_PROTOCOL_X86_64:
2956 {
2957 blkif_x86_64_sring_t *sring_x86_64;
2958 sring_x86_64 = (blkif_x86_64_sring_t *)xbb->ring_config.va;
2959 BACK_RING_INIT(&xbb->rings.x86_64, sring_x86_64,
2960 xbb->ring_config.ring_pages * PAGE_SIZE);
2961 break;
2962 }
2963 default:
2964 panic("Unexpected blkif protocol ABI.");
2965 }
2966
2967 xbb->flags |= XBBF_RING_CONNECTED;
2968
2969 error =
2970 bind_interdomain_evtchn_to_irqhandler(xbb->otherend_id,
2971 xbb->ring_config.evtchn,
2972 device_get_nameunit(xbb->dev),
2973 xbb_intr, /*arg*/xbb,
2974 INTR_TYPE_BIO | INTR_MPSAFE,
2975 &xbb->irq);
2976 if (error) {
2977 (void)xbb_disconnect(xbb);
2978 xenbus_dev_fatal(xbb->dev, error, "binding event channel");
2979 return (error);
2980 }
2981
2982 DPRINTF("rings connected!\n");
2983
2984	return (0);
2985}
2986
2987/* Needed to make bit_alloc() macro work */
2988#define calloc(count, size) malloc((count)*(size), M_XENBLOCKBACK, \
2989 M_NOWAIT|M_ZERO);
2990
2991/**
2992 * Size KVA and pseudo-physical address allocations based on negotiated
2993 * values for the size and number of I/O requests, and the size of our
2994 * communication ring.
2995 *
2996 * \param xbb Per-instance xbb configuration structure.
2997 *
2998 * These address spaces are used to dynamically map pages in the
2999 * front-end's domain into our own.
3000 */
3001static int
3002xbb_alloc_communication_mem(struct xbb_softc *xbb)
3003{
3004 xbb->reqlist_kva_pages = xbb->max_requests * xbb->max_request_segments;
3005 xbb->reqlist_kva_size = xbb->reqlist_kva_pages * PAGE_SIZE;
3006 xbb->kva_size = xbb->reqlist_kva_size +
3007 (xbb->ring_config.ring_pages * PAGE_SIZE);
3008
3009 xbb->kva_free = bit_alloc(xbb->reqlist_kva_pages);
3010 if (xbb->kva_free == NULL)
3011 return (ENOMEM);
3012
3013 DPRINTF("%s: kva_size = %d, reqlist_kva_size = %d\n",
3014 device_get_nameunit(xbb->dev), xbb->kva_size,
3015 xbb->reqlist_kva_size);
3016#ifndef XENHVM
3017 xbb->kva = kmem_alloc_nofault(kernel_map, xbb->kva_size);
3018 if (xbb->kva == 0)
3019 return (ENOMEM);
3020 xbb->gnt_base_addr = xbb->kva;
3021#else /* XENHVM */
3022 /*
3023 * Reserve a range of pseudo physical memory that we can map
3024 * into kva. These pages will only be backed by machine
3025 * pages ("real memory") during the lifetime of front-end requests
3026 * via grant table operations.
3027 */
3028 xbb->pseudo_phys_res_id = 0;
3029 xbb->pseudo_phys_res = bus_alloc_resource(xbb->dev, SYS_RES_MEMORY,
3030 &xbb->pseudo_phys_res_id,
3031 0, ~0, xbb->kva_size,
3032 RF_ACTIVE);
3033 if (xbb->pseudo_phys_res == NULL) {
3034 xbb->kva = 0;
3035 return (ENOMEM);
3036 }
3037 xbb->kva = (vm_offset_t)rman_get_virtual(xbb->pseudo_phys_res);
3038 xbb->gnt_base_addr = rman_get_start(xbb->pseudo_phys_res);
3039#endif /* XENHVM */
3040
3041 DPRINTF("%s: kva: %#jx, gnt_base_addr: %#jx\n",
3042 device_get_nameunit(xbb->dev), (uintmax_t)xbb->kva,
3043 (uintmax_t)xbb->gnt_base_addr);
3044 return (0);
3045}
3046
3047/**
3048 * Collect front-end information from the XenStore.
3049 *
3050 * \param xbb Per-instance xbb configuration structure.
3051 */
3052static int
3053xbb_collect_frontend_info(struct xbb_softc *xbb)
3054{
3055 char protocol_abi[64];
3056 const char *otherend_path;
3057 int error;
3058 u_int ring_idx;
3059 u_int ring_page_order;
3060 size_t ring_size;
3061
3062 otherend_path = xenbus_get_otherend_path(xbb->dev);
3063
3064 /*
3065 * Protocol defaults valid even if all negotiation fails.
3066 */
3067 xbb->ring_config.ring_pages = 1;
3068 xbb->max_request_segments = BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK;
3069 xbb->max_request_size = xbb->max_request_segments * PAGE_SIZE;
3070
3071 /*
3072 * Mandatory data (used in all versions of the protocol) first.
3073 */
3074 error = xs_scanf(XST_NIL, otherend_path,
3075 "event-channel", NULL, "%" PRIu32,
3076 &xbb->ring_config.evtchn);
3077 if (error != 0) {
3078 xenbus_dev_fatal(xbb->dev, error,
3079 "Unable to retrieve event-channel information "
3080 "from frontend %s. Unable to connect.",
3081 xenbus_get_otherend_path(xbb->dev));
3082 return (error);
3083 }
3084
3085 /*
3086 * These fields are initialized to legacy protocol defaults
3087 * so we only need to fail if reading the updated value succeeds
3088 * and the new value is outside of its allowed range.
3089 *
3090 * \note xs_gather() returns on the first encountered error, so
3091	 *       we must use independent calls in order to guarantee
3092	 *       we don't miss information in a sparsely populated front-end
3093 * tree.
3094 *
3095 * \note xs_scanf() does not update variables for unmatched
3096 * fields.
3097 */
3098 ring_page_order = 0;
3099 (void)xs_scanf(XST_NIL, otherend_path,
3100 "ring-page-order", NULL, "%u",
3101 &ring_page_order);
3102 xbb->ring_config.ring_pages = 1 << ring_page_order;
3103 (void)xs_scanf(XST_NIL, otherend_path,
3104 "num-ring-pages", NULL, "%u",
3105 &xbb->ring_config.ring_pages);
3106 ring_size = PAGE_SIZE * xbb->ring_config.ring_pages;
3107 xbb->max_requests = BLKIF_MAX_RING_REQUESTS(ring_size);
3108
3109 (void)xs_scanf(XST_NIL, otherend_path,
3110 "max-requests", NULL, "%u",
3111 &xbb->max_requests);
3112
3113 (void)xs_scanf(XST_NIL, otherend_path,
3114 "max-request-segments", NULL, "%u",
3115 &xbb->max_request_segments);
3116
3117 (void)xs_scanf(XST_NIL, otherend_path,
3118 "max-request-size", NULL, "%u",
3119 &xbb->max_request_size);
3120
3121 if (xbb->ring_config.ring_pages > XBB_MAX_RING_PAGES) {
3122 xenbus_dev_fatal(xbb->dev, EINVAL,
3123 "Front-end specified ring-pages of %u "
3124 "exceeds backend limit of %zu. "
3125 "Unable to connect.",
3126 xbb->ring_config.ring_pages,
3127 XBB_MAX_RING_PAGES);
3128 return (EINVAL);
3129 } else if (xbb->max_requests > XBB_MAX_REQUESTS) {
3130 xenbus_dev_fatal(xbb->dev, EINVAL,
3131 "Front-end specified max_requests of %u "
3132 "exceeds backend limit of %u. "
3133 "Unable to connect.",
3134 xbb->max_requests,
3135 XBB_MAX_REQUESTS);
3136 return (EINVAL);
3137 } else if (xbb->max_request_segments > XBB_MAX_SEGMENTS_PER_REQUEST) {
3138 xenbus_dev_fatal(xbb->dev, EINVAL,
3139				  "Front-end specified max_request_segments "
3140 "of %u exceeds backend limit of %u. "
3141 "Unable to connect.",
3142 xbb->max_request_segments,
3143 XBB_MAX_SEGMENTS_PER_REQUEST);
3144 return (EINVAL);
3145 } else if (xbb->max_request_size > XBB_MAX_REQUEST_SIZE) {
3146 xenbus_dev_fatal(xbb->dev, EINVAL,
3147 "Front-end specified max_request_size "
3148 "of %u exceeds backend limit of %u. "
3149 "Unable to connect.",
3150 xbb->max_request_size,
3151 XBB_MAX_REQUEST_SIZE);
3152 return (EINVAL);
3153 }
3154
3155 if (xbb->ring_config.ring_pages == 1) {
3156 error = xs_gather(XST_NIL, otherend_path,
3157 "ring-ref", "%" PRIu32,
3158 &xbb->ring_config.ring_ref[0],
3159 NULL);
3160 if (error != 0) {
3161 xenbus_dev_fatal(xbb->dev, error,
3162 "Unable to retrieve ring information "
3163 "from frontend %s. Unable to "
3164 "connect.",
3165 xenbus_get_otherend_path(xbb->dev));
3166 return (error);
3167 }
3168 } else {
3169 /* Multi-page ring format. */
3170 for (ring_idx = 0; ring_idx < xbb->ring_config.ring_pages;
3171 ring_idx++) {
3172 char ring_ref_name[]= "ring_refXX";
3173
3174 snprintf(ring_ref_name, sizeof(ring_ref_name),
3175 "ring-ref%u", ring_idx);
3176 error = xs_scanf(XST_NIL, otherend_path,
3177 ring_ref_name, NULL, "%" PRIu32,
3178 &xbb->ring_config.ring_ref[ring_idx]);
3179 if (error != 0) {
3180 xenbus_dev_fatal(xbb->dev, error,
3181						 "Failed to retrieve grant "
3182 "reference for page %u of "
3183 "shared ring. Unable "
3184 "to connect.", ring_idx);
3185 return (error);
3186 }
3187 }
3188 }
3189
3190 error = xs_gather(XST_NIL, otherend_path,
3191 "protocol", "%63s", protocol_abi,
3192 NULL);
3193 if (error != 0
3194 || !strcmp(protocol_abi, XEN_IO_PROTO_ABI_NATIVE)) {
3195 /*
3196 * Assume native if the frontend has not
3197 * published ABI data or it has published and
3198 * matches our own ABI.
3199 */
3200 xbb->abi = BLKIF_PROTOCOL_NATIVE;
3201 } else if (!strcmp(protocol_abi, XEN_IO_PROTO_ABI_X86_32)) {
3202
3203 xbb->abi = BLKIF_PROTOCOL_X86_32;
3204 } else if (!strcmp(protocol_abi, XEN_IO_PROTO_ABI_X86_64)) {
3205
3206 xbb->abi = BLKIF_PROTOCOL_X86_64;
3207 } else {
3208
3209 xenbus_dev_fatal(xbb->dev, EINVAL,
3210 "Unknown protocol ABI (%s) published by "
3211 "frontend. Unable to connect.", protocol_abi);
3212 return (EINVAL);
3213 }
3214 return (0);
3215}
3216
3217/**
3218 * Allocate per-request data structures given request size and number
3219 * information negotiated with the front-end.
3220 *
3221 * \param xbb Per-instance xbb configuration structure.
3222 */
3223static int
3224xbb_alloc_requests(struct xbb_softc *xbb)
3225{
3226 struct xbb_xen_req *req;
3227 struct xbb_xen_req *last_req;
3228
3229 /*
3230	 * Allocate request bookkeeping data structures.
3231 */
3232 xbb->requests = malloc(xbb->max_requests * sizeof(*xbb->requests),
3233 M_XENBLOCKBACK, M_NOWAIT|M_ZERO);
3234 if (xbb->requests == NULL) {
3235 xenbus_dev_fatal(xbb->dev, ENOMEM,
3236 "Unable to allocate request structures");
3237 return (ENOMEM);
3238 }
3239
3240 req = xbb->requests;
3241 last_req = &xbb->requests[xbb->max_requests - 1];
3242 STAILQ_INIT(&xbb->request_free_stailq);
3243 while (req <= last_req) {
3244 STAILQ_INSERT_TAIL(&xbb->request_free_stailq, req, links);
3245 req++;
3246 }
3247 return (0);
3248}
3249
3250static int
3251xbb_alloc_request_lists(struct xbb_softc *xbb)
3252{
3253 struct xbb_xen_reqlist *reqlist;
3254 int i;
3255
3256 /*
3257	 * If no requests can be merged, we need one request list per
3258	 * in-flight request.
3259 */
3260 xbb->request_lists = malloc(xbb->max_requests *
3261 sizeof(*xbb->request_lists), M_XENBLOCKBACK, M_NOWAIT|M_ZERO);
3262 if (xbb->request_lists == NULL) {
3263 xenbus_dev_fatal(xbb->dev, ENOMEM,
3264 "Unable to allocate request list structures");
3265 return (ENOMEM);
3266 }
3267
3268 STAILQ_INIT(&xbb->reqlist_free_stailq);
3269 STAILQ_INIT(&xbb->reqlist_pending_stailq);
3270 for (i = 0; i < xbb->max_requests; i++) {
3271 int seg;
3272
3273 reqlist = &xbb->request_lists[i];
3274
3275 reqlist->xbb = xbb;
3276
3277#ifdef XBB_USE_BOUNCE_BUFFERS
3278 reqlist->bounce = malloc(xbb->max_reqlist_size,
3279 M_XENBLOCKBACK, M_NOWAIT);
3280 if (reqlist->bounce == NULL) {
3281 xenbus_dev_fatal(xbb->dev, ENOMEM,
3282 "Unable to allocate request "
3283 "bounce buffers");
3284 return (ENOMEM);
3285 }
3286#endif /* XBB_USE_BOUNCE_BUFFERS */
3287
3288 reqlist->gnt_handles = malloc(xbb->max_reqlist_segments *
3289 sizeof(*reqlist->gnt_handles),
3290 M_XENBLOCKBACK, M_NOWAIT|M_ZERO);
3291 if (reqlist->gnt_handles == NULL) {
3292 xenbus_dev_fatal(xbb->dev, ENOMEM,
3293 "Unable to allocate request "
3294 "grant references");
3295 return (ENOMEM);
3296 }
3297
3298 for (seg = 0; seg < xbb->max_reqlist_segments; seg++)
3299 reqlist->gnt_handles[seg] = GRANT_REF_INVALID;
3300
3301 STAILQ_INSERT_TAIL(&xbb->reqlist_free_stailq, reqlist, links);
3302 }
3303 return (0);
3304}
3305
3306/**
3307 * Supply information about the physical device to the frontend
3308 * via XenBus.
3309 *
3310 * \param xbb Per-instance xbb configuration structure.
3311 */
3312static int
3313xbb_publish_backend_info(struct xbb_softc *xbb)
3314{
3315 struct xs_transaction xst;
3316 const char *our_path;
3317 const char *leaf;
3318 int error;
3319
3320 our_path = xenbus_get_node(xbb->dev);
3321 while (1) {
3322 error = xs_transaction_start(&xst);
3323 if (error != 0) {
3324 xenbus_dev_fatal(xbb->dev, error,
3325 "Error publishing backend info "
3326 "(start transaction)");
3327 return (error);
3328 }
3329
3330 leaf = "sectors";
3331 error = xs_printf(xst, our_path, leaf,
3332 "%"PRIu64, xbb->media_num_sectors);
3333 if (error != 0)
3334 break;
3335
3336 /* XXX Support all VBD attributes here. */
3337 leaf = "info";
3338 error = xs_printf(xst, our_path, leaf, "%u",
3339 xbb->flags & XBBF_READ_ONLY
3340 ? VDISK_READONLY : 0);
3341 if (error != 0)
3342 break;
3343
3344 leaf = "sector-size";
3345 error = xs_printf(xst, our_path, leaf, "%u",
3346 xbb->sector_size);
3347 if (error != 0)
3348 break;
3349
3350 error = xs_transaction_end(xst, 0);
3351 if (error == 0) {
3352 return (0);
3353 } else if (error != EAGAIN) {
3354 xenbus_dev_fatal(xbb->dev, error, "ending transaction");
3355 return (error);
3356 }
3357 }
3358
3359 xenbus_dev_fatal(xbb->dev, error, "writing %s/%s",
3360 our_path, leaf);
3361 xs_transaction_end(xst, 1);
3362 return (error);
3363}
3364
3365/**
3366 * Connect to our blkfront peer now that it has completed publishing
3367 * its configuration into the XenStore.
3368 *
3369 * \param xbb Per-instance xbb configuration structure.
3370 */
3371static void
3372xbb_connect(struct xbb_softc *xbb)
3373{
3374 int error;
3375
3376 if (xenbus_get_state(xbb->dev) == XenbusStateConnected)
3377 return;
3378
3379 if (xbb_collect_frontend_info(xbb) != 0)
3380 return;
3381
3382 xbb->flags &= ~XBBF_SHUTDOWN;
3383
3384 /*
3385 * We limit the maximum number of reqlist segments to the maximum
3386 * number of segments in the ring, or our absolute maximum,
3387 * whichever is smaller.
3388 */
3389 xbb->max_reqlist_segments = MIN(xbb->max_request_segments *
3390 xbb->max_requests, XBB_MAX_SEGMENTS_PER_REQLIST);
3391
3392 /*
3393 * The maximum size is simply a function of the number of segments
3394 * we can handle.
3395 */
3396 xbb->max_reqlist_size = xbb->max_reqlist_segments * PAGE_SIZE;
3397
3398 /* Allocate resources whose size depends on front-end configuration. */
3399 error = xbb_alloc_communication_mem(xbb);
3400 if (error != 0) {
3401 xenbus_dev_fatal(xbb->dev, error,
3402 "Unable to allocate communication memory");
3403 return;
3404 }
3405
3406 error = xbb_alloc_requests(xbb);
3407 if (error != 0) {
3408 /* Specific errors are reported by xbb_alloc_requests(). */
3409 return;
3410 }
3411
3412 error = xbb_alloc_request_lists(xbb);
3413 if (error != 0) {
3414 /* Specific errors are reported by xbb_alloc_request_lists(). */
3415 return;
3416 }
3417
3418 /*
3419 * Connect communication channel.
3420 */
3421 error = xbb_connect_ring(xbb);
3422 if (error != 0) {
3423 /* Specific errors are reported by xbb_connect_ring(). */
3424 return;
3425 }
3426
3427 if (xbb_publish_backend_info(xbb) != 0) {
3428 /*
3429 * If we can't publish our data, we cannot participate
3430 * in this connection, and waiting for a front-end state
3431 * change will not help the situation.
3432 */
3433 (void)xbb_disconnect(xbb);
3434 return;
3435 }
3436
3437 /* Ready for I/O. */
3438 xenbus_set_state(xbb->dev, XenbusStateConnected);
3439}
3440
3441/*-------------------------- Device Teardown Support -------------------------*/
3442/**
3443 * Perform device shutdown functions.
3444 *
3445 * \param xbb Per-instance xbb configuration structure.
3446 *
3447 * Mark this instance as shutting down, wait for any active I/O on the
3448 * backend device/file to drain, disconnect from the front-end, and notify
3449 * any waiters (e.g. a thread invoking our detach method) that detach can
3450 * now proceed.
3451 */
3452static int
3453xbb_shutdown(struct xbb_softc *xbb)
3454{
3455 XenbusState frontState;
3456 int error;
3457
3458 DPRINTF("\n");
3459
3460 /*
3461 * Due to the need to drop our mutex during some
3462 * xenbus operations, it is possible for two threads
3463 * to attempt to close out shutdown processing at
3464	 * the same time.  Tell the caller that hits this
3465	 * race to try again later.
3466 */
3467 if ((xbb->flags & XBBF_IN_SHUTDOWN) != 0)
3468 return (EAGAIN);
3469
3470 xbb->flags |= XBBF_IN_SHUTDOWN;
3471 mtx_unlock(&xbb->lock);
3472
3473 if (xenbus_get_state(xbb->dev) < XenbusStateClosing)
3474 xenbus_set_state(xbb->dev, XenbusStateClosing);
3475
3476 frontState = xenbus_get_otherend_state(xbb->dev);
3477 mtx_lock(&xbb->lock);
3478 xbb->flags &= ~XBBF_IN_SHUTDOWN;
3479
3480 /* The front can submit I/O until entering the closed state. */
3481 if (frontState < XenbusStateClosed)
3482 return (EAGAIN);
3483
3484 DPRINTF("\n");
3485
3486 /* Indicate shutdown is in progress. */
3487 xbb->flags |= XBBF_SHUTDOWN;
3488
3489 /* Disconnect from the front-end. */
3490 error = xbb_disconnect(xbb);
3491 if (error != 0) {
3492 /*
3493 * Requests still outstanding. We'll be called again
3494 * once they complete.
3495 */
3496 KASSERT(error == EAGAIN,
3497 ("%s: Unexpected xbb_disconnect() failure %d",
3498 __func__, error));
3499
3500 return (error);
3501 }
3502
3503 DPRINTF("\n");
3504
3505	/* Indicate to xbb_detach() that it is safe to proceed. */
3506 wakeup(xbb);
3507
3508 return (0);
3509}
3510
3511/**
3512 * Report an attach time error to the console and Xen, and cleanup
3513 * this instance by forcing immediate detach processing.
3514 *
3515 * \param xbb Per-instance xbb configuration structure.
3516 * \param err Errno describing the error.
3517 * \param fmt Printf style format and arguments
3518 */
3519static void
3520xbb_attach_failed(struct xbb_softc *xbb, int err, const char *fmt, ...)
3521{
3522 va_list ap;
3523 va_list ap_hotplug;
3524
3525 va_start(ap, fmt);
3526 va_copy(ap_hotplug, ap);
3527 xs_vprintf(XST_NIL, xenbus_get_node(xbb->dev),
3528 "hotplug-error", fmt, ap_hotplug);
3529 va_end(ap_hotplug);
3530 xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
3531 "hotplug-status", "error");
3532
3533 xenbus_dev_vfatal(xbb->dev, err, fmt, ap);
3534 va_end(ap);
3535
3536 xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
3537 "online", "0");
3538 xbb_detach(xbb->dev);
3539}
3540
3541/*---------------------------- NewBus Entrypoints ----------------------------*/
3542/**
3543 * Inspect a XenBus device and claim it if it is of the appropriate type.
3544 *
3545 * \param dev NewBus device object representing a candidate XenBus device.
3546 *
3547 * \return 0 for success, errno codes for failure.
3548 */
3549static int
3550xbb_probe(device_t dev)
3551{
3552
3553 if (!strcmp(xenbus_get_type(dev), "vbd")) {
3554 device_set_desc(dev, "Backend Virtual Block Device");
3555 device_quiet(dev);
3556 return (0);
3557 }
3558
3559 return (ENXIO);
3560}
3561
3562/**
3563 * Setup sysctl variables to control various Block Back parameters.
3564 *
3565 * \param xbb Xen Block Back softc.
3566 *
3567 */
3568static void
3569xbb_setup_sysctl(struct xbb_softc *xbb)
3570{
3571 struct sysctl_ctx_list *sysctl_ctx = NULL;
3572 struct sysctl_oid *sysctl_tree = NULL;
3573
3574 sysctl_ctx = device_get_sysctl_ctx(xbb->dev);
3575 if (sysctl_ctx == NULL)
3576 return;
3577
3578 sysctl_tree = device_get_sysctl_tree(xbb->dev);
3579 if (sysctl_tree == NULL)
3580 return;
3581
3582 SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3583 "disable_flush", CTLFLAG_RW, &xbb->disable_flush, 0,
3584 "fake the flush command");
3585
3586 SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3587 "flush_interval", CTLFLAG_RW, &xbb->flush_interval, 0,
3588		       "send a real flush for every N flush requests");
3589
3590 SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3591 "no_coalesce_reqs", CTLFLAG_RW, &xbb->no_coalesce_reqs,0,
3592 "Don't coalesce contiguous requests");
3593
3594 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3595 "reqs_received", CTLFLAG_RW, &xbb->reqs_received,
3596 "how many I/O requests we have received");
3597
3598 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3599 "reqs_completed", CTLFLAG_RW, &xbb->reqs_completed,
3600 "how many I/O requests have been completed");
3601
3602 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3603 "forced_dispatch", CTLFLAG_RW, &xbb->forced_dispatch,
3604 "how many I/O dispatches were forced");
3605
3606 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3607 "normal_dispatch", CTLFLAG_RW, &xbb->normal_dispatch,
3608 "how many I/O dispatches were normal");
3609
3610 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3611 "total_dispatch", CTLFLAG_RW, &xbb->total_dispatch,
3612 "total number of I/O dispatches");
3613
3614 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3615 "kva_shortages", CTLFLAG_RW, &xbb->kva_shortages,
3616 "how many times we have run out of KVA");
3617
3618 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3619 "request_shortages", CTLFLAG_RW,
3620 &xbb->request_shortages,
3621 "how many times we have run out of requests");
3622
3623 SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3624 "max_requests", CTLFLAG_RD, &xbb->max_requests, 0,
3625 "maximum outstanding requests (negotiated)");
3626
3627 SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3628 "max_request_segments", CTLFLAG_RD,
3629 &xbb->max_request_segments, 0,
3630			"maximum number of pages per request (negotiated)");
3631
3632 SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3633 "max_request_size", CTLFLAG_RD,
3634 &xbb->max_request_size, 0,
3635 "maximum size in bytes of a request (negotiated)");
3636
3637 SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3638 "ring_pages", CTLFLAG_RD,
3639 &xbb->ring_config.ring_pages, 0,
3640 "communication channel pages (negotiated)");
3641}
3642
3643/**
3644 * Attach to a XenBus device that has been claimed by our probe routine.
3645 *
3646 * \param dev NewBus device object representing this Xen Block Back instance.
3647 *
3648 * \return 0 for success, errno codes for failure.
3649 */
3650static int
3651xbb_attach(device_t dev)
3652{
3653 struct xbb_softc *xbb;
3654 int error;
3655 u_int max_ring_page_order;
3656
3657 DPRINTF("Attaching to %s\n", xenbus_get_node(dev));
3658
3659 /*
3660 * Basic initialization.
3661 * After this block it is safe to call xbb_detach()
3662 * to clean up any allocated data for this instance.
3663 */
3664 xbb = device_get_softc(dev);
3665 xbb->dev = dev;
3666 xbb->otherend_id = xenbus_get_otherend_id(dev);
3667 TASK_INIT(&xbb->io_task, /*priority*/0, xbb_run_queue, xbb);
3668 mtx_init(&xbb->lock, device_get_nameunit(dev), NULL, MTX_DEF);
3669
3670 /*
3671 * Publish protocol capabilities for consumption by the
3672 * front-end.
3673 */
3674 error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
3675 "feature-barrier", "1");
3676 if (error) {
3677 xbb_attach_failed(xbb, error, "writing %s/feature-barrier",
3678 xenbus_get_node(xbb->dev));
3679 return (error);
3680 }
3681
3682 error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
3683 "feature-flush-cache", "1");
3684 if (error) {
3685 xbb_attach_failed(xbb, error, "writing %s/feature-flush-cache",
3686 xenbus_get_node(xbb->dev));
3687 return (error);
3688 }
3689
3690 /*
3691	 * Amazon EC2 client compatibility.  They refer to max-ring-pages
3692	 * instead of max-ring-page-order.
3693 */
3694 error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
3695 "max-ring-pages", "%zu", XBB_MAX_RING_PAGES);
3696 if (error) {
3697 xbb_attach_failed(xbb, error, "writing %s/max-ring-pages",
3698 xenbus_get_node(xbb->dev));
3699 return (error);
3700 }
3701
3702 max_ring_page_order = flsl(XBB_MAX_RING_PAGES) - 1;
3703 error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
3704 "max-ring-page-order", "%u", max_ring_page_order);
3705 if (error) {
3706 xbb_attach_failed(xbb, error, "writing %s/max-ring-page-order",
3707 xenbus_get_node(xbb->dev));
3708 return (error);
3709 }
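
	/*
	 * Note: max-ring-pages and max-ring-page-order advertise the same
	 * limit in two forms.  For example, if XBB_MAX_RING_PAGES were 32,
	 * the front-end would see max-ring-pages = 32 and
	 * max-ring-page-order = 5 (flsl(32) - 1 == 5, and 32 == 1 << 5).
	 */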
3710
3711 error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
3712 "max-requests", "%u", XBB_MAX_REQUESTS);
3713 if (error) {
3714 xbb_attach_failed(xbb, error, "writing %s/max-requests",
3715 xenbus_get_node(xbb->dev));
3716 return (error);
3717 }
3718
3719 error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
3720 "max-request-segments", "%u",
3721 XBB_MAX_SEGMENTS_PER_REQUEST);
3722 if (error) {
3723 xbb_attach_failed(xbb, error, "writing %s/max-request-segments",
3724 xenbus_get_node(xbb->dev));
3725 return (error);
3726 }
3727
3728 error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
3729 "max-request-size", "%u",
3730 XBB_MAX_REQUEST_SIZE);
3731 if (error) {
3732 xbb_attach_failed(xbb, error, "writing %s/max-request-size",
3733 xenbus_get_node(xbb->dev));
3734 return (error);
3735 }
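
	/*
	 * At this point the backend's XenStore node (for a virtual block
	 * device typically something like
	 * backend/vbd/<front-end domid>/<handle>) advertises
	 * feature-barrier, feature-flush-cache, max-ring-pages,
	 * max-ring-page-order, max-requests, max-request-segments and
	 * max-request-size for the front-end to consume during ring
	 * negotiation.
	 */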
3736
3737 /* Collect physical device information. */
3738 error = xs_gather(XST_NIL, xenbus_get_otherend_path(xbb->dev),
3739 "device-type", NULL, &xbb->dev_type,
3740 NULL);
3741 if (error != 0)
3742 xbb->dev_type = NULL;
3743
3744 error = xs_gather(XST_NIL, xenbus_get_node(dev),
3745 "mode", NULL, &xbb->dev_mode,
3746 "params", NULL, &xbb->dev_name,
3747 NULL);
3748 if (error != 0) {
3749 xbb_attach_failed(xbb, error, "reading backend fields at %s",
3750 xenbus_get_node(dev));
3751 return (ENXIO);
3752 }
3753
3754	/* Parse fopen-style mode flags. */
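	/*
	 * e.g. a mode string of "w" or "rw" leaves the device writable;
	 * any mode without a 'w' (such as "r") marks this instance
	 * read-only.
	 */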
3755 if (strchr(xbb->dev_mode, 'w') == NULL)
3756 xbb->flags |= XBBF_READ_ONLY;
3757
3758 /*
3759 * Verify the physical device is present and can support
3760 * the desired I/O mode.
3761 */
3762 DROP_GIANT();
3763 error = xbb_open_backend(xbb);
3764 PICKUP_GIANT();
3765 if (error != 0) {
3766 xbb_attach_failed(xbb, error, "Unable to open %s",
3767 xbb->dev_name);
3768 return (ENXIO);
3769 }
3770
3771 /* Use devstat(9) for recording statistics. */
3772 xbb->xbb_stats = devstat_new_entry("xbb", device_get_unit(xbb->dev),
3773 xbb->sector_size,
3774 DEVSTAT_ALL_SUPPORTED,
3775 DEVSTAT_TYPE_DIRECT
3776 | DEVSTAT_TYPE_IF_OTHER,
3777 DEVSTAT_PRIORITY_OTHER);
3778
3779 xbb->xbb_stats_in = devstat_new_entry("xbbi", device_get_unit(xbb->dev),
3780 xbb->sector_size,
3781 DEVSTAT_ALL_SUPPORTED,
3782 DEVSTAT_TYPE_DIRECT
3783 | DEVSTAT_TYPE_IF_OTHER,
3784 DEVSTAT_PRIORITY_OTHER);
3785 /*
3786	 * Set up sysctl variables.
3787 */
3788 xbb_setup_sysctl(xbb);
3789
3790 /*
3791 * Create a taskqueue for doing work that must occur from a
3792 * thread context.
3793 */
3794 xbb->io_taskqueue = taskqueue_create(device_get_nameunit(dev), M_NOWAIT,
3795 taskqueue_thread_enqueue,
3796 /*context*/&xbb->io_taskqueue);
3797 if (xbb->io_taskqueue == NULL) {
3798		xbb_attach_failed(xbb, ENOMEM, "Unable to create taskqueue");
3799 return (ENOMEM);
3800 }
3801
3802 taskqueue_start_threads(&xbb->io_taskqueue,
3803 /*num threads*/1,
3804 /*priority*/PWAIT,
3805 /*thread name*/
3806 "%s taskq", device_get_nameunit(dev));
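
	/*
	 * The pairing of taskqueue_create() using taskqueue_thread_enqueue
	 * with taskqueue_start_threads() gives this instance a single
	 * dedicated kernel thread.  xbb->io_task, initialized above to run
	 * xbb_run_queue(), is queued to it (elsewhere in this driver) via
	 * taskqueue_enqueue().
	 */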
3807
3808 /* Update hot-plug status to satisfy xend. */
3809 error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
3810 "hotplug-status", "connected");
3811 if (error) {
3812 xbb_attach_failed(xbb, error, "writing %s/hotplug-status",
3813 xenbus_get_node(xbb->dev));
3814 return (error);
3815 }
3816
3817 /* Tell the front end that we are ready to connect. */
3818 xenbus_set_state(dev, XenbusStateInitWait);
3819
3820 return (0);
3821}
3822
3823/**
3824 * Detach from a block back device instance.
3825 *
3826 * \param dev NewBus device object representing this Xen Block Back instance.
3827 *
3828 * \return 0 for success, errno codes for failure.
3829 *
3830 * \note A block back device may be detached at any time in its life-cycle,
3831 *       including partway through the attach process. For this reason,
3832 *       initialization order and the initialization state checks in this
3833 * routine must be carefully coupled so that attach time failures
3834 * are gracefully handled.
3835 */
3836static int
3837xbb_detach(device_t dev)
3838{
3839 struct xbb_softc *xbb;
3840
3841 DPRINTF("\n");
3842
3843 xbb = device_get_softc(dev);
3844 mtx_lock(&xbb->lock);
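	/*
	 * xbb_shutdown() returns EAGAIN while it cannot yet complete,
	 * presumably because I/O or ring teardown is still in progress;
	 * sleep on the softc until the shutdown path wakes us and the
	 * call finally succeeds.
	 */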
3845 while (xbb_shutdown(xbb) == EAGAIN) {
3846 msleep(xbb, &xbb->lock, /*wakeup prio unchanged*/0,
3847 "xbb_shutdown", 0);
3848 }
3849 mtx_unlock(&xbb->lock);
3850
3851 DPRINTF("\n");
3852
3853 if (xbb->io_taskqueue != NULL)
3854 taskqueue_free(xbb->io_taskqueue);
3855
3856 if (xbb->xbb_stats != NULL)
3857 devstat_remove_entry(xbb->xbb_stats);
3858
3859 if (xbb->xbb_stats_in != NULL)
3860 devstat_remove_entry(xbb->xbb_stats_in);
3861
3862 xbb_close_backend(xbb);
3863
3864 if (xbb->dev_mode != NULL) {
3865 free(xbb->dev_mode, M_XENBUS);
3866 xbb->dev_mode = NULL;
3867 }
3868
3869 if (xbb->dev_type != NULL) {
3870 free(xbb->dev_type, M_XENBUS);
3871 xbb->dev_type = NULL;
3872 }
3873
3874 if (xbb->dev_name != NULL) {
3875 free(xbb->dev_name, M_XENBUS);
3876 xbb->dev_name = NULL;
3877 }
3878
3879 mtx_destroy(&xbb->lock);
3880 return (0);
3881}
3882
3883/**
3884 * Prepare this block back device for suspension of this VM.
3885 *
3886 * \param dev NewBus device object representing this Xen Block Back instance.
3887 *
3888 * \return 0 for success, errno codes for failure.
3889 */
3890static int
3891xbb_suspend(device_t dev)
3892{
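	/*
	 * The disabled block below looks like it was carried over from the
	 * front-end driver: xb_io_lock, connected and BLKIF_STATE_SUSPENDED
	 * are front-end style names and would likely need to be adapted to
	 * struct xbb_softc before NOT_YET could be enabled.
	 */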
3893#ifdef NOT_YET
3894 struct xbb_softc *sc = device_get_softc(dev);
3895
3896 /* Prevent new requests being issued until we fix things up. */
3897 mtx_lock(&sc->xb_io_lock);
3898 sc->connected = BLKIF_STATE_SUSPENDED;
3899 mtx_unlock(&sc->xb_io_lock);
3900#endif
3901
3902 return (0);
3903}
3904
3905/**
3906 * Perform any processing required to recover from a suspended state.
3907 *
3908 * \param dev NewBus device object representing this Xen Block Back instance.
3909 *
3910 * \return 0 for success, errno codes for failure.
3911 */
3912static int
3913xbb_resume(device_t dev)
3914{
3915 return (0);
3916}
3917
3918/**
3919 * Handle state changes expressed via the XenStore by our front-end peer.
3920 *
3921 * \param dev NewBus device object representing this Xen
3922 * Block Back instance.
3923 * \param frontend_state The new state of the front-end.
3926 */
3927static void
3928xbb_frontend_changed(device_t dev, XenbusState frontend_state)
3929{
3930 struct xbb_softc *xbb = device_get_softc(dev);
3931
3932 DPRINTF("frontend_state=%s, xbb_state=%s\n",
3933 xenbus_strstate(frontend_state),
3934 xenbus_strstate(xenbus_get_state(xbb->dev)));
3935
3936 switch (frontend_state) {
3937 case XenbusStateInitialising:
3938 break;
3939 case XenbusStateInitialised:
3940 case XenbusStateConnected:
3941 xbb_connect(xbb);
3942 break;
3943 case XenbusStateClosing:
3944 case XenbusStateClosed:
3945 mtx_lock(&xbb->lock);
3946 xbb_shutdown(xbb);
3947 mtx_unlock(&xbb->lock);
3948 if (frontend_state == XenbusStateClosed)
3949 xenbus_set_state(xbb->dev, XenbusStateClosed);
3950 break;
3951 default:
3952 xenbus_dev_fatal(xbb->dev, EINVAL, "saw state %d at frontend",
3953 frontend_state);
3954 break;
3955 }
3956}
3957
3958/*---------------------------- NewBus Registration ---------------------------*/
3959static device_method_t xbb_methods[] = {
3960 /* Device interface */
3961 DEVMETHOD(device_probe, xbb_probe),
3962 DEVMETHOD(device_attach, xbb_attach),
3963 DEVMETHOD(device_detach, xbb_detach),
3964 DEVMETHOD(device_shutdown, bus_generic_shutdown),
3965 DEVMETHOD(device_suspend, xbb_suspend),
3966 DEVMETHOD(device_resume, xbb_resume),
3967
3968 /* Xenbus interface */
3969 DEVMETHOD(xenbus_otherend_changed, xbb_frontend_changed),
3970
3971 { 0, 0 }
3972};
3973
3974static driver_t xbb_driver = {
3975 "xbbd",
3976 xbb_methods,
3977 sizeof(struct xbb_softc),
3978};
3979devclass_t xbb_devclass;
3980
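/*
 * Registering against the xenbusb_back bus means instances are probed as
 * the XenStore back-end device tree is enumerated; xbb_probe(), defined
 * earlier in this file, decides whether a given node is claimed by this
 * driver.
 */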
3981DRIVER_MODULE(xbbd, xenbusb_back, xbb_driver, xbb_devclass, 0, 0);