1/*-
2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3 *
4 * Copyright (c) 2009-2012 Spectra Logic Corporation
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice, this list of conditions, and the following disclaimer,
12 *    without modification.
13 * 2. Redistributions in binary form must reproduce at minimum a disclaimer
14 *    substantially similar to the "NO WARRANTY" disclaimer below
15 *    ("Disclaimer") and any redistribution must be conditioned upon
16 *    including a substantially similar Disclaimer requirement for further
17 *    binary redistribution.
18 *
19 * NO WARRANTY
20 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR
23 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24 * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
28 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
29 * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30 * POSSIBILITY OF SUCH DAMAGES.
31 *
32 * Authors: Justin T. Gibbs     (Spectra Logic Corporation)
33 *          Ken Merry           (Spectra Logic Corporation)
34 */
35#include <sys/cdefs.h>
36__FBSDID("$FreeBSD$");
37
38/**
39 * \file blkback.c
40 *
41 * \brief Device driver supporting the vending of block storage from
42 *        a FreeBSD domain to other domains.
43 */
44
45#include <sys/param.h>
46#include <sys/systm.h>
47#include <sys/kernel.h>
48#include <sys/malloc.h>
49
50#include <sys/bio.h>
51#include <sys/bus.h>
52#include <sys/conf.h>
53#include <sys/devicestat.h>
54#include <sys/disk.h>
55#include <sys/fcntl.h>
56#include <sys/filedesc.h>
57#include <sys/kdb.h>
58#include <sys/module.h>
59#include <sys/namei.h>
60#include <sys/proc.h>
61#include <sys/rman.h>
62#include <sys/taskqueue.h>
63#include <sys/types.h>
64#include <sys/vnode.h>
65#include <sys/mount.h>
66#include <sys/sysctl.h>
67#include <sys/bitstring.h>
68#include <sys/sdt.h>
69
70#include <geom/geom.h>
71
72#include <machine/_inttypes.h>
73
74#include <vm/vm.h>
75#include <vm/vm_extern.h>
76#include <vm/vm_kern.h>
77
78#include <xen/xen-os.h>
79#include <xen/blkif.h>
80#include <xen/gnttab.h>
81#include <xen/xen_intr.h>
82
83#include <xen/interface/event_channel.h>
84#include <xen/interface/grant_table.h>
85
86#include <xen/xenbus/xenbusvar.h>
87
88/*--------------------------- Compile-time Tunables --------------------------*/
89/**
90 * The maximum number of shared memory ring pages we will allow in a
91 * negotiated block-front/back communication channel.  Allow enough
92 * ring space for all requests to be XBB_MAX_REQUEST_SIZE'd.
93 */
94#define	XBB_MAX_RING_PAGES		32
95
96/**
97 * The maximum number of outstanding request blocks (request headers plus
98 * additional segment blocks) we will allow in a negotiated block-front/back
99 * communication channel.
100 */
101#define	XBB_MAX_REQUESTS 					\
102	__CONST_RING_SIZE(blkif, PAGE_SIZE * XBB_MAX_RING_PAGES)
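/*
 * Illustrative note: __CONST_RING_SIZE() works out to the largest power
 * of two count of ring entries that fits in the ring pages once the
 * shared ring header is subtracted, roughly
 *
 *	(XBB_MAX_RING_PAGES * PAGE_SIZE - <sring header size>)
 *	    / sizeof(union blkif_sring_entry)
 *
 * rounded down to a power of two.  The exact header and entry sizes come
 * from the Xen ring and blkif definitions, not from this driver.
 */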
103
104/**
105 * \brief Define to force all I/O to be performed on memory owned by the
106 *        backend device, with a copy-in/out to the remote domain's memory.
107 *
108 * \note  This option is currently required when this driver's domain is
109 *        operating in HVM mode on a system using an IOMMU.
110 *
111 * This driver uses Xen's grant table API to gain access to the memory of
112 * the remote domains it serves.  When our domain is operating in PV mode,
113 * the grant table mechanism directly updates our domain's page table entries
114 * to point to the physical pages of the remote domain.  This scheme guarantees
115 * that blkback and the backing devices it uses can safely perform DMA
116 * operations to satisfy requests.  In HVM mode, Xen may use a HW IOMMU to
 * ensure that our domain cannot DMA to pages owned by another domain.  As
118 * of Xen 4.0, IOMMU mappings for HVM guests are not updated via the grant
119 * table API.  For this reason, in HVM mode, we must bounce all requests into
120 * memory that is mapped into our domain at domain startup and thus has
121 * valid IOMMU mappings.
122 */
123#define XBB_USE_BOUNCE_BUFFERS
124
125/**
126 * \brief Define to enable rudimentary request logging to the console.
127 */
128#undef XBB_DEBUG
129
130/*---------------------------------- Macros ----------------------------------*/
131/**
132 * Custom malloc type for all driver allocations.
133 */
134static MALLOC_DEFINE(M_XENBLOCKBACK, "xbbd", "Xen Block Back Driver Data");
135
136#ifdef XBB_DEBUG
137#define DPRINTF(fmt, args...)					\
138    printf("xbb(%s:%d): " fmt, __FUNCTION__, __LINE__, ##args)
139#else
140#define DPRINTF(fmt, args...) do {} while(0)
141#endif
142
143/**
144 * The maximum mapped region size per request we will allow in a negotiated
145 * block-front/back communication channel.
146 */
147#define	XBB_MAX_REQUEST_SIZE					\
148	MIN(MAXPHYS, BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE)
149
150/**
151 * The maximum number of segments (within a request header and accompanying
152 * segment blocks) per request we will allow in a negotiated block-front/back
153 * communication channel.
154 */
155#define	XBB_MAX_SEGMENTS_PER_REQUEST				\
156	(MIN(UIO_MAXIOV,					\
157	     MIN(BLKIF_MAX_SEGMENTS_PER_REQUEST,		\
158		 (XBB_MAX_REQUEST_SIZE / PAGE_SIZE) + 1)))
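/*
 * Worked example (illustrative; assumes 4 KiB pages, a MAXPHYS of 128 KiB,
 * a UIO_MAXIOV of 1024, and the blkif ABI limit of 11 segments per request):
 *
 *	XBB_MAX_REQUEST_SIZE         = MIN(128 KiB, 11 * 4 KiB) = 44 KiB
 *	XBB_MAX_SEGMENTS_PER_REQUEST = MIN(1024, MIN(11, 44 KiB/4 KiB + 1))
 *	                             = MIN(1024, MIN(11, 12)) = 11
 */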
159
160/**
161 * The maximum number of ring pages that we can allow per request list.
162 * We limit this to the maximum number of segments per request, because
163 * that is already a reasonable number of segments to aggregate.  This
164 * number should never be smaller than XBB_MAX_SEGMENTS_PER_REQUEST,
165 * because that would leave situations where we can't dispatch even one
166 * large request.
167 */
168#define	XBB_MAX_SEGMENTS_PER_REQLIST XBB_MAX_SEGMENTS_PER_REQUEST
169
170/*--------------------------- Forward Declarations ---------------------------*/
171struct xbb_softc;
172struct xbb_xen_req;
173
174static void xbb_attach_failed(struct xbb_softc *xbb, int err, const char *fmt,
175			      ...) __attribute__((format(printf, 3, 4)));
176static int  xbb_shutdown(struct xbb_softc *xbb);
177
178/*------------------------------ Data Structures -----------------------------*/
179
180STAILQ_HEAD(xbb_xen_req_list, xbb_xen_req);
181
182typedef enum {
183	XBB_REQLIST_NONE	= 0x00,
184	XBB_REQLIST_MAPPED	= 0x01
185} xbb_reqlist_flags;
186
187struct xbb_xen_reqlist {
188	/**
189	 * Back reference to the parent block back instance for this
190	 * request.  Used during bio_done handling.
191	 */
192	struct xbb_softc        *xbb;
193
194	/**
195	 * BLKIF_OP code for this request.
196	 */
197	int			 operation;
198
199	/**
200	 * Set to BLKIF_RSP_* to indicate request status.
201	 *
202	 * This field allows an error status to be recorded even if the
203	 * delivery of this status must be deferred.  Deferred reporting
204	 * is necessary, for example, when an error is detected during
205	 * completion processing of one bio when other bios for this
206	 * request are still outstanding.
207	 */
208	int			 status;
209
210	/**
211	 * Number of 512 byte sectors not transferred.
212	 */
213	int			 residual_512b_sectors;
214
215	/**
216	 * Starting sector number of the first request in the list.
217	 */
218	off_t			 starting_sector_number;
219
220	/**
221	 * If we're going to coalesce, the next contiguous sector would be
222	 * this one.
223	 */
224	off_t			 next_contig_sector;
225
226	/**
227	 * Number of child requests in the list.
228	 */
229	int			 num_children;
230
231	/**
232	 * Number of I/O requests still pending on the backend.
233	 */
234	int			 pendcnt;
235
236	/**
237	 * Total number of segments for requests in the list.
238	 */
239	int			 nr_segments;
240
241	/**
242	 * Flags for this particular request list.
243	 */
244	xbb_reqlist_flags	 flags;
245
246	/**
247	 * Kernel virtual address space reserved for this request
248	 * list structure and used to map the remote domain's pages for
	 * this I/O into our domain's address space.
250	 */
251	uint8_t			*kva;
252
253	/**
254	 * Base, pseudo-physical address, corresponding to the start
255	 * of this request's kva region.
256	 */
257	uint64_t	 	 gnt_base;
258
259
260#ifdef XBB_USE_BOUNCE_BUFFERS
261	/**
262	 * Pre-allocated domain local memory used to proxy remote
263	 * domain memory during I/O operations.
264	 */
265	uint8_t			*bounce;
266#endif
267
268	/**
269	 * Array of grant handles (one per page) used to map this request.
270	 */
271	grant_handle_t		*gnt_handles;
272
273	/**
274	 * Device statistics request ordering type (ordered or simple).
275	 */
276	devstat_tag_type	 ds_tag_type;
277
278	/**
279	 * Device statistics request type (read, write, no_data).
280	 */
281	devstat_trans_flags	 ds_trans_type;
282
283	/**
284	 * The start time for this request.
285	 */
286	struct bintime		 ds_t0;
287
288	/**
289	 * Linked list of contiguous requests with the same operation type.
290	 */
291	struct xbb_xen_req_list	 contig_req_list;
292
293	/**
294	 * Linked list links used to aggregate idle requests in the
295	 * request list free pool (xbb->reqlist_free_stailq) and pending
296	 * requests waiting for execution (xbb->reqlist_pending_stailq).
297	 */
298	STAILQ_ENTRY(xbb_xen_reqlist) links;
299};
300
301STAILQ_HEAD(xbb_xen_reqlist_list, xbb_xen_reqlist);
302
303/**
304 * \brief Object tracking an in-flight I/O from a Xen VBD consumer.
305 */
306struct xbb_xen_req {
307	/**
308	 * Linked list links used to aggregate requests into a reqlist
309	 * and to store them in the request free pool.
310	 */
311	STAILQ_ENTRY(xbb_xen_req) links;
312
313	/**
314	 * The remote domain's identifier for this I/O request.
315	 */
316	uint64_t		  id;
317
318	/**
319	 * The number of pages currently mapped for this request.
320	 */
321	int			  nr_pages;
322
323	/**
	 * The number of 512 byte sectors comprising this request.
325	 */
326	int			  nr_512b_sectors;
327
328	/**
329	 * BLKIF_OP code for this request.
330	 */
331	int			  operation;
332
333	/**
334	 * Storage used for non-native ring requests.
335	 */
336	blkif_request_t		 ring_req_storage;
337
338	/**
339	 * Pointer to the Xen request in the ring.
340	 */
341	blkif_request_t		*ring_req;
342
343	/**
344	 * Consumer index for this request.
345	 */
346	RING_IDX		 req_ring_idx;
347
348	/**
349	 * The start time for this request.
350	 */
351	struct bintime		 ds_t0;
352
353	/**
354	 * Pointer back to our parent request list.
355	 */
356	struct xbb_xen_reqlist  *reqlist;
357};
358SLIST_HEAD(xbb_xen_req_slist, xbb_xen_req);
359
360/**
361 * \brief Configuration data for the shared memory request ring
362 *        used to communicate with the front-end client of this
 *        driver.
364 */
365struct xbb_ring_config {
366	/** KVA address where ring memory is mapped. */
367	vm_offset_t	va;
368
369	/** The pseudo-physical address where ring memory is mapped.*/
370	uint64_t	gnt_addr;
371
372	/**
373	 * Grant table handles, one per-ring page, returned by the
	 * hypervisor upon mapping of the ring and required to
375	 * unmap it when a connection is torn down.
376	 */
377	grant_handle_t	handle[XBB_MAX_RING_PAGES];
378
379	/**
380	 * The device bus address returned by the hypervisor when
381	 * mapping the ring and required to unmap it when a connection
382	 * is torn down.
383	 */
384	uint64_t	bus_addr[XBB_MAX_RING_PAGES];
385
386	/** The number of ring pages mapped for the current connection. */
387	u_int		ring_pages;
388
389	/**
390	 * The grant references, one per-ring page, supplied by the
391	 * front-end, allowing us to reference the ring pages in the
392	 * front-end's domain and to map these pages into our own domain.
393	 */
394	grant_ref_t	ring_ref[XBB_MAX_RING_PAGES];
395
	/** The interrupt-driven event channel used to signal ring events. */
397	evtchn_port_t   evtchn;
398};
399
400/**
401 * Per-instance connection state flags.
402 */
403typedef enum
404{
405	/**
406	 * The front-end requested a read-only mount of the
407	 * back-end device/file.
408	 */
409	XBBF_READ_ONLY         = 0x01,
410
411	/** Communication with the front-end has been established. */
412	XBBF_RING_CONNECTED    = 0x02,
413
414	/**
415	 * Front-end requests exist in the ring and are waiting for
416	 * xbb_xen_req objects to free up.
417	 */
418	XBBF_RESOURCE_SHORTAGE = 0x04,
419
420	/** Connection teardown in progress. */
421	XBBF_SHUTDOWN          = 0x08,
422
423	/** A thread is already performing shutdown processing. */
424	XBBF_IN_SHUTDOWN       = 0x10
425} xbb_flag_t;
426
427/** Backend device type.  */
428typedef enum {
429	/** Backend type unknown. */
430	XBB_TYPE_NONE		= 0x00,
431
432	/**
433	 * Backend type disk (access via cdev switch
434	 * strategy routine).
435	 */
436	XBB_TYPE_DISK		= 0x01,
437
	/** Backend type file (access via vnode operations). */
439	XBB_TYPE_FILE		= 0x02
440} xbb_type;
441
442/**
443 * \brief Structure used to memoize information about a per-request
444 *        scatter-gather list.
445 *
446 * The chief benefit of using this data structure is it avoids having
447 * to reparse the possibly discontiguous S/G list in the original
448 * request.  Due to the way that the mapping of the memory backing an
449 * I/O transaction is handled by Xen, a second pass is unavoidable.
450 * At least this way the second walk is a simple array traversal.
451 *
452 * \note A single Scatter/Gather element in the block interface covers
453 *       at most 1 machine page.  In this context a sector (blkif
454 *       nomenclature, not what I'd choose) is a 512b aligned unit
455 *       of mapping within the machine page referenced by an S/G
456 *       element.
457 */
458struct xbb_sg {
459	/** The number of 512b data chunks mapped in this S/G element. */
460	int16_t nsect;
461
462	/**
463	 * The index (0 based) of the first 512b data chunk mapped
464	 * in this S/G element.
465	 */
466	uint8_t first_sect;
467
468	/**
469	 * The index (0 based) of the last 512b data chunk mapped
470	 * in this S/G element.
471	 */
472	uint8_t last_sect;
473};
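/*
 * Example (illustrative): an S/G element with first_sect = 1 and
 * last_sect = 3 covers sectors 1 through 3 of its page, so nsect =
 * 3 - 1 + 1 = 3 chunks of 512 bytes (1536 bytes) starting at byte
 * offset 1 * 512 = 512 within the mapped page.
 */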
474
475/**
476 * Character device backend specific configuration data.
477 */
478struct xbb_dev_data {
479	/** Cdev used for device backend access.  */
480	struct cdev   *cdev;
481
482	/** Cdev switch used for device backend access.  */
483	struct cdevsw *csw;
484
485	/** Used to hold a reference on opened cdev backend devices. */
486	int	       dev_ref;
487};
488
489/**
490 * File backend specific configuration data.
491 */
492struct xbb_file_data {
493	/** Credentials to use for vnode backed (file based) I/O. */
494	struct ucred   *cred;
495
496	/**
497	 * \brief Array of io vectors used to process file based I/O.
498	 *
499	 * Only a single file based request is outstanding per-xbb instance,
500	 * so we only need one of these.
501	 */
502	struct iovec	xiovecs[XBB_MAX_SEGMENTS_PER_REQLIST];
503#ifdef XBB_USE_BOUNCE_BUFFERS
504
505	/**
506	 * \brief Array of io vectors used to handle bouncing of file reads.
507	 *
508	 * Vnode operations are free to modify uio data during their
	 * execution.  In the case of a read with bounce buffering active,
510	 * we need some of the data from the original uio in order to
511	 * bounce-out the read data.  This array serves as the temporary
512	 * storage for this saved data.
513	 */
514	struct iovec	saved_xiovecs[XBB_MAX_SEGMENTS_PER_REQLIST];
515
516	/**
517	 * \brief Array of memoized bounce buffer kva offsets used
518	 *        in the file based backend.
519	 *
520	 * Due to the way that the mapping of the memory backing an
521	 * I/O transaction is handled by Xen, a second pass through
522	 * the request sg elements is unavoidable. We memoize the computed
523	 * bounce address here to reduce the cost of the second walk.
524	 */
525	void		*xiovecs_vaddr[XBB_MAX_SEGMENTS_PER_REQLIST];
526#endif /* XBB_USE_BOUNCE_BUFFERS */
527};
528
529/**
530 * Collection of backend type specific data.
531 */
532union xbb_backend_data {
533	struct xbb_dev_data  dev;
534	struct xbb_file_data file;
535};
536
537/**
538 * Function signature of backend specific I/O handlers.
539 */
540typedef int (*xbb_dispatch_t)(struct xbb_softc *xbb,
541			      struct xbb_xen_reqlist *reqlist, int operation,
542			      int flags);
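/*
 * Illustrative sketch (hypothetical handler name): a backend attach path
 * installs one of these in xbb->dispatch_io, and xbb_dispatch_io() later
 * invokes it without xbb->lock held.  A non-zero return causes the request
 * list to be failed with BLKIF_RSP_ERROR.
 *
 *	static int
 *	xbb_dispatch_dummy(struct xbb_softc *xbb,
 *			   struct xbb_xen_reqlist *reqlist, int operation,
 *			   int flags)
 *	{
 *		return (ENXIO);		// always fail the request list
 *	}
 *	...
 *	xbb->dispatch_io = xbb_dispatch_dummy;
 */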
543
544/**
545 * Per-instance configuration data.
546 */
547struct xbb_softc {
548
549	/**
550	 * Task-queue used to process I/O requests.
551	 */
552	struct taskqueue	 *io_taskqueue;
553
554	/**
555	 * Single "run the request queue" task enqueued
556	 * on io_taskqueue.
557	 */
558	struct task		  io_task;
559
560	/** Device type for this instance. */
561	xbb_type		  device_type;
562
563	/** NewBus device corresponding to this instance. */
564	device_t		  dev;
565
566	/** Backend specific dispatch routine for this instance. */
567	xbb_dispatch_t		  dispatch_io;
568
569	/** The number of requests outstanding on the backend device/file. */
570	int			  active_request_count;
571
572	/** Free pool of request tracking structures. */
573	struct xbb_xen_req_list   request_free_stailq;
574
575	/** Array, sized at connection time, of request tracking structures. */
576	struct xbb_xen_req	 *requests;
577
578	/** Free pool of request list structures. */
579	struct xbb_xen_reqlist_list reqlist_free_stailq;
580
581	/** List of pending request lists awaiting execution. */
582	struct xbb_xen_reqlist_list reqlist_pending_stailq;
583
584	/** Array, sized at connection time, of request list structures. */
585	struct xbb_xen_reqlist	 *request_lists;
586
587	/**
588	 * Global pool of kva used for mapping remote domain ring
589	 * and I/O transaction data.
590	 */
591	vm_offset_t		  kva;
592
593	/** Pseudo-physical address corresponding to kva. */
594	uint64_t		  gnt_base_addr;
595
596	/** The size of the global kva pool. */
597	int			  kva_size;
598
599	/** The size of the KVA area used for request lists. */
600	int			  reqlist_kva_size;
601
602	/** The number of pages of KVA used for request lists */
603	int			  reqlist_kva_pages;
604
605	/** Bitmap of free KVA pages */
606	bitstr_t		 *kva_free;
607
608	/**
609	 * \brief Cached value of the front-end's domain id.
610	 *
	 * This value is used at least once for each mapped page in
	 * a transaction.  We cache it to avoid incurring the
613	 * cost of an ivar access every time this is needed.
614	 */
615	domid_t			  otherend_id;
616
617	/**
618	 * \brief The blkif protocol abi in effect.
619	 *
620	 * There are situations where the back and front ends can
621	 * have a different, native abi (e.g. intel x86_64 and
622	 * 32bit x86 domains on the same machine).  The back-end
623	 * always accommodates the front-end's native abi.  That
624	 * value is pulled from the XenStore and recorded here.
625	 */
626	int			  abi;
627
628	/**
629	 * \brief The maximum number of requests and request lists allowed
630	 *        to be in flight at a time.
631	 *
632	 * This value is negotiated via the XenStore.
633	 */
634	u_int			  max_requests;
635
636	/**
637	 * \brief The maximum number of segments (1 page per segment)
638	 *	  that can be mapped by a request.
639	 *
640	 * This value is negotiated via the XenStore.
641	 */
642	u_int			  max_request_segments;
643
644	/**
645	 * \brief Maximum number of segments per request list.
646	 *
647	 * This value is derived from and will generally be larger than
648	 * max_request_segments.
649	 */
650	u_int			  max_reqlist_segments;
651
652	/**
653	 * The maximum size of any request to this back-end
654	 * device.
655	 *
656	 * This value is negotiated via the XenStore.
657	 */
658	u_int			  max_request_size;
659
660	/**
661	 * The maximum size of any request list.  This is derived directly
662	 * from max_reqlist_segments.
663	 */
664	u_int			  max_reqlist_size;
665
666	/** Various configuration and state bit flags. */
667	xbb_flag_t		  flags;
668
669	/** Ring mapping and interrupt configuration data. */
670	struct xbb_ring_config	  ring_config;
671
672	/** Runtime, cross-abi safe, structures for ring access. */
673	blkif_back_rings_t	  rings;
674
675	/** IRQ mapping for the communication ring event channel. */
676	xen_intr_handle_t	  xen_intr_handle;
677
678	/**
679	 * \brief Backend access mode flags (e.g. write, or read-only).
680	 *
681	 * This value is passed to us by the front-end via the XenStore.
682	 */
683	char			 *dev_mode;
684
685	/**
686	 * \brief Backend device type (e.g. "disk", "cdrom", "floppy").
687	 *
688	 * This value is passed to us by the front-end via the XenStore.
689	 * Currently unused.
690	 */
691	char			 *dev_type;
692
693	/**
694	 * \brief Backend device/file identifier.
695	 *
696	 * This value is passed to us by the front-end via the XenStore.
697	 * We expect this to be a POSIX path indicating the file or
698	 * device to open.
699	 */
700	char			 *dev_name;
701
702	/**
703	 * Vnode corresponding to the backend device node or file
	 * we are accessing.
705	 */
706	struct vnode		 *vn;
707
708	union xbb_backend_data	  backend;
709
710	/** The native sector size of the backend. */
711	u_int			  sector_size;
712
713	/** log2 of sector_size.  */
714	u_int			  sector_size_shift;
715
716	/** Size in bytes of the backend device or file.  */
717	off_t			  media_size;
718
719	/**
720	 * \brief media_size expressed in terms of the backend native
721	 *	  sector size.
722	 *
723	 * (e.g. xbb->media_size >> xbb->sector_size_shift).
724	 */
725	uint64_t		  media_num_sectors;
726
727	/**
728	 * \brief Array of memoized scatter gather data computed during the
729	 *	  conversion of blkif ring requests to internal xbb_xen_req
730	 *	  structures.
731	 *
732	 * Ring processing is serialized so we only need one of these.
733	 */
734	struct xbb_sg		  xbb_sgs[XBB_MAX_SEGMENTS_PER_REQLIST];
735
736	/**
737	 * Temporary grant table map used in xbb_dispatch_io().  When
738	 * XBB_MAX_SEGMENTS_PER_REQLIST gets large, keeping this on the
739	 * stack could cause a stack overflow.
740	 */
741	struct gnttab_map_grant_ref   maps[XBB_MAX_SEGMENTS_PER_REQLIST];
742
743	/** Mutex protecting per-instance data. */
744	struct mtx		  lock;
745
746	/**
747	 * Resource representing allocated physical address space
748	 * associated with our per-instance kva region.
749	 */
750	struct resource		 *pseudo_phys_res;
751
752	/** Resource id for allocated physical address space. */
753	int			  pseudo_phys_res_id;
754
755	/**
756	 * I/O statistics from BlockBack dispatch down.  These are
757	 * coalesced requests, and we start them right before execution.
758	 */
759	struct devstat		 *xbb_stats;
760
761	/**
762	 * I/O statistics coming into BlockBack.  These are the requests as
763	 * we get them from BlockFront.  They are started as soon as we
764	 * receive a request, and completed when the I/O is complete.
765	 */
766	struct devstat		 *xbb_stats_in;
767
768	/** Disable sending flush to the backend */
769	int			  disable_flush;
770
771	/** Send a real flush for every N flush requests */
772	int			  flush_interval;
773
774	/** Count of flush requests in the interval */
775	int			  flush_count;
776
777	/** Don't coalesce requests if this is set */
778	int			  no_coalesce_reqs;
779
780	/** Number of requests we have received */
781	uint64_t		  reqs_received;
782
	/** Number of requests we have completed */
784	uint64_t		  reqs_completed;
785
	/** Number of requests we queued but not pushed */
787	uint64_t		  reqs_queued_for_completion;
788
	/** Number of requests we completed with an error status */
790	uint64_t		  reqs_completed_with_error;
791
792	/** How many forced dispatches (i.e. without coalescing) have happened */
793	uint64_t		  forced_dispatch;
794
795	/** How many normal dispatches have happened */
796	uint64_t		  normal_dispatch;
797
798	/** How many total dispatches have happened */
799	uint64_t		  total_dispatch;
800
801	/** How many times we have run out of KVA */
802	uint64_t		  kva_shortages;
803
804	/** How many times we have run out of request structures */
805	uint64_t		  request_shortages;
806
807	/** Watch to wait for hotplug script execution */
808	struct xs_watch		  hotplug_watch;
809
810	/** Got the needed data from hotplug scripts? */
811	bool			  hotplug_done;
812};
813
814/*---------------------------- Request Processing ----------------------------*/
815/**
816 * Allocate an internal transaction tracking structure from the free pool.
817 *
818 * \param xbb  Per-instance xbb configuration structure.
819 *
820 * \return  On success, a pointer to the allocated xbb_xen_req structure.
821 *          Otherwise NULL.
822 */
823static inline struct xbb_xen_req *
824xbb_get_req(struct xbb_softc *xbb)
825{
826	struct xbb_xen_req *req;
827
828	req = NULL;
829
830	mtx_assert(&xbb->lock, MA_OWNED);
831
832	if ((req = STAILQ_FIRST(&xbb->request_free_stailq)) != NULL) {
833		STAILQ_REMOVE_HEAD(&xbb->request_free_stailq, links);
834		xbb->active_request_count++;
835	}
836
837	return (req);
838}
839
840/**
841 * Return an allocated transaction tracking structure to the free pool.
842 *
843 * \param xbb  Per-instance xbb configuration structure.
844 * \param req  The request structure to free.
845 */
846static inline void
847xbb_release_req(struct xbb_softc *xbb, struct xbb_xen_req *req)
848{
849	mtx_assert(&xbb->lock, MA_OWNED);
850
851	STAILQ_INSERT_HEAD(&xbb->request_free_stailq, req, links);
852	xbb->active_request_count--;
853
854	KASSERT(xbb->active_request_count >= 0,
855		("xbb_release_req: negative active count"));
856}
857
858/**
859 * Return an xbb_xen_req_list of allocated xbb_xen_reqs to the free pool.
860 *
861 * \param xbb	    Per-instance xbb configuration structure.
862 * \param req_list  The list of requests to free.
863 * \param nreqs	    The number of items in the list.
864 */
865static inline void
866xbb_release_reqs(struct xbb_softc *xbb, struct xbb_xen_req_list *req_list,
867		 int nreqs)
868{
869	mtx_assert(&xbb->lock, MA_OWNED);
870
871	STAILQ_CONCAT(&xbb->request_free_stailq, req_list);
872	xbb->active_request_count -= nreqs;
873
874	KASSERT(xbb->active_request_count >= 0,
875		("xbb_release_reqs: negative active count"));
876}
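/*
 * Usage sketch (illustrative): all three helpers above assume the caller
 * holds xbb->lock, e.g.
 *
 *	mtx_lock(&xbb->lock);
 *	req = xbb_get_req(xbb);
 *	if (req != NULL) {
 *		// ... fill in the request ...
 *		// ... and later, once processing is finished ...
 *		xbb_release_req(xbb, req);
 *	}
 *	mtx_unlock(&xbb->lock);
 */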
877
878/**
879 * Given a page index and 512b sector offset within that page,
880 * calculate an offset into a request's kva region.
881 *
882 * \param reqlist The request structure whose kva region will be accessed.
883 * \param pagenr  The page index used to compute the kva offset.
884 * \param sector  The 512b sector index used to compute the page relative
885 *                kva offset.
886 *
 * \return  The computed kernel virtual address within the request's
 *          kva region.
888 */
889static inline uint8_t *
890xbb_reqlist_vaddr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector)
891{
892	return (reqlist->kva + (PAGE_SIZE * pagenr) + (sector << 9));
893}
894
895#ifdef XBB_USE_BOUNCE_BUFFERS
896/**
897 * Given a page index and 512b sector offset within that page,
898 * calculate an offset into a request's local bounce memory region.
899 *
900 * \param reqlist The request structure whose bounce region will be accessed.
901 * \param pagenr  The page index used to compute the bounce offset.
902 * \param sector  The 512b sector index used to compute the page relative
903 *                bounce offset.
904 *
905 * \return  The computed global bounce buffer address.
906 */
907static inline uint8_t *
908xbb_reqlist_bounce_addr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector)
909{
910	return (reqlist->bounce + (PAGE_SIZE * pagenr) + (sector << 9));
911}
912#endif
913
914/**
915 * Given a page number and 512b sector offset within that page,
916 * calculate an offset into the request's memory region that the
917 * underlying backend device/file should use for I/O.
918 *
919 * \param reqlist The request structure whose I/O region will be accessed.
920 * \param pagenr  The page index used to compute the I/O offset.
921 * \param sector  The 512b sector index used to compute the page relative
922 *                I/O offset.
923 *
924 * \return  The computed global I/O address.
925 *
926 * Depending on configuration, this will either be a local bounce buffer
927 * or a pointer to the memory mapped in from the front-end domain for
928 * this request.
929 */
930static inline uint8_t *
931xbb_reqlist_ioaddr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector)
932{
933#ifdef XBB_USE_BOUNCE_BUFFERS
934	return (xbb_reqlist_bounce_addr(reqlist, pagenr, sector));
935#else
936	return (xbb_reqlist_vaddr(reqlist, pagenr, sector));
937#endif
938}
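/*
 * Illustrative sketch (assumed; the actual copies live in the dispatch and
 * completion paths): when bounce buffers are in use, a write is bounced
 * from the grant-mapped kva into local memory before the I/O is issued,
 * and a read is bounced back the other way on completion:
 *
 *	// write: copy front-end data into the bounce buffer before dispatch
 *	memcpy(xbb_reqlist_bounce_addr(reqlist, pagenr, sect),
 *	       xbb_reqlist_vaddr(reqlist, pagenr, sect), len);
 *
 * The read-side copy can be seen in xbb_bio_done() below.
 */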
939
940/**
941 * Given a page index and 512b sector offset within that page, calculate
942 * an offset into the local pseudo-physical address space used to map a
943 * front-end's request data into a request.
944 *
945 * \param reqlist The request list structure whose pseudo-physical region
946 *                will be accessed.
947 * \param pagenr  The page index used to compute the pseudo-physical offset.
948 * \param sector  The 512b sector index used to compute the page relative
949 *                pseudo-physical offset.
950 *
 * \return  The computed global pseudo-physical address.
 *
 * This address is used to populate the grant table map and unmap
 * operations that bring the front-end's pages for this request into,
 * and out of, our domain.
956 */
957static inline uintptr_t
958xbb_get_gntaddr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector)
959{
960	struct xbb_softc *xbb;
961
962	xbb = reqlist->xbb;
963
964	return ((uintptr_t)(xbb->gnt_base_addr +
965		(uintptr_t)(reqlist->kva - xbb->kva) +
966		(PAGE_SIZE * pagenr) + (sector << 9)));
967}
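/*
 * Illustrative note: the kva pool and the pseudo-physical (grant) region
 * are laid out in lock step, so for a request list whose kva starts at
 * offset N into the pool:
 *
 *	vaddr   = xbb->kva           + N + pagenr * PAGE_SIZE + sector * 512
 *	gntaddr = xbb->gnt_base_addr + N + pagenr * PAGE_SIZE + sector * 512
 *
 * A grant mapping established at gntaddr is therefore reachable through
 * the corresponding kva pointer returned by xbb_reqlist_vaddr().
 */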
968
969/**
970 * Get Kernel Virtual Address space for mapping requests.
971 *
972 * \param xbb         Per-instance xbb configuration structure.
973 * \param nr_pages    Number of pages needed.
976 *
977 * \return  On success, a pointer to the allocated KVA region.  Otherwise NULL.
978 *
979 * Note:  This should be unnecessary once we have either chaining or
980 * scatter/gather support for struct bio.  At that point we'll be able to
981 * put multiple addresses and lengths in one bio/bio chain and won't need
982 * to map everything into one virtual segment.
983 */
984static uint8_t *
985xbb_get_kva(struct xbb_softc *xbb, int nr_pages)
986{
987	int first_clear;
988	int num_clear;
989	uint8_t *free_kva;
990	int      i;
991
992	KASSERT(nr_pages != 0, ("xbb_get_kva of zero length"));
993
994	first_clear = 0;
995	free_kva = NULL;
996
997	mtx_lock(&xbb->lock);
998
999	/*
1000	 * Look for the first available page.  If there are none, we're done.
1001	 */
1002	bit_ffc(xbb->kva_free, xbb->reqlist_kva_pages, &first_clear);
1003
1004	if (first_clear == -1)
1005		goto bailout;
1006
1007	/*
1008	 * Starting at the first available page, look for consecutive free
1009	 * pages that will satisfy the user's request.
1010	 */
1011	for (i = first_clear, num_clear = 0; i < xbb->reqlist_kva_pages; i++) {
1012		/*
1013		 * If this is true, the page is used, so we have to reset
1014		 * the number of clear pages and the first clear page
1015		 * (since it pointed to a region with an insufficient number
1016		 * of clear pages).
1017		 */
1018		if (bit_test(xbb->kva_free, i)) {
1019			num_clear = 0;
1020			first_clear = -1;
1021			continue;
1022		}
1023
1024		if (first_clear == -1)
1025			first_clear = i;
1026
1027		/*
1028		 * If this is true, we've found a large enough free region
1029		 * to satisfy the request.
1030		 */
1031		if (++num_clear == nr_pages) {
1032
1033			bit_nset(xbb->kva_free, first_clear,
1034				 first_clear + nr_pages - 1);
1035
			free_kva = (uint8_t *)xbb->kva +
				((intptr_t)first_clear * PAGE_SIZE);
1038
1039			KASSERT(free_kva >= (uint8_t *)xbb->kva &&
1040				free_kva + (nr_pages * PAGE_SIZE) <=
1041				(uint8_t *)xbb->ring_config.va,
1042				("Free KVA %p len %d out of range, "
1043				 "kva = %#jx, ring VA = %#jx\n", free_kva,
1044				 nr_pages * PAGE_SIZE, (uintmax_t)xbb->kva,
1045				 (uintmax_t)xbb->ring_config.va));
1046			break;
1047		}
1048	}
1049
1050bailout:
1051
1052	if (free_kva == NULL) {
1053		xbb->flags |= XBBF_RESOURCE_SHORTAGE;
1054		xbb->kva_shortages++;
1055	}
1056
1057	mtx_unlock(&xbb->lock);
1058
1059	return (free_kva);
1060}
1061
1062/**
1063 * Free allocated KVA.
1064 *
1065 * \param xbb	    Per-instance xbb configuration structure.
1066 * \param kva_ptr   Pointer to allocated KVA region.
1067 * \param nr_pages  Number of pages in the KVA region.
1068 */
1069static void
1070xbb_free_kva(struct xbb_softc *xbb, uint8_t *kva_ptr, int nr_pages)
1071{
1072	intptr_t start_page;
1073
1074	mtx_assert(&xbb->lock, MA_OWNED);
1075
1076	start_page = (intptr_t)(kva_ptr - xbb->kva) >> PAGE_SHIFT;
1077	bit_nclear(xbb->kva_free, start_page, start_page + nr_pages - 1);
1078
1079}
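/*
 * Usage sketch (illustrative): xbb_get_kva() takes xbb->lock internally,
 * while xbb_free_kva() expects the caller to already hold it, e.g.
 *
 *	kva = xbb_get_kva(xbb, nr_pages);
 *	if (kva == NULL)
 *		return (ENOMEM);	// shortage already recorded
 *	// ... perform the I/O ...
 *	mtx_lock(&xbb->lock);
 *	xbb_free_kva(xbb, kva, nr_pages);
 *	mtx_unlock(&xbb->lock);
 */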
1080
1081/**
1082 * Unmap the front-end pages associated with this I/O request.
1083 *
 * \param reqlist  The request list structure to unmap.
1085 */
1086static void
1087xbb_unmap_reqlist(struct xbb_xen_reqlist *reqlist)
1088{
1089	struct gnttab_unmap_grant_ref unmap[XBB_MAX_SEGMENTS_PER_REQLIST];
1090	u_int			      i;
1091	u_int			      invcount;
1092	int			      error;
1093
1094	invcount = 0;
1095	for (i = 0; i < reqlist->nr_segments; i++) {
1096
1097		if (reqlist->gnt_handles[i] == GRANT_REF_INVALID)
1098			continue;
1099
1100		unmap[invcount].host_addr    = xbb_get_gntaddr(reqlist, i, 0);
1101		unmap[invcount].dev_bus_addr = 0;
1102		unmap[invcount].handle       = reqlist->gnt_handles[i];
1103		reqlist->gnt_handles[i]	     = GRANT_REF_INVALID;
1104		invcount++;
1105	}
1106
1107	error = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
1108					  unmap, invcount);
1109	KASSERT(error == 0, ("Grant table operation failed"));
1110}
1111
1112/**
1113 * Allocate an internal transaction tracking structure from the free pool.
1114 *
1115 * \param xbb  Per-instance xbb configuration structure.
1116 *
1117 * \return  On success, a pointer to the allocated xbb_xen_reqlist structure.
1118 *          Otherwise NULL.
1119 */
1120static inline struct xbb_xen_reqlist *
1121xbb_get_reqlist(struct xbb_softc *xbb)
1122{
1123	struct xbb_xen_reqlist *reqlist;
1124
1125	reqlist = NULL;
1126
1127	mtx_assert(&xbb->lock, MA_OWNED);
1128
1129	if ((reqlist = STAILQ_FIRST(&xbb->reqlist_free_stailq)) != NULL) {
1130
1131		STAILQ_REMOVE_HEAD(&xbb->reqlist_free_stailq, links);
1132		reqlist->flags = XBB_REQLIST_NONE;
1133		reqlist->kva = NULL;
1134		reqlist->status = BLKIF_RSP_OKAY;
1135		reqlist->residual_512b_sectors = 0;
1136		reqlist->num_children = 0;
1137		reqlist->nr_segments = 0;
1138		STAILQ_INIT(&reqlist->contig_req_list);
1139	}
1140
1141	return (reqlist);
1142}
1143
1144/**
1145 * Return an allocated transaction tracking structure to the free pool.
1146 *
1147 * \param xbb        Per-instance xbb configuration structure.
 * \param reqlist    The request list structure to free.
 * \param wakeup     If set, wake up the work thread if freeing this reqlist
1150 *                   during a resource shortage condition.
1151 */
1152static inline void
1153xbb_release_reqlist(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist,
1154		    int wakeup)
1155{
1156
1157	mtx_assert(&xbb->lock, MA_OWNED);
1158
1159	if (wakeup) {
1160		wakeup = xbb->flags & XBBF_RESOURCE_SHORTAGE;
1161		xbb->flags &= ~XBBF_RESOURCE_SHORTAGE;
1162	}
1163
1164	if (reqlist->kva != NULL)
1165		xbb_free_kva(xbb, reqlist->kva, reqlist->nr_segments);
1166
1167	xbb_release_reqs(xbb, &reqlist->contig_req_list, reqlist->num_children);
1168
1169	STAILQ_INSERT_TAIL(&xbb->reqlist_free_stailq, reqlist, links);
1170
1171	if ((xbb->flags & XBBF_SHUTDOWN) != 0) {
1172		/*
1173		 * Shutdown is in progress.  See if we can
1174		 * progress further now that one more request
1175		 * has completed and been returned to the
1176		 * free pool.
1177		 */
1178		xbb_shutdown(xbb);
1179	}
1180
1181	if (wakeup != 0)
1182		taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task);
1183}
1184
1185/**
1186 * Request resources and do basic request setup.
1187 *
1188 * \param xbb          Per-instance xbb configuration structure.
1189 * \param reqlist      Pointer to reqlist pointer.
1190 * \param ring_req     Pointer to a block ring request.
 * \param ring_idx     The ring index of this request.
1192 *
1193 * \return  0 for success, non-zero for failure.
1194 */
1195static int
1196xbb_get_resources(struct xbb_softc *xbb, struct xbb_xen_reqlist **reqlist,
1197		  blkif_request_t *ring_req, RING_IDX ring_idx)
1198{
1199	struct xbb_xen_reqlist *nreqlist;
1200	struct xbb_xen_req     *nreq;
1201
1202	nreqlist = NULL;
1203	nreq     = NULL;
1204
1205	mtx_lock(&xbb->lock);
1206
1207	/*
1208	 * We don't allow new resources to be allocated if we're in the
1209	 * process of shutting down.
1210	 */
1211	if ((xbb->flags & XBBF_SHUTDOWN) != 0) {
1212		mtx_unlock(&xbb->lock);
1213		return (1);
1214	}
1215
1216	/*
1217	 * Allocate a reqlist if the caller doesn't have one already.
1218	 */
1219	if (*reqlist == NULL) {
1220		nreqlist = xbb_get_reqlist(xbb);
1221		if (nreqlist == NULL)
1222			goto bailout_error;
1223	}
1224
1225	/* We always allocate a request. */
1226	nreq = xbb_get_req(xbb);
1227	if (nreq == NULL)
1228		goto bailout_error;
1229
1230	mtx_unlock(&xbb->lock);
1231
1232	if (*reqlist == NULL) {
1233		*reqlist = nreqlist;
1234		nreqlist->operation = ring_req->operation;
1235		nreqlist->starting_sector_number = ring_req->sector_number;
1236		STAILQ_INSERT_TAIL(&xbb->reqlist_pending_stailq, nreqlist,
1237				   links);
1238	}
1239
1240	nreq->reqlist = *reqlist;
1241	nreq->req_ring_idx = ring_idx;
1242	nreq->id = ring_req->id;
1243	nreq->operation = ring_req->operation;
1244
1245	if (xbb->abi != BLKIF_PROTOCOL_NATIVE) {
1246		bcopy(ring_req, &nreq->ring_req_storage, sizeof(*ring_req));
1247		nreq->ring_req = &nreq->ring_req_storage;
1248	} else {
1249		nreq->ring_req = ring_req;
1250	}
1251
1252	binuptime(&nreq->ds_t0);
1253	devstat_start_transaction(xbb->xbb_stats_in, &nreq->ds_t0);
1254	STAILQ_INSERT_TAIL(&(*reqlist)->contig_req_list, nreq, links);
1255	(*reqlist)->num_children++;
1256	(*reqlist)->nr_segments += ring_req->nr_segments;
1257
1258	return (0);
1259
1260bailout_error:
1261
1262	/*
1263	 * We're out of resources, so set the shortage flag.  The next time
1264	 * a request is released, we'll try waking up the work thread to
1265	 * see if we can allocate more resources.
1266	 */
1267	xbb->flags |= XBBF_RESOURCE_SHORTAGE;
1268	xbb->request_shortages++;
1269
1270	if (nreq != NULL)
1271		xbb_release_req(xbb, nreq);
1272
1273	if (nreqlist != NULL)
1274		xbb_release_reqlist(xbb, nreqlist, /*wakeup*/ 0);
1275
1276	mtx_unlock(&xbb->lock);
1277
1278	return (1);
1279}
1280
1281/**
1282 * Create and queue a response to a blkif request.
1283 *
1284 * \param xbb     Per-instance xbb configuration structure.
1285 * \param req     The request structure to which to respond.
1286 * \param status  The status code to report.  See BLKIF_RSP_*
1287 *                in sys/xen/interface/io/blkif.h.
1288 */
1289static void
1290xbb_queue_response(struct xbb_softc *xbb, struct xbb_xen_req *req, int status)
1291{
1292	blkif_response_t *resp;
1293
1294	/*
1295	 * The mutex is required here, and should be held across this call
1296	 * until after the subsequent call to xbb_push_responses().  This
1297	 * is to guarantee that another context won't queue responses and
1298	 * push them while we're active.
1299	 *
1300	 * That could lead to the other end being notified of responses
1301	 * before the resources have been freed on this end.  The other end
1302	 * would then be able to queue additional I/O, and we may run out
1303 	 * of resources because we haven't freed them all yet.
1304	 */
1305	mtx_assert(&xbb->lock, MA_OWNED);
1306
1307	/*
1308	 * Place on the response ring for the relevant domain.
1309	 * For now, only the spacing between entries is different
1310	 * in the different ABIs, not the response entry layout.
1311	 */
1312	switch (xbb->abi) {
1313	case BLKIF_PROTOCOL_NATIVE:
1314		resp = RING_GET_RESPONSE(&xbb->rings.native,
1315					 xbb->rings.native.rsp_prod_pvt);
1316		break;
1317	case BLKIF_PROTOCOL_X86_32:
1318		resp = (blkif_response_t *)
1319		    RING_GET_RESPONSE(&xbb->rings.x86_32,
1320				      xbb->rings.x86_32.rsp_prod_pvt);
1321		break;
1322	case BLKIF_PROTOCOL_X86_64:
1323		resp = (blkif_response_t *)
1324		    RING_GET_RESPONSE(&xbb->rings.x86_64,
1325				      xbb->rings.x86_64.rsp_prod_pvt);
1326		break;
1327	default:
1328		panic("Unexpected blkif protocol ABI.");
1329	}
1330
1331	resp->id        = req->id;
1332	resp->operation = req->operation;
1333	resp->status    = status;
1334
1335	if (status != BLKIF_RSP_OKAY)
1336		xbb->reqs_completed_with_error++;
1337
1338	xbb->rings.common.rsp_prod_pvt++;
1339
1340	xbb->reqs_queued_for_completion++;
1341
1342}
1343
1344/**
1345 * Send queued responses to blkif requests.
1346 *
1347 * \param xbb            Per-instance xbb configuration structure.
1348 * \param run_taskqueue  Flag that is set to 1 if the taskqueue
1349 *			 should be run, 0 if it does not need to be run.
1350 * \param notify	 Flag that is set to 1 if the other end should be
1351 * 			 notified via irq, 0 if the other end should not be
1352 *			 notified.
1353 */
1354static void
1355xbb_push_responses(struct xbb_softc *xbb, int *run_taskqueue, int *notify)
1356{
1357	int more_to_do;
1358
1359	/*
1360	 * The mutex is required here.
1361	 */
1362	mtx_assert(&xbb->lock, MA_OWNED);
1363
1364	more_to_do = 0;
1365
1366	RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xbb->rings.common, *notify);
1367
1368	if (xbb->rings.common.rsp_prod_pvt == xbb->rings.common.req_cons) {
1369
1370		/*
1371		 * Tail check for pending requests. Allows frontend to avoid
1372		 * notifications if requests are already in flight (lower
1373		 * overheads and promotes batching).
1374		 */
1375		RING_FINAL_CHECK_FOR_REQUESTS(&xbb->rings.common, more_to_do);
1376	} else if (RING_HAS_UNCONSUMED_REQUESTS(&xbb->rings.common)) {
1377
1378		more_to_do = 1;
1379	}
1380
1381	xbb->reqs_completed += xbb->reqs_queued_for_completion;
1382	xbb->reqs_queued_for_completion = 0;
1383
1384	*run_taskqueue = more_to_do;
1385}
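/*
 * Usage sketch (illustrative), mirroring xbb_complete_reqlist() below:
 * responses are queued and pushed under a single hold of xbb->lock, and
 * the event channel is signalled only after the lock has been dropped:
 *
 *	mtx_lock(&xbb->lock);
 *	xbb_queue_response(xbb, nreq, status);		// repeat per request
 *	xbb_push_responses(xbb, &run_taskqueue, &notify);
 *	mtx_unlock(&xbb->lock);
 *	if (run_taskqueue)
 *		taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task);
 *	if (notify)
 *		xen_intr_signal(xbb->xen_intr_handle);
 */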
1386
1387/**
1388 * Complete a request list.
1389 *
1390 * \param xbb        Per-instance xbb configuration structure.
1391 * \param reqlist    Allocated internal request list structure.
1392 */
1393static void
1394xbb_complete_reqlist(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist)
1395{
1396	struct xbb_xen_req *nreq;
1397	off_t		    sectors_sent;
1398	int		    notify, run_taskqueue;
1399
1400	sectors_sent = 0;
1401
1402	if (reqlist->flags & XBB_REQLIST_MAPPED)
1403		xbb_unmap_reqlist(reqlist);
1404
1405	mtx_lock(&xbb->lock);
1406
1407	/*
1408	 * All I/O is done, send the response. A lock is not necessary
1409	 * to protect the request list, because all requests have
1410	 * completed.  Therefore this is the only context accessing this
1411	 * reqlist right now.  However, in order to make sure that no one
1412	 * else queues responses onto the queue or pushes them to the other
1413	 * side while we're active, we need to hold the lock across the
1414	 * calls to xbb_queue_response() and xbb_push_responses().
1415	 */
1416	STAILQ_FOREACH(nreq, &reqlist->contig_req_list, links) {
1417		off_t cur_sectors_sent;
1418
1419		/* Put this response on the ring, but don't push yet */
1420		xbb_queue_response(xbb, nreq, reqlist->status);
1421
1422		/* We don't report bytes sent if there is an error. */
1423		if (reqlist->status == BLKIF_RSP_OKAY)
1424			cur_sectors_sent = nreq->nr_512b_sectors;
1425		else
1426			cur_sectors_sent = 0;
1427
1428		sectors_sent += cur_sectors_sent;
1429
1430		devstat_end_transaction(xbb->xbb_stats_in,
1431					/*bytes*/cur_sectors_sent << 9,
1432					reqlist->ds_tag_type,
1433					reqlist->ds_trans_type,
1434					/*now*/NULL,
1435					/*then*/&nreq->ds_t0);
1436	}
1437
1438	/*
1439	 * Take out any sectors not sent.  If we wind up negative (which
1440	 * might happen if an error is reported as well as a residual), just
1441	 * report 0 sectors sent.
1442	 */
1443	sectors_sent -= reqlist->residual_512b_sectors;
1444	if (sectors_sent < 0)
1445		sectors_sent = 0;
1446
1447	devstat_end_transaction(xbb->xbb_stats,
1448				/*bytes*/ sectors_sent << 9,
1449				reqlist->ds_tag_type,
1450				reqlist->ds_trans_type,
1451				/*now*/NULL,
1452				/*then*/&reqlist->ds_t0);
1453
1454	xbb_release_reqlist(xbb, reqlist, /*wakeup*/ 1);
1455
1456	xbb_push_responses(xbb, &run_taskqueue, &notify);
1457
1458	mtx_unlock(&xbb->lock);
1459
1460	if (run_taskqueue)
1461		taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task);
1462
1463	if (notify)
1464		xen_intr_signal(xbb->xen_intr_handle);
1465}
1466
1467/**
1468 * Completion handler for buffer I/O requests issued by the device
1469 * backend driver.
1470 *
1471 * \param bio  The buffer I/O request on which to perform completion
1472 *             processing.
1473 */
1474static void
1475xbb_bio_done(struct bio *bio)
1476{
1477	struct xbb_softc       *xbb;
1478	struct xbb_xen_reqlist *reqlist;
1479
1480	reqlist = bio->bio_caller1;
1481	xbb     = reqlist->xbb;
1482
1483	reqlist->residual_512b_sectors += bio->bio_resid >> 9;
1484
1485	/*
1486	 * This is a bit imprecise.  With aggregated I/O a single
1487	 * request list can contain multiple front-end requests and
	 * multiple bios may point to a single request.  By carefully
1489	 * walking the request list, we could map residuals and errors
1490	 * back to the original front-end request, but the interface
1491	 * isn't sufficiently rich for us to properly report the error.
1492	 * So, we just treat the entire request list as having failed if an
1493	 * error occurs on any part.  And, if an error occurs, we treat
1494	 * the amount of data transferred as 0.
1495	 *
1496	 * For residuals, we report it on the overall aggregated device,
1497	 * but not on the individual requests, since we don't currently
1498	 * do the work to determine which front-end request to which the
1499	 * residual applies.
1500	 */
1501	if (bio->bio_error) {
1502		DPRINTF("BIO returned error %d for operation on device %s\n",
1503			bio->bio_error, xbb->dev_name);
1504		reqlist->status = BLKIF_RSP_ERROR;
1505
1506		if (bio->bio_error == ENXIO
1507		 && xenbus_get_state(xbb->dev) == XenbusStateConnected) {
1508
1509			/*
1510			 * Backend device has disappeared.  Signal the
1511			 * front-end that we (the device proxy) want to
1512			 * go away.
1513			 */
1514			xenbus_set_state(xbb->dev, XenbusStateClosing);
1515		}
1516	}
1517
1518#ifdef XBB_USE_BOUNCE_BUFFERS
1519	if (bio->bio_cmd == BIO_READ) {
1520		vm_offset_t kva_offset;
1521
1522		kva_offset = (vm_offset_t)bio->bio_data
1523			   - (vm_offset_t)reqlist->bounce;
1524		memcpy((uint8_t *)reqlist->kva + kva_offset,
1525		       bio->bio_data, bio->bio_bcount);
1526	}
1527#endif /* XBB_USE_BOUNCE_BUFFERS */
1528
1529	/*
1530	 * Decrement the pending count for the request list.  When we're
1531	 * done with the requests, send status back for all of them.
1532	 */
1533	if (atomic_fetchadd_int(&reqlist->pendcnt, -1) == 1)
1534		xbb_complete_reqlist(xbb, reqlist);
1535
1536	g_destroy_bio(bio);
1537}
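/*
 * Illustrative sketch (assumed; the device dispatch path elsewhere in this
 * file does the equivalent): each bio issued for a request list carries a
 * back pointer to the reqlist and names xbb_bio_done() as its completion
 * handler, with reqlist->pendcnt set to the bio count before any bio is
 * started:
 *
 *	bio->bio_caller1 = reqlist;
 *	bio->bio_done    = xbb_bio_done;
 *	...
 *	reqlist->pendcnt = nbio;
 *	for (i = 0; i < nbio; i++)
 *		(*xbb->backend.dev.csw->d_strategy)(bios[i]);
 */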
1538
1539/**
1540 * Parse a blkif request into an internal request structure and send
1541 * it to the backend for processing.
1542 *
1543 * \param xbb       Per-instance xbb configuration structure.
1544 * \param reqlist   Allocated internal request list structure.
1545 *
1546 * \return          On success, 0.  For resource shortages, non-zero.
1547 *
1548 * This routine performs the backend common aspects of request parsing
1549 * including compiling an internal request structure, parsing the S/G
1550 * list and any secondary ring requests in which they may reside, and
1551 * the mapping of front-end I/O pages into our domain.
1552 */
1553static int
1554xbb_dispatch_io(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist)
1555{
1556	struct xbb_sg                *xbb_sg;
1557	struct gnttab_map_grant_ref  *map;
1558	struct blkif_request_segment *sg;
1559	struct blkif_request_segment *last_block_sg;
1560	struct xbb_xen_req	     *nreq;
1561	u_int			      nseg;
1562	u_int			      seg_idx;
1563	u_int			      block_segs;
1564	int			      nr_sects;
1565	int			      total_sects;
1566	int			      operation;
1567	uint8_t			      bio_flags;
1568	int			      error;
1569
1570	reqlist->ds_tag_type = DEVSTAT_TAG_SIMPLE;
1571	bio_flags            = 0;
1572	total_sects	     = 0;
1573	nr_sects	     = 0;
1574
1575	/*
1576	 * First determine whether we have enough free KVA to satisfy this
1577	 * request list.  If not, tell xbb_run_queue() so it can go to
1578	 * sleep until we have more KVA.
1579	 */
1580	reqlist->kva = NULL;
1581	if (reqlist->nr_segments != 0) {
1582		reqlist->kva = xbb_get_kva(xbb, reqlist->nr_segments);
1583		if (reqlist->kva == NULL) {
1584			/*
1585			 * If we're out of KVA, return ENOMEM.
1586			 */
1587			return (ENOMEM);
1588		}
1589	}
1590
1591	binuptime(&reqlist->ds_t0);
1592	devstat_start_transaction(xbb->xbb_stats, &reqlist->ds_t0);
1593
1594	switch (reqlist->operation) {
1595	case BLKIF_OP_WRITE_BARRIER:
1596		bio_flags       |= BIO_ORDERED;
1597		reqlist->ds_tag_type = DEVSTAT_TAG_ORDERED;
1598		/* FALLTHROUGH */
1599	case BLKIF_OP_WRITE:
1600		operation = BIO_WRITE;
1601		reqlist->ds_trans_type = DEVSTAT_WRITE;
1602		if ((xbb->flags & XBBF_READ_ONLY) != 0) {
1603			DPRINTF("Attempt to write to read only device %s\n",
1604				xbb->dev_name);
1605			reqlist->status = BLKIF_RSP_ERROR;
1606			goto send_response;
1607		}
1608		break;
1609	case BLKIF_OP_READ:
1610		operation = BIO_READ;
1611		reqlist->ds_trans_type = DEVSTAT_READ;
1612		break;
1613	case BLKIF_OP_FLUSH_DISKCACHE:
1614		/*
1615		 * If this is true, the user has requested that we disable
1616		 * flush support.  So we just complete the requests
1617		 * successfully.
1618		 */
1619		if (xbb->disable_flush != 0) {
1620			goto send_response;
1621		}
1622
1623		/*
1624		 * The user has requested that we only send a real flush
1625		 * for every N flush requests.  So keep count, and either
1626		 * complete the request immediately or queue it for the
1627		 * backend.
1628		 */
1629		if (xbb->flush_interval != 0) {
1630		 	if (++(xbb->flush_count) < xbb->flush_interval) {
1631				goto send_response;
1632			} else
1633				xbb->flush_count = 0;
1634		}
1635
1636		operation = BIO_FLUSH;
1637		reqlist->ds_tag_type = DEVSTAT_TAG_ORDERED;
1638		reqlist->ds_trans_type = DEVSTAT_NO_DATA;
1639		goto do_dispatch;
1640		/*NOTREACHED*/
1641	default:
1642		DPRINTF("error: unknown block io operation [%d]\n",
1643			reqlist->operation);
1644		reqlist->status = BLKIF_RSP_ERROR;
1645		goto send_response;
1646	}
1647
1648	reqlist->xbb  = xbb;
1649	xbb_sg        = xbb->xbb_sgs;
1650	map	      = xbb->maps;
1651	seg_idx	      = 0;
1652
1653	STAILQ_FOREACH(nreq, &reqlist->contig_req_list, links) {
1654		blkif_request_t		*ring_req;
1655		RING_IDX		 req_ring_idx;
1656		u_int			 req_seg_idx;
1657
1658		ring_req	      = nreq->ring_req;
1659		req_ring_idx	      = nreq->req_ring_idx;
1660		nr_sects              = 0;
1661		nseg                  = ring_req->nr_segments;
1662		nreq->nr_pages        = nseg;
1663		nreq->nr_512b_sectors = 0;
1664		req_seg_idx	      = 0;
1665		sg	              = NULL;
1666
1667		/* Check that number of segments is sane. */
1668		if (__predict_false(nseg == 0)
1669		 || __predict_false(nseg > xbb->max_request_segments)) {
1670			DPRINTF("Bad number of segments in request (%d)\n",
1671				nseg);
1672			reqlist->status = BLKIF_RSP_ERROR;
1673			goto send_response;
1674		}
1675
1676		block_segs    = nseg;
1677		sg            = ring_req->seg;
1678		last_block_sg = sg + block_segs;
1679
1680		while (sg < last_block_sg) {
1681			KASSERT(seg_idx <
1682				XBB_MAX_SEGMENTS_PER_REQLIST,
1683				("seg_idx %d is too large, max "
1684				"segs %d\n", seg_idx,
1685				XBB_MAX_SEGMENTS_PER_REQLIST));
1686
1687			xbb_sg->first_sect = sg->first_sect;
1688			xbb_sg->last_sect  = sg->last_sect;
1689			xbb_sg->nsect =
1690			    (int8_t)(sg->last_sect -
1691			    sg->first_sect + 1);
1692
1693			if ((sg->last_sect >= (PAGE_SIZE >> 9))
1694			 || (xbb_sg->nsect <= 0)) {
1695				reqlist->status = BLKIF_RSP_ERROR;
1696				goto send_response;
1697			}
1698
1699			nr_sects += xbb_sg->nsect;
1700			map->host_addr = xbb_get_gntaddr(reqlist,
1701						seg_idx, /*sector*/0);
1702			KASSERT(map->host_addr + PAGE_SIZE <=
1703				xbb->ring_config.gnt_addr,
1704				("Host address %#jx len %d overlaps "
1705				 "ring address %#jx\n",
1706				(uintmax_t)map->host_addr, PAGE_SIZE,
1707				(uintmax_t)xbb->ring_config.gnt_addr));
1708
1709			map->flags     = GNTMAP_host_map;
1710			map->ref       = sg->gref;
1711			map->dom       = xbb->otherend_id;
1712			if (operation == BIO_WRITE)
1713				map->flags |= GNTMAP_readonly;
1714			sg++;
1715			map++;
1716			xbb_sg++;
1717			seg_idx++;
1718			req_seg_idx++;
1719		}
1720
1721		/* Convert to the disk's sector size */
1722		nreq->nr_512b_sectors = nr_sects;
1723		nr_sects = (nr_sects << 9) >> xbb->sector_size_shift;
1724		total_sects += nr_sects;
1725
1726		if ((nreq->nr_512b_sectors &
1727		    ((xbb->sector_size >> 9) - 1)) != 0) {
1728			device_printf(xbb->dev, "%s: I/O size (%d) is not "
1729				      "a multiple of the backing store sector "
1730				      "size (%d)\n", __func__,
1731				      nreq->nr_512b_sectors << 9,
1732				      xbb->sector_size);
1733			reqlist->status = BLKIF_RSP_ERROR;
1734			goto send_response;
1735		}
1736	}
1737
1738	error = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
1739					  xbb->maps, reqlist->nr_segments);
1740	if (error != 0)
1741		panic("Grant table operation failed (%d)", error);
1742
1743	reqlist->flags |= XBB_REQLIST_MAPPED;
1744
1745	for (seg_idx = 0, map = xbb->maps; seg_idx < reqlist->nr_segments;
1746	     seg_idx++, map++){
1747
1748		if (__predict_false(map->status != 0)) {
1749			DPRINTF("invalid buffer -- could not remap "
1750			        "it (%d)\n", map->status);
1751			DPRINTF("Mapping(%d): Host Addr 0x%"PRIx64", flags "
1752			        "0x%x ref 0x%x, dom %d\n", seg_idx,
1753				map->host_addr, map->flags, map->ref,
1754				map->dom);
1755			reqlist->status = BLKIF_RSP_ERROR;
1756			goto send_response;
1757		}
1758
1759		reqlist->gnt_handles[seg_idx] = map->handle;
1760	}
1761	if (reqlist->starting_sector_number + total_sects >
1762	    xbb->media_num_sectors) {
1763
1764		DPRINTF("%s of [%" PRIu64 ",%" PRIu64 "] "
1765			"extends past end of device %s\n",
1766			operation == BIO_READ ? "read" : "write",
1767			reqlist->starting_sector_number,
1768			reqlist->starting_sector_number + total_sects,
1769			xbb->dev_name);
1770		reqlist->status = BLKIF_RSP_ERROR;
1771		goto send_response;
1772	}
1773
1774do_dispatch:
1775
1776	error = xbb->dispatch_io(xbb,
1777				 reqlist,
1778				 operation,
1779				 bio_flags);
1780
1781	if (error != 0) {
1782		reqlist->status = BLKIF_RSP_ERROR;
1783		goto send_response;
1784	}
1785
1786	return (0);
1787
1788send_response:
1789
1790	xbb_complete_reqlist(xbb, reqlist);
1791
1792	return (0);
1793}
1794
1795static __inline int
1796xbb_count_sects(blkif_request_t *ring_req)
1797{
1798	int i;
1799	int cur_size = 0;
1800
1801	for (i = 0; i < ring_req->nr_segments; i++) {
1802		int nsect;
1803
1804		nsect = (int8_t)(ring_req->seg[i].last_sect -
1805			ring_req->seg[i].first_sect + 1);
1806		if (nsect <= 0)
1807			break;
1808
1809		cur_size += nsect;
1810	}
1811
1812	return (cur_size);
1813}
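/*
 * Example (illustrative): a request with two segments covering sectors
 * 0-7 and 0-3 of their pages counts (7 - 0 + 1) + (3 - 0 + 1) = 12
 * 512-byte sectors, i.e. 6 KiB.
 */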
1814
1815/**
1816 * Process incoming requests from the shared communication ring in response
1817 * to a signal on the ring's event channel.
1818 *
 * \param context  Callback argument registered during task initialization -
1820 *                 the xbb_softc for this instance.
1821 * \param pending  The number of taskqueue_enqueue events that have
1822 *                 occurred since this handler was last run.
1823 */
1824static void
1825xbb_run_queue(void *context, int pending)
1826{
1827	struct xbb_softc       *xbb;
1828	blkif_back_rings_t     *rings;
1829	RING_IDX		rp;
1830	uint64_t		cur_sector;
1831	int			cur_operation;
1832	struct xbb_xen_reqlist *reqlist;
1833
1834
1835	xbb   = (struct xbb_softc *)context;
1836	rings = &xbb->rings;
1837
1838	/*
1839	 * Work gather and dispatch loop.  Note that we have a bias here
1840	 * towards gathering I/O sent by blockfront.  We first gather up
1841	 * everything in the ring, as long as we have resources.  Then we
1842	 * dispatch one request, and then attempt to gather up any
1843	 * additional requests that have come in while we were dispatching
1844	 * the request.
1845	 *
1846	 * This allows us to get a clearer picture (via devstat) of how
1847	 * many requests blockfront is queueing to us at any given time.
1848	 */
1849	for (;;) {
1850		int retval;
1851
1852		/*
1853		 * Initialize reqlist to the last element in the pending
1854		 * queue, if there is one.  This allows us to add more
1855		 * requests to that request list, if we have room.
1856		 */
1857		reqlist = STAILQ_LAST(&xbb->reqlist_pending_stailq,
1858				      xbb_xen_reqlist, links);
1859		if (reqlist != NULL) {
1860			cur_sector = reqlist->next_contig_sector;
1861			cur_operation = reqlist->operation;
1862		} else {
1863			cur_operation = 0;
1864			cur_sector    = 0;
1865		}
1866
1867		/*
1868		 * Cache req_prod to avoid accessing a cache line shared
1869		 * with the frontend.
1870		 */
1871		rp = rings->common.sring->req_prod;
1872
1873		/* Ensure we see queued requests up to 'rp'. */
1874		rmb();
1875
1876		/**
1877		 * Run so long as there is work to consume and the generation
1878		 * of a response will not overflow the ring.
1879		 *
1880		 * @note There's a 1 to 1 relationship between requests and
1881		 *       responses, so an overflow should never occur.  This
1882		 *       test is to protect our domain from digesting bogus
1883		 *       data.  Shouldn't we log this?
1884		 */
1885		while (rings->common.req_cons != rp
1886		    && RING_REQUEST_CONS_OVERFLOW(&rings->common,
1887						  rings->common.req_cons) == 0) {
1888			blkif_request_t	        ring_req_storage;
1889			blkif_request_t	       *ring_req;
1890			int			cur_size;
1891
1892			switch (xbb->abi) {
1893			case BLKIF_PROTOCOL_NATIVE:
1894				ring_req = RING_GET_REQUEST(&xbb->rings.native,
1895				    rings->common.req_cons);
1896				break;
1897			case BLKIF_PROTOCOL_X86_32:
1898			{
1899				struct blkif_x86_32_request *ring_req32;
1900
1901				ring_req32 = RING_GET_REQUEST(
1902				    &xbb->rings.x86_32, rings->common.req_cons);
1903				blkif_get_x86_32_req(&ring_req_storage,
1904						     ring_req32);
1905				ring_req = &ring_req_storage;
1906				break;
1907			}
1908			case BLKIF_PROTOCOL_X86_64:
1909			{
1910				struct blkif_x86_64_request *ring_req64;
1911
1912				ring_req64 = RING_GET_REQUEST(&xbb->rings.x86_64,
1913				    rings->common.req_cons);
1914				blkif_get_x86_64_req(&ring_req_storage,
1915						     ring_req64);
1916				ring_req = &ring_req_storage;
1917				break;
1918			}
1919			default:
1920				panic("Unexpected blkif protocol ABI.");
1921				/* NOTREACHED */
1922			}
1923
1924			/*
1925			 * Check for situations that would require closing
1926			 * off this I/O for further coalescing:
1927			 *  - Coalescing is turned off.
1928			 *  - Current I/O is out of sequence with the previous
1929			 *    I/O.
1930			 *  - Coalesced I/O would be too large.
1931			 */
1932			if ((reqlist != NULL)
1933			 && ((xbb->no_coalesce_reqs != 0)
1934			  || ((xbb->no_coalesce_reqs == 0)
1935			   && ((ring_req->sector_number != cur_sector)
1936			    || (ring_req->operation != cur_operation)
1937			    || ((ring_req->nr_segments + reqlist->nr_segments) >
1938			         xbb->max_reqlist_segments))))) {
1939				reqlist = NULL;
1940			}
1941
1942			/*
1943			 * Grab and check for all resources in one shot.
1944			 * If we can't get all of the resources we need,
1945			 * the shortage is noted and the thread will get
1946			 * woken up when more resources are available.
1947			 */
1948			retval = xbb_get_resources(xbb, &reqlist, ring_req,
1949						   xbb->rings.common.req_cons);
1950
1951			if (retval != 0) {
1952				/*
1953				 * Resource shortage has been recorded.
1954				 * We'll be scheduled to run once a request
1955				 * object frees up due to a completion.
1956				 */
1957				break;
1958			}
1959
1960			/*
1961			 * Signify that we can overwrite this request with
1962			 * a response by incrementing our consumer index.
1963			 * The response won't be generated until after
1964			 * we've already consumed all necessary data out
1965			 * of the version of the request in the ring buffer
1966			 * (for native mode).  We must update the consumer
1967			 * index before issuing back-end I/O so there is
1968			 * no possibility that it will complete and a
1969			 * response be generated before we make room in
1970			 * the queue for that response.
1971			 */
1972			xbb->rings.common.req_cons++;
1973			xbb->reqs_received++;
1974
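			/*
			 * Record where the next contiguous request would
			 * need to start in order to be coalesced into
			 * this request list.
			 */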
1975			cur_size = xbb_count_sects(ring_req);
1976			cur_sector = ring_req->sector_number + cur_size;
1977			reqlist->next_contig_sector = cur_sector;
1978			cur_operation = ring_req->operation;
1979		}
1980
1981		/* Check for I/O to dispatch */
1982		reqlist = STAILQ_FIRST(&xbb->reqlist_pending_stailq);
1983		if (reqlist == NULL) {
1984			/*
1985			 * We're out of work to do, put the task queue to
1986			 * sleep.
1987			 */
1988			break;
1989		}
1990
1991		/*
1992		 * Grab the first request off the queue and attempt
1993		 * to dispatch it.
1994		 */
1995		STAILQ_REMOVE_HEAD(&xbb->reqlist_pending_stailq, links);
1996
1997		retval = xbb_dispatch_io(xbb, reqlist);
1998		if (retval != 0) {
1999			/*
2000			 * xbb_dispatch_io() returns non-zero only when
2001			 * there is a resource shortage.  If that's the
2002			 * case, re-queue this request on the head of the
2003			 * queue, and go to sleep until we have more
2004			 * resources.
2005			 */
2006			STAILQ_INSERT_HEAD(&xbb->reqlist_pending_stailq,
2007					   reqlist, links);
2008			break;
2009		} else {
2010			/*
2011			 * If we still have anything on the queue after
2012			 * removing the head entry, that is because we
2013			 * met one of the criteria to create a new
2014			 * request list (outlined above), and we'll call
2015			 * that a forced dispatch for statistical purposes.
2016			 *
2017			 * Otherwise, if there is only one element on the
2018			 * queue, we coalesced everything available on
2019			 * the ring and we'll call that a normal dispatch.
2020			 */
2021			reqlist = STAILQ_FIRST(&xbb->reqlist_pending_stailq);
2022
2023			if (reqlist != NULL)
2024				xbb->forced_dispatch++;
2025			else
2026				xbb->normal_dispatch++;
2027
2028			xbb->total_dispatch++;
2029		}
2030	}
2031}
2032
2033/**
2034 * Interrupt handler bound to the shared ring's event channel.
2035 *
2036 * \param arg  Callback argument registered during event channel
2037 *             binding - the xbb_softc for this instance.
2038 */
2039static int
2040xbb_filter(void *arg)
2041{
2042	struct xbb_softc *xbb;
2043
2044	/* Defer to taskqueue thread. */
2045	xbb = (struct xbb_softc *)arg;
2046	taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task);
2047
2048	return (FILTER_HANDLED);
2049}
2050
2051SDT_PROVIDER_DEFINE(xbb);
2052SDT_PROBE_DEFINE1(xbb, kernel, xbb_dispatch_dev, flush, "int");
2053SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_dev, read, "int", "uint64_t",
2054		  "uint64_t");
2055SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_dev, write, "int",
2056		  "uint64_t", "uint64_t");
2057
2058/*----------------------------- Backend Handlers -----------------------------*/
2059/**
2060 * Backend handler for character device access.
2061 *
2062 * \param xbb        Per-instance xbb configuration structure.
2063 * \param reqlist    Allocated internal request list structure.
2064 * \param operation  BIO_* I/O operation code.
2065 * \param bio_flags  Additional bio_flag data to pass to any generated
2066 *                   bios (e.g. BIO_ORDERED).
2067 *
2068 * \return  0 for success, errno codes for failure.
2069 */
2070static int
2071xbb_dispatch_dev(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist,
2072		 int operation, int bio_flags)
2073{
2074	struct xbb_dev_data *dev_data;
2075	struct bio          *bios[XBB_MAX_SEGMENTS_PER_REQLIST];
2076	off_t                bio_offset;
2077	struct bio          *bio;
2078	struct xbb_sg       *xbb_sg;
2079	u_int	             nbio;
2080	u_int                bio_idx;
2081	u_int		     nseg;
2082	u_int                seg_idx;
2083	int                  error;
2084
2085	dev_data   = &xbb->backend.dev;
2086	bio_offset = (off_t)reqlist->starting_sector_number
2087		   << xbb->sector_size_shift;
2088	error      = 0;
2089	nbio       = 0;
2090	bio_idx    = 0;
2091
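	/* A cache flush is issued as a single ordered bio with no data. */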
2092	if (operation == BIO_FLUSH) {
2093		bio = g_new_bio();
2094		if (__predict_false(bio == NULL)) {
2095			DPRINTF("Unable to allocate bio for BIO_FLUSH\n");
2096			error = ENOMEM;
2097			return (error);
2098		}
2099
2100		bio->bio_cmd	 = BIO_FLUSH;
2101		bio->bio_flags	|= BIO_ORDERED;
2102		bio->bio_dev	 = dev_data->cdev;
2103		bio->bio_offset	 = 0;
2104		bio->bio_data	 = 0;
2105		bio->bio_done	 = xbb_bio_done;
2106		bio->bio_caller1 = reqlist;
2107		bio->bio_pblkno	 = 0;
2108
2109		reqlist->pendcnt = 1;
2110
2111		SDT_PROBE1(xbb, kernel, xbb_dispatch_dev, flush,
2112			   device_get_unit(xbb->dev));
2113
2114		(*dev_data->csw->d_strategy)(bio);
2115
2116		return (0);
2117	}
2118
2119	xbb_sg = xbb->xbb_sgs;
2120	bio    = NULL;
2121	nseg = reqlist->nr_segments;
2122
2123	for (seg_idx = 0; seg_idx < nseg; seg_idx++, xbb_sg++) {
2124
2125		/*
2126		 * KVA will not be contiguous, so any additional
2127		 * I/O will need to be represented in a new bio.
2128		 */
2129		if ((bio != NULL)
2130		 && (xbb_sg->first_sect != 0)) {
2131			if ((bio->bio_length & (xbb->sector_size - 1)) != 0) {
2132				printf("%s: Discontiguous I/O request "
2133				       "from domain %d ends on "
2134				       "non-sector boundary\n",
2135				       __func__, xbb->otherend_id);
2136				error = EINVAL;
2137				goto fail_free_bios;
2138			}
2139			bio = NULL;
2140		}
2141
2142		if (bio == NULL) {
2143			/*
2144			 * Make sure that the start of this bio is
2145			 * aligned to a device sector.
2146			 */
2147			if ((bio_offset & (xbb->sector_size - 1)) != 0) {
2148				printf("%s: Misaligned I/O request "
2149				       "from domain %d\n", __func__,
2150				       xbb->otherend_id);
2151				error = EINVAL;
2152				goto fail_free_bios;
2153			}
2154
2155			bio = bios[nbio++] = g_new_bio();
2156			if (__predict_false(bio == NULL)) {
2157				error = ENOMEM;
2158				goto fail_free_bios;
2159			}
2160			bio->bio_cmd     = operation;
2161			bio->bio_flags  |= bio_flags;
2162			bio->bio_dev     = dev_data->cdev;
2163			bio->bio_offset  = bio_offset;
2164			bio->bio_data    = xbb_reqlist_ioaddr(reqlist, seg_idx,
2165						xbb_sg->first_sect);
2166			bio->bio_done    = xbb_bio_done;
2167			bio->bio_caller1 = reqlist;
2168			bio->bio_pblkno  = bio_offset >> xbb->sector_size_shift;
2169		}
2170
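		/* nsect counts 512 byte sectors; << 9 converts to bytes. */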
2171		bio->bio_length += xbb_sg->nsect << 9;
2172		bio->bio_bcount  = bio->bio_length;
2173		bio_offset      += xbb_sg->nsect << 9;
2174
2175		if (xbb_sg->last_sect != (PAGE_SIZE - 512) >> 9) {
2176
2177			if ((bio->bio_length & (xbb->sector_size - 1)) != 0) {
2178				printf("%s: Discontiguous I/O request "
2179				       "from domain %d ends on "
2180				       "non-sector boundary\n",
2181				       __func__, xbb->otherend_id);
2182				error = EINVAL;
2183				goto fail_free_bios;
2184			}
2185			/*
2186			 * KVA will not be contiguous, so any additional
2187			 * I/O will need to be represented in a new bio.
2188			 */
2189			bio = NULL;
2190		}
2191	}
2192
2193	reqlist->pendcnt = nbio;
2194
2195	for (bio_idx = 0; bio_idx < nbio; bio_idx++) {
2197#ifdef XBB_USE_BOUNCE_BUFFERS
2198		vm_offset_t kva_offset;
2199
2200		kva_offset = (vm_offset_t)bios[bio_idx]->bio_data
2201			   - (vm_offset_t)reqlist->bounce;
2202		if (operation == BIO_WRITE) {
2203			memcpy(bios[bio_idx]->bio_data,
2204			       (uint8_t *)reqlist->kva + kva_offset,
2205			       bios[bio_idx]->bio_bcount);
2206		}
2207#endif
2208		if (operation == BIO_READ) {
2209			SDT_PROBE3(xbb, kernel, xbb_dispatch_dev, read,
2210				   device_get_unit(xbb->dev),
2211				   bios[bio_idx]->bio_offset,
2212				   bios[bio_idx]->bio_length);
2213		} else if (operation == BIO_WRITE) {
2214			SDT_PROBE3(xbb, kernel, xbb_dispatch_dev, write,
2215				   device_get_unit(xbb->dev),
2216				   bios[bio_idx]->bio_offset,
2217				   bios[bio_idx]->bio_length);
2218		}
2219		(*dev_data->csw->d_strategy)(bios[bio_idx]);
2220	}
2221
2222	return (error);
2223
2224fail_free_bios:
2225	for (bio_idx = 0; bio_idx < (nbio-1); bio_idx++)
2226		g_destroy_bio(bios[bio_idx]);
2227
2228	return (error);
2229}
2230
2231SDT_PROBE_DEFINE1(xbb, kernel, xbb_dispatch_file, flush, "int");
2232SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_file, read, "int", "uint64_t",
2233		  "uint64_t");
2234SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_file, write, "int",
2235		  "uint64_t", "uint64_t");
2236
2237/**
2238 * Backend handler for file access.
2239 *
2240 * \param xbb        Per-instance xbb configuration structure.
2241 * \param reqlist    Allocated internal request list.
2242 * \param operation  BIO_* I/O operation code.
2243 * \param flags      Additional bio_flag data to pass to any generated bios
2244 *                   (e.g. BIO_ORDERED).
2245 *
2246 * \return  0 for success, errno codes for failure.
2247 */
2248static int
2249xbb_dispatch_file(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist,
2250		  int operation, int flags)
2251{
2252	struct xbb_file_data *file_data;
2253	u_int                 seg_idx;
2254	u_int		      nseg;
2255	struct uio            xuio;
2256	struct xbb_sg        *xbb_sg;
2257	struct iovec         *xiovec;
2258#ifdef XBB_USE_BOUNCE_BUFFERS
2259	void                **p_vaddr;
2260	int                   saved_uio_iovcnt;
2261#endif /* XBB_USE_BOUNCE_BUFFERS */
2262	int                   error;
2263
2264	file_data = &xbb->backend.file;
2265	error = 0;
2266	bzero(&xuio, sizeof(xuio));
2267
2268	switch (operation) {
2269	case BIO_READ:
2270		xuio.uio_rw = UIO_READ;
2271		break;
2272	case BIO_WRITE:
2273		xuio.uio_rw = UIO_WRITE;
2274		break;
2275	case BIO_FLUSH: {
2276		struct mount *mountpoint;
2277
2278		SDT_PROBE1(xbb, kernel, xbb_dispatch_file, flush,
2279			   device_get_unit(xbb->dev));
2280
2281		(void) vn_start_write(xbb->vn, &mountpoint, V_WAIT);
2282
2283		vn_lock(xbb->vn, LK_EXCLUSIVE | LK_RETRY);
2284		error = VOP_FSYNC(xbb->vn, MNT_WAIT, curthread);
2285		VOP_UNLOCK(xbb->vn, 0);
2286
2287		vn_finished_write(mountpoint);
2288
2289		goto bailout_send_response;
2290		/* NOTREACHED */
2291	}
2292	default:
2293		panic("invalid operation %d", operation);
2294		/* NOTREACHED */
2295	}
2296	xuio.uio_offset = (vm_offset_t)reqlist->starting_sector_number
2297			<< xbb->sector_size_shift;
2298	xuio.uio_segflg = UIO_SYSSPACE;
2299	xuio.uio_iov = file_data->xiovecs;
2300	xuio.uio_iovcnt = 0;
2301	xbb_sg = xbb->xbb_sgs;
2302	nseg = reqlist->nr_segments;
2303
2304	for (xiovec = NULL, seg_idx = 0; seg_idx < nseg; seg_idx++, xbb_sg++) {
2305
2306		/*
2307		 * If the first sector is not 0, the KVA will
2308		 * not be contiguous and we'll need to go on
2309		 * to another segment.
2310		 */
2311		if (xbb_sg->first_sect != 0)
2312			xiovec = NULL;
2313
2314		if (xiovec == NULL) {
2315			xiovec = &file_data->xiovecs[xuio.uio_iovcnt];
2316			xiovec->iov_base = xbb_reqlist_ioaddr(reqlist,
2317			    seg_idx, xbb_sg->first_sect);
2318#ifdef XBB_USE_BOUNCE_BUFFERS
2319			/*
2320			 * Store the address of the incoming
2321			 * buffer at this particular offset
2322			 * as well, so we can do the copy
2323			 * later without having to do more
2324			 * work to recalculate this address.
2325			 */
2326			p_vaddr = &file_data->xiovecs_vaddr[xuio.uio_iovcnt];
2327			*p_vaddr = xbb_reqlist_vaddr(reqlist, seg_idx,
2328			    xbb_sg->first_sect);
2329#endif /* XBB_USE_BOUNCE_BUFFERS */
2330			xiovec->iov_len = 0;
2331			xuio.uio_iovcnt++;
2332		}
2333
2334		xiovec->iov_len += xbb_sg->nsect << 9;
2335
2336		xuio.uio_resid += xbb_sg->nsect << 9;
2337
2338		/*
2339		 * If the last sector is not the full page
2340		 * size count, the next segment will not be
2341		 * contiguous in KVA and we need a new iovec.
2342		 */
2343		if (xbb_sg->last_sect != (PAGE_SIZE - 512) >> 9)
2344			xiovec = NULL;
2345	}
2346
2347	xuio.uio_td = curthread;
2348
2349#ifdef XBB_USE_BOUNCE_BUFFERS
2350	saved_uio_iovcnt = xuio.uio_iovcnt;
2351
2352	if (operation == BIO_WRITE) {
2353		/* Copy the write data to the local buffer. */
2354		for (seg_idx = 0, p_vaddr = file_data->xiovecs_vaddr,
2355		     xiovec = xuio.uio_iov; seg_idx < xuio.uio_iovcnt;
2356		     seg_idx++, xiovec++, p_vaddr++) {
2357
2358			memcpy(xiovec->iov_base, *p_vaddr, xiovec->iov_len);
2359		}
2360	} else {
2361		/*
2362		 * We only need to save off the iovecs in the case of a
2363		 * read, because the copy for the read happens after the
2364		 * VOP_READ().  (The uio will get modified in that call
2365		 * sequence.)
2366		 */
2367		memcpy(file_data->saved_xiovecs, xuio.uio_iov,
2368		       xuio.uio_iovcnt * sizeof(xuio.uio_iov[0]));
2369	}
2370#endif /* XBB_USE_BOUNCE_BUFFERS */
2371
2372	switch (operation) {
2373	case BIO_READ:
2374
2375		SDT_PROBE3(xbb, kernel, xbb_dispatch_file, read,
2376			   device_get_unit(xbb->dev), xuio.uio_offset,
2377			   xuio.uio_resid);
2378
2379		vn_lock(xbb->vn, LK_EXCLUSIVE | LK_RETRY);
2380
2381		/*
2382		 * UFS pays attention to IO_DIRECT for reads.  If the
2383		 * DIRECTIO option is configured into the kernel, it calls
2384		 * ffs_rawread().  But that only works for single-segment
2385		 * uios with user space addresses.  In our case, with a
2386		 * kernel uio, it still reads into the buffer cache, but it
2387		 * will just try to release the buffer from the cache later
2388		 * on in ffs_read().
2389		 *
2390		 * ZFS does not pay attention to IO_DIRECT for reads.
2391		 *
2392		 * UFS does not pay attention to IO_SYNC for reads.
2393		 *
2394		 * ZFS pays attention to IO_SYNC (which translates into the
2395		 * Solaris define FRSYNC for zfs_read()) for reads.  It
2396		 * attempts to sync the file before reading.
2397		 *
2398		 * So, to attempt to provide some barrier semantics in the
2399		 * BIO_ORDERED case, set both IO_DIRECT and IO_SYNC.
2400		 */
2401		error = VOP_READ(xbb->vn, &xuio, (flags & BIO_ORDERED) ?
2402				 (IO_DIRECT|IO_SYNC) : 0, file_data->cred);
2403
2404		VOP_UNLOCK(xbb->vn, 0);
2405		break;
2406	case BIO_WRITE: {
2407		struct mount *mountpoint;
2408
2409		SDT_PROBE3(xbb, kernel, xbb_dispatch_file, write,
2410			   device_get_unit(xbb->dev), xuio.uio_offset,
2411			   xuio.uio_resid);
2412
2413		(void)vn_start_write(xbb->vn, &mountpoint, V_WAIT);
2414
2415		vn_lock(xbb->vn, LK_EXCLUSIVE | LK_RETRY);
2416
2417		/*
2418		 * UFS pays attention to IO_DIRECT for writes.  The write
2419		 * is done asynchronously.  (Normally the write would just
2420		 * get put into the cache.)
2421		 *
2422		 * UFS pays attention to IO_SYNC for writes.  It will
2423		 * attempt to write the buffer out synchronously if that
2424		 * flag is set.
2425		 *
2426		 * ZFS does not pay attention to IO_DIRECT for writes.
2427		 *
2428		 * ZFS pays attention to IO_SYNC (a.k.a. FSYNC or FRSYNC)
2429		 * for writes.  It will flush the transaction from the
2430		 * cache before returning.
2431		 *
2432		 * So if we've got the BIO_ORDERED flag set, we want
2433		 * IO_SYNC in either the UFS or ZFS case.
2434		 */
2435		error = VOP_WRITE(xbb->vn, &xuio, (flags & BIO_ORDERED) ?
2436				  IO_SYNC : 0, file_data->cred);
2437		VOP_UNLOCK(xbb->vn, 0);
2438
2439		vn_finished_write(mountpoint);
2440
2441		break;
2442	}
2443	default:
2444		panic("invalid operation %d", operation);
2445		/* NOTREACHED */
2446	}
2447
2448#ifdef XBB_USE_BOUNCE_BUFFERS
2449	/* We only need to copy here for read operations */
2450	if (operation == BIO_READ) {
2451
2452		for (seg_idx = 0, p_vaddr = file_data->xiovecs_vaddr,
2453		     xiovec = file_data->saved_xiovecs;
2454		     seg_idx < saved_uio_iovcnt; seg_idx++,
2455		     xiovec++, p_vaddr++) {
2456
2457			/*
2458			 * Note that we have to use the copy of the
2459			 * io vector we made above.  uiomove() modifies
2460			 * the uio and its referenced vector as it
2461			 * performs the copy, so we can't rely on any
2462			 * state from the original uio.
2463			 */
2464			memcpy(*p_vaddr, xiovec->iov_base, xiovec->iov_len);
2465		}
2466	}
2467#endif /* XBB_USE_BOUNCE_BUFFERS */
2468
2469bailout_send_response:
2470
2471	if (error != 0)
2472		reqlist->status = BLKIF_RSP_ERROR;
2473
2474	xbb_complete_reqlist(xbb, reqlist);
2475
2476	return (0);
2477}
2478
2479/*--------------------------- Backend Configuration --------------------------*/
2480/**
2481 * Close and cleanup any backend device/file specific state for this
2482 * block back instance.
2483 *
2484 * \param xbb  Per-instance xbb configuration structure.
2485 */
2486static void
2487xbb_close_backend(struct xbb_softc *xbb)
2488{
2489	DROP_GIANT();
2490	DPRINTF("closing dev=%s\n", xbb->dev_name);
2491	if (xbb->vn) {
2492		int flags = FREAD;
2493
2494		if ((xbb->flags & XBBF_READ_ONLY) == 0)
2495			flags |= FWRITE;
2496
2497		switch (xbb->device_type) {
2498		case XBB_TYPE_DISK:
2499			if (xbb->backend.dev.csw) {
2500				dev_relthread(xbb->backend.dev.cdev,
2501					      xbb->backend.dev.dev_ref);
2502				xbb->backend.dev.csw  = NULL;
2503				xbb->backend.dev.cdev = NULL;
2504			}
2505			break;
2506		case XBB_TYPE_FILE:
2507			break;
2508		case XBB_TYPE_NONE:
2509		default:
2510			panic("Unexpected backend type.");
2511			break;
2512		}
2513
2514		(void)vn_close(xbb->vn, flags, NOCRED, curthread);
2515		xbb->vn = NULL;
2516
2517		switch (xbb->device_type) {
2518		case XBB_TYPE_DISK:
2519			break;
2520		case XBB_TYPE_FILE:
2521			if (xbb->backend.file.cred != NULL) {
2522				crfree(xbb->backend.file.cred);
2523				xbb->backend.file.cred = NULL;
2524			}
2525			break;
2526		case XBB_TYPE_NONE:
2527		default:
2528			panic("Unexpected backend type.");
2529			break;
2530		}
2531	}
2532	PICKUP_GIANT();
2533}
2534
2535/**
2536 * Open a character device to be used for backend I/O.
2537 *
2538 * \param xbb  Per-instance xbb configuration structure.
2539 *
2540 * \return  0 for success, errno codes for failure.
2541 */
2542static int
2543xbb_open_dev(struct xbb_softc *xbb)
2544{
2545	struct vattr   vattr;
2546	struct cdev   *dev;
2547	struct cdevsw *devsw;
2548	int	       error;
2549
2550	xbb->device_type = XBB_TYPE_DISK;
2551	xbb->dispatch_io = xbb_dispatch_dev;
2552	xbb->backend.dev.cdev = xbb->vn->v_rdev;
2553	xbb->backend.dev.csw = dev_refthread(xbb->backend.dev.cdev,
2554					     &xbb->backend.dev.dev_ref);
2555	if (xbb->backend.dev.csw == NULL)
2556		panic("Unable to retrieve device switch");
2557
2558	error = VOP_GETATTR(xbb->vn, &vattr, NOCRED);
2559	if (error) {
2560		xenbus_dev_fatal(xbb->dev, error, "error getting "
2561				 "vnode attributes for device %s",
2562				 xbb->dev_name);
2563		return (error);
2564	}
2565
2566
2567	dev = xbb->vn->v_rdev;
2568	devsw = dev->si_devsw;
2569	if (!devsw->d_ioctl) {
2570		xenbus_dev_fatal(xbb->dev, ENODEV, "no d_ioctl for "
2571				 "device %s!", xbb->dev_name);
2572		return (ENODEV);
2573	}
2574
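	/*
	 * Query the backing device for its sector size and media size
	 * via its ioctl entry point.
	 */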
2575	error = devsw->d_ioctl(dev, DIOCGSECTORSIZE,
2576			       (caddr_t)&xbb->sector_size, FREAD,
2577			       curthread);
2578	if (error) {
2579		xenbus_dev_fatal(xbb->dev, error,
2580				 "error calling ioctl DIOCGSECTORSIZE "
2581				 "for device %s", xbb->dev_name);
2582		return (error);
2583	}
2584
2585	error = devsw->d_ioctl(dev, DIOCGMEDIASIZE,
2586			       (caddr_t)&xbb->media_size, FREAD,
2587			       curthread);
2588	if (error) {
2589		xenbus_dev_fatal(xbb->dev, error,
2590				 "error calling ioctl DIOCGMEDIASIZE "
2591				 "for device %s", xbb->dev_name);
2592		return (error);
2593	}
2594
2595	return (0);
2596}
2597
2598/**
2599 * Open a file to be used for backend I/O.
2600 *
2601 * \param xbb  Per-instance xbb configuration structure.
2602 *
2603 * \return  0 for success, errno codes for failure.
2604 */
2605static int
2606xbb_open_file(struct xbb_softc *xbb)
2607{
2608	struct xbb_file_data *file_data;
2609	struct vattr          vattr;
2610	int                   error;
2611
2612	file_data = &xbb->backend.file;
2613	xbb->device_type = XBB_TYPE_FILE;
2614	xbb->dispatch_io = xbb_dispatch_file;
2615	error = VOP_GETATTR(xbb->vn, &vattr, curthread->td_ucred);
2616	if (error != 0) {
2617		xenbus_dev_fatal(xbb->dev, error,
2618				 "error calling VOP_GETATTR() "
2619				 "for file %s", xbb->dev_name);
2620		return (error);
2621	}
2622
2623	/*
2624	 * Verify that we have the ability to upgrade to exclusive
2625	 * access on this file so we can trap errors at open instead
2626	 * of reporting them during first access.
2627	 */
2628	if (VOP_ISLOCKED(xbb->vn) != LK_EXCLUSIVE) {
2629		vn_lock(xbb->vn, LK_UPGRADE | LK_RETRY);
2630		if (xbb->vn->v_iflag & VI_DOOMED) {
2631			error = EBADF;
2632			xenbus_dev_fatal(xbb->dev, error,
2633					 "error locking file %s",
2634					 xbb->dev_name);
2635
2636			return (error);
2637		}
2638	}
2639
2640	file_data->cred = crhold(curthread->td_ucred);
2641	xbb->media_size = vattr.va_size;
2642
2643	/*
2644	 * XXX KDM vattr.va_blocksize may be larger than 512 bytes here.
2645	 * With ZFS, it is 131072 bytes.  Block sizes that large don't work
2646	 * with disklabel and UFS on FreeBSD at least.  Large block sizes
2647	 * may not work with other OSes as well.  So just export a sector
2648	 * size of 512 bytes, which should work with any OS or
2649	 * application.  Since our backing is a file, any block size will
2650	 * work fine for the backing store.
2651	 */
2652#if 0
2653	xbb->sector_size = vattr.va_blocksize;
2654#endif
2655	xbb->sector_size = 512;
2656
2657	/*
2658	 * Sanity check.  The media size has to be at least one
2659	 * sector long.
2660	 */
2661	if (xbb->media_size < xbb->sector_size) {
2662		error = EINVAL;
2663		xenbus_dev_fatal(xbb->dev, error,
2664				 "file %s size %ju < block size %u",
2665				 xbb->dev_name,
2666				 (uintmax_t)xbb->media_size,
2667				 xbb->sector_size);
2668	}
2669	return (error);
2670}
2671
2672/**
2673 * Open the backend provider for this connection.
2674 *
2675 * \param xbb  Per-instance xbb configuration structure.
2676 *
2677 * \return  0 for success, errno codes for failure.
2678 */
2679static int
2680xbb_open_backend(struct xbb_softc *xbb)
2681{
2682	struct nameidata nd;
2683	int		 flags;
2684	int		 error;
2685
2686	flags = FREAD;
2687	error = 0;
2688
2689	DPRINTF("opening dev=%s\n", xbb->dev_name);
2690
2691	if (rootvnode == NULL) {
2692		xenbus_dev_fatal(xbb->dev, ENOENT,
2693				 "Root file system not mounted");
2694		return (ENOENT);
2695	}
2696
2697	if ((xbb->flags & XBBF_READ_ONLY) == 0)
2698		flags |= FWRITE;
2699
2700	pwd_ensure_dirs();
2701
2702 again:
2703	NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, xbb->dev_name, curthread);
2704	error = vn_open(&nd, &flags, 0, NULL);
2705	if (error) {
2706		/*
2707		 * This is the only reasonable guess we can make for the
2708		 * path if the user doesn't give us a fully qualified one.
2709		 * If they want to specify a file, they need to specify the
2710		 * full path.
2711		 */
2712		if (xbb->dev_name[0] != '/') {
2713			char *dev_path = "/dev/";
2714			char *dev_name;
2715
2716			/* Try adding device path at beginning of name */
2717			dev_name = malloc(strlen(xbb->dev_name)
2718					+ strlen(dev_path) + 1,
2719					  M_XENBLOCKBACK, M_NOWAIT);
2720			if (dev_name) {
2721				sprintf(dev_name, "%s%s", dev_path,
2722					xbb->dev_name);
2723				free(xbb->dev_name, M_XENBLOCKBACK);
2724				xbb->dev_name = dev_name;
2725				goto again;
2726			}
2727		}
2728		xenbus_dev_fatal(xbb->dev, error, "error opening device %s",
2729				 xbb->dev_name);
2730		return (error);
2731	}
2732
2733	NDFREE(&nd, NDF_ONLY_PNBUF);
2734
2735	xbb->vn = nd.ni_vp;
2736
2737	/* We only support disks and files. */
2738	if (vn_isdisk(xbb->vn, &error)) {
2739		error = xbb_open_dev(xbb);
2740	} else if (xbb->vn->v_type == VREG) {
2741		error = xbb_open_file(xbb);
2742	} else {
2743		error = EINVAL;
2744		xenbus_dev_fatal(xbb->dev, error, "%s is not a disk "
2745				 "or file", xbb->dev_name);
2746	}
2747	VOP_UNLOCK(xbb->vn, 0);
2748
2749	if (error != 0) {
2750		xbb_close_backend(xbb);
2751		return (error);
2752	}
2753
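	/*
	 * fls(sector_size) - 1 is log2(sector_size) for the power-of-two
	 * sector sizes used here, yielding the byte-to-sector shift.
	 */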
2754	xbb->sector_size_shift = fls(xbb->sector_size) - 1;
2755	xbb->media_num_sectors = xbb->media_size >> xbb->sector_size_shift;
2756
2757	DPRINTF("opened %s=%s sector_size=%u media_size=%" PRId64 "\n",
2758		(xbb->device_type == XBB_TYPE_DISK) ? "dev" : "file",
2759		xbb->dev_name, xbb->sector_size, xbb->media_size);
2760
2761	return (0);
2762}
2763
2764/*------------------------ Inter-Domain Communication ------------------------*/
2765/**
2766 * Free dynamically allocated KVA or pseudo-physical address allocations.
2767 *
2768 * \param xbb  Per-instance xbb configuration structure.
2769 */
2770static void
2771xbb_free_communication_mem(struct xbb_softc *xbb)
2772{
2773	if (xbb->kva != 0) {
2774		if (xbb->pseudo_phys_res != NULL) {
2775			xenmem_free(xbb->dev, xbb->pseudo_phys_res_id,
2776			    xbb->pseudo_phys_res);
2777			xbb->pseudo_phys_res = NULL;
2778		}
2779	}
2780	xbb->kva = 0;
2781	xbb->gnt_base_addr = 0;
2782	if (xbb->kva_free != NULL) {
2783		free(xbb->kva_free, M_XENBLOCKBACK);
2784		xbb->kva_free = NULL;
2785	}
2786}
2787
2788/**
2789 * Cleanup all inter-domain communication mechanisms.
2790 *
2791 * \param xbb  Per-instance xbb configuration structure.
2792 */
2793static int
2794xbb_disconnect(struct xbb_softc *xbb)
2795{
2796	struct gnttab_unmap_grant_ref  ops[XBB_MAX_RING_PAGES];
2797	struct gnttab_unmap_grant_ref *op;
2798	u_int			       ring_idx;
2799	int			       error;
2800
2801	DPRINTF("\n");
2802
2803	if ((xbb->flags & XBBF_RING_CONNECTED) == 0)
2804		return (0);
2805
2806	mtx_unlock(&xbb->lock);
2807	xen_intr_unbind(&xbb->xen_intr_handle);
2808	taskqueue_drain(xbb->io_taskqueue, &xbb->io_task);
2809	mtx_lock(&xbb->lock);
2810
2811	/*
2812	 * No new interrupts can generate work, but we must wait
2813	 * for all currently active requests to drain.
2814	 */
2815	if (xbb->active_request_count != 0)
2816		return (EAGAIN);
2817
2818	for (ring_idx = 0, op = ops;
2819	     ring_idx < xbb->ring_config.ring_pages;
2820	     ring_idx++, op++) {
2821
2822		op->host_addr    = xbb->ring_config.gnt_addr
2823			         + (ring_idx * PAGE_SIZE);
2824		op->dev_bus_addr = xbb->ring_config.bus_addr[ring_idx];
2825		op->handle	 = xbb->ring_config.handle[ring_idx];
2826	}
2827
2828	error = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, ops,
2829					  xbb->ring_config.ring_pages);
2830	if (error != 0)
2831		panic("Grant table op failed (%d)", error);
2832
2833	xbb_free_communication_mem(xbb);
2834
2835	if (xbb->requests != NULL) {
2836		free(xbb->requests, M_XENBLOCKBACK);
2837		xbb->requests = NULL;
2838	}
2839
2840	if (xbb->request_lists != NULL) {
2841		struct xbb_xen_reqlist *reqlist;
2842		int i;
2843
2844		/* There is one request list for every allocated request. */
2845		for (i = 0, reqlist = xbb->request_lists;
2846		     i < xbb->max_requests; i++, reqlist++) {
2847#ifdef XBB_USE_BOUNCE_BUFFERS
2848			if (reqlist->bounce != NULL) {
2849				free(reqlist->bounce, M_XENBLOCKBACK);
2850				reqlist->bounce = NULL;
2851			}
2852#endif
2853			if (reqlist->gnt_handles != NULL) {
2854				free(reqlist->gnt_handles, M_XENBLOCKBACK);
2855				reqlist->gnt_handles = NULL;
2856			}
2857		}
2858		free(xbb->request_lists, M_XENBLOCKBACK);
2859		xbb->request_lists = NULL;
2860	}
2861
2862	xbb->flags &= ~XBBF_RING_CONNECTED;
2863	return (0);
2864}
2865
2866/**
2867 * Map shared memory ring into domain local address space, initialize
2868 * ring control structures, and bind an interrupt to the event channel
2869 * used to notify us of ring changes.
2870 *
2871 * \param xbb  Per-instance xbb configuration structure.
2872 */
2873static int
2874xbb_connect_ring(struct xbb_softc *xbb)
2875{
2876	struct gnttab_map_grant_ref  gnts[XBB_MAX_RING_PAGES];
2877	struct gnttab_map_grant_ref *gnt;
2878	u_int			     ring_idx;
2879	int			     error;
2880
2881	if ((xbb->flags & XBBF_RING_CONNECTED) != 0)
2882		return (0);
2883
2884	/*
2885	 * The KVA for our ring is at the tail of the region of KVA allocated
2886	 * by xbb_alloc_communication_mem().
2887	 */
2888	xbb->ring_config.va = xbb->kva
2889			    + (xbb->kva_size
2890			     - (xbb->ring_config.ring_pages * PAGE_SIZE));
2891	xbb->ring_config.gnt_addr = xbb->gnt_base_addr
2892				  + (xbb->kva_size
2893				   - (xbb->ring_config.ring_pages * PAGE_SIZE));
2894
2895	for (ring_idx = 0, gnt = gnts;
2896	     ring_idx < xbb->ring_config.ring_pages;
2897	     ring_idx++, gnt++) {
2898
2899		gnt->host_addr = xbb->ring_config.gnt_addr
2900			       + (ring_idx * PAGE_SIZE);
2901		gnt->flags     = GNTMAP_host_map;
2902		gnt->ref       = xbb->ring_config.ring_ref[ring_idx];
2903		gnt->dom       = xbb->otherend_id;
2904	}
2905
2906	error = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, gnts,
2907					  xbb->ring_config.ring_pages);
2908	if (error)
2909		panic("blkback: Ring page grant table op failed (%d)", error);
2910
2911	for (ring_idx = 0, gnt = gnts;
2912	     ring_idx < xbb->ring_config.ring_pages;
2913	     ring_idx++, gnt++) {
2914		if (gnt->status != 0) {
2915			struct gnttab_unmap_grant_ref unmap[XBB_MAX_RING_PAGES];
2916			unsigned int i, j;
2917
2918			xbb->ring_config.va = 0;
2919			xenbus_dev_fatal(xbb->dev, EACCES,
2920					 "Ring shared page mapping failed. "
2921					 "Status %d.", gnt->status);
2922
2923			/* Unmap everything to avoid leaking grant table maps */
2924			for (i = 0, j = 0; i < xbb->ring_config.ring_pages;
2925			    i++) {
2926				if (gnts[i].status != GNTST_okay)
2927					continue;
2928
2929				unmap[j].host_addr = gnts[i].host_addr;
2930				unmap[j].dev_bus_addr = gnts[i].dev_bus_addr;
2931				unmap[j++].handle = gnts[i].handle;
2932			}
2933			if (j != 0) {
2934				error = HYPERVISOR_grant_table_op(
2935				    GNTTABOP_unmap_grant_ref, unmap, j);
2936				if (error != 0)
2937					panic("Unable to unmap grants (%d)",
2938					    error);
2939			}
2940			return (EACCES);
2941		}
2942		xbb->ring_config.handle[ring_idx]   = gnt->handle;
2943		xbb->ring_config.bus_addr[ring_idx] = gnt->dev_bus_addr;
2944	}
2945
2946	/* Initialize the ring based on ABI. */
2947	switch (xbb->abi) {
2948	case BLKIF_PROTOCOL_NATIVE:
2949	{
2950		blkif_sring_t *sring;
2951		sring = (blkif_sring_t *)xbb->ring_config.va;
2952		BACK_RING_INIT(&xbb->rings.native, sring,
2953			       xbb->ring_config.ring_pages * PAGE_SIZE);
2954		break;
2955	}
2956	case BLKIF_PROTOCOL_X86_32:
2957	{
2958		blkif_x86_32_sring_t *sring_x86_32;
2959		sring_x86_32 = (blkif_x86_32_sring_t *)xbb->ring_config.va;
2960		BACK_RING_INIT(&xbb->rings.x86_32, sring_x86_32,
2961			       xbb->ring_config.ring_pages * PAGE_SIZE);
2962		break;
2963	}
2964	case BLKIF_PROTOCOL_X86_64:
2965	{
2966		blkif_x86_64_sring_t *sring_x86_64;
2967		sring_x86_64 = (blkif_x86_64_sring_t *)xbb->ring_config.va;
2968		BACK_RING_INIT(&xbb->rings.x86_64, sring_x86_64,
2969			       xbb->ring_config.ring_pages * PAGE_SIZE);
2970		break;
2971	}
2972	default:
2973		panic("Unexpected blkif protocol ABI.");
2974	}
2975
2976	xbb->flags |= XBBF_RING_CONNECTED;
2977
2978	error = xen_intr_bind_remote_port(xbb->dev,
2979					  xbb->otherend_id,
2980					  xbb->ring_config.evtchn,
2981					  xbb_filter,
2982					  /*ithread_handler*/NULL,
2983					  /*arg*/xbb,
2984					  INTR_TYPE_BIO | INTR_MPSAFE,
2985					  &xbb->xen_intr_handle);
2986	if (error) {
2987		(void)xbb_disconnect(xbb);
2988		xenbus_dev_fatal(xbb->dev, error, "binding event channel");
2989		return (error);
2990	}
2991
2992	DPRINTF("rings connected!\n");
2993
2994	return (0);
2995}
2996
2997/**
2998 * Size KVA and pseudo-physical address allocations based on negotiated
2999 * values for the size and number of I/O requests, and the size of our
3000 * communication ring.
3001 *
3002 * \param xbb  Per-instance xbb configuration structure.
3003 *
3004 * These address spaces are used to dynamically map pages in the
3005 * front-end's domain into our own.
3006 */
3007static int
3008xbb_alloc_communication_mem(struct xbb_softc *xbb)
3009{
3010	xbb->reqlist_kva_pages = xbb->max_requests * xbb->max_request_segments;
3011	xbb->reqlist_kva_size = xbb->reqlist_kva_pages * PAGE_SIZE;
3012	xbb->kva_size = xbb->reqlist_kva_size +
3013			(xbb->ring_config.ring_pages * PAGE_SIZE);
3014
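	/* One bit per reqlist KVA page, used to track free mapping space. */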
3015	xbb->kva_free = bit_alloc(xbb->reqlist_kva_pages, M_XENBLOCKBACK, M_NOWAIT);
3016	if (xbb->kva_free == NULL)
3017		return (ENOMEM);
3018
3019	DPRINTF("%s: kva_size = %d, reqlist_kva_size = %d\n",
3020		device_get_nameunit(xbb->dev), xbb->kva_size,
3021		xbb->reqlist_kva_size);
3022	/*
3023	 * Reserve a range of pseudo physical memory that we can map
3024	 * into kva.  These pages will only be backed by machine
3025	 * pages ("real memory") during the lifetime of front-end requests
3026	 * via grant table operations.
3027	 */
3028	xbb->pseudo_phys_res_id = 0;
3029	xbb->pseudo_phys_res = xenmem_alloc(xbb->dev, &xbb->pseudo_phys_res_id,
3030	    xbb->kva_size);
3031	if (xbb->pseudo_phys_res == NULL) {
3032		xbb->kva = 0;
3033		return (ENOMEM);
3034	}
3035	xbb->kva = (vm_offset_t)rman_get_virtual(xbb->pseudo_phys_res);
3036	xbb->gnt_base_addr = rman_get_start(xbb->pseudo_phys_res);
3037
3038	DPRINTF("%s: kva: %#jx, gnt_base_addr: %#jx\n",
3039		device_get_nameunit(xbb->dev), (uintmax_t)xbb->kva,
3040		(uintmax_t)xbb->gnt_base_addr);
3041	return (0);
3042}
3043
3044/**
3045 * Collect front-end information from the XenStore.
3046 *
3047 * \param xbb  Per-instance xbb configuration structure.
3048 */
3049static int
3050xbb_collect_frontend_info(struct xbb_softc *xbb)
3051{
3052	char	    protocol_abi[64];
3053	const char *otherend_path;
3054	int	    error;
3055	u_int	    ring_idx;
3056	u_int	    ring_page_order;
3057	size_t	    ring_size;
3058
3059	otherend_path = xenbus_get_otherend_path(xbb->dev);
3060
3061	/*
3062	 * Protocol defaults valid even if all negotiation fails.
3063	 */
3064	xbb->ring_config.ring_pages = 1;
3065	xbb->max_request_segments   = BLKIF_MAX_SEGMENTS_PER_REQUEST;
3066	xbb->max_request_size	    = xbb->max_request_segments * PAGE_SIZE;
3067
3068	/*
3069	 * Mandatory data (used in all versions of the protocol) first.
3070	 */
3071	error = xs_scanf(XST_NIL, otherend_path,
3072			 "event-channel", NULL, "%" PRIu32,
3073			 &xbb->ring_config.evtchn);
3074	if (error != 0) {
3075		xenbus_dev_fatal(xbb->dev, error,
3076				 "Unable to retrieve event-channel information "
3077				 "from frontend %s.  Unable to connect.",
3078				 xenbus_get_otherend_path(xbb->dev));
3079		return (error);
3080	}
3081
3082	/*
3083	 * These fields are initialized to legacy protocol defaults
3084	 * so we only need to fail if reading the updated value succeeds
3085	 * and the new value is outside of its allowed range.
3086	 *
3087	 * \note xs_gather() returns on the first encountered error, so
3088	 *       we must use independent calls in order to guarantee
3089	 *       we don't miss information in a sparsely populated front-end
3090	 *       tree.
3091	 *
3092	 * \note xs_scanf() does not update variables for unmatched
3093	 *       fields.
3094	 */
3095	ring_page_order = 0;
3096	xbb->max_requests = 32;
3097
3098	(void)xs_scanf(XST_NIL, otherend_path,
3099		       "ring-page-order", NULL, "%u",
3100		       &ring_page_order);
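	/* ring-page-order is the log2 of the number of shared ring pages. */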
3101	xbb->ring_config.ring_pages = 1 << ring_page_order;
3102	ring_size = PAGE_SIZE * xbb->ring_config.ring_pages;
3103	xbb->max_requests = BLKIF_MAX_RING_REQUESTS(ring_size);
3104
3105	if (xbb->ring_config.ring_pages > XBB_MAX_RING_PAGES) {
3106		xenbus_dev_fatal(xbb->dev, EINVAL,
3107				 "Front-end specified ring-pages of %u "
3108				 "exceeds backend limit of %u.  "
3109				 "Unable to connect.",
3110				 xbb->ring_config.ring_pages,
3111				 XBB_MAX_RING_PAGES);
3112		return (EINVAL);
3113	}
3114
3115	if (xbb->ring_config.ring_pages == 1) {
3116		error = xs_gather(XST_NIL, otherend_path,
3117				  "ring-ref", "%" PRIu32,
3118				  &xbb->ring_config.ring_ref[0],
3119				  NULL);
3120		if (error != 0) {
3121			xenbus_dev_fatal(xbb->dev, error,
3122					 "Unable to retrieve ring information "
3123					 "from frontend %s.  Unable to "
3124					 "connect.",
3125					 xenbus_get_otherend_path(xbb->dev));
3126			return (error);
3127		}
3128	} else {
3129		/* Multi-page ring format. */
3130		for (ring_idx = 0; ring_idx < xbb->ring_config.ring_pages;
3131		     ring_idx++) {
3132			char ring_ref_name[] = "ring_refXX";
3133
3134			snprintf(ring_ref_name, sizeof(ring_ref_name),
3135				 "ring-ref%u", ring_idx);
3136			error = xs_scanf(XST_NIL, otherend_path,
3137					 ring_ref_name, NULL, "%" PRIu32,
3138					 &xbb->ring_config.ring_ref[ring_idx]);
3139			if (error != 0) {
3140				xenbus_dev_fatal(xbb->dev, error,
3141						 "Failed to retrieve grant "
3142						 "reference for page %u of "
3143						 "shared ring.  Unable "
3144						 "to connect.", ring_idx);
3145				return (error);
3146			}
3147		}
3148	}
3149
3150	error = xs_gather(XST_NIL, otherend_path,
3151			  "protocol", "%63s", protocol_abi,
3152			  NULL);
3153	if (error != 0
3154	 || !strcmp(protocol_abi, XEN_IO_PROTO_ABI_NATIVE)) {
3155		/*
3156		 * Assume native if the frontend has not published
3157		 * ABI data, or if what it has published matches
3158		 * our own ABI.
3159		 */
3160		xbb->abi = BLKIF_PROTOCOL_NATIVE;
3161	} else if (!strcmp(protocol_abi, XEN_IO_PROTO_ABI_X86_32)) {
3162
3163		xbb->abi = BLKIF_PROTOCOL_X86_32;
3164	} else if (!strcmp(protocol_abi, XEN_IO_PROTO_ABI_X86_64)) {
3165
3166		xbb->abi = BLKIF_PROTOCOL_X86_64;
3167	} else {
3168
3169		xenbus_dev_fatal(xbb->dev, EINVAL,
3170				 "Unknown protocol ABI (%s) published by "
3171				 "frontend.  Unable to connect.", protocol_abi);
3172		return (EINVAL);
3173	}
3174	return (0);
3175}
3176
3177/**
3178 * Allocate per-request data structures given request size and number
3179 * information negotiated with the front-end.
3180 *
3181 * \param xbb  Per-instance xbb configuration structure.
3182 */
3183static int
3184xbb_alloc_requests(struct xbb_softc *xbb)
3185{
3186	struct xbb_xen_req *req;
3187	struct xbb_xen_req *last_req;
3188
3189	/*
3190	 * Allocate request bookkeeping data structures.
3191	 */
3192	xbb->requests = malloc(xbb->max_requests * sizeof(*xbb->requests),
3193			       M_XENBLOCKBACK, M_NOWAIT|M_ZERO);
3194	if (xbb->requests == NULL) {
3195		xenbus_dev_fatal(xbb->dev, ENOMEM,
3196				  "Unable to allocate request structures");
3197		return (ENOMEM);
3198	}
3199
3200	req      = xbb->requests;
3201	last_req = &xbb->requests[xbb->max_requests - 1];
3202	STAILQ_INIT(&xbb->request_free_stailq);
3203	while (req <= last_req) {
3204		STAILQ_INSERT_TAIL(&xbb->request_free_stailq, req, links);
3205		req++;
3206	}
3207	return (0);
3208}
3209
3210static int
3211xbb_alloc_request_lists(struct xbb_softc *xbb)
3212{
3213	struct xbb_xen_reqlist *reqlist;
3214	int			i;
3215
3216	/*
3217	 * If no requests can be merged, we need 1 request list per
3218	 * in-flight request.
3219	 */
3220	xbb->request_lists = malloc(xbb->max_requests *
3221		sizeof(*xbb->request_lists), M_XENBLOCKBACK, M_NOWAIT|M_ZERO);
3222	if (xbb->request_lists == NULL) {
3223		xenbus_dev_fatal(xbb->dev, ENOMEM,
3224				  "Unable to allocate request list structures");
3225		return (ENOMEM);
3226	}
3227
3228	STAILQ_INIT(&xbb->reqlist_free_stailq);
3229	STAILQ_INIT(&xbb->reqlist_pending_stailq);
3230	for (i = 0; i < xbb->max_requests; i++) {
3231		int seg;
3232
3233		reqlist      = &xbb->request_lists[i];
3234
3235		reqlist->xbb = xbb;
3236
3237#ifdef XBB_USE_BOUNCE_BUFFERS
3238		reqlist->bounce = malloc(xbb->max_reqlist_size,
3239					 M_XENBLOCKBACK, M_NOWAIT);
3240		if (reqlist->bounce == NULL) {
3241			xenbus_dev_fatal(xbb->dev, ENOMEM,
3242					 "Unable to allocate request "
3243					 "bounce buffers");
3244			return (ENOMEM);
3245		}
3246#endif /* XBB_USE_BOUNCE_BUFFERS */
3247
3248		reqlist->gnt_handles = malloc(xbb->max_reqlist_segments *
3249					      sizeof(*reqlist->gnt_handles),
3250					      M_XENBLOCKBACK, M_NOWAIT|M_ZERO);
3251		if (reqlist->gnt_handles == NULL) {
3252			xenbus_dev_fatal(xbb->dev, ENOMEM,
3253					  "Unable to allocate request "
3254					  "grant references");
3255			return (ENOMEM);
3256		}
3257
3258		for (seg = 0; seg < xbb->max_reqlist_segments; seg++)
3259			reqlist->gnt_handles[seg] = GRANT_REF_INVALID;
3260
3261		STAILQ_INSERT_TAIL(&xbb->reqlist_free_stailq, reqlist, links);
3262	}
3263	return (0);
3264}
3265
3266/**
3267 * Supply information about the physical device to the frontend
3268 * via XenBus.
3269 *
3270 * \param xbb  Per-instance xbb configuration structure.
3271 */
3272static int
3273xbb_publish_backend_info(struct xbb_softc *xbb)
3274{
3275	struct xs_transaction xst;
3276	const char	     *our_path;
3277	const char	     *leaf;
3278	int		      error;
3279
3280	our_path = xenbus_get_node(xbb->dev);
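	/*
	 * Retry the transaction until it commits, or fail permanently
	 * on any error other than EAGAIN.
	 */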
3281	while (1) {
3282		error = xs_transaction_start(&xst);
3283		if (error != 0) {
3284			xenbus_dev_fatal(xbb->dev, error,
3285					 "Error publishing backend info "
3286					 "(start transaction)");
3287			return (error);
3288		}
3289
3290		leaf = "sectors";
3291		error = xs_printf(xst, our_path, leaf,
3292				  "%"PRIu64, xbb->media_num_sectors);
3293		if (error != 0)
3294			break;
3295
3296		/* XXX Support all VBD attributes here. */
3297		leaf = "info";
3298		error = xs_printf(xst, our_path, leaf, "%u",
3299				  xbb->flags & XBBF_READ_ONLY
3300				? VDISK_READONLY : 0);
3301		if (error != 0)
3302			break;
3303
3304		leaf = "sector-size";
3305		error = xs_printf(xst, our_path, leaf, "%u",
3306				  xbb->sector_size);
3307		if (error != 0)
3308			break;
3309
3310		error = xs_transaction_end(xst, 0);
3311		if (error == 0) {
3312			return (0);
3313		} else if (error != EAGAIN) {
3314			xenbus_dev_fatal(xbb->dev, error, "ending transaction");
3315			return (error);
3316		}
3317	}
3318
3319	xenbus_dev_fatal(xbb->dev, error, "writing %s/%s",
3320			our_path, leaf);
3321	xs_transaction_end(xst, 1);
3322	return (error);
3323}
3324
3325/**
3326 * Connect to our blkfront peer now that it has completed publishing
3327 * its configuration into the XenStore.
3328 *
3329 * \param xbb  Per-instance xbb configuration structure.
3330 */
3331static void
3332xbb_connect(struct xbb_softc *xbb)
3333{
3334	int error;
3335
3336	if (!xbb->hotplug_done ||
3337	    (xenbus_get_state(xbb->dev) != XenbusStateInitWait) ||
3338	    (xbb_collect_frontend_info(xbb) != 0))
3339		return;
3340
3341	xbb->flags &= ~XBBF_SHUTDOWN;
3342
3343	/*
3344	 * We limit the maximum number of reqlist segments to the maximum
3345	 * number of segments in the ring, or our absolute maximum,
3346	 * whichever is smaller.
3347	 */
3348	xbb->max_reqlist_segments = MIN(xbb->max_request_segments *
3349		xbb->max_requests, XBB_MAX_SEGMENTS_PER_REQLIST);
3350
3351	/*
3352	 * The maximum size is simply a function of the number of segments
3353	 * we can handle.
3354	 */
3355	xbb->max_reqlist_size = xbb->max_reqlist_segments * PAGE_SIZE;
3356
3357	/* Allocate resources whose size depends on front-end configuration. */
3358	error = xbb_alloc_communication_mem(xbb);
3359	if (error != 0) {
3360		xenbus_dev_fatal(xbb->dev, error,
3361				 "Unable to allocate communication memory");
3362		return;
3363	}
3364
3365	error = xbb_alloc_requests(xbb);
3366	if (error != 0) {
3367		/* Specific errors are reported by xbb_alloc_requests(). */
3368		return;
3369	}
3370
3371	error = xbb_alloc_request_lists(xbb);
3372	if (error != 0) {
3373		/* Specific errors are reported by xbb_alloc_request_lists(). */
3374		return;
3375	}
3376
3377	/*
3378	 * Connect communication channel.
3379	 */
3380	error = xbb_connect_ring(xbb);
3381	if (error != 0) {
3382		/* Specific errors are reported by xbb_connect_ring(). */
3383		return;
3384	}
3385
3386	if (xbb_publish_backend_info(xbb) != 0) {
3387		/*
3388		 * If we can't publish our data, we cannot participate
3389		 * in this connection, and waiting for a front-end state
3390		 * change will not help the situation.
3391		 */
3392		(void)xbb_disconnect(xbb);
3393		return;
3394	}
3395
3396	/* Ready for I/O. */
3397	xenbus_set_state(xbb->dev, XenbusStateConnected);
3398}
3399
3400/*-------------------------- Device Teardown Support -------------------------*/
3401/**
3402 * Perform device shutdown functions.
3403 *
3404 * \param xbb  Per-instance xbb configuration structure.
3405 *
3406 * Mark this instance as shutting down, wait for any active I/O on the
3407 * backend device/file to drain, disconnect from the front-end, and notify
3408 * any waiters (e.g. a thread invoking our detach method) that detach can
3409 * now proceed.
3410 */
3411static int
3412xbb_shutdown(struct xbb_softc *xbb)
3413{
3414	XenbusState frontState;
3415	int	    error;
3416
3417	DPRINTF("\n");
3418
3419	/*
3420	 * Due to the need to drop our mutex during some
3421	 * xenbus operations, it is possible for two threads
3422	 * to attempt to close out shutdown processing at
3423	 * the same time.  Tell the caller that hits this
3424	 * race to try back later.
3425	 */
3426	if ((xbb->flags & XBBF_IN_SHUTDOWN) != 0)
3427		return (EAGAIN);
3428
3429	xbb->flags |= XBBF_IN_SHUTDOWN;
3430	mtx_unlock(&xbb->lock);
3431
3432	if (xbb->hotplug_watch.node != NULL) {
3433		xs_unregister_watch(&xbb->hotplug_watch);
3434		free(xbb->hotplug_watch.node, M_XENBLOCKBACK);
3435		xbb->hotplug_watch.node = NULL;
3436	}
3437	xbb->hotplug_done = false;
3438
3439	if (xenbus_get_state(xbb->dev) < XenbusStateClosing)
3440		xenbus_set_state(xbb->dev, XenbusStateClosing);
3441
3442	frontState = xenbus_get_otherend_state(xbb->dev);
3443	mtx_lock(&xbb->lock);
3444	xbb->flags &= ~XBBF_IN_SHUTDOWN;
3445
3446	/* Wait for the frontend to disconnect (if it's connected). */
3447	if (frontState == XenbusStateConnected)
3448		return (EAGAIN);
3449
3450	DPRINTF("\n");
3451
3452	/* Indicate shutdown is in progress. */
3453	xbb->flags |= XBBF_SHUTDOWN;
3454
3455	/* Disconnect from the front-end. */
3456	error = xbb_disconnect(xbb);
3457	if (error != 0) {
3458		/*
3459		 * Requests still outstanding.  We'll be called again
3460		 * once they complete.
3461		 */
3462		KASSERT(error == EAGAIN,
3463			("%s: Unexpected xbb_disconnect() failure %d",
3464			 __func__, error));
3465
3466		return (error);
3467	}
3468
3469	DPRINTF("\n");
3470
3471	/* Indicate to xbb_detach() that it is safe to proceed. */
3472	wakeup(xbb);
3473
3474	return (0);
3475}
3476
3477/**
3478 * Report an attach time error to the console and Xen, and cleanup
3479 * this instance by forcing immediate detach processing.
3480 *
3481 * \param xbb  Per-instance xbb configuration structure.
3482 * \param err  Errno describing the error.
3483 * \param fmt  Printf style format and arguments
3484 */
3485static void
3486xbb_attach_failed(struct xbb_softc *xbb, int err, const char *fmt, ...)
3487{
3488	va_list ap;
3489	va_list ap_hotplug;
3490
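	/*
	 * The argument list is consumed twice, once for the XenStore
	 * hotplug-error node and once for the fatal error report, so
	 * a copy is required.
	 */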
3491	va_start(ap, fmt);
3492	va_copy(ap_hotplug, ap);
3493	xs_vprintf(XST_NIL, xenbus_get_node(xbb->dev),
3494		  "hotplug-error", fmt, ap_hotplug);
3495	va_end(ap_hotplug);
3496	xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
3497		  "hotplug-status", "error");
3498
3499	xenbus_dev_vfatal(xbb->dev, err, fmt, ap);
3500	va_end(ap);
3501
3502	xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
3503		  "online", "0");
3504	mtx_lock(&xbb->lock);
3505	xbb_shutdown(xbb);
3506	mtx_unlock(&xbb->lock);
3507}
3508
3509/*---------------------------- NewBus Entrypoints ----------------------------*/
3510/**
3511 * Inspect a XenBus device and claim it if is of the appropriate type.
3512 *
3513 * \param dev  NewBus device object representing a candidate XenBus device.
3514 *
3515 * \return  0 for success, errno codes for failure.
3516 */
3517static int
3518xbb_probe(device_t dev)
3519{
3520
3521	if (!strcmp(xenbus_get_type(dev), "vbd")) {
3522		device_set_desc(dev, "Backend Virtual Block Device");
3523		device_quiet(dev);
3524		return (0);
3525	}
3526
3527	return (ENXIO);
3528}
3529
3530/**
3531 * Setup sysctl variables to control various Block Back parameters.
3532 *
3533 * \param xbb  Xen Block Back softc.
3534 *
3535 */
3536static void
3537xbb_setup_sysctl(struct xbb_softc *xbb)
3538{
3539	struct sysctl_ctx_list *sysctl_ctx = NULL;
3540	struct sysctl_oid      *sysctl_tree = NULL;
3541
3542	sysctl_ctx = device_get_sysctl_ctx(xbb->dev);
3543	if (sysctl_ctx == NULL)
3544		return;
3545
3546	sysctl_tree = device_get_sysctl_tree(xbb->dev);
3547	if (sysctl_tree == NULL)
3548		return;
3549
3550	SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3551		       "disable_flush", CTLFLAG_RW, &xbb->disable_flush, 0,
3552		       "fake the flush command");
3553
3554	SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3555		       "flush_interval", CTLFLAG_RW, &xbb->flush_interval, 0,
3556		       "send a real flush for N flush requests");
3557
3558	SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3559		       "no_coalesce_reqs", CTLFLAG_RW, &xbb->no_coalesce_reqs, 0,
3560		       "Don't coalesce contiguous requests");
3561
3562	SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3563			 "reqs_received", CTLFLAG_RW, &xbb->reqs_received,
3564			 "how many I/O requests we have received");
3565
3566	SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3567			 "reqs_completed", CTLFLAG_RW, &xbb->reqs_completed,
3568			 "how many I/O requests have been completed");
3569
3570	SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3571			 "reqs_queued_for_completion", CTLFLAG_RW,
3572			 &xbb->reqs_queued_for_completion,
3573			 "how many I/O requests queued but not yet pushed");
3574
3575	SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3576			 "reqs_completed_with_error", CTLFLAG_RW,
3577			 &xbb->reqs_completed_with_error,
3578			 "how many I/O requests completed with error status");
3579
3580	SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3581			 "forced_dispatch", CTLFLAG_RW, &xbb->forced_dispatch,
3582			 "how many I/O dispatches were forced");
3583
3584	SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3585			 "normal_dispatch", CTLFLAG_RW, &xbb->normal_dispatch,
3586			 "how many I/O dispatches were normal");
3587
3588	SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3589			 "total_dispatch", CTLFLAG_RW, &xbb->total_dispatch,
3590			 "total number of I/O dispatches");
3591
3592	SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3593			 "kva_shortages", CTLFLAG_RW, &xbb->kva_shortages,
3594			 "how many times we have run out of KVA");
3595
3596	SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3597			 "request_shortages", CTLFLAG_RW,
3598			 &xbb->request_shortages,
3599			 "how many times we have run out of requests");
3600
3601	SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3602		        "max_requests", CTLFLAG_RD, &xbb->max_requests, 0,
3603		        "maximum outstanding requests (negotiated)");
3604
3605	SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3606		        "max_request_segments", CTLFLAG_RD,
3607		        &xbb->max_request_segments, 0,
3608		        "maximum number of pages per request (negotiated)");
3609
3610	SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3611		        "max_request_size", CTLFLAG_RD,
3612		        &xbb->max_request_size, 0,
3613		        "maximum size in bytes of a request (negotiated)");
3614
3615	SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3616		        "ring_pages", CTLFLAG_RD,
3617		        &xbb->ring_config.ring_pages, 0,
3618		        "communication channel pages (negotiated)");
3619}
3620
3621static void
3622xbb_attach_disk(struct xs_watch *watch, const char **vec, unsigned int len)
3623{
3624	device_t		 dev;
3625	struct xbb_softc	*xbb;
3626	int			 error;
3627
3628	dev = (device_t) watch->callback_data;
3629	xbb = device_get_softc(dev);
3630
3631	error = xs_gather(XST_NIL, xenbus_get_node(dev), "physical-device-path",
3632	    NULL, &xbb->dev_name, NULL);
3633	if (error != 0)
3634		return;
3635
3636	xs_unregister_watch(watch);
3637	free(watch->node, M_XENBLOCKBACK);
3638	watch->node = NULL;
3639
3640	/* Collect physical device information. */
3641	error = xs_gather(XST_NIL, xenbus_get_otherend_path(xbb->dev),
3642			  "device-type", NULL, &xbb->dev_type,
3643			  NULL);
3644	if (error != 0)
3645		xbb->dev_type = NULL;
3646
3647	error = xs_gather(XST_NIL, xenbus_get_node(dev),
3648			  "mode", NULL, &xbb->dev_mode,
3649			  NULL);
3650	if (error != 0) {
3651		xbb_attach_failed(xbb, error, "reading backend fields at %s",
3652				  xenbus_get_node(dev));
3653		return;
3654	}
3655
3656	/* Parse fopen style mode flags. */
3657	if (strchr(xbb->dev_mode, 'w') == NULL)
3658		xbb->flags |= XBBF_READ_ONLY;
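
	/*
	 * Illustrative example (the exact strings are toolstack-dependent and
	 * assumed here): the toolstack typically writes a mode such as "r" or
	 * "w" to the backend's "mode" node, so
	 *
	 *	mode = "r"	-> XBBF_READ_ONLY set, writes will be refused
	 *	mode = "w"	-> read/write access permitted
	 *
	 * Only the presence of 'w' matters to the check above.
	 */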

	/*
	 * Verify the physical device is present and can support
	 * the desired I/O mode.
	 */
	error = xbb_open_backend(xbb);
	if (error != 0) {
		xbb_attach_failed(xbb, error, "Unable to open %s",
				  xbb->dev_name);
		return;
	}

	/* Use devstat(9) for recording statistics. */
	xbb->xbb_stats = devstat_new_entry("xbb", device_get_unit(xbb->dev),
					   xbb->sector_size,
					   DEVSTAT_ALL_SUPPORTED,
					   DEVSTAT_TYPE_DIRECT
					 | DEVSTAT_TYPE_IF_OTHER,
					   DEVSTAT_PRIORITY_OTHER);

	xbb->xbb_stats_in = devstat_new_entry("xbbi", device_get_unit(xbb->dev),
					      xbb->sector_size,
					      DEVSTAT_ALL_SUPPORTED,
					      DEVSTAT_TYPE_DIRECT
					    | DEVSTAT_TYPE_IF_OTHER,
					      DEVSTAT_PRIORITY_OTHER);
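
	/*
	 * Illustrative note (assumption about devstat consumers): the "xbb"
	 * and "xbbi" entries registered above are exported through devstat(9),
	 * so per-backend throughput can typically be observed from userland
	 * with tools such as iostat(8) or systat(1), e.g. "iostat -x xbb0".
	 */
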
	/*
	 * Set up sysctl variables.
	 */
	xbb_setup_sysctl(xbb);

	/*
	 * Create a taskqueue for doing work that must occur from a
	 * thread context.
	 */
	xbb->io_taskqueue = taskqueue_create_fast(device_get_nameunit(dev),
						  M_NOWAIT,
						  taskqueue_thread_enqueue,
						  /*context*/&xbb->io_taskqueue);
	if (xbb->io_taskqueue == NULL) {
		xbb_attach_failed(xbb, ENOMEM, "Unable to create taskqueue");
		return;
	}

	taskqueue_start_threads(&xbb->io_taskqueue,
				/*num threads*/1,
				/*priority*/PWAIT,
				/*thread name*/
				"%s taskq", device_get_nameunit(dev));
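
	/*
	 * Illustrative sketch (the exact call site is elsewhere in this
	 * driver and is assumed here): work is deferred onto this single
	 * thread from interrupt context roughly as
	 *
	 *	taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task);
	 *
	 * which causes xbb_run_queue(), registered via TASK_INIT() in
	 * xbb_attach() below, to run in the "%s taskq" thread started above.
	 */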

	/* Update hot-plug status to satisfy xend. */
	error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
			  "hotplug-status", "connected");
	if (error) {
		xbb_attach_failed(xbb, error, "writing %s/hotplug-status",
				  xenbus_get_node(xbb->dev));
		return;
	}

	xbb->hotplug_done = true;

	/* The front end might be waiting for the backend; connect if so. */
	if (xenbus_get_otherend_state(xbb->dev) == XenbusStateInitialised)
		xbb_connect(xbb);
}

/**
 * Attach to a XenBus device that has been claimed by our probe routine.
 *
 * \param dev  NewBus device object representing this Xen Block Back instance.
 *
 * \return  0 for success, errno codes for failure.
 */
static int
xbb_attach(device_t dev)
{
	struct xbb_softc	*xbb;
	int			 error;
	u_int			 max_ring_page_order;
	struct sbuf		*watch_path;

	DPRINTF("Attaching to %s\n", xenbus_get_node(dev));

	/*
	 * Basic initialization.
	 * After this block it is safe to call xbb_detach()
	 * to clean up any allocated data for this instance.
	 */
	xbb = device_get_softc(dev);
	xbb->dev = dev;
	xbb->otherend_id = xenbus_get_otherend_id(dev);
	TASK_INIT(&xbb->io_task, /*priority*/0, xbb_run_queue, xbb);
	mtx_init(&xbb->lock, device_get_nameunit(dev), NULL, MTX_DEF);

	/*
	 * Publish protocol capabilities for consumption by the
	 * front-end.
	 */
	error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
			  "feature-barrier", "1");
	if (error) {
		xbb_attach_failed(xbb, error, "writing %s/feature-barrier",
				  xenbus_get_node(xbb->dev));
		return (error);
	}

	error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
			  "feature-flush-cache", "1");
	if (error) {
		xbb_attach_failed(xbb, error, "writing %s/feature-flush-cache",
				  xenbus_get_node(xbb->dev));
		return (error);
	}

	max_ring_page_order = flsl(XBB_MAX_RING_PAGES) - 1;
	error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
			  "max-ring-page-order", "%u", max_ring_page_order);
	if (error) {
		xbb_attach_failed(xbb, error, "writing %s/max-ring-page-order",
				  xenbus_get_node(xbb->dev));
		return (error);
	}
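
	/*
	 * Illustrative example (assumed layout): after the three writes
	 * above, the backend's XenStore directory contains roughly
	 *
	 *	<backend-node>/feature-barrier     = "1"
	 *	<backend-node>/feature-flush-cache = "1"
	 *	<backend-node>/max-ring-page-order = "5"
	 *
	 * where "5" follows from XBB_MAX_RING_PAGES == 32 (flsl(32) - 1) and
	 * <backend-node> is whatever xenbus_get_node(xbb->dev) returns for
	 * this device.  The front-end reads these nodes when sizing its ring
	 * and choosing which features to request.
	 */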

	/*
	 * We need to wait for hotplug script execution before
	 * moving forward.
	 */
	KASSERT(!xbb->hotplug_done, ("Hotplug scripts already executed"));
	watch_path = xs_join(xenbus_get_node(xbb->dev), "physical-device-path");
	xbb->hotplug_watch.callback_data = (uintptr_t)dev;
	xbb->hotplug_watch.callback = xbb_attach_disk;
	KASSERT(xbb->hotplug_watch.node == NULL, ("watch node already setup"));
	xbb->hotplug_watch.node = strdup(sbuf_data(watch_path), M_XENBLOCKBACK);
	/*
	 * We do not care which path was updated, only that the value of this
	 * single node has changed, so there is no need to queue more than one
	 * event.
	 */
	xbb->hotplug_watch.max_pending = 1;
	sbuf_delete(watch_path);
	error = xs_register_watch(&xbb->hotplug_watch);
	if (error != 0) {
		xbb_attach_failed(xbb, error, "failed to create watch on %s",
		    xbb->hotplug_watch.node);
		free(xbb->hotplug_watch.node, M_XENBLOCKBACK);
		xbb->hotplug_watch.node = NULL;
		return (error);
	}
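
	/*
	 * Once the hotplug script writes the "physical-device-path" node,
	 * XenStore fires this watch and xbb_attach_disk() above finishes
	 * bringing up the backing store, devstat entries, sysctls, and the
	 * I/O taskqueue.
	 */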

	/* Tell the toolstack blkback has attached. */
	xenbus_set_state(dev, XenbusStateInitWait);

	return (0);
}

/**
 * Detach from a block back device instance.
 *
 * \param dev  NewBus device object representing this Xen Block Back instance.
 *
 * \return  0 for success, errno codes for failure.
 *
 * \note A block back device may be detached at any time in its life-cycle,
 *       including part way through the attach process.  For this reason,
 *       initialization order and the initialization state checks in this
 *       routine must be carefully coupled so that attach time failures
 *       are gracefully handled.
 */
static int
xbb_detach(device_t dev)
{
	struct xbb_softc *xbb;

	DPRINTF("\n");

	xbb = device_get_softc(dev);
	mtx_lock(&xbb->lock);
	while (xbb_shutdown(xbb) == EAGAIN) {
		msleep(xbb, &xbb->lock, /*wakeup prio unchanged*/0,
		       "xbb_shutdown", 0);
	}
	mtx_unlock(&xbb->lock);

	DPRINTF("\n");

	if (xbb->io_taskqueue != NULL)
		taskqueue_free(xbb->io_taskqueue);

	if (xbb->xbb_stats != NULL)
		devstat_remove_entry(xbb->xbb_stats);

	if (xbb->xbb_stats_in != NULL)
		devstat_remove_entry(xbb->xbb_stats_in);

	xbb_close_backend(xbb);

	if (xbb->dev_mode != NULL) {
		free(xbb->dev_mode, M_XENSTORE);
		xbb->dev_mode = NULL;
	}

	if (xbb->dev_type != NULL) {
		free(xbb->dev_type, M_XENSTORE);
		xbb->dev_type = NULL;
	}

	if (xbb->dev_name != NULL) {
		free(xbb->dev_name, M_XENSTORE);
		xbb->dev_name = NULL;
	}

	mtx_destroy(&xbb->lock);
	return (0);
}

/**
 * Prepare this block back device for suspension of this VM.
 *
 * \param dev  NewBus device object representing this Xen Block Back instance.
 *
 * \return  0 for success, errno codes for failure.
 */
static int
xbb_suspend(device_t dev)
{
#ifdef NOT_YET
	struct xbb_softc *sc = device_get_softc(dev);

	/* Prevent new requests being issued until we fix things up. */
	mtx_lock(&sc->xb_io_lock);
	sc->connected = BLKIF_STATE_SUSPENDED;
	mtx_unlock(&sc->xb_io_lock);
#endif

	return (0);
}

/**
 * Perform any processing required to recover from a suspended state.
 *
 * \param dev  NewBus device object representing this Xen Block Back instance.
 *
 * \return  0 for success, errno codes for failure.
 */
static int
xbb_resume(device_t dev)
{
	return (0);
}

/**
 * Handle state changes expressed via the XenStore by our front-end peer.
 *
 * \param dev             NewBus device object representing this Xen
 *                        Block Back instance.
 * \param frontend_state  The new state of the front-end.
 */
static void
xbb_frontend_changed(device_t dev, XenbusState frontend_state)
{
	struct xbb_softc *xbb = device_get_softc(dev);

	DPRINTF("frontend_state=%s, xbb_state=%s\n",
		xenbus_strstate(frontend_state),
		xenbus_strstate(xenbus_get_state(xbb->dev)));

	switch (frontend_state) {
	case XenbusStateInitialising:
		break;
	case XenbusStateInitialised:
	case XenbusStateConnected:
		xbb_connect(xbb);
		break;
	case XenbusStateClosing:
	case XenbusStateClosed:
		mtx_lock(&xbb->lock);
		xbb_shutdown(xbb);
		mtx_unlock(&xbb->lock);
		if (frontend_state == XenbusStateClosed)
			xenbus_set_state(xbb->dev, XenbusStateClosed);
		break;
	default:
		xenbus_dev_fatal(xbb->dev, EINVAL, "saw state %d at frontend",
				 frontend_state);
		break;
	}
}
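
/*
 * Typical handshake, for reference: xbb_attach() publishes
 * XenbusStateInitWait; the front-end then advances to Initialised or
 * Connected, which causes xbb_frontend_changed() (or xbb_attach_disk() once
 * the hotplug path is ready) to call xbb_connect(); a front-end transition
 * to Closing or Closed tears the connection down via xbb_shutdown().
 */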

/*---------------------------- NewBus Registration ---------------------------*/
static device_method_t xbb_methods[] = {
	/* Device interface */
	DEVMETHOD(device_probe,		xbb_probe),
	DEVMETHOD(device_attach,	xbb_attach),
	DEVMETHOD(device_detach,	xbb_detach),
	DEVMETHOD(device_shutdown,	bus_generic_shutdown),
	DEVMETHOD(device_suspend,	xbb_suspend),
	DEVMETHOD(device_resume,	xbb_resume),

	/* Xenbus interface */
	DEVMETHOD(xenbus_otherend_changed, xbb_frontend_changed),

	{ 0, 0 }
};

static driver_t xbb_driver = {
	"xbbd",
	xbb_methods,
	sizeof(struct xbb_softc),
};
devclass_t xbb_devclass;

DRIVER_MODULE(xbbd, xenbusb_back, xbb_driver, xbb_devclass, 0, 0);