blkfront.c revision 199734
1/*-
2 * All rights reserved.
3 *
4 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
5 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
6 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
7 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
8 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
9 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
10 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
11 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
12 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
13 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
14 * SUCH DAMAGE.
15 *
16 */
17
18/*
19 * XenBSD block device driver
20 *
21 * Copyright (c) 2009 Frank Suchomel, Citrix
22 */
23
24#include <sys/cdefs.h>
25__FBSDID("$FreeBSD: head/sys/dev/xen/blkfront/blkfront.c 199734 2009-11-24 07:17:51Z kmacy $");
26
27#include <sys/param.h>
28#include <sys/systm.h>
29#include <sys/malloc.h>
30#include <sys/kernel.h>
31#include <vm/vm.h>
32#include <vm/pmap.h>
33
34#include <sys/bio.h>
35#include <sys/bus.h>
36#include <sys/conf.h>
37#include <sys/module.h>
38
39#include <machine/bus.h>
40#include <sys/rman.h>
41#include <machine/resource.h>
42#include <machine/intr_machdep.h>
43#include <machine/vmparam.h>
44
45#include <machine/xen/xen-os.h>
46#include <machine/xen/xenfunc.h>
47#include <xen/hypervisor.h>
48#include <xen/xen_intr.h>
49#include <xen/evtchn.h>
50#include <xen/gnttab.h>
51#include <xen/interface/grant_table.h>
52#include <xen/interface/io/protocols.h>
53#include <xen/xenbus/xenbusvar.h>
54
55#include <geom/geom_disk.h>
56
57#include <dev/xen/blkfront/block.h>
58
59#include "xenbus_if.h"
60
61#define    ASSERT(S)       KASSERT(S, (#S))
62/* prototypes */
63struct xb_softc;
64static void xb_startio(struct xb_softc *sc);
65static void connect(device_t, struct blkfront_info *);
66static void blkfront_closing(device_t);
67static int blkfront_detach(device_t);
68static int talk_to_backend(device_t, struct blkfront_info *);
69static int setup_blkring(device_t, struct blkfront_info *);
70static void blkif_int(void *);
71#if 0
72static void blkif_restart_queue(void *arg);
73#endif
74static void blkif_recover(struct blkfront_info *);
75static void blkif_completion(struct blk_shadow *);
76static void blkif_free(struct blkfront_info *, int);
77
78#define GRANT_INVALID_REF 0
79#define BLK_RING_SIZE __RING_SIZE((blkif_sring_t *)0, PAGE_SIZE)
80
81LIST_HEAD(xb_softc_list_head, xb_softc) xbsl_head;
82
83/* Control whether runtime update of vbds is enabled. */
84#define ENABLE_VBD_UPDATE 0
85
86#if ENABLE_VBD_UPDATE
87static void vbd_update(void);
88#endif
89
90
91#define BLKIF_STATE_DISCONNECTED 0
92#define BLKIF_STATE_CONNECTED    1
93#define BLKIF_STATE_SUSPENDED    2
94
95#ifdef notyet
96static char *blkif_state_name[] = {
97	[BLKIF_STATE_DISCONNECTED] = "disconnected",
98	[BLKIF_STATE_CONNECTED]    = "connected",
99	[BLKIF_STATE_SUSPENDED]    = "closed",
100};
101
102static char * blkif_status_name[] = {
103	[BLKIF_INTERFACE_STATUS_CLOSED]       = "closed",
104	[BLKIF_INTERFACE_STATUS_DISCONNECTED] = "disconnected",
105	[BLKIF_INTERFACE_STATUS_CONNECTED]    = "connected",
106	[BLKIF_INTERFACE_STATUS_CHANGED]      = "changed",
107};
108#endif
109#define WPRINTK(fmt, args...) printf("[XEN] " fmt, ##args)
110#if 0
111#define DPRINTK(fmt, args...) printf("[XEN] %s:%d: " fmt ".\n", __func__, __LINE__, ##args)
112#else
113#define DPRINTK(fmt, args...)
114#endif
115
116static grant_ref_t gref_head;
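/*
 * The shared ring is a single page, so at most BLK_RING_SIZE requests, each
 * carrying up to BLKIF_MAX_SEGMENTS_PER_REQUEST segments, can be outstanding
 * at once.
 */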
117#define MAXIMUM_OUTSTANDING_BLOCK_REQS \
118    (BLKIF_MAX_SEGMENTS_PER_REQUEST * BLK_RING_SIZE)
119
120static void kick_pending_request_queues(struct blkfront_info *);
121static int blkif_open(struct disk *dp);
122static int blkif_close(struct disk *dp);
123static int blkif_ioctl(struct disk *dp, u_long cmd, void *addr, int flag, struct thread *td);
124static int blkif_queue_request(struct bio *bp);
125static void xb_strategy(struct bio *bp);
126
127// In order to quiesce the device during kernel dumps, outstanding requests to
128// DOM0 for disk reads/writes need to be accounted for.
129static	int	blkif_queued_requests;
130static	int	xb_dump(void *, void *, vm_offset_t, off_t, size_t);
131
132
133/* XXX move to xb_vbd.c when VBD update support is added */
134#define MAX_VBDS 64
135
136#define XBD_SECTOR_SIZE		512	/* XXX: assume for now */
137#define XBD_SECTOR_SHFT		9
138
139static struct mtx blkif_io_lock;
140
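/*
 * Convert a pseudo-physical frame number into the corresponding machine
 * frame number.
 */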
141static vm_paddr_t
142pfn_to_mfn(vm_paddr_t pfn)
143{
144	return (phystomach(pfn << PAGE_SHIFT) >> PAGE_SHIFT);
145}
146
147/*
148 * Translate Linux major/minor to an appropriate name and unit
149 * number. For HVM guests, this allows us to use the same drive names
150 * with blkfront as the emulated drives, easing transition slightly.
151 */
152static void
153blkfront_vdevice_to_unit(int vdevice, int *unit, const char **name)
154{
155	static struct vdev_info {
156		int major;
157		int shift;
158		int base;
159		const char *name;
160	} info[] = {
161		{3,	6,	0,	"ad"},	/* ide0 */
162		{22,	6,	2,	"ad"},	/* ide1 */
163		{33,	6,	4,	"ad"},	/* ide2 */
164		{34,	6,	6,	"ad"},	/* ide3 */
165		{56,	6,	8,	"ad"},	/* ide4 */
166		{57,	6,	10,	"ad"},	/* ide5 */
167		{88,	6,	12,	"ad"},	/* ide6 */
168		{89,	6,	14,	"ad"},	/* ide7 */
169		{90,	6,	16,	"ad"},	/* ide8 */
170		{91,	6,	18,	"ad"},	/* ide9 */
171
172		{8,	4,	0,	"da"},	/* scsi disk0 */
173		{65,	4,	16,	"da"},	/* scsi disk1 */
174		{66,	4,	32,	"da"},	/* scsi disk2 */
175		{67,	4,	48,	"da"},	/* scsi disk3 */
176		{68,	4,	64,	"da"},	/* scsi disk4 */
177		{69,	4,	80,	"da"},	/* scsi disk5 */
178		{70,	4,	96,	"da"},	/* scsi disk6 */
179		{71,	4,	112,	"da"},	/* scsi disk7 */
180		{128,	4,	128,	"da"},	/* scsi disk8 */
181		{129,	4,	144,	"da"},	/* scsi disk9 */
182		{130,	4,	160,	"da"},	/* scsi disk10 */
183		{131,	4,	176,	"da"},	/* scsi disk11 */
184		{132,	4,	192,	"da"},	/* scsi disk12 */
185		{133,	4,	208,	"da"},	/* scsi disk13 */
186		{134,	4,	224,	"da"},	/* scsi disk14 */
187		{135,	4,	240,	"da"},	/* scsi disk15 */
188
189		{202,	4,	0,	"xbd"},	/* xbd */
190
191		{0,	0,	0,	NULL},
192	};
193	int major = vdevice >> 8;
194	int minor = vdevice & 0xff;
195	int i;
196
	if (vdevice & (1 << 28)) {
		*unit = (vdevice & ((1 << 28) - 1)) >> 8;
		*name = "xbd";
		return;
	}
201
202	for (i = 0; info[i].major; i++) {
203		if (info[i].major == major) {
204			*unit = info[i].base + (minor >> info[i].shift);
205			*name = info[i].name;
206			return;
207		}
208	}
209
210	*unit = minor >> 4;
211	*name = "xbd";
212}
213
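/*
 * Create and register the GEOM disk that exposes this virtual block device
 * to the rest of the system.
 */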
214int
215xlvbd_add(device_t dev, blkif_sector_t capacity,
216    int vdevice, uint16_t vdisk_info, uint16_t sector_size,
217    struct blkfront_info *info)
218{
219	struct xb_softc	*sc;
220	int	unit, error = 0;
221	const char *name;
222
223	blkfront_vdevice_to_unit(vdevice, &unit, &name);
224
225	sc = (struct xb_softc *)malloc(sizeof(*sc), M_DEVBUF, M_WAITOK|M_ZERO);
226	sc->xb_unit = unit;
227	sc->xb_info = info;
228	info->sc = sc;
229
230	if (strcmp(name, "xbd"))
231		device_printf(dev, "attaching as %s%d\n", name, unit);
232
233	memset(&sc->xb_disk, 0, sizeof(sc->xb_disk));
234	sc->xb_disk = disk_alloc();
235	sc->xb_disk->d_unit = sc->xb_unit;
236	sc->xb_disk->d_open = blkif_open;
237	sc->xb_disk->d_close = blkif_close;
238	sc->xb_disk->d_ioctl = blkif_ioctl;
239	sc->xb_disk->d_strategy = xb_strategy;
240	sc->xb_disk->d_dump = xb_dump;
241	sc->xb_disk->d_name = name;
242	sc->xb_disk->d_drv1 = sc;
243	sc->xb_disk->d_sectorsize = sector_size;
244
245	/* XXX */
246	sc->xb_disk->d_mediasize = capacity << XBD_SECTOR_SHFT;
247#if 0
248	sc->xb_disk->d_maxsize = DFLTPHYS;
249#else /* XXX: xen can't handle large single i/o requests */
250	sc->xb_disk->d_maxsize = 4096;
251#endif
252#ifdef notyet
253	XENPRINTF("attaching device 0x%x unit %d capacity %llu\n",
254		  xb_diskinfo[sc->xb_unit].device, sc->xb_unit,
255		  sc->xb_disk->d_mediasize);
256#endif
257	sc->xb_disk->d_flags = 0;
258	disk_create(sc->xb_disk, DISK_VERSION_00);
259	bioq_init(&sc->xb_bioq);
260
261	return error;
262}
263
264void
265xlvbd_del(struct blkfront_info *info)
266{
267	struct xb_softc	*sc;
268
269	sc = info->sc;
270	disk_destroy(sc->xb_disk);
271}
272/************************ end VBD support *****************/
273
/*
 * Read/write routine for a buffer: find the proper unit, place the bio on
 * its sort queue, and kick the controller to start the I/O.
 */
278static void
279xb_strategy(struct bio *bp)
280{
281	struct xb_softc	*sc = (struct xb_softc *)bp->bio_disk->d_drv1;
282
283	/* bogus disk? */
284	if (sc == NULL) {
285		bp->bio_error = EINVAL;
286		bp->bio_flags |= BIO_ERROR;
287		goto bad;
288	}
289
290	DPRINTK("");
291
292	/*
293	 * Place it in the queue of disk activities for this disk
294	 */
295	mtx_lock(&blkif_io_lock);
296
297	bioq_disksort(&sc->xb_bioq, bp);
298	xb_startio(sc);
299
300	mtx_unlock(&blkif_io_lock);
301	return;
302
303 bad:
304	/*
	 * Correctly set the bio to indicate a failed transfer.
306	 */
307	bp->bio_resid = bp->bio_bcount;
308	biodone(bp);
309	return;
310}
311
312static void xb_quiesce(struct blkfront_info *info);
313// Quiesce the disk writes for a dump file before allowing the next buffer.
314static void
315xb_quiesce(struct blkfront_info *info)
316{
317	int		mtd;
318
319	// While there are outstanding requests
320	while (blkif_queued_requests) {
321		RING_FINAL_CHECK_FOR_RESPONSES(&info->ring, mtd);
322		if (mtd) {
			// Received request completions; process them.
324			blkif_int(info);
325		}
326		if (blkif_queued_requests) {
327			// Still pending requests, wait for the disk i/o to complete
328			HYPERVISOR_yield();
329		}
330	}
331}
332
333// Some bio structures for dumping core
334#define DUMP_BIO_NO 16				// 16 * 4KB = 64KB dump block
335static	struct bio		xb_dump_bp[DUMP_BIO_NO];
336
337// Kernel dump function for a paravirtualized disk device
338static int
339xb_dump(void *arg, void *virtual, vm_offset_t physical, off_t offset,
340        size_t length)
341{
	int			 sbp;
	int			 mbp;
	size_t			 chunk;
	struct disk		*dp = arg;
	struct xb_softc		*sc = (struct xb_softc *) dp->d_drv1;
	int			 rc = 0;
348
349	xb_quiesce(sc->xb_info);		// All quiet on the western front.
350	if (length > 0) {
351		// If this lock is held, then this module is failing, and a successful
352		// kernel dump is highly unlikely anyway.
353		mtx_lock(&blkif_io_lock);
354		// Split the 64KB block into 16 4KB blocks
355		for (sbp=0; length>0 && sbp<DUMP_BIO_NO; sbp++) {
356			chunk = length > PAGE_SIZE ? PAGE_SIZE : length;
357			xb_dump_bp[sbp].bio_disk   = dp;
358			xb_dump_bp[sbp].bio_pblkno = offset / dp->d_sectorsize;
359			xb_dump_bp[sbp].bio_bcount = chunk;
360			xb_dump_bp[sbp].bio_resid  = chunk;
361			xb_dump_bp[sbp].bio_data   = virtual;
362			xb_dump_bp[sbp].bio_cmd    = BIO_WRITE;
363			xb_dump_bp[sbp].bio_done   = NULL;
364
365			bioq_disksort(&sc->xb_bioq, &xb_dump_bp[sbp]);
366
367			length -= chunk;
368			offset += chunk;
369			virtual = (char *) virtual + chunk;
370		}
371		// Tell DOM0 to do the I/O
372		xb_startio(sc);
373		mtx_unlock(&blkif_io_lock);
374
		// Must wait for the completion: the dump routine reuses the
		// same 16 x 4KB buffer space.
		xb_quiesce(sc->xb_info);	// All quiet on the eastern front
378		// If there were any errors, bail out...
379		for (mbp=0; mbp<sbp; mbp++) {
380			if ((rc = xb_dump_bp[mbp].bio_error)) break;
381		}
382	}
383	return (rc);
384}
385
386
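/*
 * Probe: claim XenBus devices of type "vbd".
 */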
387static int
388blkfront_probe(device_t dev)
389{
390
391	if (!strcmp(xenbus_get_type(dev), "vbd")) {
392		device_set_desc(dev, "Virtual Block Device");
393		device_quiet(dev);
394		return (0);
395	}
396
397	return (ENXIO);
398}
399
/*
 * Attach: read the virtual device number from the XenStore, set up the
 * shared ring and event channel, publish them to the backend, and wait
 * for the backend to signal that it is ready to connect.
 */
405static int
406blkfront_attach(device_t dev)
407{
408	int error, vdevice, i, unit;
409	struct blkfront_info *info;
410	const char *name;
411
412	/* FIXME: Use dynamic device id if this is not set. */
413	error = xenbus_scanf(XBT_NIL, xenbus_get_node(dev),
414	    "virtual-device", NULL, "%i", &vdevice);
415	if (error) {
416		xenbus_dev_fatal(dev, error, "reading virtual-device");
		printf("couldn't find virtual device\n");
418		return (error);
419	}
420
421	blkfront_vdevice_to_unit(vdevice, &unit, &name);
422	if (!strcmp(name, "xbd"))
423		device_set_unit(dev, unit);
424
425	info = device_get_softc(dev);
426
	/*
	 * XXX debug only: verify that the newly allocated softc is zero-filled.
	 */
	for (i = 0; i < sizeof(*info); i++)
		if (((uint8_t *)info)[i] != 0)
			panic("non-null memory");
433
434	info->shadow_free = 0;
435	info->xbdev = dev;
436	info->vdevice = vdevice;
437	info->connected = BLKIF_STATE_DISCONNECTED;
438
439	/* work queue needed ? */
440	for (i = 0; i < BLK_RING_SIZE; i++)
441		info->shadow[i].req.id = i+1;
442	info->shadow[BLK_RING_SIZE-1].req.id = 0x0fffffff;
443
	/* The last component of the front-end dir is a number; use it as the id. */
445	info->handle = strtoul(strrchr(xenbus_get_node(dev),'/')+1, NULL, 0);
446
447	error = talk_to_backend(dev, info);
448	if (error)
449		return (error);
450
451	return (0);
452}
453
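/*
 * Suspend: mark the interface suspended so that no new requests are placed
 * on the shared ring while the domain is being saved.
 */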
454static int
455blkfront_suspend(device_t dev)
456{
457	struct blkfront_info *info = device_get_softc(dev);
458
459	/* Prevent new requests being issued until we fix things up. */
460	mtx_lock(&blkif_io_lock);
461	info->connected = BLKIF_STATE_SUSPENDED;
462	mtx_unlock(&blkif_io_lock);
463
464	return (0);
465}
466
467static int
468blkfront_resume(device_t dev)
469{
470	struct blkfront_info *info = device_get_softc(dev);
471	int err;
472
473	DPRINTK("blkfront_resume: %s\n", xenbus_get_node(dev));
474
475	blkif_free(info, 1);
476	err = talk_to_backend(dev, info);
477	if (info->connected == BLKIF_STATE_SUSPENDED && !err)
478		blkif_recover(info);
479
480	return (err);
481}
482
483/* Common code used when first setting up, and when resuming. */
484static int
485talk_to_backend(device_t dev, struct blkfront_info *info)
486{
487	const char *message = NULL;
488	struct xenbus_transaction xbt;
489	int err;
490
491	/* Create shared ring, alloc event channel. */
492	err = setup_blkring(dev, info);
493	if (err)
494		goto out;
495
496 again:
497	err = xenbus_transaction_start(&xbt);
498	if (err) {
499		xenbus_dev_fatal(dev, err, "starting transaction");
500		goto destroy_blkring;
501	}
502
503	err = xenbus_printf(xbt, xenbus_get_node(dev),
504			    "ring-ref","%u", info->ring_ref);
505	if (err) {
506		message = "writing ring-ref";
507		goto abort_transaction;
508	}
509	err = xenbus_printf(xbt, xenbus_get_node(dev),
510		"event-channel", "%u", irq_to_evtchn_port(info->irq));
511	if (err) {
512		message = "writing event-channel";
513		goto abort_transaction;
514	}
515	err = xenbus_printf(xbt, xenbus_get_node(dev),
516		"protocol", "%s", XEN_IO_PROTO_ABI_NATIVE);
517	if (err) {
518		message = "writing protocol";
519		goto abort_transaction;
520	}
521
522	err = xenbus_transaction_end(xbt, 0);
523	if (err) {
524		if (err == EAGAIN)
525			goto again;
526		xenbus_dev_fatal(dev, err, "completing transaction");
527		goto destroy_blkring;
528	}
529	xenbus_set_state(dev, XenbusStateInitialised);
530
531	return 0;
532
533 abort_transaction:
534	xenbus_transaction_end(xbt, 1);
535	if (message)
536		xenbus_dev_fatal(dev, err, "%s", message);
537 destroy_blkring:
538	blkif_free(info, 0);
539 out:
540	return err;
541}
542
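/*
 * Allocate the shared ring page, grant the backend access to it, and bind
 * an interrupt handler to the event channel used for completions.
 */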
543static int
544setup_blkring(device_t dev, struct blkfront_info *info)
545{
546	blkif_sring_t *sring;
547	int error;
548
549	info->ring_ref = GRANT_INVALID_REF;
550
551	sring = (blkif_sring_t *)malloc(PAGE_SIZE, M_DEVBUF, M_NOWAIT|M_ZERO);
552	if (sring == NULL) {
553		xenbus_dev_fatal(dev, ENOMEM, "allocating shared ring");
554		return ENOMEM;
555	}
556	SHARED_RING_INIT(sring);
557	FRONT_RING_INIT(&info->ring, sring, PAGE_SIZE);
558
559	error = xenbus_grant_ring(dev,
560	    (vtomach(info->ring.sring) >> PAGE_SHIFT), &info->ring_ref);
561	if (error) {
562		free(sring, M_DEVBUF);
563		info->ring.sring = NULL;
564		goto fail;
565	}
566
567	error = bind_listening_port_to_irqhandler(xenbus_get_otherend_id(dev),
568	    "xbd", (driver_intr_t *)blkif_int, info,
569	    INTR_TYPE_BIO | INTR_MPSAFE, &info->irq);
570	if (error) {
571		xenbus_dev_fatal(dev, error,
572		    "bind_evtchn_to_irqhandler failed");
573		goto fail;
574	}
575
576	return (0);
577 fail:
578	blkif_free(info, 0);
579	return (error);
580}
581
582
583/**
584 * Callback received when the backend's state changes.
585 */
586static int
587blkfront_backend_changed(device_t dev, XenbusState backend_state)
588{
589	struct blkfront_info *info = device_get_softc(dev);
590
591	DPRINTK("backend_state=%d\n", backend_state);
592
593	switch (backend_state) {
594	case XenbusStateUnknown:
595	case XenbusStateInitialising:
596	case XenbusStateInitWait:
597	case XenbusStateInitialised:
598	case XenbusStateClosed:
599	case XenbusStateReconfigured:
600	case XenbusStateReconfiguring:
601		break;
602
603	case XenbusStateConnected:
604		connect(dev, info);
605		break;
606
607	case XenbusStateClosing:
608		if (info->users > 0)
609			xenbus_dev_error(dev, -EBUSY,
610					 "Device in use; refusing to close");
611		else
612			blkfront_closing(dev);
613#ifdef notyet
614		bd = bdget(info->dev);
615		if (bd == NULL)
616			xenbus_dev_fatal(dev, -ENODEV, "bdget failed");
617
618		down(&bd->bd_sem);
619		if (info->users > 0)
620			xenbus_dev_error(dev, -EBUSY,
621					 "Device in use; refusing to close");
622		else
623			blkfront_closing(dev);
624		up(&bd->bd_sem);
625		bdput(bd);
626#endif
627	}
628
629	return (0);
630}
631
/*
** Invoked when the backend is finally 'ready' (and has provided the
** details about the physical device: number of sectors, sector size, etc).
*/
636static void
637connect(device_t dev, struct blkfront_info *info)
638{
639	unsigned long sectors, sector_size;
640	unsigned int binfo;
641	int err;
642
	if ((info->connected == BLKIF_STATE_CONNECTED) ||
	    (info->connected == BLKIF_STATE_SUSPENDED))
645		return;
646
647	DPRINTK("blkfront.c:connect:%s.\n", xenbus_get_otherend_path(dev));
648
649	err = xenbus_gather(XBT_NIL, xenbus_get_otherend_path(dev),
650			    "sectors", "%lu", &sectors,
651			    "info", "%u", &binfo,
652			    "sector-size", "%lu", &sector_size,
653			    NULL);
654	if (err) {
655		xenbus_dev_fatal(dev, err,
656		    "reading backend fields at %s",
657		    xenbus_get_otherend_path(dev));
658		return;
659	}
660	err = xenbus_gather(XBT_NIL, xenbus_get_otherend_path(dev),
661			    "feature-barrier", "%lu", &info->feature_barrier,
662			    NULL);
663	if (err)
664		info->feature_barrier = 0;
665
666	device_printf(dev, "%juMB <%s> at %s",
667	    (uintmax_t) sectors / (1048576 / sector_size),
668	    device_get_desc(dev),
669	    xenbus_get_node(dev));
670	bus_print_child_footer(device_get_parent(dev), dev);
671
672	xlvbd_add(dev, sectors, info->vdevice, binfo, sector_size, info);
673
674	(void)xenbus_set_state(dev, XenbusStateConnected);
675
676	/* Kick pending requests. */
677	mtx_lock(&blkif_io_lock);
678	info->connected = BLKIF_STATE_CONNECTED;
679	kick_pending_request_queues(info);
680	mtx_unlock(&blkif_io_lock);
681	info->is_ready = 1;
682
683#if 0
684	add_disk(info->gd);
685#endif
686}
687
688/**
689 * Handle the change of state of the backend to Closing.  We must delete our
690 * device-layer structures now, to ensure that writes are flushed through to
 * the backend.  Once this is done, we can switch to Closed in
692 * acknowledgement.
693 */
694static void
695blkfront_closing(device_t dev)
696{
697	struct blkfront_info *info = device_get_softc(dev);
698
699	DPRINTK("blkfront_closing: %s removed\n", xenbus_get_node(dev));
700
701	if (info->mi) {
702		DPRINTK("Calling xlvbd_del\n");
703		xlvbd_del(info);
704		info->mi = NULL;
705	}
706
707	xenbus_set_state(dev, XenbusStateClosed);
708}
709
710
711static int
712blkfront_detach(device_t dev)
713{
714	struct blkfront_info *info = device_get_softc(dev);
715
716	DPRINTK("blkfront_remove: %s removed\n", xenbus_get_node(dev));
717
718	blkif_free(info, 0);
719
720	return 0;
721}
722
723
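/*
 * The shadow array doubles as a free list of request ids: each free entry's
 * req.id field links to the next free slot.  These helpers pop and push
 * entries while tracking the number of requests outstanding to the backend.
 */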
724static inline int
725GET_ID_FROM_FREELIST(struct blkfront_info *info)
726{
727	unsigned long nfree = info->shadow_free;
728
729	KASSERT(nfree <= BLK_RING_SIZE, ("free %lu > RING_SIZE", nfree));
730	info->shadow_free = info->shadow[nfree].req.id;
731	info->shadow[nfree].req.id = 0x0fffffee; /* debug */
732	atomic_add_int(&blkif_queued_requests, 1);
733	return nfree;
734}
735
736static inline void
737ADD_ID_TO_FREELIST(struct blkfront_info *info, unsigned long id)
738{
739	info->shadow[id].req.id  = info->shadow_free;
740	info->shadow[id].request = 0;
741	info->shadow_free = id;
742	atomic_subtract_int(&blkif_queued_requests, 1);
743}
744
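/*
 * Publish any queued requests to the backend and notify it over the event
 * channel if it is waiting for work.
 */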
745static inline void
746flush_requests(struct blkfront_info *info)
747{
748	int notify;
749
750	RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&info->ring, notify);
751
752	if (notify)
753		notify_remote_via_irq(info->irq);
754}
755
756static void
757kick_pending_request_queues(struct blkfront_info *info)
758{
	/* XXX check whether this can be simplified */
760#if 0
761	if (!RING_FULL(&info->ring)) {
762		/* Re-enable calldowns. */
763		blk_start_queue(info->rq);
764		/* Kick things off immediately. */
765		do_blkif_request(info->rq);
766	}
767#endif
768	if (!RING_FULL(&info->ring)) {
769#if 0
770		sc = LIST_FIRST(&xbsl_head);
771		LIST_REMOVE(sc, entry);
772		/* Re-enable calldowns. */
773		blk_start_queue(di->rq);
774#endif
775		/* Kick things off immediately. */
776		xb_startio(info->sc);
777	}
778}
779
780#if 0
781/* XXX */
782static void blkif_restart_queue(void *arg)
783{
784	struct blkfront_info *info = (struct blkfront_info *)arg;
785
786	mtx_lock(&blkif_io_lock);
787	kick_pending_request_queues(info);
788	mtx_unlock(&blkif_io_lock);
789}
790#endif
791
792static void blkif_restart_queue_callback(void *arg)
793{
794#if 0
795	struct blkfront_info *info = (struct blkfront_info *)arg;
796	/* XXX BSD equiv ? */
797
798	schedule_work(&info->work);
799#endif
800}
801
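/*
 * GEOM disk open method: mark the device open and count the user so that a
 * pending close request from the backend can be honored later.
 */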
802static int
803blkif_open(struct disk *dp)
804{
805	struct xb_softc	*sc = (struct xb_softc *)dp->d_drv1;
806
	if (sc == NULL) {
		printf("xbd: disk not found\n");
		return (ENXIO);
	}
811
812	sc->xb_flags |= XB_OPEN;
813	sc->xb_info->users++;
814	return (0);
815}
816
817static int
818blkif_close(struct disk *dp)
819{
820	struct xb_softc	*sc = (struct xb_softc *)dp->d_drv1;
821
822	if (sc == NULL)
823		return (ENXIO);
824	sc->xb_flags &= ~XB_OPEN;
825	if (--(sc->xb_info->users) == 0) {
826		/* Check whether we have been instructed to close.  We will
827		   have ignored this request initially, as the device was
828		   still mounted. */
829		device_t dev = sc->xb_info->xbdev;
830		XenbusState state =
831			xenbus_read_driver_state(xenbus_get_otherend_path(dev));
832
833		if (state == XenbusStateClosing)
834			blkfront_closing(dev);
835	}
836	return (0);
837}
838
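/*
 * GEOM disk ioctl method: no device-specific ioctls are supported.
 */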
839static int
840blkif_ioctl(struct disk *dp, u_long cmd, void *addr, int flag, struct thread *td)
841{
842	struct xb_softc	*sc = (struct xb_softc *)dp->d_drv1;
843
844	if (sc == NULL)
845		return (ENXIO);
846
847	return (ENOTTY);
848}
849
850
/*
 * blkif_queue_request
 *
 * Queue the block I/O request described by a bio on the shared ring.
 *
 * id: for guest use only.
 * operation: BLKIF_OP_{READ,WRITE,PROBE}
 * buffer: buffer to read/write into; this should be a
 *   virtual address in the guest os.
 */
static int
blkif_queue_request(struct bio *bp)
{
863	caddr_t alignbuf;
864	vm_paddr_t buffer_ma;
865	blkif_request_t     *ring_req;
866	unsigned long id;
867	uint64_t fsect, lsect;
868	struct xb_softc *sc = (struct xb_softc *)bp->bio_disk->d_drv1;
869	struct blkfront_info *info = sc->xb_info;
870	int ref;
871
872	if (unlikely(sc->xb_info->connected != BLKIF_STATE_CONNECTED))
873		return 1;
874
875	if (gnttab_alloc_grant_references(
876		    BLKIF_MAX_SEGMENTS_PER_REQUEST, &gref_head) < 0) {
877		gnttab_request_free_callback(
878			&info->callback,
879			blkif_restart_queue_callback,
880			info,
881			BLKIF_MAX_SEGMENTS_PER_REQUEST);
882		return 1;
883	}
884
885	/* Check if the buffer is properly aligned */
	if ((vm_offset_t)bp->bio_data & PAGE_MASK) {
		int align = (bp->bio_bcount < PAGE_SIZE/2) ? XBD_SECTOR_SIZE :
			PAGE_SIZE;
		caddr_t newbuf = malloc(bp->bio_bcount + align, M_DEVBUF,
					M_NOWAIT);

		/* No memory for a bounce buffer: retry this bio later. */
		if (newbuf == NULL) {
			gnttab_free_grant_references(gref_head);
			return 1;
		}

		alignbuf = (char *)roundup2((u_long)newbuf, align);
893
		/* stash the bounce buffer pointers for the completion handler */
895		bp->bio_driver1 = newbuf;
896		bp->bio_driver2 = alignbuf;
897
898		/* Copy the data for a write */
899		if (bp->bio_cmd == BIO_WRITE)
900			bcopy(bp->bio_data, alignbuf, bp->bio_bcount);
901	} else
902		alignbuf = bp->bio_data;
903
904	/* Fill out a communications ring structure. */
905	ring_req 	         = RING_GET_REQUEST(&info->ring,
906						    info->ring.req_prod_pvt);
907	id		         = GET_ID_FROM_FREELIST(info);
908	info->shadow[id].request = (unsigned long)bp;
909
910	ring_req->id 	         = id;
911	ring_req->operation 	 = (bp->bio_cmd == BIO_READ) ? BLKIF_OP_READ :
912		BLKIF_OP_WRITE;
913
914	ring_req->sector_number= (blkif_sector_t)bp->bio_pblkno;
915	ring_req->handle 	  = (blkif_vdev_t)(uintptr_t)sc->xb_disk;
916
917	ring_req->nr_segments  = 0;	/* XXX not doing scatter/gather since buffer
918					 * chaining is not supported.
919					 */
920
921	buffer_ma = vtomach(alignbuf);
922	fsect = (buffer_ma & PAGE_MASK) >> XBD_SECTOR_SHFT;
923	lsect = fsect + (bp->bio_bcount >> XBD_SECTOR_SHFT) - 1;
924	/* install a grant reference. */
925	ref = gnttab_claim_grant_reference(&gref_head);
926	KASSERT( ref != -ENOSPC, ("grant_reference failed") );
927
928	gnttab_grant_foreign_access_ref(
929		ref,
930		xenbus_get_otherend_id(info->xbdev),
931		buffer_ma >> PAGE_SHIFT,
		ring_req->operation & 1 ); /* read-only for a write: the backend only reads the buffer */
933	info->shadow[id].frame[ring_req->nr_segments] =
934		buffer_ma >> PAGE_SHIFT;
935
936	ring_req->seg[ring_req->nr_segments] =
937		(struct blkif_request_segment) {
938			.gref       = ref,
939			.first_sect = fsect,
940			.last_sect  = lsect };
941
942	ring_req->nr_segments++;
943	KASSERT((buffer_ma & (XBD_SECTOR_SIZE-1)) == 0,
944		("XEN buffer must be sector aligned"));
945	KASSERT(lsect <= 7,
946		("XEN disk driver data cannot cross a page boundary"));
947
948	buffer_ma &= ~PAGE_MASK;
949
950	info->ring.req_prod_pvt++;
951
952	/* Keep a private copy so we can reissue requests when recovering. */
953	info->shadow[id].req = *ring_req;
954
955	gnttab_free_grant_references(gref_head);
956
957	return 0;
958}
959
960
961
962/*
963 * Dequeue buffers and place them in the shared communication ring.
964 * Return when no more requests can be accepted or all buffers have
965 * been queued.
966 *
967 * Signal XEN once the ring has been filled out.
968 */
969static void
970xb_startio(struct xb_softc *sc)
971{
972	struct bio		*bp;
973	int			queued = 0;
974	struct blkfront_info *info = sc->xb_info;
975	DPRINTK("");
976
977	mtx_assert(&blkif_io_lock, MA_OWNED);
978
979	while ((bp = bioq_takefirst(&sc->xb_bioq)) != NULL) {
980
981		if (RING_FULL(&info->ring))
982			goto wait;
983
984		if (blkif_queue_request(bp)) {
985		wait:
986			bioq_insert_head(&sc->xb_bioq, bp);
987			break;
988		}
989		queued++;
990	}
991
992	if (queued != 0)
993		flush_requests(sc->xb_info);
994}
995
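/*
 * Interrupt handler: harvest responses from the shared ring, complete the
 * corresponding bios, and kick the queue to submit any waiting requests.
 */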
996static void
997blkif_int(void *xsc)
998{
999	struct xb_softc *sc = NULL;
1000	struct bio *bp;
1001	blkif_response_t *bret;
1002	RING_IDX i, rp;
1003	struct blkfront_info *info = xsc;
1004	DPRINTK("");
1005
1006	TRACE_ENTER;
1007
1008	mtx_lock(&blkif_io_lock);
1009
1010	if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) {
1011		mtx_unlock(&blkif_io_lock);
1012		return;
1013	}
1014
1015 again:
1016	rp = info->ring.sring->rsp_prod;
1017	rmb(); /* Ensure we see queued responses up to 'rp'. */
1018
1019	for (i = info->ring.rsp_cons; i != rp; i++) {
1020		unsigned long id;
1021
1022		bret = RING_GET_RESPONSE(&info->ring, i);
1023		id   = bret->id;
1024		bp   = (struct bio *)info->shadow[id].request;
1025
1026		blkif_completion(&info->shadow[id]);
1027
1028		ADD_ID_TO_FREELIST(info, id);
1029
1030		switch (bret->operation) {
1031		case BLKIF_OP_READ:
			/* if a bounce buffer was used, copy the data back to the caller */
1033			if (bp->bio_driver1)
1034				bcopy(bp->bio_driver2, bp->bio_data, bp->bio_bcount);
1035			/* FALLTHROUGH */
1036		case BLKIF_OP_WRITE:
1037
1038			/* free the copy buffer */
1039			if (bp->bio_driver1) {
1040				free(bp->bio_driver1, M_DEVBUF);
1041				bp->bio_driver1 = NULL;
1042			}
1043
			if (unlikely(bret->status != BLKIF_RSP_OKAY)) {
				printf("Bad return from blkdev data request: %x\n",
				    bret->status);
				bp->bio_flags |= BIO_ERROR;
			}
1049
1050			sc = (struct xb_softc *)bp->bio_disk->d_drv1;
1051
1052			if (bp->bio_flags & BIO_ERROR)
1053				bp->bio_error = EIO;
1054			else
1055				bp->bio_resid = 0;
1056
1057			biodone(bp);
1058			break;
1059		default:
1060			panic("received invalid operation");
1061			break;
1062		}
1063	}
1064
1065	info->ring.rsp_cons = i;
1066
1067	if (i != info->ring.req_prod_pvt) {
1068		int more_to_do;
1069		RING_FINAL_CHECK_FOR_RESPONSES(&info->ring, more_to_do);
1070		if (more_to_do)
1071			goto again;
1072	} else {
1073		info->ring.sring->rsp_event = i + 1;
1074	}
1075
1076	kick_pending_request_queues(info);
1077
1078	mtx_unlock(&blkif_io_lock);
1079}
1080
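/*
 * Tear down the connection to the backend: release the shared ring grant
 * and unbind the event channel interrupt handler.
 */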
1081static void
1082blkif_free(struct blkfront_info *info, int suspend)
1083{
1084
	/* Prevent new requests being issued until we fix things up. */
1086	mtx_lock(&blkif_io_lock);
1087	info->connected = suspend ?
1088		BLKIF_STATE_SUSPENDED : BLKIF_STATE_DISCONNECTED;
1089	mtx_unlock(&blkif_io_lock);
1090
1091	/* Free resources associated with old device channel. */
1092	if (info->ring_ref != GRANT_INVALID_REF) {
1093		gnttab_end_foreign_access(info->ring_ref,
1094					  info->ring.sring);
1095		info->ring_ref = GRANT_INVALID_REF;
1096		info->ring.sring = NULL;
1097	}
1098	if (info->irq)
1099		unbind_from_irqhandler(info->irq);
1100	info->irq = 0;
1101
1102}
1103
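/*
 * A request has completed: revoke the backend's access to the granted
 * data pages.
 */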
1104static void
1105blkif_completion(struct blk_shadow *s)
1106{
1107	int i;
1108
1109	for (i = 0; i < s->req.nr_segments; i++)
1110		gnttab_end_foreign_access(s->req.seg[i].gref, 0UL);
1111}
1112
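/*
 * Recover after a suspend/resume or migration: rebuild the shadow free list
 * and requeue any requests that were in flight, re-granting their pages to
 * the (possibly new) backend.
 */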
1113static void
1114blkif_recover(struct blkfront_info *info)
1115{
1116	int i, j;
1117	blkif_request_t *req;
1118	struct blk_shadow *copy;
1119
1120	if (!info->sc)
1121		return;
1122
1123	/* Stage 1: Make a safe copy of the shadow state. */
1124	copy = (struct blk_shadow *)malloc(sizeof(info->shadow), M_DEVBUF, M_NOWAIT|M_ZERO);
1125	memcpy(copy, info->shadow, sizeof(info->shadow));
1126
1127	/* Stage 2: Set up free list. */
1128	memset(&info->shadow, 0, sizeof(info->shadow));
1129	for (i = 0; i < BLK_RING_SIZE; i++)
1130		info->shadow[i].req.id = i+1;
1131	info->shadow_free = info->ring.req_prod_pvt;
1132	info->shadow[BLK_RING_SIZE-1].req.id = 0x0fffffff;
1133
1134	/* Stage 3: Find pending requests and requeue them. */
1135	for (i = 0; i < BLK_RING_SIZE; i++) {
1136		/* Not in use? */
1137		if (copy[i].request == 0)
1138			continue;
1139
1140		/* Grab a request slot and copy shadow state into it. */
1141		req = RING_GET_REQUEST(
1142			&info->ring, info->ring.req_prod_pvt);
1143		*req = copy[i].req;
1144
1145		/* We get a new request id, and must reset the shadow state. */
1146		req->id = GET_ID_FROM_FREELIST(info);
1147		memcpy(&info->shadow[req->id], &copy[i], sizeof(copy[i]));
1148
1149		/* Rewrite any grant references invalidated by suspend/resume. */
1150		for (j = 0; j < req->nr_segments; j++)
1151			gnttab_grant_foreign_access_ref(
1152				req->seg[j].gref,
1153				xenbus_get_otherend_id(info->xbdev),
1154				pfn_to_mfn(info->shadow[req->id].frame[j]),
1155				0 /* assume not readonly */);
1156
1157		info->shadow[req->id].req = *req;
1158
1159		info->ring.req_prod_pvt++;
1160	}
1161
1162	free(copy, M_DEVBUF);
1163
1164	xenbus_set_state(info->xbdev, XenbusStateConnected);
1165
1166	/* Now safe for us to use the shared ring */
1167	mtx_lock(&blkif_io_lock);
1168	info->connected = BLKIF_STATE_CONNECTED;
1169	mtx_unlock(&blkif_io_lock);
1170
1171	/* Send off requeued requests */
1172	mtx_lock(&blkif_io_lock);
1173	flush_requests(info);
1174
1175	/* Kick any other new requests queued since we resumed */
1176	kick_pending_request_queues(info);
1177	mtx_unlock(&blkif_io_lock);
1178}
1179
1180/* ** Driver registration ** */
1181static device_method_t blkfront_methods[] = {
1182	/* Device interface */
1183	DEVMETHOD(device_probe,         blkfront_probe),
1184	DEVMETHOD(device_attach,        blkfront_attach),
1185	DEVMETHOD(device_detach,        blkfront_detach),
1186	DEVMETHOD(device_shutdown,      bus_generic_shutdown),
1187	DEVMETHOD(device_suspend,       blkfront_suspend),
1188	DEVMETHOD(device_resume,        blkfront_resume),
1189
1190	/* Xenbus interface */
1191	DEVMETHOD(xenbus_backend_changed, blkfront_backend_changed),
1192
1193	{ 0, 0 }
1194};
1195
1196static driver_t blkfront_driver = {
1197	"xbd",
1198	blkfront_methods,
1199	sizeof(struct blkfront_info),
1200};
1201devclass_t blkfront_devclass;
1202
1203DRIVER_MODULE(xbd, xenbus, blkfront_driver, blkfront_devclass, 0, 0);
1204
1205MTX_SYSINIT(ioreq, &blkif_io_lock, "BIO LOCK", MTX_NOWITNESS); /* XXX how does one enroll a lock? */
1206
1207