1/*-
2 * SPDX-License-Identifier: BSD-2-Clause
3 *
4 * Copyright (c) 2011, Bryan Venteicher <bryanv@FreeBSD.org>
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice unmodified, this list of conditions, and the following
12 *    disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 *    notice, this list of conditions and the following disclaimer in the
15 *    documentation and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 */
28
29/* Driver for VirtIO block devices. */
30
31#include <sys/param.h>
32#include <sys/systm.h>
33#include <sys/kernel.h>
34#include <sys/bio.h>
35#include <sys/malloc.h>
36#include <sys/module.h>
37#include <sys/msan.h>
38#include <sys/sglist.h>
39#include <sys/sysctl.h>
40#include <sys/lock.h>
41#include <sys/mutex.h>
42#include <sys/queue.h>
43
44#include <geom/geom.h>
45#include <geom/geom_disk.h>
46
47#include <machine/bus.h>
48#include <machine/resource.h>
49#include <sys/bus.h>
50#include <sys/rman.h>
51
52#include <dev/virtio/virtio.h>
53#include <dev/virtio/virtqueue.h>
54#include <dev/virtio/block/virtio_blk.h>
55
56#include "virtio_if.h"
57
58struct vtblk_request {
59	struct vtblk_softc		*vbr_sc;
60	bus_dmamap_t			 vbr_mapp;
61
62	/* Fields after this point are zeroed for each request. */
63	struct virtio_blk_outhdr	 vbr_hdr;
64	struct bio			*vbr_bp;
65	uint8_t				 vbr_ack;
66	uint8_t				 vbr_requeue_on_error;
67	uint8_t				 vbr_busdma_wait;
68	int				 vbr_error;
69	TAILQ_ENTRY(vtblk_request)	 vbr_link;
70};
71
72enum vtblk_cache_mode {
73	VTBLK_CACHE_WRITETHROUGH,
74	VTBLK_CACHE_WRITEBACK,
75	VTBLK_CACHE_MAX
76};
77
78struct vtblk_softc {
79	device_t		 vtblk_dev;
80	struct mtx		 vtblk_mtx;
81	uint64_t		 vtblk_features;
82	uint32_t		 vtblk_flags;
83#define VTBLK_FLAG_INDIRECT	0x0001
84#define VTBLK_FLAG_DETACH	0x0002
85#define VTBLK_FLAG_SUSPEND	0x0004
86#define VTBLK_FLAG_BARRIER	0x0008
87#define VTBLK_FLAG_WCE_CONFIG	0x0010
88#define VTBLK_FLAG_BUSDMA_WAIT	0x0020
89#define VTBLK_FLAG_BUSDMA_ALIGN	0x0040
90
91	struct virtqueue	*vtblk_vq;
92	struct sglist		*vtblk_sglist;
93	bus_dma_tag_t		 vtblk_dmat;
94	struct disk		*vtblk_disk;
95
96	struct bio_queue_head	 vtblk_bioq;
97	TAILQ_HEAD(, vtblk_request)
98				 vtblk_req_free;
99	TAILQ_HEAD(, vtblk_request)
100				 vtblk_req_ready;
101	struct vtblk_request	*vtblk_req_ordered;
102
103	int			 vtblk_max_nsegs;
104	int			 vtblk_request_count;
105	enum vtblk_cache_mode	 vtblk_write_cache;
106
107	struct bio_queue	 vtblk_dump_queue;
108	struct vtblk_request	 vtblk_dump_request;
109};
110
111static struct virtio_feature_desc vtblk_feature_desc[] = {
112	{ VIRTIO_BLK_F_BARRIER,		"HostBarrier"	},
113	{ VIRTIO_BLK_F_SIZE_MAX,	"MaxSegSize"	},
114	{ VIRTIO_BLK_F_SEG_MAX,		"MaxNumSegs"	},
115	{ VIRTIO_BLK_F_GEOMETRY,	"DiskGeometry"	},
116	{ VIRTIO_BLK_F_RO,		"ReadOnly"	},
117	{ VIRTIO_BLK_F_BLK_SIZE,	"BlockSize"	},
118	{ VIRTIO_BLK_F_SCSI,		"SCSICmds"	},
119	{ VIRTIO_BLK_F_FLUSH,		"FlushCmd"	},
120	{ VIRTIO_BLK_F_TOPOLOGY,	"Topology"	},
121	{ VIRTIO_BLK_F_CONFIG_WCE,	"ConfigWCE"	},
122	{ VIRTIO_BLK_F_MQ,		"Multiqueue"	},
123	{ VIRTIO_BLK_F_DISCARD,		"Discard"	},
124	{ VIRTIO_BLK_F_WRITE_ZEROES,	"WriteZeros"	},
125
126	{ 0, NULL }
127};
128
129static int	vtblk_modevent(module_t, int, void *);
130
131static int	vtblk_probe(device_t);
132static int	vtblk_attach(device_t);
133static int	vtblk_detach(device_t);
134static int	vtblk_suspend(device_t);
135static int	vtblk_resume(device_t);
136static int	vtblk_shutdown(device_t);
137static int	vtblk_attach_completed(device_t);
138static int	vtblk_config_change(device_t);
139
140static int	vtblk_open(struct disk *);
141static int	vtblk_close(struct disk *);
142static int	vtblk_ioctl(struct disk *, u_long, void *, int,
143		    struct thread *);
144static int	vtblk_dump(void *, void *, off_t, size_t);
145static void	vtblk_strategy(struct bio *);
146
147static int	vtblk_negotiate_features(struct vtblk_softc *);
148static int	vtblk_setup_features(struct vtblk_softc *);
149static int	vtblk_maximum_segments(struct vtblk_softc *,
150		    struct virtio_blk_config *);
151static int	vtblk_alloc_virtqueue(struct vtblk_softc *);
152static void	vtblk_resize_disk(struct vtblk_softc *, uint64_t);
153static void	vtblk_alloc_disk(struct vtblk_softc *,
154		    struct virtio_blk_config *);
155static void	vtblk_create_disk(struct vtblk_softc *);
156
157static int	vtblk_request_prealloc(struct vtblk_softc *);
158static void	vtblk_request_free(struct vtblk_softc *);
159static struct vtblk_request *
160		vtblk_request_dequeue(struct vtblk_softc *);
161static void	vtblk_request_enqueue(struct vtblk_softc *,
162		    struct vtblk_request *);
163static struct vtblk_request *
164		vtblk_request_next_ready(struct vtblk_softc *);
165static void	vtblk_request_requeue_ready(struct vtblk_softc *,
166		    struct vtblk_request *);
167static struct vtblk_request *
168		vtblk_request_next(struct vtblk_softc *);
169static struct vtblk_request *
170		vtblk_request_bio(struct vtblk_softc *);
171static int	vtblk_request_execute(struct vtblk_request *, int);
172static void	vtblk_request_execute_cb(void *,
173		    bus_dma_segment_t *, int, int);
174static int	vtblk_request_error(struct vtblk_request *);
175
176static void	vtblk_queue_completed(struct vtblk_softc *,
177		    struct bio_queue *);
178static void	vtblk_done_completed(struct vtblk_softc *,
179		    struct bio_queue *);
180static void	vtblk_drain_vq(struct vtblk_softc *);
181static void	vtblk_drain(struct vtblk_softc *);
182
183static void	vtblk_startio(struct vtblk_softc *);
184static void	vtblk_bio_done(struct vtblk_softc *, struct bio *, int);
185
186static void	vtblk_read_config(struct vtblk_softc *,
187		    struct virtio_blk_config *);
188static void	vtblk_ident(struct vtblk_softc *);
189static int	vtblk_poll_request(struct vtblk_softc *,
190		    struct vtblk_request *);
191static int	vtblk_quiesce(struct vtblk_softc *);
192static void	vtblk_vq_intr(void *);
193static void	vtblk_stop(struct vtblk_softc *);
194
195static void	vtblk_dump_quiesce(struct vtblk_softc *);
196static int	vtblk_dump_write(struct vtblk_softc *, void *, off_t, size_t);
197static int	vtblk_dump_flush(struct vtblk_softc *);
198static void	vtblk_dump_complete(struct vtblk_softc *);
199
200static void	vtblk_set_write_cache(struct vtblk_softc *, int);
201static int	vtblk_write_cache_enabled(struct vtblk_softc *sc,
202		    struct virtio_blk_config *);
203static int	vtblk_write_cache_sysctl(SYSCTL_HANDLER_ARGS);
204
205static void	vtblk_setup_sysctl(struct vtblk_softc *);
206static int	vtblk_tunable_int(struct vtblk_softc *, const char *, int);
207
208#define vtblk_modern(_sc) (((_sc)->vtblk_features & VIRTIO_F_VERSION_1) != 0)
209#define vtblk_htog16(_sc, _val)	virtio_htog16(vtblk_modern(_sc), _val)
210#define vtblk_htog32(_sc, _val)	virtio_htog32(vtblk_modern(_sc), _val)
211#define vtblk_htog64(_sc, _val)	virtio_htog64(vtblk_modern(_sc), _val)
212#define vtblk_gtoh16(_sc, _val)	virtio_gtoh16(vtblk_modern(_sc), _val)
213#define vtblk_gtoh32(_sc, _val)	virtio_gtoh32(vtblk_modern(_sc), _val)
214#define vtblk_gtoh64(_sc, _val)	virtio_gtoh64(vtblk_modern(_sc), _val)
215
216/* Tunables. */
217static int vtblk_no_ident = 0;
218TUNABLE_INT("hw.vtblk.no_ident", &vtblk_no_ident);
219static int vtblk_writecache_mode = -1;
220TUNABLE_INT("hw.vtblk.writecache_mode", &vtblk_writecache_mode);
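
/*
 * Example loader.conf(5) settings; a per-device form such as
 * hw.vtblk.0.writecache_mode is also honored (see vtblk_tunable_int()):
 *
 *   hw.vtblk.no_ident="1"
 *   hw.vtblk.writecache_mode="0"
 */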
221
222#define VTBLK_COMMON_FEATURES \
223    (VIRTIO_BLK_F_SIZE_MAX		| \
224     VIRTIO_BLK_F_SEG_MAX		| \
225     VIRTIO_BLK_F_GEOMETRY		| \
226     VIRTIO_BLK_F_RO			| \
227     VIRTIO_BLK_F_BLK_SIZE		| \
228     VIRTIO_BLK_F_FLUSH			| \
229     VIRTIO_BLK_F_TOPOLOGY		| \
230     VIRTIO_BLK_F_CONFIG_WCE		| \
231     VIRTIO_BLK_F_DISCARD		| \
232     VIRTIO_RING_F_INDIRECT_DESC)
233
234#define VTBLK_MODERN_FEATURES	(VTBLK_COMMON_FEATURES)
235#define VTBLK_LEGACY_FEATURES	(VIRTIO_BLK_F_BARRIER | VTBLK_COMMON_FEATURES)
236
237#define VTBLK_MTX(_sc)		&(_sc)->vtblk_mtx
238#define VTBLK_LOCK_INIT(_sc, _name) \
239				mtx_init(VTBLK_MTX((_sc)), (_name), \
240				    "VirtIO Block Lock", MTX_DEF)
241#define VTBLK_LOCK(_sc)		mtx_lock(VTBLK_MTX((_sc)))
242#define VTBLK_UNLOCK(_sc)	mtx_unlock(VTBLK_MTX((_sc)))
243#define VTBLK_LOCK_DESTROY(_sc)	mtx_destroy(VTBLK_MTX((_sc)))
244#define VTBLK_LOCK_ASSERT(_sc)	mtx_assert(VTBLK_MTX((_sc)), MA_OWNED)
245#define VTBLK_LOCK_ASSERT_NOTOWNED(_sc) \
246				mtx_assert(VTBLK_MTX((_sc)), MA_NOTOWNED)
247
248#define VTBLK_DISK_NAME		"vtbd"
249#define VTBLK_QUIESCE_TIMEOUT	(30 * hz)
250#define VTBLK_BSIZE		512
251
252/*
253 * Each block request uses at least two segments - one for the header
254 * and one for the status.
255 */
256#define VTBLK_MIN_SEGMENTS	2
257
258static device_method_t vtblk_methods[] = {
259	/* Device methods. */
260	DEVMETHOD(device_probe,		vtblk_probe),
261	DEVMETHOD(device_attach,	vtblk_attach),
262	DEVMETHOD(device_detach,	vtblk_detach),
263	DEVMETHOD(device_suspend,	vtblk_suspend),
264	DEVMETHOD(device_resume,	vtblk_resume),
265	DEVMETHOD(device_shutdown,	vtblk_shutdown),
266
267	/* VirtIO methods. */
268	DEVMETHOD(virtio_attach_completed, vtblk_attach_completed),
269	DEVMETHOD(virtio_config_change,	vtblk_config_change),
270
271	DEVMETHOD_END
272};
273
274static driver_t vtblk_driver = {
275	"vtblk",
276	vtblk_methods,
277	sizeof(struct vtblk_softc)
278};
279
280VIRTIO_DRIVER_MODULE(virtio_blk, vtblk_driver, vtblk_modevent, NULL);
281MODULE_VERSION(virtio_blk, 1);
282MODULE_DEPEND(virtio_blk, virtio, 1, 1, 1);
283
284VIRTIO_SIMPLE_PNPINFO(virtio_blk, VIRTIO_ID_BLOCK, "VirtIO Block Adapter");
285
286static int
287vtblk_modevent(module_t mod, int type, void *unused)
288{
289	int error;
290
291	error = 0;
292
293	switch (type) {
294	case MOD_LOAD:
295	case MOD_QUIESCE:
296	case MOD_UNLOAD:
297	case MOD_SHUTDOWN:
298		break;
299	default:
300		error = EOPNOTSUPP;
301		break;
302	}
303
304	return (error);
305}
306
307static int
308vtblk_probe(device_t dev)
309{
310	return (VIRTIO_SIMPLE_PROBE(dev, virtio_blk));
311}
312
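/*
 * Negotiate features, read the device configuration, and allocate the
 * sglist, busdma tag, virtqueue, request pool, and disk.  On failure,
 * the partially initialized state is torn down by vtblk_detach().
 */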
313static int
314vtblk_attach(device_t dev)
315{
316	struct vtblk_softc *sc;
317	struct virtio_blk_config blkcfg;
318	int error;
319
320	sc = device_get_softc(dev);
321	sc->vtblk_dev = dev;
322	virtio_set_feature_desc(dev, vtblk_feature_desc);
323
324	VTBLK_LOCK_INIT(sc, device_get_nameunit(dev));
325	bioq_init(&sc->vtblk_bioq);
326	TAILQ_INIT(&sc->vtblk_dump_queue);
327	TAILQ_INIT(&sc->vtblk_req_free);
328	TAILQ_INIT(&sc->vtblk_req_ready);
329
330	vtblk_setup_sysctl(sc);
331
332	error = vtblk_setup_features(sc);
333	if (error) {
334		device_printf(dev, "cannot setup features\n");
335		goto fail;
336	}
337
338	vtblk_read_config(sc, &blkcfg);
339
340	/*
341	 * With the current sglist(9) implementation, it is not easy
342	 * for us to support a maximum segment size as adjacent
	 * segments are coalesced. For now, just make sure it's at
	 * least as large as the maximum supported transfer size.
345	 */
346	if (virtio_with_feature(dev, VIRTIO_BLK_F_SIZE_MAX)) {
347		if (blkcfg.size_max < maxphys) {
348			error = ENOTSUP;
349			device_printf(dev, "host requires unsupported "
350			    "maximum segment size feature\n");
351			goto fail;
352		}
353	}
354
355	sc->vtblk_max_nsegs = vtblk_maximum_segments(sc, &blkcfg);
356	if (sc->vtblk_max_nsegs <= VTBLK_MIN_SEGMENTS) {
357		error = EINVAL;
358		device_printf(dev, "fewer than minimum number of segments "
359		    "allowed: %d\n", sc->vtblk_max_nsegs);
360		goto fail;
361	}
362
363	sc->vtblk_sglist = sglist_alloc(sc->vtblk_max_nsegs, M_NOWAIT);
364	if (sc->vtblk_sglist == NULL) {
365		error = ENOMEM;
366		device_printf(dev, "cannot allocate sglist\n");
367		goto fail;
368	}
369
370	/*
371	 * If vtblk_max_nsegs == VTBLK_MIN_SEGMENTS + 1, the device only
372	 * supports a single data segment; in that case we need busdma to
373	 * align to a page boundary so we can send a *contiguous* page size
374	 * request to the host.
375	 */
376	if (sc->vtblk_max_nsegs == VTBLK_MIN_SEGMENTS + 1)
377		sc->vtblk_flags |= VTBLK_FLAG_BUSDMA_ALIGN;
378	error = bus_dma_tag_create(
379	    bus_get_dma_tag(dev),			/* parent */
380	    (sc->vtblk_flags & VTBLK_FLAG_BUSDMA_ALIGN) ? PAGE_SIZE : 1,
381	    0,						/* boundary */
382	    BUS_SPACE_MAXADDR,				/* lowaddr */
383	    BUS_SPACE_MAXADDR,				/* highaddr */
384	    NULL, NULL,					/* filter, filterarg */
385	    maxphys,					/* max request size */
386	    sc->vtblk_max_nsegs - VTBLK_MIN_SEGMENTS,	/* max # segments */
387	    maxphys,					/* maxsegsize */
388	    0,						/* flags */
389	    busdma_lock_mutex,				/* lockfunc */
390	    &sc->vtblk_mtx,				/* lockarg */
391	    &sc->vtblk_dmat);
392	if (error) {
393		device_printf(dev, "cannot create bus dma tag\n");
394		goto fail;
395	}
396
397#ifdef __powerpc__
398	/*
399	 * Virtio uses physical addresses rather than bus addresses, so we
400	 * need to ask busdma to skip the iommu physical->bus mapping.  At
	 * present, this is only necessary on the powerpc architectures.
402	 */
403	bus_dma_tag_set_iommu(sc->vtblk_dmat, NULL, NULL);
404#endif
405
406	error = vtblk_alloc_virtqueue(sc);
407	if (error) {
408		device_printf(dev, "cannot allocate virtqueue\n");
409		goto fail;
410	}
411
412	error = vtblk_request_prealloc(sc);
413	if (error) {
414		device_printf(dev, "cannot preallocate requests\n");
415		goto fail;
416	}
417
418	vtblk_alloc_disk(sc, &blkcfg);
419
420	error = virtio_setup_intr(dev, INTR_TYPE_BIO | INTR_ENTROPY);
421	if (error) {
422		device_printf(dev, "cannot setup virtqueue interrupt\n");
423		goto fail;
424	}
425
426	virtqueue_enable_intr(sc->vtblk_vq);
427
428fail:
429	if (error)
430		vtblk_detach(dev);
431
432	return (error);
433}
434
435static int
436vtblk_detach(device_t dev)
437{
438	struct vtblk_softc *sc;
439
440	sc = device_get_softc(dev);
441
442	VTBLK_LOCK(sc);
443	sc->vtblk_flags |= VTBLK_FLAG_DETACH;
444	if (device_is_attached(dev))
445		vtblk_stop(sc);
446	VTBLK_UNLOCK(sc);
447
448	vtblk_drain(sc);
449
450	if (sc->vtblk_disk != NULL) {
451		disk_destroy(sc->vtblk_disk);
452		sc->vtblk_disk = NULL;
453	}
454
455	if (sc->vtblk_dmat != NULL) {
456		bus_dma_tag_destroy(sc->vtblk_dmat);
457		sc->vtblk_dmat = NULL;
458	}
459
460	if (sc->vtblk_sglist != NULL) {
461		sglist_free(sc->vtblk_sglist);
462		sc->vtblk_sglist = NULL;
463	}
464
465	VTBLK_LOCK_DESTROY(sc);
466
467	return (0);
468}
469
470static int
471vtblk_suspend(device_t dev)
472{
473	struct vtblk_softc *sc;
474	int error;
475
476	sc = device_get_softc(dev);
477
478	VTBLK_LOCK(sc);
479	sc->vtblk_flags |= VTBLK_FLAG_SUSPEND;
480	/* XXX BMV: virtio_stop(), etc needed here? */
481	error = vtblk_quiesce(sc);
482	if (error)
483		sc->vtblk_flags &= ~VTBLK_FLAG_SUSPEND;
484	VTBLK_UNLOCK(sc);
485
486	return (error);
487}
488
489static int
490vtblk_resume(device_t dev)
491{
492	struct vtblk_softc *sc;
493
494	sc = device_get_softc(dev);
495
496	VTBLK_LOCK(sc);
497	/* XXX BMV: virtio_reinit(), etc needed here? */
498	sc->vtblk_flags &= ~VTBLK_FLAG_SUSPEND;
499	vtblk_startio(sc);
500	VTBLK_UNLOCK(sc);
501
502	return (0);
503}
504
505static int
506vtblk_shutdown(device_t dev)
507{
508
509	return (0);
510}
511
512static int
513vtblk_attach_completed(device_t dev)
514{
515	struct vtblk_softc *sc;
516
517	sc = device_get_softc(dev);
518
519	/*
	 * Create the disk only now because VIRTIO_BLK_T_GET_ID can only
	 * be processed after the device has acknowledged
	 * VIRTIO_CONFIG_STATUS_DRIVER_OK.
523	 */
524	vtblk_create_disk(sc);
525	return (0);
526}
527
528static int
529vtblk_config_change(device_t dev)
530{
531	struct vtblk_softc *sc;
532	struct virtio_blk_config blkcfg;
533	uint64_t capacity;
534
535	sc = device_get_softc(dev);
536
537	vtblk_read_config(sc, &blkcfg);
538
539	/* Capacity is always in 512-byte units. */
540	capacity = blkcfg.capacity * VTBLK_BSIZE;
541
542	if (sc->vtblk_disk->d_mediasize != capacity)
543		vtblk_resize_disk(sc, capacity);
544
545	return (0);
546}
547
548static int
549vtblk_open(struct disk *dp)
550{
551	struct vtblk_softc *sc;
552
553	if ((sc = dp->d_drv1) == NULL)
554		return (ENXIO);
555
556	return (sc->vtblk_flags & VTBLK_FLAG_DETACH ? ENXIO : 0);
557}
558
559static int
560vtblk_close(struct disk *dp)
561{
562	struct vtblk_softc *sc;
563
564	if ((sc = dp->d_drv1) == NULL)
565		return (ENXIO);
566
567	return (0);
568}
569
570static int
571vtblk_ioctl(struct disk *dp, u_long cmd, void *addr, int flag,
572    struct thread *td)
573{
574	struct vtblk_softc *sc;
575
576	if ((sc = dp->d_drv1) == NULL)
577		return (ENXIO);
578
579	return (ENOTTY);
580}
581
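/*
 * Kernel crash dump entry point.  Runs polled under the driver lock:
 * in-flight requests are drained first and their bios are completed
 * once the dump has finished.
 */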
582static int
583vtblk_dump(void *arg, void *virtual, off_t offset, size_t length)
584{
585	struct disk *dp;
586	struct vtblk_softc *sc;
587	int error;
588
589	dp = arg;
590	error = 0;
591
592	if ((sc = dp->d_drv1) == NULL)
593		return (ENXIO);
594
595	VTBLK_LOCK(sc);
596
597	vtblk_dump_quiesce(sc);
598
599	if (length > 0)
600		error = vtblk_dump_write(sc, virtual, offset, length);
601	if (error || (virtual == NULL && offset == 0))
602		vtblk_dump_complete(sc);
603
604	VTBLK_UNLOCK(sc);
605
606	return (error);
607}
608
609static void
610vtblk_strategy(struct bio *bp)
611{
612	struct vtblk_softc *sc;
613
614	if ((sc = bp->bio_disk->d_drv1) == NULL) {
615		vtblk_bio_done(NULL, bp, EINVAL);
616		return;
617	}
618
619	if ((bp->bio_cmd != BIO_READ) && (bp->bio_cmd != BIO_WRITE) &&
620	    (bp->bio_cmd != BIO_FLUSH) && (bp->bio_cmd != BIO_DELETE)) {
621		vtblk_bio_done(sc, bp, EOPNOTSUPP);
622		return;
623	}
624
625	VTBLK_LOCK(sc);
626
627	if (sc->vtblk_flags & VTBLK_FLAG_DETACH) {
628		VTBLK_UNLOCK(sc);
629		vtblk_bio_done(sc, bp, ENXIO);
630		return;
631	}
632
633	bioq_insert_tail(&sc->vtblk_bioq, bp);
634	vtblk_startio(sc);
635
636	VTBLK_UNLOCK(sc);
637}
638
639static int
640vtblk_negotiate_features(struct vtblk_softc *sc)
641{
642	device_t dev;
643	uint64_t features;
644
645	dev = sc->vtblk_dev;
646	features = virtio_bus_is_modern(dev) ? VTBLK_MODERN_FEATURES :
647	    VTBLK_LEGACY_FEATURES;
648
649	sc->vtblk_features = virtio_negotiate_features(dev, features);
650	return (virtio_finalize_features(dev));
651}
652
653static int
654vtblk_setup_features(struct vtblk_softc *sc)
655{
656	device_t dev;
657	int error;
658
659	dev = sc->vtblk_dev;
660
661	error = vtblk_negotiate_features(sc);
662	if (error)
663		return (error);
664
665	if (virtio_with_feature(dev, VIRTIO_RING_F_INDIRECT_DESC))
666		sc->vtblk_flags |= VTBLK_FLAG_INDIRECT;
667	if (virtio_with_feature(dev, VIRTIO_BLK_F_CONFIG_WCE))
668		sc->vtblk_flags |= VTBLK_FLAG_WCE_CONFIG;
669
670	/* Legacy. */
671	if (virtio_with_feature(dev, VIRTIO_BLK_F_BARRIER))
672		sc->vtblk_flags |= VTBLK_FLAG_BARRIER;
673
674	return (0);
675}
676
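/*
 * Compute how many scatter/gather segments a request may use: the header
 * and status segments plus however many data segments the host allows,
 * bounded by maxphys and the indirect descriptor limit.
 */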
677static int
678vtblk_maximum_segments(struct vtblk_softc *sc,
679    struct virtio_blk_config *blkcfg)
680{
681	device_t dev;
682	int nsegs;
683
684	dev = sc->vtblk_dev;
685	nsegs = VTBLK_MIN_SEGMENTS;
686
687	if (virtio_with_feature(dev, VIRTIO_BLK_F_SEG_MAX)) {
688		nsegs += MIN(blkcfg->seg_max, maxphys / PAGE_SIZE + 1);
689		if (sc->vtblk_flags & VTBLK_FLAG_INDIRECT)
690			nsegs = MIN(nsegs, VIRTIO_MAX_INDIRECT);
691	} else
692		nsegs += 1;
693
694	return (nsegs);
695}
696
697static int
698vtblk_alloc_virtqueue(struct vtblk_softc *sc)
699{
700	device_t dev;
701	struct vq_alloc_info vq_info;
702
703	dev = sc->vtblk_dev;
704
705	VQ_ALLOC_INFO_INIT(&vq_info, sc->vtblk_max_nsegs,
706	    vtblk_vq_intr, sc, &sc->vtblk_vq,
707	    "%s request", device_get_nameunit(dev));
708
709	return (virtio_alloc_virtqueues(dev, 1, &vq_info));
710}
711
712static void
713vtblk_resize_disk(struct vtblk_softc *sc, uint64_t new_capacity)
714{
715	device_t dev;
716	struct disk *dp;
717	int error;
718
719	dev = sc->vtblk_dev;
720	dp = sc->vtblk_disk;
721
722	dp->d_mediasize = new_capacity;
723	if (bootverbose) {
724		device_printf(dev, "resized to %juMB (%ju %u byte sectors)\n",
725		    (uintmax_t) dp->d_mediasize >> 20,
726		    (uintmax_t) dp->d_mediasize / dp->d_sectorsize,
727		    dp->d_sectorsize);
728	}
729
730	error = disk_resize(dp, M_NOWAIT);
731	if (error) {
732		device_printf(dev,
733		    "disk_resize(9) failed, error: %d\n", error);
734	}
735}
736
737static void
738vtblk_alloc_disk(struct vtblk_softc *sc, struct virtio_blk_config *blkcfg)
739{
740	device_t dev;
741	struct disk *dp;
742
743	dev = sc->vtblk_dev;
744
745	sc->vtblk_disk = dp = disk_alloc();
746	dp->d_open = vtblk_open;
747	dp->d_close = vtblk_close;
748	dp->d_ioctl = vtblk_ioctl;
749	dp->d_strategy = vtblk_strategy;
750	dp->d_name = VTBLK_DISK_NAME;
751	dp->d_unit = device_get_unit(dev);
752	dp->d_drv1 = sc;
753	dp->d_flags = DISKFLAG_UNMAPPED_BIO | DISKFLAG_DIRECT_COMPLETION;
754	dp->d_hba_vendor = virtio_get_vendor(dev);
755	dp->d_hba_device = virtio_get_device(dev);
756	dp->d_hba_subvendor = virtio_get_subvendor(dev);
757	dp->d_hba_subdevice = virtio_get_subdevice(dev);
758
759	if (virtio_with_feature(dev, VIRTIO_BLK_F_RO))
760		dp->d_flags |= DISKFLAG_WRITE_PROTECT;
761	else {
762		if (virtio_with_feature(dev, VIRTIO_BLK_F_FLUSH))
763			dp->d_flags |= DISKFLAG_CANFLUSHCACHE;
764		dp->d_dump = vtblk_dump;
765	}
766
767	/* Capacity is always in 512-byte units. */
768	dp->d_mediasize = blkcfg->capacity * VTBLK_BSIZE;
769
770	if (virtio_with_feature(dev, VIRTIO_BLK_F_BLK_SIZE))
771		dp->d_sectorsize = blkcfg->blk_size;
772	else
773		dp->d_sectorsize = VTBLK_BSIZE;
774
775	/*
776	 * The VirtIO maximum I/O size is given in terms of segments.
777	 * However, FreeBSD limits I/O size by logical buffer size, not
778	 * by physically contiguous pages. Therefore, we have to assume
779	 * no pages are contiguous. This may impose an artificially low
780	 * maximum I/O size. But in practice, since QEMU advertises 128
781	 * segments, this gives us a maximum IO size of 125 * PAGE_SIZE,
782	 * which is typically greater than maxphys. Eventually we should
783	 * just advertise maxphys and split buffers that are too big.
784	 *
785	 * If we're not asking busdma to align data to page boundaries, the
786	 * maximum I/O size is reduced by PAGE_SIZE in order to accommodate
787	 * unaligned I/Os.
788	 */
789	dp->d_maxsize = (sc->vtblk_max_nsegs - VTBLK_MIN_SEGMENTS) *
790	    PAGE_SIZE;
791	if ((sc->vtblk_flags & VTBLK_FLAG_BUSDMA_ALIGN) == 0)
792		dp->d_maxsize -= PAGE_SIZE;
793
794	if (virtio_with_feature(dev, VIRTIO_BLK_F_GEOMETRY)) {
795		dp->d_fwsectors = blkcfg->geometry.sectors;
796		dp->d_fwheads = blkcfg->geometry.heads;
797	}
798
799	if (virtio_with_feature(dev, VIRTIO_BLK_F_TOPOLOGY) &&
800	    blkcfg->topology.physical_block_exp > 0) {
801		dp->d_stripesize = dp->d_sectorsize *
802		    (1 << blkcfg->topology.physical_block_exp);
803		dp->d_stripeoffset = (dp->d_stripesize -
804		    blkcfg->topology.alignment_offset * dp->d_sectorsize) %
805		    dp->d_stripesize;
806	}
807
808	if (virtio_with_feature(dev, VIRTIO_BLK_F_DISCARD)) {
809		dp->d_flags |= DISKFLAG_CANDELETE;
810		dp->d_delmaxsize = blkcfg->max_discard_sectors * VTBLK_BSIZE;
811	}
812
813	if (vtblk_write_cache_enabled(sc, blkcfg) != 0)
814		sc->vtblk_write_cache = VTBLK_CACHE_WRITEBACK;
815	else
816		sc->vtblk_write_cache = VTBLK_CACHE_WRITETHROUGH;
817}
818
819static void
820vtblk_create_disk(struct vtblk_softc *sc)
821{
822	struct disk *dp;
823
824	dp = sc->vtblk_disk;
825
826	vtblk_ident(sc);
827
828	device_printf(sc->vtblk_dev, "%juMB (%ju %u byte sectors)\n",
829	    (uintmax_t) dp->d_mediasize >> 20,
830	    (uintmax_t) dp->d_mediasize / dp->d_sectorsize,
831	    dp->d_sectorsize);
832
833	disk_create(dp, DISK_VERSION);
834}
835
836static int
837vtblk_request_prealloc(struct vtblk_softc *sc)
838{
839	struct vtblk_request *req;
840	int i, nreqs;
841
842	nreqs = virtqueue_size(sc->vtblk_vq);
843
844	/*
845	 * Preallocate sufficient requests to keep the virtqueue full. Each
846	 * request consumes VTBLK_MIN_SEGMENTS or more descriptors so reduce
847	 * the number allocated when indirect descriptors are not available.
848	 */
849	if ((sc->vtblk_flags & VTBLK_FLAG_INDIRECT) == 0)
850		nreqs /= VTBLK_MIN_SEGMENTS;
851
852	for (i = 0; i < nreqs; i++) {
853		req = malloc(sizeof(struct vtblk_request), M_DEVBUF, M_NOWAIT);
854		if (req == NULL)
855			return (ENOMEM);
856
857		req->vbr_sc = sc;
858		if (bus_dmamap_create(sc->vtblk_dmat, 0, &req->vbr_mapp)) {
859			free(req, M_DEVBUF);
860			return (ENOMEM);
861		}
862
863		MPASS(sglist_count(&req->vbr_hdr, sizeof(req->vbr_hdr)) == 1);
864		MPASS(sglist_count(&req->vbr_ack, sizeof(req->vbr_ack)) == 1);
865
866		sc->vtblk_request_count++;
867		vtblk_request_enqueue(sc, req);
868	}
869
870	return (0);
871}
872
873static void
874vtblk_request_free(struct vtblk_softc *sc)
875{
876	struct vtblk_request *req;
877
878	MPASS(TAILQ_EMPTY(&sc->vtblk_req_ready));
879
880	while ((req = vtblk_request_dequeue(sc)) != NULL) {
881		sc->vtblk_request_count--;
882		bus_dmamap_destroy(sc->vtblk_dmat, req->vbr_mapp);
883		free(req, M_DEVBUF);
884	}
885
886	KASSERT(sc->vtblk_request_count == 0,
887	    ("%s: leaked %d requests", __func__, sc->vtblk_request_count));
888}
889
890static struct vtblk_request *
891vtblk_request_dequeue(struct vtblk_softc *sc)
892{
893	struct vtblk_request *req;
894
895	req = TAILQ_FIRST(&sc->vtblk_req_free);
896	if (req != NULL) {
897		TAILQ_REMOVE(&sc->vtblk_req_free, req, vbr_link);
898		bzero(&req->vbr_hdr, sizeof(struct vtblk_request) -
899		    offsetof(struct vtblk_request, vbr_hdr));
900	}
901
902	return (req);
903}
904
905static void
906vtblk_request_enqueue(struct vtblk_softc *sc, struct vtblk_request *req)
907{
908
909	TAILQ_INSERT_HEAD(&sc->vtblk_req_free, req, vbr_link);
910}
911
912static struct vtblk_request *
913vtblk_request_next_ready(struct vtblk_softc *sc)
914{
915	struct vtblk_request *req;
916
917	req = TAILQ_FIRST(&sc->vtblk_req_ready);
918	if (req != NULL)
919		TAILQ_REMOVE(&sc->vtblk_req_ready, req, vbr_link);
920
921	return (req);
922}
923
924static void
925vtblk_request_requeue_ready(struct vtblk_softc *sc, struct vtblk_request *req)
926{
927
928	/* NOTE: Currently, there will be at most one request in the queue. */
929	TAILQ_INSERT_HEAD(&sc->vtblk_req_ready, req, vbr_link);
930}
931
932static struct vtblk_request *
933vtblk_request_next(struct vtblk_softc *sc)
934{
935	struct vtblk_request *req;
936
937	req = vtblk_request_next_ready(sc);
938	if (req != NULL)
939		return (req);
940
941	return (vtblk_request_bio(sc));
942}
943
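/*
 * Convert the next queued bio, if any, into a request, translating the
 * bio command and offset into the VirtIO request header.
 */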
944static struct vtblk_request *
945vtblk_request_bio(struct vtblk_softc *sc)
946{
947	struct bio_queue_head *bioq;
948	struct vtblk_request *req;
949	struct bio *bp;
950
951	bioq = &sc->vtblk_bioq;
952
953	if (bioq_first(bioq) == NULL)
954		return (NULL);
955
956	req = vtblk_request_dequeue(sc);
957	if (req == NULL)
958		return (NULL);
959
960	bp = bioq_takefirst(bioq);
961	req->vbr_bp = bp;
962	req->vbr_ack = -1;
963	req->vbr_hdr.ioprio = vtblk_gtoh32(sc, 1);
964
965	switch (bp->bio_cmd) {
966	case BIO_FLUSH:
967		req->vbr_hdr.type = vtblk_gtoh32(sc, VIRTIO_BLK_T_FLUSH);
968		req->vbr_hdr.sector = 0;
969		break;
970	case BIO_READ:
971		req->vbr_hdr.type = vtblk_gtoh32(sc, VIRTIO_BLK_T_IN);
972		req->vbr_hdr.sector = vtblk_gtoh64(sc, bp->bio_offset / VTBLK_BSIZE);
973		break;
974	case BIO_WRITE:
975		req->vbr_hdr.type = vtblk_gtoh32(sc, VIRTIO_BLK_T_OUT);
976		req->vbr_hdr.sector = vtblk_gtoh64(sc, bp->bio_offset / VTBLK_BSIZE);
977		break;
978	case BIO_DELETE:
979		req->vbr_hdr.type = vtblk_gtoh32(sc, VIRTIO_BLK_T_DISCARD);
980		req->vbr_hdr.sector = vtblk_gtoh64(sc, bp->bio_offset / VTBLK_BSIZE);
981		break;
982	default:
983		panic("%s: bio with unhandled cmd: %d", __func__, bp->bio_cmd);
984	}
985
986	if (bp->bio_flags & BIO_ORDERED)
987		req->vbr_hdr.type |= vtblk_gtoh32(sc, VIRTIO_BLK_T_BARRIER);
988
989	return (req);
990}
991
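/*
 * Submit a request to the virtqueue.  Data buffers are mapped through
 * busdma, which may defer the callback; the actual enqueue is performed
 * in vtblk_request_execute_cb().
 */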
992static int
993vtblk_request_execute(struct vtblk_request *req, int flags)
994{
995	struct vtblk_softc *sc = req->vbr_sc;
996	struct bio *bp = req->vbr_bp;
997	int error = 0;
998
999	/*
1000	 * Call via bus_dmamap_load_bio or directly depending on whether we
1001	 * have a buffer we need to map.  If we don't have a busdma map,
1002	 * try to perform the I/O directly and hope that it works (this will
1003	 * happen when dumping).
1004	 */
1005	if ((req->vbr_mapp != NULL) &&
1006	    (bp->bio_cmd == BIO_READ || bp->bio_cmd == BIO_WRITE)) {
1007		error = bus_dmamap_load_bio(sc->vtblk_dmat, req->vbr_mapp,
1008		    req->vbr_bp, vtblk_request_execute_cb, req, flags);
1009		if (error == EINPROGRESS) {
1010			req->vbr_busdma_wait = 1;
1011			sc->vtblk_flags |= VTBLK_FLAG_BUSDMA_WAIT;
1012		}
1013	} else {
1014		vtblk_request_execute_cb(req, NULL, 0, 0);
1015	}
1016
1017	return (error ? error : req->vbr_error);
1018}
1019
1020static void
vtblk_request_execute_cb(void *callback_arg, bus_dma_segment_t *segs,
1022    int nseg, int error)
1023{
1024	struct vtblk_request *req;
1025	struct vtblk_softc *sc;
1026	struct virtqueue *vq;
1027	struct sglist *sg;
1028	struct bio *bp;
1029	int ordered, readable, writable, i;
1030
1031	req = (struct vtblk_request *)callback_arg;
1032	sc = req->vbr_sc;
1033	vq = sc->vtblk_vq;
1034	sg = sc->vtblk_sglist;
1035	bp = req->vbr_bp;
1036	ordered = 0;
1037	writable = 0;
1038
1039	/*
1040	 * If we paused request queueing while we waited for busdma to call us
1041	 * asynchronously, unpause it now; this request made it through so we
1042	 * don't need to worry about others getting ahead of us.  (Note that we
1043	 * hold the device mutex so nothing will happen until after we return
1044	 * anyway.)
1045	 */
1046	if (req->vbr_busdma_wait)
1047		sc->vtblk_flags &= ~VTBLK_FLAG_BUSDMA_WAIT;
1048
1049	/* Fail on errors from busdma. */
1050	if (error)
1051		goto out1;
1052
1053	/*
1054	 * Some hosts (such as bhyve) do not implement the barrier feature,
1055	 * so we emulate it in the driver by allowing the barrier request
1056	 * to be the only one in flight.
1057	 */
1058	if ((sc->vtblk_flags & VTBLK_FLAG_BARRIER) == 0) {
1059		if (sc->vtblk_req_ordered != NULL) {
1060			error = EBUSY;
1061			goto out;
1062		}
1063		if (bp->bio_flags & BIO_ORDERED) {
1064			if (!virtqueue_empty(vq)) {
1065				error = EBUSY;
1066				goto out;
1067			}
1068			ordered = 1;
1069			req->vbr_hdr.type &= vtblk_gtoh32(sc,
1070				~VIRTIO_BLK_T_BARRIER);
1071		}
1072	}
1073
1074	sglist_reset(sg);
1075	sglist_append(sg, &req->vbr_hdr, sizeof(struct virtio_blk_outhdr));
1076
1077	if (bp->bio_cmd == BIO_READ || bp->bio_cmd == BIO_WRITE) {
1078		/*
1079		 * We cast bus_addr_t to vm_paddr_t here; since we skip the
1080		 * iommu mapping (see vtblk_attach) this should be safe.
1081		 */
1082		for (i = 0; i < nseg; i++) {
1083			error = sglist_append_phys(sg,
1084			    (vm_paddr_t)segs[i].ds_addr, segs[i].ds_len);
1085			if (error || sg->sg_nseg == sg->sg_maxseg) {
1086				panic("%s: bio %p data buffer too big %d",
1087				    __func__, bp, error);
1088			}
1089		}
1090
1091		/* Special handling for dump, which bypasses busdma. */
1092		if (req->vbr_mapp == NULL) {
1093			error = sglist_append_bio(sg, bp);
1094			if (error || sg->sg_nseg == sg->sg_maxseg) {
1095				panic("%s: bio %p data buffer too big %d",
1096				    __func__, bp, error);
1097			}
1098		}
1099
1100		/* BIO_READ means the host writes into our buffer. */
1101		if (bp->bio_cmd == BIO_READ)
1102			writable = sg->sg_nseg - 1;
1103	} else if (bp->bio_cmd == BIO_DELETE) {
1104		struct virtio_blk_discard_write_zeroes *discard;
1105
1106		discard = malloc(sizeof(*discard), M_DEVBUF, M_NOWAIT | M_ZERO);
1107		if (discard == NULL) {
1108			error = ENOMEM;
1109			goto out;
1110		}
1111
1112		bp->bio_driver1 = discard;
1113		discard->sector = vtblk_gtoh64(sc, bp->bio_offset / VTBLK_BSIZE);
1114		discard->num_sectors = vtblk_gtoh32(sc, bp->bio_bcount / VTBLK_BSIZE);
1115		error = sglist_append(sg, discard, sizeof(*discard));
1116		if (error || sg->sg_nseg == sg->sg_maxseg) {
1117			panic("%s: bio %p data buffer too big %d",
1118			    __func__, bp, error);
1119		}
1120	}
1121
1122	writable++;
1123	sglist_append(sg, &req->vbr_ack, sizeof(uint8_t));
1124	readable = sg->sg_nseg - writable;
1125
1126	if (req->vbr_mapp != NULL) {
1127		switch (bp->bio_cmd) {
1128		case BIO_READ:
1129			bus_dmamap_sync(sc->vtblk_dmat, req->vbr_mapp,
1130			    BUS_DMASYNC_PREREAD);
1131			break;
1132		case BIO_WRITE:
1133			bus_dmamap_sync(sc->vtblk_dmat, req->vbr_mapp,
1134			    BUS_DMASYNC_PREWRITE);
1135			break;
1136		}
1137	}
1138
1139	error = virtqueue_enqueue(vq, req, sg, readable, writable);
1140	if (error == 0 && ordered)
1141		sc->vtblk_req_ordered = req;
1142
1143	/*
1144	 * If we were called asynchronously, we need to notify the queue that
1145	 * we've added a new request, since the notification from startio was
1146	 * performed already.
1147	 */
1148	if (error == 0 && req->vbr_busdma_wait)
1149		virtqueue_notify(vq);
1150
1151out:
1152	if (error && (req->vbr_mapp != NULL))
1153		bus_dmamap_unload(sc->vtblk_dmat, req->vbr_mapp);
1154out1:
1155	if (error && req->vbr_requeue_on_error)
1156		vtblk_request_requeue_ready(sc, req);
1157	req->vbr_error = error;
1158}
1159
1160static int
1161vtblk_request_error(struct vtblk_request *req)
1162{
1163	int error;
1164
1165	switch (req->vbr_ack) {
1166	case VIRTIO_BLK_S_OK:
1167		error = 0;
1168		break;
1169	case VIRTIO_BLK_S_UNSUPP:
1170		error = ENOTSUP;
1171		break;
1172	default:
1173		error = EIO;
1174		break;
1175	}
1176
1177	return (error);
1178}
1179
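/*
 * Dequeue completed requests from the virtqueue, sync and unload their
 * DMA maps, record each completion status in its bio, and collect the
 * bios on the caller's queue.  The requests return to the free list.
 */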
1180static void
1181vtblk_queue_completed(struct vtblk_softc *sc, struct bio_queue *queue)
1182{
1183	struct vtblk_request *req;
1184	struct bio *bp;
1185
1186	while ((req = virtqueue_dequeue(sc->vtblk_vq, NULL)) != NULL) {
1187		if (sc->vtblk_req_ordered != NULL) {
1188			MPASS(sc->vtblk_req_ordered == req);
1189			sc->vtblk_req_ordered = NULL;
1190		}
1191
1192		bp = req->vbr_bp;
1193		if (req->vbr_mapp != NULL) {
1194			switch (bp->bio_cmd) {
1195			case BIO_READ:
1196				bus_dmamap_sync(sc->vtblk_dmat, req->vbr_mapp,
1197				    BUS_DMASYNC_POSTREAD);
1198				bus_dmamap_unload(sc->vtblk_dmat,
1199				    req->vbr_mapp);
1200				break;
1201			case BIO_WRITE:
1202				bus_dmamap_sync(sc->vtblk_dmat, req->vbr_mapp,
1203				    BUS_DMASYNC_POSTWRITE);
1204				bus_dmamap_unload(sc->vtblk_dmat,
1205				    req->vbr_mapp);
1206				break;
1207			}
1208		}
1209		bp->bio_error = vtblk_request_error(req);
1210		TAILQ_INSERT_TAIL(queue, bp, bio_queue);
1211
1212		vtblk_request_enqueue(sc, req);
1213	}
1214}
1215
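/*
 * Finish the bios collected by vtblk_queue_completed().  Called without
 * the driver lock held since biodone() may direct dispatch into GEOM.
 */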
1216static void
1217vtblk_done_completed(struct vtblk_softc *sc, struct bio_queue *queue)
1218{
1219	struct bio *bp, *tmp;
1220
1221	TAILQ_FOREACH_SAFE(bp, queue, bio_queue, tmp) {
1222		if (bp->bio_error != 0)
1223			disk_err(bp, "hard error", -1, 1);
1224		vtblk_bio_done(sc, bp, bp->bio_error);
1225	}
1226}
1227
1228static void
1229vtblk_drain_vq(struct vtblk_softc *sc)
1230{
1231	struct virtqueue *vq;
1232	struct vtblk_request *req;
1233	int last;
1234
1235	vq = sc->vtblk_vq;
1236	last = 0;
1237
1238	while ((req = virtqueue_drain(vq, &last)) != NULL) {
1239		vtblk_bio_done(sc, req->vbr_bp, ENXIO);
1240		vtblk_request_enqueue(sc, req);
1241	}
1242
1243	sc->vtblk_req_ordered = NULL;
1244	KASSERT(virtqueue_empty(vq), ("virtqueue not empty"));
1245}
1246
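/*
 * Fail all outstanding and queued bios with ENXIO and release the
 * preallocated requests; used when detaching the device.
 */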
1247static void
1248vtblk_drain(struct vtblk_softc *sc)
1249{
1250	struct bio_queue_head *bioq;
1251	struct vtblk_request *req;
1252	struct bio *bp;
1253
1254	bioq = &sc->vtblk_bioq;
1255
1256	if (sc->vtblk_vq != NULL) {
1257		struct bio_queue queue;
1258
1259		TAILQ_INIT(&queue);
1260		vtblk_queue_completed(sc, &queue);
1261		vtblk_done_completed(sc, &queue);
1262
1263		vtblk_drain_vq(sc);
1264	}
1265
1266	while ((req = vtblk_request_next_ready(sc)) != NULL) {
1267		vtblk_bio_done(sc, req->vbr_bp, ENXIO);
1268		vtblk_request_enqueue(sc, req);
1269	}
1270
1271	while (bioq_first(bioq) != NULL) {
1272		bp = bioq_takefirst(bioq);
1273		vtblk_bio_done(sc, bp, ENXIO);
1274	}
1275
1276	vtblk_request_free(sc);
1277}
1278
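/*
 * Fill the virtqueue with as many queued bios as it will hold and notify
 * the host once if anything was enqueued.  Skipped while suspended or
 * while waiting for a deferred busdma callback.
 */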
1279static void
1280vtblk_startio(struct vtblk_softc *sc)
1281{
1282	struct virtqueue *vq;
1283	struct vtblk_request *req;
1284	int enq;
1285
1286	VTBLK_LOCK_ASSERT(sc);
1287	vq = sc->vtblk_vq;
1288	enq = 0;
1289
1290	if (sc->vtblk_flags & (VTBLK_FLAG_SUSPEND | VTBLK_FLAG_BUSDMA_WAIT))
1291		return;
1292
1293	while (!virtqueue_full(vq)) {
1294		req = vtblk_request_next(sc);
1295		if (req == NULL)
1296			break;
1297
1298		req->vbr_requeue_on_error = 1;
1299		if (vtblk_request_execute(req, BUS_DMA_WAITOK))
1300			break;
1301
1302		enq++;
1303	}
1304
1305	if (enq > 0)
1306		virtqueue_notify(vq);
1307}
1308
1309static void
1310vtblk_bio_done(struct vtblk_softc *sc, struct bio *bp, int error)
1311{
1312
1313	/* Because of GEOM direct dispatch, we cannot hold any locks. */
1314	if (sc != NULL)
1315		VTBLK_LOCK_ASSERT_NOTOWNED(sc);
1316
1317	if (error) {
1318		bp->bio_resid = bp->bio_bcount;
1319		bp->bio_error = error;
1320		bp->bio_flags |= BIO_ERROR;
1321	} else {
1322		kmsan_mark_bio(bp, KMSAN_STATE_INITED);
1323	}
1324
1325	if (bp->bio_driver1 != NULL) {
1326		free(bp->bio_driver1, M_DEVBUF);
1327		bp->bio_driver1 = NULL;
1328	}
1329
1330	biodone(bp);
1331}
1332
1333#define VTBLK_GET_CONFIG(_dev, _feature, _field, _cfg)			\
1334	if (virtio_with_feature(_dev, _feature)) {			\
1335		virtio_read_device_config(_dev,				\
1336		    offsetof(struct virtio_blk_config, _field),		\
1337		    &(_cfg)->_field, sizeof((_cfg)->_field));		\
1338	}
1339
1340static void
1341vtblk_read_config(struct vtblk_softc *sc, struct virtio_blk_config *blkcfg)
1342{
1343	device_t dev;
1344
1345	dev = sc->vtblk_dev;
1346
1347	bzero(blkcfg, sizeof(struct virtio_blk_config));
1348
1349	/* The capacity is always available. */
1350	virtio_read_device_config(dev, offsetof(struct virtio_blk_config,
1351	    capacity), &blkcfg->capacity, sizeof(blkcfg->capacity));
1352
1353	/* Read the configuration if the feature was negotiated. */
1354	VTBLK_GET_CONFIG(dev, VIRTIO_BLK_F_SIZE_MAX, size_max, blkcfg);
1355	VTBLK_GET_CONFIG(dev, VIRTIO_BLK_F_SEG_MAX, seg_max, blkcfg);
1356	VTBLK_GET_CONFIG(dev, VIRTIO_BLK_F_GEOMETRY,
1357	    geometry.cylinders, blkcfg);
1358	VTBLK_GET_CONFIG(dev, VIRTIO_BLK_F_GEOMETRY,
1359	    geometry.heads, blkcfg);
1360	VTBLK_GET_CONFIG(dev, VIRTIO_BLK_F_GEOMETRY,
1361	    geometry.sectors, blkcfg);
1362	VTBLK_GET_CONFIG(dev, VIRTIO_BLK_F_BLK_SIZE, blk_size, blkcfg);
1363	VTBLK_GET_CONFIG(dev, VIRTIO_BLK_F_TOPOLOGY,
1364	    topology.physical_block_exp, blkcfg);
1365	VTBLK_GET_CONFIG(dev, VIRTIO_BLK_F_TOPOLOGY,
1366	    topology.alignment_offset, blkcfg);
1367	VTBLK_GET_CONFIG(dev, VIRTIO_BLK_F_TOPOLOGY,
1368	    topology.min_io_size, blkcfg);
1369	VTBLK_GET_CONFIG(dev, VIRTIO_BLK_F_TOPOLOGY,
1370	    topology.opt_io_size, blkcfg);
1371	VTBLK_GET_CONFIG(dev, VIRTIO_BLK_F_CONFIG_WCE, wce, blkcfg);
1372	VTBLK_GET_CONFIG(dev, VIRTIO_BLK_F_DISCARD, max_discard_sectors,
1373	    blkcfg);
1374	VTBLK_GET_CONFIG(dev, VIRTIO_BLK_F_DISCARD, max_discard_seg, blkcfg);
1375	VTBLK_GET_CONFIG(dev, VIRTIO_BLK_F_DISCARD, discard_sector_alignment,
1376	    blkcfg);
1377}
1378
1379#undef VTBLK_GET_CONFIG
1380
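/*
 * Fetch the device identifier into the disk's d_ident with a polled
 * VIRTIO_BLK_T_GET_ID request, unless disabled by the no_ident tunable.
 */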
1381static void
1382vtblk_ident(struct vtblk_softc *sc)
1383{
1384	struct bio buf;
1385	struct disk *dp;
1386	struct vtblk_request *req;
1387	int len, error;
1388
1389	dp = sc->vtblk_disk;
1390	len = MIN(VIRTIO_BLK_ID_BYTES, DISK_IDENT_SIZE);
1391
1392	if (vtblk_tunable_int(sc, "no_ident", vtblk_no_ident) != 0)
1393		return;
1394
1395	req = vtblk_request_dequeue(sc);
1396	if (req == NULL)
1397		return;
1398
1399	req->vbr_ack = -1;
1400	req->vbr_hdr.type = vtblk_gtoh32(sc, VIRTIO_BLK_T_GET_ID);
1401	req->vbr_hdr.ioprio = vtblk_gtoh32(sc, 1);
1402	req->vbr_hdr.sector = 0;
1403
1404	req->vbr_bp = &buf;
1405	g_reset_bio(&buf);
1406
1407	buf.bio_cmd = BIO_READ;
1408	buf.bio_data = dp->d_ident;
1409	buf.bio_bcount = len;
1410
1411	VTBLK_LOCK(sc);
1412	error = vtblk_poll_request(sc, req);
1413	VTBLK_UNLOCK(sc);
1414
1415	vtblk_request_enqueue(sc, req);
1416
1417	if (error) {
1418		device_printf(sc->vtblk_dev,
1419		    "error getting device identifier: %d\n", error);
1420	}
1421}
1422
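/*
 * Execute a single request synchronously, busy-waiting for the host to
 * complete it.  The virtqueue must be empty; used for the identifier
 * request and for kernel dumps.
 */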
1423static int
1424vtblk_poll_request(struct vtblk_softc *sc, struct vtblk_request *req)
1425{
1426	struct virtqueue *vq;
1427	int error;
1428
1429	vq = sc->vtblk_vq;
1430
1431	if (!virtqueue_empty(vq))
1432		return (EBUSY);
1433
1434	error = vtblk_request_execute(req, BUS_DMA_NOWAIT);
1435	if (error)
1436		return (error);
1437
1438	virtqueue_notify(vq);
1439	virtqueue_poll(vq, NULL);
1440
1441	error = vtblk_request_error(req);
1442	if (error && bootverbose) {
1443		device_printf(sc->vtblk_dev,
1444		    "%s: IO error: %d\n", __func__, error);
1445	}
1446
1447	return (error);
1448}
1449
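/*
 * Wait for all in-flight requests to complete, returning EBUSY if the
 * virtqueue has not drained within VTBLK_QUIESCE_TIMEOUT.
 */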
1450static int
1451vtblk_quiesce(struct vtblk_softc *sc)
1452{
1453	int error;
1454
1455	VTBLK_LOCK_ASSERT(sc);
1456	error = 0;
1457
1458	while (!virtqueue_empty(sc->vtblk_vq)) {
1459		if (mtx_sleep(&sc->vtblk_vq, VTBLK_MTX(sc), PRIBIO, "vtblkq",
1460		    VTBLK_QUIESCE_TIMEOUT) == EWOULDBLOCK) {
1461			error = EBUSY;
1462			break;
1463		}
1464	}
1465
1466	return (error);
1467}
1468
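/*
 * Virtqueue interrupt handler: drain completions, restart queued I/O,
 * and re-check for missed completions when re-enabling interrupts.
 * The completed bios are finished after the lock is dropped.
 */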
1469static void
1470vtblk_vq_intr(void *xsc)
1471{
1472	struct vtblk_softc *sc;
1473	struct virtqueue *vq;
1474	struct bio_queue queue;
1475
1476	sc = xsc;
1477	vq = sc->vtblk_vq;
1478	TAILQ_INIT(&queue);
1479
1480	VTBLK_LOCK(sc);
1481
1482again:
1483	if (sc->vtblk_flags & VTBLK_FLAG_DETACH)
1484		goto out;
1485
1486	vtblk_queue_completed(sc, &queue);
1487	vtblk_startio(sc);
1488
1489	if (virtqueue_enable_intr(vq) != 0) {
1490		virtqueue_disable_intr(vq);
1491		goto again;
1492	}
1493
1494	if (sc->vtblk_flags & VTBLK_FLAG_SUSPEND)
1495		wakeup(&sc->vtblk_vq);
1496
1497out:
1498	VTBLK_UNLOCK(sc);
1499	vtblk_done_completed(sc, &queue);
1500}
1501
1502static void
1503vtblk_stop(struct vtblk_softc *sc)
1504{
1505
1506	virtqueue_disable_intr(sc->vtblk_vq);
1507	virtio_stop(sc->vtblk_dev);
1508}
1509
1510static void
1511vtblk_dump_quiesce(struct vtblk_softc *sc)
1512{
1513
1514	/*
1515	 * Spin here until all the requests in-flight at the time of the
1516	 * dump are completed and queued. The queued requests will be
1517	 * biodone'd once the dump is finished.
1518	 */
1519	while (!virtqueue_empty(sc->vtblk_vq))
1520		vtblk_queue_completed(sc, &sc->vtblk_dump_queue);
1521}
1522
1523static int
1524vtblk_dump_write(struct vtblk_softc *sc, void *virtual, off_t offset,
1525    size_t length)
1526{
1527	struct bio buf;
1528	struct vtblk_request *req;
1529
1530	req = &sc->vtblk_dump_request;
1531	req->vbr_sc = sc;
1532	req->vbr_ack = -1;
1533	req->vbr_hdr.type = vtblk_gtoh32(sc, VIRTIO_BLK_T_OUT);
1534	req->vbr_hdr.ioprio = vtblk_gtoh32(sc, 1);
1535	req->vbr_hdr.sector = vtblk_gtoh64(sc, offset / VTBLK_BSIZE);
1536
1537	req->vbr_bp = &buf;
1538	g_reset_bio(&buf);
1539
1540	buf.bio_cmd = BIO_WRITE;
1541	buf.bio_data = virtual;
1542	buf.bio_bcount = length;
1543
1544	return (vtblk_poll_request(sc, req));
1545}
1546
1547static int
1548vtblk_dump_flush(struct vtblk_softc *sc)
1549{
1550	struct bio buf;
1551	struct vtblk_request *req;
1552
1553	req = &sc->vtblk_dump_request;
1554	req->vbr_sc = sc;
1555	req->vbr_ack = -1;
1556	req->vbr_hdr.type = vtblk_gtoh32(sc, VIRTIO_BLK_T_FLUSH);
1557	req->vbr_hdr.ioprio = vtblk_gtoh32(sc, 1);
1558	req->vbr_hdr.sector = 0;
1559
1560	req->vbr_bp = &buf;
1561	g_reset_bio(&buf);
1562
1563	buf.bio_cmd = BIO_FLUSH;
1564
1565	return (vtblk_poll_request(sc, req));
1566}
1567
1568static void
1569vtblk_dump_complete(struct vtblk_softc *sc)
1570{
1571
1572	vtblk_dump_flush(sc);
1573
1574	VTBLK_UNLOCK(sc);
1575	vtblk_done_completed(sc, &sc->vtblk_dump_queue);
1576	VTBLK_LOCK(sc);
1577}
1578
1579static void
1580vtblk_set_write_cache(struct vtblk_softc *sc, int wc)
1581{
1582
1583	/* Set either writeback (1) or writethrough (0) mode. */
1584	virtio_write_dev_config_1(sc->vtblk_dev,
1585	    offsetof(struct virtio_blk_config, wce), wc);
1586}
1587
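/*
 * Determine the initial write cache mode: honor the writecache_mode
 * tunable when the cache mode is configurable, otherwise infer
 * writeback from the FLUSH feature.
 */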
1588static int
1589vtblk_write_cache_enabled(struct vtblk_softc *sc,
1590    struct virtio_blk_config *blkcfg)
1591{
1592	int wc;
1593
1594	if (sc->vtblk_flags & VTBLK_FLAG_WCE_CONFIG) {
1595		wc = vtblk_tunable_int(sc, "writecache_mode",
1596		    vtblk_writecache_mode);
1597		if (wc >= 0 && wc < VTBLK_CACHE_MAX)
1598			vtblk_set_write_cache(sc, wc);
1599		else
1600			wc = blkcfg->wce;
1601	} else
1602		wc = virtio_with_feature(sc->vtblk_dev, VIRTIO_BLK_F_FLUSH);
1603
1604	return (wc);
1605}
1606
1607static int
1608vtblk_write_cache_sysctl(SYSCTL_HANDLER_ARGS)
1609{
1610	struct vtblk_softc *sc;
1611	int wc, error;
1612
1613	sc = oidp->oid_arg1;
1614	wc = sc->vtblk_write_cache;
1615
1616	error = sysctl_handle_int(oidp, &wc, 0, req);
1617	if (error || req->newptr == NULL)
1618		return (error);
1619	if ((sc->vtblk_flags & VTBLK_FLAG_WCE_CONFIG) == 0)
1620		return (EPERM);
1621	if (wc < 0 || wc >= VTBLK_CACHE_MAX)
1622		return (EINVAL);
1623
1624	VTBLK_LOCK(sc);
1625	sc->vtblk_write_cache = wc;
1626	vtblk_set_write_cache(sc, sc->vtblk_write_cache);
1627	VTBLK_UNLOCK(sc);
1628
1629	return (0);
1630}
1631
1632static void
1633vtblk_setup_sysctl(struct vtblk_softc *sc)
1634{
1635	device_t dev;
1636	struct sysctl_ctx_list *ctx;
1637	struct sysctl_oid *tree;
1638	struct sysctl_oid_list *child;
1639
1640	dev = sc->vtblk_dev;
1641	ctx = device_get_sysctl_ctx(dev);
1642	tree = device_get_sysctl_tree(dev);
1643	child = SYSCTL_CHILDREN(tree);
1644
1645	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "writecache_mode",
1646	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
1647	    vtblk_write_cache_sysctl, "I",
1648	    "Write cache mode (writethrough (0) or writeback (1))");
1649}
1650
1651static int
1652vtblk_tunable_int(struct vtblk_softc *sc, const char *knob, int def)
1653{
1654	char path[64];
1655
1656	snprintf(path, sizeof(path),
1657	    "hw.vtblk.%d.%s", device_get_unit(sc->vtblk_dev), knob);
1658	TUNABLE_INT_FETCH(path, &def);
1659
1660	return (def);
1661}
1662