1/*-
2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3 *
4 * Copyright (c) 2011 NetApp, Inc.
5 * All rights reserved.
6 * Copyright 2020 Joyent, Inc.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 *    notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 *    notice, this list of conditions and the following disclaimer in the
15 *    documentation and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 *
29 * $FreeBSD$
30 */
31
32#include <sys/cdefs.h>
33__FBSDID("$FreeBSD$");
34
35#include <sys/param.h>
36#include <sys/linker_set.h>
37#include <sys/stat.h>
38#include <sys/uio.h>
39#include <sys/ioctl.h>
40#include <sys/disk.h>
41
42#include <errno.h>
43#include <fcntl.h>
44#include <stdio.h>
45#include <stdlib.h>
46#include <stdint.h>
47#include <string.h>
48#include <strings.h>
49#include <unistd.h>
50#include <assert.h>
51#include <pthread.h>
52#include <md5.h>
53
54#include "bhyverun.h"
55#include "debug.h"
56#include "pci_emul.h"
57#include "virtio.h"
58#include "block_if.h"
59
/* Sector size in bytes; guest sector numbers and capacity use this unit. */
#define	VTBLK_BSIZE	512
/* Number of descriptors in the device's single virtqueue. */
#define	VTBLK_RINGSZ	128
62
63_Static_assert(VTBLK_RINGSZ <= BLOCKIF_RING_MAX, "Each ring entry must be able to queue a request");
64
/* Status byte values written back to the guest when a request completes. */
#define	VTBLK_S_OK	0	/* request succeeded */
#define	VTBLK_S_IOERR	1	/* request failed */
#define	VTBLK_S_UNSUPP	2	/* request type or flags not supported */
68
/*
 * Size of the device identifier buffer: 20 identifier bytes plus one byte
 * for the terminating NUL.  Parenthesized so the macro expands safely
 * inside larger expressions.
 */
#define	VTBLK_BLK_ID_BYTES	(20 + 1)
70
/*
 * Capability (feature) bits.
 *
 * VTBLK_F_FLUSH and VTBLK_F_WCE intentionally share bit 9: WCE is the
 * legacy name for the same feature.
 */
#define	VTBLK_F_BARRIER		(1 << 0)	/* Does host support barriers? */
#define	VTBLK_F_SIZE_MAX	(1 << 1)	/* Indicates maximum segment size */
#define	VTBLK_F_SEG_MAX		(1 << 2)	/* Indicates maximum # of segments */
#define	VTBLK_F_GEOMETRY	(1 << 4)	/* Legacy geometry available  */
#define	VTBLK_F_RO		(1 << 5)	/* Disk is read-only */
#define	VTBLK_F_BLK_SIZE	(1 << 6)	/* Block size of disk is available*/
#define	VTBLK_F_SCSI		(1 << 7)	/* Supports scsi command passthru */
#define	VTBLK_F_FLUSH		(1 << 9)	/* Writeback mode enabled after reset */
#define	VTBLK_F_WCE		(1 << 9)	/* Legacy alias for FLUSH */
#define	VTBLK_F_TOPOLOGY	(1 << 10)	/* Topology information is available */
#define	VTBLK_F_CONFIG_WCE	(1 << 11)	/* Writeback mode available in config */
#define	VTBLK_F_MQ		(1 << 12)	/* Multi-Queue */
#define	VTBLK_F_DISCARD		(1 << 13)	/* Trim blocks */
#define	VTBLK_F_WRITE_ZEROES	(1 << 14)	/* Write zeros */
86
/*
 * Host capabilities offered to every guest.  VTBLK_F_DISCARD is not part
 * of this base set; it is OR'ed into the per-device copy at init time when
 * the backing store supports deletion.
 */
#define	VTBLK_S_HOSTCAPS      \
  ( VTBLK_F_SEG_MAX  |						    \
    VTBLK_F_BLK_SIZE |						    \
    VTBLK_F_FLUSH    |						    \
    VTBLK_F_TOPOLOGY |						    \
    VIRTIO_RING_F_INDIRECT_DESC )	/* indirect descriptors */
96
/*
 * The current blockif_delete() interface only allows a single delete
 * request at a time, so only one discard segment is accepted per request.
 */
#define	VTBLK_MAX_DISCARD_SEG	1

/*
 * An arbitrary limit to prevent excessive latency due to large
 * delete requests.  Expressed in VTBLK_BSIZE-byte sectors.
 */
#define	VTBLK_MAX_DISCARD_SECT	((16 << 20) / VTBLK_BSIZE)	/* 16 MiB */
108
/*
 * Config space "registers".
 *
 * The structure is read byte-wise by the guest through the config-read
 * handler, so its layout is fixed: do not reorder or resize fields.
 * Fields not explicitly set at init time stay zero.
 */
struct vtblk_config {
	uint64_t	vbc_capacity;	/* device size in 512-byte sectors */
	uint32_t	vbc_size_max;	/* left at 0 (not negotiated) */
	uint32_t	vbc_seg_max;	/* max data segments per request */
	struct {
		uint16_t cylinders;
		uint8_t heads;
		uint8_t sectors;
	} vbc_geometry;			/* all zero: no geometry reported */
	uint32_t	vbc_blk_size;	/* logical sector size of the backend */
	struct {
		uint8_t physical_block_exp;	/* log2(physical/logical size) */
		uint8_t alignment_offset;	/* in logical sectors */
		uint16_t min_io_size;
		uint32_t opt_io_size;
	} vbc_topology;
	uint8_t		vbc_writeback;
	uint8_t		unused0[1];
	uint16_t	num_queues;
	uint32_t	max_discard_sectors;	/* VTBLK_MAX_DISCARD_SECT */
	uint32_t	max_discard_seg;	/* VTBLK_MAX_DISCARD_SEG */
	uint32_t	discard_sector_alignment; /* in 512-byte sectors */
	uint32_t	max_write_zeroes_sectors;
	uint32_t	max_write_zeroes_seg;
	uint8_t		write_zeroes_may_unmap;
	uint8_t		unused1[3];
} __packed;
139
/*
 * Fixed-size block header.  This is the first (read-only) descriptor of
 * every request chain submitted by the guest.
 */
struct virtio_blk_hdr {
	/* Request types carried in vbh_type. */
#define	VBH_OP_READ		0
#define	VBH_OP_WRITE		1
#define	VBH_OP_SCSI_CMD		2
#define	VBH_OP_SCSI_CMD_OUT	3
#define	VBH_OP_FLUSH		4
#define	VBH_OP_FLUSH_OUT	5
#define	VBH_OP_IDENT		8
#define	VBH_OP_DISCARD		11
#define	VBH_OP_WRITE_ZEROES	13

#define	VBH_FLAG_BARRIER	0x80000000	/* OR'ed into vbh_type */
	uint32_t	vbh_type;	/* VBH_OP_*, possibly with a flag */
	uint32_t	vbh_ioprio;
	uint64_t	vbh_sector;	/* start, in 512-byte sectors */
} __packed;
159
/*
 * Debug printf
 *
 * Both macros take a parenthesized argument list: DPRINTF(("fmt", args)).
 * DPRINTF only prints when pci_vtblk_debug is non-zero; it is wrapped in
 * do/while(0) so it behaves as a single statement and cannot capture an
 * "else" that follows it.  WPRINTF always prints.
 */
static int pci_vtblk_debug;
#define	DPRINTF(params) do {						\
	if (pci_vtblk_debug)						\
		PRINTLN params;						\
} while (0)
#define	WPRINTF(params) PRINTLN params
166
167struct pci_vtblk_ioreq {
168	struct blockif_req		io_req;
169	struct pci_vtblk_softc		*io_sc;
170	uint8_t				*io_status;
171	uint16_t			io_idx;
172};
173
/*
 * Payload of a DISCARD request: the range to discard is passed in the
 * data descriptor instead of data.
 *
 * NOTE(review): bit-field placement is implementation-defined; this
 * presumably relies on "unmap" landing in the least significant bit --
 * confirm for any new compiler/ABI.
 */
struct virtio_blk_discard_write_zeroes {
	uint64_t	sector;		/* first sector, 512-byte units */
	uint32_t	num_sectors;	/* length in 512-byte sectors */
	struct {
		uint32_t unmap:1;
		uint32_t reserved:31;
	} flags;
};
182
183/*
184 * Per-device softc
185 */
186struct pci_vtblk_softc {
187	struct virtio_softc vbsc_vs;
188	pthread_mutex_t vsc_mtx;
189	struct vqueue_info vbsc_vq;
190	struct vtblk_config vbsc_cfg;
191	struct virtio_consts vbsc_consts;
192	struct blockif_ctxt *bc;
193	char vbsc_ident[VTBLK_BLK_ID_BYTES];
194	struct pci_vtblk_ioreq vbsc_ios[VTBLK_RINGSZ];
195};
196
/* Forward declarations for the handlers referenced by vtblk_vi_consts. */
static void pci_vtblk_reset(void *);
static void pci_vtblk_notify(void *, struct vqueue_info *);
static int pci_vtblk_cfgread(void *, int, int, uint32_t *);
static int pci_vtblk_cfgwrite(void *, int, int, uint32_t);
201
202static struct virtio_consts vtblk_vi_consts = {
203	"vtblk",		/* our name */
204	1,			/* we support 1 virtqueue */
205	sizeof(struct vtblk_config),	/* config reg size */
206	pci_vtblk_reset,	/* reset */
207	pci_vtblk_notify,	/* device-wide qnotify */
208	pci_vtblk_cfgread,	/* read PCI config */
209	pci_vtblk_cfgwrite,	/* write PCI config */
210	NULL,			/* apply negotiated features */
211	VTBLK_S_HOSTCAPS,	/* our capabilities */
212};
213
214static void
215pci_vtblk_reset(void *vsc)
216{
217	struct pci_vtblk_softc *sc = vsc;
218
219	DPRINTF(("vtblk: device reset requested !"));
220	vi_reset_dev(&sc->vbsc_vs);
221}
222
223static void
224pci_vtblk_done_locked(struct pci_vtblk_ioreq *io, int err)
225{
226	struct pci_vtblk_softc *sc = io->io_sc;
227
228	/* convert errno into a virtio block error return */
229	if (err == EOPNOTSUPP || err == ENOSYS)
230		*io->io_status = VTBLK_S_UNSUPP;
231	else if (err != 0)
232		*io->io_status = VTBLK_S_IOERR;
233	else
234		*io->io_status = VTBLK_S_OK;
235
236	/*
237	 * Return the descriptor back to the host.
238	 * We wrote 1 byte (our status) to host.
239	 */
240	vq_relchain(&sc->vbsc_vq, io->io_idx, 1);
241	vq_endchains(&sc->vbsc_vq, 0);
242}
243
244static void
245pci_vtblk_done(struct blockif_req *br, int err)
246{
247	struct pci_vtblk_ioreq *io = br->br_param;
248	struct pci_vtblk_softc *sc = io->io_sc;
249
250	pthread_mutex_lock(&sc->vsc_mtx);
251	pci_vtblk_done_locked(io, err);
252	pthread_mutex_unlock(&sc->vsc_mtx);
253}
254
255static void
256pci_vtblk_proc(struct pci_vtblk_softc *sc, struct vqueue_info *vq)
257{
258	struct virtio_blk_hdr *vbh;
259	struct pci_vtblk_ioreq *io;
260	int i, n;
261	int err;
262	ssize_t iolen;
263	int writeop, type;
264	struct iovec iov[BLOCKIF_IOV_MAX + 2];
265	uint16_t idx, flags[BLOCKIF_IOV_MAX + 2];
266	struct virtio_blk_discard_write_zeroes *discard;
267
268	n = vq_getchain(vq, &idx, iov, BLOCKIF_IOV_MAX + 2, flags);
269
270	/*
271	 * The first descriptor will be the read-only fixed header,
272	 * and the last is for status (hence +2 above and below).
273	 * The remaining iov's are the actual data I/O vectors.
274	 *
275	 * XXX - note - this fails on crash dump, which does a
276	 * VIRTIO_BLK_T_FLUSH with a zero transfer length
277	 */
278	assert(n >= 2 && n <= BLOCKIF_IOV_MAX + 2);
279
280	io = &sc->vbsc_ios[idx];
281	assert((flags[0] & VRING_DESC_F_WRITE) == 0);
282	assert(iov[0].iov_len == sizeof(struct virtio_blk_hdr));
283	vbh = (struct virtio_blk_hdr *)iov[0].iov_base;
284	memcpy(&io->io_req.br_iov, &iov[1], sizeof(struct iovec) * (n - 2));
285	io->io_req.br_iovcnt = n - 2;
286	io->io_req.br_offset = vbh->vbh_sector * VTBLK_BSIZE;
287	io->io_status = (uint8_t *)iov[--n].iov_base;
288	assert(iov[n].iov_len == 1);
289	assert(flags[n] & VRING_DESC_F_WRITE);
290
291	/*
292	 * XXX
293	 * The guest should not be setting the BARRIER flag because
294	 * we don't advertise the capability.
295	 */
296	type = vbh->vbh_type & ~VBH_FLAG_BARRIER;
297	writeop = (type == VBH_OP_WRITE || type == VBH_OP_DISCARD);
298
299	iolen = 0;
300	for (i = 1; i < n; i++) {
301		/*
302		 * - write op implies read-only descriptor,
303		 * - read/ident op implies write-only descriptor,
304		 * therefore test the inverse of the descriptor bit
305		 * to the op.
306		 */
307		assert(((flags[i] & VRING_DESC_F_WRITE) == 0) == writeop);
308		iolen += iov[i].iov_len;
309	}
310	io->io_req.br_resid = iolen;
311
312	DPRINTF(("virtio-block: %s op, %zd bytes, %d segs, offset %ld",
313		 writeop ? "write/discard" : "read/ident", iolen, i - 1,
314		 io->io_req.br_offset));
315
316	switch (type) {
317	case VBH_OP_READ:
318		err = blockif_read(sc->bc, &io->io_req);
319		break;
320	case VBH_OP_WRITE:
321		err = blockif_write(sc->bc, &io->io_req);
322		break;
323	case VBH_OP_DISCARD:
324		/*
325		 * We currently only support a single request, if the guest
326		 * has submitted a request that doesn't conform to the
327		 * requirements, we return a error.
328		 */
329		if (iov[1].iov_len != sizeof (*discard)) {
330			pci_vtblk_done_locked(io, EINVAL);
331			return;
332		}
333
334		/* The segments to discard are provided rather than data */
335		discard = (struct virtio_blk_discard_write_zeroes *)
336		    iov[1].iov_base;
337
338		/*
339		 * virtio v1.1 5.2.6.2:
340		 * The device MUST set the status byte to VIRTIO_BLK_S_UNSUPP
341		 * for discard and write zeroes commands if any unknown flag is
342		 * set. Furthermore, the device MUST set the status byte to
343		 * VIRTIO_BLK_S_UNSUPP for discard commands if the unmap flag
344		 * is set.
345		 *
346		 * Currently there are no known flags for a DISCARD request.
347		 */
348		if (discard->flags.unmap != 0 || discard->flags.reserved != 0) {
349			pci_vtblk_done_locked(io, ENOTSUP);
350			return;
351		}
352
353		/* Make sure the request doesn't exceed our size limit */
354		if (discard->num_sectors > VTBLK_MAX_DISCARD_SECT) {
355			pci_vtblk_done_locked(io, EINVAL);
356			return;
357		}
358
359		io->io_req.br_offset = discard->sector * VTBLK_BSIZE;
360		io->io_req.br_resid = discard->num_sectors * VTBLK_BSIZE;
361		err = blockif_delete(sc->bc, &io->io_req);
362		break;
363	case VBH_OP_FLUSH:
364	case VBH_OP_FLUSH_OUT:
365		err = blockif_flush(sc->bc, &io->io_req);
366		break;
367	case VBH_OP_IDENT:
368		/* Assume a single buffer */
369		/* S/n equal to buffer is not zero-terminated. */
370		memset(iov[1].iov_base, 0, iov[1].iov_len);
371		strncpy(iov[1].iov_base, sc->vbsc_ident,
372		    MIN(iov[1].iov_len, sizeof(sc->vbsc_ident)));
373		pci_vtblk_done_locked(io, 0);
374		return;
375	default:
376		pci_vtblk_done_locked(io, EOPNOTSUPP);
377		return;
378	}
379	assert(err == 0);
380}
381
/*
 * Queue-notify handler: service request chains until the virtqueue has
 * no more available descriptors.
 *
 * vsc: the struct pci_vtblk_softc of the notified device.
 * vq:  the queue that was kicked.
 */
static void
pci_vtblk_notify(void *vsc, struct vqueue_info *vq)
{
	struct pci_vtblk_softc *sc = vsc;

	while (vq_has_descs(vq))
		pci_vtblk_proc(sc, vq);
}
390
391static int
392pci_vtblk_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
393{
394	char bident[sizeof("XX:X:X")];
395	struct blockif_ctxt *bctxt;
396	MD5_CTX mdctx;
397	u_char digest[16];
398	struct pci_vtblk_softc *sc;
399	off_t size;
400	int i, sectsz, sts, sto;
401
402	if (opts == NULL) {
403		WPRINTF(("virtio-block: backing device required"));
404		return (1);
405	}
406
407	/*
408	 * The supplied backing file has to exist
409	 */
410	snprintf(bident, sizeof(bident), "%d:%d", pi->pi_slot, pi->pi_func);
411	bctxt = blockif_open(opts, bident);
412	if (bctxt == NULL) {
413		perror("Could not open backing file");
414		return (1);
415	}
416
417	size = blockif_size(bctxt);
418	sectsz = blockif_sectsz(bctxt);
419	blockif_psectsz(bctxt, &sts, &sto);
420
421	sc = calloc(1, sizeof(struct pci_vtblk_softc));
422	sc->bc = bctxt;
423	for (i = 0; i < VTBLK_RINGSZ; i++) {
424		struct pci_vtblk_ioreq *io = &sc->vbsc_ios[i];
425		io->io_req.br_callback = pci_vtblk_done;
426		io->io_req.br_param = io;
427		io->io_sc = sc;
428		io->io_idx = i;
429	}
430
431	bcopy(&vtblk_vi_consts, &sc->vbsc_consts, sizeof (vtblk_vi_consts));
432	if (blockif_candelete(sc->bc))
433		sc->vbsc_consts.vc_hv_caps |= VTBLK_F_DISCARD;
434
435	pthread_mutex_init(&sc->vsc_mtx, NULL);
436
437	/* init virtio softc and virtqueues */
438	vi_softc_linkup(&sc->vbsc_vs, &sc->vbsc_consts, sc, pi, &sc->vbsc_vq);
439	sc->vbsc_vs.vs_mtx = &sc->vsc_mtx;
440
441	sc->vbsc_vq.vq_qsize = VTBLK_RINGSZ;
442	/* sc->vbsc_vq.vq_notify = we have no per-queue notify */
443
444	/*
445	 * Create an identifier for the backing file. Use parts of the
446	 * md5 sum of the filename
447	 */
448	MD5Init(&mdctx);
449	MD5Update(&mdctx, opts, strlen(opts));
450	MD5Final(digest, &mdctx);
451	snprintf(sc->vbsc_ident, VTBLK_BLK_ID_BYTES,
452	    "BHYVE-%02X%02X-%02X%02X-%02X%02X",
453	    digest[0], digest[1], digest[2], digest[3], digest[4], digest[5]);
454
455	/* setup virtio block config space */
456	sc->vbsc_cfg.vbc_capacity = size / VTBLK_BSIZE; /* 512-byte units */
457	sc->vbsc_cfg.vbc_size_max = 0;	/* not negotiated */
458
459	/*
460	 * If Linux is presented with a seg_max greater than the virtio queue
461	 * size, it can stumble into situations where it violates its own
462	 * invariants and panics.  For safety, we keep seg_max clamped, paying
463	 * heed to the two extra descriptors needed for the header and status
464	 * of a request.
465	 */
466	sc->vbsc_cfg.vbc_seg_max = MIN(VTBLK_RINGSZ - 2, BLOCKIF_IOV_MAX);
467	sc->vbsc_cfg.vbc_geometry.cylinders = 0;	/* no geometry */
468	sc->vbsc_cfg.vbc_geometry.heads = 0;
469	sc->vbsc_cfg.vbc_geometry.sectors = 0;
470	sc->vbsc_cfg.vbc_blk_size = sectsz;
471	sc->vbsc_cfg.vbc_topology.physical_block_exp =
472	    (sts > sectsz) ? (ffsll(sts / sectsz) - 1) : 0;
473	sc->vbsc_cfg.vbc_topology.alignment_offset =
474	    (sto != 0) ? ((sts - sto) / sectsz) : 0;
475	sc->vbsc_cfg.vbc_topology.min_io_size = 0;
476	sc->vbsc_cfg.vbc_topology.opt_io_size = 0;
477	sc->vbsc_cfg.vbc_writeback = 0;
478	sc->vbsc_cfg.max_discard_sectors = VTBLK_MAX_DISCARD_SECT;
479	sc->vbsc_cfg.max_discard_seg = VTBLK_MAX_DISCARD_SEG;
480	sc->vbsc_cfg.discard_sector_alignment = sectsz / VTBLK_BSIZE;
481
482	/*
483	 * Should we move some of this into virtio.c?  Could
484	 * have the device, class, and subdev_0 as fields in
485	 * the virtio constants structure.
486	 */
487	pci_set_cfgdata16(pi, PCIR_DEVICE, VIRTIO_DEV_BLOCK);
488	pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR);
489	pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE);
490	pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_TYPE_BLOCK);
491	pci_set_cfgdata16(pi, PCIR_SUBVEND_0, VIRTIO_VENDOR);
492
493	if (vi_intr_init(&sc->vbsc_vs, 1, fbsdrun_virtio_msix())) {
494		blockif_close(sc->bc);
495		free(sc);
496		return (1);
497	}
498	vi_set_io_bar(&sc->vbsc_vs, 0);
499	return (0);
500}
501
/*
 * Config-space write handler.  The block device config space is read-only,
 * so the write is only logged (when debugging is enabled) and rejected.
 *
 * vsc, size, value: unused.
 * offset: config-space offset the guest tried to write.
 *
 * Always returns 1.
 */
static int
pci_vtblk_cfgwrite(void *vsc, int offset, int size, uint32_t value)
{

	DPRINTF(("vtblk: write to readonly reg %d", offset));
	return (1);
}
509
510static int
511pci_vtblk_cfgread(void *vsc, int offset, int size, uint32_t *retval)
512{
513	struct pci_vtblk_softc *sc = vsc;
514	void *ptr;
515
516	/* our caller has already verified offset and size */
517	ptr = (uint8_t *)&sc->vbsc_cfg + offset;
518	memcpy(retval, ptr, size);
519	return (0);
520}
521
522struct pci_devemu pci_de_vblk = {
523	.pe_emu =	"virtio-blk",
524	.pe_init =	pci_vtblk_init,
525	.pe_barwrite =	vi_pci_write,
526	.pe_barread =	vi_pci_read
527};
528PCI_EMUL_SET(pci_de_vblk);
529