/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2013  Chris Torek <torek @ torek net>
 * All rights reserved.
 * Copyright (c) 2019 Joyent, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/param.h>
#include <sys/uio.h>

#include <machine/atomic.h>

#include <dev/virtio/pci/virtio_pci_legacy_var.h>

#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <assert.h>
#include <pthread.h>
#include <pthread_np.h>

#include "bhyverun.h"
#include "debug.h"
#include "pci_emul.h"
#ifdef BHYVE_SNAPSHOT
#include "snapshot.h"
#endif
#include "virtio.h"

/*
 * Functions for dealing with generalized "virtual devices" as
 * defined by the virtio specification.
 */

/*
 * In case we decide to relax the "virtio softc comes at the
 * front of virtio-based device softc" constraint, let's use
 * this to convert.
 */
#define	DEV_SOFTC(vs) ((void *)(vs))

/*
 * Link a virtio_softc to its constants, the device softc, and
 * the PCI emulation.
 */
void
vi_softc_linkup(struct virtio_softc *vs, struct virtio_consts *vc,
		void *dev_softc, struct pci_devinst *pi,
		struct vqueue_info *queues)
{
	int i;

	/* vs and dev_softc addresses must match */
	assert((void *)vs == dev_softc);
	vs->vs_vc = vc;
	vs->vs_pi = pi;
	pi->pi_arg = vs;

	vs->vs_queues = queues;
	for (i = 0; i < vc->vc_nvq; i++) {
		queues[i].vq_vs = vs;
		queues[i].vq_num = i;
	}
}
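
/*
 * Illustrative sketch (not part of this file): a hypothetical device
 * softc laid out the way vi_softc_linkup() and DEV_SOFTC() assume,
 * i.e., with the virtio_softc as its first member so that the two
 * addresses coincide.  The "example_*" names are made up.
 */
#if 0
struct example_softc {
	struct virtio_softc ex_vs;		/* must be first */
	struct vqueue_info ex_queues[1];
	/* ... device-specific state ... */
};

static void
example_linkup(struct example_softc *sc, struct pci_devinst *pi)
{
	/* ex_vs is at offset 0, so &sc->ex_vs == (void *)sc */
	vi_softc_linkup(&sc->ex_vs, &example_consts, sc, pi, sc->ex_queues);
}
#endif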

/*
 * Reset device (device-wide).  This erases all queues, i.e.,
 * all the queues become invalid (though we don't wipe out the
 * internal pointers, we just clear the VQ_ALLOC flag).
 *
 * It resets negotiated features to "none".
 *
 * If MSI-X is enabled, this also resets all the vectors to NO_VECTOR.
 */
void
vi_reset_dev(struct virtio_softc *vs)
{
	struct vqueue_info *vq;
	int i, nvq;

	if (vs->vs_mtx)
		assert(pthread_mutex_isowned_np(vs->vs_mtx));

	nvq = vs->vs_vc->vc_nvq;
	for (vq = vs->vs_queues, i = 0; i < nvq; vq++, i++) {
		vq->vq_flags = 0;
		vq->vq_last_avail = 0;
		vq->vq_next_used = 0;
		vq->vq_save_used = 0;
		vq->vq_pfn = 0;
		vq->vq_msix_idx = VIRTIO_MSI_NO_VECTOR;
	}
	vs->vs_negotiated_caps = 0;
	vs->vs_curq = 0;
	/* vs->vs_status = 0; -- redundant */
	if (vs->vs_isr)
		pci_lintr_deassert(vs->vs_pi);
	vs->vs_isr = 0;
	vs->vs_msix_cfg_idx = VIRTIO_MSI_NO_VECTOR;
}

/*
 * Set I/O BAR (usually 0) to map PCI config registers.
 */
void
vi_set_io_bar(struct virtio_softc *vs, int barnum)
{
	size_t size;

	/*
	 * ??? should we use VIRTIO_PCI_CONFIG_OFF(0) if MSI-X is disabled?
	 * Existing code did not...
	 */
	size = VIRTIO_PCI_CONFIG_OFF(1) + vs->vs_vc->vc_cfgsize;
	pci_emul_alloc_bar(vs->vs_pi, barnum, PCIBAR_IO, size);
}
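
/*
 * Example of the size computation above (illustrative numbers): in the
 * legacy virtio PCI layout, VIRTIO_PCI_CONFIG_OFF(1) is 24 bytes (20
 * bytes of common registers plus 4 bytes of MSI-X vector registers), so
 * a device whose vc_cfgsize is 8 gets a 32-byte I/O BAR.
 */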

/*
 * Initialize MSI-X vector capabilities if we're to use MSI-X,
 * or MSI capabilities if not.
 *
 * We assume we want one MSI-X vector per queue, here, plus one
 * for the config vec.
 */
int
vi_intr_init(struct virtio_softc *vs, int barnum, int use_msix)
{
	int nvec;

	if (use_msix) {
		vs->vs_flags |= VIRTIO_USE_MSIX;
		VS_LOCK(vs);
		vi_reset_dev(vs); /* set all vectors to NO_VECTOR */
		VS_UNLOCK(vs);
		nvec = vs->vs_vc->vc_nvq + 1;
		if (pci_emul_add_msixcap(vs->vs_pi, nvec, barnum))
			return (1);
	} else
		vs->vs_flags &= ~VIRTIO_USE_MSIX;

	/* Only 1 MSI vector for bhyve */
	pci_emul_add_msicap(vs->vs_pi, 1);

	/* Legacy interrupts are mandatory for virtio devices */
	pci_lintr_request(vs->vs_pi);

	return (0);
}

/*
 * Initialize the currently-selected virtio queue (vs->vs_curq).
 * The guest just gave us a page frame number, from which we can
 * calculate the addresses of the queue.
 */
static void
vi_vq_init(struct virtio_softc *vs, uint32_t pfn)
{
	struct vqueue_info *vq;
	uint64_t phys;
	size_t size;
	char *base;

	vq = &vs->vs_queues[vs->vs_curq];
	vq->vq_pfn = pfn;
	phys = (uint64_t)pfn << VRING_PFN;
	size = vring_size_aligned(vq->vq_qsize);
	base = paddr_guest2host(vs->vs_pi->pi_vmctx, phys, size);

	/* First page(s) are descriptors... */
	vq->vq_desc = (struct vring_desc *)base;
	base += vq->vq_qsize * sizeof(struct vring_desc);

	/* ... immediately followed by "avail" ring (entirely uint16_t's) */
	vq->vq_avail = (struct vring_avail *)base;
	base += (2 + vq->vq_qsize + 1) * sizeof(uint16_t);

	/* Then it's rounded up to the next page... */
	base = (char *)roundup2((uintptr_t)base, VRING_ALIGN);

	/* ... and the last page(s) are the used ring. */
	vq->vq_used = (struct vring_used *)base;

	/* Mark queue as allocated, and start at 0 when we use it. */
	vq->vq_flags = VQ_ALLOC;
	vq->vq_last_avail = 0;
	vq->vq_next_used = 0;
	vq->vq_save_used = 0;
}
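
/*
 * Worked example of the layout computed above (illustrative: queue size
 * 256 with the usual 4096-byte VRING_ALIGN):
 *
 *	descriptor table:  256 * 16 bytes          = 4096, at offset 0
 *	avail ring:        (2 + 256 + 1) * 2 bytes = 518, at offset 4096
 *	  (round up to the next 4096-byte boundary -> offset 8192)
 *	used ring:         4 + 256 * 8 + 2 bytes   = 2054, at offset 8192
 *
 * so vring_size_aligned(256) covers three 4096-byte pages of guest memory.
 */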

/*
 * Helper inline for vq_getchain(): record the i'th "real"
 * descriptor.
 */
static inline void
_vq_record(int i, struct vring_desc *vd, struct vmctx *ctx, struct iovec *iov,
    int n_iov, struct vi_req *reqp)
{
	if (i >= n_iov)
		return;
	iov[i].iov_base = paddr_guest2host(ctx, vd->addr, vd->len);
	iov[i].iov_len = vd->len;
	if ((vd->flags & VRING_DESC_F_WRITE) == 0)
		reqp->readable++;
	else
		reqp->writable++;
}
#define	VQ_MAX_DESCRIPTORS	512	/* see below */

/*
 * Examine the chain of descriptors starting at the "next one" to
 * make sure that they describe a sensible request.  If so, return
 * the number of "real" descriptors that would be needed/used in
 * acting on this request.  This may be smaller than the number of
 * available descriptors, e.g., if there are two available but
 * they are two separate requests, this just returns 1.  Or, it
 * may be larger: if there are indirect descriptors involved,
 * there may only be one descriptor available but it may be an
 * indirect pointing to eight more.  We return 8 in this case,
 * i.e., we do not count the indirect descriptors, only the "real"
 * ones.
 *
 * Basically, this vets the "flags" and "next" field of each
 * descriptor and tells you how many are involved.  Since some may
 * be indirect, this also needs the vmctx (in the pci_devinst
 * at vs->vs_pi) so that it can find indirect descriptors.
 *
 * As we process each descriptor, we copy and adjust it (guest to
 * host address wise, also using the vmctx) into the given iov[]
 * array (of the given size).  If the array overflows, we stop
 * placing values into the array but keep processing descriptors,
 * up to VQ_MAX_DESCRIPTORS, before giving up and returning -1.
 * So you, the caller, must not assume that iov[] is as big as the
 * return value (you can process the same thing twice to allocate
 * a larger iov array if needed, or supply a zero length to find
 * out how much space is needed).
 *
 * If some descriptor(s) are invalid, this prints a diagnostic message
 * and returns -1.  If no descriptors are ready now it simply returns 0.
 *
 * You are assumed to have done a vq_ring_ready() if needed (note
 * that vq_has_descs() does one).
 */
int
vq_getchain(struct vqueue_info *vq, struct iovec *iov, int niov,
	    struct vi_req *reqp)
{
	int i;
	u_int ndesc, n_indir;
	u_int idx, next;
	struct vi_req req;
	struct vring_desc *vdir, *vindir, *vp;
	struct vmctx *ctx;
	struct virtio_softc *vs;
	const char *name;

	vs = vq->vq_vs;
	name = vs->vs_vc->vc_name;
	memset(&req, 0, sizeof(req));

	/*
	 * Note: it's the responsibility of the guest not to
	 * update vq->vq_avail->idx until all of the descriptors
	 * the guest has written are valid (including all their
	 * "next" fields and "flags").
	 *
	 * Compute (vq_avail->idx - last_avail) in integers mod 2**16.  This is
	 * the number of descriptors the guest has made available
	 * since the last time we updated vq->vq_last_avail.
	 *
	 * We just need to do the subtraction as an unsigned int,
	 * then trim off excess bits.
	 */
	idx = vq->vq_last_avail;
	ndesc = (uint16_t)((u_int)vq->vq_avail->idx - idx);
	if (ndesc == 0)
		return (0);
	if (ndesc > vq->vq_qsize) {
		/* XXX need better way to diagnose issues */
		EPRINTLN(
		    "%s: ndesc (%u) out of range, driver confused?",
		    name, (u_int)ndesc);
		return (-1);
	}

	/*
	 * Now count/parse "involved" descriptors starting from
	 * the head of the chain.
	 *
	 * To prevent loops, we could be more complicated and
	 * check whether we're re-visiting a previously visited
	 * index, but we just abort if the count gets excessive.
	 */
	ctx = vs->vs_pi->pi_vmctx;
	req.idx = next = vq->vq_avail->ring[idx & (vq->vq_qsize - 1)];
	vq->vq_last_avail++;
	for (i = 0; i < VQ_MAX_DESCRIPTORS; next = vdir->next) {
		if (next >= vq->vq_qsize) {
			EPRINTLN(
			    "%s: descriptor index %u out of range, "
			    "driver confused?",
			    name, next);
			return (-1);
		}
		vdir = &vq->vq_desc[next];
		if ((vdir->flags & VRING_DESC_F_INDIRECT) == 0) {
			_vq_record(i, vdir, ctx, iov, niov, &req);
			i++;
		} else if ((vs->vs_vc->vc_hv_caps &
		    VIRTIO_RING_F_INDIRECT_DESC) == 0) {
			EPRINTLN(
			    "%s: descriptor has forbidden INDIRECT flag, "
			    "driver confused?",
			    name);
			return (-1);
		} else {
			n_indir = vdir->len / 16;
			if ((vdir->len & 0xf) || n_indir == 0) {
				EPRINTLN(
				    "%s: invalid indir len 0x%x, "
				    "driver confused?",
				    name, (u_int)vdir->len);
				return (-1);
			}
			vindir = paddr_guest2host(ctx,
			    vdir->addr, vdir->len);
			/*
			 * Indirects start at the 0th, then follow
			 * their own embedded "next"s until those run
			 * out.  Each one's indirect flag must be off
			 * (we don't really have to check, could just
			 * ignore errors...).
			 */
			next = 0;
			for (;;) {
				vp = &vindir[next];
				if (vp->flags & VRING_DESC_F_INDIRECT) {
					EPRINTLN(
					    "%s: indirect desc has INDIR flag,"
					    " driver confused?",
					    name);
					return (-1);
				}
				_vq_record(i, vp, ctx, iov, niov, &req);
				if (++i > VQ_MAX_DESCRIPTORS)
					goto loopy;
				if ((vp->flags & VRING_DESC_F_NEXT) == 0)
					break;
				next = vp->next;
				if (next >= n_indir) {
					EPRINTLN(
					    "%s: invalid next %u > %u, "
					    "driver confused?",
					    name, (u_int)next, n_indir);
					return (-1);
				}
			}
		}
		if ((vdir->flags & VRING_DESC_F_NEXT) == 0)
			goto done;
	}

loopy:
	EPRINTLN(
	    "%s: descriptor loop? count > %d - driver confused?",
	    name, i);
	return (-1);

done:
	*reqp = req;
	return (i);
}
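
/*
 * Illustrative use of the sizing behaviour documented above (hypothetical
 * caller code, not part of this file): probing with a zero-length iov[]
 * walks and validates the chain without recording anything, and
 * vq_retchains() puts the chain back so it can be fetched again with a
 * suitably sized array.
 */
#if 0
static int
example_count_descs(struct vqueue_info *vq, struct vi_req *req)
{
	int n;

	/* niov == 0: nothing is recorded, but the chain is still vetted. */
	n = vq_getchain(vq, NULL, 0, req);
	if (n <= 0)
		return (n);
	/* Return the chain to the available queue for a second pass. */
	vq_retchains(vq, 1);
	return (n);
}
#endif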

/*
 * Return the first n_chains request chains back to the available queue.
 *
 * (These chains are the ones you handled when you called vq_getchain()
 * and used its positive return value.)
 */
void
vq_retchains(struct vqueue_info *vq, uint16_t n_chains)
{

	vq->vq_last_avail -= n_chains;
}

void
vq_relchain_prepare(struct vqueue_info *vq, uint16_t idx, uint32_t iolen)
{
	struct vring_used *vuh;
	struct vring_used_elem *vue;
	uint16_t mask;

	/*
	 * Notes:
	 *  - mask is N-1 where N is a power of 2 so computes x % N
	 *  - vuh points to the "used" data shared with guest
	 *  - vue points to the "used" ring entry we want to update
	 */
	mask = vq->vq_qsize - 1;
	vuh = vq->vq_used;

	vue = &vuh->ring[vq->vq_next_used++ & mask];
	vue->id = idx;
	vue->len = iolen;
}

void
vq_relchain_publish(struct vqueue_info *vq)
{
	/*
	 * Ensure the used descriptor is visible before updating the index.
	 * This is necessary on ISAs with memory ordering less strict than x86
	 * (and even on x86 to act as a compiler barrier).
	 */
	atomic_thread_fence_rel();
	vq->vq_used->idx = vq->vq_next_used;
}

/*
 * Return specified request chain to the guest, setting its I/O length
 * to the provided value.
 *
 * (This chain is the one you handled when you called vq_getchain()
 * and used its positive return value.)
 */
void
vq_relchain(struct vqueue_info *vq, uint16_t idx, uint32_t iolen)
{
	vq_relchain_prepare(vq, idx, iolen);
	vq_relchain_publish(vq);
}
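
/*
 * Illustrative queue-processing loop (hypothetical device notify handler,
 * not part of this file) tying vq_getchain(), vq_relchain() and
 * vq_endchains() together; EXAMPLE_MAXSEGS is a made-up bound.
 */
#if 0
#define	EXAMPLE_MAXSEGS	8

static void
example_notify(void *dev_softc, struct vqueue_info *vq)
{
	struct iovec iov[EXAMPLE_MAXSEGS];
	struct vi_req req;
	int n;

	while (vq_has_descs(vq)) {
		n = vq_getchain(vq, iov, EXAMPLE_MAXSEGS, &req);
		if (n <= 0)
			break;
		/*
		 * Consume req.readable readable segments and fill
		 * req.writable writable segments from iov[0 .. n-1].
		 */
		vq_relchain(vq, req.idx, 0 /* bytes written back */);
	}
	vq_endchains(vq, 1 /* used_all_avail */);
}
#endif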

/*
 * Driver has finished processing "available" chains and calling
 * vq_relchain on each one.  If the driver used all the available
 * chains, used_all_avail should be set.
 *
 * If the "used" index moved we may need to inform the guest, i.e.,
 * deliver an interrupt.  Even if the used index did NOT move we
 * may need to deliver an interrupt, if the avail ring is empty and
 * we are supposed to interrupt on empty.
 *
 * Note that used_all_avail is provided by the caller because it's
 * a snapshot of the ring state when he decided to finish interrupt
 * processing -- it's possible that descriptors became available after
 * that point.  (It's also typically a constant 1/True as well.)
 */
void
vq_endchains(struct vqueue_info *vq, int used_all_avail)
{
	struct virtio_softc *vs;
	uint16_t event_idx, new_idx, old_idx;
	int intr;

	/*
	 * Interrupt generation: if we're using EVENT_IDX,
	 * interrupt if we've crossed the event threshold.
	 * Otherwise interrupt is generated if we added "used" entries,
	 * but suppressed by VRING_AVAIL_F_NO_INTERRUPT.
	 *
	 * In any case, though, if NOTIFY_ON_EMPTY is set and the
	 * entire avail was processed, we need to interrupt always.
	 */
	vs = vq->vq_vs;
	old_idx = vq->vq_save_used;
	vq->vq_save_used = new_idx = vq->vq_used->idx;

	/*
	 * Use full memory barrier between "idx" store from preceding
	 * vq_relchain() call and the loads from VQ_USED_EVENT_IDX() or
	 * "flags" field below.
	 */
	atomic_thread_fence_seq_cst();
	if (used_all_avail &&
	    (vs->vs_negotiated_caps & VIRTIO_F_NOTIFY_ON_EMPTY))
		intr = 1;
	else if (vs->vs_negotiated_caps & VIRTIO_RING_F_EVENT_IDX) {
		event_idx = VQ_USED_EVENT_IDX(vq);
		/*
		 * This calculation is per docs and the kernel
		 * (see src/sys/dev/virtio/virtio_ring.h).
		 */
		intr = (uint16_t)(new_idx - event_idx - 1) <
			(uint16_t)(new_idx - old_idx);
	} else {
		intr = new_idx != old_idx &&
		    !(vq->vq_avail->flags & VRING_AVAIL_F_NO_INTERRUPT);
	}
	if (intr)
		vq_interrupt(vs, vq);
}
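
/*
 * Worked example of the EVENT_IDX test above (illustrative numbers):
 * suppose old_idx == 10 and three chains were just published, so
 * new_idx == 13.  If the guest set used_event to 11, then
 *
 *	(uint16_t)(13 - 11 - 1) == 1  <  (uint16_t)(13 - 10) == 3
 *
 * and the interrupt is delivered.  If used_event were 13, the left-hand
 * side wraps to 65535 and no interrupt is sent: the guest asked to be
 * notified only once the used index moves past 13.
 */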

/* Note: these are in sorted order to make for a fast search */
static struct config_reg {
	uint16_t	cr_offset;	/* register offset */
	uint8_t		cr_size;	/* size (bytes) */
	uint8_t		cr_ro;		/* true => reg is read only */
	const char	*cr_name;	/* name of reg */
} config_regs[] = {
	{ VIRTIO_PCI_HOST_FEATURES,	4, 1, "HOST_FEATURES" },
	{ VIRTIO_PCI_GUEST_FEATURES,	4, 0, "GUEST_FEATURES" },
	{ VIRTIO_PCI_QUEUE_PFN,		4, 0, "QUEUE_PFN" },
	{ VIRTIO_PCI_QUEUE_NUM,		2, 1, "QUEUE_NUM" },
	{ VIRTIO_PCI_QUEUE_SEL,		2, 0, "QUEUE_SEL" },
	{ VIRTIO_PCI_QUEUE_NOTIFY,	2, 0, "QUEUE_NOTIFY" },
	{ VIRTIO_PCI_STATUS,		1, 0, "STATUS" },
	{ VIRTIO_PCI_ISR,		1, 0, "ISR" },
	{ VIRTIO_MSI_CONFIG_VECTOR,	2, 0, "CONFIG_VECTOR" },
	{ VIRTIO_MSI_QUEUE_VECTOR,	2, 0, "QUEUE_VECTOR" },
};

static inline struct config_reg *
vi_find_cr(int offset)
{
	u_int hi, lo, mid;
	struct config_reg *cr;

	lo = 0;
	hi = sizeof(config_regs) / sizeof(*config_regs) - 1;
	while (hi >= lo) {
		mid = (hi + lo) >> 1;
		cr = &config_regs[mid];
		if (cr->cr_offset == offset)
			return (cr);
		if (cr->cr_offset < offset)
			lo = mid + 1;
		else
			hi = mid - 1;
	}
	return (NULL);
}

/*
 * Handle pci config space reads.
 * If it's to the MSI-X info, do that.
 * If it's part of the virtio standard stuff, do that.
 * Otherwise dispatch to the actual driver.
 */
uint64_t
vi_pci_read(struct pci_devinst *pi, int baridx, uint64_t offset, int size)
{
	struct virtio_softc *vs = pi->pi_arg;
	struct virtio_consts *vc;
	struct config_reg *cr;
	uint64_t virtio_config_size, max;
	const char *name;
	uint32_t newoff;
	uint32_t value;
	int error;

	if (vs->vs_flags & VIRTIO_USE_MSIX) {
		if (baridx == pci_msix_table_bar(pi) ||
		    baridx == pci_msix_pba_bar(pi)) {
			return (pci_emul_msix_tread(pi, offset, size));
		}
	}

	/* XXX probably should do something better than just assert() */
	assert(baridx == 0);

	if (vs->vs_mtx)
		pthread_mutex_lock(vs->vs_mtx);

	vc = vs->vs_vc;
	name = vc->vc_name;
	value = size == 1 ? 0xff : size == 2 ? 0xffff : 0xffffffff;

	if (size != 1 && size != 2 && size != 4)
		goto bad;

	virtio_config_size = VIRTIO_PCI_CONFIG_OFF(pci_msix_enabled(pi));

	if (offset >= virtio_config_size) {
		/*
		 * Subtract off the standard size (including MSI-X
		 * registers if enabled) and dispatch to underlying driver.
		 * If that fails, fall into general code.
		 */
		newoff = offset - virtio_config_size;
		max = vc->vc_cfgsize ? vc->vc_cfgsize : 0x100000000;
		if (newoff + size > max)
			goto bad;
		if (vc->vc_cfgread != NULL)
			error = (*vc->vc_cfgread)(DEV_SOFTC(vs), newoff, size, &value);
		else
			error = 0;
		if (!error)
			goto done;
	}

bad:
	cr = vi_find_cr(offset);
	if (cr == NULL || cr->cr_size != size) {
		if (cr != NULL) {
			/* offset must be OK, so size must be bad */
			EPRINTLN(
			    "%s: read from %s: bad size %d",
			    name, cr->cr_name, size);
		} else {
			EPRINTLN(
			    "%s: read from bad offset/size %jd/%d",
			    name, (uintmax_t)offset, size);
		}
		goto done;
	}

	switch (offset) {
	case VIRTIO_PCI_HOST_FEATURES:
		value = vc->vc_hv_caps;
		break;
	case VIRTIO_PCI_GUEST_FEATURES:
		value = vs->vs_negotiated_caps;
		break;
	case VIRTIO_PCI_QUEUE_PFN:
		if (vs->vs_curq < vc->vc_nvq)
			value = vs->vs_queues[vs->vs_curq].vq_pfn;
		break;
	case VIRTIO_PCI_QUEUE_NUM:
		value = vs->vs_curq < vc->vc_nvq ?
		    vs->vs_queues[vs->vs_curq].vq_qsize : 0;
		break;
	case VIRTIO_PCI_QUEUE_SEL:
		value = vs->vs_curq;
		break;
	case VIRTIO_PCI_QUEUE_NOTIFY:
		value = 0;	/* XXX */
		break;
	case VIRTIO_PCI_STATUS:
		value = vs->vs_status;
		break;
	case VIRTIO_PCI_ISR:
		value = vs->vs_isr;
		vs->vs_isr = 0;		/* a read clears this flag */
		if (value)
			pci_lintr_deassert(pi);
		break;
	case VIRTIO_MSI_CONFIG_VECTOR:
		value = vs->vs_msix_cfg_idx;
		break;
	case VIRTIO_MSI_QUEUE_VECTOR:
		value = vs->vs_curq < vc->vc_nvq ?
		    vs->vs_queues[vs->vs_curq].vq_msix_idx :
		    VIRTIO_MSI_NO_VECTOR;
		break;
	}
done:
	if (vs->vs_mtx)
		pthread_mutex_unlock(vs->vs_mtx);
	return (value);
}

/*
 * Handle pci config space writes.
 * If it's to the MSI-X info, do that.
 * If it's part of the virtio standard stuff, do that.
 * Otherwise dispatch to the actual driver.
 */
void
vi_pci_write(struct pci_devinst *pi, int baridx, uint64_t offset, int size,
    uint64_t value)
{
	struct virtio_softc *vs = pi->pi_arg;
	struct vqueue_info *vq;
	struct virtio_consts *vc;
	struct config_reg *cr;
	uint64_t virtio_config_size, max;
	const char *name;
	uint32_t newoff;
	int error;

	if (vs->vs_flags & VIRTIO_USE_MSIX) {
		if (baridx == pci_msix_table_bar(pi) ||
		    baridx == pci_msix_pba_bar(pi)) {
			pci_emul_msix_twrite(pi, offset, size, value);
			return;
		}
	}

	/* XXX probably should do something better than just assert() */
	assert(baridx == 0);

	if (vs->vs_mtx)
		pthread_mutex_lock(vs->vs_mtx);

	vc = vs->vs_vc;
	name = vc->vc_name;

	if (size != 1 && size != 2 && size != 4)
		goto bad;

	virtio_config_size = VIRTIO_PCI_CONFIG_OFF(pci_msix_enabled(pi));

	if (offset >= virtio_config_size) {
		/*
		 * Subtract off the standard size (including MSI-X
		 * registers if enabled) and dispatch to underlying driver.
		 */
		newoff = offset - virtio_config_size;
		max = vc->vc_cfgsize ? vc->vc_cfgsize : 0x100000000;
		if (newoff + size > max)
			goto bad;
		if (vc->vc_cfgwrite != NULL)
			error = (*vc->vc_cfgwrite)(DEV_SOFTC(vs), newoff, size, value);
		else
			error = 0;
		if (!error)
			goto done;
	}

bad:
	cr = vi_find_cr(offset);
	if (cr == NULL || cr->cr_size != size || cr->cr_ro) {
		if (cr != NULL) {
			/* offset must be OK, wrong size and/or reg is R/O */
			if (cr->cr_size != size)
				EPRINTLN(
				    "%s: write to %s: bad size %d",
				    name, cr->cr_name, size);
			if (cr->cr_ro)
				EPRINTLN(
				    "%s: write to read-only reg %s",
				    name, cr->cr_name);
		} else {
			EPRINTLN(
			    "%s: write to bad offset/size %jd/%d",
			    name, (uintmax_t)offset, size);
		}
		goto done;
	}

	switch (offset) {
	case VIRTIO_PCI_GUEST_FEATURES:
		vs->vs_negotiated_caps = value & vc->vc_hv_caps;
		if (vc->vc_apply_features)
			(*vc->vc_apply_features)(DEV_SOFTC(vs),
			    vs->vs_negotiated_caps);
		break;
	case VIRTIO_PCI_QUEUE_PFN:
		if (vs->vs_curq >= vc->vc_nvq)
			goto bad_qindex;
		vi_vq_init(vs, value);
		break;
	case VIRTIO_PCI_QUEUE_SEL:
		/*
		 * Note that the guest is allowed to select an
		 * invalid queue; we just need to return a QNUM
		 * of 0 while the bad queue is selected.
		 */
		vs->vs_curq = value;
		break;
	case VIRTIO_PCI_QUEUE_NOTIFY:
		if (value >= (unsigned int)vc->vc_nvq) {
			EPRINTLN("%s: queue %d notify out of range",
				name, (int)value);
			goto done;
		}
		vq = &vs->vs_queues[value];
		if (vq->vq_notify)
			(*vq->vq_notify)(DEV_SOFTC(vs), vq);
		else if (vc->vc_qnotify)
			(*vc->vc_qnotify)(DEV_SOFTC(vs), vq);
		else
			EPRINTLN(
			    "%s: qnotify queue %d: missing vq/vc notify",
				name, (int)value);
		break;
	case VIRTIO_PCI_STATUS:
		vs->vs_status = value;
		if (value == 0)
			(*vc->vc_reset)(DEV_SOFTC(vs));
		break;
	case VIRTIO_MSI_CONFIG_VECTOR:
		vs->vs_msix_cfg_idx = value;
		break;
	case VIRTIO_MSI_QUEUE_VECTOR:
		if (vs->vs_curq >= vc->vc_nvq)
			goto bad_qindex;
		vq = &vs->vs_queues[vs->vs_curq];
		vq->vq_msix_idx = value;
		break;
	}
	goto done;

bad_qindex:
	EPRINTLN(
	    "%s: write config reg %s: curq %d >= max %d",
	    name, cr->cr_name, vs->vs_curq, vc->vc_nvq);
done:
	if (vs->vs_mtx)
		pthread_mutex_unlock(vs->vs_mtx);
}

#ifdef BHYVE_SNAPSHOT
int
vi_pci_pause(struct pci_devinst *pi)
{
	struct virtio_softc *vs;
	struct virtio_consts *vc;

	vs = pi->pi_arg;
	vc = vs->vs_vc;

	assert(vc->vc_pause != NULL);
	(*vc->vc_pause)(DEV_SOFTC(vs));

	return (0);
}

int
vi_pci_resume(struct pci_devinst *pi)
{
	struct virtio_softc *vs;
	struct virtio_consts *vc;

	vs = pi->pi_arg;
	vc = vs->vs_vc;

	assert(vc->vc_resume != NULL);
	(*vc->vc_resume)(DEV_SOFTC(vs));

	return (0);
}

static int
vi_pci_snapshot_softc(struct virtio_softc *vs, struct vm_snapshot_meta *meta)
{
	int ret;

	SNAPSHOT_VAR_OR_LEAVE(vs->vs_flags, meta, ret, done);
	SNAPSHOT_VAR_OR_LEAVE(vs->vs_negotiated_caps, meta, ret, done);
	SNAPSHOT_VAR_OR_LEAVE(vs->vs_curq, meta, ret, done);
	SNAPSHOT_VAR_OR_LEAVE(vs->vs_status, meta, ret, done);
	SNAPSHOT_VAR_OR_LEAVE(vs->vs_isr, meta, ret, done);
	SNAPSHOT_VAR_OR_LEAVE(vs->vs_msix_cfg_idx, meta, ret, done);

done:
	return (ret);
}

static int
vi_pci_snapshot_consts(struct virtio_consts *vc, struct vm_snapshot_meta *meta)
{
	int ret;

	SNAPSHOT_VAR_CMP_OR_LEAVE(vc->vc_nvq, meta, ret, done);
	SNAPSHOT_VAR_CMP_OR_LEAVE(vc->vc_cfgsize, meta, ret, done);
	SNAPSHOT_VAR_CMP_OR_LEAVE(vc->vc_hv_caps, meta, ret, done);

done:
	return (ret);
}

static int
vi_pci_snapshot_queues(struct virtio_softc *vs, struct vm_snapshot_meta *meta)
{
	int i;
	int ret;
	struct virtio_consts *vc;
	struct vqueue_info *vq;
	struct vmctx *ctx;
	uint64_t addr_size;

	ctx = vs->vs_pi->pi_vmctx;
	vc = vs->vs_vc;

	/* Save virtio queue info */
	for (i = 0; i < vc->vc_nvq; i++) {
		vq = &vs->vs_queues[i];

		SNAPSHOT_VAR_CMP_OR_LEAVE(vq->vq_qsize, meta, ret, done);
		SNAPSHOT_VAR_CMP_OR_LEAVE(vq->vq_num, meta, ret, done);

		SNAPSHOT_VAR_OR_LEAVE(vq->vq_flags, meta, ret, done);
		SNAPSHOT_VAR_OR_LEAVE(vq->vq_last_avail, meta, ret, done);
		SNAPSHOT_VAR_OR_LEAVE(vq->vq_next_used, meta, ret, done);
		SNAPSHOT_VAR_OR_LEAVE(vq->vq_save_used, meta, ret, done);
		SNAPSHOT_VAR_OR_LEAVE(vq->vq_msix_idx, meta, ret, done);

		SNAPSHOT_VAR_OR_LEAVE(vq->vq_pfn, meta, ret, done);

		if (!vq_ring_ready(vq))
			continue;

		addr_size = vq->vq_qsize * sizeof(struct vring_desc);
		SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(ctx, vq->vq_desc, addr_size,
			false, meta, ret, done);

		addr_size = (2 + vq->vq_qsize + 1) * sizeof(uint16_t);
		SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(ctx, vq->vq_avail, addr_size,
			false, meta, ret, done);

		addr_size = (2 + 2 * vq->vq_qsize + 1) * sizeof(uint16_t);
		SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(ctx, vq->vq_used, addr_size,
			false, meta, ret, done);

		SNAPSHOT_BUF_OR_LEAVE(vq->vq_desc,
			vring_size_aligned(vq->vq_qsize), meta, ret, done);
	}

done:
	return (ret);
}

int
vi_pci_snapshot(struct vm_snapshot_meta *meta)
{
	int ret;
	struct pci_devinst *pi;
	struct virtio_softc *vs;
	struct virtio_consts *vc;

	pi = meta->dev_data;
	vs = pi->pi_arg;
	vc = vs->vs_vc;

	/* Save virtio softc */
	ret = vi_pci_snapshot_softc(vs, meta);
	if (ret != 0)
		goto done;

	/* Save virtio consts */
	ret = vi_pci_snapshot_consts(vc, meta);
	if (ret != 0)
		goto done;

	/* Save virtio queue info */
	ret = vi_pci_snapshot_queues(vs, meta);
	if (ret != 0)
		goto done;

	/* Save device softc, if needed */
	if (vc->vc_snapshot != NULL) {
		ret = (*vc->vc_snapshot)(DEV_SOFTC(vs), meta);
		if (ret != 0)
			goto done;
	}

done:
	return (ret);
}
#endif