/*-
 * Copyright (c) 2011 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD: releng/11.0/usr.sbin/bhyve/pci_passthru.c 302365 2016-07-06 05:05:03Z ngie $
 */
28221828Sgrehan
29221828Sgrehan#include <sys/cdefs.h>
30221828Sgrehan__FBSDID("$FreeBSD: releng/11.0/usr.sbin/bhyve/pci_passthru.c 302365 2016-07-06 05:05:03Z ngie $");
31221828Sgrehan
32221828Sgrehan#include <sys/param.h>
33221828Sgrehan#include <sys/types.h>
34297932Sjhb#include <sys/mman.h>
35221828Sgrehan#include <sys/pciio.h>
36221828Sgrehan#include <sys/ioctl.h>
37221828Sgrehan
38221828Sgrehan#include <dev/io/iodev.h>
39245749Sneel#include <dev/pci/pcireg.h>
40245749Sneel
41221828Sgrehan#include <machine/iodev.h>
42221828Sgrehan
43221828Sgrehan#include <stdio.h>
44221828Sgrehan#include <stdlib.h>
45221828Sgrehan#include <string.h>
46298295Sjhb#include <err.h>
47221828Sgrehan#include <fcntl.h>
48221828Sgrehan#include <unistd.h>
49221828Sgrehan
50221828Sgrehan#include <machine/vmm.h>
51221828Sgrehan#include <vmmapi.h>
52221828Sgrehan#include "pci_emul.h"
53241744Sgrehan#include "mem.h"
54221828Sgrehan
/*
 * Host device nodes used by this file.  Each path may be overridden by
 * defining the macro before this point.
 */
#ifndef _PATH_DEVPCI
#define	_PATH_DEVPCI	"/dev/pci"
#endif

#ifndef	_PATH_DEVIO
#define	_PATH_DEVIO	"/dev/io"
#endif

#ifndef _PATH_MEM
#define	_PATH_MEM	"/dev/mem"
#endif

/*
 * When defined, a device without a native MSI capability gets an emulated
 * one crafted for it (see the LEGACY_SUPPORT sections below).
 */
#define	LEGACY_SUPPORT	1

/* Number of MSI-X table entries encoded in a message control value. */
#define MSIX_TABLE_COUNT(ctrl) (((ctrl) & PCIM_MSIXCTRL_TABLE_SIZE) + 1)
/* Number of config-space bytes occupied by the MSI-X capability. */
#define MSIX_CAPLEN 12

/*
 * Descriptors shared by all passthru instances; -1 until opened by
 * passthru_init().
 */
static int pcifd = -1;		/* _PATH_DEVPCI: PCIOC* config/BAR ioctls */
static int iofd = -1;		/* _PATH_DEVIO: IODEV_PIO port accesses */
static int memfd = -1;		/* _PATH_MEM: mmap of the MSI-X PBA page */
75221828Sgrehan
76221828Sgrehanstruct passthru_softc {
77221828Sgrehan	struct pci_devinst *psc_pi;
78221828Sgrehan	struct pcibar psc_bar[PCI_BARMAX + 1];
79221828Sgrehan	struct {
80221828Sgrehan		int		capoff;
81221828Sgrehan		int		msgctrl;
82221828Sgrehan		int		emulated;
83221828Sgrehan	} psc_msi;
84234761Sgrehan	struct {
85234761Sgrehan		int		capoff;
86234761Sgrehan	} psc_msix;
87221828Sgrehan	struct pcisel psc_sel;
88221828Sgrehan};
89221828Sgrehan
90221828Sgrehanstatic int
91221828Sgrehanmsi_caplen(int msgctrl)
92221828Sgrehan{
93221828Sgrehan	int len;
94221828Sgrehan
95221828Sgrehan	len = 10;		/* minimum length of msi capability */
96221828Sgrehan
97221828Sgrehan	if (msgctrl & PCIM_MSICTRL_64BIT)
98221828Sgrehan		len += 4;
99221828Sgrehan
100221828Sgrehan#if 0
101221828Sgrehan	/*
102221828Sgrehan	 * Ignore the 'mask' and 'pending' bits in the MSI capability.
103221828Sgrehan	 * We'll let the guest manipulate them directly.
104221828Sgrehan	 */
105221828Sgrehan	if (msgctrl & PCIM_MSICTRL_VECTOR)
106221828Sgrehan		len += 10;
107221828Sgrehan#endif
108221828Sgrehan
109221828Sgrehan	return (len);
110221828Sgrehan}
111221828Sgrehan
112221828Sgrehanstatic uint32_t
113221828Sgrehanread_config(const struct pcisel *sel, long reg, int width)
114221828Sgrehan{
115221828Sgrehan	struct pci_io pi;
116221828Sgrehan
117221828Sgrehan	bzero(&pi, sizeof(pi));
118221828Sgrehan	pi.pi_sel = *sel;
119221828Sgrehan	pi.pi_reg = reg;
120221828Sgrehan	pi.pi_width = width;
121221828Sgrehan
122221828Sgrehan	if (ioctl(pcifd, PCIOCREAD, &pi) < 0)
123221828Sgrehan		return (0);				/* XXX */
124221828Sgrehan	else
125221828Sgrehan		return (pi.pi_data);
126221828Sgrehan}
127221828Sgrehan
128221828Sgrehanstatic void
129221828Sgrehanwrite_config(const struct pcisel *sel, long reg, int width, uint32_t data)
130221828Sgrehan{
131221828Sgrehan	struct pci_io pi;
132221828Sgrehan
133221828Sgrehan	bzero(&pi, sizeof(pi));
134221828Sgrehan	pi.pi_sel = *sel;
135221828Sgrehan	pi.pi_reg = reg;
136221828Sgrehan	pi.pi_width = width;
137221828Sgrehan	pi.pi_data = data;
138221828Sgrehan
139221828Sgrehan	(void)ioctl(pcifd, PCIOCWRITE, &pi);		/* XXX */
140221828Sgrehan}
141221828Sgrehan
#ifdef LEGACY_SUPPORT
/*
 * Build an MSI capability for 'msgnum' messages whose next pointer is
 * 'nextptr', and store it byte by byte in the emulated config space of 'pi'.
 *
 * XXX
 * The capability is placed at the very end of the 256-byte config space
 * (its size rounded up to a multiple of 4).  This is wrong because it
 * could shadow something useful to the device.
 *
 * Returns the config-space offset at which the capability was stored.
 */
static int
passthru_add_msicap(struct pci_devinst *pi, int msgnum, int nextptr)
{
	struct msicap cap;
	const u_char *bytes;
	int base, n;

	pci_populate_msicap(&cap, msgnum, nextptr);

	base = 256 - roundup(sizeof(cap), 4);
	bytes = (const u_char *)&cap;
	for (n = 0; n < sizeof(cap); n++)
		pci_set_cfgdata8(pi, base + n, bytes[n]);

	return (base);
}
#endif	/* LEGACY_SUPPORT */
166221828Sgrehan
167221828Sgrehanstatic int
168221828Sgrehancfginitmsi(struct passthru_softc *sc)
169221828Sgrehan{
170245749Sneel	int i, ptr, capptr, cap, sts, caplen, table_size;
171221828Sgrehan	uint32_t u32;
172221828Sgrehan	struct pcisel sel;
173221828Sgrehan	struct pci_devinst *pi;
174234761Sgrehan	struct msixcap msixcap;
175234761Sgrehan	uint32_t *msixcap_ptr;
176221828Sgrehan
177221828Sgrehan	pi = sc->psc_pi;
178221828Sgrehan	sel = sc->psc_sel;
179221828Sgrehan
180221828Sgrehan	/*
181221828Sgrehan	 * Parse the capabilities and cache the location of the MSI
182234761Sgrehan	 * and MSI-X capabilities.
183221828Sgrehan	 */
184221828Sgrehan	sts = read_config(&sel, PCIR_STATUS, 2);
185221828Sgrehan	if (sts & PCIM_STATUS_CAPPRESENT) {
186221828Sgrehan		ptr = read_config(&sel, PCIR_CAP_PTR, 1);
187221828Sgrehan		while (ptr != 0 && ptr != 0xff) {
188221828Sgrehan			cap = read_config(&sel, ptr + PCICAP_ID, 1);
189221828Sgrehan			if (cap == PCIY_MSI) {
190221828Sgrehan				/*
191221828Sgrehan				 * Copy the MSI capability into the config
192221828Sgrehan				 * space of the emulated pci device
193221828Sgrehan				 */
194221828Sgrehan				sc->psc_msi.capoff = ptr;
195221828Sgrehan				sc->psc_msi.msgctrl = read_config(&sel,
196221828Sgrehan								  ptr + 2, 2);
197221828Sgrehan				sc->psc_msi.emulated = 0;
198221828Sgrehan				caplen = msi_caplen(sc->psc_msi.msgctrl);
199234761Sgrehan				capptr = ptr;
200221828Sgrehan				while (caplen > 0) {
201234761Sgrehan					u32 = read_config(&sel, capptr, 4);
202234761Sgrehan					pci_set_cfgdata32(pi, capptr, u32);
203221828Sgrehan					caplen -= 4;
204234761Sgrehan					capptr += 4;
205221828Sgrehan				}
206234761Sgrehan			} else if (cap == PCIY_MSIX) {
207234761Sgrehan				/*
208234761Sgrehan				 * Copy the MSI-X capability
209234761Sgrehan				 */
210234761Sgrehan				sc->psc_msix.capoff = ptr;
211234761Sgrehan				caplen = 12;
212234761Sgrehan				msixcap_ptr = (uint32_t*) &msixcap;
213234761Sgrehan				capptr = ptr;
214234761Sgrehan				while (caplen > 0) {
215234761Sgrehan					u32 = read_config(&sel, capptr, 4);
216234761Sgrehan					*msixcap_ptr = u32;
217234761Sgrehan					pci_set_cfgdata32(pi, capptr, u32);
218234761Sgrehan					caplen -= 4;
219234761Sgrehan					capptr += 4;
220234761Sgrehan					msixcap_ptr++;
221234761Sgrehan				}
222221828Sgrehan			}
223221828Sgrehan			ptr = read_config(&sel, ptr + PCICAP_NEXTPTR, 1);
224221828Sgrehan		}
225221828Sgrehan	}
226221828Sgrehan
227241744Sgrehan	if (sc->psc_msix.capoff != 0) {
228241744Sgrehan		pi->pi_msix.pba_bar =
229245749Sneel		    msixcap.pba_info & PCIM_MSIX_BIR_MASK;
230241744Sgrehan		pi->pi_msix.pba_offset =
231245749Sneel		    msixcap.pba_info & ~PCIM_MSIX_BIR_MASK;
232241744Sgrehan		pi->pi_msix.table_bar =
233245749Sneel		    msixcap.table_info & PCIM_MSIX_BIR_MASK;
234241744Sgrehan		pi->pi_msix.table_offset =
235245749Sneel		    msixcap.table_info & ~PCIM_MSIX_BIR_MASK;
236241744Sgrehan		pi->pi_msix.table_count = MSIX_TABLE_COUNT(msixcap.msgctrl);
237262184Sneel		pi->pi_msix.pba_size = PBA_SIZE(pi->pi_msix.table_count);
238245749Sneel
239245749Sneel		/* Allocate the emulated MSI-X table array */
240245749Sneel		table_size = pi->pi_msix.table_count * MSIX_TABLE_ENTRY_SIZE;
241264770Sdelphij		pi->pi_msix.table = calloc(1, table_size);
242245749Sneel
243245749Sneel		/* Mask all table entries */
244245749Sneel		for (i = 0; i < pi->pi_msix.table_count; i++) {
245245749Sneel			pi->pi_msix.table[i].vector_control |=
246245749Sneel						PCIM_MSIX_VCTRL_MASK;
247245749Sneel		}
248241744Sgrehan	}
249234761Sgrehan
250221828Sgrehan#ifdef LEGACY_SUPPORT
251221828Sgrehan	/*
252221828Sgrehan	 * If the passthrough device does not support MSI then craft a
253221828Sgrehan	 * MSI capability for it. We link the new MSI capability at the
254221828Sgrehan	 * head of the list of capabilities.
255221828Sgrehan	 */
256221828Sgrehan	if ((sts & PCIM_STATUS_CAPPRESENT) != 0 && sc->psc_msi.capoff == 0) {
257221828Sgrehan		int origptr, msiptr;
258221828Sgrehan		origptr = read_config(&sel, PCIR_CAP_PTR, 1);
259221828Sgrehan		msiptr = passthru_add_msicap(pi, 1, origptr);
260221828Sgrehan		sc->psc_msi.capoff = msiptr;
261221828Sgrehan		sc->psc_msi.msgctrl = pci_get_cfgdata16(pi, msiptr + 2);
262221828Sgrehan		sc->psc_msi.emulated = 1;
263221828Sgrehan		pci_set_cfgdata8(pi, PCIR_CAP_PTR, msiptr);
264221828Sgrehan	}
265221828Sgrehan#endif
266221828Sgrehan
267234761Sgrehan	/* Make sure one of the capabilities is present */
268234761Sgrehan	if (sc->psc_msi.capoff == 0 && sc->psc_msix.capoff == 0)
269221828Sgrehan		return (-1);
270221828Sgrehan	else
271221828Sgrehan		return (0);
272221828Sgrehan}
273221828Sgrehan
274241744Sgrehanstatic uint64_t
275241744Sgrehanmsix_table_read(struct passthru_softc *sc, uint64_t offset, int size)
276234761Sgrehan{
277234761Sgrehan	struct pci_devinst *pi;
278241744Sgrehan	struct msix_table_entry *entry;
279234761Sgrehan	uint8_t *src8;
280234761Sgrehan	uint16_t *src16;
281234761Sgrehan	uint32_t *src32;
282234761Sgrehan	uint64_t *src64;
283241744Sgrehan	uint64_t data;
284241744Sgrehan	size_t entry_offset;
285241744Sgrehan	int index;
286234761Sgrehan
287234761Sgrehan	pi = sc->psc_pi;
288297932Sjhb	if (offset >= pi->pi_msix.pba_offset &&
289297932Sjhb	    offset < pi->pi_msix.pba_offset + pi->pi_msix.pba_size) {
290297932Sjhb		switch(size) {
291297932Sjhb		case 1:
292297932Sjhb			src8 = (uint8_t *)(pi->pi_msix.pba_page + offset -
293297932Sjhb			    pi->pi_msix.pba_page_offset);
294297932Sjhb			data = *src8;
295297932Sjhb			break;
296297932Sjhb		case 2:
297297932Sjhb			src16 = (uint16_t *)(pi->pi_msix.pba_page + offset -
298297932Sjhb			    pi->pi_msix.pba_page_offset);
299297932Sjhb			data = *src16;
300297932Sjhb			break;
301297932Sjhb		case 4:
302297932Sjhb			src32 = (uint32_t *)(pi->pi_msix.pba_page + offset -
303297932Sjhb			    pi->pi_msix.pba_page_offset);
304297932Sjhb			data = *src32;
305297932Sjhb			break;
306297932Sjhb		case 8:
307297932Sjhb			src64 = (uint64_t *)(pi->pi_msix.pba_page + offset -
308297932Sjhb			    pi->pi_msix.pba_page_offset);
309297932Sjhb			data = *src64;
310297932Sjhb			break;
311297932Sjhb		default:
312297932Sjhb			return (-1);
313297932Sjhb		}
314297932Sjhb		return (data);
315297932Sjhb	}
316297932Sjhb
317262184Sneel	if (offset < pi->pi_msix.table_offset)
318262184Sneel		return (-1);
319262184Sneel
320248171Sneel	offset -= pi->pi_msix.table_offset;
321234761Sgrehan	index = offset / MSIX_TABLE_ENTRY_SIZE;
322245749Sneel	if (index >= pi->pi_msix.table_count)
323245749Sneel		return (-1);
324245749Sneel
325234761Sgrehan	entry = &pi->pi_msix.table[index];
326245749Sneel	entry_offset = offset % MSIX_TABLE_ENTRY_SIZE;
327234761Sgrehan
328234761Sgrehan	switch(size) {
329234761Sgrehan	case 1:
330241744Sgrehan		src8 = (uint8_t *)((void *)entry + entry_offset);
331241744Sgrehan		data = *src8;
332234761Sgrehan		break;
333234761Sgrehan	case 2:
334241744Sgrehan		src16 = (uint16_t *)((void *)entry + entry_offset);
335241744Sgrehan		data = *src16;
336234761Sgrehan		break;
337234761Sgrehan	case 4:
338241744Sgrehan		src32 = (uint32_t *)((void *)entry + entry_offset);
339241744Sgrehan		data = *src32;
340234761Sgrehan		break;
341234761Sgrehan	case 8:
342241744Sgrehan		src64 = (uint64_t *)((void *)entry + entry_offset);
343241744Sgrehan		data = *src64;
344234761Sgrehan		break;
345234761Sgrehan	default:
346234761Sgrehan		return (-1);
347234761Sgrehan	}
348234761Sgrehan
349241744Sgrehan	return (data);
350234761Sgrehan}
351234761Sgrehan
352241744Sgrehanstatic void
353241744Sgrehanmsix_table_write(struct vmctx *ctx, int vcpu, struct passthru_softc *sc,
354241744Sgrehan		 uint64_t offset, int size, uint64_t data)
355234761Sgrehan{
356234761Sgrehan	struct pci_devinst *pi;
357241744Sgrehan	struct msix_table_entry *entry;
358297932Sjhb	uint8_t *dest8;
359297932Sjhb	uint16_t *dest16;
360297932Sjhb	uint32_t *dest32;
361297932Sjhb	uint64_t *dest64;
362241744Sgrehan	size_t entry_offset;
363234761Sgrehan	uint32_t vector_control;
364302365Sngie	int index;
365234761Sgrehan
366234761Sgrehan	pi = sc->psc_pi;
367297932Sjhb	if (offset >= pi->pi_msix.pba_offset &&
368297932Sjhb	    offset < pi->pi_msix.pba_offset + pi->pi_msix.pba_size) {
369297932Sjhb		switch(size) {
370297932Sjhb		case 1:
371297932Sjhb			dest8 = (uint8_t *)(pi->pi_msix.pba_page + offset -
372297932Sjhb			    pi->pi_msix.pba_page_offset);
373297932Sjhb			*dest8 = data;
374297932Sjhb			break;
375297932Sjhb		case 2:
376297932Sjhb			dest16 = (uint16_t *)(pi->pi_msix.pba_page + offset -
377297932Sjhb			    pi->pi_msix.pba_page_offset);
378297932Sjhb			*dest16 = data;
379297932Sjhb			break;
380297932Sjhb		case 4:
381297932Sjhb			dest32 = (uint32_t *)(pi->pi_msix.pba_page + offset -
382297932Sjhb			    pi->pi_msix.pba_page_offset);
383297932Sjhb			*dest32 = data;
384297932Sjhb			break;
385297932Sjhb		case 8:
386297932Sjhb			dest64 = (uint64_t *)(pi->pi_msix.pba_page + offset -
387297932Sjhb			    pi->pi_msix.pba_page_offset);
388297932Sjhb			*dest64 = data;
389297932Sjhb			break;
390297932Sjhb		default:
391297932Sjhb			break;
392297932Sjhb		}
393297932Sjhb		return;
394297932Sjhb	}
395297932Sjhb
396262184Sneel	if (offset < pi->pi_msix.table_offset)
397262184Sneel		return;
398262184Sneel
399248171Sneel	offset -= pi->pi_msix.table_offset;
400234761Sgrehan	index = offset / MSIX_TABLE_ENTRY_SIZE;
401245749Sneel	if (index >= pi->pi_msix.table_count)
402245749Sneel		return;
403245749Sneel
404234761Sgrehan	entry = &pi->pi_msix.table[index];
405245749Sneel	entry_offset = offset % MSIX_TABLE_ENTRY_SIZE;
406234761Sgrehan
407234761Sgrehan	/* Only 4 byte naturally-aligned writes are supported */
408241744Sgrehan	assert(size == 4);
409241744Sgrehan	assert(entry_offset % 4 == 0);
410241744Sgrehan
411241744Sgrehan	vector_control = entry->vector_control;
412297932Sjhb	dest32 = (uint32_t *)((void *)entry + entry_offset);
413297932Sjhb	*dest32 = data;
414241744Sgrehan	/* If MSI-X hasn't been enabled, do nothing */
415241744Sgrehan	if (pi->pi_msix.enabled) {
416241744Sgrehan		/* If the entry is masked, don't set it up */
417241744Sgrehan		if ((entry->vector_control & PCIM_MSIX_VCTRL_MASK) == 0 ||
418241744Sgrehan		    (vector_control & PCIM_MSIX_VCTRL_MASK) == 0) {
419302365Sngie			(void)vm_setup_pptdev_msix(ctx, vcpu,
420302365Sngie			    sc->psc_sel.pc_bus, sc->psc_sel.pc_dev,
421259537Sneel			    sc->psc_sel.pc_func, index, entry->addr,
422259537Sneel			    entry->msg_data, entry->vector_control);
423234761Sgrehan		}
424234761Sgrehan	}
425234761Sgrehan}
426234761Sgrehan
427234761Sgrehanstatic int
428234761Sgrehaninit_msix_table(struct vmctx *ctx, struct passthru_softc *sc, uint64_t base)
429234761Sgrehan{
430246191Sneel	int b, s, f;
431246191Sneel	int error, idx;
432262184Sneel	size_t len, remaining;
433262184Sneel	uint32_t table_size, table_offset;
434262184Sneel	uint32_t pba_size, pba_offset;
435234761Sgrehan	vm_paddr_t start;
436234761Sgrehan	struct pci_devinst *pi = sc->psc_pi;
437234761Sgrehan
438246190Sneel	assert(pci_msix_table_bar(pi) >= 0 && pci_msix_pba_bar(pi) >= 0);
439246190Sneel
440246191Sneel	b = sc->psc_sel.pc_bus;
441246191Sneel	s = sc->psc_sel.pc_dev;
442246191Sneel	f = sc->psc_sel.pc_func;
443246191Sneel
444234761Sgrehan	/*
445234761Sgrehan	 * If the MSI-X table BAR maps memory intended for
446234761Sgrehan	 * other uses, it is at least assured that the table
447234761Sgrehan	 * either resides in its own page within the region,
448234761Sgrehan	 * or it resides in a page shared with only the PBA.
449234761Sgrehan	 */
450262184Sneel	table_offset = rounddown2(pi->pi_msix.table_offset, 4096);
451241744Sgrehan
452262184Sneel	table_size = pi->pi_msix.table_offset - table_offset;
453262184Sneel	table_size += pi->pi_msix.table_count * MSIX_TABLE_ENTRY_SIZE;
454246191Sneel	table_size = roundup2(table_size, 4096);
455246191Sneel
456297932Sjhb	idx = pi->pi_msix.table_bar;
457297932Sjhb	start = pi->pi_bar[idx].addr;
458297932Sjhb	remaining = pi->pi_bar[idx].size;
459297932Sjhb
460262184Sneel	if (pi->pi_msix.pba_bar == pi->pi_msix.table_bar) {
461262184Sneel		pba_offset = pi->pi_msix.pba_offset;
462262184Sneel		pba_size = pi->pi_msix.pba_size;
463262184Sneel		if (pba_offset >= table_offset + table_size ||
464262184Sneel		    table_offset >= pba_offset + pba_size) {
465262184Sneel			/*
466297932Sjhb			 * If the PBA does not share a page with the MSI-x
467297932Sjhb			 * tables, no PBA emulation is required.
468262184Sneel			 */
469297932Sjhb			pi->pi_msix.pba_page = NULL;
470297932Sjhb			pi->pi_msix.pba_page_offset = 0;
471262184Sneel		} else {
472297932Sjhb			/*
473297932Sjhb			 * The PBA overlaps with either the first or last
474297932Sjhb			 * page of the MSI-X table region.  Map the
475297932Sjhb			 * appropriate page.
476297932Sjhb			 */
477297932Sjhb			if (pba_offset <= table_offset)
478297932Sjhb				pi->pi_msix.pba_page_offset = table_offset;
479297932Sjhb			else
480297932Sjhb				pi->pi_msix.pba_page_offset = table_offset +
481297932Sjhb				    table_size - 4096;
482297932Sjhb			pi->pi_msix.pba_page = mmap(NULL, 4096, PROT_READ |
483297932Sjhb			    PROT_WRITE, MAP_SHARED, memfd, start +
484297932Sjhb			    pi->pi_msix.pba_page_offset);
485297932Sjhb			if (pi->pi_msix.pba_page == MAP_FAILED) {
486298295Sjhb				warn(
487298295Sjhb			    "Failed to map PBA page for MSI-X on %d/%d/%d",
488298295Sjhb				    b, s, f);
489297932Sjhb				return (-1);
490297932Sjhb			}
491262184Sneel		}
492262184Sneel	}
493262184Sneel
494246191Sneel	/* Map everything before the MSI-X table */
495262184Sneel	if (table_offset > 0) {
496262184Sneel		len = table_offset;
497246191Sneel		error = vm_map_pptdev_mmio(ctx, b, s, f, start, len, base);
498246191Sneel		if (error)
499246191Sneel			return (error);
500246191Sneel
501246191Sneel		base += len;
502246191Sneel		start += len;
503246191Sneel		remaining -= len;
504234761Sgrehan	}
505246191Sneel
506246191Sneel	/* Skip the MSI-X table */
507246191Sneel	base += table_size;
508246191Sneel	start += table_size;
509246191Sneel	remaining -= table_size;
510246191Sneel
511246191Sneel	/* Map everything beyond the end of the MSI-X table */
512246191Sneel	if (remaining > 0) {
513246191Sneel		len = remaining;
514246191Sneel		error = vm_map_pptdev_mmio(ctx, b, s, f, start, len, base);
515246191Sneel		if (error)
516246191Sneel			return (error);
517246191Sneel	}
518246191Sneel
519246191Sneel	return (0);
520234761Sgrehan}
521234761Sgrehan
522234761Sgrehanstatic int
523221828Sgrehancfginitbar(struct vmctx *ctx, struct passthru_softc *sc)
524221828Sgrehan{
525221828Sgrehan	int i, error;
526221828Sgrehan	struct pci_devinst *pi;
527221828Sgrehan	struct pci_bar_io bar;
528221828Sgrehan	enum pcibar_type bartype;
529262184Sneel	uint64_t base, size;
530221828Sgrehan
531221828Sgrehan	pi = sc->psc_pi;
532221828Sgrehan
533221828Sgrehan	/*
534221828Sgrehan	 * Initialize BAR registers
535221828Sgrehan	 */
536221828Sgrehan	for (i = 0; i <= PCI_BARMAX; i++) {
537221828Sgrehan		bzero(&bar, sizeof(bar));
538221828Sgrehan		bar.pbi_sel = sc->psc_sel;
539221828Sgrehan		bar.pbi_reg = PCIR_BAR(i);
540221828Sgrehan
541221828Sgrehan		if (ioctl(pcifd, PCIOCGETBAR, &bar) < 0)
542221828Sgrehan			continue;
543221828Sgrehan
544221828Sgrehan		if (PCI_BAR_IO(bar.pbi_base)) {
545221828Sgrehan			bartype = PCIBAR_IO;
546221828Sgrehan			base = bar.pbi_base & PCIM_BAR_IO_BASE;
547221828Sgrehan		} else {
548221828Sgrehan			switch (bar.pbi_base & PCIM_BAR_MEM_TYPE) {
549221828Sgrehan			case PCIM_BAR_MEM_64:
550221828Sgrehan				bartype = PCIBAR_MEM64;
551221828Sgrehan				break;
552221828Sgrehan			default:
553221828Sgrehan				bartype = PCIBAR_MEM32;
554221828Sgrehan				break;
555221828Sgrehan			}
556221828Sgrehan			base = bar.pbi_base & PCIM_BAR_MEM_BASE;
557221828Sgrehan		}
558262184Sneel		size = bar.pbi_length;
559221828Sgrehan
560262184Sneel		if (bartype != PCIBAR_IO) {
561262184Sneel			if (((base | size) & PAGE_MASK) != 0) {
562298295Sjhb				warnx("passthru device %d/%d/%d BAR %d: "
563262184Sneel				    "base %#lx or size %#lx not page aligned\n",
564262184Sneel				    sc->psc_sel.pc_bus, sc->psc_sel.pc_dev,
565262184Sneel				    sc->psc_sel.pc_func, i, base, size);
566262184Sneel				return (-1);
567262184Sneel			}
568262184Sneel		}
569262184Sneel
570221828Sgrehan		/* Cache information about the "real" BAR */
571221828Sgrehan		sc->psc_bar[i].type = bartype;
572262184Sneel		sc->psc_bar[i].size = size;
573221828Sgrehan		sc->psc_bar[i].addr = base;
574221828Sgrehan
575221828Sgrehan		/* Allocate the BAR in the guest I/O or MMIO space */
576262184Sneel		error = pci_emul_alloc_pbar(pi, i, base, bartype, size);
577221828Sgrehan		if (error)
578221828Sgrehan			return (-1);
579221828Sgrehan
580234761Sgrehan		/* The MSI-X table needs special handling */
581246190Sneel		if (i == pci_msix_table_bar(pi)) {
582234761Sgrehan			error = init_msix_table(ctx, sc, base);
583234761Sgrehan			if (error)
584234761Sgrehan				return (-1);
585234761Sgrehan		} else if (bartype != PCIBAR_IO) {
586262184Sneel			/* Map the physical BAR in the guest MMIO space */
587221828Sgrehan			error = vm_map_pptdev_mmio(ctx, sc->psc_sel.pc_bus,
588221828Sgrehan				sc->psc_sel.pc_dev, sc->psc_sel.pc_func,
589221828Sgrehan				pi->pi_bar[i].addr, pi->pi_bar[i].size, base);
590221828Sgrehan			if (error)
591221828Sgrehan				return (-1);
592221828Sgrehan		}
593221828Sgrehan
594221828Sgrehan		/*
595221828Sgrehan		 * 64-bit BAR takes up two slots so skip the next one.
596221828Sgrehan		 */
597221828Sgrehan		if (bartype == PCIBAR_MEM64) {
598221828Sgrehan			i++;
599221828Sgrehan			assert(i <= PCI_BARMAX);
600221828Sgrehan			sc->psc_bar[i].type = PCIBAR_MEMHI64;
601221828Sgrehan		}
602221828Sgrehan	}
603221828Sgrehan	return (0);
604221828Sgrehan}
605221828Sgrehan
606221828Sgrehanstatic int
607221828Sgrehancfginit(struct vmctx *ctx, struct pci_devinst *pi, int bus, int slot, int func)
608221828Sgrehan{
609221828Sgrehan	int error;
610221828Sgrehan	struct passthru_softc *sc;
611221828Sgrehan
612221828Sgrehan	error = 1;
613221828Sgrehan	sc = pi->pi_arg;
614221828Sgrehan
615221828Sgrehan	bzero(&sc->psc_sel, sizeof(struct pcisel));
616221828Sgrehan	sc->psc_sel.pc_bus = bus;
617221828Sgrehan	sc->psc_sel.pc_dev = slot;
618221828Sgrehan	sc->psc_sel.pc_func = func;
619221828Sgrehan
620298295Sjhb	if (cfginitmsi(sc) != 0) {
621298295Sjhb		warnx("failed to initialize MSI for PCI %d/%d/%d",
622298295Sjhb		    bus, slot, func);
623234761Sgrehan		goto done;
624298295Sjhb	}
625234761Sgrehan
626298295Sjhb	if (cfginitbar(ctx, sc) != 0) {
627298295Sjhb		warnx("failed to initialize BARs for PCI %d/%d/%d",
628298295Sjhb		    bus, slot, func);
629221828Sgrehan		goto done;
630298295Sjhb	}
631221828Sgrehan
632221828Sgrehan	error = 0;				/* success */
633221828Sgrehandone:
634221828Sgrehan	return (error);
635221828Sgrehan}
636221828Sgrehan
637221828Sgrehanstatic int
638221828Sgrehanpassthru_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
639221828Sgrehan{
640284539Sneel	int bus, slot, func, error, memflags;
641221828Sgrehan	struct passthru_softc *sc;
642221828Sgrehan
643221828Sgrehan	sc = NULL;
644221828Sgrehan	error = 1;
645221828Sgrehan
646284539Sneel	memflags = vm_get_memflags(ctx);
647284539Sneel	if (!(memflags & VM_MEM_F_WIRED)) {
648298295Sjhb		warnx("passthru requires guest memory to be wired");
649284539Sneel		goto done;
650284539Sneel	}
651284539Sneel
652221828Sgrehan	if (pcifd < 0) {
653221828Sgrehan		pcifd = open(_PATH_DEVPCI, O_RDWR, 0);
654298295Sjhb		if (pcifd < 0) {
655298295Sjhb			warn("failed to open %s", _PATH_DEVPCI);
656221828Sgrehan			goto done;
657298295Sjhb		}
658221828Sgrehan	}
659221828Sgrehan
660221828Sgrehan	if (iofd < 0) {
661221828Sgrehan		iofd = open(_PATH_DEVIO, O_RDWR, 0);
662298295Sjhb		if (iofd < 0) {
663298295Sjhb			warn("failed to open %s", _PATH_DEVIO);
664221828Sgrehan			goto done;
665298295Sjhb		}
666221828Sgrehan	}
667221828Sgrehan
668297932Sjhb	if (memfd < 0) {
669297932Sjhb		memfd = open(_PATH_MEM, O_RDWR, 0);
670298295Sjhb		if (memfd < 0) {
671298295Sjhb			warn("failed to open %s", _PATH_MEM);
672297932Sjhb			goto done;
673298295Sjhb		}
674297932Sjhb	}
675297932Sjhb
676241744Sgrehan	if (opts == NULL ||
677298295Sjhb	    sscanf(opts, "%d/%d/%d", &bus, &slot, &func) != 3) {
678298295Sjhb		warnx("invalid passthru options");
679221828Sgrehan		goto done;
680298295Sjhb	}
681221828Sgrehan
682298295Sjhb	if (vm_assign_pptdev(ctx, bus, slot, func) != 0) {
683298295Sjhb		warnx("PCI device at %d/%d/%d is not using the ppt(4) driver",
684298295Sjhb		    bus, slot, func);
685221828Sgrehan		goto done;
686298295Sjhb	}
687221828Sgrehan
688264770Sdelphij	sc = calloc(1, sizeof(struct passthru_softc));
689221828Sgrehan
690221828Sgrehan	pi->pi_arg = sc;
691221828Sgrehan	sc->psc_pi = pi;
692221828Sgrehan
693221828Sgrehan	/* initialize config space */
694241744Sgrehan	if ((error = cfginit(ctx, pi, bus, slot, func)) != 0)
695221828Sgrehan		goto done;
696221828Sgrehan
697221828Sgrehan	error = 0;		/* success */
698221828Sgrehandone:
699221828Sgrehan	if (error) {
700221828Sgrehan		free(sc);
701221828Sgrehan		vm_unassign_pptdev(ctx, bus, slot, func);
702221828Sgrehan	}
703221828Sgrehan	return (error);
704221828Sgrehan}
705221828Sgrehan
706221828Sgrehanstatic int
707221828Sgrehanbar_access(int coff)
708221828Sgrehan{
709221828Sgrehan	if (coff >= PCIR_BAR(0) && coff < PCIR_BAR(PCI_BARMAX + 1))
710221828Sgrehan		return (1);
711221828Sgrehan	else
712221828Sgrehan		return (0);
713221828Sgrehan}
714221828Sgrehan
715221828Sgrehanstatic int
716221828Sgrehanmsicap_access(struct passthru_softc *sc, int coff)
717221828Sgrehan{
718221828Sgrehan	int caplen;
719221828Sgrehan
720221828Sgrehan	if (sc->psc_msi.capoff == 0)
721221828Sgrehan		return (0);
722221828Sgrehan
723221828Sgrehan	caplen = msi_caplen(sc->psc_msi.msgctrl);
724221828Sgrehan
725221828Sgrehan	if (coff >= sc->psc_msi.capoff && coff < sc->psc_msi.capoff + caplen)
726221828Sgrehan		return (1);
727221828Sgrehan	else
728221828Sgrehan		return (0);
729221828Sgrehan}
730221828Sgrehan
731234761Sgrehanstatic int
732234761Sgrehanmsixcap_access(struct passthru_softc *sc, int coff)
733234761Sgrehan{
734234761Sgrehan	if (sc->psc_msix.capoff == 0)
735234761Sgrehan		return (0);
736234761Sgrehan
737234761Sgrehan	return (coff >= sc->psc_msix.capoff &&
738234761Sgrehan	        coff < sc->psc_msix.capoff + MSIX_CAPLEN);
739234761Sgrehan}
740234761Sgrehan
741221828Sgrehanstatic int
742241744Sgrehanpassthru_cfgread(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
743241744Sgrehan		 int coff, int bytes, uint32_t *rv)
744221828Sgrehan{
745221828Sgrehan	struct passthru_softc *sc;
746221828Sgrehan
747221828Sgrehan	sc = pi->pi_arg;
748221828Sgrehan
749221828Sgrehan	/*
750221828Sgrehan	 * PCI BARs and MSI capability is emulated.
751221828Sgrehan	 */
752221828Sgrehan	if (bar_access(coff) || msicap_access(sc, coff))
753221828Sgrehan		return (-1);
754221828Sgrehan
755221828Sgrehan#ifdef LEGACY_SUPPORT
756221828Sgrehan	/*
757221828Sgrehan	 * Emulate PCIR_CAP_PTR if this device does not support MSI capability
758221828Sgrehan	 * natively.
759221828Sgrehan	 */
760221828Sgrehan	if (sc->psc_msi.emulated) {
761221828Sgrehan		if (coff >= PCIR_CAP_PTR && coff < PCIR_CAP_PTR + 4)
762221828Sgrehan			return (-1);
763221828Sgrehan	}
764221828Sgrehan#endif
765221828Sgrehan
766221828Sgrehan	/* Everything else just read from the device's config space */
767221828Sgrehan	*rv = read_config(&sc->psc_sel, coff, bytes);
768221828Sgrehan
769221828Sgrehan	return (0);
770221828Sgrehan}
771221828Sgrehan
772221828Sgrehanstatic int
773241744Sgrehanpassthru_cfgwrite(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
774241744Sgrehan		  int coff, int bytes, uint32_t val)
775221828Sgrehan{
776234761Sgrehan	int error, msix_table_entries, i;
777221828Sgrehan	struct passthru_softc *sc;
778221828Sgrehan
779221828Sgrehan	sc = pi->pi_arg;
780221828Sgrehan
781221828Sgrehan	/*
782221828Sgrehan	 * PCI BARs are emulated
783221828Sgrehan	 */
784221828Sgrehan	if (bar_access(coff))
785221828Sgrehan		return (-1);
786221828Sgrehan
787221828Sgrehan	/*
788221828Sgrehan	 * MSI capability is emulated
789221828Sgrehan	 */
790221828Sgrehan	if (msicap_access(sc, coff)) {
791221828Sgrehan		msicap_cfgwrite(pi, sc->psc_msi.capoff, coff, bytes, val);
792221828Sgrehan
793259537Sneel		error = vm_setup_pptdev_msi(ctx, vcpu, sc->psc_sel.pc_bus,
794259482Sneel			sc->psc_sel.pc_dev, sc->psc_sel.pc_func,
795259482Sneel			pi->pi_msi.addr, pi->pi_msi.msg_data,
796259482Sneel			pi->pi_msi.maxmsgnum);
797298295Sjhb		if (error != 0)
798298295Sjhb			err(1, "vm_setup_pptdev_msi");
799221828Sgrehan		return (0);
800221828Sgrehan	}
801221828Sgrehan
802234761Sgrehan	if (msixcap_access(sc, coff)) {
803234761Sgrehan		msixcap_cfgwrite(pi, sc->psc_msix.capoff, coff, bytes, val);
804234761Sgrehan		if (pi->pi_msix.enabled) {
805234761Sgrehan			msix_table_entries = pi->pi_msix.table_count;
806234761Sgrehan			for (i = 0; i < msix_table_entries; i++) {
807259537Sneel				error = vm_setup_pptdev_msix(ctx, vcpu,
808259482Sneel				    sc->psc_sel.pc_bus, sc->psc_sel.pc_dev,
809259482Sneel				    sc->psc_sel.pc_func, i,
810259482Sneel				    pi->pi_msix.table[i].addr,
811259482Sneel				    pi->pi_msix.table[i].msg_data,
812259482Sneel				    pi->pi_msix.table[i].vector_control);
813234761Sgrehan
814298295Sjhb				if (error)
815298295Sjhb					err(1, "vm_setup_pptdev_msix");
816234761Sgrehan			}
817234761Sgrehan		}
818234761Sgrehan		return (0);
819234761Sgrehan	}
820234761Sgrehan
821221828Sgrehan#ifdef LEGACY_SUPPORT
822221828Sgrehan	/*
823221828Sgrehan	 * If this device does not support MSI natively then we cannot let
824221828Sgrehan	 * the guest disable legacy interrupts from the device. It is the
825221828Sgrehan	 * legacy interrupt that is triggering the virtual MSI to the guest.
826221828Sgrehan	 */
827221828Sgrehan	if (sc->psc_msi.emulated && pci_msi_enabled(pi)) {
828221828Sgrehan		if (coff == PCIR_COMMAND && bytes == 2)
829221828Sgrehan			val &= ~PCIM_CMD_INTxDIS;
830221828Sgrehan	}
831221828Sgrehan#endif
832221828Sgrehan
833221828Sgrehan	write_config(&sc->psc_sel, coff, bytes, val);
834221828Sgrehan
835221828Sgrehan	return (0);
836221828Sgrehan}
837221828Sgrehan
838221828Sgrehanstatic void
839241744Sgrehanpassthru_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
840241744Sgrehan	       uint64_t offset, int size, uint64_t value)
841221828Sgrehan{
842221828Sgrehan	struct passthru_softc *sc;
843221828Sgrehan	struct iodev_pio_req pio;
844221828Sgrehan
845221828Sgrehan	sc = pi->pi_arg;
846221828Sgrehan
847246190Sneel	if (baridx == pci_msix_table_bar(pi)) {
848241744Sgrehan		msix_table_write(ctx, vcpu, sc, offset, size, value);
849241744Sgrehan	} else {
850241744Sgrehan		assert(pi->pi_bar[baridx].type == PCIBAR_IO);
851241744Sgrehan		bzero(&pio, sizeof(struct iodev_pio_req));
852241744Sgrehan		pio.access = IODEV_PIO_WRITE;
853241744Sgrehan		pio.port = sc->psc_bar[baridx].addr + offset;
854241744Sgrehan		pio.width = size;
855241744Sgrehan		pio.val = value;
856241744Sgrehan
857241744Sgrehan		(void)ioctl(iofd, IODEV_PIO, &pio);
858241744Sgrehan	}
859221828Sgrehan}
860221828Sgrehan
861241744Sgrehanstatic uint64_t
862241744Sgrehanpassthru_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
863241744Sgrehan	      uint64_t offset, int size)
864221828Sgrehan{
865221828Sgrehan	struct passthru_softc *sc;
866221828Sgrehan	struct iodev_pio_req pio;
867241744Sgrehan	uint64_t val;
868221828Sgrehan
869221828Sgrehan	sc = pi->pi_arg;
870221828Sgrehan
871246190Sneel	if (baridx == pci_msix_table_bar(pi)) {
872241744Sgrehan		val = msix_table_read(sc, offset, size);
873241744Sgrehan	} else {
874241744Sgrehan		assert(pi->pi_bar[baridx].type == PCIBAR_IO);
875241744Sgrehan		bzero(&pio, sizeof(struct iodev_pio_req));
876241744Sgrehan		pio.access = IODEV_PIO_READ;
877241744Sgrehan		pio.port = sc->psc_bar[baridx].addr + offset;
878241744Sgrehan		pio.width = size;
879241744Sgrehan		pio.val = 0;
880221828Sgrehan
881241744Sgrehan		(void)ioctl(iofd, IODEV_PIO, &pio);
882221828Sgrehan
883241744Sgrehan		val = pio.val;
884241744Sgrehan	}
885241744Sgrehan
886241744Sgrehan	return (val);
887221828Sgrehan}
888221828Sgrehan
889221828Sgrehanstruct pci_devemu passthru = {
890221828Sgrehan	.pe_emu		= "passthru",
891221828Sgrehan	.pe_init	= passthru_init,
892221828Sgrehan	.pe_cfgwrite	= passthru_cfgwrite,
893221828Sgrehan	.pe_cfgread	= passthru_cfgread,
894241744Sgrehan	.pe_barwrite 	= passthru_write,
895241744Sgrehan	.pe_barread    	= passthru_read,
896221828Sgrehan};
897221828SgrehanPCI_EMUL_SET(passthru);
898