ppt.c revision 347443
1/*-
2 * Copyright (c) 2011 NetApp, Inc.
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 *
26 * $FreeBSD: stable/11/sys/amd64/vmm/io/ppt.c 347443 2019-05-10 16:51:36Z jhb $
27 */
28
29#include <sys/cdefs.h>
30__FBSDID("$FreeBSD: stable/11/sys/amd64/vmm/io/ppt.c 347443 2019-05-10 16:51:36Z jhb $");
31
32#include <sys/param.h>
33#include <sys/systm.h>
34#include <sys/kernel.h>
35#include <sys/malloc.h>
36#include <sys/module.h>
37#include <sys/bus.h>
38#include <sys/pciio.h>
39#include <sys/rman.h>
40#include <sys/smp.h>
41#include <sys/sysctl.h>
42
43#include <dev/pci/pcivar.h>
44#include <dev/pci/pcireg.h>
45
46#include <machine/resource.h>
47
48#include <machine/vmm.h>
49#include <machine/vmm_dev.h>
50
51#include "vmm_lapic.h"
52#include "vmm_ktr.h"
53
54#include "iommu.h"
55#include "ppt.h"
56
57/* XXX locking */
58
59#define	MAX_MSIMSGS	32
60
61/*
62 * If the MSI-X table is located in the middle of a BAR then that MMIO
63 * region gets split into two segments - one segment above the MSI-X table
64 * and the other segment below the MSI-X table - with a hole in place of
65 * the MSI-X table so accesses to it can be trapped and emulated.
66 *
67 * So, allocate a MMIO segment for each BAR register + 1 additional segment.
68 */
69#define	MAX_MMIOSEGS	((PCIR_MAX_BAR_0 + 1) + 1)
70
71MALLOC_DEFINE(M_PPTMSIX, "pptmsix", "Passthru MSI-X resources");
72
73struct pptintr_arg {				/* pptintr(pptintr_arg) */
74	struct pptdev	*pptdev;
75	uint64_t	addr;
76	uint64_t	msg_data;
77};
78
79struct pptseg {
80	vm_paddr_t	gpa;
81	size_t		len;
82	int		wired;
83};
84
85struct pptdev {
86	device_t	dev;
87	struct vm	*vm;			/* owner of this device */
88	TAILQ_ENTRY(pptdev)	next;
89	struct pptseg mmio[MAX_MMIOSEGS];
90	struct {
91		int	num_msgs;		/* guest state */
92
93		int	startrid;		/* host state */
94		struct resource *res[MAX_MSIMSGS];
95		void	*cookie[MAX_MSIMSGS];
96		struct pptintr_arg arg[MAX_MSIMSGS];
97	} msi;
98
99	struct {
100		int num_msgs;
101		int startrid;
102		int msix_table_rid;
103		struct resource *msix_table_res;
104		struct resource **res;
105		void **cookie;
106		struct pptintr_arg *arg;
107	} msix;
108};
109
110SYSCTL_DECL(_hw_vmm);
111SYSCTL_NODE(_hw_vmm, OID_AUTO, ppt, CTLFLAG_RW, 0, "bhyve passthru devices");
112
113static int num_pptdevs;
114SYSCTL_INT(_hw_vmm_ppt, OID_AUTO, devices, CTLFLAG_RD, &num_pptdevs, 0,
115    "number of pci passthru devices");
116
117static TAILQ_HEAD(, pptdev) pptdev_list = TAILQ_HEAD_INITIALIZER(pptdev_list);
118
119static int
120ppt_probe(device_t dev)
121{
122	int bus, slot, func;
123	struct pci_devinfo *dinfo;
124
125	dinfo = (struct pci_devinfo *)device_get_ivars(dev);
126
127	bus = pci_get_bus(dev);
128	slot = pci_get_slot(dev);
129	func = pci_get_function(dev);
130
131	/*
132	 * To qualify as a pci passthrough device a device must:
133	 * - be allowed by administrator to be used in this role
134	 * - be an endpoint device
135	 */
136	if ((dinfo->cfg.hdrtype & PCIM_HDRTYPE) != PCIM_HDRTYPE_NORMAL)
137		return (ENXIO);
138	else if (vmm_is_pptdev(bus, slot, func))
139		return (0);
140	else
141		/*
142		 * Returning BUS_PROBE_NOWILDCARD here matches devices that the
143		 * SR-IOV infrastructure specified as "ppt" passthrough devices.
144		 * All normal devices that did not have "ppt" specified as their
145		 * driver will not be matched by this.
146		 */
147		return (BUS_PROBE_NOWILDCARD);
148}
149
150static int
151ppt_attach(device_t dev)
152{
153	struct pptdev *ppt;
154
155	ppt = device_get_softc(dev);
156
157	iommu_remove_device(iommu_host_domain(), pci_get_rid(dev));
158	num_pptdevs++;
159	TAILQ_INSERT_TAIL(&pptdev_list, ppt, next);
160	ppt->dev = dev;
161
162	if (bootverbose)
163		device_printf(dev, "attached\n");
164
165	return (0);
166}
167
168static int
169ppt_detach(device_t dev)
170{
171	struct pptdev *ppt;
172
173	ppt = device_get_softc(dev);
174
175	if (ppt->vm != NULL)
176		return (EBUSY);
177	num_pptdevs--;
178	TAILQ_REMOVE(&pptdev_list, ppt, next);
179	pci_disable_busmaster(dev);
180	iommu_add_device(iommu_host_domain(), pci_get_rid(dev));
181
182	return (0);
183}
184
185static device_method_t ppt_methods[] = {
186	/* Device interface */
187	DEVMETHOD(device_probe,		ppt_probe),
188	DEVMETHOD(device_attach,	ppt_attach),
189	DEVMETHOD(device_detach,	ppt_detach),
190	{0, 0}
191};
192
193static devclass_t ppt_devclass;
194DEFINE_CLASS_0(ppt, ppt_driver, ppt_methods, sizeof(struct pptdev));
195DRIVER_MODULE(ppt, pci, ppt_driver, ppt_devclass, NULL, NULL);
196
197static struct pptdev *
198ppt_find(int bus, int slot, int func)
199{
200	device_t dev;
201	struct pptdev *ppt;
202	int b, s, f;
203
204	TAILQ_FOREACH(ppt, &pptdev_list, next) {
205		dev = ppt->dev;
206		b = pci_get_bus(dev);
207		s = pci_get_slot(dev);
208		f = pci_get_function(dev);
209		if (bus == b && slot == s && func == f)
210			return (ppt);
211	}
212	return (NULL);
213}
214
215static void
216ppt_unmap_mmio(struct vm *vm, struct pptdev *ppt)
217{
218	int i;
219	struct pptseg *seg;
220
221	for (i = 0; i < MAX_MMIOSEGS; i++) {
222		seg = &ppt->mmio[i];
223		if (seg->len == 0)
224			continue;
225		(void)vm_unmap_mmio(vm, seg->gpa, seg->len);
226		bzero(seg, sizeof(struct pptseg));
227	}
228}
229
230static void
231ppt_teardown_msi(struct pptdev *ppt)
232{
233	int i, rid;
234	void *cookie;
235	struct resource *res;
236
237	if (ppt->msi.num_msgs == 0)
238		return;
239
240	for (i = 0; i < ppt->msi.num_msgs; i++) {
241		rid = ppt->msi.startrid + i;
242		res = ppt->msi.res[i];
243		cookie = ppt->msi.cookie[i];
244
245		if (cookie != NULL)
246			bus_teardown_intr(ppt->dev, res, cookie);
247
248		if (res != NULL)
249			bus_release_resource(ppt->dev, SYS_RES_IRQ, rid, res);
250
251		ppt->msi.res[i] = NULL;
252		ppt->msi.cookie[i] = NULL;
253	}
254
255	if (ppt->msi.startrid == 1)
256		pci_release_msi(ppt->dev);
257
258	ppt->msi.num_msgs = 0;
259}
260
261static void
262ppt_teardown_msix_intr(struct pptdev *ppt, int idx)
263{
264	int rid;
265	struct resource *res;
266	void *cookie;
267
268	rid = ppt->msix.startrid + idx;
269	res = ppt->msix.res[idx];
270	cookie = ppt->msix.cookie[idx];
271
272	if (cookie != NULL)
273		bus_teardown_intr(ppt->dev, res, cookie);
274
275	if (res != NULL)
276		bus_release_resource(ppt->dev, SYS_RES_IRQ, rid, res);
277
278	ppt->msix.res[idx] = NULL;
279	ppt->msix.cookie[idx] = NULL;
280}
281
282static void
283ppt_teardown_msix(struct pptdev *ppt)
284{
285	int i;
286
287	if (ppt->msix.num_msgs == 0)
288		return;
289
290	for (i = 0; i < ppt->msix.num_msgs; i++)
291		ppt_teardown_msix_intr(ppt, i);
292
293	if (ppt->msix.msix_table_res) {
294		bus_release_resource(ppt->dev, SYS_RES_MEMORY,
295				     ppt->msix.msix_table_rid,
296				     ppt->msix.msix_table_res);
297		ppt->msix.msix_table_res = NULL;
298		ppt->msix.msix_table_rid = 0;
299	}
300
301	free(ppt->msix.res, M_PPTMSIX);
302	free(ppt->msix.cookie, M_PPTMSIX);
303	free(ppt->msix.arg, M_PPTMSIX);
304
305	pci_release_msi(ppt->dev);
306
307	ppt->msix.num_msgs = 0;
308}
309
310int
311ppt_avail_devices(void)
312{
313
314	return (num_pptdevs);
315}
316
317int
318ppt_assigned_devices(struct vm *vm)
319{
320	struct pptdev *ppt;
321	int num;
322
323	num = 0;
324	TAILQ_FOREACH(ppt, &pptdev_list, next) {
325		if (ppt->vm == vm)
326			num++;
327	}
328	return (num);
329}
330
331boolean_t
332ppt_is_mmio(struct vm *vm, vm_paddr_t gpa)
333{
334	int i;
335	struct pptdev *ppt;
336	struct pptseg *seg;
337
338	TAILQ_FOREACH(ppt, &pptdev_list, next) {
339		if (ppt->vm != vm)
340			continue;
341
342		for (i = 0; i < MAX_MMIOSEGS; i++) {
343			seg = &ppt->mmio[i];
344			if (seg->len == 0)
345				continue;
346			if (gpa >= seg->gpa && gpa < seg->gpa + seg->len)
347				return (TRUE);
348		}
349	}
350
351	return (FALSE);
352}
353
354static void
355ppt_pci_reset(device_t dev)
356{
357
358	if (pcie_flr(dev,
359	     max(pcie_get_max_completion_timeout(dev) / 1000, 10), true))
360		return;
361
362	pci_power_reset(dev);
363}
364
365int
366ppt_assign_device(struct vm *vm, int bus, int slot, int func)
367{
368	struct pptdev *ppt;
369
370	ppt = ppt_find(bus, slot, func);
371	if (ppt != NULL) {
372		/*
373		 * If this device is owned by a different VM then we
374		 * cannot change its owner.
375		 */
376		if (ppt->vm != NULL && ppt->vm != vm)
377			return (EBUSY);
378
379		pci_save_state(ppt->dev);
380		ppt_pci_reset(ppt->dev);
381		pci_restore_state(ppt->dev);
382		ppt->vm = vm;
383		iommu_add_device(vm_iommu_domain(vm), pci_get_rid(ppt->dev));
384		return (0);
385	}
386	return (ENOENT);
387}
388
389int
390ppt_unassign_device(struct vm *vm, int bus, int slot, int func)
391{
392	struct pptdev *ppt;
393
394	ppt = ppt_find(bus, slot, func);
395	if (ppt != NULL) {
396		/*
397		 * If this device is not owned by this 'vm' then bail out.
398		 */
399		if (ppt->vm != vm)
400			return (EBUSY);
401
402		pci_save_state(ppt->dev);
403		ppt_pci_reset(ppt->dev);
404		pci_restore_state(ppt->dev);
405		ppt_unmap_mmio(vm, ppt);
406		ppt_teardown_msi(ppt);
407		ppt_teardown_msix(ppt);
408		iommu_remove_device(vm_iommu_domain(vm), pci_get_rid(ppt->dev));
409		ppt->vm = NULL;
410		return (0);
411	}
412	return (ENOENT);
413}
414
415int
416ppt_unassign_all(struct vm *vm)
417{
418	struct pptdev *ppt;
419	int bus, slot, func;
420	device_t dev;
421
422	TAILQ_FOREACH(ppt, &pptdev_list, next) {
423		if (ppt->vm == vm) {
424			dev = ppt->dev;
425			bus = pci_get_bus(dev);
426			slot = pci_get_slot(dev);
427			func = pci_get_function(dev);
428			vm_unassign_pptdev(vm, bus, slot, func);
429		}
430	}
431
432	return (0);
433}
434
435int
436ppt_map_mmio(struct vm *vm, int bus, int slot, int func,
437	     vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
438{
439	int i, error;
440	struct pptseg *seg;
441	struct pptdev *ppt;
442
443	ppt = ppt_find(bus, slot, func);
444	if (ppt != NULL) {
445		if (ppt->vm != vm)
446			return (EBUSY);
447
448		for (i = 0; i < MAX_MMIOSEGS; i++) {
449			seg = &ppt->mmio[i];
450			if (seg->len == 0) {
451				error = vm_map_mmio(vm, gpa, len, hpa);
452				if (error == 0) {
453					seg->gpa = gpa;
454					seg->len = len;
455				}
456				return (error);
457			}
458		}
459		return (ENOSPC);
460	}
461	return (ENOENT);
462}
463
464static int
465pptintr(void *arg)
466{
467	struct pptdev *ppt;
468	struct pptintr_arg *pptarg;
469
470	pptarg = arg;
471	ppt = pptarg->pptdev;
472
473	if (ppt->vm != NULL)
474		lapic_intr_msi(ppt->vm, pptarg->addr, pptarg->msg_data);
475	else {
476		/*
477		 * XXX
478		 * This is not expected to happen - panic?
479		 */
480	}
481
482	/*
483	 * For legacy interrupts give other filters a chance in case
484	 * the interrupt was not generated by the passthrough device.
485	 */
486	if (ppt->msi.startrid == 0)
487		return (FILTER_STRAY);
488	else
489		return (FILTER_HANDLED);
490}
491
492int
493ppt_setup_msi(struct vm *vm, int vcpu, int bus, int slot, int func,
494	      uint64_t addr, uint64_t msg, int numvec)
495{
496	int i, rid, flags;
497	int msi_count, startrid, error, tmp;
498	struct pptdev *ppt;
499
500	if (numvec < 0 || numvec > MAX_MSIMSGS)
501		return (EINVAL);
502
503	ppt = ppt_find(bus, slot, func);
504	if (ppt == NULL)
505		return (ENOENT);
506	if (ppt->vm != vm)		/* Make sure we own this device */
507		return (EBUSY);
508
509	/* Free any allocated resources */
510	ppt_teardown_msi(ppt);
511
512	if (numvec == 0)		/* nothing more to do */
513		return (0);
514
515	flags = RF_ACTIVE;
516	msi_count = pci_msi_count(ppt->dev);
517	if (msi_count == 0) {
518		startrid = 0;		/* legacy interrupt */
519		msi_count = 1;
520		flags |= RF_SHAREABLE;
521	} else
522		startrid = 1;		/* MSI */
523
524	/*
525	 * The device must be capable of supporting the number of vectors
526	 * the guest wants to allocate.
527	 */
528	if (numvec > msi_count)
529		return (EINVAL);
530
531	/*
532	 * Make sure that we can allocate all the MSI vectors that are needed
533	 * by the guest.
534	 */
535	if (startrid == 1) {
536		tmp = numvec;
537		error = pci_alloc_msi(ppt->dev, &tmp);
538		if (error)
539			return (error);
540		else if (tmp != numvec) {
541			pci_release_msi(ppt->dev);
542			return (ENOSPC);
543		} else {
544			/* success */
545		}
546	}
547
548	ppt->msi.startrid = startrid;
549
550	/*
551	 * Allocate the irq resource and attach it to the interrupt handler.
552	 */
553	for (i = 0; i < numvec; i++) {
554		ppt->msi.num_msgs = i + 1;
555		ppt->msi.cookie[i] = NULL;
556
557		rid = startrid + i;
558		ppt->msi.res[i] = bus_alloc_resource_any(ppt->dev, SYS_RES_IRQ,
559							 &rid, flags);
560		if (ppt->msi.res[i] == NULL)
561			break;
562
563		ppt->msi.arg[i].pptdev = ppt;
564		ppt->msi.arg[i].addr = addr;
565		ppt->msi.arg[i].msg_data = msg + i;
566
567		error = bus_setup_intr(ppt->dev, ppt->msi.res[i],
568				       INTR_TYPE_NET | INTR_MPSAFE,
569				       pptintr, NULL, &ppt->msi.arg[i],
570				       &ppt->msi.cookie[i]);
571		if (error != 0)
572			break;
573	}
574
575	if (i < numvec) {
576		ppt_teardown_msi(ppt);
577		return (ENXIO);
578	}
579
580	return (0);
581}
582
583int
584ppt_setup_msix(struct vm *vm, int vcpu, int bus, int slot, int func,
585	       int idx, uint64_t addr, uint64_t msg, uint32_t vector_control)
586{
587	struct pptdev *ppt;
588	struct pci_devinfo *dinfo;
589	int numvec, alloced, rid, error;
590	size_t res_size, cookie_size, arg_size;
591
592	ppt = ppt_find(bus, slot, func);
593	if (ppt == NULL)
594		return (ENOENT);
595	if (ppt->vm != vm)		/* Make sure we own this device */
596		return (EBUSY);
597
598	dinfo = device_get_ivars(ppt->dev);
599	if (!dinfo)
600		return (ENXIO);
601
602	/*
603	 * First-time configuration:
604	 * 	Allocate the MSI-X table
605	 *	Allocate the IRQ resources
606	 *	Set up some variables in ppt->msix
607	 */
608	if (ppt->msix.num_msgs == 0) {
609		numvec = pci_msix_count(ppt->dev);
610		if (numvec <= 0)
611			return (EINVAL);
612
613		ppt->msix.startrid = 1;
614		ppt->msix.num_msgs = numvec;
615
616		res_size = numvec * sizeof(ppt->msix.res[0]);
617		cookie_size = numvec * sizeof(ppt->msix.cookie[0]);
618		arg_size = numvec * sizeof(ppt->msix.arg[0]);
619
620		ppt->msix.res = malloc(res_size, M_PPTMSIX, M_WAITOK | M_ZERO);
621		ppt->msix.cookie = malloc(cookie_size, M_PPTMSIX,
622					  M_WAITOK | M_ZERO);
623		ppt->msix.arg = malloc(arg_size, M_PPTMSIX, M_WAITOK | M_ZERO);
624
625		rid = dinfo->cfg.msix.msix_table_bar;
626		ppt->msix.msix_table_res = bus_alloc_resource_any(ppt->dev,
627					       SYS_RES_MEMORY, &rid, RF_ACTIVE);
628
629		if (ppt->msix.msix_table_res == NULL) {
630			ppt_teardown_msix(ppt);
631			return (ENOSPC);
632		}
633		ppt->msix.msix_table_rid = rid;
634
635		alloced = numvec;
636		error = pci_alloc_msix(ppt->dev, &alloced);
637		if (error || alloced != numvec) {
638			ppt_teardown_msix(ppt);
639			return (error == 0 ? ENOSPC: error);
640		}
641	}
642
643	if ((vector_control & PCIM_MSIX_VCTRL_MASK) == 0) {
644		/* Tear down the IRQ if it's already set up */
645		ppt_teardown_msix_intr(ppt, idx);
646
647		/* Allocate the IRQ resource */
648		ppt->msix.cookie[idx] = NULL;
649		rid = ppt->msix.startrid + idx;
650		ppt->msix.res[idx] = bus_alloc_resource_any(ppt->dev, SYS_RES_IRQ,
651							    &rid, RF_ACTIVE);
652		if (ppt->msix.res[idx] == NULL)
653			return (ENXIO);
654
655		ppt->msix.arg[idx].pptdev = ppt;
656		ppt->msix.arg[idx].addr = addr;
657		ppt->msix.arg[idx].msg_data = msg;
658
659		/* Setup the MSI-X interrupt */
660		error = bus_setup_intr(ppt->dev, ppt->msix.res[idx],
661				       INTR_TYPE_NET | INTR_MPSAFE,
662				       pptintr, NULL, &ppt->msix.arg[idx],
663				       &ppt->msix.cookie[idx]);
664
665		if (error != 0) {
666			bus_teardown_intr(ppt->dev, ppt->msix.res[idx], ppt->msix.cookie[idx]);
667			bus_release_resource(ppt->dev, SYS_RES_IRQ, rid, ppt->msix.res[idx]);
668			ppt->msix.cookie[idx] = NULL;
669			ppt->msix.res[idx] = NULL;
670			return (ENXIO);
671		}
672	} else {
673		/* Masked, tear it down if it's already been set up */
674		ppt_teardown_msix_intr(ppt, idx);
675	}
676
677	return (0);
678}
679