1/*-
2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3 *
4 * Copyright (c) 2011 NetApp, Inc.
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 *    notice, this list of conditions and the following disclaimer in the
14 *    documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 * SUCH DAMAGE.
27 *
28 * $FreeBSD$
29 */
30
31#include <sys/cdefs.h>
32__FBSDID("$FreeBSD$");
33
34#include <sys/param.h>
35#include <sys/systm.h>
36#include <sys/kernel.h>
37#include <sys/malloc.h>
38#include <sys/module.h>
39#include <sys/bus.h>
40#include <sys/pciio.h>
41#include <sys/rman.h>
42#include <sys/smp.h>
43#include <sys/sysctl.h>
44
45#include <dev/pci/pcivar.h>
46#include <dev/pci/pcireg.h>
47
48#include <machine/resource.h>
49
50#include <machine/vmm.h>
51#include <machine/vmm_dev.h>
52
53#include "vmm_lapic.h"
54#include "vmm_ktr.h"
55
56#include "iommu.h"
57#include "ppt.h"
58
59/* XXX locking */
60
61#define	MAX_MSIMSGS	32
62
63/*
64 * If the MSI-X table is located in the middle of a BAR then that MMIO
65 * region gets split into two segments - one segment above the MSI-X table
66 * and the other segment below the MSI-X table - with a hole in place of
67 * the MSI-X table so accesses to it can be trapped and emulated.
68 *
69 * So, allocate a MMIO segment for each BAR register + 1 additional segment.
70 */
71#define	MAX_MMIOSEGS	((PCIR_MAX_BAR_0 + 1) + 1)
72
73MALLOC_DEFINE(M_PPTMSIX, "pptmsix", "Passthru MSI-X resources");
74
/*
 * Argument handed to pptintr() for one interrupt source: the owning
 * device together with the MSI address/data pair that pptintr() forwards
 * to lapic_intr_msi().
 */
struct pptintr_arg {
	struct pptdev	*pptdev;	/* device this interrupt belongs to */
	uint64_t	addr;		/* MSI address for the guest */
	uint64_t	msg_data;	/* MSI data for the guest */
};
80
81struct pptseg {
82	vm_paddr_t	gpa;
83	size_t		len;
84	int		wired;
85};
86
87struct pptdev {
88	device_t	dev;
89	struct vm	*vm;			/* owner of this device */
90	TAILQ_ENTRY(pptdev)	next;
91	struct pptseg mmio[MAX_MMIOSEGS];
92	struct {
93		int	num_msgs;		/* guest state */
94
95		int	startrid;		/* host state */
96		struct resource *res[MAX_MSIMSGS];
97		void	*cookie[MAX_MSIMSGS];
98		struct pptintr_arg arg[MAX_MSIMSGS];
99	} msi;
100
101	struct {
102		int num_msgs;
103		int startrid;
104		int msix_table_rid;
105		int msix_pba_rid;
106		struct resource *msix_table_res;
107		struct resource *msix_pba_res;
108		struct resource **res;
109		void **cookie;
110		struct pptintr_arg *arg;
111	} msix;
112};
113
114SYSCTL_DECL(_hw_vmm);
115SYSCTL_NODE(_hw_vmm, OID_AUTO, ppt, CTLFLAG_RW, 0, "bhyve passthru devices");
116
117static int num_pptdevs;
118SYSCTL_INT(_hw_vmm_ppt, OID_AUTO, devices, CTLFLAG_RD, &num_pptdevs, 0,
119    "number of pci passthru devices");
120
121static TAILQ_HEAD(, pptdev) pptdev_list = TAILQ_HEAD_INITIALIZER(pptdev_list);
122
123static int
124ppt_probe(device_t dev)
125{
126	int bus, slot, func;
127	struct pci_devinfo *dinfo;
128
129	dinfo = (struct pci_devinfo *)device_get_ivars(dev);
130
131	bus = pci_get_bus(dev);
132	slot = pci_get_slot(dev);
133	func = pci_get_function(dev);
134
135	/*
136	 * To qualify as a pci passthrough device a device must:
137	 * - be allowed by administrator to be used in this role
138	 * - be an endpoint device
139	 */
140	if ((dinfo->cfg.hdrtype & PCIM_HDRTYPE) != PCIM_HDRTYPE_NORMAL)
141		return (ENXIO);
142	else if (vmm_is_pptdev(bus, slot, func))
143		return (0);
144	else
145		/*
146		 * Returning BUS_PROBE_NOWILDCARD here matches devices that the
147		 * SR-IOV infrastructure specified as "ppt" passthrough devices.
148		 * All normal devices that did not have "ppt" specified as their
149		 * driver will not be matched by this.
150		 */
151		return (BUS_PROBE_NOWILDCARD);
152}
153
154static int
155ppt_attach(device_t dev)
156{
157	struct pptdev *ppt;
158
159	ppt = device_get_softc(dev);
160
161	iommu_remove_device(iommu_host_domain(), pci_get_rid(dev));
162	num_pptdevs++;
163	TAILQ_INSERT_TAIL(&pptdev_list, ppt, next);
164	ppt->dev = dev;
165
166	if (bootverbose)
167		device_printf(dev, "attached\n");
168
169	return (0);
170}
171
172static int
173ppt_detach(device_t dev)
174{
175	struct pptdev *ppt;
176
177	ppt = device_get_softc(dev);
178
179	if (ppt->vm != NULL)
180		return (EBUSY);
181	num_pptdevs--;
182	TAILQ_REMOVE(&pptdev_list, ppt, next);
183	pci_disable_busmaster(dev);
184	iommu_add_device(iommu_host_domain(), pci_get_rid(dev));
185
186	return (0);
187}
188
189static device_method_t ppt_methods[] = {
190	/* Device interface */
191	DEVMETHOD(device_probe,		ppt_probe),
192	DEVMETHOD(device_attach,	ppt_attach),
193	DEVMETHOD(device_detach,	ppt_detach),
194	{0, 0}
195};
196
197static devclass_t ppt_devclass;
198DEFINE_CLASS_0(ppt, ppt_driver, ppt_methods, sizeof(struct pptdev));
199DRIVER_MODULE(ppt, pci, ppt_driver, ppt_devclass, NULL, NULL);
200
201static int
202ppt_find(struct vm *vm, int bus, int slot, int func, struct pptdev **pptp)
203{
204	device_t dev;
205	struct pptdev *ppt;
206	int b, s, f;
207
208	TAILQ_FOREACH(ppt, &pptdev_list, next) {
209		dev = ppt->dev;
210		b = pci_get_bus(dev);
211		s = pci_get_slot(dev);
212		f = pci_get_function(dev);
213		if (bus == b && slot == s && func == f)
214			break;
215	}
216
217	if (ppt == NULL)
218		return (ENOENT);
219	if (ppt->vm != vm)		/* Make sure we own this device */
220		return (EBUSY);
221	*pptp = ppt;
222	return (0);
223}
224
225static void
226ppt_unmap_mmio(struct vm *vm, struct pptdev *ppt)
227{
228	int i;
229	struct pptseg *seg;
230
231	for (i = 0; i < MAX_MMIOSEGS; i++) {
232		seg = &ppt->mmio[i];
233		if (seg->len == 0)
234			continue;
235		(void)vm_unmap_mmio(vm, seg->gpa, seg->len);
236		bzero(seg, sizeof(struct pptseg));
237	}
238}
239
240static void
241ppt_teardown_msi(struct pptdev *ppt)
242{
243	int i, rid;
244	void *cookie;
245	struct resource *res;
246
247	if (ppt->msi.num_msgs == 0)
248		return;
249
250	for (i = 0; i < ppt->msi.num_msgs; i++) {
251		rid = ppt->msi.startrid + i;
252		res = ppt->msi.res[i];
253		cookie = ppt->msi.cookie[i];
254
255		if (cookie != NULL)
256			bus_teardown_intr(ppt->dev, res, cookie);
257
258		if (res != NULL)
259			bus_release_resource(ppt->dev, SYS_RES_IRQ, rid, res);
260
261		ppt->msi.res[i] = NULL;
262		ppt->msi.cookie[i] = NULL;
263	}
264
265	if (ppt->msi.startrid == 1)
266		pci_release_msi(ppt->dev);
267
268	ppt->msi.num_msgs = 0;
269}
270
271static void
272ppt_teardown_msix_intr(struct pptdev *ppt, int idx)
273{
274	int rid;
275	struct resource *res;
276	void *cookie;
277
278	rid = ppt->msix.startrid + idx;
279	res = ppt->msix.res[idx];
280	cookie = ppt->msix.cookie[idx];
281
282	if (cookie != NULL)
283		bus_teardown_intr(ppt->dev, res, cookie);
284
285	if (res != NULL)
286		bus_release_resource(ppt->dev, SYS_RES_IRQ, rid, res);
287
288	ppt->msix.res[idx] = NULL;
289	ppt->msix.cookie[idx] = NULL;
290}
291
292static void
293ppt_teardown_msix(struct pptdev *ppt)
294{
295	int i;
296
297	if (ppt->msix.num_msgs == 0)
298		return;
299
300	for (i = 0; i < ppt->msix.num_msgs; i++)
301		ppt_teardown_msix_intr(ppt, i);
302
303	free(ppt->msix.res, M_PPTMSIX);
304	free(ppt->msix.cookie, M_PPTMSIX);
305	free(ppt->msix.arg, M_PPTMSIX);
306
307	pci_release_msi(ppt->dev);
308
309	if (ppt->msix.msix_table_res) {
310		bus_release_resource(ppt->dev, SYS_RES_MEMORY,
311				     ppt->msix.msix_table_rid,
312				     ppt->msix.msix_table_res);
313		ppt->msix.msix_table_res = NULL;
314		ppt->msix.msix_table_rid = 0;
315	}
316	if (ppt->msix.msix_pba_res) {
317		bus_release_resource(ppt->dev, SYS_RES_MEMORY,
318				     ppt->msix.msix_pba_rid,
319				     ppt->msix.msix_pba_res);
320		ppt->msix.msix_pba_res = NULL;
321		ppt->msix.msix_pba_rid = 0;
322	}
323
324	ppt->msix.num_msgs = 0;
325}
326
327int
328ppt_avail_devices(void)
329{
330
331	return (num_pptdevs);
332}
333
334int
335ppt_assigned_devices(struct vm *vm)
336{
337	struct pptdev *ppt;
338	int num;
339
340	num = 0;
341	TAILQ_FOREACH(ppt, &pptdev_list, next) {
342		if (ppt->vm == vm)
343			num++;
344	}
345	return (num);
346}
347
348bool
349ppt_is_mmio(struct vm *vm, vm_paddr_t gpa)
350{
351	int i;
352	struct pptdev *ppt;
353	struct pptseg *seg;
354
355	TAILQ_FOREACH(ppt, &pptdev_list, next) {
356		if (ppt->vm != vm)
357			continue;
358
359		for (i = 0; i < MAX_MMIOSEGS; i++) {
360			seg = &ppt->mmio[i];
361			if (seg->len == 0)
362				continue;
363			if (gpa >= seg->gpa && gpa < seg->gpa + seg->len)
364				return (true);
365		}
366	}
367
368	return (false);
369}
370
371static void
372ppt_pci_reset(device_t dev)
373{
374
375	if (pcie_flr(dev,
376	     max(pcie_get_max_completion_timeout(dev) / 1000, 10), true))
377		return;
378
379	pci_power_reset(dev);
380}
381
382int
383ppt_assign_device(struct vm *vm, int bus, int slot, int func)
384{
385	struct pptdev *ppt;
386	int error;
387
388	/* Passing NULL requires the device to be unowned. */
389	error = ppt_find(NULL, bus, slot, func, &ppt);
390	if (error)
391		return (error);
392
393	pci_save_state(ppt->dev);
394	ppt_pci_reset(ppt->dev);
395	pci_restore_state(ppt->dev);
396	ppt->vm = vm;
397	iommu_add_device(vm_iommu_domain(vm), pci_get_rid(ppt->dev));
398	return (0);
399}
400
401int
402ppt_unassign_device(struct vm *vm, int bus, int slot, int func)
403{
404	struct pptdev *ppt;
405	int error;
406
407	error = ppt_find(vm, bus, slot, func, &ppt);
408	if (error)
409		return (error);
410
411	pci_save_state(ppt->dev);
412	ppt_pci_reset(ppt->dev);
413	pci_restore_state(ppt->dev);
414	ppt_unmap_mmio(vm, ppt);
415	ppt_teardown_msi(ppt);
416	ppt_teardown_msix(ppt);
417	iommu_remove_device(vm_iommu_domain(vm), pci_get_rid(ppt->dev));
418	ppt->vm = NULL;
419	return (0);
420}
421
422int
423ppt_unassign_all(struct vm *vm)
424{
425	struct pptdev *ppt;
426	int bus, slot, func;
427	device_t dev;
428
429	TAILQ_FOREACH(ppt, &pptdev_list, next) {
430		if (ppt->vm == vm) {
431			dev = ppt->dev;
432			bus = pci_get_bus(dev);
433			slot = pci_get_slot(dev);
434			func = pci_get_function(dev);
435			vm_unassign_pptdev(vm, bus, slot, func);
436		}
437	}
438
439	return (0);
440}
441
442int
443ppt_map_mmio(struct vm *vm, int bus, int slot, int func,
444	     vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
445{
446	int i, error;
447	struct pptseg *seg;
448	struct pptdev *ppt;
449
450	error = ppt_find(vm, bus, slot, func, &ppt);
451	if (error)
452		return (error);
453
454	for (i = 0; i < MAX_MMIOSEGS; i++) {
455		seg = &ppt->mmio[i];
456		if (seg->len == 0) {
457			error = vm_map_mmio(vm, gpa, len, hpa);
458			if (error == 0) {
459				seg->gpa = gpa;
460				seg->len = len;
461			}
462			return (error);
463		}
464	}
465	return (ENOSPC);
466}
467
468static int
469pptintr(void *arg)
470{
471	struct pptdev *ppt;
472	struct pptintr_arg *pptarg;
473
474	pptarg = arg;
475	ppt = pptarg->pptdev;
476
477	if (ppt->vm != NULL)
478		lapic_intr_msi(ppt->vm, pptarg->addr, pptarg->msg_data);
479	else {
480		/*
481		 * XXX
482		 * This is not expected to happen - panic?
483		 */
484	}
485
486	/*
487	 * For legacy interrupts give other filters a chance in case
488	 * the interrupt was not generated by the passthrough device.
489	 */
490	if (ppt->msi.startrid == 0)
491		return (FILTER_STRAY);
492	else
493		return (FILTER_HANDLED);
494}
495
496int
497ppt_setup_msi(struct vm *vm, int vcpu, int bus, int slot, int func,
498	      uint64_t addr, uint64_t msg, int numvec)
499{
500	int i, rid, flags;
501	int msi_count, startrid, error, tmp;
502	struct pptdev *ppt;
503
504	if (numvec < 0 || numvec > MAX_MSIMSGS)
505		return (EINVAL);
506
507	error = ppt_find(vm, bus, slot, func, &ppt);
508	if (error)
509		return (error);
510
511	/* Reject attempts to enable MSI while MSI-X is active. */
512	if (ppt->msix.num_msgs != 0 && numvec != 0)
513		return (EBUSY);
514
515	/* Free any allocated resources */
516	ppt_teardown_msi(ppt);
517
518	if (numvec == 0)		/* nothing more to do */
519		return (0);
520
521	flags = RF_ACTIVE;
522	msi_count = pci_msi_count(ppt->dev);
523	if (msi_count == 0) {
524		startrid = 0;		/* legacy interrupt */
525		msi_count = 1;
526		flags |= RF_SHAREABLE;
527	} else
528		startrid = 1;		/* MSI */
529
530	/*
531	 * The device must be capable of supporting the number of vectors
532	 * the guest wants to allocate.
533	 */
534	if (numvec > msi_count)
535		return (EINVAL);
536
537	/*
538	 * Make sure that we can allocate all the MSI vectors that are needed
539	 * by the guest.
540	 */
541	if (startrid == 1) {
542		tmp = numvec;
543		error = pci_alloc_msi(ppt->dev, &tmp);
544		if (error)
545			return (error);
546		else if (tmp != numvec) {
547			pci_release_msi(ppt->dev);
548			return (ENOSPC);
549		} else {
550			/* success */
551		}
552	}
553
554	ppt->msi.startrid = startrid;
555
556	/*
557	 * Allocate the irq resource and attach it to the interrupt handler.
558	 */
559	for (i = 0; i < numvec; i++) {
560		ppt->msi.num_msgs = i + 1;
561		ppt->msi.cookie[i] = NULL;
562
563		rid = startrid + i;
564		ppt->msi.res[i] = bus_alloc_resource_any(ppt->dev, SYS_RES_IRQ,
565							 &rid, flags);
566		if (ppt->msi.res[i] == NULL)
567			break;
568
569		ppt->msi.arg[i].pptdev = ppt;
570		ppt->msi.arg[i].addr = addr;
571		ppt->msi.arg[i].msg_data = msg + i;
572
573		error = bus_setup_intr(ppt->dev, ppt->msi.res[i],
574				       INTR_TYPE_NET | INTR_MPSAFE,
575				       pptintr, NULL, &ppt->msi.arg[i],
576				       &ppt->msi.cookie[i]);
577		if (error != 0)
578			break;
579	}
580
581	if (i < numvec) {
582		ppt_teardown_msi(ppt);
583		return (ENXIO);
584	}
585
586	return (0);
587}
588
589int
590ppt_setup_msix(struct vm *vm, int vcpu, int bus, int slot, int func,
591	       int idx, uint64_t addr, uint64_t msg, uint32_t vector_control)
592{
593	struct pptdev *ppt;
594	struct pci_devinfo *dinfo;
595	int numvec, alloced, rid, error;
596	size_t res_size, cookie_size, arg_size;
597
598	error = ppt_find(vm, bus, slot, func, &ppt);
599	if (error)
600		return (error);
601
602	/* Reject attempts to enable MSI-X while MSI is active. */
603	if (ppt->msi.num_msgs != 0)
604		return (EBUSY);
605
606	dinfo = device_get_ivars(ppt->dev);
607	if (!dinfo)
608		return (ENXIO);
609
610	/*
611	 * First-time configuration:
612	 * 	Allocate the MSI-X table
613	 *	Allocate the IRQ resources
614	 *	Set up some variables in ppt->msix
615	 */
616	if (ppt->msix.num_msgs == 0) {
617		numvec = pci_msix_count(ppt->dev);
618		if (numvec <= 0)
619			return (EINVAL);
620
621		ppt->msix.startrid = 1;
622		ppt->msix.num_msgs = numvec;
623
624		res_size = numvec * sizeof(ppt->msix.res[0]);
625		cookie_size = numvec * sizeof(ppt->msix.cookie[0]);
626		arg_size = numvec * sizeof(ppt->msix.arg[0]);
627
628		ppt->msix.res = malloc(res_size, M_PPTMSIX, M_WAITOK | M_ZERO);
629		ppt->msix.cookie = malloc(cookie_size, M_PPTMSIX,
630					  M_WAITOK | M_ZERO);
631		ppt->msix.arg = malloc(arg_size, M_PPTMSIX, M_WAITOK | M_ZERO);
632
633		rid = dinfo->cfg.msix.msix_table_bar;
634		ppt->msix.msix_table_res = bus_alloc_resource_any(ppt->dev,
635					       SYS_RES_MEMORY, &rid, RF_ACTIVE);
636
637		if (ppt->msix.msix_table_res == NULL) {
638			ppt_teardown_msix(ppt);
639			return (ENOSPC);
640		}
641		ppt->msix.msix_table_rid = rid;
642
643		if (dinfo->cfg.msix.msix_table_bar !=
644		    dinfo->cfg.msix.msix_pba_bar) {
645			rid = dinfo->cfg.msix.msix_pba_bar;
646			ppt->msix.msix_pba_res = bus_alloc_resource_any(
647			    ppt->dev, SYS_RES_MEMORY, &rid, RF_ACTIVE);
648
649			if (ppt->msix.msix_pba_res == NULL) {
650				ppt_teardown_msix(ppt);
651				return (ENOSPC);
652			}
653			ppt->msix.msix_pba_rid = rid;
654		}
655
656		alloced = numvec;
657		error = pci_alloc_msix(ppt->dev, &alloced);
658		if (error || alloced != numvec) {
659			ppt_teardown_msix(ppt);
660			return (error == 0 ? ENOSPC: error);
661		}
662	}
663
664	if ((vector_control & PCIM_MSIX_VCTRL_MASK) == 0) {
665		/* Tear down the IRQ if it's already set up */
666		ppt_teardown_msix_intr(ppt, idx);
667
668		/* Allocate the IRQ resource */
669		ppt->msix.cookie[idx] = NULL;
670		rid = ppt->msix.startrid + idx;
671		ppt->msix.res[idx] = bus_alloc_resource_any(ppt->dev, SYS_RES_IRQ,
672							    &rid, RF_ACTIVE);
673		if (ppt->msix.res[idx] == NULL)
674			return (ENXIO);
675
676		ppt->msix.arg[idx].pptdev = ppt;
677		ppt->msix.arg[idx].addr = addr;
678		ppt->msix.arg[idx].msg_data = msg;
679
680		/* Setup the MSI-X interrupt */
681		error = bus_setup_intr(ppt->dev, ppt->msix.res[idx],
682				       INTR_TYPE_NET | INTR_MPSAFE,
683				       pptintr, NULL, &ppt->msix.arg[idx],
684				       &ppt->msix.cookie[idx]);
685
686		if (error != 0) {
687			bus_release_resource(ppt->dev, SYS_RES_IRQ, rid, ppt->msix.res[idx]);
688			ppt->msix.cookie[idx] = NULL;
689			ppt->msix.res[idx] = NULL;
690			return (ENXIO);
691		}
692	} else {
693		/* Masked, tear it down if it's already been set up */
694		ppt_teardown_msix_intr(ppt, idx);
695	}
696
697	return (0);
698}
699
/*
 * Tear down all MSI-X state of the ppt device at bus/slot/func.
 *
 * Returns 0 on success or the error from ppt_find() (ENOENT if there is
 * no such ppt device, EBUSY if vm is not its owner).
 */
int
ppt_disable_msix(struct vm *vm, int bus, int slot, int func)
{
	struct pptdev *sc;
	int rc;

	rc = ppt_find(vm, bus, slot, func, &sc);
	if (rc != 0)
		return (rc);

	ppt_teardown_msix(sc);

	return (0);
}
713