/* pci_iov.c revision 279453 */
1/*-
2 * Copyright (c) 2013-2015 Sandvine Inc.  All rights reserved.
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 */
26
27#include <sys/cdefs.h>
28__FBSDID("$FreeBSD: head/sys/dev/pci/pci_iov.c 279453 2015-03-01 00:40:57Z rstone $");
29
30#include "opt_bus.h"
31
32#include <sys/param.h>
33#include <sys/conf.h>
34#include <sys/kernel.h>
35#include <sys/systm.h>
36#include <sys/bus.h>
37#include <sys/fcntl.h>
38#include <sys/ioccom.h>
39#include <sys/iov.h>
40#include <sys/linker.h>
41#include <sys/malloc.h>
42#include <sys/module.h>
43#include <sys/pciio.h>
44#include <sys/queue.h>
45#include <sys/rman.h>
46#include <sys/sysctl.h>
47
48#include <machine/bus.h>
49#include <machine/stdarg.h>
50
51#include <sys/nv.h>
52#include <sys/iov_schema.h>
53
54#include <dev/pci/pcireg.h>
55#include <dev/pci/pcivar.h>
56#include <dev/pci/pci_private.h>
57#include <dev/pci/pci_iov_private.h>
58#include <dev/pci/schema_private.h>
59
60#include "pci_if.h"
61#include "pcib_if.h"
62
/* Allocation tag for all SR-IOV bookkeeping allocated in this file. */
static MALLOC_DEFINE(M_SRIOV, "sr_iov", "PCI SR-IOV allocations");

static d_ioctl_t pci_iov_ioctl;

/* cdevsw for the per-PF /dev/iov/<pf> node; all operation is via ioctl. */
static struct cdevsw iov_cdevsw = {
	.d_version = D_VERSION,
	.d_name = "iov",
	.d_ioctl = pci_iov_ioctl
};

SYSCTL_DECL(_hw_pci);

/*
 * The maximum amount of memory we will allocate for user configuration of an
 * SR-IOV device.  1MB ought to be enough for anyone, but leave this
 * configurable just in case.
 */
static u_long pci_iov_max_config = 1024 * 1024;
SYSCTL_ULONG(_hw_pci, OID_AUTO, iov_max_config, CTLFLAG_RWTUN,
    &pci_iov_max_config, 0, "Maximum allowed size of SR-IOV configuration.");


/*
 * Read/write a register within the PF's SR-IOV extended capability; "r" is
 * the register offset relative to the capability base and "w" is the access
 * width in bytes.
 */
#define IOV_READ(d, r, w) \
	pci_read_config((d)->cfg.dev, (d)->cfg.iov->iov_pos + r, w)

#define IOV_WRITE(d, r, v, w) \
	pci_write_config((d)->cfg.dev, (d)->cfg.iov->iov_pos + r, v, w)

static nvlist_t	*pci_iov_build_schema(nvlist_t **pf_schema,
		    nvlist_t **vf_schema);
static void	pci_iov_build_pf_schema(nvlist_t *schema,
		    nvlist_t **driver_schema);
static void	pci_iov_build_vf_schema(nvlist_t *schema,
		    nvlist_t **driver_schema);
static nvlist_t	*pci_iov_get_pf_subsystem_schema(void);
static nvlist_t	*pci_iov_get_vf_subsystem_schema(void);
99
100int
101pci_iov_attach_method(device_t bus, device_t dev, nvlist_t *pf_schema,
102    nvlist_t *vf_schema)
103{
104	device_t pcib;
105	struct pci_devinfo *dinfo;
106	struct pcicfg_iov *iov;
107	nvlist_t *schema;
108	uint32_t version;
109	int error;
110	int iov_pos;
111
112	dinfo = device_get_ivars(dev);
113	pcib = device_get_parent(bus);
114	schema = NULL;
115
116	error = pci_find_extcap(dev, PCIZ_SRIOV, &iov_pos);
117
118	if (error != 0)
119		return (error);
120
121	version = pci_read_config(dev, iov_pos, 4);
122	if (PCI_EXTCAP_VER(version) != 1) {
123		if (bootverbose)
124			device_printf(dev,
125			    "Unsupported version of SR-IOV (%d) detected\n",
126			    PCI_EXTCAP_VER(version));
127
128		return (ENXIO);
129	}
130
131	iov = malloc(sizeof(*dinfo->cfg.iov), M_SRIOV, M_WAITOK | M_ZERO);
132
133	mtx_lock(&Giant);
134	if (dinfo->cfg.iov != NULL) {
135		error = EBUSY;
136		goto cleanup;
137	}
138	iov->iov_pos = iov_pos;
139
140	schema = pci_iov_build_schema(&pf_schema, &vf_schema);
141	if (schema == NULL) {
142		error = ENOMEM;
143		goto cleanup;
144	}
145	iov->iov_schema = schema;
146
147	iov->iov_cdev = make_dev(&iov_cdevsw, device_get_unit(dev),
148	    UID_ROOT, GID_WHEEL, 0600, "iov/%s", device_get_nameunit(dev));
149
150	if (iov->iov_cdev == NULL) {
151		error = ENOMEM;
152		goto cleanup;
153	}
154
155	dinfo->cfg.iov = iov;
156	iov->iov_cdev->si_drv1 = dinfo;
157	mtx_unlock(&Giant);
158
159	return (0);
160
161cleanup:
162	nvlist_destroy(schema);
163	nvlist_destroy(pf_schema);
164	nvlist_destroy(vf_schema);
165	free(iov, M_SRIOV);
166	mtx_unlock(&Giant);
167	return (error);
168}
169
170int
171pci_iov_detach_method(device_t bus, device_t dev)
172{
173	struct pci_devinfo *dinfo;
174	struct pcicfg_iov *iov;
175
176	mtx_lock(&Giant);
177	dinfo = device_get_ivars(dev);
178	iov = dinfo->cfg.iov;
179
180	if (iov == NULL) {
181		mtx_unlock(&Giant);
182		return (0);
183	}
184
185	if (iov->iov_num_vfs != 0 || iov->iov_flags & IOV_BUSY) {
186		mtx_unlock(&Giant);
187		return (EBUSY);
188	}
189
190	dinfo->cfg.iov = NULL;
191
192	if (iov->iov_cdev) {
193		destroy_dev(iov->iov_cdev);
194		iov->iov_cdev = NULL;
195	}
196	nvlist_destroy(iov->iov_schema);
197
198	free(iov, M_SRIOV);
199	mtx_unlock(&Giant);
200
201	return (0);
202}
203
204static nvlist_t *
205pci_iov_build_schema(nvlist_t **pf, nvlist_t **vf)
206{
207	nvlist_t *schema, *pf_driver, *vf_driver;
208
209	/* We always take ownership of the schemas. */
210	pf_driver = *pf;
211	*pf = NULL;
212	vf_driver = *vf;
213	*vf = NULL;
214
215	schema = pci_iov_schema_alloc_node();
216	if (schema == NULL)
217		goto cleanup;
218
219	pci_iov_build_pf_schema(schema, &pf_driver);
220	pci_iov_build_vf_schema(schema, &vf_driver);
221
222	if (nvlist_error(schema) != 0)
223		goto cleanup;
224
225	return (schema);
226
227cleanup:
228	nvlist_destroy(schema);
229	nvlist_destroy(pf_driver);
230	nvlist_destroy(vf_driver);
231	return (NULL);
232}
233
234static void
235pci_iov_build_pf_schema(nvlist_t *schema, nvlist_t **driver_schema)
236{
237	nvlist_t *pf_schema, *iov_schema;
238
239	pf_schema = pci_iov_schema_alloc_node();
240	if (pf_schema == NULL) {
241		nvlist_set_error(schema, ENOMEM);
242		return;
243	}
244
245	iov_schema = pci_iov_get_pf_subsystem_schema();
246
247	/*
248	 * Note that if either *driver_schema or iov_schema is NULL, then
249	 * nvlist_move_nvlist will put the schema in the error state and
250	 * SR-IOV will fail to initialize later, so we don't have to explicitly
251	 * handle that case.
252	 */
253	nvlist_move_nvlist(pf_schema, DRIVER_CONFIG_NAME, *driver_schema);
254	nvlist_move_nvlist(pf_schema, IOV_CONFIG_NAME, iov_schema);
255	nvlist_move_nvlist(schema, PF_CONFIG_NAME, pf_schema);
256	*driver_schema = NULL;
257}
258
259static void
260pci_iov_build_vf_schema(nvlist_t *schema, nvlist_t **driver_schema)
261{
262	nvlist_t *vf_schema, *iov_schema;
263
264	vf_schema = pci_iov_schema_alloc_node();
265	if (vf_schema == NULL) {
266		nvlist_set_error(schema, ENOMEM);
267		return;
268	}
269
270	iov_schema = pci_iov_get_vf_subsystem_schema();
271
272	/*
273	 * Note that if either *driver_schema or iov_schema is NULL, then
274	 * nvlist_move_nvlist will put the schema in the error state and
275	 * SR-IOV will fail to initialize later, so we don't have to explicitly
276	 * handle that case.
277	 */
278	nvlist_move_nvlist(vf_schema, DRIVER_CONFIG_NAME, *driver_schema);
279	nvlist_move_nvlist(vf_schema, IOV_CONFIG_NAME, iov_schema);
280	nvlist_move_nvlist(schema, VF_SCHEMA_NAME, vf_schema);
281	*driver_schema = NULL;
282}
283
284static nvlist_t *
285pci_iov_get_pf_subsystem_schema(void)
286{
287	nvlist_t *pf;
288
289	pf = pci_iov_schema_alloc_node();
290	if (pf == NULL)
291		return (NULL);
292
293	pci_iov_schema_add_uint16(pf, "num_vfs", IOV_SCHEMA_REQUIRED, -1);
294	pci_iov_schema_add_string(pf, "device", IOV_SCHEMA_REQUIRED, NULL);
295
296	return (pf);
297}
298
299static nvlist_t *
300pci_iov_get_vf_subsystem_schema(void)
301{
302	nvlist_t *vf;
303
304	vf = pci_iov_schema_alloc_node();
305	if (vf == NULL)
306		return (NULL);
307
308	pci_iov_schema_add_bool(vf, "passthrough", IOV_SCHEMA_HASDEFAULT, 0);
309
310	return (vf);
311}
312
313static int
314pci_iov_alloc_bar(struct pci_devinfo *dinfo, int bar, pci_addr_t bar_shift)
315{
316	struct resource *res;
317	struct pcicfg_iov *iov;
318	device_t dev, bus;
319	u_long start, end;
320	pci_addr_t bar_size;
321	int rid;
322
323	iov = dinfo->cfg.iov;
324	dev = dinfo->cfg.dev;
325	bus = device_get_parent(dev);
326	rid = iov->iov_pos + PCIR_SRIOV_BAR(bar);
327	bar_size = 1 << bar_shift;
328
329	res = pci_alloc_multi_resource(bus, dev, SYS_RES_MEMORY, &rid, 0ul,
330	    ~0ul, 1, iov->iov_num_vfs, RF_ACTIVE);
331
332	if (res == NULL)
333		return (ENXIO);
334
335	iov->iov_bar[bar].res = res;
336	iov->iov_bar[bar].bar_size = bar_size;
337	iov->iov_bar[bar].bar_shift = bar_shift;
338
339	start = rman_get_start(res);
340	end = rman_get_end(res);
341	return (rman_manage_region(&iov->rman, start, end));
342}
343
344static void
345pci_iov_add_bars(struct pcicfg_iov *iov, struct pci_devinfo *dinfo)
346{
347	struct pci_iov_bar *bar;
348	uint64_t bar_start;
349	int i;
350
351	for (i = 0; i <= PCIR_MAX_BAR_0; i++) {
352		bar = &iov->iov_bar[i];
353		if (bar->res != NULL) {
354			bar_start = rman_get_start(bar->res) +
355			    dinfo->cfg.vf.index * bar->bar_size;
356
357			pci_add_bar(dinfo->cfg.dev, PCIR_BAR(i), bar_start,
358			    bar->bar_shift);
359		}
360	}
361}
362
363static int
364pci_iov_parse_config(struct pcicfg_iov *iov, struct pci_iov_arg *arg,
365    nvlist_t **ret)
366{
367	void *packed_config;
368	nvlist_t *config;
369	int error;
370
371	config = NULL;
372	packed_config = NULL;
373
374	if (arg->len > pci_iov_max_config) {
375		error = EMSGSIZE;
376		goto out;
377	}
378
379	packed_config = malloc(arg->len, M_SRIOV, M_WAITOK);
380
381	error = copyin(arg->config, packed_config, arg->len);
382	if (error != 0)
383		goto out;
384
385	config = nvlist_unpack(packed_config, arg->len);
386	if (config == NULL) {
387		error = EINVAL;
388		goto out;
389	}
390
391	error = pci_iov_schema_validate_config(iov->iov_schema, config);
392	if (error != 0)
393		goto out;
394
395	error = nvlist_error(config);
396	if (error != 0)
397		goto out;
398
399	*ret = config;
400	config = NULL;
401
402out:
403	nvlist_destroy(config);
404	free(packed_config, M_SRIOV);
405	return (error);
406}
407
408/*
409 * Set the ARI_EN bit in the lowest-numbered PCI function with the SR-IOV
410 * capability.  This bit is only writeable on the lowest-numbered PF but
411 * affects all PFs on the device.
412 */
413static int
414pci_iov_set_ari(device_t bus)
415{
416	device_t lowest;
417	device_t *devlist;
418	int i, error, devcount, lowest_func, lowest_pos, iov_pos, dev_func;
419	uint16_t iov_ctl;
420
421	/* If ARI is disabled on the downstream port there is nothing to do. */
422	if (!PCIB_ARI_ENABLED(device_get_parent(bus)))
423		return (0);
424
425	error = device_get_children(bus, &devlist, &devcount);
426
427	if (error != 0)
428		return (error);
429
430	lowest = NULL;
431	for (i = 0; i < devcount; i++) {
432		if (pci_find_extcap(devlist[i], PCIZ_SRIOV, &iov_pos) == 0) {
433			dev_func = pci_get_function(devlist[i]);
434			if (lowest == NULL || dev_func < lowest_func) {
435				lowest = devlist[i];
436				lowest_func = dev_func;
437				lowest_pos = iov_pos;
438			}
439		}
440	}
441
442	/*
443	 * If we called this function some device must have the SR-IOV
444	 * capability.
445	 */
446	KASSERT(lowest != NULL,
447	    ("Could not find child of %s with SR-IOV capability",
448	    device_get_nameunit(bus)));
449
450	iov_ctl = pci_read_config(lowest, iov_pos + PCIR_SRIOV_CTL, 2);
451	iov_ctl |= PCIM_SRIOV_ARI_EN;
452	pci_write_config(lowest, iov_pos + PCIR_SRIOV_CTL, iov_ctl, 2);
453	free(devlist, M_TEMP);
454	return (0);
455}
456
457static int
458pci_iov_config_page_size(struct pci_devinfo *dinfo)
459{
460	uint32_t page_cap, page_size;
461
462	page_cap = IOV_READ(dinfo, PCIR_SRIOV_PAGE_CAP, 4);
463
464	/*
465	 * If the system page size is less than the smallest SR-IOV page size
466	 * then round up to the smallest SR-IOV page size.
467	 */
468	if (PAGE_SHIFT < PCI_SRIOV_BASE_PAGE_SHIFT)
469		page_size = (1 << 0);
470	else
471		page_size = (1 << (PAGE_SHIFT - PCI_SRIOV_BASE_PAGE_SHIFT));
472
473	/* Check that the device supports the system page size. */
474	if (!(page_size & page_cap))
475		return (ENXIO);
476
477	IOV_WRITE(dinfo, PCIR_SRIOV_PAGE_SIZE, page_size, 4);
478	return (0);
479}
480
481static int
482pci_init_iov(device_t dev, uint16_t num_vfs, const nvlist_t *config)
483{
484	const nvlist_t *device, *driver_config;
485
486	device = nvlist_get_nvlist(config, PF_CONFIG_NAME);
487	driver_config = nvlist_get_nvlist(device, DRIVER_CONFIG_NAME);
488	return (PCI_INIT_IOV(dev, num_vfs, driver_config));
489}
490
491static int
492pci_iov_init_rman(device_t pf, struct pcicfg_iov *iov)
493{
494	int error;
495
496	iov->rman.rm_start = 0;
497	iov->rman.rm_end = ~0ul;
498	iov->rman.rm_type = RMAN_ARRAY;
499	snprintf(iov->rman_name, sizeof(iov->rman_name), "%s VF I/O memory",
500	    device_get_nameunit(pf));
501	iov->rman.rm_descr = iov->rman_name;
502
503	error = rman_init(&iov->rman);
504	if (error != 0)
505		return (error);
506
507	iov->iov_flags |= IOV_RMAN_INITED;
508	return (0);
509}
510
511static int
512pci_iov_setup_bars(struct pci_devinfo *dinfo)
513{
514	device_t dev;
515	struct pcicfg_iov *iov;
516	pci_addr_t bar_value, testval;
517	int i, last_64, error;
518
519	iov = dinfo->cfg.iov;
520	dev = dinfo->cfg.dev;
521	last_64 = 0;
522
523	for (i = 0; i <= PCIR_MAX_BAR_0; i++) {
524		/*
525		 * If a PCI BAR is a 64-bit wide BAR, then it spans two
526		 * consecutive registers.  Therefore if the last BAR that
527		 * we looked at was a 64-bit BAR, we need to skip this
528		 * register as it's the second half of the last BAR.
529		 */
530		if (!last_64) {
531			pci_read_bar(dev,
532			    iov->iov_pos + PCIR_SRIOV_BAR(i),
533			    &bar_value, &testval, &last_64);
534
535			if (testval != 0) {
536				error = pci_iov_alloc_bar(dinfo, i,
537				   pci_mapsize(testval));
538				if (error != 0)
539					return (error);
540			}
541		} else
542			last_64 = 0;
543	}
544
545	return (0);
546}
547
548static void
549pci_iov_enumerate_vfs(struct pci_devinfo *dinfo, const nvlist_t *config,
550    uint16_t first_rid, uint16_t rid_stride)
551{
552	char device_name[VF_MAX_NAME];
553	const nvlist_t *device, *driver_config, *iov_config;
554	device_t bus, dev, vf;
555	struct pcicfg_iov *iov;
556	struct pci_devinfo *vfinfo;
557	size_t size;
558	int i, error;
559	uint16_t vid, did, next_rid;
560
561	iov = dinfo->cfg.iov;
562	dev = dinfo->cfg.dev;
563	bus = device_get_parent(dev);
564	size = dinfo->cfg.devinfo_size;
565	next_rid = first_rid;
566	vid = pci_get_vendor(dev);
567	did = IOV_READ(dinfo, PCIR_SRIOV_VF_DID, 2);
568
569	for (i = 0; i < iov->iov_num_vfs; i++, next_rid += rid_stride) {
570		snprintf(device_name, sizeof(device_name), VF_PREFIX"%d", i);
571		device = nvlist_get_nvlist(config, device_name);
572		iov_config = nvlist_get_nvlist(device, IOV_CONFIG_NAME);
573		driver_config = nvlist_get_nvlist(device, DRIVER_CONFIG_NAME);
574
575		vf = PCI_CREATE_IOV_CHILD(bus, dev, next_rid, vid, did);
576		if (vf == NULL)
577			break;
578
579		/*
580		 * If we are creating passthrough devices then force the ppt
581		 * driver to attach to prevent a VF driver from claiming the
582		 * VFs.
583		 */
584		if (nvlist_get_bool(iov_config, "passthrough"))
585			device_set_devclass(vf, "ppt");
586
587		vfinfo = device_get_ivars(vf);
588
589		vfinfo->cfg.iov = iov;
590		vfinfo->cfg.vf.index = i;
591
592		pci_iov_add_bars(iov, vfinfo);
593
594		error = PCI_ADD_VF(dev, i, driver_config);
595		if (error != 0) {
596			device_printf(dev, "Failed to add VF %d\n", i);
597			pci_delete_child(bus, vf);
598		}
599	}
600
601	bus_generic_attach(bus);
602}
603
/*
 * IOV_CONFIG ioctl handler: enable SR-IOV on the PF.
 *
 * Parses and validates the user-supplied configuration, programs the
 * SR-IOV capability (page size, ARI, NumVFs, VF Enable/MSE), allocates
 * BAR space for the VFs and enumerates them.  The whole operation runs
 * under Giant with IOV_BUSY set to serialize against delete/detach.
 * On any failure the "out" path unwinds everything done so far.
 *
 * Returns 0 on success or an errno (EBUSY if VFs already exist or a
 * config operation is in flight).
 */
static int
pci_iov_config(struct cdev *cdev, struct pci_iov_arg *arg)
{
	device_t bus, dev;
	struct pci_devinfo *dinfo;
	struct pcicfg_iov *iov;
	nvlist_t *config;
	int i, error;
	uint16_t rid_off, rid_stride;
	uint16_t first_rid, last_rid;
	uint16_t iov_ctl;
	uint16_t num_vfs, total_vfs;
	int iov_inited;	/* nonzero once PCI_INIT_IOV succeeded (needs undo) */

	mtx_lock(&Giant);
	dinfo = cdev->si_drv1;
	iov = dinfo->cfg.iov;
	dev = dinfo->cfg.dev;
	bus = device_get_parent(dev);
	iov_inited = 0;
	config = NULL;

	/* Only one config operation at a time, and only when no VFs exist. */
	if ((iov->iov_flags & IOV_BUSY) || iov->iov_num_vfs != 0) {
		mtx_unlock(&Giant);
		return (EBUSY);
	}
	iov->iov_flags |= IOV_BUSY;

	error = pci_iov_parse_config(iov, arg, &config);
	if (error != 0)
		goto out;

	/* The requested VF count must not exceed the device's TotalVFs. */
	num_vfs = pci_iov_config_get_num_vfs(config);
	total_vfs = IOV_READ(dinfo, PCIR_SRIOV_TOTAL_VFS, 2);
	if (num_vfs > total_vfs) {
		error = EINVAL;
		goto out;
	}

	error = pci_iov_config_page_size(dinfo);
	if (error != 0)
		goto out;

	error = pci_iov_set_ari(bus);
	if (error != 0)
		goto out;

	/* Let the PF driver prepare; must be undone on later failure. */
	error = pci_init_iov(dev, num_vfs, config);
	if (error != 0)
		goto out;
	iov_inited = 1;

	IOV_WRITE(dinfo, PCIR_SRIOV_NUM_VFS, num_vfs, 2);

	rid_off = IOV_READ(dinfo, PCIR_SRIOV_VF_OFF, 2);
	rid_stride = IOV_READ(dinfo, PCIR_SRIOV_VF_STRIDE, 2);

	first_rid = pci_get_rid(dev) + rid_off;
	last_rid = first_rid + (num_vfs - 1) * rid_stride;

	/* We don't yet support allocating extra bus numbers for VFs. */
	if (pci_get_bus(dev) != PCI_RID2BUS(last_rid)) {
		error = ENOSPC;
		goto out;
	}

	/* Disable VF Enable/MSE while BAR space is being set up. */
	iov_ctl = IOV_READ(dinfo, PCIR_SRIOV_CTL, 2);
	iov_ctl &= ~(PCIM_SRIOV_VF_EN | PCIM_SRIOV_VF_MSE);
	IOV_WRITE(dinfo, PCIR_SRIOV_CTL, iov_ctl, 2);

	error = pci_iov_init_rman(dev, iov);
	if (error != 0)
		goto out;

	iov->iov_num_vfs = num_vfs;

	error = pci_iov_setup_bars(dinfo);
	if (error != 0)
		goto out;

	/* Turn the VFs on. */
	iov_ctl = IOV_READ(dinfo, PCIR_SRIOV_CTL, 2);
	iov_ctl |= PCIM_SRIOV_VF_EN | PCIM_SRIOV_VF_MSE;
	IOV_WRITE(dinfo, PCIR_SRIOV_CTL, iov_ctl, 2);

	/* Per specification, we must wait 100ms before accessing VFs. */
	/*
	 * NOTE(review): roundup(hz, 10) is ~hz ticks (about one second at
	 * hz=1000), not the 100ms (hz/10) the comment above claims —
	 * confirm the intended delay.
	 */
	pause("iov", roundup(hz, 10));
	pci_iov_enumerate_vfs(dinfo, config, first_rid, rid_stride);

	nvlist_destroy(config);
	iov->iov_flags &= ~IOV_BUSY;
	mtx_unlock(&Giant);

	return (0);
out:
	/* Unwind in reverse order of setup. */
	if (iov_inited)
		PCI_UNINIT_IOV(dev);

	for (i = 0; i <= PCIR_MAX_BAR_0; i++) {
		if (iov->iov_bar[i].res != NULL) {
			pci_release_resource(bus, dev, SYS_RES_MEMORY,
			    iov->iov_pos + PCIR_SRIOV_BAR(i),
			    iov->iov_bar[i].res);
			pci_delete_resource(bus, dev, SYS_RES_MEMORY,
			    iov->iov_pos + PCIR_SRIOV_BAR(i));
			iov->iov_bar[i].res = NULL;
		}
	}

	if (iov->iov_flags & IOV_RMAN_INITED) {
		rman_fini(&iov->rman);
		iov->iov_flags &= ~IOV_RMAN_INITED;
	}

	nvlist_destroy(config);
	iov->iov_num_vfs = 0;
	iov->iov_flags &= ~IOV_BUSY;
	mtx_unlock(&Giant);
	return (error);
}
723
724/* Return true if child is a VF of the given PF. */
725static int
726pci_iov_is_child_vf(struct pcicfg_iov *pf, device_t child)
727{
728	struct pci_devinfo *vfinfo;
729
730	vfinfo = device_get_ivars(child);
731
732	if (!(vfinfo->cfg.flags & PCICFG_VF))
733		return (0);
734
735	return (pf == vfinfo->cfg.iov);
736}
737
/*
 * IOV_DELETE ioctl handler: disable SR-IOV on the PF and destroy all VFs.
 *
 * Runs in two passes so that no VF is deleted until every VF driver has
 * detached cleanly: first detach all VF drivers (aborting on failure),
 * then delete the VF devices.  Afterwards the PF driver is notified,
 * VF Enable/MSE are cleared, and the BAR windows and rman are released.
 * Serialized against config/detach via Giant and IOV_BUSY.
 *
 * Returns 0 on success, EBUSY if a config operation is in flight, ECHILD
 * if no VFs exist, or the error from a failed VF detach.
 */
static int
pci_iov_delete(struct cdev *cdev)
{
	device_t bus, dev, vf, *devlist;
	struct pci_devinfo *dinfo;
	struct pcicfg_iov *iov;
	int i, error, devcount;
	uint32_t iov_ctl;

	mtx_lock(&Giant);
	dinfo = cdev->si_drv1;
	iov = dinfo->cfg.iov;
	dev = dinfo->cfg.dev;
	bus = device_get_parent(dev);
	devlist = NULL;

	if (iov->iov_flags & IOV_BUSY) {
		mtx_unlock(&Giant);
		return (EBUSY);
	}

	if (iov->iov_num_vfs == 0) {
		mtx_unlock(&Giant);
		return (ECHILD);
	}

	iov->iov_flags |= IOV_BUSY;

	error = device_get_children(bus, &devlist, &devcount);

	if (error != 0)
		goto out;

	/* Pass 1: detach every VF driver before deleting anything. */
	for (i = 0; i < devcount; i++) {
		vf = devlist[i];

		if (!pci_iov_is_child_vf(iov, vf))
			continue;

		error = device_detach(vf);
		if (error != 0) {
			device_printf(dev,
			   "Could not disable SR-IOV: failed to detach VF %s\n",
			    device_get_nameunit(vf));
			goto out;
		}
	}

	/* Pass 2: all drivers are detached, now delete the VF devices. */
	for (i = 0; i < devcount; i++) {
		vf = devlist[i];

		if (pci_iov_is_child_vf(iov, vf))
			pci_delete_child(bus, vf);
	}
	PCI_UNINIT_IOV(dev);

	/* Clear VF Enable/MSE and the configured VF count. */
	iov_ctl = IOV_READ(dinfo, PCIR_SRIOV_CTL, 2);
	iov_ctl &= ~(PCIM_SRIOV_VF_EN | PCIM_SRIOV_VF_MSE);
	IOV_WRITE(dinfo, PCIR_SRIOV_CTL, iov_ctl, 2);
	IOV_WRITE(dinfo, PCIR_SRIOV_NUM_VFS, 0, 2);

	iov->iov_num_vfs = 0;

	/* Release the BAR windows allocated by pci_iov_alloc_bar(). */
	for (i = 0; i <= PCIR_MAX_BAR_0; i++) {
		if (iov->iov_bar[i].res != NULL) {
			pci_release_resource(bus, dev, SYS_RES_MEMORY,
			    iov->iov_pos + PCIR_SRIOV_BAR(i),
			    iov->iov_bar[i].res);
			pci_delete_resource(bus, dev, SYS_RES_MEMORY,
			    iov->iov_pos + PCIR_SRIOV_BAR(i));
			iov->iov_bar[i].res = NULL;
		}
	}

	if (iov->iov_flags & IOV_RMAN_INITED) {
		rman_fini(&iov->rman);
		iov->iov_flags &= ~IOV_RMAN_INITED;
	}

	error = 0;
out:
	free(devlist, M_TEMP);
	iov->iov_flags &= ~IOV_BUSY;
	mtx_unlock(&Giant);
	return (error);
}
824
825static int
826pci_iov_get_schema_ioctl(struct cdev *cdev, struct pci_iov_schema *output)
827{
828	struct pci_devinfo *dinfo;
829	void *packed;
830	size_t output_len, size;
831	int error;
832
833	packed = NULL;
834
835	mtx_lock(&Giant);
836	dinfo = cdev->si_drv1;
837	packed = nvlist_pack(dinfo->cfg.iov->iov_schema, &size);
838	mtx_unlock(&Giant);
839
840	if (packed == NULL) {
841		error = ENOMEM;
842		goto fail;
843	}
844
845	output_len = output->len;
846	output->len = size;
847	if (size <= output_len) {
848		error = copyout(packed, output->schema, size);
849
850		if (error != 0)
851			goto fail;
852
853		output->error = 0;
854	} else
855		/*
856		 * If we return an error then the ioctl code won't copyout
857		 * output back to userland, so we flag the error in the struct
858		 * instead.
859		 */
860		output->error = EMSGSIZE;
861
862	error = 0;
863
864fail:
865	free(packed, M_NVLIST);
866
867	return (error);
868}
869
870static int
871pci_iov_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag,
872    struct thread *td)
873{
874
875	switch (cmd) {
876	case IOV_CONFIG:
877		return (pci_iov_config(dev, (struct pci_iov_arg *)data));
878	case IOV_DELETE:
879		return (pci_iov_delete(dev));
880	case IOV_GET_SCHEMA:
881		return (pci_iov_get_schema_ioctl(dev,
882		    (struct pci_iov_schema *)data));
883	default:
884		return (EINVAL);
885	}
886}
887
888struct resource *
889pci_vf_alloc_mem_resource(device_t dev, device_t child, int *rid, u_long start,
890    u_long end, u_long count, u_int flags)
891{
892	struct pci_devinfo *dinfo;
893	struct pcicfg_iov *iov;
894	struct pci_map *map;
895	struct resource *res;
896	struct resource_list_entry *rle;
897	u_long bar_start, bar_end;
898	pci_addr_t bar_length;
899	int error;
900
901	dinfo = device_get_ivars(child);
902	iov = dinfo->cfg.iov;
903
904	map = pci_find_bar(child, *rid);
905	if (map == NULL)
906		return (NULL);
907
908	bar_length = 1 << map->pm_size;
909	bar_start = map->pm_value;
910	bar_end = bar_start + bar_length - 1;
911
912	/* Make sure that the resource fits the constraints. */
913	if (bar_start >= end || bar_end <= bar_start || count != 1)
914		return (NULL);
915
916	/* Clamp the resource to the constraints if necessary. */
917	if (bar_start < start)
918		bar_start = start;
919	if (bar_end > end)
920		bar_end = end;
921	bar_length = bar_end - bar_start + 1;
922
923	res = rman_reserve_resource(&iov->rman, bar_start, bar_end,
924	    bar_length, flags, child);
925	if (res == NULL)
926		return (NULL);
927
928	rle = resource_list_add(&dinfo->resources, SYS_RES_MEMORY, *rid,
929	    bar_start, bar_end, 1);
930	if (rle == NULL) {
931		rman_release_resource(res);
932		return (NULL);
933	}
934
935	rman_set_rid(res, *rid);
936
937	if (flags & RF_ACTIVE) {
938		error = bus_activate_resource(child, SYS_RES_MEMORY, *rid, res);
939		if (error != 0) {
940			resource_list_delete(&dinfo->resources, SYS_RES_MEMORY,
941			    *rid);
942			rman_release_resource(res);
943			return (NULL);
944		}
945	}
946	rle->res = res;
947
948	return (res);
949}
950
951int
952pci_vf_release_mem_resource(device_t dev, device_t child, int rid,
953    struct resource *r)
954{
955	struct pci_devinfo *dinfo;
956	struct resource_list_entry *rle;
957	int error;
958
959	dinfo = device_get_ivars(child);
960
961	if (rman_get_flags(r) & RF_ACTIVE) {
962		error = bus_deactivate_resource(child, SYS_RES_MEMORY, rid, r);
963		if (error != 0)
964			return (error);
965	}
966
967	rle = resource_list_find(&dinfo->resources, SYS_RES_MEMORY, rid);
968	if (rle != NULL) {
969		rle->res = NULL;
970		resource_list_delete(&dinfo->resources, SYS_RES_MEMORY,
971		    rid);
972	}
973
974	return (rman_release_resource(r));
975}
976
977