pci_iov.c revision 279449
1/*-
2 * Copyright (c) 2013-2015 Sandvine Inc.  All rights reserved.
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 */
26
27#include <sys/cdefs.h>
28__FBSDID("$FreeBSD: head/sys/dev/pci/pci_iov.c 279449 2015-03-01 00:40:26Z rstone $");
29
30#include "opt_bus.h"
31
32#include <sys/param.h>
33#include <sys/conf.h>
34#include <sys/kernel.h>
35#include <sys/systm.h>
36#include <sys/bus.h>
37#include <sys/fcntl.h>
38#include <sys/ioccom.h>
39#include <sys/iov.h>
40#include <sys/linker.h>
41#include <sys/malloc.h>
42#include <sys/module.h>
43#include <sys/pciio.h>
44#include <sys/queue.h>
45#include <sys/rman.h>
46#include <sys/sysctl.h>
47
48#include <machine/bus.h>
49
50#include <dev/pci/pcireg.h>
51#include <dev/pci/pcivar.h>
52#include <dev/pci/pci_private.h>
53#include <dev/pci/pci_iov_private.h>
54
55#include "pci_if.h"
56#include "pcib_if.h"
57
58static MALLOC_DEFINE(M_SRIOV, "sr_iov", "PCI SR-IOV allocations");
59
60static d_ioctl_t pci_iov_ioctl;
61
62static struct cdevsw iov_cdevsw = {
63	.d_version = D_VERSION,
64	.d_name = "iov",
65	.d_ioctl = pci_iov_ioctl
66};
67
68#define IOV_READ(d, r, w) \
69	pci_read_config((d)->cfg.dev, (d)->cfg.iov->iov_pos + r, w)
70
71#define IOV_WRITE(d, r, v, w) \
72	pci_write_config((d)->cfg.dev, (d)->cfg.iov->iov_pos + r, v, w)
73
74int
75pci_iov_attach_method(device_t bus, device_t dev)
76{
77	device_t pcib;
78	struct pci_devinfo *dinfo;
79	struct pcicfg_iov *iov;
80	uint32_t version;
81	int error;
82	int iov_pos;
83
84	dinfo = device_get_ivars(dev);
85	pcib = device_get_parent(bus);
86
87	error = pci_find_extcap(dev, PCIZ_SRIOV, &iov_pos);
88
89	if (error != 0)
90		return (error);
91
92	version = pci_read_config(dev, iov_pos, 4);
93	if (PCI_EXTCAP_VER(version) != 1) {
94		if (bootverbose)
95			device_printf(dev,
96			    "Unsupported version of SR-IOV (%d) detected\n",
97			    PCI_EXTCAP_VER(version));
98
99		return (ENXIO);
100	}
101
102	iov = malloc(sizeof(*dinfo->cfg.iov), M_SRIOV, M_WAITOK | M_ZERO);
103
104	mtx_lock(&Giant);
105	if (dinfo->cfg.iov != NULL) {
106		error = EBUSY;
107		goto cleanup;
108	}
109	iov->iov_pos = iov_pos;
110
111	iov->iov_cdev = make_dev(&iov_cdevsw, device_get_unit(dev),
112	    UID_ROOT, GID_WHEEL, 0600, "iov/%s", device_get_nameunit(dev));
113
114	if (iov->iov_cdev == NULL) {
115		error = ENOMEM;
116		goto cleanup;
117	}
118
119	dinfo->cfg.iov = iov;
120	iov->iov_cdev->si_drv1 = dinfo;
121	mtx_unlock(&Giant);
122
123	return (0);
124
125cleanup:
126	free(iov, M_SRIOV);
127	mtx_unlock(&Giant);
128	return (error);
129}
130
131int
132pci_iov_detach_method(device_t bus, device_t dev)
133{
134	struct pci_devinfo *dinfo;
135	struct pcicfg_iov *iov;
136
137	mtx_lock(&Giant);
138	dinfo = device_get_ivars(dev);
139	iov = dinfo->cfg.iov;
140
141	if (iov == NULL) {
142		mtx_unlock(&Giant);
143		return (0);
144	}
145
146	if (iov->iov_num_vfs != 0) {
147		mtx_unlock(&Giant);
148		return (EBUSY);
149	}
150
151	dinfo->cfg.iov = NULL;
152
153	if (iov->iov_cdev) {
154		destroy_dev(iov->iov_cdev);
155		iov->iov_cdev = NULL;
156	}
157
158	free(iov, M_SRIOV);
159	mtx_unlock(&Giant);
160
161	return (0);
162}
163
164static int
165pci_iov_alloc_bar(struct pci_devinfo *dinfo, int bar, pci_addr_t bar_shift)
166{
167	struct resource *res;
168	struct pcicfg_iov *iov;
169	device_t dev, bus;
170	u_long start, end;
171	pci_addr_t bar_size;
172	int rid;
173
174	iov = dinfo->cfg.iov;
175	dev = dinfo->cfg.dev;
176	bus = device_get_parent(dev);
177	rid = iov->iov_pos + PCIR_SRIOV_BAR(bar);
178	bar_size = 1 << bar_shift;
179
180	res = pci_alloc_multi_resource(bus, dev, SYS_RES_MEMORY, &rid, 0ul,
181	    ~0ul, 1, iov->iov_num_vfs, RF_ACTIVE);
182
183	if (res == NULL)
184		return (ENXIO);
185
186	iov->iov_bar[bar].res = res;
187	iov->iov_bar[bar].bar_size = bar_size;
188	iov->iov_bar[bar].bar_shift = bar_shift;
189
190	start = rman_get_start(res);
191	end = rman_get_end(res);
192	return (rman_manage_region(&iov->rman, start, end));
193}
194
195static void
196pci_iov_add_bars(struct pcicfg_iov *iov, struct pci_devinfo *dinfo)
197{
198	struct pci_iov_bar *bar;
199	uint64_t bar_start;
200	int i;
201
202	for (i = 0; i <= PCIR_MAX_BAR_0; i++) {
203		bar = &iov->iov_bar[i];
204		if (bar->res != NULL) {
205			bar_start = rman_get_start(bar->res) +
206			    dinfo->cfg.vf.index * bar->bar_size;
207
208			pci_add_bar(dinfo->cfg.dev, PCIR_BAR(i), bar_start,
209			    bar->bar_shift);
210		}
211	}
212}
213
214/*
215 * Set the ARI_EN bit in the lowest-numbered PCI function with the SR-IOV
216 * capability.  This bit is only writeable on the lowest-numbered PF but
217 * affects all PFs on the device.
218 */
219static int
220pci_iov_set_ari(device_t bus)
221{
222	device_t lowest;
223	device_t *devlist;
224	int i, error, devcount, lowest_func, lowest_pos, iov_pos, dev_func;
225	uint16_t iov_ctl;
226
227	/* If ARI is disabled on the downstream port there is nothing to do. */
228	if (!PCIB_ARI_ENABLED(device_get_parent(bus)))
229		return (0);
230
231	error = device_get_children(bus, &devlist, &devcount);
232
233	if (error != 0)
234		return (error);
235
236	lowest = NULL;
237	for (i = 0; i < devcount; i++) {
238		if (pci_find_extcap(devlist[i], PCIZ_SRIOV, &iov_pos) == 0) {
239			dev_func = pci_get_function(devlist[i]);
240			if (lowest == NULL || dev_func < lowest_func) {
241				lowest = devlist[i];
242				lowest_func = dev_func;
243				lowest_pos = iov_pos;
244			}
245		}
246	}
247
248	/*
249	 * If we called this function some device must have the SR-IOV
250	 * capability.
251	 */
252	KASSERT(lowest != NULL,
253	    ("Could not find child of %s with SR-IOV capability",
254	    device_get_nameunit(bus)));
255
256	iov_ctl = pci_read_config(lowest, iov_pos + PCIR_SRIOV_CTL, 2);
257	iov_ctl |= PCIM_SRIOV_ARI_EN;
258	pci_write_config(lowest, iov_pos + PCIR_SRIOV_CTL, iov_ctl, 2);
259	free(devlist, M_TEMP);
260	return (0);
261}
262
263static int
264pci_iov_config_page_size(struct pci_devinfo *dinfo)
265{
266	uint32_t page_cap, page_size;
267
268	page_cap = IOV_READ(dinfo, PCIR_SRIOV_PAGE_CAP, 4);
269
270	/*
271	 * If the system page size is less than the smallest SR-IOV page size
272	 * then round up to the smallest SR-IOV page size.
273	 */
274	if (PAGE_SHIFT < PCI_SRIOV_BASE_PAGE_SHIFT)
275		page_size = (1 << 0);
276	else
277		page_size = (1 << (PAGE_SHIFT - PCI_SRIOV_BASE_PAGE_SHIFT));
278
279	/* Check that the device supports the system page size. */
280	if (!(page_size & page_cap))
281		return (ENXIO);
282
283	IOV_WRITE(dinfo, PCIR_SRIOV_PAGE_SIZE, page_size, 4);
284	return (0);
285}
286
287static int
288pci_iov_init_rman(device_t pf, struct pcicfg_iov *iov)
289{
290	int error;
291
292	iov->rman.rm_start = 0;
293	iov->rman.rm_end = ~0ul;
294	iov->rman.rm_type = RMAN_ARRAY;
295	snprintf(iov->rman_name, sizeof(iov->rman_name), "%s VF I/O memory",
296	    device_get_nameunit(pf));
297	iov->rman.rm_descr = iov->rman_name;
298
299	error = rman_init(&iov->rman);
300	if (error != 0)
301		return (error);
302
303	iov->iov_flags |= IOV_RMAN_INITED;
304	return (0);
305}
306
307static int
308pci_iov_setup_bars(struct pci_devinfo *dinfo)
309{
310	device_t dev;
311	struct pcicfg_iov *iov;
312	pci_addr_t bar_value, testval;
313	int i, last_64, error;
314
315	iov = dinfo->cfg.iov;
316	dev = dinfo->cfg.dev;
317	last_64 = 0;
318
319	for (i = 0; i <= PCIR_MAX_BAR_0; i++) {
320		/*
321		 * If a PCI BAR is a 64-bit wide BAR, then it spans two
322		 * consecutive registers.  Therefore if the last BAR that
323		 * we looked at was a 64-bit BAR, we need to skip this
324		 * register as it's the second half of the last BAR.
325		 */
326		if (!last_64) {
327			pci_read_bar(dev,
328			    iov->iov_pos + PCIR_SRIOV_BAR(i),
329			    &bar_value, &testval, &last_64);
330
331			if (testval != 0) {
332				error = pci_iov_alloc_bar(dinfo, i,
333				   pci_mapsize(testval));
334				if (error != 0)
335					return (error);
336			}
337		} else
338			last_64 = 0;
339	}
340
341	return (0);
342}
343
344static void
345pci_iov_enumerate_vfs(struct pci_devinfo *dinfo, const char *driver,
346    uint16_t first_rid, uint16_t rid_stride)
347{
348	device_t bus, dev, vf;
349	struct pcicfg_iov *iov;
350	struct pci_devinfo *vfinfo;
351	size_t size;
352	int i, error;
353	uint16_t vid, did, next_rid;
354
355	iov = dinfo->cfg.iov;
356	dev = dinfo->cfg.dev;
357	bus = device_get_parent(dev);
358	size = dinfo->cfg.devinfo_size;
359	next_rid = first_rid;
360	vid = pci_get_vendor(dev);
361	did = IOV_READ(dinfo, PCIR_SRIOV_VF_DID, 2);
362
363	for (i = 0; i < iov->iov_num_vfs; i++, next_rid += rid_stride) {
364
365
366		vf = PCI_CREATE_IOV_CHILD(bus, dev, next_rid, vid, did);
367		if (vf == NULL)
368			break;
369
370		vfinfo = device_get_ivars(vf);
371
372		vfinfo->cfg.iov = iov;
373		vfinfo->cfg.vf.index = i;
374
375		pci_iov_add_bars(iov, vfinfo);
376
377		error = PCI_ADD_VF(dev, i);
378		if (error != 0) {
379			device_printf(dev, "Failed to add VF %d\n", i);
380			pci_delete_child(bus, vf);
381		}
382	}
383
384	bus_generic_attach(bus);
385}
386
387static int
388pci_iov_config(struct cdev *cdev, struct pci_iov_arg *arg)
389{
390	device_t bus, dev;
391	const char *driver;
392	struct pci_devinfo *dinfo;
393	struct pcicfg_iov *iov;
394	int i, error;
395	uint16_t rid_off, rid_stride;
396	uint16_t first_rid, last_rid;
397	uint16_t iov_ctl;
398	uint16_t total_vfs;
399	int iov_inited;
400
401	mtx_lock(&Giant);
402	dinfo = cdev->si_drv1;
403	iov = dinfo->cfg.iov;
404	dev = dinfo->cfg.dev;
405	bus = device_get_parent(dev);
406	iov_inited = 0;
407
408	if (iov->iov_num_vfs != 0) {
409		mtx_unlock(&Giant);
410		return (EBUSY);
411	}
412
413	total_vfs = IOV_READ(dinfo, PCIR_SRIOV_TOTAL_VFS, 2);
414
415	if (arg->num_vfs > total_vfs) {
416		error = EINVAL;
417		goto out;
418	}
419
420	/*
421	 * If we are creating passthrough devices then force the ppt driver to
422	 * attach to prevent a VF driver from claming the VFs.
423	 */
424	if (arg->passthrough)
425		driver = "ppt";
426	else
427		driver = NULL;
428
429	error = pci_iov_config_page_size(dinfo);
430	if (error != 0)
431		goto out;
432
433	error = pci_iov_set_ari(bus);
434	if (error != 0)
435		goto out;
436
437	error = PCI_INIT_IOV(dev, arg->num_vfs);
438
439	if (error != 0)
440		goto out;
441
442	iov_inited = 1;
443	IOV_WRITE(dinfo, PCIR_SRIOV_NUM_VFS, arg->num_vfs, 2);
444
445	rid_off = IOV_READ(dinfo, PCIR_SRIOV_VF_OFF, 2);
446	rid_stride = IOV_READ(dinfo, PCIR_SRIOV_VF_STRIDE, 2);
447
448	first_rid = pci_get_rid(dev) + rid_off;
449	last_rid = first_rid + (arg->num_vfs - 1) * rid_stride;
450
451	/* We don't yet support allocating extra bus numbers for VFs. */
452	if (pci_get_bus(dev) != PCI_RID2BUS(last_rid)) {
453		error = ENOSPC;
454		goto out;
455	}
456
457	iov_ctl = IOV_READ(dinfo, PCIR_SRIOV_CTL, 2);
458	iov_ctl &= ~(PCIM_SRIOV_VF_EN | PCIM_SRIOV_VF_MSE);
459	IOV_WRITE(dinfo, PCIR_SRIOV_CTL, iov_ctl, 2);
460
461	error = pci_iov_init_rman(dev, iov);
462	if (error != 0)
463		goto out;
464
465	iov->iov_num_vfs = arg->num_vfs;
466
467	error = pci_iov_setup_bars(dinfo);
468	if (error != 0)
469		goto out;
470
471	iov_ctl = IOV_READ(dinfo, PCIR_SRIOV_CTL, 2);
472	iov_ctl |= PCIM_SRIOV_VF_EN | PCIM_SRIOV_VF_MSE;
473	IOV_WRITE(dinfo, PCIR_SRIOV_CTL, iov_ctl, 2);
474
475	/* Per specification, we must wait 100ms before accessing VFs. */
476	pause("iov", roundup(hz, 10));
477	pci_iov_enumerate_vfs(dinfo, driver, first_rid, rid_stride);
478	mtx_unlock(&Giant);
479
480	return (0);
481out:
482	if (iov_inited)
483		PCI_UNINIT_IOV(dev);
484
485	for (i = 0; i <= PCIR_MAX_BAR_0; i++) {
486		if (iov->iov_bar[i].res != NULL) {
487			pci_release_resource(bus, dev, SYS_RES_MEMORY,
488			    iov->iov_pos + PCIR_SRIOV_BAR(i),
489			    iov->iov_bar[i].res);
490			pci_delete_resource(bus, dev, SYS_RES_MEMORY,
491			    iov->iov_pos + PCIR_SRIOV_BAR(i));
492			iov->iov_bar[i].res = NULL;
493		}
494	}
495
496	if (iov->iov_flags & IOV_RMAN_INITED) {
497		rman_fini(&iov->rman);
498		iov->iov_flags &= ~IOV_RMAN_INITED;
499	}
500	iov->iov_num_vfs = 0;
501	mtx_unlock(&Giant);
502	return (error);
503}
504
505static int
506pci_iov_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag,
507    struct thread *td)
508{
509
510	switch (cmd) {
511	case IOV_CONFIG:
512		return (pci_iov_config(dev, (struct pci_iov_arg *)data));
513	default:
514		return (EINVAL);
515	}
516}
517
518struct resource *
519pci_vf_alloc_mem_resource(device_t dev, device_t child, int *rid, u_long start,
520    u_long end, u_long count, u_int flags)
521{
522	struct pci_devinfo *dinfo;
523	struct pcicfg_iov *iov;
524	struct pci_map *map;
525	struct resource *res;
526	struct resource_list_entry *rle;
527	u_long bar_start, bar_end;
528	pci_addr_t bar_length;
529	int error;
530
531	dinfo = device_get_ivars(child);
532	iov = dinfo->cfg.iov;
533
534	map = pci_find_bar(child, *rid);
535	if (map == NULL)
536		return (NULL);
537
538	bar_length = 1 << map->pm_size;
539	bar_start = map->pm_value;
540	bar_end = bar_start + bar_length - 1;
541
542	/* Make sure that the resource fits the constraints. */
543	if (bar_start >= end || bar_end <= bar_start || count != 1)
544		return (NULL);
545
546	/* Clamp the resource to the constraints if necessary. */
547	if (bar_start < start)
548		bar_start = start;
549	if (bar_end > end)
550		bar_end = end;
551	bar_length = bar_end - bar_start + 1;
552
553	res = rman_reserve_resource(&iov->rman, bar_start, bar_end,
554	    bar_length, flags, child);
555	if (res == NULL)
556		return (NULL);
557
558	rle = resource_list_add(&dinfo->resources, SYS_RES_MEMORY, *rid,
559	    bar_start, bar_end, 1);
560	if (rle == NULL) {
561		rman_release_resource(res);
562		return (NULL);
563	}
564
565	rman_set_rid(res, *rid);
566
567	if (flags & RF_ACTIVE) {
568		error = bus_activate_resource(child, SYS_RES_MEMORY, *rid, res);
569		if (error != 0) {
570			resource_list_delete(&dinfo->resources, SYS_RES_MEMORY,
571			    *rid);
572			rman_release_resource(res);
573			return (NULL);
574		}
575	}
576	rle->res = res;
577
578	return (res);
579}
580
581int
582pci_vf_release_mem_resource(device_t dev, device_t child, int rid,
583    struct resource *r)
584{
585	struct pci_devinfo *dinfo;
586	struct resource_list_entry *rle;
587	int error;
588
589	dinfo = device_get_ivars(child);
590
591	if (rman_get_flags(r) & RF_ACTIVE) {
592		error = bus_deactivate_resource(child, SYS_RES_MEMORY, rid, r);
593		if (error != 0)
594			return (error);
595	}
596
597	rle = resource_list_find(&dinfo->resources, SYS_RES_MEMORY, rid);
598	if (rle != NULL) {
599		rle->res = NULL;
600		resource_list_delete(&dinfo->resources, SYS_RES_MEMORY,
601		    rid);
602	}
603
604	return (rman_release_resource(r));
605}
606
607