1// SPDX-License-Identifier: GPL-2.0-only
2/*
3 * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
4 *     Author: Alex Williamson <alex.williamson@redhat.com>
5 *
6 * Derived from original vfio:
7 * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
8 * Author: Tom Lyon, pugs@cisco.com
9 */
10
11#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
12
13#include <linux/aperture.h>
14#include <linux/device.h>
15#include <linux/eventfd.h>
16#include <linux/file.h>
17#include <linux/interrupt.h>
18#include <linux/iommu.h>
19#include <linux/module.h>
20#include <linux/mutex.h>
21#include <linux/notifier.h>
22#include <linux/pci.h>
23#include <linux/pm_runtime.h>
24#include <linux/slab.h>
25#include <linux/types.h>
26#include <linux/uaccess.h>
27#include <linux/vgaarb.h>
28#include <linux/nospec.h>
29#include <linux/sched/mm.h>
30#include <linux/iommufd.h>
31#if IS_ENABLED(CONFIG_EEH)
32#include <asm/eeh.h>
33#endif
34
35#include "vfio_pci_priv.h"
36
37#define DRIVER_AUTHOR   "Alex Williamson <alex.williamson@redhat.com>"
38#define DRIVER_DESC "core driver for VFIO based PCI devices"
39
40static bool nointxmask;
41static bool disable_vga;
42static bool disable_idle_d3;
43
/* List of PFs that vfio_pci_core_sriov_configure() has been called on */
45static DEFINE_MUTEX(vfio_pci_sriov_pfs_mutex);
46static LIST_HEAD(vfio_pci_sriov_pfs);
47
48struct vfio_pci_dummy_resource {
49	struct resource		resource;
50	int			index;
51	struct list_head	res_next;
52};
53
54struct vfio_pci_vf_token {
55	struct mutex		lock;
56	uuid_t			uuid;
57	int			users;
58};
59
60struct vfio_pci_mmap_vma {
61	struct vm_area_struct	*vma;
62	struct list_head	vma_next;
63};
64
65static inline bool vfio_vga_disabled(void)
66{
67#ifdef CONFIG_VFIO_PCI_VGA
68	return disable_vga;
69#else
70	return true;
71#endif
72}
73
74/*
75 * Our VGA arbiter participation is limited since we don't know anything
76 * about the device itself.  However, if the device is the only VGA device
77 * downstream of a bridge and VFIO VGA support is disabled, then we can
78 * safely return legacy VGA IO and memory as not decoded since the user
79 * has no way to get to it and routing can be disabled externally at the
80 * bridge.
81 */
82static unsigned int vfio_pci_set_decode(struct pci_dev *pdev, bool single_vga)
83{
84	struct pci_dev *tmp = NULL;
85	unsigned char max_busnr;
86	unsigned int decodes;
87
88	if (single_vga || !vfio_vga_disabled() || pci_is_root_bus(pdev->bus))
89		return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM |
90		       VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM;
91
92	max_busnr = pci_bus_max_busnr(pdev->bus);
93	decodes = VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
94
95	while ((tmp = pci_get_class(PCI_CLASS_DISPLAY_VGA << 8, tmp)) != NULL) {
96		if (tmp == pdev ||
97		    pci_domain_nr(tmp->bus) != pci_domain_nr(pdev->bus) ||
98		    pci_is_root_bus(tmp->bus))
99			continue;
100
101		if (tmp->bus->number >= pdev->bus->number &&
102		    tmp->bus->number <= max_busnr) {
103			pci_dev_put(tmp);
104			decodes |= VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM;
105			break;
106		}
107	}
108
109	return decodes;
110}
111
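/*
 * Determine which standard BARs can be mmap'd.  Only MEM BARs with a
 * non-zero size qualify; BARs smaller than a page are allowed only when
 * they are page aligned and the remainder of the page can be reserved
 * with a dummy resource so no other device can be hot-added into it.
 */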
112static void vfio_pci_probe_mmaps(struct vfio_pci_core_device *vdev)
113{
114	struct resource *res;
115	int i;
116	struct vfio_pci_dummy_resource *dummy_res;
117
118	for (i = 0; i < PCI_STD_NUM_BARS; i++) {
119		int bar = i + PCI_STD_RESOURCES;
120
121		res = &vdev->pdev->resource[bar];
122
123		if (!IS_ENABLED(CONFIG_VFIO_PCI_MMAP))
124			goto no_mmap;
125
126		if (!(res->flags & IORESOURCE_MEM))
127			goto no_mmap;
128
129		/*
130		 * The PCI core shouldn't set up a resource with a
131		 * type but zero size. But there may be bugs that
132		 * cause us to do that.
133		 */
134		if (!resource_size(res))
135			goto no_mmap;
136
137		if (resource_size(res) >= PAGE_SIZE) {
138			vdev->bar_mmap_supported[bar] = true;
139			continue;
140		}
141
142		if (!(res->start & ~PAGE_MASK)) {
			/*
			 * Add a dummy resource to reserve the remainder
			 * of the exclusive page in case a hot-added
			 * device's BAR is assigned into it.
			 */
148			dummy_res =
149				kzalloc(sizeof(*dummy_res), GFP_KERNEL_ACCOUNT);
150			if (dummy_res == NULL)
151				goto no_mmap;
152
153			dummy_res->resource.name = "vfio sub-page reserved";
154			dummy_res->resource.start = res->end + 1;
155			dummy_res->resource.end = res->start + PAGE_SIZE - 1;
156			dummy_res->resource.flags = res->flags;
157			if (request_resource(res->parent,
158						&dummy_res->resource)) {
159				kfree(dummy_res);
160				goto no_mmap;
161			}
162			dummy_res->index = bar;
163			list_add(&dummy_res->res_next,
164					&vdev->dummy_resources_list);
165			vdev->bar_mmap_supported[bar] = true;
166			continue;
167		}
		/*
		 * We don't handle the case where the BAR is not page
		 * aligned because we can't expect the BAR to be
		 * assigned to the same offset within a page in the
		 * guest when the BAR is passed through.  It's also
		 * hard to access such a BAR from userspace because we
		 * have no way to report the BAR's offset within the
		 * page.
		 */
176no_mmap:
177		vdev->bar_mmap_supported[bar] = false;
178	}
179}
180
181struct vfio_pci_group_info;
182static void vfio_pci_dev_set_try_reset(struct vfio_device_set *dev_set);
183static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set,
184				      struct vfio_pci_group_info *groups,
185				      struct iommufd_ctx *iommufd_ctx);
186
/*
 * INTx masking requires the ability to disable INTx signaling via PCI_COMMAND
 * _and_ the ability to detect when the device is asserting INTx via PCI_STATUS.
 * If a device implements the former but not the latter we would typically
 * expect broken_intx_masking to be set and require an exclusive interrupt.
 * However, since we do have control over the device's ability to assert INTx,
 * we can instead pretend that the device does not implement INTx, virtualizing
 * the pin register to report zero and maintaining DisINTx set on the host.
 */
196static bool vfio_pci_nointx(struct pci_dev *pdev)
197{
198	switch (pdev->vendor) {
199	case PCI_VENDOR_ID_INTEL:
200		switch (pdev->device) {
201		/* All i40e (XL710/X710/XXV710) 10/20/25/40GbE NICs */
202		case 0x1572:
203		case 0x1574:
204		case 0x1580 ... 0x1581:
205		case 0x1583 ... 0x158b:
206		case 0x37d0 ... 0x37d2:
207		/* X550 */
208		case 0x1563:
209			return true;
210		default:
211			return false;
212		}
213	}
214
215	return false;
216}
217
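/*
 * Record whether the device advertises No_Soft_Reset in its PM control
 * register.  If not, config state must be saved and restored around
 * D3hot transitions (see vfio_pci_set_power_state()).
 */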
218static void vfio_pci_probe_power_state(struct vfio_pci_core_device *vdev)
219{
220	struct pci_dev *pdev = vdev->pdev;
221	u16 pmcsr;
222
223	if (!pdev->pm_cap)
224		return;
225
226	pci_read_config_word(pdev, pdev->pm_cap + PCI_PM_CTRL, &pmcsr);
227
228	vdev->needs_pm_restore = !(pmcsr & PCI_PM_CTRL_NO_SOFT_RESET);
229}
230
231/*
232 * pci_set_power_state() wrapper handling devices which perform a soft reset on
233 * D3->D0 transition.  Save state prior to D0/1/2->D3, stash it on the vdev,
234 * restore when returned to D0.  Saved separately from pci_saved_state for use
235 * by PM capability emulation and separately from pci_dev internal saved state
236 * to avoid it being overwritten and consumed around other resets.
237 */
238int vfio_pci_set_power_state(struct vfio_pci_core_device *vdev, pci_power_t state)
239{
240	struct pci_dev *pdev = vdev->pdev;
241	bool needs_restore = false, needs_save = false;
242	int ret;
243
244	/* Prevent changing power state for PFs with VFs enabled */
245	if (pci_num_vf(pdev) && state > PCI_D0)
246		return -EBUSY;
247
248	if (vdev->needs_pm_restore) {
249		if (pdev->current_state < PCI_D3hot && state >= PCI_D3hot) {
250			pci_save_state(pdev);
251			needs_save = true;
252		}
253
254		if (pdev->current_state >= PCI_D3hot && state <= PCI_D0)
255			needs_restore = true;
256	}
257
258	ret = pci_set_power_state(pdev, state);
259
260	if (!ret) {
261		/* D3 might be unsupported via quirk, skip unless in D3 */
262		if (needs_save && pdev->current_state >= PCI_D3hot) {
			/*
			 * The current PCI state will be saved locally in
			 * 'pm_save' during the D3hot transition. When the
			 * device state is changed to D0 again with the current
			 * function, then pci_load_and_free_saved_state() will
			 * restore the state and will free the memory pointed
			 * to by 'pm_save'. There are a few cases where the PCI
			 * power state can be changed to D0 without the
			 * involvement of the driver. For these cases, free the
			 * earlier allocated memory first before overwriting
			 * 'pm_save' to prevent a memory leak.
			 */
275			kfree(vdev->pm_save);
276			vdev->pm_save = pci_store_saved_state(pdev);
277		} else if (needs_restore) {
278			pci_load_and_free_saved_state(pdev, &vdev->pm_save);
279			pci_restore_state(pdev);
280		}
281	}
282
283	return ret;
284}
285
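/*
 * Low power (runtime PM) feature entry.  Drop a runtime PM usage count so
 * the device may enter runtime suspend once the ioctl returns.  An optional
 * wakeup eventfd is stashed so vfio_pci_core_runtime_resume() can notify
 * the user if the device exits low power.
 */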
286static int vfio_pci_runtime_pm_entry(struct vfio_pci_core_device *vdev,
287				     struct eventfd_ctx *efdctx)
288{
	/*
	 * The vdev power-related flags are protected with the 'memory_lock'
	 * semaphore.
	 */
293	vfio_pci_zap_and_down_write_memory_lock(vdev);
294	if (vdev->pm_runtime_engaged) {
295		up_write(&vdev->memory_lock);
296		return -EINVAL;
297	}
298
299	vdev->pm_runtime_engaged = true;
300	vdev->pm_wake_eventfd_ctx = efdctx;
301	pm_runtime_put_noidle(&vdev->pdev->dev);
302	up_write(&vdev->memory_lock);
303
304	return 0;
305}
306
307static int vfio_pci_core_pm_entry(struct vfio_device *device, u32 flags,
308				  void __user *arg, size_t argsz)
309{
310	struct vfio_pci_core_device *vdev =
311		container_of(device, struct vfio_pci_core_device, vdev);
312	int ret;
313
314	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_SET, 0);
315	if (ret != 1)
316		return ret;
317
318	/*
319	 * Inside vfio_pci_runtime_pm_entry(), only the runtime PM usage count
320	 * will be decremented. The pm_runtime_put() will be invoked again
321	 * while returning from the ioctl and then the device can go into
322	 * runtime suspended state.
323	 */
324	return vfio_pci_runtime_pm_entry(vdev, NULL);
325}
326
327static int vfio_pci_core_pm_entry_with_wakeup(
328	struct vfio_device *device, u32 flags,
329	struct vfio_device_low_power_entry_with_wakeup __user *arg,
330	size_t argsz)
331{
332	struct vfio_pci_core_device *vdev =
333		container_of(device, struct vfio_pci_core_device, vdev);
334	struct vfio_device_low_power_entry_with_wakeup entry;
335	struct eventfd_ctx *efdctx;
336	int ret;
337
338	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_SET,
339				 sizeof(entry));
340	if (ret != 1)
341		return ret;
342
343	if (copy_from_user(&entry, arg, sizeof(entry)))
344		return -EFAULT;
345
346	if (entry.wakeup_eventfd < 0)
347		return -EINVAL;
348
349	efdctx = eventfd_ctx_fdget(entry.wakeup_eventfd);
350	if (IS_ERR(efdctx))
351		return PTR_ERR(efdctx);
352
353	ret = vfio_pci_runtime_pm_entry(vdev, efdctx);
354	if (ret)
355		eventfd_ctx_put(efdctx);
356
357	return ret;
358}
359
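/*
 * Undo low power feature entry: re-take the runtime PM usage count and
 * release any wakeup eventfd.  Caller must hold memory_lock for write.
 */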
360static void __vfio_pci_runtime_pm_exit(struct vfio_pci_core_device *vdev)
361{
362	if (vdev->pm_runtime_engaged) {
363		vdev->pm_runtime_engaged = false;
364		pm_runtime_get_noresume(&vdev->pdev->dev);
365
366		if (vdev->pm_wake_eventfd_ctx) {
367			eventfd_ctx_put(vdev->pm_wake_eventfd_ctx);
368			vdev->pm_wake_eventfd_ctx = NULL;
369		}
370	}
371}
372
373static void vfio_pci_runtime_pm_exit(struct vfio_pci_core_device *vdev)
374{
	/*
	 * The vdev power-related flags are protected with the 'memory_lock'
	 * semaphore.
	 */
379	down_write(&vdev->memory_lock);
380	__vfio_pci_runtime_pm_exit(vdev);
381	up_write(&vdev->memory_lock);
382}
383
384static int vfio_pci_core_pm_exit(struct vfio_device *device, u32 flags,
385				 void __user *arg, size_t argsz)
386{
387	struct vfio_pci_core_device *vdev =
388		container_of(device, struct vfio_pci_core_device, vdev);
389	int ret;
390
391	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_SET, 0);
392	if (ret != 1)
393		return ret;
394
395	/*
396	 * The device is always in the active state here due to pm wrappers
397	 * around ioctls. If the device had entered a low power state and
398	 * pm_wake_eventfd_ctx is valid, vfio_pci_core_runtime_resume() has
399	 * already signaled the eventfd and exited low power mode itself.
400	 * pm_runtime_engaged protects the redundant call here.
401	 */
402	vfio_pci_runtime_pm_exit(vdev);
403	return 0;
404}
405
406#ifdef CONFIG_PM
407static int vfio_pci_core_runtime_suspend(struct device *dev)
408{
409	struct vfio_pci_core_device *vdev = dev_get_drvdata(dev);
410
411	down_write(&vdev->memory_lock);
412	/*
	 * The user can move the device into the D3hot state before invoking the
	 * power management IOCTL. Move the device into D0 here and then
415	 * the pci-driver core runtime PM suspend function will move the device
416	 * into the low power state. Also, for the devices which have
417	 * NoSoftRst-, it will help in restoring the original state
418	 * (saved locally in 'vdev->pm_save').
419	 */
420	vfio_pci_set_power_state(vdev, PCI_D0);
421	up_write(&vdev->memory_lock);
422
	/*
	 * If INTx is enabled, then mask INTx before going into the runtime
	 * suspended state and unmask it again in runtime resume.
	 * If INTx has already been masked by the user, then
	 * vfio_pci_intx_mask() will return false and in that case, INTx
	 * should not be unmasked in runtime resume.
	 */
429	 */
430	vdev->pm_intx_masked = ((vdev->irq_type == VFIO_PCI_INTX_IRQ_INDEX) &&
431				vfio_pci_intx_mask(vdev));
432
433	return 0;
434}
435
436static int vfio_pci_core_runtime_resume(struct device *dev)
437{
438	struct vfio_pci_core_device *vdev = dev_get_drvdata(dev);
439
	/*
	 * A resume while pm_wake_eventfd_ctx is set signals the eventfd and
	 * exits low power mode.
	 */
444	down_write(&vdev->memory_lock);
445	if (vdev->pm_wake_eventfd_ctx) {
446		eventfd_signal(vdev->pm_wake_eventfd_ctx);
447		__vfio_pci_runtime_pm_exit(vdev);
448	}
449	up_write(&vdev->memory_lock);
450
451	if (vdev->pm_intx_masked)
452		vfio_pci_intx_unmask(vdev);
453
454	return 0;
455}
456#endif /* CONFIG_PM */
457
/*
 * The pci-driver core runtime PM routines always save the device state
 * before going into the suspended state. If the device is going into a low
 * power state with only runtime PM ops, then no explicit handling is needed
 * for the devices which have NoSoftRst-.
 */
464static const struct dev_pm_ops vfio_pci_core_pm_ops = {
465	SET_RUNTIME_PM_OPS(vfio_pci_core_runtime_suspend,
466			   vfio_pci_core_runtime_resume,
467			   NULL)
468};
469
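/*
 * Enable the device for use by the user: wake it from runtime suspend,
 * enable it, probe whether a function reset works, save a pristine config
 * state to restore on disable, apply INTx masking quirks, and cache the
 * MSI-X table geometry.
 */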
470int vfio_pci_core_enable(struct vfio_pci_core_device *vdev)
471{
472	struct pci_dev *pdev = vdev->pdev;
473	int ret;
474	u16 cmd;
475	u8 msix_pos;
476
477	if (!disable_idle_d3) {
478		ret = pm_runtime_resume_and_get(&pdev->dev);
479		if (ret < 0)
480			return ret;
481	}
482
483	/* Don't allow our initial saved state to include busmaster */
484	pci_clear_master(pdev);
485
486	ret = pci_enable_device(pdev);
487	if (ret)
488		goto out_power;
489
490	/* If reset fails because of the device lock, fail this path entirely */
491	ret = pci_try_reset_function(pdev);
492	if (ret == -EAGAIN)
493		goto out_disable_device;
494
495	vdev->reset_works = !ret;
496	pci_save_state(pdev);
497	vdev->pci_saved_state = pci_store_saved_state(pdev);
498	if (!vdev->pci_saved_state)
499		pci_dbg(pdev, "%s: Couldn't store saved state\n", __func__);
500
501	if (likely(!nointxmask)) {
502		if (vfio_pci_nointx(pdev)) {
503			pci_info(pdev, "Masking broken INTx support\n");
504			vdev->nointx = true;
505			pci_intx(pdev, 0);
506		} else
507			vdev->pci_2_3 = pci_intx_mask_supported(pdev);
508	}
509
510	pci_read_config_word(pdev, PCI_COMMAND, &cmd);
511	if (vdev->pci_2_3 && (cmd & PCI_COMMAND_INTX_DISABLE)) {
512		cmd &= ~PCI_COMMAND_INTX_DISABLE;
513		pci_write_config_word(pdev, PCI_COMMAND, cmd);
514	}
515
516	ret = vfio_pci_zdev_open_device(vdev);
517	if (ret)
518		goto out_free_state;
519
520	ret = vfio_config_init(vdev);
521	if (ret)
522		goto out_free_zdev;
523
524	msix_pos = pdev->msix_cap;
525	if (msix_pos) {
526		u16 flags;
527		u32 table;
528
529		pci_read_config_word(pdev, msix_pos + PCI_MSIX_FLAGS, &flags);
530		pci_read_config_dword(pdev, msix_pos + PCI_MSIX_TABLE, &table);
531
532		vdev->msix_bar = table & PCI_MSIX_TABLE_BIR;
533		vdev->msix_offset = table & PCI_MSIX_TABLE_OFFSET;
534		vdev->msix_size = ((flags & PCI_MSIX_FLAGS_QSIZE) + 1) * 16;
535		vdev->has_dyn_msix = pci_msix_can_alloc_dyn(pdev);
536	} else {
537		vdev->msix_bar = 0xFF;
538		vdev->has_dyn_msix = false;
539	}
540
541	if (!vfio_vga_disabled() && vfio_pci_is_vga(pdev))
542		vdev->has_vga = true;
543
545	return 0;
546
547out_free_zdev:
548	vfio_pci_zdev_close_device(vdev);
549out_free_state:
550	kfree(vdev->pci_saved_state);
551	vdev->pci_saved_state = NULL;
552out_disable_device:
553	pci_disable_device(pdev);
554out_power:
555	if (!disable_idle_d3)
556		pm_runtime_put(&pdev->dev);
557	return ret;
558}
559EXPORT_SYMBOL_GPL(vfio_pci_core_enable);
560
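/*
 * Undo vfio_pci_core_enable(): quiesce interrupts and DMA, tear down
 * ioeventfds, regions and sub-page dummy resources, restore or reset the
 * device, and release the runtime PM reference taken at enable.
 */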
561void vfio_pci_core_disable(struct vfio_pci_core_device *vdev)
562{
563	struct pci_dev *pdev = vdev->pdev;
564	struct vfio_pci_dummy_resource *dummy_res, *tmp;
565	struct vfio_pci_ioeventfd *ioeventfd, *ioeventfd_tmp;
566	int i, bar;
567
568	/* For needs_reset */
569	lockdep_assert_held(&vdev->vdev.dev_set->lock);
570
571	/*
572	 * This function can be invoked while the power state is non-D0.
573	 * This non-D0 power state can be with or without runtime PM.
574	 * vfio_pci_runtime_pm_exit() will internally increment the usage
575	 * count corresponding to pm_runtime_put() called during low power
576	 * feature entry and then pm_runtime_resume() will wake up the device,
577	 * if the device has already gone into the suspended state. Otherwise,
578	 * the vfio_pci_set_power_state() will change the device power state
579	 * to D0.
580	 */
581	vfio_pci_runtime_pm_exit(vdev);
582	pm_runtime_resume(&pdev->dev);
583
584	/*
585	 * This function calls __pci_reset_function_locked() which internally
586	 * can use pci_pm_reset() for the function reset. pci_pm_reset() will
587	 * fail if the power state is non-D0. Also, for the devices which
588	 * have NoSoftRst-, the reset function can cause the PCI config space
589	 * reset without restoring the original state (saved locally in
590	 * 'vdev->pm_save').
591	 */
592	vfio_pci_set_power_state(vdev, PCI_D0);
593
594	/* Stop the device from further DMA */
595	pci_clear_master(pdev);
596
597	vfio_pci_set_irqs_ioctl(vdev, VFIO_IRQ_SET_DATA_NONE |
598				VFIO_IRQ_SET_ACTION_TRIGGER,
599				vdev->irq_type, 0, 0, NULL);
600
601	/* Device closed, don't need mutex here */
602	list_for_each_entry_safe(ioeventfd, ioeventfd_tmp,
603				 &vdev->ioeventfds_list, next) {
604		vfio_virqfd_disable(&ioeventfd->virqfd);
605		list_del(&ioeventfd->next);
606		kfree(ioeventfd);
607	}
608	vdev->ioeventfds_nr = 0;
609
610	vdev->virq_disabled = false;
611
612	for (i = 0; i < vdev->num_regions; i++)
613		vdev->region[i].ops->release(vdev, &vdev->region[i]);
614
615	vdev->num_regions = 0;
616	kfree(vdev->region);
617	vdev->region = NULL; /* don't krealloc a freed pointer */
618
619	vfio_config_free(vdev);
620
621	for (i = 0; i < PCI_STD_NUM_BARS; i++) {
622		bar = i + PCI_STD_RESOURCES;
623		if (!vdev->barmap[bar])
624			continue;
625		pci_iounmap(pdev, vdev->barmap[bar]);
626		pci_release_selected_regions(pdev, 1 << bar);
627		vdev->barmap[bar] = NULL;
628	}
629
630	list_for_each_entry_safe(dummy_res, tmp,
631				 &vdev->dummy_resources_list, res_next) {
632		list_del(&dummy_res->res_next);
633		release_resource(&dummy_res->resource);
634		kfree(dummy_res);
635	}
636
637	vdev->needs_reset = true;
638
639	vfio_pci_zdev_close_device(vdev);
640
641	/*
642	 * If we have saved state, restore it.  If we can reset the device,
643	 * even better.  Resetting with current state seems better than
644	 * nothing, but saving and restoring current state without reset
645	 * is just busy work.
646	 */
647	if (pci_load_and_free_saved_state(pdev, &vdev->pci_saved_state)) {
648		pci_info(pdev, "%s: Couldn't reload saved state\n", __func__);
649
650		if (!vdev->reset_works)
651			goto out;
652
653		pci_save_state(pdev);
654	}
655
656	/*
657	 * Disable INTx and MSI, presumably to avoid spurious interrupts
658	 * during reset.  Stolen from pci_reset_function()
659	 */
660	pci_write_config_word(pdev, PCI_COMMAND, PCI_COMMAND_INTX_DISABLE);
661
	/*
	 * Try to get the locks ourselves to prevent a deadlock. The
	 * success of this is dependent on being able to lock the device,
	 * which is not always possible.
	 * We cannot use the "try" reset interface here, as it would
	 * overwrite the previously restored configuration information.
	 */
669	if (vdev->reset_works && pci_dev_trylock(pdev)) {
670		if (!__pci_reset_function_locked(pdev))
671			vdev->needs_reset = false;
672		pci_dev_unlock(pdev);
673	}
674
675	pci_restore_state(pdev);
676out:
677	pci_disable_device(pdev);
678
679	vfio_pci_dev_set_try_reset(vdev->vdev.dev_set);
680
681	/* Put the pm-runtime usage counter acquired during enable */
682	if (!disable_idle_d3)
683		pm_runtime_put(&pdev->dev);
684}
685EXPORT_SYMBOL_GPL(vfio_pci_core_disable);
686
687void vfio_pci_core_close_device(struct vfio_device *core_vdev)
688{
689	struct vfio_pci_core_device *vdev =
690		container_of(core_vdev, struct vfio_pci_core_device, vdev);
691
692	if (vdev->sriov_pf_core_dev) {
693		mutex_lock(&vdev->sriov_pf_core_dev->vf_token->lock);
694		WARN_ON(!vdev->sriov_pf_core_dev->vf_token->users);
695		vdev->sriov_pf_core_dev->vf_token->users--;
696		mutex_unlock(&vdev->sriov_pf_core_dev->vf_token->lock);
697	}
698#if IS_ENABLED(CONFIG_EEH)
699	eeh_dev_release(vdev->pdev);
700#endif
701	vfio_pci_core_disable(vdev);
702
703	mutex_lock(&vdev->igate);
704	if (vdev->err_trigger) {
705		eventfd_ctx_put(vdev->err_trigger);
706		vdev->err_trigger = NULL;
707	}
708	if (vdev->req_trigger) {
709		eventfd_ctx_put(vdev->req_trigger);
710		vdev->req_trigger = NULL;
711	}
712	mutex_unlock(&vdev->igate);
713}
714EXPORT_SYMBOL_GPL(vfio_pci_core_close_device);
715
716void vfio_pci_core_finish_enable(struct vfio_pci_core_device *vdev)
717{
718	vfio_pci_probe_mmaps(vdev);
719#if IS_ENABLED(CONFIG_EEH)
720	eeh_dev_open(vdev->pdev);
721#endif
722
723	if (vdev->sriov_pf_core_dev) {
724		mutex_lock(&vdev->sriov_pf_core_dev->vf_token->lock);
725		vdev->sriov_pf_core_dev->vf_token->users++;
726		mutex_unlock(&vdev->sriov_pf_core_dev->vf_token->lock);
727	}
728}
729EXPORT_SYMBOL_GPL(vfio_pci_core_finish_enable);
730
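/*
 * Number of vectors the user can request for a given IRQ index: INTx
 * reports at most one (and none when INTx is unsupported or hidden), MSI
 * and MSI-X report the sizes advertised by their capabilities, the error
 * index reports one for PCIe devices only, and the request index always
 * reports one.
 */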
731static int vfio_pci_get_irq_count(struct vfio_pci_core_device *vdev, int irq_type)
732{
733	if (irq_type == VFIO_PCI_INTX_IRQ_INDEX) {
734		u8 pin;
735
736		if (!IS_ENABLED(CONFIG_VFIO_PCI_INTX) ||
737		    vdev->nointx || vdev->pdev->is_virtfn)
738			return 0;
739
740		pci_read_config_byte(vdev->pdev, PCI_INTERRUPT_PIN, &pin);
741
742		return pin ? 1 : 0;
743	} else if (irq_type == VFIO_PCI_MSI_IRQ_INDEX) {
744		u8 pos;
745		u16 flags;
746
747		pos = vdev->pdev->msi_cap;
748		if (pos) {
749			pci_read_config_word(vdev->pdev,
750					     pos + PCI_MSI_FLAGS, &flags);
751			return 1 << ((flags & PCI_MSI_FLAGS_QMASK) >> 1);
752		}
753	} else if (irq_type == VFIO_PCI_MSIX_IRQ_INDEX) {
754		u8 pos;
755		u16 flags;
756
757		pos = vdev->pdev->msix_cap;
758		if (pos) {
759			pci_read_config_word(vdev->pdev,
760					     pos + PCI_MSIX_FLAGS, &flags);
761
762			return (flags & PCI_MSIX_FLAGS_QSIZE) + 1;
763		}
764	} else if (irq_type == VFIO_PCI_ERR_IRQ_INDEX) {
765		if (pci_is_pcie(vdev->pdev))
766			return 1;
767	} else if (irq_type == VFIO_PCI_REQ_IRQ_INDEX) {
768		return 1;
769	}
770
771	return 0;
772}
773
774static int vfio_pci_count_devs(struct pci_dev *pdev, void *data)
775{
776	(*(int *)data)++;
777	return 0;
778}
779
780struct vfio_pci_fill_info {
781	struct vfio_pci_dependent_device __user *devices;
782	struct vfio_pci_dependent_device __user *devices_end;
783	struct vfio_device *vdev;
784	u32 count;
785	u32 flags;
786};
787
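/*
 * Bus-walk callback for VFIO_DEVICE_GET_PCI_HOT_RESET_INFO: report each
 * affected device either by iommufd dev-id (cdev flow) or by IOMMU group
 * id (group flow), counting all devices even once the user buffer is full.
 */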
788static int vfio_pci_fill_devs(struct pci_dev *pdev, void *data)
789{
790	struct vfio_pci_dependent_device info = {
791		.segment = pci_domain_nr(pdev->bus),
792		.bus = pdev->bus->number,
793		.devfn = pdev->devfn,
794	};
795	struct vfio_pci_fill_info *fill = data;
796
797	fill->count++;
798	if (fill->devices >= fill->devices_end)
799		return 0;
800
801	if (fill->flags & VFIO_PCI_HOT_RESET_FLAG_DEV_ID) {
802		struct iommufd_ctx *iommufd = vfio_iommufd_device_ictx(fill->vdev);
803		struct vfio_device_set *dev_set = fill->vdev->dev_set;
804		struct vfio_device *vdev;
805
806		/*
807		 * hot-reset requires all affected devices be represented in
808		 * the dev_set.
809		 */
810		vdev = vfio_find_device_in_devset(dev_set, &pdev->dev);
811		if (!vdev) {
812			info.devid = VFIO_PCI_DEVID_NOT_OWNED;
813		} else {
814			int id = vfio_iommufd_get_dev_id(vdev, iommufd);
815
816			if (id > 0)
817				info.devid = id;
818			else if (id == -ENOENT)
819				info.devid = VFIO_PCI_DEVID_OWNED;
820			else
821				info.devid = VFIO_PCI_DEVID_NOT_OWNED;
822		}
823		/* If devid is VFIO_PCI_DEVID_NOT_OWNED, clear owned flag. */
824		if (info.devid == VFIO_PCI_DEVID_NOT_OWNED)
825			fill->flags &= ~VFIO_PCI_HOT_RESET_FLAG_DEV_ID_OWNED;
826	} else {
827		struct iommu_group *iommu_group;
828
829		iommu_group = iommu_group_get(&pdev->dev);
830		if (!iommu_group)
831			return -EPERM; /* Cannot reset non-isolated devices */
832
833		info.group_id = iommu_group_id(iommu_group);
834		iommu_group_put(iommu_group);
835	}
836
837	if (copy_to_user(fill->devices, &info, sizeof(info)))
838		return -EFAULT;
839	fill->devices++;
840	return 0;
841}
842
843struct vfio_pci_group_info {
844	int count;
845	struct file **files;
846};
847
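/* Return true if pdev, or any bridge above it, resides in the given slot. */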
848static bool vfio_pci_dev_below_slot(struct pci_dev *pdev, struct pci_slot *slot)
849{
850	for (; pdev; pdev = pdev->bus->self)
851		if (pdev->bus == slot->bus)
852			return (pdev->slot == slot);
853	return false;
854}
855
856struct vfio_pci_walk_info {
857	int (*fn)(struct pci_dev *pdev, void *data);
858	void *data;
859	struct pci_dev *pdev;
860	bool slot;
861	int ret;
862};
863
864static int vfio_pci_walk_wrapper(struct pci_dev *pdev, void *data)
865{
866	struct vfio_pci_walk_info *walk = data;
867
868	if (!walk->slot || vfio_pci_dev_below_slot(pdev, walk->pdev->slot))
869		walk->ret = walk->fn(pdev, walk->data);
870
871	return walk->ret;
872}
873
874static int vfio_pci_for_each_slot_or_bus(struct pci_dev *pdev,
875					 int (*fn)(struct pci_dev *,
876						   void *data), void *data,
877					 bool slot)
878{
879	struct vfio_pci_walk_info walk = {
880		.fn = fn, .data = data, .pdev = pdev, .slot = slot, .ret = 0,
881	};
882
883	pci_walk_bus(pdev->bus, vfio_pci_walk_wrapper, &walk);
884
885	return walk.ret;
886}
887
888static int msix_mmappable_cap(struct vfio_pci_core_device *vdev,
889			      struct vfio_info_cap *caps)
890{
891	struct vfio_info_cap_header header = {
892		.id = VFIO_REGION_INFO_CAP_MSIX_MAPPABLE,
893		.version = 1
894	};
895
896	return vfio_info_add_capability(caps, &header, sizeof(header));
897}
898
899int vfio_pci_core_register_dev_region(struct vfio_pci_core_device *vdev,
900				      unsigned int type, unsigned int subtype,
901				      const struct vfio_pci_regops *ops,
902				      size_t size, u32 flags, void *data)
903{
904	struct vfio_pci_region *region;
905
906	region = krealloc(vdev->region,
907			  (vdev->num_regions + 1) * sizeof(*region),
908			  GFP_KERNEL_ACCOUNT);
909	if (!region)
910		return -ENOMEM;
911
912	vdev->region = region;
913	vdev->region[vdev->num_regions].type = type;
914	vdev->region[vdev->num_regions].subtype = subtype;
915	vdev->region[vdev->num_regions].ops = ops;
916	vdev->region[vdev->num_regions].size = size;
917	vdev->region[vdev->num_regions].flags = flags;
918	vdev->region[vdev->num_regions].data = data;
919
920	vdev->num_regions++;
921
922	return 0;
923}
924EXPORT_SYMBOL_GPL(vfio_pci_core_register_dev_region);
925
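/*
 * Advertise the AtomicOp completer sizes (32/64/128-bit) that the device
 * supports and that pci_enable_atomic_ops_to_root() confirms are usable,
 * as a VFIO_DEVICE_INFO_CAP_PCI_ATOMIC_COMP capability.  VFs are checked
 * via their physical function.
 */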
926static int vfio_pci_info_atomic_cap(struct vfio_pci_core_device *vdev,
927				    struct vfio_info_cap *caps)
928{
929	struct vfio_device_info_cap_pci_atomic_comp cap = {
930		.header.id = VFIO_DEVICE_INFO_CAP_PCI_ATOMIC_COMP,
931		.header.version = 1
932	};
933	struct pci_dev *pdev = pci_physfn(vdev->pdev);
934	u32 devcap2;
935
936	pcie_capability_read_dword(pdev, PCI_EXP_DEVCAP2, &devcap2);
937
938	if ((devcap2 & PCI_EXP_DEVCAP2_ATOMIC_COMP32) &&
939	    !pci_enable_atomic_ops_to_root(pdev, PCI_EXP_DEVCAP2_ATOMIC_COMP32))
940		cap.flags |= VFIO_PCI_ATOMIC_COMP32;
941
942	if ((devcap2 & PCI_EXP_DEVCAP2_ATOMIC_COMP64) &&
943	    !pci_enable_atomic_ops_to_root(pdev, PCI_EXP_DEVCAP2_ATOMIC_COMP64))
944		cap.flags |= VFIO_PCI_ATOMIC_COMP64;
945
946	if ((devcap2 & PCI_EXP_DEVCAP2_ATOMIC_COMP128) &&
947	    !pci_enable_atomic_ops_to_root(pdev,
948					   PCI_EXP_DEVCAP2_ATOMIC_COMP128))
949		cap.flags |= VFIO_PCI_ATOMIC_COMP128;
950
951	if (!cap.flags)
952		return -ENODEV;
953
954	return vfio_info_add_capability(caps, &cap.header, sizeof(cap));
955}
956
957static int vfio_pci_ioctl_get_info(struct vfio_pci_core_device *vdev,
958				   struct vfio_device_info __user *arg)
959{
960	unsigned long minsz = offsetofend(struct vfio_device_info, num_irqs);
961	struct vfio_device_info info = {};
962	struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
963	int ret;
964
965	if (copy_from_user(&info, arg, minsz))
966		return -EFAULT;
967
968	if (info.argsz < minsz)
969		return -EINVAL;
970
971	minsz = min_t(size_t, info.argsz, sizeof(info));
972
973	info.flags = VFIO_DEVICE_FLAGS_PCI;
974
975	if (vdev->reset_works)
976		info.flags |= VFIO_DEVICE_FLAGS_RESET;
977
978	info.num_regions = VFIO_PCI_NUM_REGIONS + vdev->num_regions;
979	info.num_irqs = VFIO_PCI_NUM_IRQS;
980
981	ret = vfio_pci_info_zdev_add_caps(vdev, &caps);
982	if (ret && ret != -ENODEV) {
983		pci_warn(vdev->pdev,
984			 "Failed to setup zPCI info capabilities\n");
985		return ret;
986	}
987
988	ret = vfio_pci_info_atomic_cap(vdev, &caps);
989	if (ret && ret != -ENODEV) {
990		pci_warn(vdev->pdev,
991			 "Failed to setup AtomicOps info capability\n");
992		return ret;
993	}
994
995	if (caps.size) {
996		info.flags |= VFIO_DEVICE_FLAGS_CAPS;
997		if (info.argsz < sizeof(info) + caps.size) {
998			info.argsz = sizeof(info) + caps.size;
999		} else {
1000			vfio_info_cap_shift(&caps, sizeof(info));
1001			if (copy_to_user(arg + 1, caps.buf, caps.size)) {
1002				kfree(caps.buf);
1003				return -EFAULT;
1004			}
1005			info.cap_offset = sizeof(*arg);
1006		}
1007
1008		kfree(caps.buf);
1009	}
1010
1011	return copy_to_user(arg, &info, minsz) ? -EFAULT : 0;
1012}
1013
1014static int vfio_pci_ioctl_get_region_info(struct vfio_pci_core_device *vdev,
1015					  struct vfio_region_info __user *arg)
1016{
1017	unsigned long minsz = offsetofend(struct vfio_region_info, offset);
1018	struct pci_dev *pdev = vdev->pdev;
1019	struct vfio_region_info info;
1020	struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
1021	int i, ret;
1022
1023	if (copy_from_user(&info, arg, minsz))
1024		return -EFAULT;
1025
1026	if (info.argsz < minsz)
1027		return -EINVAL;
1028
1029	switch (info.index) {
1030	case VFIO_PCI_CONFIG_REGION_INDEX:
1031		info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
1032		info.size = pdev->cfg_size;
1033		info.flags = VFIO_REGION_INFO_FLAG_READ |
1034			     VFIO_REGION_INFO_FLAG_WRITE;
1035		break;
1036	case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX:
1037		info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
1038		info.size = pci_resource_len(pdev, info.index);
1039		if (!info.size) {
1040			info.flags = 0;
1041			break;
1042		}
1043
1044		info.flags = VFIO_REGION_INFO_FLAG_READ |
1045			     VFIO_REGION_INFO_FLAG_WRITE;
1046		if (vdev->bar_mmap_supported[info.index]) {
1047			info.flags |= VFIO_REGION_INFO_FLAG_MMAP;
1048			if (info.index == vdev->msix_bar) {
1049				ret = msix_mmappable_cap(vdev, &caps);
1050				if (ret)
1051					return ret;
1052			}
1053		}
1054
1055		break;
1056	case VFIO_PCI_ROM_REGION_INDEX: {
1057		void __iomem *io;
1058		size_t size;
1059		u16 cmd;
1060
1061		info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
1062		info.flags = 0;
1063
1064		/* Report the BAR size, not the ROM size */
1065		info.size = pci_resource_len(pdev, info.index);
1066		if (!info.size) {
1067			/* Shadow ROMs appear as PCI option ROMs */
1068			if (pdev->resource[PCI_ROM_RESOURCE].flags &
1069			    IORESOURCE_ROM_SHADOW)
1070				info.size = 0x20000;
1071			else
1072				break;
1073		}
1074
1075		/*
1076		 * Is it really there?  Enable memory decode for implicit access
1077		 * in pci_map_rom().
1078		 */
1079		cmd = vfio_pci_memory_lock_and_enable(vdev);
1080		io = pci_map_rom(pdev, &size);
1081		if (io) {
1082			info.flags = VFIO_REGION_INFO_FLAG_READ;
1083			pci_unmap_rom(pdev, io);
1084		} else {
1085			info.size = 0;
1086		}
1087		vfio_pci_memory_unlock_and_restore(vdev, cmd);
1088
1089		break;
1090	}
1091	case VFIO_PCI_VGA_REGION_INDEX:
1092		if (!vdev->has_vga)
1093			return -EINVAL;
1094
1095		info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
1096		info.size = 0xc0000;
1097		info.flags = VFIO_REGION_INFO_FLAG_READ |
1098			     VFIO_REGION_INFO_FLAG_WRITE;
1099
1100		break;
1101	default: {
1102		struct vfio_region_info_cap_type cap_type = {
1103			.header.id = VFIO_REGION_INFO_CAP_TYPE,
1104			.header.version = 1
1105		};
1106
1107		if (info.index >= VFIO_PCI_NUM_REGIONS + vdev->num_regions)
1108			return -EINVAL;
1109		info.index = array_index_nospec(
1110			info.index, VFIO_PCI_NUM_REGIONS + vdev->num_regions);
1111
1112		i = info.index - VFIO_PCI_NUM_REGIONS;
1113
1114		info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
1115		info.size = vdev->region[i].size;
1116		info.flags = vdev->region[i].flags;
1117
1118		cap_type.type = vdev->region[i].type;
1119		cap_type.subtype = vdev->region[i].subtype;
1120
1121		ret = vfio_info_add_capability(&caps, &cap_type.header,
1122					       sizeof(cap_type));
1123		if (ret)
1124			return ret;
1125
1126		if (vdev->region[i].ops->add_capability) {
1127			ret = vdev->region[i].ops->add_capability(
1128				vdev, &vdev->region[i], &caps);
1129			if (ret)
1130				return ret;
1131		}
1132	}
1133	}
1134
1135	if (caps.size) {
1136		info.flags |= VFIO_REGION_INFO_FLAG_CAPS;
1137		if (info.argsz < sizeof(info) + caps.size) {
1138			info.argsz = sizeof(info) + caps.size;
1139			info.cap_offset = 0;
1140		} else {
1141			vfio_info_cap_shift(&caps, sizeof(info));
1142			if (copy_to_user(arg + 1, caps.buf, caps.size)) {
1143				kfree(caps.buf);
1144				return -EFAULT;
1145			}
1146			info.cap_offset = sizeof(*arg);
1147		}
1148
1149		kfree(caps.buf);
1150	}
1151
1152	return copy_to_user(arg, &info, minsz) ? -EFAULT : 0;
1153}
1154
1155static int vfio_pci_ioctl_get_irq_info(struct vfio_pci_core_device *vdev,
1156				       struct vfio_irq_info __user *arg)
1157{
1158	unsigned long minsz = offsetofend(struct vfio_irq_info, count);
1159	struct vfio_irq_info info;
1160
1161	if (copy_from_user(&info, arg, minsz))
1162		return -EFAULT;
1163
1164	if (info.argsz < minsz || info.index >= VFIO_PCI_NUM_IRQS)
1165		return -EINVAL;
1166
1167	switch (info.index) {
1168	case VFIO_PCI_INTX_IRQ_INDEX ... VFIO_PCI_MSIX_IRQ_INDEX:
1169	case VFIO_PCI_REQ_IRQ_INDEX:
1170		break;
1171	case VFIO_PCI_ERR_IRQ_INDEX:
1172		if (pci_is_pcie(vdev->pdev))
1173			break;
1174		fallthrough;
1175	default:
1176		return -EINVAL;
1177	}
1178
1179	info.flags = VFIO_IRQ_INFO_EVENTFD;
1180
1181	info.count = vfio_pci_get_irq_count(vdev, info.index);
1182
1183	if (info.index == VFIO_PCI_INTX_IRQ_INDEX)
1184		info.flags |=
1185			(VFIO_IRQ_INFO_MASKABLE | VFIO_IRQ_INFO_AUTOMASKED);
1186	else if (info.index != VFIO_PCI_MSIX_IRQ_INDEX || !vdev->has_dyn_msix)
1187		info.flags |= VFIO_IRQ_INFO_NORESIZE;
1188
1189	return copy_to_user(arg, &info, minsz) ? -EFAULT : 0;
1190}
1191
1192static int vfio_pci_ioctl_set_irqs(struct vfio_pci_core_device *vdev,
1193				   struct vfio_irq_set __user *arg)
1194{
1195	unsigned long minsz = offsetofend(struct vfio_irq_set, count);
1196	struct vfio_irq_set hdr;
1197	u8 *data = NULL;
1198	int max, ret = 0;
1199	size_t data_size = 0;
1200
1201	if (copy_from_user(&hdr, arg, minsz))
1202		return -EFAULT;
1203
1204	max = vfio_pci_get_irq_count(vdev, hdr.index);
1205
1206	ret = vfio_set_irqs_validate_and_prepare(&hdr, max, VFIO_PCI_NUM_IRQS,
1207						 &data_size);
1208	if (ret)
1209		return ret;
1210
1211	if (data_size) {
1212		data = memdup_user(&arg->data, data_size);
1213		if (IS_ERR(data))
1214			return PTR_ERR(data);
1215	}
1216
1217	mutex_lock(&vdev->igate);
1218
1219	ret = vfio_pci_set_irqs_ioctl(vdev, hdr.flags, hdr.index, hdr.start,
1220				      hdr.count, data);
1221
1222	mutex_unlock(&vdev->igate);
1223	kfree(data);
1224
1225	return ret;
1226}
1227
1228static int vfio_pci_ioctl_reset(struct vfio_pci_core_device *vdev,
1229				void __user *arg)
1230{
1231	int ret;
1232
1233	if (!vdev->reset_works)
1234		return -EINVAL;
1235
1236	vfio_pci_zap_and_down_write_memory_lock(vdev);
1237
1238	/*
1239	 * This function can be invoked while the power state is non-D0. If
1240	 * pci_try_reset_function() has been called while the power state is
1241	 * non-D0, then pci_try_reset_function() will internally set the power
1242	 * state to D0 without vfio driver involvement. For the devices which
1243	 * have NoSoftRst-, the reset function can cause the PCI config space
1244	 * reset without restoring the original state (saved locally in
1245	 * 'vdev->pm_save').
1246	 */
1247	vfio_pci_set_power_state(vdev, PCI_D0);
1248
1249	ret = pci_try_reset_function(vdev->pdev);
1250	up_write(&vdev->memory_lock);
1251
1252	return ret;
1253}
1254
1255static int vfio_pci_ioctl_get_pci_hot_reset_info(
1256	struct vfio_pci_core_device *vdev,
1257	struct vfio_pci_hot_reset_info __user *arg)
1258{
1259	unsigned long minsz =
1260		offsetofend(struct vfio_pci_hot_reset_info, count);
1261	struct vfio_pci_hot_reset_info hdr;
1262	struct vfio_pci_fill_info fill = {};
1263	bool slot = false;
1264	int ret = 0;
1265
1266	if (copy_from_user(&hdr, arg, minsz))
1267		return -EFAULT;
1268
1269	if (hdr.argsz < minsz)
1270		return -EINVAL;
1271
1272	hdr.flags = 0;
1273
1274	/* Can we do a slot or bus reset or neither? */
1275	if (!pci_probe_reset_slot(vdev->pdev->slot))
1276		slot = true;
1277	else if (pci_probe_reset_bus(vdev->pdev->bus))
1278		return -ENODEV;
1279
1280	fill.devices = arg->devices;
1281	fill.devices_end = arg->devices +
1282			   (hdr.argsz - sizeof(hdr)) / sizeof(arg->devices[0]);
1283	fill.vdev = &vdev->vdev;
1284
1285	if (vfio_device_cdev_opened(&vdev->vdev))
1286		fill.flags |= VFIO_PCI_HOT_RESET_FLAG_DEV_ID |
1287			     VFIO_PCI_HOT_RESET_FLAG_DEV_ID_OWNED;
1288
1289	mutex_lock(&vdev->vdev.dev_set->lock);
1290	ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, vfio_pci_fill_devs,
1291					    &fill, slot);
1292	mutex_unlock(&vdev->vdev.dev_set->lock);
1293	if (ret)
1294		return ret;
1295
1296	hdr.count = fill.count;
1297	hdr.flags = fill.flags;
1298	if (copy_to_user(arg, &hdr, minsz))
1299		return -EFAULT;
1300
1301	if (fill.count > fill.devices - arg->devices)
1302		return -ENOSPC;
1303	return 0;
1304}
1305
1306static int
1307vfio_pci_ioctl_pci_hot_reset_groups(struct vfio_pci_core_device *vdev,
1308				    int array_count, bool slot,
1309				    struct vfio_pci_hot_reset __user *arg)
1310{
1311	int32_t *group_fds;
1312	struct file **files;
1313	struct vfio_pci_group_info info;
1314	int file_idx, count = 0, ret = 0;
1315
1316	/*
1317	 * We can't let userspace give us an arbitrarily large buffer to copy,
1318	 * so verify how many we think there could be.  Note groups can have
1319	 * multiple devices so one group per device is the max.
1320	 */
1321	ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, vfio_pci_count_devs,
1322					    &count, slot);
1323	if (ret)
1324		return ret;
1325
1326	if (array_count > count)
1327		return -EINVAL;
1328
1329	group_fds = kcalloc(array_count, sizeof(*group_fds), GFP_KERNEL);
1330	files = kcalloc(array_count, sizeof(*files), GFP_KERNEL);
1331	if (!group_fds || !files) {
1332		kfree(group_fds);
1333		kfree(files);
1334		return -ENOMEM;
1335	}
1336
1337	if (copy_from_user(group_fds, arg->group_fds,
1338			   array_count * sizeof(*group_fds))) {
1339		kfree(group_fds);
1340		kfree(files);
1341		return -EFAULT;
1342	}
1343
1344	/*
1345	 * Get the group file for each fd to ensure the group is held across
1346	 * the reset
1347	 */
1348	for (file_idx = 0; file_idx < array_count; file_idx++) {
1349		struct file *file = fget(group_fds[file_idx]);
1350
1351		if (!file) {
1352			ret = -EBADF;
1353			break;
1354		}
1355
		/* Ensure the FD is a vfio group FD. */
1357		if (!vfio_file_is_group(file)) {
1358			fput(file);
1359			ret = -EINVAL;
1360			break;
1361		}
1362
1363		files[file_idx] = file;
1364	}
1365
1366	kfree(group_fds);
1367
1368	/* release reference to groups on error */
1369	if (ret)
1370		goto hot_reset_release;
1371
1372	info.count = array_count;
1373	info.files = files;
1374
1375	ret = vfio_pci_dev_set_hot_reset(vdev->vdev.dev_set, &info, NULL);
1376
1377hot_reset_release:
1378	for (file_idx--; file_idx >= 0; file_idx--)
1379		fput(files[file_idx]);
1380
1381	kfree(files);
1382	return ret;
1383}
1384
1385static int vfio_pci_ioctl_pci_hot_reset(struct vfio_pci_core_device *vdev,
1386					struct vfio_pci_hot_reset __user *arg)
1387{
1388	unsigned long minsz = offsetofend(struct vfio_pci_hot_reset, count);
1389	struct vfio_pci_hot_reset hdr;
1390	bool slot = false;
1391
1392	if (copy_from_user(&hdr, arg, minsz))
1393		return -EFAULT;
1394
1395	if (hdr.argsz < minsz || hdr.flags)
1396		return -EINVAL;
1397
1398	/* zero-length array is only for cdev opened devices */
1399	if (!!hdr.count == vfio_device_cdev_opened(&vdev->vdev))
1400		return -EINVAL;
1401
1402	/* Can we do a slot or bus reset or neither? */
1403	if (!pci_probe_reset_slot(vdev->pdev->slot))
1404		slot = true;
1405	else if (pci_probe_reset_bus(vdev->pdev->bus))
1406		return -ENODEV;
1407
1408	if (hdr.count)
1409		return vfio_pci_ioctl_pci_hot_reset_groups(vdev, hdr.count, slot, arg);
1410
1411	return vfio_pci_dev_set_hot_reset(vdev->vdev.dev_set, NULL,
1412					  vfio_iommufd_device_ictx(&vdev->vdev));
1413}
1414
1415static int vfio_pci_ioctl_ioeventfd(struct vfio_pci_core_device *vdev,
1416				    struct vfio_device_ioeventfd __user *arg)
1417{
1418	unsigned long minsz = offsetofend(struct vfio_device_ioeventfd, fd);
1419	struct vfio_device_ioeventfd ioeventfd;
1420	int count;
1421
1422	if (copy_from_user(&ioeventfd, arg, minsz))
1423		return -EFAULT;
1424
1425	if (ioeventfd.argsz < minsz)
1426		return -EINVAL;
1427
1428	if (ioeventfd.flags & ~VFIO_DEVICE_IOEVENTFD_SIZE_MASK)
1429		return -EINVAL;
1430
1431	count = ioeventfd.flags & VFIO_DEVICE_IOEVENTFD_SIZE_MASK;
1432
1433	if (hweight8(count) != 1 || ioeventfd.fd < -1)
1434		return -EINVAL;
1435
1436	return vfio_pci_ioeventfd(vdev, ioeventfd.offset, ioeventfd.data, count,
1437				  ioeventfd.fd);
1438}
1439
1440long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd,
1441			 unsigned long arg)
1442{
1443	struct vfio_pci_core_device *vdev =
1444		container_of(core_vdev, struct vfio_pci_core_device, vdev);
1445	void __user *uarg = (void __user *)arg;
1446
1447	switch (cmd) {
1448	case VFIO_DEVICE_GET_INFO:
1449		return vfio_pci_ioctl_get_info(vdev, uarg);
1450	case VFIO_DEVICE_GET_IRQ_INFO:
1451		return vfio_pci_ioctl_get_irq_info(vdev, uarg);
1452	case VFIO_DEVICE_GET_PCI_HOT_RESET_INFO:
1453		return vfio_pci_ioctl_get_pci_hot_reset_info(vdev, uarg);
1454	case VFIO_DEVICE_GET_REGION_INFO:
1455		return vfio_pci_ioctl_get_region_info(vdev, uarg);
1456	case VFIO_DEVICE_IOEVENTFD:
1457		return vfio_pci_ioctl_ioeventfd(vdev, uarg);
1458	case VFIO_DEVICE_PCI_HOT_RESET:
1459		return vfio_pci_ioctl_pci_hot_reset(vdev, uarg);
1460	case VFIO_DEVICE_RESET:
1461		return vfio_pci_ioctl_reset(vdev, uarg);
1462	case VFIO_DEVICE_SET_IRQS:
1463		return vfio_pci_ioctl_set_irqs(vdev, uarg);
1464	default:
1465		return -ENOTTY;
1466	}
1467}
1468EXPORT_SYMBOL_GPL(vfio_pci_core_ioctl);
1469
1470static int vfio_pci_core_feature_token(struct vfio_device *device, u32 flags,
1471				       uuid_t __user *arg, size_t argsz)
1472{
1473	struct vfio_pci_core_device *vdev =
1474		container_of(device, struct vfio_pci_core_device, vdev);
1475	uuid_t uuid;
1476	int ret;
1477
1478	if (!vdev->vf_token)
1479		return -ENOTTY;
1480	/*
1481	 * We do not support GET of the VF Token UUID as this could
1482	 * expose the token of the previous device user.
1483	 */
1484	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_SET,
1485				 sizeof(uuid));
1486	if (ret != 1)
1487		return ret;
1488
1489	if (copy_from_user(&uuid, arg, sizeof(uuid)))
1490		return -EFAULT;
1491
1492	mutex_lock(&vdev->vf_token->lock);
1493	uuid_copy(&vdev->vf_token->uuid, &uuid);
1494	mutex_unlock(&vdev->vf_token->lock);
1495	return 0;
1496}
1497
1498int vfio_pci_core_ioctl_feature(struct vfio_device *device, u32 flags,
1499				void __user *arg, size_t argsz)
1500{
1501	switch (flags & VFIO_DEVICE_FEATURE_MASK) {
1502	case VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY:
1503		return vfio_pci_core_pm_entry(device, flags, arg, argsz);
1504	case VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY_WITH_WAKEUP:
1505		return vfio_pci_core_pm_entry_with_wakeup(device, flags,
1506							  arg, argsz);
1507	case VFIO_DEVICE_FEATURE_LOW_POWER_EXIT:
1508		return vfio_pci_core_pm_exit(device, flags, arg, argsz);
1509	case VFIO_DEVICE_FEATURE_PCI_VF_TOKEN:
1510		return vfio_pci_core_feature_token(device, flags, arg, argsz);
1511	default:
1512		return -ENOTTY;
1513	}
1514}
1515EXPORT_SYMBOL_GPL(vfio_pci_core_ioctl_feature);
1516
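/*
 * Common read/write dispatcher.  Resume the device if runtime suspended,
 * then hand off to the config space, BAR, VGA, or device-specific region
 * handler based on the region index encoded in the file offset.
 */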
1517static ssize_t vfio_pci_rw(struct vfio_pci_core_device *vdev, char __user *buf,
1518			   size_t count, loff_t *ppos, bool iswrite)
1519{
1520	unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
1521	int ret;
1522
1523	if (index >= VFIO_PCI_NUM_REGIONS + vdev->num_regions)
1524		return -EINVAL;
1525
1526	ret = pm_runtime_resume_and_get(&vdev->pdev->dev);
1527	if (ret) {
1528		pci_info_ratelimited(vdev->pdev, "runtime resume failed %d\n",
1529				     ret);
1530		return -EIO;
1531	}
1532
1533	switch (index) {
1534	case VFIO_PCI_CONFIG_REGION_INDEX:
1535		ret = vfio_pci_config_rw(vdev, buf, count, ppos, iswrite);
1536		break;
1537
1538	case VFIO_PCI_ROM_REGION_INDEX:
1539		if (iswrite)
1540			ret = -EINVAL;
1541		else
1542			ret = vfio_pci_bar_rw(vdev, buf, count, ppos, false);
1543		break;
1544
1545	case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX:
1546		ret = vfio_pci_bar_rw(vdev, buf, count, ppos, iswrite);
1547		break;
1548
1549	case VFIO_PCI_VGA_REGION_INDEX:
1550		ret = vfio_pci_vga_rw(vdev, buf, count, ppos, iswrite);
1551		break;
1552
1553	default:
1554		index -= VFIO_PCI_NUM_REGIONS;
1555		ret = vdev->region[index].ops->rw(vdev, buf,
1556						   count, ppos, iswrite);
1557		break;
1558	}
1559
1560	pm_runtime_put(&vdev->pdev->dev);
1561	return ret;
1562}
1563
1564ssize_t vfio_pci_core_read(struct vfio_device *core_vdev, char __user *buf,
1565		size_t count, loff_t *ppos)
1566{
1567	struct vfio_pci_core_device *vdev =
1568		container_of(core_vdev, struct vfio_pci_core_device, vdev);
1569
1570	if (!count)
1571		return 0;
1572
1573	return vfio_pci_rw(vdev, buf, count, ppos, false);
1574}
1575EXPORT_SYMBOL_GPL(vfio_pci_core_read);
1576
1577ssize_t vfio_pci_core_write(struct vfio_device *core_vdev, const char __user *buf,
1578		size_t count, loff_t *ppos)
1579{
1580	struct vfio_pci_core_device *vdev =
1581		container_of(core_vdev, struct vfio_pci_core_device, vdev);
1582
1583	if (!count)
1584		return 0;
1585
1586	return vfio_pci_rw(vdev, (char __user *)buf, count, ppos, true);
1587}
1588EXPORT_SYMBOL_GPL(vfio_pci_core_write);
1589
1590/* Return 1 on zap and vma_lock acquired, 0 on contention (only with @try) */
1591static int vfio_pci_zap_and_vma_lock(struct vfio_pci_core_device *vdev, bool try)
1592{
1593	struct vfio_pci_mmap_vma *mmap_vma, *tmp;
1594
1595	/*
1596	 * Lock ordering:
1597	 * vma_lock is nested under mmap_lock for vm_ops callback paths.
1598	 * The memory_lock semaphore is used by both code paths calling
1599	 * into this function to zap vmas and the vm_ops.fault callback
1600	 * to protect the memory enable state of the device.
1601	 *
1602	 * When zapping vmas we need to maintain the mmap_lock => vma_lock
1603	 * ordering, which requires using vma_lock to walk vma_list to
1604	 * acquire an mm, then dropping vma_lock to get the mmap_lock and
1605	 * reacquiring vma_lock.  This logic is derived from similar
1606	 * requirements in uverbs_user_mmap_disassociate().
1607	 *
1608	 * mmap_lock must always be the top-level lock when it is taken.
1609	 * Therefore we can only hold the memory_lock write lock when
1610	 * vma_list is empty, as we'd need to take mmap_lock to clear
1611	 * entries.  vma_list can only be guaranteed empty when holding
1612	 * vma_lock, thus memory_lock is nested under vma_lock.
1613	 *
1614	 * This enables the vm_ops.fault callback to acquire vma_lock,
1615	 * followed by memory_lock read lock, while already holding
1616	 * mmap_lock without risk of deadlock.
1617	 */
1618	while (1) {
1619		struct mm_struct *mm = NULL;
1620
1621		if (try) {
1622			if (!mutex_trylock(&vdev->vma_lock))
1623				return 0;
1624		} else {
1625			mutex_lock(&vdev->vma_lock);
1626		}
1627		while (!list_empty(&vdev->vma_list)) {
1628			mmap_vma = list_first_entry(&vdev->vma_list,
1629						    struct vfio_pci_mmap_vma,
1630						    vma_next);
1631			mm = mmap_vma->vma->vm_mm;
1632			if (mmget_not_zero(mm))
1633				break;
1634
1635			list_del(&mmap_vma->vma_next);
1636			kfree(mmap_vma);
1637			mm = NULL;
1638		}
1639		if (!mm)
1640			return 1;
1641		mutex_unlock(&vdev->vma_lock);
1642
1643		if (try) {
1644			if (!mmap_read_trylock(mm)) {
1645				mmput(mm);
1646				return 0;
1647			}
1648		} else {
1649			mmap_read_lock(mm);
1650		}
1651		if (try) {
1652			if (!mutex_trylock(&vdev->vma_lock)) {
1653				mmap_read_unlock(mm);
1654				mmput(mm);
1655				return 0;
1656			}
1657		} else {
1658			mutex_lock(&vdev->vma_lock);
1659		}
1660		list_for_each_entry_safe(mmap_vma, tmp,
1661					 &vdev->vma_list, vma_next) {
1662			struct vm_area_struct *vma = mmap_vma->vma;
1663
1664			if (vma->vm_mm != mm)
1665				continue;
1666
1667			list_del(&mmap_vma->vma_next);
1668			kfree(mmap_vma);
1669
1670			zap_vma_ptes(vma, vma->vm_start,
1671				     vma->vm_end - vma->vm_start);
1672		}
1673		mutex_unlock(&vdev->vma_lock);
1674		mmap_read_unlock(mm);
1675		mmput(mm);
1676	}
1677}
1678
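/*
 * Zap all user mappings and block new faults by taking memory_lock for
 * write; vma_lock, held on return from the zap, can then be dropped.
 */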
1679void vfio_pci_zap_and_down_write_memory_lock(struct vfio_pci_core_device *vdev)
1680{
1681	vfio_pci_zap_and_vma_lock(vdev, false);
1682	down_write(&vdev->memory_lock);
1683	mutex_unlock(&vdev->vma_lock);
1684}
1685
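/*
 * Acquire memory_lock for write and make sure memory decode is enabled for
 * the duration of the critical section.  Returns the original command
 * register value for vfio_pci_memory_unlock_and_restore().
 */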
1686u16 vfio_pci_memory_lock_and_enable(struct vfio_pci_core_device *vdev)
1687{
1688	u16 cmd;
1689
1690	down_write(&vdev->memory_lock);
1691	pci_read_config_word(vdev->pdev, PCI_COMMAND, &cmd);
1692	if (!(cmd & PCI_COMMAND_MEMORY))
1693		pci_write_config_word(vdev->pdev, PCI_COMMAND,
1694				      cmd | PCI_COMMAND_MEMORY);
1695
1696	return cmd;
1697}
1698
1699void vfio_pci_memory_unlock_and_restore(struct vfio_pci_core_device *vdev, u16 cmd)
1700{
1701	pci_write_config_word(vdev->pdev, PCI_COMMAND, cmd);
1702	up_write(&vdev->memory_lock);
1703}
1704
1705/* Caller holds vma_lock */
1706static int __vfio_pci_add_vma(struct vfio_pci_core_device *vdev,
1707			      struct vm_area_struct *vma)
1708{
1709	struct vfio_pci_mmap_vma *mmap_vma;
1710
1711	mmap_vma = kmalloc(sizeof(*mmap_vma), GFP_KERNEL_ACCOUNT);
1712	if (!mmap_vma)
1713		return -ENOMEM;
1714
1715	mmap_vma->vma = vma;
1716	list_add(&mmap_vma->vma_next, &vdev->vma_list);
1717
1718	return 0;
1719}
1720
1721/*
1722 * Zap mmaps on open so that we can fault them in on access and therefore
1723 * our vma_list only tracks mappings accessed since last zap.
1724 */
1725static void vfio_pci_mmap_open(struct vm_area_struct *vma)
1726{
1727	zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start);
1728}
1729
1730static void vfio_pci_mmap_close(struct vm_area_struct *vma)
1731{
1732	struct vfio_pci_core_device *vdev = vma->vm_private_data;
1733	struct vfio_pci_mmap_vma *mmap_vma;
1734
1735	mutex_lock(&vdev->vma_lock);
1736	list_for_each_entry(mmap_vma, &vdev->vma_list, vma_next) {
1737		if (mmap_vma->vma == vma) {
1738			list_del(&mmap_vma->vma_next);
1739			kfree(mmap_vma);
1740			break;
1741		}
1742	}
1743	mutex_unlock(&vdev->vma_lock);
1744}
1745
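/*
 * Fault handler: with vma_lock and memory_lock (read) held, populate the
 * entire BAR mapping on first access and track the vma so it can be
 * zapped when memory access must be blocked.
 */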
1746static vm_fault_t vfio_pci_mmap_fault(struct vm_fault *vmf)
1747{
1748	struct vm_area_struct *vma = vmf->vma;
1749	struct vfio_pci_core_device *vdev = vma->vm_private_data;
1750	struct vfio_pci_mmap_vma *mmap_vma;
1751	vm_fault_t ret = VM_FAULT_NOPAGE;
1752
1753	mutex_lock(&vdev->vma_lock);
1754	down_read(&vdev->memory_lock);
1755
1756	/*
1757	 * Memory region cannot be accessed if the low power feature is engaged
1758	 * or memory access is disabled.
1759	 */
1760	if (vdev->pm_runtime_engaged || !__vfio_pci_memory_enabled(vdev)) {
1761		ret = VM_FAULT_SIGBUS;
1762		goto up_out;
1763	}
1764
1765	/*
1766	 * We populate the whole vma on fault, so we need to test whether
1767	 * the vma has already been mapped, such as for concurrent faults
1768	 * to the same vma.  io_remap_pfn_range() will trigger a BUG_ON if
1769	 * we ask it to fill the same range again.
1770	 */
1771	list_for_each_entry(mmap_vma, &vdev->vma_list, vma_next) {
1772		if (mmap_vma->vma == vma)
1773			goto up_out;
1774	}
1775
1776	if (io_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
1777			       vma->vm_end - vma->vm_start,
1778			       vma->vm_page_prot)) {
1779		ret = VM_FAULT_SIGBUS;
1780		zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start);
1781		goto up_out;
1782	}
1783
1784	if (__vfio_pci_add_vma(vdev, vma)) {
1785		ret = VM_FAULT_OOM;
1786		zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start);
1787	}
1788
1789up_out:
1790	up_read(&vdev->memory_lock);
1791	mutex_unlock(&vdev->vma_lock);
1792	return ret;
1793}
1794
1795static const struct vm_operations_struct vfio_pci_mmap_ops = {
1796	.open = vfio_pci_mmap_open,
1797	.close = vfio_pci_mmap_close,
1798	.fault = vfio_pci_mmap_fault,
1799};
1800
1801int vfio_pci_core_mmap(struct vfio_device *core_vdev, struct vm_area_struct *vma)
1802{
1803	struct vfio_pci_core_device *vdev =
1804		container_of(core_vdev, struct vfio_pci_core_device, vdev);
1805	struct pci_dev *pdev = vdev->pdev;
1806	unsigned int index;
1807	u64 phys_len, req_len, pgoff, req_start;
1808	int ret;
1809
1810	index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT);
1811
1812	if (index >= VFIO_PCI_NUM_REGIONS + vdev->num_regions)
1813		return -EINVAL;
1814	if (vma->vm_end < vma->vm_start)
1815		return -EINVAL;
1816	if ((vma->vm_flags & VM_SHARED) == 0)
1817		return -EINVAL;
1818	if (index >= VFIO_PCI_NUM_REGIONS) {
1819		int regnum = index - VFIO_PCI_NUM_REGIONS;
1820		struct vfio_pci_region *region = vdev->region + regnum;
1821
1822		if (region->ops && region->ops->mmap &&
1823		    (region->flags & VFIO_REGION_INFO_FLAG_MMAP))
1824			return region->ops->mmap(vdev, region, vma);
1825		return -EINVAL;
1826	}
1827	if (index >= VFIO_PCI_ROM_REGION_INDEX)
1828		return -EINVAL;
1829	if (!vdev->bar_mmap_supported[index])
1830		return -EINVAL;
1831
1832	phys_len = PAGE_ALIGN(pci_resource_len(pdev, index));
1833	req_len = vma->vm_end - vma->vm_start;
1834	pgoff = vma->vm_pgoff &
1835		((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
1836	req_start = pgoff << PAGE_SHIFT;
1837
1838	if (req_start + req_len > phys_len)
1839		return -EINVAL;
1840
1841	/*
1842	 * Even though we don't make use of the barmap for the mmap,
1843	 * we need to request the region and the barmap tracks that.
1844	 */
1845	if (!vdev->barmap[index]) {
1846		ret = pci_request_selected_regions(pdev,
1847						   1 << index, "vfio-pci");
1848		if (ret)
1849			return ret;
1850
1851		vdev->barmap[index] = pci_iomap(pdev, index, 0);
1852		if (!vdev->barmap[index]) {
1853			pci_release_selected_regions(pdev, 1 << index);
1854			return -ENOMEM;
1855		}
1856	}
1857
1858	vma->vm_private_data = vdev;
1859	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
1860	vma->vm_pgoff = (pci_resource_start(pdev, index) >> PAGE_SHIFT) + pgoff;
1861
1862	/*
	 * See remap_pfn_range(), called from vfio_pci_mmap_fault(), but we can't
1864	 * change vm_flags within the fault handler.  Set them now.
1865	 *
1866	 * VM_ALLOW_ANY_UNCACHED: The VMA flag is implemented for ARM64,
1867	 * allowing KVM stage 2 device mapping attributes to use Normal-NC
1868	 * rather than DEVICE_nGnRE, which allows guest mappings
1869	 * supporting write-combining attributes (WC). ARM does not
1870	 * architecturally guarantee this is safe, and indeed some MMIO
1871	 * regions like the GICv2 VCPU interface can trigger uncontained
1872	 * faults if Normal-NC is used.
1873	 *
1874	 * To safely use VFIO in KVM the platform must guarantee full
1875	 * safety in the guest where no action taken against a MMIO
1876	 * mapping can trigger an uncontained failure. The assumption is
1877	 * that most VFIO PCI platforms support this for both mapping types,
1878	 * at least in common flows, based on some expectations of how
1879	 * PCI IP is integrated. Hence VM_ALLOW_ANY_UNCACHED is set in
1880	 * the VMA flags.
1881	 */
1882	vm_flags_set(vma, VM_ALLOW_ANY_UNCACHED | VM_IO | VM_PFNMAP |
1883			VM_DONTEXPAND | VM_DONTDUMP);
1884	vma->vm_ops = &vfio_pci_mmap_ops;
1885
1886	return 0;
1887}
1888EXPORT_SYMBOL_GPL(vfio_pci_core_mmap);
1889
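/*
 * Relay a kernel-initiated request to release the device (e.g. an impending
 * unbind or hot-unplug) to userspace through the eventfd registered via
 * VFIO_PCI_REQ_IRQ_INDEX, if any.  Without such an eventfd the caller remains
 * blocked until the user closes the device, hence the warning on the first
 * (count == 0) attempt.
 */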
1890void vfio_pci_core_request(struct vfio_device *core_vdev, unsigned int count)
1891{
1892	struct vfio_pci_core_device *vdev =
1893		container_of(core_vdev, struct vfio_pci_core_device, vdev);
1894	struct pci_dev *pdev = vdev->pdev;
1895
1896	mutex_lock(&vdev->igate);
1897
1898	if (vdev->req_trigger) {
1899		if (!(count % 10))
1900			pci_notice_ratelimited(pdev,
1901				"Relaying device request to user (#%u)\n",
1902				count);
1903		eventfd_signal(vdev->req_trigger);
1904	} else if (count == 0) {
1905		pci_warn(pdev,
1906			"No device request channel registered, blocked until released by user\n");
1907	}
1908
1909	mutex_unlock(&vdev->igate);
1910}
1911EXPORT_SYMBOL_GPL(vfio_pci_core_request);
1912
1913static int vfio_pci_validate_vf_token(struct vfio_pci_core_device *vdev,
1914				      bool vf_token, uuid_t *uuid)
1915{
1916	/*
1917	 * There's always some degree of trust or collaboration between SR-IOV
1918	 * PF and VFs, even if just that the PF hosts the SR-IOV capability and
1919	 * can disrupt VFs with a reset, but often the PF has more explicit
1920	 * access to deny service to the VF or access data passed through the
1921	 * VF.  We therefore require an opt-in via a shared VF token (UUID) to
1922	 * represent this trust.  This both prevents a VF driver from assuming
1923	 * the PF driver is a trusted, in-kernel driver, and prevents a PF
1924	 * driver from being replaced with a rogue driver unknown to in-use
1925	 * VF drivers.
1926	 *
1927	 * Therefore when presented with a VF, if the PF is a vfio device and
1928	 * it is bound to the vfio-pci driver, the user needs to provide a VF
1929	 * token to access the device, in the form of appending a vf_token to
1930	 * the device name, for example:
1931	 *
1932	 * "0000:04:10.0 vf_token=bd8d9d2b-5a5f-4f5a-a211-f591514ba1f3"
1933	 *
1934	 * When presented with a PF which has VFs in use, the user must also
1935	 * provide the current VF token to prove collaboration with existing
1936	 * VF users.  If VFs are not in use, the VF token provided for the PF
1937	 * device will act to set the VF token.
1938	 *
1939	 * If the VF token is provided but unused, an error is generated.
1940	 */
1941	if (vdev->pdev->is_virtfn) {
1942		struct vfio_pci_core_device *pf_vdev = vdev->sriov_pf_core_dev;
1943		bool match;
1944
1945		if (!pf_vdev) {
1946			if (!vf_token)
1947				return 0; /* PF is not vfio-pci, no VF token */
1948
1949			pci_info_ratelimited(vdev->pdev,
1950				"VF token incorrectly provided, PF not bound to vfio-pci\n");
1951			return -EINVAL;
1952		}
1953
1954		if (!vf_token) {
1955			pci_info_ratelimited(vdev->pdev,
1956				"VF token required to access device\n");
1957			return -EACCES;
1958		}
1959
1960		mutex_lock(&pf_vdev->vf_token->lock);
1961		match = uuid_equal(uuid, &pf_vdev->vf_token->uuid);
1962		mutex_unlock(&pf_vdev->vf_token->lock);
1963
1964		if (!match) {
1965			pci_info_ratelimited(vdev->pdev,
1966				"Incorrect VF token provided for device\n");
1967			return -EACCES;
1968		}
1969	} else if (vdev->vf_token) {
1970		mutex_lock(&vdev->vf_token->lock);
1971		if (vdev->vf_token->users) {
1972			if (!vf_token) {
1973				mutex_unlock(&vdev->vf_token->lock);
1974				pci_info_ratelimited(vdev->pdev,
1975					"VF token required to access device\n");
1976				return -EACCES;
1977			}
1978
1979			if (!uuid_equal(uuid, &vdev->vf_token->uuid)) {
1980				mutex_unlock(&vdev->vf_token->lock);
1981				pci_info_ratelimited(vdev->pdev,
1982					"Incorrect VF token provided for device\n");
1983				return -EACCES;
1984			}
1985		} else if (vf_token) {
1986			uuid_copy(&vdev->vf_token->uuid, uuid);
1987		}
1988
1989		mutex_unlock(&vdev->vf_token->lock);
1990	} else if (vf_token) {
1991		pci_info_ratelimited(vdev->pdev,
1992			"VF token incorrectly provided, not a PF or VF\n");
1993		return -EINVAL;
1994	}
1995
1996	return 0;
1997}
1998
1999#define VF_TOKEN_ARG "vf_token="
2000
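/*
 * Match a user-supplied device name, optionally carrying a VF token, against
 * this device.  Returns 1 on match, 0 on no match, or a negative errno when
 * the name matches but the "vf_token=<UUID>" option is malformed, duplicated
 * or rejected by vfio_pci_validate_vf_token() above.
 */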
2001int vfio_pci_core_match(struct vfio_device *core_vdev, char *buf)
2002{
2003	struct vfio_pci_core_device *vdev =
2004		container_of(core_vdev, struct vfio_pci_core_device, vdev);
2005	bool vf_token = false;
2006	uuid_t uuid;
2007	int ret;
2008
2009	if (strncmp(pci_name(vdev->pdev), buf, strlen(pci_name(vdev->pdev))))
2010		return 0; /* No match */
2011
2012	if (strlen(buf) > strlen(pci_name(vdev->pdev))) {
2013		buf += strlen(pci_name(vdev->pdev));
2014
2015		if (*buf != ' ')
2016			return 0; /* No match: non-whitespace after name */
2017
2018		while (*buf) {
2019			if (*buf == ' ') {
2020				buf++;
2021				continue;
2022			}
2023
2024			if (!vf_token && !strncmp(buf, VF_TOKEN_ARG,
2025						  strlen(VF_TOKEN_ARG))) {
2026				buf += strlen(VF_TOKEN_ARG);
2027
2028				if (strlen(buf) < UUID_STRING_LEN)
2029					return -EINVAL;
2030
2031				ret = uuid_parse(buf, &uuid);
2032				if (ret)
2033					return ret;
2034
2035				vf_token = true;
2036				buf += UUID_STRING_LEN;
2037			} else {
2038				/* Unknown/duplicate option */
2039				return -EINVAL;
2040			}
2041		}
2042	}
2043
2044	ret = vfio_pci_validate_vf_token(vdev, vf_token, &uuid);
2045	if (ret)
2046		return ret;
2047
2048	return 1; /* Match */
2049}
2050EXPORT_SYMBOL_GPL(vfio_pci_core_match);
2051
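/*
 * Bus notifier used while this device is an SR-IOV PF.  When one of its VFs
 * is added, set the VF's driver_override to the PF's vfio driver name so the
 * VF cannot silently bind to a host driver; if a VF nevertheless binds to a
 * different driver, warn about the mismatch.
 */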
2052static int vfio_pci_bus_notifier(struct notifier_block *nb,
2053				 unsigned long action, void *data)
2054{
2055	struct vfio_pci_core_device *vdev = container_of(nb,
2056						    struct vfio_pci_core_device, nb);
2057	struct device *dev = data;
2058	struct pci_dev *pdev = to_pci_dev(dev);
2059	struct pci_dev *physfn = pci_physfn(pdev);
2060
2061	if (action == BUS_NOTIFY_ADD_DEVICE &&
2062	    pdev->is_virtfn && physfn == vdev->pdev) {
2063		pci_info(vdev->pdev, "Captured SR-IOV VF %s driver_override\n",
2064			 pci_name(pdev));
2065		pdev->driver_override = kasprintf(GFP_KERNEL, "%s",
2066						  vdev->vdev.ops->name);
2067		WARN_ON(!pdev->driver_override);
2068	} else if (action == BUS_NOTIFY_BOUND_DRIVER &&
2069		   pdev->is_virtfn && physfn == vdev->pdev) {
2070		struct pci_driver *drv = pci_dev_driver(pdev);
2071
2072		if (drv && drv != pci_dev_driver(vdev->pdev))
2073			pci_warn(vdev->pdev,
2074				 "VF %s bound to driver %s while PF bound to driver %s\n",
2075				 pci_name(pdev), drv->name,
2076				 pci_dev_driver(vdev->pdev)->name);
2077	}
2078
2079	return 0;
2080}
2081
2082static int vfio_pci_vf_init(struct vfio_pci_core_device *vdev)
2083{
2084	struct pci_dev *pdev = vdev->pdev;
2085	struct vfio_pci_core_device *cur;
2086	struct pci_dev *physfn;
2087	int ret;
2088
2089	if (pdev->is_virtfn) {
2090		/*
2091		 * If this VF was created by our vfio_pci_core_sriov_configure()
2092		 * then we can find the PF vfio_pci_core_device now, and due to
2093		 * the locking in pci_disable_sriov() it cannot change until
2094		 * this VF device driver is removed.
2095		 */
2096		physfn = pci_physfn(vdev->pdev);
2097		mutex_lock(&vfio_pci_sriov_pfs_mutex);
2098		list_for_each_entry(cur, &vfio_pci_sriov_pfs, sriov_pfs_item) {
2099			if (cur->pdev == physfn) {
2100				vdev->sriov_pf_core_dev = cur;
2101				break;
2102			}
2103		}
2104		mutex_unlock(&vfio_pci_sriov_pfs_mutex);
2105		return 0;
2106	}
2107
2108	/* Not an SR-IOV PF */
2109	if (!pdev->is_physfn)
2110		return 0;
2111
2112	vdev->vf_token = kzalloc(sizeof(*vdev->vf_token), GFP_KERNEL);
2113	if (!vdev->vf_token)
2114		return -ENOMEM;
2115
2116	mutex_init(&vdev->vf_token->lock);
2117	uuid_gen(&vdev->vf_token->uuid);
2118
2119	vdev->nb.notifier_call = vfio_pci_bus_notifier;
2120	ret = bus_register_notifier(&pci_bus_type, &vdev->nb);
2121	if (ret) {
2122		kfree(vdev->vf_token);
2123		return ret;
2124	}
2125	return 0;
2126}
2127
2128static void vfio_pci_vf_uninit(struct vfio_pci_core_device *vdev)
2129{
2130	if (!vdev->vf_token)
2131		return;
2132
2133	bus_unregister_notifier(&pci_bus_type, &vdev->nb);
2134	WARN_ON(vdev->vf_token->users);
2135	mutex_destroy(&vdev->vf_token->lock);
2136	kfree(vdev->vf_token);
2137}
2138
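/*
 * For VGA-class devices, remove conflicting generic aperture/framebuffer
 * drivers and register with the VGA arbiter so that legacy VGA decoding can
 * be tracked and, when VFIO VGA support is disabled, released entirely (see
 * vfio_pci_set_decode()).
 */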
2139static int vfio_pci_vga_init(struct vfio_pci_core_device *vdev)
2140{
2141	struct pci_dev *pdev = vdev->pdev;
2142	int ret;
2143
2144	if (!vfio_pci_is_vga(pdev))
2145		return 0;
2146
2147	ret = aperture_remove_conflicting_pci_devices(pdev, vdev->vdev.ops->name);
2148	if (ret)
2149		return ret;
2150
2151	ret = vga_client_register(pdev, vfio_pci_set_decode);
2152	if (ret)
2153		return ret;
2154	vga_set_legacy_decoding(pdev, vfio_pci_set_decode(pdev, false));
2155	return 0;
2156}
2157
2158static void vfio_pci_vga_uninit(struct vfio_pci_core_device *vdev)
2159{
2160	struct pci_dev *pdev = vdev->pdev;
2161
2162	if (!vfio_pci_is_vga(pdev))
2163		return;
2164	vga_client_unregister(pdev);
2165	vga_set_legacy_decoding(pdev, VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM |
2166					      VGA_RSRC_LEGACY_IO |
2167					      VGA_RSRC_LEGACY_MEM);
2168}
2169
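/*
 * vfio_pci_core_init_dev() and vfio_pci_core_release_dev() are the default
 * .init/.release callbacks for vfio-pci based drivers.  A variant driver
 * typically wires them into its vfio_device_ops roughly as sketched below;
 * everything that is not an exported vfio_pci_core_* symbol is a placeholder:
 *
 *	static const struct vfio_device_ops my_vfio_pci_ops = {
 *		.name		= "my-vfio-pci",
 *		.init		= vfio_pci_core_init_dev,
 *		.release	= vfio_pci_core_release_dev,
 *		.open_device	= my_open_device,
 *		.close_device	= vfio_pci_core_close_device,
 *		.mmap		= vfio_pci_core_mmap,
 *		.request	= vfio_pci_core_request,
 *		.match		= vfio_pci_core_match,
 *	};
 */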
2170int vfio_pci_core_init_dev(struct vfio_device *core_vdev)
2171{
2172	struct vfio_pci_core_device *vdev =
2173		container_of(core_vdev, struct vfio_pci_core_device, vdev);
2174
2175	vdev->pdev = to_pci_dev(core_vdev->dev);
2176	vdev->irq_type = VFIO_PCI_NUM_IRQS;
2177	mutex_init(&vdev->igate);
2178	spin_lock_init(&vdev->irqlock);
2179	mutex_init(&vdev->ioeventfds_lock);
2180	INIT_LIST_HEAD(&vdev->dummy_resources_list);
2181	INIT_LIST_HEAD(&vdev->ioeventfds_list);
2182	mutex_init(&vdev->vma_lock);
2183	INIT_LIST_HEAD(&vdev->vma_list);
2184	INIT_LIST_HEAD(&vdev->sriov_pfs_item);
2185	init_rwsem(&vdev->memory_lock);
2186	xa_init(&vdev->ctx);
2187
2188	return 0;
2189}
2190EXPORT_SYMBOL_GPL(vfio_pci_core_init_dev);
2191
2192void vfio_pci_core_release_dev(struct vfio_device *core_vdev)
2193{
2194	struct vfio_pci_core_device *vdev =
2195		container_of(core_vdev, struct vfio_pci_core_device, vdev);
2196
2197	mutex_destroy(&vdev->igate);
2198	mutex_destroy(&vdev->ioeventfds_lock);
2199	mutex_destroy(&vdev->vma_lock);
2200	kfree(vdev->region);
2201	kfree(vdev->pm_save);
2202}
2203EXPORT_SYMBOL_GPL(vfio_pci_core_release_dev);
2204
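/*
 * Final step of a vfio-pci based driver's probe path.  Verifies that the
 * caller stored the vfio_pci_core_device as drvdata, that the device is a
 * normal (type 0) function and that any migration/dirty-logging ops are
 * complete, rejects PFs that already have VFs enabled, assigns the device to
 * a reset dev_set scoped by slot or bus reset support, and enables runtime PM
 * before exposing the device through vfio_register_group_dev().
 */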
2205int vfio_pci_core_register_device(struct vfio_pci_core_device *vdev)
2206{
2207	struct pci_dev *pdev = vdev->pdev;
2208	struct device *dev = &pdev->dev;
2209	int ret;
2210
2211	/* Drivers must set the vfio_pci_core_device to their drvdata */
2212	if (WARN_ON(vdev != dev_get_drvdata(dev)))
2213		return -EINVAL;
2214
2215	if (pdev->hdr_type != PCI_HEADER_TYPE_NORMAL)
2216		return -EINVAL;
2217
2218	if (vdev->vdev.mig_ops) {
2219		if (!(vdev->vdev.mig_ops->migration_get_state &&
2220		      vdev->vdev.mig_ops->migration_set_state &&
2221		      vdev->vdev.mig_ops->migration_get_data_size) ||
2222		    !(vdev->vdev.migration_flags & VFIO_MIGRATION_STOP_COPY))
2223			return -EINVAL;
2224	}
2225
2226	if (vdev->vdev.log_ops && !(vdev->vdev.log_ops->log_start &&
2227	    vdev->vdev.log_ops->log_stop &&
2228	    vdev->vdev.log_ops->log_read_and_clear))
2229		return -EINVAL;
2230
2231	/*
2232	 * Prevent binding to PFs with VFs enabled; the VFs might be in use
2233	 * by the host or other users.  We cannot capture the VFs if they
2234	 * already exist, nor can we track VF users.  Disabling SR-IOV here
2235	 * would initiate removing the VFs, which would unbind the driver,
2236	 * which is prone to blocking if that VF is also in use by vfio-pci.
2237	 * Just reject these PFs and let the user sort it out.
2238	 */
2239	if (pci_num_vf(pdev)) {
2240		pci_warn(pdev, "Cannot bind to PF with SR-IOV enabled\n");
2241		return -EBUSY;
2242	}
2243
2244	if (pci_is_root_bus(pdev->bus)) {
2245		ret = vfio_assign_device_set(&vdev->vdev, vdev);
2246	} else if (!pci_probe_reset_slot(pdev->slot)) {
2247		ret = vfio_assign_device_set(&vdev->vdev, pdev->slot);
2248	} else {
2249		/*
2250		 * If there is no slot reset support for this device, the whole
2251		 * bus needs to be grouped together to support bus-wide resets.
2252		 */
2253		ret = vfio_assign_device_set(&vdev->vdev, pdev->bus);
2254	}
2255
2256	if (ret)
2257		return ret;
2258	ret = vfio_pci_vf_init(vdev);
2259	if (ret)
2260		return ret;
2261	ret = vfio_pci_vga_init(vdev);
2262	if (ret)
2263		goto out_vf;
2264
2265	vfio_pci_probe_power_state(vdev);
2266
2267	/*
2268	 * pci-core sets the device power state to an unknown value at
2269	 * bootup and after being removed from a driver.  The only
2270	 * transition it allows from this unknown state is to D0, which
2271	 * typically happens when a driver calls pci_enable_device().
2272	 * We're not ready to enable the device yet, but we do want to
2273	 * be able to get to D3.  Therefore first do a D0 transition
2274	 * before enabling runtime PM.
2275	 */
2276	vfio_pci_set_power_state(vdev, PCI_D0);
2277
2278	dev->driver->pm = &vfio_pci_core_pm_ops;
2279	pm_runtime_allow(dev);
2280	if (!disable_idle_d3)
2281		pm_runtime_put(dev);
2282
2283	ret = vfio_register_group_dev(&vdev->vdev);
2284	if (ret)
2285		goto out_power;
2286	return 0;
2287
2288out_power:
2289	if (!disable_idle_d3)
2290		pm_runtime_get_noresume(dev);
2291
2292	pm_runtime_forbid(dev);
2293out_vf:
2294	vfio_pci_vf_uninit(vdev);
2295	return ret;
2296}
2297EXPORT_SYMBOL_GPL(vfio_pci_core_register_device);
2298
2299void vfio_pci_core_unregister_device(struct vfio_pci_core_device *vdev)
2300{
2301	vfio_pci_core_sriov_configure(vdev, 0);
2302
2303	vfio_unregister_group_dev(&vdev->vdev);
2304
2305	vfio_pci_vf_uninit(vdev);
2306	vfio_pci_vga_uninit(vdev);
2307
2308	if (!disable_idle_d3)
2309		pm_runtime_get_noresume(&vdev->pdev->dev);
2310
2311	pm_runtime_forbid(&vdev->pdev->dev);
2312}
2313EXPORT_SYMBOL_GPL(vfio_pci_core_unregister_device);
2314
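/*
 * AER .error_detected callback: forward the event to userspace through the
 * eventfd registered via VFIO_PCI_ERR_IRQ_INDEX, if any, and report that the
 * device can recover; actual recovery is left to the user driver.
 */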
2315pci_ers_result_t vfio_pci_core_aer_err_detected(struct pci_dev *pdev,
2316						pci_channel_state_t state)
2317{
2318	struct vfio_pci_core_device *vdev = dev_get_drvdata(&pdev->dev);
2319
2320	mutex_lock(&vdev->igate);
2321
2322	if (vdev->err_trigger)
2323		eventfd_signal(vdev->err_trigger);
2324
2325	mutex_unlock(&vdev->igate);
2326
2327	return PCI_ERS_RESULT_CAN_RECOVER;
2328}
2329EXPORT_SYMBOL_GPL(vfio_pci_core_aer_err_detected);
2330
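/*
 * Backs the pci_driver .sriov_configure callback, reached e.g. through the
 * sriov_numvfs sysfs attribute.  Per the PCI core convention this returns the
 * number of VFs enabled on success, 0 once VFs are disabled, or a negative
 * errno.  Enabling SR-IOV resumes the PF to D0 first and is serialized
 * against userspace PCI_PM_CTRL writes by memory_lock.
 */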
2331int vfio_pci_core_sriov_configure(struct vfio_pci_core_device *vdev,
2332				  int nr_virtfn)
2333{
2334	struct pci_dev *pdev = vdev->pdev;
2335	int ret = 0;
2336
2337	device_lock_assert(&pdev->dev);
2338
2339	if (nr_virtfn) {
2340		mutex_lock(&vfio_pci_sriov_pfs_mutex);
2341		/*
2342		 * The thread that adds the vdev to the list is the only thread
2343		 * that gets to call pci_enable_sriov() and we will only allow
2344		 * it to be called once without going through
2345		 * pci_disable_sriov()
2346		 */
2347		if (!list_empty(&vdev->sriov_pfs_item)) {
2348			ret = -EINVAL;
2349			goto out_unlock;
2350		}
2351		list_add_tail(&vdev->sriov_pfs_item, &vfio_pci_sriov_pfs);
2352		mutex_unlock(&vfio_pci_sriov_pfs_mutex);
2353
2354		/*
2355		 * The PF power state should always be higher than the VF power
2356		 * state. The PF can be in low power state either with runtime
2357		 * power management (when there is no user) or PCI_PM_CTRL
2358		 * register write by the user. If PF is in the low power state,
2359		 * then change the power state to D0 first before enabling
2360		 * SR-IOV. Also, this function can be called at any time, and
2361		 * userspace PCI_PM_CTRL write can race against this code path,
2362		 * so protect the same with 'memory_lock'.
2363		 */
2364		ret = pm_runtime_resume_and_get(&pdev->dev);
2365		if (ret)
2366			goto out_del;
2367
2368		down_write(&vdev->memory_lock);
2369		vfio_pci_set_power_state(vdev, PCI_D0);
2370		ret = pci_enable_sriov(pdev, nr_virtfn);
2371		up_write(&vdev->memory_lock);
2372		if (ret) {
2373			pm_runtime_put(&pdev->dev);
2374			goto out_del;
2375		}
2376		return nr_virtfn;
2377	}
2378
2379	if (pci_num_vf(pdev)) {
2380		pci_disable_sriov(pdev);
2381		pm_runtime_put(&pdev->dev);
2382	}
2383
2384out_del:
2385	mutex_lock(&vfio_pci_sriov_pfs_mutex);
2386	list_del_init(&vdev->sriov_pfs_item);
2387out_unlock:
2388	mutex_unlock(&vfio_pci_sriov_pfs_mutex);
2389	return ret;
2390}
2391EXPORT_SYMBOL_GPL(vfio_pci_core_sriov_configure);
2392
2393const struct pci_error_handlers vfio_pci_core_err_handlers = {
2394	.error_detected = vfio_pci_core_aer_err_detected,
2395};
2396EXPORT_SYMBOL_GPL(vfio_pci_core_err_handlers);
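/*
 * A variant driver typically hooks these handlers (and the SR-IOV helper
 * above) into its pci_driver roughly as sketched below; everything that is
 * not an exported vfio_pci_core_* symbol is a placeholder:
 *
 *	static struct pci_driver my_vfio_pci_driver = {
 *		.name			= "my-vfio-pci",
 *		.id_table		= my_pci_table,
 *		.probe			= my_probe,
 *		.remove			= my_remove,
 *		.sriov_configure	= my_sriov_configure,
 *		.err_handlers		= &vfio_pci_core_err_handlers,
 *		.driver_managed_dma	= true,
 *	};
 */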
2397
2398static bool vfio_dev_in_groups(struct vfio_device *vdev,
2399			       struct vfio_pci_group_info *groups)
2400{
2401	unsigned int i;
2402
2403	if (!groups)
2404		return false;
2405
2406	for (i = 0; i < groups->count; i++)
2407		if (vfio_file_has_dev(groups->files[i], vdev))
2408			return true;
2409	return false;
2410}
2411
2412static int vfio_pci_is_device_in_set(struct pci_dev *pdev, void *data)
2413{
2414	struct vfio_device_set *dev_set = data;
2415
2416	return vfio_find_device_in_devset(dev_set, &pdev->dev) ? 0 : -ENODEV;
2417}
2418
2419/*
2420 * vfio-core considers a group to be viable and will create a vfio_device even
2421 * if some devices are bound to drivers like pci-stub or pcieport. Here we
2422 * require all PCI devices to be inside our dev_set since that ensures they stay
2423 * put and that every driver controlling the device can co-ordinate with the
2424 * device reset.
2425 *
2426 * Returns the pci_dev to pass to pci_reset_bus() if every PCI device to be
2427 * reset is inside the dev_set, and pci_reset_bus() can succeed. NULL otherwise.
2428 */
2429static struct pci_dev *
2430vfio_pci_dev_set_resettable(struct vfio_device_set *dev_set)
2431{
2432	struct pci_dev *pdev;
2433
2434	lockdep_assert_held(&dev_set->lock);
2435
2436	/*
2437	 * By definition all PCI devices in the dev_set share the same PCI
2438	 * reset, so any pci_dev will have the same outcomes for
2439	 * pci_probe_reset_*() and pci_reset_bus().
2440	 */
2441	pdev = list_first_entry(&dev_set->device_list,
2442				struct vfio_pci_core_device,
2443				vdev.dev_set_list)->pdev;
2444
2445	/* pci_reset_bus() needs either slot or bus reset support to work */
2446	if (pci_probe_reset_slot(pdev->slot) && pci_probe_reset_bus(pdev->bus))
2447		return NULL;
2448
2449	if (vfio_pci_for_each_slot_or_bus(pdev, vfio_pci_is_device_in_set,
2450					  dev_set,
2451					  !pci_probe_reset_slot(pdev->slot)))
2452		return NULL;
2453	return pdev;
2454}
2455
2456static int vfio_pci_dev_set_pm_runtime_get(struct vfio_device_set *dev_set)
2457{
2458	struct vfio_pci_core_device *cur;
2459	int ret;
2460
2461	list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list) {
2462		ret = pm_runtime_resume_and_get(&cur->pdev->dev);
2463		if (ret)
2464			goto unwind;
2465	}
2466
2467	return 0;
2468
2469unwind:
2470	list_for_each_entry_continue_reverse(cur, &dev_set->device_list,
2471					     vdev.dev_set_list)
2472		pm_runtime_put(&cur->pdev->dev);
2473
2474	return ret;
2475}
2476
2477/*
2478 * We need to get memory_lock for each device, but devices can share mmap_lock,
2479 * therefore we need to zap and hold the vma_lock for each device, and only then
2480 * get each memory_lock.
2481 */
2482static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set,
2483				      struct vfio_pci_group_info *groups,
2484				      struct iommufd_ctx *iommufd_ctx)
2485{
2486	struct vfio_pci_core_device *cur_mem;
2487	struct vfio_pci_core_device *cur_vma;
2488	struct vfio_pci_core_device *cur;
2489	struct pci_dev *pdev;
2490	bool is_mem = true;
2491	int ret;
2492
2493	mutex_lock(&dev_set->lock);
2494	cur_mem = list_first_entry(&dev_set->device_list,
2495				   struct vfio_pci_core_device,
2496				   vdev.dev_set_list);
2497
2498	pdev = vfio_pci_dev_set_resettable(dev_set);
2499	if (!pdev) {
2500		ret = -EINVAL;
2501		goto err_unlock;
2502	}
2503
2504	/*
2505	 * Some of the devices in the dev_set can be in the runtime suspended
2506	 * state. Increment the usage count for all the devices in the dev_set
2507	 * before reset and decrement the same after reset.
2508	 */
2509	ret = vfio_pci_dev_set_pm_runtime_get(dev_set);
2510	if (ret)
2511		goto err_unlock;
2512
2513	list_for_each_entry(cur_vma, &dev_set->device_list, vdev.dev_set_list) {
2514		bool owned;
2515
2516		/*
2517		 * Test whether all the affected devices can be reset by the
2518		 * user.
2519		 *
2520		 * If called from a group opened device and the user provides
2521		 * a set of groups, all the devices in the dev_set should be
2522		 * contained by the set of groups provided by the user.
2523		 *
2524		 * If called from a cdev opened device and the user provides
2525		 * a zero-length array, all the devices in the dev_set must
2526		 * be bound to the same iommufd_ctx as the input iommufd_ctx.
2527		 * If there is any device that has not been bound to any
2528		 * iommufd_ctx yet, check if its iommu_group has any device
2529		 * bound to the input iommufd_ctx.  Such devices can be
2530		 * considered owned by the input iommufd_ctx as the device
2531		 * cannot be owned by another iommufd_ctx when its iommu_group
2532		 * is owned.
2533		 *
2534		 * Otherwise, reset is not allowed.
2535		 */
2536		if (iommufd_ctx) {
2537			int devid = vfio_iommufd_get_dev_id(&cur_vma->vdev,
2538							    iommufd_ctx);
2539
2540			owned = (devid > 0 || devid == -ENOENT);
2541		} else {
2542			owned = vfio_dev_in_groups(&cur_vma->vdev, groups);
2543		}
2544
2545		if (!owned) {
2546			ret = -EINVAL;
2547			goto err_undo;
2548		}
2549
2550		/*
2551		 * Locking multiple devices is prone to deadlock, so only
2552		 * trylock here; run away and unwind if we hit contention.
2553		 */
2554		if (!vfio_pci_zap_and_vma_lock(cur_vma, true)) {
2555			ret = -EBUSY;
2556			goto err_undo;
2557		}
2558	}
2559	cur_vma = NULL;
2560
2561	list_for_each_entry(cur_mem, &dev_set->device_list, vdev.dev_set_list) {
2562		if (!down_write_trylock(&cur_mem->memory_lock)) {
2563			ret = -EBUSY;
2564			goto err_undo;
2565		}
2566		mutex_unlock(&cur_mem->vma_lock);
2567	}
2568	cur_mem = NULL;
2569
2570	/*
2571	 * pci_reset_bus() will reset all the devices on the bus.
2572	 * The power state can be non-D0 for some of the devices on the bus.
2573	 * For these devices, pci_reset_bus() will internally set
2574	 * the power state to D0 without vfio driver involvement.
2575	 * For devices which have NoSoftRst-, the reset function can
2576	 * cause a PCI config space reset without restoring the original
2577	 * state (saved locally in 'vdev->pm_save').
2578	 */
2579	list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list)
2580		vfio_pci_set_power_state(cur, PCI_D0);
2581
2582	ret = pci_reset_bus(pdev);
2583
2584err_undo:
2585	list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list) {
2586		if (cur == cur_mem)
2587			is_mem = false;
2588		if (cur == cur_vma)
2589			break;
2590		if (is_mem)
2591			up_write(&cur->memory_lock);
2592		else
2593			mutex_unlock(&cur->vma_lock);
2594	}
2595
2596	list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list)
2597		pm_runtime_put(&cur->pdev->dev);
2598err_unlock:
2599	mutex_unlock(&dev_set->lock);
2600	return ret;
2601}
2602
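/*
 * A deferred bus/slot reset is only worthwhile if no other device in the set
 * is open and at least one device has flagged needs_reset, e.g. because no
 * function-level reset was available when it was released.
 */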
2603static bool vfio_pci_dev_set_needs_reset(struct vfio_device_set *dev_set)
2604{
2605	struct vfio_pci_core_device *cur;
2606	bool needs_reset = false;
2607
2608	/* No other VFIO device in the set can be open. */
2609	if (vfio_device_set_open_count(dev_set) > 1)
2610		return false;
2611
2612	list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list)
2613		needs_reset |= cur->needs_reset;
2614	return needs_reset;
2615}
2616
2617/*
2618 * If a bus or slot reset is available for the provided dev_set and:
2619 *  - All of the devices affected by that bus or slot reset are unused
2620 *  - At least one of the affected devices is marked dirty via
2621 *    needs_reset (such as by lack of FLR support)
2622 * Then attempt to perform that bus or slot reset.
2623 */
2624static void vfio_pci_dev_set_try_reset(struct vfio_device_set *dev_set)
2625{
2626	struct vfio_pci_core_device *cur;
2627	struct pci_dev *pdev;
2628	bool reset_done = false;
2629
2630	if (!vfio_pci_dev_set_needs_reset(dev_set))
2631		return;
2632
2633	pdev = vfio_pci_dev_set_resettable(dev_set);
2634	if (!pdev)
2635		return;
2636
2637	/*
2638	 * Some of the devices on the bus can be in the runtime suspended
2639	 * state. Increment the usage count for all the devices in the dev_set
2640	 * before reset and decrement the same after reset.
2641	 */
2642	if (!disable_idle_d3 && vfio_pci_dev_set_pm_runtime_get(dev_set))
2643		return;
2644
2645	if (!pci_reset_bus(pdev))
2646		reset_done = true;
2647
2648	list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list) {
2649		if (reset_done)
2650			cur->needs_reset = false;
2651
2652		if (!disable_idle_d3)
2653			pm_runtime_put(&cur->pdev->dev);
2654	}
2655}
2656
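/*
 * Mirror the module parameters of the loading driver (e.g. vfio-pci's
 * nointxmask, disable_vga and disable_idle_d3 options) into this core module
 * so that all vfio-pci based drivers observe the same behaviour.
 */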
2657void vfio_pci_core_set_params(bool is_nointxmask, bool is_disable_vga,
2658			      bool is_disable_idle_d3)
2659{
2660	nointxmask = is_nointxmask;
2661	disable_vga = is_disable_vga;
2662	disable_idle_d3 = is_disable_idle_d3;
2663}
2664EXPORT_SYMBOL_GPL(vfio_pci_core_set_params);
2665
2666static void vfio_pci_core_cleanup(void)
2667{
2668	vfio_pci_uninit_perm_bits();
2669}
2670
2671static int __init vfio_pci_core_init(void)
2672{
2673	/* Allocate shared config space permission data used by all devices */
2674	return vfio_pci_init_perm_bits();
2675}
2676
2677module_init(vfio_pci_core_init);
2678module_exit(vfio_pci_core_cleanup);
2679
2680MODULE_LICENSE("GPL v2");
2681MODULE_AUTHOR(DRIVER_AUTHOR);
2682MODULE_DESCRIPTION(DRIVER_DESC);
2683