1// SPDX-License-Identifier: GPL-2.0-only
2/*
3 * VFIO core
4 *
5 * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
6 *     Author: Alex Williamson <alex.williamson@redhat.com>
7 *
8 * Derived from original vfio:
9 * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
10 * Author: Tom Lyon, pugs@cisco.com
11 */
12
13#include <linux/cdev.h>
14#include <linux/compat.h>
15#include <linux/device.h>
16#include <linux/fs.h>
17#include <linux/idr.h>
18#include <linux/iommu.h>
19#if IS_ENABLED(CONFIG_KVM)
20#include <linux/kvm_host.h>
21#endif
22#include <linux/list.h>
23#include <linux/miscdevice.h>
24#include <linux/module.h>
25#include <linux/mutex.h>
26#include <linux/pci.h>
27#include <linux/rwsem.h>
28#include <linux/sched.h>
29#include <linux/slab.h>
30#include <linux/stat.h>
31#include <linux/string.h>
32#include <linux/uaccess.h>
33#include <linux/vfio.h>
34#include <linux/wait.h>
35#include <linux/sched/signal.h>
36#include <linux/pm_runtime.h>
37#include <linux/interval_tree.h>
38#include <linux/iova_bitmap.h>
39#include <linux/iommufd.h>
40#include "vfio.h"
41
42#define DRIVER_VERSION	"0.3"
43#define DRIVER_AUTHOR	"Alex Williamson <alex.williamson@redhat.com>"
44#define DRIVER_DESC	"VFIO - User Level meta-driver"
45
/*
 * Global vfio state: the class used for vfio device nodes and the ida
 * handing out the per-device "vfio%d" index.
 */
static struct vfio {
	struct class			*device_class;
	struct ida			device_ida;
} vfio;

#ifdef CONFIG_VFIO_NOIOMMU
/* Opt-in knob for running without IOMMU isolation; taints the kernel. */
bool vfio_noiommu __read_mostly;
module_param_named(enable_unsafe_noiommu_mode,
		   vfio_noiommu, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(enable_unsafe_noiommu_mode, "Enable UNSAFE, no-IOMMU mode.  This mode provides no device isolation, no DMA translation, no host kernel protection, cannot be used for device assignment to virtual machines, requires RAWIO permissions, and will taint the kernel.  If you do not know what this is for, step away. (default: false)");
#endif

/* Maps set_id (cast to unsigned long) -> struct vfio_device_set */
static DEFINE_XARRAY(vfio_device_set_xa);
59
/*
 * vfio_assign_device_set - join @device to the vfio_device_set keyed by
 * @set_id, creating the set on first use.
 *
 * Returns 0 on success or a negative errno.  On success device->dev_set
 * is valid and the device is on the set's device_list.
 */
int vfio_assign_device_set(struct vfio_device *device, void *set_id)
{
	unsigned long idx = (unsigned long)set_id;
	struct vfio_device_set *new_dev_set;
	struct vfio_device_set *dev_set;

	if (WARN_ON(!set_id))
		return -EINVAL;

	/*
	 * Atomically acquire a singleton object in the xarray for this set_id
	 */
	xa_lock(&vfio_device_set_xa);
	dev_set = xa_load(&vfio_device_set_xa, idx);
	if (dev_set)
		goto found_get_ref;
	xa_unlock(&vfio_device_set_xa);

	/* Allocate outside the xa_lock, then re-race for the slot below */
	new_dev_set = kzalloc(sizeof(*new_dev_set), GFP_KERNEL);
	if (!new_dev_set)
		return -ENOMEM;
	mutex_init(&new_dev_set->lock);
	INIT_LIST_HEAD(&new_dev_set->device_list);
	new_dev_set->set_id = set_id;

	xa_lock(&vfio_device_set_xa);
	/* NULL result means our allocation won the slot */
	dev_set = __xa_cmpxchg(&vfio_device_set_xa, idx, NULL, new_dev_set,
			       GFP_KERNEL);
	if (!dev_set) {
		dev_set = new_dev_set;
		goto found_get_ref;
	}

	/* Lost the race (or xarray error): discard our unused allocation */
	kfree(new_dev_set);
	if (xa_is_err(dev_set)) {
		xa_unlock(&vfio_device_set_xa);
		return xa_err(dev_set);
	}

found_get_ref:
	/* device_count is protected by the xa_lock, not dev_set->lock */
	dev_set->device_count++;
	xa_unlock(&vfio_device_set_xa);
	mutex_lock(&dev_set->lock);
	device->dev_set = dev_set;
	list_add_tail(&device->dev_set_list, &dev_set->device_list);
	mutex_unlock(&dev_set->lock);
	return 0;
}
EXPORT_SYMBOL_GPL(vfio_assign_device_set);
109
/*
 * Drop @device's membership in its dev_set; the set itself is erased and
 * freed when the last member leaves.  Safe to call when no set was ever
 * assigned.
 */
static void vfio_release_device_set(struct vfio_device *device)
{
	struct vfio_device_set *dev_set = device->dev_set;

	if (!dev_set)
		return;

	mutex_lock(&dev_set->lock);
	list_del(&device->dev_set_list);
	mutex_unlock(&dev_set->lock);

	xa_lock(&vfio_device_set_xa);
	/* device_count is protected by the xa_lock; see vfio_assign_device_set() */
	if (!--dev_set->device_count) {
		__xa_erase(&vfio_device_set_xa,
			   (unsigned long)dev_set->set_id);
		mutex_destroy(&dev_set->lock);
		kfree(dev_set);
	}
	xa_unlock(&vfio_device_set_xa);
}
130
131unsigned int vfio_device_set_open_count(struct vfio_device_set *dev_set)
132{
133	struct vfio_device *cur;
134	unsigned int open_count = 0;
135
136	lockdep_assert_held(&dev_set->lock);
137
138	list_for_each_entry(cur, &dev_set->device_list, dev_set_list)
139		open_count += cur->open_count;
140	return open_count;
141}
142EXPORT_SYMBOL_GPL(vfio_device_set_open_count);
143
144struct vfio_device *
145vfio_find_device_in_devset(struct vfio_device_set *dev_set,
146			   struct device *dev)
147{
148	struct vfio_device *cur;
149
150	lockdep_assert_held(&dev_set->lock);
151
152	list_for_each_entry(cur, &dev_set->device_list, dev_set_list)
153		if (cur->dev == dev)
154			return cur;
155	return NULL;
156}
157EXPORT_SYMBOL_GPL(vfio_find_device_in_devset);
158
159/*
160 * Device objects - create, release, get, put, search
161 */
162/* Device reference always implies a group reference */
163void vfio_device_put_registration(struct vfio_device *device)
164{
165	if (refcount_dec_and_test(&device->refcount))
166		complete(&device->comp);
167}
168
/*
 * Take a registration reference unless it has already dropped to zero
 * (i.e. the device is being unregistered).  Returns true on success.
 */
bool vfio_device_try_get_registration(struct vfio_device *device)
{
	return refcount_inc_not_zero(&device->refcount);
}
173
174/*
175 * VFIO driver API
176 */
177/* Release helper called by vfio_put_device() */
178static void vfio_device_release(struct device *dev)
179{
180	struct vfio_device *device =
181			container_of(dev, struct vfio_device, device);
182
183	vfio_release_device_set(device);
184	ida_free(&vfio.device_ida, device->index);
185
186	if (device->ops->release)
187		device->ops->release(device);
188
189	kvfree(device);
190}
191
192static int vfio_init_device(struct vfio_device *device, struct device *dev,
193			    const struct vfio_device_ops *ops);
194
195/*
196 * Allocate and initialize vfio_device so it can be registered to vfio
197 * core.
198 *
199 * Drivers should use the wrapper vfio_alloc_device() for allocation.
200 * @size is the size of the structure to be allocated, including any
201 * private data used by the driver.
202 *
203 * Driver may provide an @init callback to cover device private data.
204 *
205 * Use vfio_put_device() to release the structure after success return.
206 */
207struct vfio_device *_vfio_alloc_device(size_t size, struct device *dev,
208				       const struct vfio_device_ops *ops)
209{
210	struct vfio_device *device;
211	int ret;
212
213	if (WARN_ON(size < sizeof(struct vfio_device)))
214		return ERR_PTR(-EINVAL);
215
216	device = kvzalloc(size, GFP_KERNEL);
217	if (!device)
218		return ERR_PTR(-ENOMEM);
219
220	ret = vfio_init_device(device, dev, ops);
221	if (ret)
222		goto out_free;
223	return device;
224
225out_free:
226	kvfree(device);
227	return ERR_PTR(ret);
228}
229EXPORT_SYMBOL_GPL(_vfio_alloc_device);
230
231/*
232 * Initialize a vfio_device so it can be registered to vfio core.
233 */
234static int vfio_init_device(struct vfio_device *device, struct device *dev,
235			    const struct vfio_device_ops *ops)
236{
237	int ret;
238
239	ret = ida_alloc_max(&vfio.device_ida, MINORMASK, GFP_KERNEL);
240	if (ret < 0) {
241		dev_dbg(dev, "Error to alloc index\n");
242		return ret;
243	}
244
245	device->index = ret;
246	init_completion(&device->comp);
247	device->dev = dev;
248	device->ops = ops;
249
250	if (ops->init) {
251		ret = ops->init(device);
252		if (ret)
253			goto out_uninit;
254	}
255
256	device_initialize(&device->device);
257	device->device.release = vfio_device_release;
258	device->device.class = vfio.device_class;
259	device->device.parent = device->dev;
260	return 0;
261
262out_uninit:
263	vfio_release_device_set(device);
264	ida_free(&vfio.device_ida, device->index);
265	return ret;
266}
267
268static int __vfio_register_dev(struct vfio_device *device,
269			       enum vfio_group_type type)
270{
271	int ret;
272
273	if (WARN_ON(IS_ENABLED(CONFIG_IOMMUFD) &&
274		    (!device->ops->bind_iommufd ||
275		     !device->ops->unbind_iommufd ||
276		     !device->ops->attach_ioas ||
277		     !device->ops->detach_ioas)))
278		return -EINVAL;
279
280	/*
281	 * If the driver doesn't specify a set then the device is added to a
282	 * singleton set just for itself.
283	 */
284	if (!device->dev_set)
285		vfio_assign_device_set(device, device);
286
287	ret = dev_set_name(&device->device, "vfio%d", device->index);
288	if (ret)
289		return ret;
290
291	ret = vfio_device_set_group(device, type);
292	if (ret)
293		return ret;
294
295	/*
296	 * VFIO always sets IOMMU_CACHE because we offer no way for userspace to
297	 * restore cache coherency. It has to be checked here because it is only
298	 * valid for cases where we are using iommu groups.
299	 */
300	if (type == VFIO_IOMMU && !vfio_device_is_noiommu(device) &&
301	    !device_iommu_capable(device->dev, IOMMU_CAP_CACHE_COHERENCY)) {
302		ret = -EINVAL;
303		goto err_out;
304	}
305
306	ret = vfio_device_add(device);
307	if (ret)
308		goto err_out;
309
310	/* Refcounting can't start until the driver calls register */
311	refcount_set(&device->refcount, 1);
312
313	vfio_device_group_register(device);
314	vfio_device_debugfs_init(device);
315
316	return 0;
317err_out:
318	vfio_device_remove_group(device);
319	return ret;
320}
321
/* Register a device backed by a real IOMMU group. */
int vfio_register_group_dev(struct vfio_device *device)
{
	return __vfio_register_dev(device, VFIO_IOMMU);
}
EXPORT_SYMBOL_GPL(vfio_register_group_dev);
327
328/*
329 * Register a virtual device without IOMMU backing.  The user of this
330 * device must not be able to directly trigger unmediated DMA.
331 */
332int vfio_register_emulated_iommu_dev(struct vfio_device *device)
333{
334	return __vfio_register_dev(device, VFIO_EMULATED_IOMMU);
335}
336EXPORT_SYMBOL_GPL(vfio_register_emulated_iommu_dev);
337
338/*
339 * Decrement the device reference count and wait for the device to be
340 * removed.  Open file descriptors for the device... */
341void vfio_unregister_group_dev(struct vfio_device *device)
342{
343	unsigned int i = 0;
344	bool interrupted = false;
345	long rc;
346
347	/*
348	 * Prevent new device opened by userspace via the
349	 * VFIO_GROUP_GET_DEVICE_FD in the group path.
350	 */
351	vfio_device_group_unregister(device);
352
353	/*
354	 * Balances vfio_device_add() in register path, also prevents
355	 * new device opened by userspace in the cdev path.
356	 */
357	vfio_device_del(device);
358
359	vfio_device_put_registration(device);
360	rc = try_wait_for_completion(&device->comp);
361	while (rc <= 0) {
362		if (device->ops->request)
363			device->ops->request(device, i++);
364
365		if (interrupted) {
366			rc = wait_for_completion_timeout(&device->comp,
367							 HZ * 10);
368		} else {
369			rc = wait_for_completion_interruptible_timeout(
370				&device->comp, HZ * 10);
371			if (rc < 0) {
372				interrupted = true;
373				dev_warn(device->dev,
374					 "Device is currently in use, task"
375					 " \"%s\" (%d) "
376					 "blocked until device is released",
377					 current->comm, task_pid_nr(current));
378			}
379		}
380	}
381
382	vfio_device_debugfs_exit(device);
383	/* Balances vfio_device_set_group in register path */
384	vfio_device_remove_group(device);
385}
386EXPORT_SYMBOL_GPL(vfio_unregister_group_dev);
387
388#if IS_ENABLED(CONFIG_KVM)
/*
 * Take a reference on @kvm on behalf of @device.  symbol_get() is used
 * so vfio has no hard module dependency on kvm.  On success device->kvm
 * is set and device->put_kvm records the matching release function; on
 * any failure the device is left without a kvm reference.
 */
void vfio_device_get_kvm_safe(struct vfio_device *device, struct kvm *kvm)
{
	void (*pfn)(struct kvm *kvm);
	bool (*fn)(struct kvm *kvm);
	bool ret;

	lockdep_assert_held(&device->dev_set->lock);

	if (!kvm)
		return;

	/* Resolve the put function first so a taken ref can always be dropped */
	pfn = symbol_get(kvm_put_kvm);
	if (WARN_ON(!pfn))
		return;

	fn = symbol_get(kvm_get_kvm_safe);
	if (WARN_ON(!fn)) {
		symbol_put(kvm_put_kvm);
		return;
	}

	/* May return false (e.g. kvm already being torn down); see kvm_get_kvm_safe() */
	ret = fn(kvm);
	symbol_put(kvm_get_kvm_safe);
	if (!ret) {
		symbol_put(kvm_put_kvm);
		return;
	}

	device->put_kvm = pfn;
	device->kvm = kvm;
}
420
/*
 * Release the kvm reference taken by vfio_device_get_kvm_safe() and
 * balance its symbol_get() on kvm_put_kvm.  No-op if no reference is
 * held.
 */
void vfio_device_put_kvm(struct vfio_device *device)
{
	lockdep_assert_held(&device->dev_set->lock);

	if (!device->kvm)
		return;

	/* kvm set without put_kvm would indicate inconsistent state */
	if (WARN_ON(!device->put_kvm))
		goto clear;

	device->put_kvm(device->kvm);
	device->put_kvm = NULL;
	symbol_put(kvm_put_kvm);

clear:
	device->kvm = NULL;
}
438#endif
439
440/* true if the vfio_device has open_device() called but not close_device() */
441static bool vfio_assert_device_open(struct vfio_device *device)
442{
443	return !WARN_ON_ONCE(!READ_ONCE(device->open_count));
444}
445
446struct vfio_device_file *
447vfio_allocate_device_file(struct vfio_device *device)
448{
449	struct vfio_device_file *df;
450
451	df = kzalloc(sizeof(*df), GFP_KERNEL_ACCOUNT);
452	if (!df)
453		return ERR_PTR(-ENOMEM);
454
455	df->device = device;
456	spin_lock_init(&df->kvm_ref_lock);
457
458	return df;
459}
460
/*
 * First open of a device within its dev_set: pin the driver module,
 * attach the IOMMU backing (iommufd or legacy group), then run the
 * driver's open_device() op.  Undone by vfio_df_device_last_close().
 */
static int vfio_df_device_first_open(struct vfio_device_file *df)
{
	struct vfio_device *device = df->device;
	struct iommufd_ctx *iommufd = df->iommufd;
	int ret;

	lockdep_assert_held(&device->dev_set->lock);

	/* Keep the driver module loaded while the device is open */
	if (!try_module_get(device->dev->driver->owner))
		return -ENODEV;

	/* df->iommufd selects the cdev/iommufd path vs the group path */
	if (iommufd)
		ret = vfio_df_iommufd_bind(df);
	else
		ret = vfio_device_group_use_iommu(device);
	if (ret)
		goto err_module_put;

	if (device->ops->open_device) {
		ret = device->ops->open_device(device);
		if (ret)
			goto err_unuse_iommu;
	}
	return 0;

err_unuse_iommu:
	if (iommufd)
		vfio_df_iommufd_unbind(df);
	else
		vfio_device_group_unuse_iommu(device);
err_module_put:
	module_put(device->dev->driver->owner);
	return ret;
}
495
/*
 * Last close of a device within its dev_set: tear down in the reverse
 * order of vfio_df_device_first_open().
 */
static void vfio_df_device_last_close(struct vfio_device_file *df)
{
	struct vfio_device *device = df->device;
	struct iommufd_ctx *iommufd = df->iommufd;

	lockdep_assert_held(&device->dev_set->lock);

	if (device->ops->close_device)
		device->ops->close_device(device);
	if (iommufd)
		vfio_df_iommufd_unbind(df);
	else
		vfio_device_group_unuse_iommu(device);
	module_put(device->dev->driver->owner);
}
511
512int vfio_df_open(struct vfio_device_file *df)
513{
514	struct vfio_device *device = df->device;
515	int ret = 0;
516
517	lockdep_assert_held(&device->dev_set->lock);
518
519	/*
520	 * Only the group path allows the device to be opened multiple
521	 * times.  The device cdev path doesn't have a secure way for it.
522	 */
523	if (device->open_count != 0 && !df->group)
524		return -EINVAL;
525
526	device->open_count++;
527	if (device->open_count == 1) {
528		ret = vfio_df_device_first_open(df);
529		if (ret)
530			device->open_count--;
531	}
532
533	return ret;
534}
535
/*
 * Close the device through @df; runs full teardown when this is the
 * last opener in the dev_set.
 */
void vfio_df_close(struct vfio_device_file *df)
{
	struct vfio_device *device = df->device;

	lockdep_assert_held(&device->dev_set->lock);

	/* WARNs (but continues) if close is not balanced with an open */
	vfio_assert_device_open(device);
	if (device->open_count == 1)
		vfio_df_device_last_close(df);
	device->open_count--;
}
547
548/*
549 * Wrapper around pm_runtime_resume_and_get().
550 * Return error code on failure or 0 on success.
551 */
552static inline int vfio_device_pm_runtime_get(struct vfio_device *device)
553{
554	struct device *dev = device->dev;
555
556	if (dev->driver && dev->driver->pm) {
557		int ret;
558
559		ret = pm_runtime_resume_and_get(dev);
560		if (ret) {
561			dev_info_ratelimited(dev,
562				"vfio: runtime resume failed %d\n", ret);
563			return -EIO;
564		}
565	}
566
567	return 0;
568}
569
570/*
571 * Wrapper around pm_runtime_put().
572 */
573static inline void vfio_device_pm_runtime_put(struct vfio_device *device)
574{
575	struct device *dev = device->dev;
576
577	if (dev->driver && dev->driver->pm)
578		pm_runtime_put(dev);
579}
580
581/*
582 * VFIO Device fd
583 */
/* Release handler for a vfio device fd: close, drop the registration ref. */
static int vfio_device_fops_release(struct inode *inode, struct file *filep)
{
	struct vfio_device_file *df = filep->private_data;
	struct vfio_device *device = df->device;

	/* Group fds and cdev fds have different teardown entry points */
	if (df->group)
		vfio_df_group_close(df);
	else
		vfio_df_unbind_iommufd(df);

	/* Drop this fd's registration ref; may unblock vfio_unregister_group_dev() */
	vfio_device_put_registration(device);

	kfree(df);

	return 0;
}
600
601/*
602 * vfio_mig_get_next_state - Compute the next step in the FSM
603 * @cur_fsm - The current state the device is in
604 * @new_fsm - The target state to reach
605 * @next_fsm - Pointer to the next step to get to new_fsm
606 *
607 * Return 0 upon success, otherwise -errno
608 * Upon success the next step in the state progression between cur_fsm and
609 * new_fsm will be set in next_fsm.
610 *
611 * This breaks down requests for combination transitions into smaller steps and
612 * returns the next step to get to new_fsm. The function may need to be called
613 * multiple times before reaching new_fsm.
614 *
615 */
616int vfio_mig_get_next_state(struct vfio_device *device,
617			    enum vfio_device_mig_state cur_fsm,
618			    enum vfio_device_mig_state new_fsm,
619			    enum vfio_device_mig_state *next_fsm)
620{
621	enum { VFIO_DEVICE_NUM_STATES = VFIO_DEVICE_STATE_PRE_COPY_P2P + 1 };
622	/*
623	 * The coding in this table requires the driver to implement the
624	 * following FSM arcs:
625	 *         RESUMING -> STOP
626	 *         STOP -> RESUMING
627	 *         STOP -> STOP_COPY
628	 *         STOP_COPY -> STOP
629	 *
630	 * If P2P is supported then the driver must also implement these FSM
631	 * arcs:
632	 *         RUNNING -> RUNNING_P2P
633	 *         RUNNING_P2P -> RUNNING
634	 *         RUNNING_P2P -> STOP
635	 *         STOP -> RUNNING_P2P
636	 *
637	 * If precopy is supported then the driver must support these additional
638	 * FSM arcs:
639	 *         RUNNING -> PRE_COPY
640	 *         PRE_COPY -> RUNNING
641	 *         PRE_COPY -> STOP_COPY
642	 * However, if precopy and P2P are supported together then the driver
643	 * must support these additional arcs beyond the P2P arcs above:
644	 *         PRE_COPY -> RUNNING
645	 *         PRE_COPY -> PRE_COPY_P2P
646	 *         PRE_COPY_P2P -> PRE_COPY
647	 *         PRE_COPY_P2P -> RUNNING_P2P
648	 *         PRE_COPY_P2P -> STOP_COPY
649	 *         RUNNING -> PRE_COPY
650	 *         RUNNING_P2P -> PRE_COPY_P2P
651	 *
652	 * Without P2P and precopy the driver must implement:
653	 *         RUNNING -> STOP
654	 *         STOP -> RUNNING
655	 *
656	 * The coding will step through multiple states for some combination
657	 * transitions; if all optional features are supported, this means the
658	 * following ones:
659	 *         PRE_COPY -> PRE_COPY_P2P -> STOP_COPY
660	 *         PRE_COPY -> RUNNING -> RUNNING_P2P
661	 *         PRE_COPY -> RUNNING -> RUNNING_P2P -> STOP
662	 *         PRE_COPY -> RUNNING -> RUNNING_P2P -> STOP -> RESUMING
663	 *         PRE_COPY_P2P -> RUNNING_P2P -> RUNNING
664	 *         PRE_COPY_P2P -> RUNNING_P2P -> STOP
665	 *         PRE_COPY_P2P -> RUNNING_P2P -> STOP -> RESUMING
666	 *         RESUMING -> STOP -> RUNNING_P2P
667	 *         RESUMING -> STOP -> RUNNING_P2P -> PRE_COPY_P2P
668	 *         RESUMING -> STOP -> RUNNING_P2P -> RUNNING
669	 *         RESUMING -> STOP -> RUNNING_P2P -> RUNNING -> PRE_COPY
670	 *         RESUMING -> STOP -> STOP_COPY
671	 *         RUNNING -> RUNNING_P2P -> PRE_COPY_P2P
672	 *         RUNNING -> RUNNING_P2P -> STOP
673	 *         RUNNING -> RUNNING_P2P -> STOP -> RESUMING
674	 *         RUNNING -> RUNNING_P2P -> STOP -> STOP_COPY
675	 *         RUNNING_P2P -> RUNNING -> PRE_COPY
676	 *         RUNNING_P2P -> STOP -> RESUMING
677	 *         RUNNING_P2P -> STOP -> STOP_COPY
678	 *         STOP -> RUNNING_P2P -> PRE_COPY_P2P
679	 *         STOP -> RUNNING_P2P -> RUNNING
680	 *         STOP -> RUNNING_P2P -> RUNNING -> PRE_COPY
681	 *         STOP_COPY -> STOP -> RESUMING
682	 *         STOP_COPY -> STOP -> RUNNING_P2P
683	 *         STOP_COPY -> STOP -> RUNNING_P2P -> RUNNING
684	 *
685	 *  The following transitions are blocked:
686	 *         STOP_COPY -> PRE_COPY
687	 *         STOP_COPY -> PRE_COPY_P2P
688	 */
689	static const u8 vfio_from_fsm_table[VFIO_DEVICE_NUM_STATES][VFIO_DEVICE_NUM_STATES] = {
690		[VFIO_DEVICE_STATE_STOP] = {
691			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
692			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING_P2P,
693			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_RUNNING_P2P,
694			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
695			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
696			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
697			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
698			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
699		},
700		[VFIO_DEVICE_STATE_RUNNING] = {
701			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING_P2P,
702			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
703			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
704			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
705			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_RUNNING_P2P,
706			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING_P2P,
707			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
708			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
709		},
710		[VFIO_DEVICE_STATE_PRE_COPY] = {
711			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING,
712			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
713			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
714			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
715			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
716			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING,
717			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING,
718			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
719		},
720		[VFIO_DEVICE_STATE_PRE_COPY_P2P] = {
721			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING_P2P,
722			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING_P2P,
723			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
724			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
725			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
726			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING_P2P,
727			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
728			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
729		},
730		[VFIO_DEVICE_STATE_STOP_COPY] = {
731			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
732			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
733			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_ERROR,
734			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_ERROR,
735			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
736			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
737			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
738			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
739		},
740		[VFIO_DEVICE_STATE_RESUMING] = {
741			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
742			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
743			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_STOP,
744			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_STOP,
745			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
746			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
747			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
748			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
749		},
750		[VFIO_DEVICE_STATE_RUNNING_P2P] = {
751			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
752			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
753			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_RUNNING,
754			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
755			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
756			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
757			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
758			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
759		},
760		[VFIO_DEVICE_STATE_ERROR] = {
761			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_ERROR,
762			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_ERROR,
763			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_ERROR,
764			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_ERROR,
765			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_ERROR,
766			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_ERROR,
767			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_ERROR,
768			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
769		},
770	};
771
772	static const unsigned int state_flags_table[VFIO_DEVICE_NUM_STATES] = {
773		[VFIO_DEVICE_STATE_STOP] = VFIO_MIGRATION_STOP_COPY,
774		[VFIO_DEVICE_STATE_RUNNING] = VFIO_MIGRATION_STOP_COPY,
775		[VFIO_DEVICE_STATE_PRE_COPY] =
776			VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_PRE_COPY,
777		[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_MIGRATION_STOP_COPY |
778						   VFIO_MIGRATION_P2P |
779						   VFIO_MIGRATION_PRE_COPY,
780		[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_MIGRATION_STOP_COPY,
781		[VFIO_DEVICE_STATE_RESUMING] = VFIO_MIGRATION_STOP_COPY,
782		[VFIO_DEVICE_STATE_RUNNING_P2P] =
783			VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P,
784		[VFIO_DEVICE_STATE_ERROR] = ~0U,
785	};
786
787	if (WARN_ON(cur_fsm >= ARRAY_SIZE(vfio_from_fsm_table) ||
788		    (state_flags_table[cur_fsm] & device->migration_flags) !=
789			state_flags_table[cur_fsm]))
790		return -EINVAL;
791
792	if (new_fsm >= ARRAY_SIZE(vfio_from_fsm_table) ||
793	   (state_flags_table[new_fsm] & device->migration_flags) !=
794			state_flags_table[new_fsm])
795		return -EINVAL;
796
797	/*
798	 * Arcs touching optional and unsupported states are skipped over. The
799	 * driver will instead see an arc from the original state to the next
800	 * logical state, as per the above comment.
801	 */
802	*next_fsm = vfio_from_fsm_table[cur_fsm][new_fsm];
803	while ((state_flags_table[*next_fsm] & device->migration_flags) !=
804			state_flags_table[*next_fsm])
805		*next_fsm = vfio_from_fsm_table[*next_fsm][new_fsm];
806
807	return (*next_fsm != VFIO_DEVICE_STATE_ERROR) ? 0 : -EINVAL;
808}
809EXPORT_SYMBOL_GPL(vfio_mig_get_next_state);
810
811/*
812 * Convert the drivers's struct file into a FD number and return it to userspace
813 */
814static int vfio_ioct_mig_return_fd(struct file *filp, void __user *arg,
815				   struct vfio_device_feature_mig_state *mig)
816{
817	int ret;
818	int fd;
819
820	fd = get_unused_fd_flags(O_CLOEXEC);
821	if (fd < 0) {
822		ret = fd;
823		goto out_fput;
824	}
825
826	mig->data_fd = fd;
827	if (copy_to_user(arg, mig, sizeof(*mig))) {
828		ret = -EFAULT;
829		goto out_put_unused;
830	}
831	fd_install(fd, filp);
832	return 0;
833
834out_put_unused:
835	put_unused_fd(fd);
836out_fput:
837	fput(filp);
838	return ret;
839}
840
/*
 * Handle VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE: GET reports the current
 * migration state, SET requests a transition and may return a data_fd
 * for migration data transfer.
 */
static int
vfio_ioctl_device_feature_mig_device_state(struct vfio_device *device,
					   u32 flags, void __user *arg,
					   size_t argsz)
{
	size_t minsz =
		offsetofend(struct vfio_device_feature_mig_state, data_fd);
	struct vfio_device_feature_mig_state mig;
	struct file *filp = NULL;
	int ret;

	if (!device->mig_ops)
		return -ENOTTY;

	ret = vfio_check_feature(flags, argsz,
				 VFIO_DEVICE_FEATURE_SET |
				 VFIO_DEVICE_FEATURE_GET,
				 sizeof(mig));
	if (ret != 1)
		return ret;

	if (copy_from_user(&mig, arg, minsz))
		return -EFAULT;

	if (flags & VFIO_DEVICE_FEATURE_GET) {
		enum vfio_device_mig_state curr_state;

		ret = device->mig_ops->migration_get_state(device,
							   &curr_state);
		if (ret)
			return ret;
		mig.device_state = curr_state;
		goto out_copy;
	}

	/* Handle the VFIO_DEVICE_FEATURE_SET */
	filp = device->mig_ops->migration_set_state(device, mig.device_state);
	/* NULL filp: transition succeeded with no data transfer fd needed */
	if (IS_ERR(filp) || !filp)
		goto out_copy;

	return vfio_ioct_mig_return_fd(filp, arg, &mig);
out_copy:
	/* -1 tells userspace no data_fd accompanies this state */
	mig.data_fd = -1;
	if (copy_to_user(arg, &mig, sizeof(mig)))
		return -EFAULT;
	if (IS_ERR(filp))
		return PTR_ERR(filp);
	return 0;
}
890
/*
 * Handle VFIO_DEVICE_FEATURE_MIG_DATA_SIZE: report the driver's
 * estimated stop-copy data size to userspace.
 */
static int
vfio_ioctl_device_feature_migration_data_size(struct vfio_device *device,
					      u32 flags, void __user *arg,
					      size_t argsz)
{
	struct vfio_device_feature_mig_data_size data_size = {};
	unsigned long stop_copy_length;
	int ret;

	if (!device->mig_ops)
		return -ENOTTY;

	/* GET-only feature */
	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
				 sizeof(data_size));
	if (ret != 1)
		return ret;

	ret = device->mig_ops->migration_get_data_size(device, &stop_copy_length);
	if (ret)
		return ret;

	data_size.stop_copy_length = stop_copy_length;
	if (copy_to_user(arg, &data_size, sizeof(data_size)))
		return -EFAULT;

	return 0;
}
918
/*
 * Handle VFIO_DEVICE_FEATURE_MIGRATION: report the device's supported
 * migration feature flags to userspace.
 */
static int vfio_ioctl_device_feature_migration(struct vfio_device *device,
					       u32 flags, void __user *arg,
					       size_t argsz)
{
	struct vfio_device_feature_migration mig = {
		.flags = device->migration_flags,
	};
	int ret;

	if (!device->mig_ops)
		return -ENOTTY;

	/* GET-only feature */
	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
				 sizeof(mig));
	if (ret != 1)
		return ret;
	if (copy_to_user(arg, &mig, sizeof(mig)))
		return -EFAULT;
	return 0;
}
939
940void vfio_combine_iova_ranges(struct rb_root_cached *root, u32 cur_nodes,
941			      u32 req_nodes)
942{
943	struct interval_tree_node *prev, *curr, *comb_start, *comb_end;
944	unsigned long min_gap, curr_gap;
945
946	/* Special shortcut when a single range is required */
947	if (req_nodes == 1) {
948		unsigned long last;
949
950		comb_start = interval_tree_iter_first(root, 0, ULONG_MAX);
951
952		/* Empty list */
953		if (WARN_ON_ONCE(!comb_start))
954			return;
955
956		curr = comb_start;
957		while (curr) {
958			last = curr->last;
959			prev = curr;
960			curr = interval_tree_iter_next(curr, 0, ULONG_MAX);
961			if (prev != comb_start)
962				interval_tree_remove(prev, root);
963		}
964		comb_start->last = last;
965		return;
966	}
967
968	/* Combine ranges which have the smallest gap */
969	while (cur_nodes > req_nodes) {
970		prev = NULL;
971		min_gap = ULONG_MAX;
972		curr = interval_tree_iter_first(root, 0, ULONG_MAX);
973		while (curr) {
974			if (prev) {
975				curr_gap = curr->start - prev->last;
976				if (curr_gap < min_gap) {
977					min_gap = curr_gap;
978					comb_start = prev;
979					comb_end = curr;
980				}
981			}
982			prev = curr;
983			curr = interval_tree_iter_next(curr, 0, ULONG_MAX);
984		}
985
986		/* Empty list or no nodes to combine */
987		if (WARN_ON_ONCE(min_gap == ULONG_MAX))
988			break;
989
990		comb_start->last = comb_end->last;
991		interval_tree_remove(comb_end, root);
992		cur_nodes--;
993	}
994}
995EXPORT_SYMBOL_GPL(vfio_combine_iova_ranges);
996
997/* Ranges should fit into a single kernel page */
998#define LOG_MAX_RANGES \
999	(PAGE_SIZE / sizeof(struct vfio_device_feature_dma_logging_range))
1000
/*
 * Handle VFIO_DEVICE_FEATURE_DMA_LOGGING_START: validate the
 * user-supplied IOVA ranges, build an interval tree from them, and hand
 * it to the driver's log_start() op.  The (possibly driver-adjusted)
 * page_size is copied back to userspace.
 */
static int
vfio_ioctl_device_feature_logging_start(struct vfio_device *device,
					u32 flags, void __user *arg,
					size_t argsz)
{
	size_t minsz =
		offsetofend(struct vfio_device_feature_dma_logging_control,
			    ranges);
	struct vfio_device_feature_dma_logging_range __user *ranges;
	struct vfio_device_feature_dma_logging_control control;
	struct vfio_device_feature_dma_logging_range range;
	struct rb_root_cached root = RB_ROOT_CACHED;
	struct interval_tree_node *nodes;
	u64 iova_end;
	u32 nnodes;
	int i, ret;

	if (!device->log_ops)
		return -ENOTTY;

	ret = vfio_check_feature(flags, argsz,
				 VFIO_DEVICE_FEATURE_SET,
				 sizeof(control));
	if (ret != 1)
		return ret;

	if (copy_from_user(&control, arg, minsz))
		return -EFAULT;

	nnodes = control.num_ranges;
	if (!nnodes)
		return -EINVAL;

	/* Bound the user-controlled allocation; see LOG_MAX_RANGES */
	if (nnodes > LOG_MAX_RANGES)
		return -E2BIG;

	ranges = u64_to_user_ptr(control.ranges);
	nodes = kmalloc_array(nnodes, sizeof(struct interval_tree_node),
			      GFP_KERNEL);
	if (!nodes)
		return -ENOMEM;

	for (i = 0; i < nnodes; i++) {
		if (copy_from_user(&range, &ranges[i], sizeof(range))) {
			ret = -EFAULT;
			goto end;
		}
		/* Each range must be aligned to the requested tracking page size */
		if (!IS_ALIGNED(range.iova, control.page_size) ||
		    !IS_ALIGNED(range.length, control.page_size)) {
			ret = -EINVAL;
			goto end;
		}

		/* Reject wrap-around and ranges beyond the unsigned long space */
		if (check_add_overflow(range.iova, range.length, &iova_end) ||
		    iova_end > ULONG_MAX) {
			ret = -EOVERFLOW;
			goto end;
		}

		nodes[i].start = range.iova;
		nodes[i].last = range.iova + range.length - 1;
		if (interval_tree_iter_first(&root, nodes[i].start,
					     nodes[i].last)) {
			/* Range overlapping */
			ret = -EINVAL;
			goto end;
		}
		interval_tree_insert(nodes + i, &root);
	}

	/* The driver may round control.page_size; the result is copied back */
	ret = device->log_ops->log_start(device, &root, nnodes,
					 &control.page_size);
	if (ret)
		goto end;

	if (copy_to_user(arg, &control, sizeof(control))) {
		/* Userspace never saw the result; undo the start */
		ret = -EFAULT;
		device->log_ops->log_stop(device);
	}

end:
	kfree(nodes);
	return ret;
}
1085
1086static int
1087vfio_ioctl_device_feature_logging_stop(struct vfio_device *device,
1088				       u32 flags, void __user *arg,
1089				       size_t argsz)
1090{
1091	int ret;
1092
1093	if (!device->log_ops)
1094		return -ENOTTY;
1095
1096	ret = vfio_check_feature(flags, argsz,
1097				 VFIO_DEVICE_FEATURE_SET, 0);
1098	if (ret != 1)
1099		return ret;
1100
1101	return device->log_ops->log_stop(device);
1102}
1103
1104static int vfio_device_log_read_and_clear(struct iova_bitmap *iter,
1105					  unsigned long iova, size_t length,
1106					  void *opaque)
1107{
1108	struct vfio_device *device = opaque;
1109
1110	return device->log_ops->log_read_and_clear(device, iova, length, iter);
1111}
1112
/*
 * VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT: read (and clear) the dirty
 * state of the requested IOVA range, marshalling it into the
 * user-supplied bitmap via an iova_bitmap iterator.
 */
static int
vfio_ioctl_device_feature_logging_report(struct vfio_device *device,
					 u32 flags, void __user *arg,
					 size_t argsz)
{
	size_t minsz =
		offsetofend(struct vfio_device_feature_dma_logging_report,
			    bitmap);
	struct vfio_device_feature_dma_logging_report report;
	struct iova_bitmap *iter;
	u64 iova_end;
	int ret;

	if (!device->log_ops)
		return -ENOTTY;

	ret = vfio_check_feature(flags, argsz,
				 VFIO_DEVICE_FEATURE_GET,
				 sizeof(report));
	if (ret != 1)	/* 0 (PROBE only) or -errno */
		return ret;

	if (copy_from_user(&report, arg, minsz))
		return -EFAULT;

	/* Reported granularity must be a power of two, no finer than 4K */
	if (report.page_size < SZ_4K || !is_power_of_2(report.page_size))
		return -EINVAL;

	/* Range end must be representable in an unsigned long */
	if (check_add_overflow(report.iova, report.length, &iova_end) ||
	    iova_end > ULONG_MAX)
		return -EOVERFLOW;

	iter = iova_bitmap_alloc(report.iova, report.length,
				 report.page_size,
				 u64_to_user_ptr(report.bitmap));
	if (IS_ERR(iter))
		return PTR_ERR(iter);

	ret = iova_bitmap_for_each(iter, device,
				   vfio_device_log_read_and_clear);

	iova_bitmap_free(iter);
	return ret;
}
1157
/*
 * Common entry point for VFIO_DEVICE_FEATURE: validate the generic
 * header and flags, then dispatch to the core-handled features or fall
 * back to the driver's device_feature op for anything the core doesn't
 * recognize.  Each handler receives the payload that follows the header.
 */
static int vfio_ioctl_device_feature(struct vfio_device *device,
				     struct vfio_device_feature __user *arg)
{
	size_t minsz = offsetofend(struct vfio_device_feature, flags);
	struct vfio_device_feature feature;

	if (copy_from_user(&feature, arg, minsz))
		return -EFAULT;

	if (feature.argsz < minsz)
		return -EINVAL;

	/* Check unknown flags */
	if (feature.flags &
	    ~(VFIO_DEVICE_FEATURE_MASK | VFIO_DEVICE_FEATURE_SET |
	      VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_PROBE))
		return -EINVAL;

	/* GET & SET are mutually exclusive except with PROBE */
	if (!(feature.flags & VFIO_DEVICE_FEATURE_PROBE) &&
	    (feature.flags & VFIO_DEVICE_FEATURE_SET) &&
	    (feature.flags & VFIO_DEVICE_FEATURE_GET))
		return -EINVAL;

	switch (feature.flags & VFIO_DEVICE_FEATURE_MASK) {
	case VFIO_DEVICE_FEATURE_MIGRATION:
		return vfio_ioctl_device_feature_migration(
			device, feature.flags, arg->data,
			feature.argsz - minsz);
	case VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE:
		return vfio_ioctl_device_feature_mig_device_state(
			device, feature.flags, arg->data,
			feature.argsz - minsz);
	case VFIO_DEVICE_FEATURE_DMA_LOGGING_START:
		return vfio_ioctl_device_feature_logging_start(
			device, feature.flags, arg->data,
			feature.argsz - minsz);
	case VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP:
		return vfio_ioctl_device_feature_logging_stop(
			device, feature.flags, arg->data,
			feature.argsz - minsz);
	case VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT:
		return vfio_ioctl_device_feature_logging_report(
			device, feature.flags, arg->data,
			feature.argsz - minsz);
	case VFIO_DEVICE_FEATURE_MIG_DATA_SIZE:
		return vfio_ioctl_device_feature_migration_data_size(
			device, feature.flags, arg->data,
			feature.argsz - minsz);
	default:
		/* Unknown to the core; let the driver have a look */
		if (unlikely(!device->ops->device_feature))
			return -EINVAL;
		return device->ops->device_feature(device, feature.flags,
						   arg->data,
						   feature.argsz - minsz);
	}
}
1215
/*
 * Main device-FD ioctl handler.  All ioctls except BIND_IOMMUFD require
 * the FD to have been granted access (group attached or bound to
 * iommufd) and run with a pm-runtime reference held on the device.
 */
static long vfio_device_fops_unl_ioctl(struct file *filep,
				       unsigned int cmd, unsigned long arg)
{
	struct vfio_device_file *df = filep->private_data;
	struct vfio_device *device = df->device;
	void __user *uptr = (void __user *)arg;
	int ret;

	/* BIND is the only ioctl allowed before access is granted */
	if (cmd == VFIO_DEVICE_BIND_IOMMUFD)
		return vfio_df_ioctl_bind_iommufd(df, uptr);

	/* Paired with smp_store_release() following vfio_df_open() */
	if (!smp_load_acquire(&df->access_granted))
		return -EINVAL;

	ret = vfio_device_pm_runtime_get(device);
	if (ret)
		return ret;

	/* cdev only ioctls */
	if (IS_ENABLED(CONFIG_VFIO_DEVICE_CDEV) && !df->group) {
		switch (cmd) {
		case VFIO_DEVICE_ATTACH_IOMMUFD_PT:
			ret = vfio_df_ioctl_attach_pt(df, uptr);
			goto out;

		case VFIO_DEVICE_DETACH_IOMMUFD_PT:
			ret = vfio_df_ioctl_detach_pt(df, uptr);
			goto out;
		}
	}

	switch (cmd) {
	case VFIO_DEVICE_FEATURE:
		ret = vfio_ioctl_device_feature(device, uptr);
		break;

	default:
		/* Anything else is handled by the device driver */
		if (unlikely(!device->ops->ioctl))
			ret = -EINVAL;
		else
			ret = device->ops->ioctl(device, cmd, arg);
		break;
	}
out:
	vfio_device_pm_runtime_put(device);
	return ret;
}
1264
1265static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf,
1266				     size_t count, loff_t *ppos)
1267{
1268	struct vfio_device_file *df = filep->private_data;
1269	struct vfio_device *device = df->device;
1270
1271	/* Paired with smp_store_release() following vfio_df_open() */
1272	if (!smp_load_acquire(&df->access_granted))
1273		return -EINVAL;
1274
1275	if (unlikely(!device->ops->read))
1276		return -EINVAL;
1277
1278	return device->ops->read(device, buf, count, ppos);
1279}
1280
1281static ssize_t vfio_device_fops_write(struct file *filep,
1282				      const char __user *buf,
1283				      size_t count, loff_t *ppos)
1284{
1285	struct vfio_device_file *df = filep->private_data;
1286	struct vfio_device *device = df->device;
1287
1288	/* Paired with smp_store_release() following vfio_df_open() */
1289	if (!smp_load_acquire(&df->access_granted))
1290		return -EINVAL;
1291
1292	if (unlikely(!device->ops->write))
1293		return -EINVAL;
1294
1295	return device->ops->write(device, buf, count, ppos);
1296}
1297
1298static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma)
1299{
1300	struct vfio_device_file *df = filep->private_data;
1301	struct vfio_device *device = df->device;
1302
1303	/* Paired with smp_store_release() following vfio_df_open() */
1304	if (!smp_load_acquire(&df->access_granted))
1305		return -EINVAL;
1306
1307	if (unlikely(!device->ops->mmap))
1308		return -EINVAL;
1309
1310	return device->ops->mmap(device, vma);
1311}
1312
/* File operations installed on VFIO device file descriptors */
const struct file_operations vfio_device_fops = {
	.owner		= THIS_MODULE,
	/* open handles the cdev path (see vfio_device_fops_cdev_open) */
	.open		= vfio_device_fops_cdev_open,
	.release	= vfio_device_fops_release,
	.read		= vfio_device_fops_read,
	.write		= vfio_device_fops_write,
	.unlocked_ioctl	= vfio_device_fops_unl_ioctl,
	.compat_ioctl	= compat_ptr_ioctl,
	.mmap		= vfio_device_fops_mmap,
};
1323
1324static struct vfio_device *vfio_device_from_file(struct file *file)
1325{
1326	struct vfio_device_file *df = file->private_data;
1327
1328	if (file->f_op != &vfio_device_fops)
1329		return NULL;
1330	return df->device;
1331}
1332
1333/**
1334 * vfio_file_is_valid - True if the file is valid vfio file
1335 * @file: VFIO group file or VFIO device file
1336 */
1337bool vfio_file_is_valid(struct file *file)
1338{
1339	return vfio_group_from_file(file) ||
1340	       vfio_device_from_file(file);
1341}
1342EXPORT_SYMBOL_GPL(vfio_file_is_valid);
1343
1344/**
1345 * vfio_file_enforced_coherent - True if the DMA associated with the VFIO file
1346 *        is always CPU cache coherent
1347 * @file: VFIO group file or VFIO device file
1348 *
1349 * Enforced coherency means that the IOMMU ignores things like the PCIe no-snoop
1350 * bit in DMA transactions. A return of false indicates that the user has
1351 * rights to access additional instructions such as wbinvd on x86.
1352 */
1353bool vfio_file_enforced_coherent(struct file *file)
1354{
1355	struct vfio_device *device;
1356	struct vfio_group *group;
1357
1358	group = vfio_group_from_file(file);
1359	if (group)
1360		return vfio_group_enforced_coherent(group);
1361
1362	device = vfio_device_from_file(file);
1363	if (device)
1364		return device_iommu_capable(device->dev,
1365					    IOMMU_CAP_ENFORCE_CACHE_COHERENCY);
1366
1367	return true;
1368}
1369EXPORT_SYMBOL_GPL(vfio_file_enforced_coherent);
1370
1371static void vfio_device_file_set_kvm(struct file *file, struct kvm *kvm)
1372{
1373	struct vfio_device_file *df = file->private_data;
1374
1375	/*
1376	 * The kvm is first recorded in the vfio_device_file, and will
1377	 * be propagated to vfio_device::kvm when the file is bound to
1378	 * iommufd successfully in the vfio device cdev path.
1379	 */
1380	spin_lock(&df->kvm_ref_lock);
1381	df->kvm = kvm;
1382	spin_unlock(&df->kvm_ref_lock);
1383}
1384
1385/**
1386 * vfio_file_set_kvm - Link a kvm with VFIO drivers
1387 * @file: VFIO group file or VFIO device file
1388 * @kvm: KVM to link
1389 *
1390 * When a VFIO device is first opened the KVM will be available in
1391 * device->kvm if one was associated with the file.
1392 */
1393void vfio_file_set_kvm(struct file *file, struct kvm *kvm)
1394{
1395	struct vfio_group *group;
1396
1397	group = vfio_group_from_file(file);
1398	if (group)
1399		vfio_group_set_kvm(group, kvm);
1400
1401	if (vfio_device_from_file(file))
1402		vfio_device_file_set_kvm(file, kvm);
1403}
1404EXPORT_SYMBOL_GPL(vfio_file_set_kvm);
1405
1406/*
1407 * Sub-module support
1408 */
1409/*
1410 * Helper for managing a buffer of info chain capabilities, allocate or
1411 * reallocate a buffer with additional @size, filling in @id and @version
1412 * of the capability.  A pointer to the new capability is returned.
1413 *
1414 * NB. The chain is based at the head of the buffer, so new entries are
1415 * added to the tail, vfio_info_cap_shift() should be called to fixup the
1416 * next offsets prior to copying to the user buffer.
1417 */
struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
					       size_t size, u16 id, u16 version)
{
	void *buf;
	struct vfio_info_cap_header *header, *tmp;

	/* Ensure that the next capability struct will be aligned */
	size = ALIGN(size, sizeof(u64));

	buf = krealloc(caps->buf, caps->size + size, GFP_KERNEL);
	if (!buf) {
		/* On failure release everything; caps is left empty */
		kfree(caps->buf);
		caps->buf = NULL;
		caps->size = 0;
		return ERR_PTR(-ENOMEM);
	}

	/* krealloc() may have moved the buffer; recompute positions */
	caps->buf = buf;
	header = buf + caps->size;

	/* Eventually copied to user buffer, zero */
	memset(header, 0, size);

	header->id = id;
	header->version = version;

	/* Add to the end of the capability chain */
	for (tmp = buf; tmp->next; tmp = buf + tmp->next)
		; /* nothing */

	/* next offsets are relative to the buffer head until shifted */
	tmp->next = caps->size;
	caps->size += size;

	return header;
}
1453EXPORT_SYMBOL_GPL(vfio_info_cap_add);
1454
void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset)
{
	struct vfio_info_cap_header *tmp;
	void *buf = (void *)caps->buf;

	/* Capability structs should start with proper alignment */
	WARN_ON(!IS_ALIGNED(offset, sizeof(u64)));

	/*
	 * Rebase each next offset from buffer-relative to user-buffer
	 * relative.  tmp->next has already been shifted by the time the
	 * loop advances, hence the "- offset" when walking.
	 */
	for (tmp = buf; tmp->next; tmp = buf + tmp->next - offset)
		tmp->next += offset;
}
1466EXPORT_SYMBOL(vfio_info_cap_shift);
1467
1468int vfio_info_add_capability(struct vfio_info_cap *caps,
1469			     struct vfio_info_cap_header *cap, size_t size)
1470{
1471	struct vfio_info_cap_header *header;
1472
1473	header = vfio_info_cap_add(caps, size, cap->id, cap->version);
1474	if (IS_ERR(header))
1475		return PTR_ERR(header);
1476
1477	memcpy(header + 1, cap + 1, size - sizeof(*header));
1478
1479	return 0;
1480}
1481EXPORT_SYMBOL(vfio_info_add_capability);
1482
/*
 * Validate a VFIO_DEVICE_SET_IRQS header on behalf of a driver.  When
 * the selected data type carries a payload, the number of payload bytes
 * following the header is reported in @*data_size.  Returns 0 on
 * success or -EINVAL on a malformed header.
 */
int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs,
				       int max_irq_type, size_t *data_size)
{
	unsigned long minsz;
	size_t size;

	minsz = offsetofend(struct vfio_irq_set, count);

	/* Short argsz, bad index, start+count u32 overflow, bad flags */
	if ((hdr->argsz < minsz) || (hdr->index >= max_irq_type) ||
	    (hdr->count >= (U32_MAX - hdr->start)) ||
	    (hdr->flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
				VFIO_IRQ_SET_ACTION_TYPE_MASK)))
		return -EINVAL;

	if (data_size)
		*data_size = 0;

	/* Requested range must fall within the device's interrupts */
	if (hdr->start >= num_irqs || hdr->start + hdr->count > num_irqs)
		return -EINVAL;

	/* Per-element payload size depends on the data type */
	switch (hdr->flags & VFIO_IRQ_SET_DATA_TYPE_MASK) {
	case VFIO_IRQ_SET_DATA_NONE:
		size = 0;
		break;
	case VFIO_IRQ_SET_DATA_BOOL:
		size = sizeof(uint8_t);
		break;
	case VFIO_IRQ_SET_DATA_EVENTFD:
		size = sizeof(int32_t);
		break;
	default:
		return -EINVAL;
	}

	if (size) {
		/* argsz must cover header plus count payload elements */
		if (hdr->argsz - minsz < hdr->count * size)
			return -EINVAL;

		/* Caller must accept a data size if one is required */
		if (!data_size)
			return -EINVAL;

		*data_size = hdr->count * size;
	}

	return 0;
}
1529EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare);
1530
1531/*
1532 * Pin contiguous user pages and return their associated host pages for local
1533 * domain only.
1534 * @device [in]  : device
1535 * @iova [in]    : starting IOVA of user pages to be pinned.
1536 * @npage [in]   : count of pages to be pinned.  This count should not
1537 *		   be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
1538 * @prot [in]    : protection flags
1539 * @pages[out]   : array of host pages
1540 * Return error or number of pages pinned.
1541 *
1542 * A driver may only call this function if the vfio_device was created
1543 * by vfio_register_emulated_iommu_dev() due to vfio_device_container_pin_pages().
1544 */
int vfio_pin_pages(struct vfio_device *device, dma_addr_t iova,
		   int npage, int prot, struct page **pages)
{
	/* group->container cannot change while a vfio device is open */
	if (!pages || !npage || WARN_ON(!vfio_assert_device_open(device)))
		return -EINVAL;
	/* Pinning without a dma_unmap callback would leak pins on unmap */
	if (!device->ops->dma_unmap)
		return -EINVAL;
	/* Legacy container path */
	if (vfio_device_has_container(device))
		return vfio_device_container_pin_pages(device, iova,
						       npage, prot, pages);
	/* iommufd path */
	if (device->iommufd_access) {
		int ret;

		/* iommufd_access_pin_pages() takes an unsigned long IOVA */
		if (iova > ULONG_MAX)
			return -EINVAL;
		/*
		 * VFIO ignores the sub page offset, npages is from the start of
		 * a PAGE_SIZE chunk of IOVA. The caller is expected to recover
		 * the sub page offset by doing:
		 *     pages[0] + (iova % PAGE_SIZE)
		 */
		ret = iommufd_access_pin_pages(
			device->iommufd_access, ALIGN_DOWN(iova, PAGE_SIZE),
			npage * PAGE_SIZE, pages,
			(prot & IOMMU_WRITE) ? IOMMUFD_ACCESS_RW_WRITE : 0);
		if (ret)
			return ret;
		return npage;
	}
	/* Neither container nor iommufd access: nothing to pin against */
	return -EINVAL;
}
1577EXPORT_SYMBOL(vfio_pin_pages);
1578
1579/*
1580 * Unpin contiguous host pages for local domain only.
1581 * @device [in]  : device
1582 * @iova [in]    : starting address of user pages to be unpinned.
1583 * @npage [in]   : count of pages to be unpinned.  This count should not
1584 *                 be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
1585 */
void vfio_unpin_pages(struct vfio_device *device, dma_addr_t iova, int npage)
{
	if (WARN_ON(!vfio_assert_device_open(device)))
		return;
	/* vfio_pin_pages() refused devices without dma_unmap; mirror it */
	if (WARN_ON(!device->ops->dma_unmap))
		return;

	/* Legacy container path */
	if (vfio_device_has_container(device)) {
		vfio_device_container_unpin_pages(device, iova, npage);
		return;
	}
	/* iommufd path; mirror the ALIGN_DOWN done when pinning */
	if (device->iommufd_access) {
		if (WARN_ON(iova > ULONG_MAX))
			return;
		iommufd_access_unpin_pages(device->iommufd_access,
					   ALIGN_DOWN(iova, PAGE_SIZE),
					   npage * PAGE_SIZE);
		return;
	}
}
1606EXPORT_SYMBOL(vfio_unpin_pages);
1607
1608/*
1609 * This interface allows the CPUs to perform some sort of virtual DMA on
1610 * behalf of the device.
1611 *
1612 * CPUs read/write from/into a range of IOVAs pointing to user space memory
1613 * into/from a kernel buffer.
1614 *
1615 * As the read/write of user space memory is conducted via the CPUs and is
1616 * not a real device DMA, it is not necessary to pin the user space memory.
1617 *
1618 * @device [in]		: VFIO device
1619 * @iova [in]		: base IOVA of a user space buffer
1620 * @data [in]		: pointer to kernel buffer
1621 * @len [in]		: kernel buffer length
1622 * @write		: indicate read or write
1623 * Return error code on failure or 0 on success.
1624 */
1625int vfio_dma_rw(struct vfio_device *device, dma_addr_t iova, void *data,
1626		size_t len, bool write)
1627{
1628	if (!data || len <= 0 || !vfio_assert_device_open(device))
1629		return -EINVAL;
1630
1631	if (vfio_device_has_container(device))
1632		return vfio_device_container_dma_rw(device, iova,
1633						    data, len, write);
1634
1635	if (device->iommufd_access) {
1636		unsigned int flags = 0;
1637
1638		if (iova > ULONG_MAX)
1639			return -EINVAL;
1640
1641		/* VFIO historically tries to auto-detect a kthread */
1642		if (!current->mm)
1643			flags |= IOMMUFD_ACCESS_RW_KTHREAD;
1644		if (write)
1645			flags |= IOMMUFD_ACCESS_RW_WRITE;
1646		return iommufd_access_rw(device->iommufd_access, iova, data,
1647					 len, flags);
1648	}
1649	return -EINVAL;
1650}
1651EXPORT_SYMBOL(vfio_dma_rw);
1652
1653/*
1654 * Module/class support
1655 */
static int __init vfio_init(void)
{
	int ret;

	ida_init(&vfio.device_ida);

	ret = vfio_group_init();
	if (ret)
		return ret;

	ret = vfio_virqfd_init();
	if (ret)
		goto err_virqfd;

	/* /sys/class/vfio-dev/vfioX */
	vfio.device_class = class_create("vfio-dev");
	if (IS_ERR(vfio.device_class)) {
		ret = PTR_ERR(vfio.device_class);
		goto err_dev_class;
	}

	ret = vfio_cdev_init(vfio.device_class);
	if (ret)
		goto err_alloc_dev_chrdev;

	vfio_debugfs_create_root();
	pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");
	return 0;

	/* Unwind in reverse order of the setup above */
err_alloc_dev_chrdev:
	class_destroy(vfio.device_class);
	vfio.device_class = NULL;
err_dev_class:
	vfio_virqfd_exit();
err_virqfd:
	vfio_group_cleanup();
	return ret;
}
1694
static void __exit vfio_cleanup(void)
{
	/* Tear down in reverse order of vfio_init() */
	vfio_debugfs_remove_root();
	ida_destroy(&vfio.device_ida);
	vfio_cdev_cleanup();
	class_destroy(vfio.device_class);
	vfio.device_class = NULL;
	vfio_virqfd_exit();
	vfio_group_cleanup();
	xa_destroy(&vfio_device_set_xa);
}
1706
1707module_init(vfio_init);
1708module_exit(vfio_cleanup);
1709
1710MODULE_IMPORT_NS(IOMMUFD);
1711MODULE_VERSION(DRIVER_VERSION);
1712MODULE_LICENSE("GPL v2");
1713MODULE_AUTHOR(DRIVER_AUTHOR);
1714MODULE_DESCRIPTION(DRIVER_DESC);
1715MODULE_SOFTDEP("post: vfio_iommu_type1 vfio_iommu_spapr_tce");
1716