1/*	$NetBSD: kvmgt.c,v 1.2 2021/12/18 23:45:31 riastradh Exp $	*/
2
3/*
4 * KVMGT - the implementation of Intel mediated pass-through framework for KVM
5 *
6 * Copyright(c) 2014-2016 Intel Corporation. All rights reserved.
7 *
8 * Permission is hereby granted, free of charge, to any person obtaining a
9 * copy of this software and associated documentation files (the "Software"),
10 * to deal in the Software without restriction, including without limitation
11 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
12 * and/or sell copies of the Software, and to permit persons to whom the
13 * Software is furnished to do so, subject to the following conditions:
14 *
15 * The above copyright notice and this permission notice (including the next
16 * paragraph) shall be included in all copies or substantial portions of the
17 * Software.
18 *
19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
22 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
24 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
25 * SOFTWARE.
26 *
27 * Authors:
28 *    Kevin Tian <kevin.tian@intel.com>
29 *    Jike Song <jike.song@intel.com>
30 *    Xiaoguang Chen <xiaoguang.chen@intel.com>
31 */
32
33#include <sys/cdefs.h>
34__KERNEL_RCSID(0, "$NetBSD: kvmgt.c,v 1.2 2021/12/18 23:45:31 riastradh Exp $");
35
36#include <linux/init.h>
37#include <linux/device.h>
38#include <linux/mm.h>
39#include <linux/mmu_context.h>
40#include <linux/sched/mm.h>
41#include <linux/types.h>
42#include <linux/list.h>
43#include <linux/rbtree.h>
44#include <linux/spinlock.h>
45#include <linux/eventfd.h>
46#include <linux/uuid.h>
47#include <linux/kvm_host.h>
48#include <linux/vfio.h>
49#include <linux/mdev.h>
50#include <linux/debugfs.h>
51
52#include <linux/nospec.h>
53
54#include "i915_drv.h"
55#include "gvt.h"
56
57static const struct intel_gvt_ops *intel_gvt_ops;
58
59/* helper macros copied from vfio-pci */
60#define VFIO_PCI_OFFSET_SHIFT   40
61#define VFIO_PCI_OFFSET_TO_INDEX(off)   (off >> VFIO_PCI_OFFSET_SHIFT)
62#define VFIO_PCI_INDEX_TO_OFFSET(index) ((u64)(index) << VFIO_PCI_OFFSET_SHIFT)
63#define VFIO_PCI_OFFSET_MASK    (((u64)(1) << VFIO_PCI_OFFSET_SHIFT) - 1)
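/*
 * The region index lives in the upper bits of the file offset.  For
 * example, an access to BAR2 (index 2) at offset 0x100 within the region
 * arrives as (2ULL << 40) | 0x100; VFIO_PCI_OFFSET_TO_INDEX() recovers the
 * index 2 and (off & VFIO_PCI_OFFSET_MASK) recovers the 0x100.
 */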
64
65#define EDID_BLOB_OFFSET (PAGE_SIZE/2)
66
67#define OPREGION_SIGNATURE "IntelGraphicsMem"
68
69struct vfio_region;
70struct intel_vgpu_regops {
71	size_t (*rw)(struct intel_vgpu *vgpu, char *buf,
72			size_t count, loff_t *ppos, bool iswrite);
73	void (*release)(struct intel_vgpu *vgpu,
74			struct vfio_region *region);
75};
76
77struct vfio_region {
78	u32				type;
79	u32				subtype;
80	size_t				size;
81	u32				flags;
82	const struct intel_vgpu_regops	*ops;
83	void				*data;
84};
85
86struct vfio_edid_region {
87	struct vfio_region_gfx_edid vfio_edid_regs;
88	void *edid_blob;
89};
90
91struct kvmgt_pgfn {
92	gfn_t gfn;
93	struct hlist_node hnode;
94};
95
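/*
 * Per-guest state for a KVM-backed vGPU: the ptable hash records which
 * guest frames are currently write-protected through KVM's page-tracking
 * interface, and track_node is the notifier registered with KVM so that
 * guest writes to those frames are forwarded to the GVT write-protect
 * handler.
 */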
96struct kvmgt_guest_info {
97	struct kvm *kvm;
98	struct intel_vgpu *vgpu;
99	struct kvm_page_track_notifier_node track_node;
100#define NR_BKT (1 << 18)
101	struct hlist_head ptable[NR_BKT];
102#undef NR_BKT
103	struct dentry *debugfs_cache_entries;
104};
105
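/*
 * One gvt_dma node is kept per pinned guest range and is linked into two
 * rbtrees at once, so the cache can be searched either by guest frame
 * number (gfn_cache) or by the DMA address handed to the device
 * (dma_addr_cache).  The kref balances repeated map requests against
 * unmap/release.
 */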
106struct gvt_dma {
107	struct intel_vgpu *vgpu;
108	struct rb_node gfn_node;
109	struct rb_node dma_addr_node;
110	gfn_t gfn;
111	dma_addr_t dma_addr;
112	unsigned long size;
113	struct kref ref;
114};
115
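/*
 * vgpu->handle stores a pointer to the kvmgt_guest_info cast to an
 * unsigned long, so any value that fits in the low byte is treated as
 * "no guest attached yet".
 */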
116static inline bool handle_valid(unsigned long handle)
117{
118	return !!(handle & ~0xff);
119}
120
121static int kvmgt_guest_init(struct mdev_device *mdev);
122static void intel_vgpu_release_work(struct work_struct *work);
123static bool kvmgt_guest_exit(struct kvmgt_guest_info *info);
124
125static void gvt_unpin_guest_page(struct intel_vgpu *vgpu, unsigned long gfn,
126		unsigned long size)
127{
128	int total_pages;
129	int npage;
130	int ret;
131
132	total_pages = roundup(size, PAGE_SIZE) / PAGE_SIZE;
133
134	for (npage = 0; npage < total_pages; npage++) {
135		unsigned long cur_gfn = gfn + npage;
136
137		ret = vfio_unpin_pages(mdev_dev(vgpu->vdev.mdev), &cur_gfn, 1);
138		WARN_ON(ret != 1);
139	}
140}
141
142/* Pin a normal or compound guest page for dma. */
143static int gvt_pin_guest_page(struct intel_vgpu *vgpu, unsigned long gfn,
144		unsigned long size, struct page **page)
145{
146	unsigned long base_pfn = 0;
147	int total_pages;
148	int npage;
149	int ret;
150
151	total_pages = roundup(size, PAGE_SIZE) / PAGE_SIZE;
	/*
	 * We pin the pages one by one to avoid allocating a big array
	 * on the stack to hold the pfns.
	 */
156	for (npage = 0; npage < total_pages; npage++) {
157		unsigned long cur_gfn = gfn + npage;
158		unsigned long pfn;
159
160		ret = vfio_pin_pages(mdev_dev(vgpu->vdev.mdev), &cur_gfn, 1,
161				     IOMMU_READ | IOMMU_WRITE, &pfn);
162		if (ret != 1) {
163			gvt_vgpu_err("vfio_pin_pages failed for gfn 0x%lx, ret %d\n",
164				     cur_gfn, ret);
165			goto err;
166		}
167
168		if (!pfn_valid(pfn)) {
169			gvt_vgpu_err("pfn 0x%lx is not mem backed\n", pfn);
170			npage++;
171			ret = -EFAULT;
172			goto err;
173		}
174
175		if (npage == 0)
176			base_pfn = pfn;
177		else if (base_pfn + npage != pfn) {
			gvt_vgpu_err("The pages are not contiguous\n");
179			ret = -EINVAL;
180			npage++;
181			goto err;
182		}
183	}
184
185	*page = pfn_to_page(base_pfn);
186	return 0;
187err:
188	gvt_unpin_guest_page(vgpu, gfn, npage * PAGE_SIZE);
189	return ret;
190}
191
192static int gvt_dma_map_page(struct intel_vgpu *vgpu, unsigned long gfn,
193		dma_addr_t *dma_addr, unsigned long size)
194{
195	struct device *dev = &vgpu->gvt->dev_priv->drm.pdev->dev;
196	struct page *page = NULL;
197	int ret;
198
199	ret = gvt_pin_guest_page(vgpu, gfn, size, &page);
200	if (ret)
201		return ret;
202
203	/* Setup DMA mapping. */
204	*dma_addr = dma_map_page(dev, page, 0, size, PCI_DMA_BIDIRECTIONAL);
205	if (dma_mapping_error(dev, *dma_addr)) {
		gvt_vgpu_err("DMA mapping failed for pfn 0x%lx\n",
			     page_to_pfn(page));
208		gvt_unpin_guest_page(vgpu, gfn, size);
209		return -ENOMEM;
210	}
211
212	return 0;
213}
214
215static void gvt_dma_unmap_page(struct intel_vgpu *vgpu, unsigned long gfn,
216		dma_addr_t dma_addr, unsigned long size)
217{
218	struct device *dev = &vgpu->gvt->dev_priv->drm.pdev->dev;
219
220	dma_unmap_page(dev, dma_addr, size, PCI_DMA_BIDIRECTIONAL);
221	gvt_unpin_guest_page(vgpu, gfn, size);
222}
223
224static struct gvt_dma *__gvt_cache_find_dma_addr(struct intel_vgpu *vgpu,
225		dma_addr_t dma_addr)
226{
227	struct rb_node *node = vgpu->vdev.dma_addr_cache.rb_node;
228	struct gvt_dma *itr;
229
230	while (node) {
231		itr = rb_entry(node, struct gvt_dma, dma_addr_node);
232
233		if (dma_addr < itr->dma_addr)
234			node = node->rb_left;
235		else if (dma_addr > itr->dma_addr)
236			node = node->rb_right;
237		else
238			return itr;
239	}
240	return NULL;
241}
242
243static struct gvt_dma *__gvt_cache_find_gfn(struct intel_vgpu *vgpu, gfn_t gfn)
244{
245	struct rb_node *node = vgpu->vdev.gfn_cache.rb_node;
246	struct gvt_dma *itr;
247
248	while (node) {
249		itr = rb_entry(node, struct gvt_dma, gfn_node);
250
251		if (gfn < itr->gfn)
252			node = node->rb_left;
253		else if (gfn > itr->gfn)
254			node = node->rb_right;
255		else
256			return itr;
257	}
258	return NULL;
259}
260
261static int __gvt_cache_add(struct intel_vgpu *vgpu, gfn_t gfn,
262		dma_addr_t dma_addr, unsigned long size)
263{
264	struct gvt_dma *new, *itr;
265	struct rb_node **link, *parent = NULL;
266
267	new = kzalloc(sizeof(struct gvt_dma), GFP_KERNEL);
268	if (!new)
269		return -ENOMEM;
270
271	new->vgpu = vgpu;
272	new->gfn = gfn;
273	new->dma_addr = dma_addr;
274	new->size = size;
275	kref_init(&new->ref);
276
277	/* gfn_cache maps gfn to struct gvt_dma. */
278	link = &vgpu->vdev.gfn_cache.rb_node;
279	while (*link) {
280		parent = *link;
281		itr = rb_entry(parent, struct gvt_dma, gfn_node);
282
283		if (gfn < itr->gfn)
284			link = &parent->rb_left;
285		else
286			link = &parent->rb_right;
287	}
288	rb_link_node(&new->gfn_node, parent, link);
289	rb_insert_color(&new->gfn_node, &vgpu->vdev.gfn_cache);
290
291	/* dma_addr_cache maps dma addr to struct gvt_dma. */
292	parent = NULL;
293	link = &vgpu->vdev.dma_addr_cache.rb_node;
294	while (*link) {
295		parent = *link;
296		itr = rb_entry(parent, struct gvt_dma, dma_addr_node);
297
298		if (dma_addr < itr->dma_addr)
299			link = &parent->rb_left;
300		else
301			link = &parent->rb_right;
302	}
303	rb_link_node(&new->dma_addr_node, parent, link);
304	rb_insert_color(&new->dma_addr_node, &vgpu->vdev.dma_addr_cache);
305
306	vgpu->vdev.nr_cache_entries++;
307	return 0;
308}
309
310static void __gvt_cache_remove_entry(struct intel_vgpu *vgpu,
311				struct gvt_dma *entry)
312{
313	rb_erase(&entry->gfn_node, &vgpu->vdev.gfn_cache);
314	rb_erase(&entry->dma_addr_node, &vgpu->vdev.dma_addr_cache);
315	kfree(entry);
316	vgpu->vdev.nr_cache_entries--;
317}
318
319static void gvt_cache_destroy(struct intel_vgpu *vgpu)
320{
321	struct gvt_dma *dma;
322	struct rb_node *node = NULL;
323
324	for (;;) {
325		mutex_lock(&vgpu->vdev.cache_lock);
326		node = rb_first(&vgpu->vdev.gfn_cache);
327		if (!node) {
328			mutex_unlock(&vgpu->vdev.cache_lock);
329			break;
330		}
331		dma = rb_entry(node, struct gvt_dma, gfn_node);
332		gvt_dma_unmap_page(vgpu, dma->gfn, dma->dma_addr, dma->size);
333		__gvt_cache_remove_entry(vgpu, dma);
334		mutex_unlock(&vgpu->vdev.cache_lock);
335	}
336}
337
338static void gvt_cache_init(struct intel_vgpu *vgpu)
339{
340	vgpu->vdev.gfn_cache = RB_ROOT;
341	vgpu->vdev.dma_addr_cache = RB_ROOT;
342	vgpu->vdev.nr_cache_entries = 0;
343	mutex_init(&vgpu->vdev.cache_lock);
344}
345
346static void kvmgt_protect_table_init(struct kvmgt_guest_info *info)
347{
348	hash_init(info->ptable);
349}
350
351static void kvmgt_protect_table_destroy(struct kvmgt_guest_info *info)
352{
353	struct kvmgt_pgfn *p;
354	struct hlist_node *tmp;
355	int i;
356
357	hash_for_each_safe(info->ptable, i, tmp, p, hnode) {
358		hash_del(&p->hnode);
359		kfree(p);
360	}
361}
362
363static struct kvmgt_pgfn *
364__kvmgt_protect_table_find(struct kvmgt_guest_info *info, gfn_t gfn)
365{
366	struct kvmgt_pgfn *p, *res = NULL;
367
368	hash_for_each_possible(info->ptable, p, hnode, gfn) {
369		if (gfn == p->gfn) {
370			res = p;
371			break;
372		}
373	}
374
375	return res;
376}
377
378static bool kvmgt_gfn_is_write_protected(struct kvmgt_guest_info *info,
379				gfn_t gfn)
380{
381	struct kvmgt_pgfn *p;
382
383	p = __kvmgt_protect_table_find(info, gfn);
384	return !!p;
385}
386
387static void kvmgt_protect_table_add(struct kvmgt_guest_info *info, gfn_t gfn)
388{
389	struct kvmgt_pgfn *p;
390
391	if (kvmgt_gfn_is_write_protected(info, gfn))
392		return;
393
394	p = kzalloc(sizeof(struct kvmgt_pgfn), GFP_ATOMIC);
395	if (WARN(!p, "gfn: 0x%llx\n", gfn))
396		return;
397
398	p->gfn = gfn;
399	hash_add(info->ptable, &p->hnode, gfn);
400}
401
402static void kvmgt_protect_table_del(struct kvmgt_guest_info *info,
403				gfn_t gfn)
404{
405	struct kvmgt_pgfn *p;
406
407	p = __kvmgt_protect_table_find(info, gfn);
408	if (p) {
409		hash_del(&p->hnode);
410		kfree(p);
411	}
412}
413
414static size_t intel_vgpu_reg_rw_opregion(struct intel_vgpu *vgpu, char *buf,
415		size_t count, loff_t *ppos, bool iswrite)
416{
417	unsigned int i = VFIO_PCI_OFFSET_TO_INDEX(*ppos) -
418			VFIO_PCI_NUM_REGIONS;
419	void *base = vgpu->vdev.region[i].data;
420	loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;
421
422	if (pos >= vgpu->vdev.region[i].size || iswrite) {
423		gvt_vgpu_err("invalid op or offset for Intel vgpu OpRegion\n");
424		return -EINVAL;
425	}
426	count = min(count, (size_t)(vgpu->vdev.region[i].size - pos));
427	memcpy(buf, base + pos, count);
428
429	return count;
430}
431
432static void intel_vgpu_reg_release_opregion(struct intel_vgpu *vgpu,
433		struct vfio_region *region)
434{
435}
436
437static const struct intel_vgpu_regops intel_vgpu_regops_opregion = {
438	.rw = intel_vgpu_reg_rw_opregion,
439	.release = intel_vgpu_reg_release_opregion,
440};
441
442static int handle_edid_regs(struct intel_vgpu *vgpu,
443			struct vfio_edid_region *region, char *buf,
444			size_t count, u16 offset, bool is_write)
445{
446	struct vfio_region_gfx_edid *regs = &region->vfio_edid_regs;
447	unsigned int data;
448
449	if (offset + count > sizeof(*regs))
450		return -EINVAL;
451
452	if (count != 4)
453		return -EINVAL;
454
455	if (is_write) {
456		data = *((unsigned int *)buf);
457		switch (offset) {
458		case offsetof(struct vfio_region_gfx_edid, link_state):
459			if (data == VFIO_DEVICE_GFX_LINK_STATE_UP) {
460				if (!drm_edid_block_valid(
461					(u8 *)region->edid_blob,
462					0,
463					true,
464					NULL)) {
465					gvt_vgpu_err("invalid EDID blob\n");
466					return -EINVAL;
467				}
468				intel_gvt_ops->emulate_hotplug(vgpu, true);
469			} else if (data == VFIO_DEVICE_GFX_LINK_STATE_DOWN)
470				intel_gvt_ops->emulate_hotplug(vgpu, false);
471			else {
				gvt_vgpu_err("invalid EDID link state %d\n",
					data);
474				return -EINVAL;
475			}
476			regs->link_state = data;
477			break;
478		case offsetof(struct vfio_region_gfx_edid, edid_size):
479			if (data > regs->edid_max_size) {
480				gvt_vgpu_err("EDID size is bigger than %d!\n",
481					regs->edid_max_size);
482				return -EINVAL;
483			}
484			regs->edid_size = data;
485			break;
486		default:
487			/* read-only regs */
488			gvt_vgpu_err("write read-only EDID region at offset %d\n",
489				offset);
490			return -EPERM;
491		}
492	} else {
493		memcpy(buf, (char *)regs + offset, count);
494	}
495
496	return count;
497}
498
499static int handle_edid_blob(struct vfio_edid_region *region, char *buf,
500			size_t count, u16 offset, bool is_write)
501{
502	if (offset + count > region->vfio_edid_regs.edid_size)
503		return -EINVAL;
504
505	if (is_write)
506		memcpy(region->edid_blob + offset, buf, count);
507	else
508		memcpy(buf, region->edid_blob + offset, count);
509
510	return count;
511}
512
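/*
 * Layout of the EDID region: offsets below edid_offset (EDID_BLOB_OFFSET,
 * i.e. half a page) address the vfio_region_gfx_edid control registers,
 * while offsets at or above it address the EDID blob itself.
 */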
513static size_t intel_vgpu_reg_rw_edid(struct intel_vgpu *vgpu, char *buf,
514		size_t count, loff_t *ppos, bool iswrite)
515{
516	int ret;
517	unsigned int i = VFIO_PCI_OFFSET_TO_INDEX(*ppos) -
518			VFIO_PCI_NUM_REGIONS;
519	struct vfio_edid_region *region =
520		(struct vfio_edid_region *)vgpu->vdev.region[i].data;
521	loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;
522
523	if (pos < region->vfio_edid_regs.edid_offset) {
524		ret = handle_edid_regs(vgpu, region, buf, count, pos, iswrite);
525	} else {
526		pos -= EDID_BLOB_OFFSET;
527		ret = handle_edid_blob(region, buf, count, pos, iswrite);
528	}
529
530	if (ret < 0)
531		gvt_vgpu_err("failed to access EDID region\n");
532
533	return ret;
534}
535
536static void intel_vgpu_reg_release_edid(struct intel_vgpu *vgpu,
537					struct vfio_region *region)
538{
539	kfree(region->data);
540}
541
542static const struct intel_vgpu_regops intel_vgpu_regops_edid = {
543	.rw = intel_vgpu_reg_rw_edid,
544	.release = intel_vgpu_reg_release_edid,
545};
546
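/*
 * Device-specific regions (opregion, EDID) are appended after the standard
 * vfio-pci regions; region n registered here is reported to userspace as
 * index VFIO_PCI_NUM_REGIONS + n.
 */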
547static int intel_vgpu_register_reg(struct intel_vgpu *vgpu,
548		unsigned int type, unsigned int subtype,
549		const struct intel_vgpu_regops *ops,
550		size_t size, u32 flags, void *data)
551{
552	struct vfio_region *region;
553
554	region = krealloc(vgpu->vdev.region,
555			(vgpu->vdev.num_regions + 1) * sizeof(*region),
556			GFP_KERNEL);
557	if (!region)
558		return -ENOMEM;
559
560	vgpu->vdev.region = region;
561	vgpu->vdev.region[vgpu->vdev.num_regions].type = type;
562	vgpu->vdev.region[vgpu->vdev.num_regions].subtype = subtype;
563	vgpu->vdev.region[vgpu->vdev.num_regions].ops = ops;
564	vgpu->vdev.region[vgpu->vdev.num_regions].size = size;
565	vgpu->vdev.region[vgpu->vdev.num_regions].flags = flags;
566	vgpu->vdev.region[vgpu->vdev.num_regions].data = data;
567	vgpu->vdev.num_regions++;
568	return 0;
569}
570
571static int kvmgt_get_vfio_device(void *p_vgpu)
572{
573	struct intel_vgpu *vgpu = (struct intel_vgpu *)p_vgpu;
574
575	vgpu->vdev.vfio_device = vfio_device_get_from_dev(
576		mdev_dev(vgpu->vdev.mdev));
577	if (!vgpu->vdev.vfio_device) {
578		gvt_vgpu_err("failed to get vfio device\n");
579		return -ENODEV;
580	}
581	return 0;
582}
583
584
585static int kvmgt_set_opregion(void *p_vgpu)
586{
587	struct intel_vgpu *vgpu = (struct intel_vgpu *)p_vgpu;
588	void *base;
589	int ret;
590
	/* Each vgpu has its own opregion, although VFIO would create another
	 * one later.  This one is only used to expose the opregion to VFIO;
	 * the one VFIO creates later is what the guest actually uses.
	 */
595	base = vgpu_opregion(vgpu)->va;
596	if (!base)
597		return -ENOMEM;
598
599	if (memcmp(base, OPREGION_SIGNATURE, 16)) {
600		memunmap(base);
601		return -EINVAL;
602	}
603
604	ret = intel_vgpu_register_reg(vgpu,
605			PCI_VENDOR_ID_INTEL | VFIO_REGION_TYPE_PCI_VENDOR_TYPE,
606			VFIO_REGION_SUBTYPE_INTEL_IGD_OPREGION,
607			&intel_vgpu_regops_opregion, OPREGION_SIZE,
608			VFIO_REGION_INFO_FLAG_READ, base);
609
610	return ret;
611}
612
613static int kvmgt_set_edid(void *p_vgpu, int port_num)
614{
615	struct intel_vgpu *vgpu = (struct intel_vgpu *)p_vgpu;
616	struct intel_vgpu_port *port = intel_vgpu_port(vgpu, port_num);
617	struct vfio_edid_region *base;
618	int ret;
619
620	base = kzalloc(sizeof(*base), GFP_KERNEL);
621	if (!base)
622		return -ENOMEM;
623
624	/* TODO: Add multi-port and EDID extension block support */
625	base->vfio_edid_regs.edid_offset = EDID_BLOB_OFFSET;
626	base->vfio_edid_regs.edid_max_size = EDID_SIZE;
627	base->vfio_edid_regs.edid_size = EDID_SIZE;
628	base->vfio_edid_regs.max_xres = vgpu_edid_xres(port->id);
629	base->vfio_edid_regs.max_yres = vgpu_edid_yres(port->id);
630	base->edid_blob = port->edid->edid_block;
631
632	ret = intel_vgpu_register_reg(vgpu,
633			VFIO_REGION_TYPE_GFX,
634			VFIO_REGION_SUBTYPE_GFX_EDID,
635			&intel_vgpu_regops_edid, EDID_SIZE,
636			VFIO_REGION_INFO_FLAG_READ |
637			VFIO_REGION_INFO_FLAG_WRITE |
638			VFIO_REGION_INFO_FLAG_CAPS, base);
639
640	return ret;
641}
642
643static void kvmgt_put_vfio_device(void *vgpu)
644{
645	if (WARN_ON(!((struct intel_vgpu *)vgpu)->vdev.vfio_device))
646		return;
647
648	vfio_device_put(((struct intel_vgpu *)vgpu)->vdev.vfio_device);
649}
650
651static int intel_vgpu_create(struct kobject *kobj, struct mdev_device *mdev)
652{
653	struct intel_vgpu *vgpu = NULL;
654	struct intel_vgpu_type *type;
655	struct device *pdev;
656	void *gvt;
657	int ret;
658
659	pdev = mdev_parent_dev(mdev);
660	gvt = kdev_to_i915(pdev)->gvt;
661
662	type = intel_gvt_ops->gvt_find_vgpu_type(gvt, kobject_name(kobj));
663	if (!type) {
664		gvt_vgpu_err("failed to find type %s to create\n",
665						kobject_name(kobj));
666		ret = -EINVAL;
667		goto out;
668	}
669
670	vgpu = intel_gvt_ops->vgpu_create(gvt, type);
671	if (IS_ERR_OR_NULL(vgpu)) {
672		ret = vgpu == NULL ? -EFAULT : PTR_ERR(vgpu);
673		gvt_err("failed to create intel vgpu: %d\n", ret);
674		goto out;
675	}
676
677	INIT_WORK(&vgpu->vdev.release_work, intel_vgpu_release_work);
678
679	vgpu->vdev.mdev = mdev;
680	mdev_set_drvdata(mdev, vgpu);
681
682	gvt_dbg_core("intel_vgpu_create succeeded for mdev: %s\n",
683		     dev_name(mdev_dev(mdev)));
684	ret = 0;
685
686out:
687	return ret;
688}
689
690static int intel_vgpu_remove(struct mdev_device *mdev)
691{
692	struct intel_vgpu *vgpu = mdev_get_drvdata(mdev);
693
694	if (handle_valid(vgpu->handle))
695		return -EBUSY;
696
697	intel_gvt_ops->vgpu_destroy(vgpu);
698	return 0;
699}
700
701static int intel_vgpu_iommu_notifier(struct notifier_block *nb,
702				     unsigned long action, void *data)
703{
704	struct intel_vgpu *vgpu = container_of(nb,
705					struct intel_vgpu,
706					vdev.iommu_notifier);
707
708	if (action == VFIO_IOMMU_NOTIFY_DMA_UNMAP) {
709		struct vfio_iommu_type1_dma_unmap *unmap = data;
710		struct gvt_dma *entry;
711		unsigned long iov_pfn, end_iov_pfn;
712
713		iov_pfn = unmap->iova >> PAGE_SHIFT;
714		end_iov_pfn = iov_pfn + unmap->size / PAGE_SIZE;
715
716		mutex_lock(&vgpu->vdev.cache_lock);
717		for (; iov_pfn < end_iov_pfn; iov_pfn++) {
718			entry = __gvt_cache_find_gfn(vgpu, iov_pfn);
719			if (!entry)
720				continue;
721
722			gvt_dma_unmap_page(vgpu, entry->gfn, entry->dma_addr,
723					   entry->size);
724			__gvt_cache_remove_entry(vgpu, entry);
725		}
726		mutex_unlock(&vgpu->vdev.cache_lock);
727	}
728
729	return NOTIFY_OK;
730}
731
732static int intel_vgpu_group_notifier(struct notifier_block *nb,
733				     unsigned long action, void *data)
734{
735	struct intel_vgpu *vgpu = container_of(nb,
736					struct intel_vgpu,
737					vdev.group_notifier);
738
739	/* the only action we care about */
740	if (action == VFIO_GROUP_NOTIFY_SET_KVM) {
741		vgpu->vdev.kvm = data;
742
743		if (!data)
744			schedule_work(&vgpu->vdev.release_work);
745	}
746
747	return NOTIFY_OK;
748}
749
750static int intel_vgpu_open(struct mdev_device *mdev)
751{
752	struct intel_vgpu *vgpu = mdev_get_drvdata(mdev);
753	unsigned long events;
754	int ret;
755
756	vgpu->vdev.iommu_notifier.notifier_call = intel_vgpu_iommu_notifier;
757	vgpu->vdev.group_notifier.notifier_call = intel_vgpu_group_notifier;
758
759	events = VFIO_IOMMU_NOTIFY_DMA_UNMAP;
760	ret = vfio_register_notifier(mdev_dev(mdev), VFIO_IOMMU_NOTIFY, &events,
761				&vgpu->vdev.iommu_notifier);
762	if (ret != 0) {
763		gvt_vgpu_err("vfio_register_notifier for iommu failed: %d\n",
764			ret);
765		goto out;
766	}
767
768	events = VFIO_GROUP_NOTIFY_SET_KVM;
769	ret = vfio_register_notifier(mdev_dev(mdev), VFIO_GROUP_NOTIFY, &events,
770				&vgpu->vdev.group_notifier);
771	if (ret != 0) {
772		gvt_vgpu_err("vfio_register_notifier for group failed: %d\n",
773			ret);
774		goto undo_iommu;
775	}
776
	/* Take a module reference, as the mdev core doesn't take
	 * one on behalf of the vendor driver.
	 */
780	if (!try_module_get(THIS_MODULE))
781		goto undo_group;
782
783	ret = kvmgt_guest_init(mdev);
784	if (ret)
785		goto undo_group;
786
787	intel_gvt_ops->vgpu_activate(vgpu);
788
789	atomic_set(&vgpu->vdev.released, 0);
790	return ret;
791
792undo_group:
793	vfio_unregister_notifier(mdev_dev(mdev), VFIO_GROUP_NOTIFY,
794					&vgpu->vdev.group_notifier);
795
796undo_iommu:
797	vfio_unregister_notifier(mdev_dev(mdev), VFIO_IOMMU_NOTIFY,
798					&vgpu->vdev.iommu_notifier);
799out:
800	return ret;
801}
802
803static void intel_vgpu_release_msi_eventfd_ctx(struct intel_vgpu *vgpu)
804{
805	struct eventfd_ctx *trigger;
806
807	trigger = vgpu->vdev.msi_trigger;
808	if (trigger) {
809		eventfd_ctx_put(trigger);
810		vgpu->vdev.msi_trigger = NULL;
811	}
812}
813
814static void __intel_vgpu_release(struct intel_vgpu *vgpu)
815{
816	struct kvmgt_guest_info *info;
817	int ret;
818
819	if (!handle_valid(vgpu->handle))
820		return;
821
822	if (atomic_cmpxchg(&vgpu->vdev.released, 0, 1))
823		return;
824
825	intel_gvt_ops->vgpu_release(vgpu);
826
827	ret = vfio_unregister_notifier(mdev_dev(vgpu->vdev.mdev), VFIO_IOMMU_NOTIFY,
828					&vgpu->vdev.iommu_notifier);
829	WARN(ret, "vfio_unregister_notifier for iommu failed: %d\n", ret);
830
831	ret = vfio_unregister_notifier(mdev_dev(vgpu->vdev.mdev), VFIO_GROUP_NOTIFY,
832					&vgpu->vdev.group_notifier);
833	WARN(ret, "vfio_unregister_notifier for group failed: %d\n", ret);
834
	/* drop the module reference taken at open */
836	module_put(THIS_MODULE);
837
838	info = (struct kvmgt_guest_info *)vgpu->handle;
839	kvmgt_guest_exit(info);
840
841	intel_vgpu_release_msi_eventfd_ctx(vgpu);
842
843	vgpu->vdev.kvm = NULL;
844	vgpu->handle = 0;
845}
846
847static void intel_vgpu_release(struct mdev_device *mdev)
848{
849	struct intel_vgpu *vgpu = mdev_get_drvdata(mdev);
850
851	__intel_vgpu_release(vgpu);
852}
853
854static void intel_vgpu_release_work(struct work_struct *work)
855{
856	struct intel_vgpu *vgpu = container_of(work, struct intel_vgpu,
857					vdev.release_work);
858
859	__intel_vgpu_release(vgpu);
860}
861
862static u64 intel_vgpu_get_bar_addr(struct intel_vgpu *vgpu, int bar)
863{
864	u32 start_lo, start_hi;
865	u32 mem_type;
866
867	start_lo = (*(u32 *)(vgpu->cfg_space.virtual_cfg_space + bar)) &
868			PCI_BASE_ADDRESS_MEM_MASK;
869	mem_type = (*(u32 *)(vgpu->cfg_space.virtual_cfg_space + bar)) &
870			PCI_BASE_ADDRESS_MEM_TYPE_MASK;
871
872	switch (mem_type) {
873	case PCI_BASE_ADDRESS_MEM_TYPE_64:
874		start_hi = (*(u32 *)(vgpu->cfg_space.virtual_cfg_space
875						+ bar + 4));
876		break;
877	case PCI_BASE_ADDRESS_MEM_TYPE_32:
878	case PCI_BASE_ADDRESS_MEM_TYPE_1M:
879		/* 1M mem BAR treated as 32-bit BAR */
880	default:
		/* unknown mem type treated as 32-bit BAR */
882		start_hi = 0;
883		break;
884	}
885
886	return ((u64)start_hi << 32) | start_lo;
887}
888
889static int intel_vgpu_bar_rw(struct intel_vgpu *vgpu, int bar, u64 off,
890			     void *buf, unsigned int count, bool is_write)
891{
892	u64 bar_start = intel_vgpu_get_bar_addr(vgpu, bar);
893	int ret;
894
895	if (is_write)
896		ret = intel_gvt_ops->emulate_mmio_write(vgpu,
897					bar_start + off, buf, count);
898	else
899		ret = intel_gvt_ops->emulate_mmio_read(vgpu,
900					bar_start + off, buf, count);
901	return ret;
902}
903
904static inline bool intel_vgpu_in_aperture(struct intel_vgpu *vgpu, u64 off)
905{
906	return off >= vgpu_aperture_offset(vgpu) &&
907	       off < vgpu_aperture_offset(vgpu) + vgpu_aperture_sz(vgpu);
908}
909
910static int intel_vgpu_aperture_rw(struct intel_vgpu *vgpu, u64 off,
911		void *buf, unsigned long count, bool is_write)
912{
913	void __iomem *aperture_va;
914
915	if (!intel_vgpu_in_aperture(vgpu, off) ||
916	    !intel_vgpu_in_aperture(vgpu, off + count)) {
917		gvt_vgpu_err("Invalid aperture offset %llu\n", off);
918		return -EINVAL;
919	}
920
921	aperture_va = io_mapping_map_wc(&vgpu->gvt->dev_priv->ggtt.iomap,
922					ALIGN_DOWN(off, PAGE_SIZE),
923					count + offset_in_page(off));
924	if (!aperture_va)
925		return -EIO;
926
927	if (is_write)
928		memcpy_toio(aperture_va + offset_in_page(off), buf, count);
929	else
930		memcpy_fromio(buf, aperture_va + offset_in_page(off), count);
931
932	io_mapping_unmap(aperture_va);
933
934	return 0;
935}
936
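/*
 * Dispatch an access by region index: config space and BAR0 go through the
 * GVT emulation handlers, BAR2 is backed by the aperture, and any index
 * beyond the standard vfio-pci set is handed to the matching
 * device-specific region's rw hook.
 */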
937static ssize_t intel_vgpu_rw(struct mdev_device *mdev, char *buf,
938			size_t count, loff_t *ppos, bool is_write)
939{
940	struct intel_vgpu *vgpu = mdev_get_drvdata(mdev);
941	unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
942	u64 pos = *ppos & VFIO_PCI_OFFSET_MASK;
943	int ret = -EINVAL;
944
945
946	if (index >= VFIO_PCI_NUM_REGIONS + vgpu->vdev.num_regions) {
947		gvt_vgpu_err("invalid index: %u\n", index);
948		return -EINVAL;
949	}
950
951	switch (index) {
952	case VFIO_PCI_CONFIG_REGION_INDEX:
953		if (is_write)
954			ret = intel_gvt_ops->emulate_cfg_write(vgpu, pos,
955						buf, count);
956		else
957			ret = intel_gvt_ops->emulate_cfg_read(vgpu, pos,
958						buf, count);
959		break;
960	case VFIO_PCI_BAR0_REGION_INDEX:
961		ret = intel_vgpu_bar_rw(vgpu, PCI_BASE_ADDRESS_0, pos,
962					buf, count, is_write);
963		break;
964	case VFIO_PCI_BAR2_REGION_INDEX:
965		ret = intel_vgpu_aperture_rw(vgpu, pos, buf, count, is_write);
966		break;
967	case VFIO_PCI_BAR1_REGION_INDEX:
968	case VFIO_PCI_BAR3_REGION_INDEX:
969	case VFIO_PCI_BAR4_REGION_INDEX:
970	case VFIO_PCI_BAR5_REGION_INDEX:
971	case VFIO_PCI_VGA_REGION_INDEX:
972	case VFIO_PCI_ROM_REGION_INDEX:
973		break;
974	default:
975		if (index >= VFIO_PCI_NUM_REGIONS + vgpu->vdev.num_regions)
976			return -EINVAL;
977
978		index -= VFIO_PCI_NUM_REGIONS;
979		return vgpu->vdev.region[index].ops->rw(vgpu, buf, count,
980				ppos, is_write);
981	}
982
983	return ret == 0 ? count : ret;
984}
985
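/*
 * Decide whether an access falls within the GGTT entry range of BAR0's
 * MMIO space; the 8-byte access path in intel_vgpu_read()/intel_vgpu_write()
 * is only taken for such accesses.
 */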
986static bool gtt_entry(struct mdev_device *mdev, loff_t *ppos)
987{
988	struct intel_vgpu *vgpu = mdev_get_drvdata(mdev);
989	unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
990	struct intel_gvt *gvt = vgpu->gvt;
991	int offset;
992
993	/* Only allow MMIO GGTT entry access */
994	if (index != PCI_BASE_ADDRESS_0)
995		return false;
996
997	offset = (u64)(*ppos & VFIO_PCI_OFFSET_MASK) -
998		intel_vgpu_get_bar_gpa(vgpu, PCI_BASE_ADDRESS_0);
999
1000	return (offset >= gvt->device_info.gtt_start_offset &&
1001		offset < gvt->device_info.gtt_start_offset + gvt_ggtt_sz(gvt)) ?
1002			true : false;
1003}
1004
1005static ssize_t intel_vgpu_read(struct mdev_device *mdev, char __user *buf,
1006			size_t count, loff_t *ppos)
1007{
1008	unsigned int done = 0;
1009	int ret;
1010
1011	while (count) {
1012		size_t filled;
1013
		/* 8-byte reads are only supported for GGTT entries */
1015		if (count >= 8 && !(*ppos % 8) &&
1016			gtt_entry(mdev, ppos)) {
1017			u64 val;
1018
1019			ret = intel_vgpu_rw(mdev, (char *)&val, sizeof(val),
1020					ppos, false);
1021			if (ret <= 0)
1022				goto read_err;
1023
1024			if (copy_to_user(buf, &val, sizeof(val)))
1025				goto read_err;
1026
1027			filled = 8;
1028		} else if (count >= 4 && !(*ppos % 4)) {
1029			u32 val;
1030
1031			ret = intel_vgpu_rw(mdev, (char *)&val, sizeof(val),
1032					ppos, false);
1033			if (ret <= 0)
1034				goto read_err;
1035
1036			if (copy_to_user(buf, &val, sizeof(val)))
1037				goto read_err;
1038
1039			filled = 4;
1040		} else if (count >= 2 && !(*ppos % 2)) {
1041			u16 val;
1042
1043			ret = intel_vgpu_rw(mdev, (char *)&val, sizeof(val),
1044					ppos, false);
1045			if (ret <= 0)
1046				goto read_err;
1047
1048			if (copy_to_user(buf, &val, sizeof(val)))
1049				goto read_err;
1050
1051			filled = 2;
1052		} else {
1053			u8 val;
1054
1055			ret = intel_vgpu_rw(mdev, &val, sizeof(val), ppos,
1056					false);
1057			if (ret <= 0)
1058				goto read_err;
1059
1060			if (copy_to_user(buf, &val, sizeof(val)))
1061				goto read_err;
1062
1063			filled = 1;
1064		}
1065
1066		count -= filled;
1067		done += filled;
1068		*ppos += filled;
1069		buf += filled;
1070	}
1071
1072	return done;
1073
1074read_err:
1075	return -EFAULT;
1076}
1077
1078static ssize_t intel_vgpu_write(struct mdev_device *mdev,
1079				const char __user *buf,
1080				size_t count, loff_t *ppos)
1081{
1082	unsigned int done = 0;
1083	int ret;
1084
1085	while (count) {
1086		size_t filled;
1087
		/* 8-byte writes are only supported for GGTT entries */
1089		if (count >= 8 && !(*ppos % 8) &&
1090			gtt_entry(mdev, ppos)) {
1091			u64 val;
1092
1093			if (copy_from_user(&val, buf, sizeof(val)))
1094				goto write_err;
1095
1096			ret = intel_vgpu_rw(mdev, (char *)&val, sizeof(val),
1097					ppos, true);
1098			if (ret <= 0)
1099				goto write_err;
1100
1101			filled = 8;
1102		} else if (count >= 4 && !(*ppos % 4)) {
1103			u32 val;
1104
1105			if (copy_from_user(&val, buf, sizeof(val)))
1106				goto write_err;
1107
1108			ret = intel_vgpu_rw(mdev, (char *)&val, sizeof(val),
1109					ppos, true);
1110			if (ret <= 0)
1111				goto write_err;
1112
1113			filled = 4;
1114		} else if (count >= 2 && !(*ppos % 2)) {
1115			u16 val;
1116
1117			if (copy_from_user(&val, buf, sizeof(val)))
1118				goto write_err;
1119
1120			ret = intel_vgpu_rw(mdev, (char *)&val,
1121					sizeof(val), ppos, true);
1122			if (ret <= 0)
1123				goto write_err;
1124
1125			filled = 2;
1126		} else {
1127			u8 val;
1128
1129			if (copy_from_user(&val, buf, sizeof(val)))
1130				goto write_err;
1131
1132			ret = intel_vgpu_rw(mdev, &val, sizeof(val),
1133					ppos, true);
1134			if (ret <= 0)
1135				goto write_err;
1136
1137			filled = 1;
1138		}
1139
1140		count -= filled;
1141		done += filled;
1142		*ppos += filled;
1143		buf += filled;
1144	}
1145
1146	return done;
1147write_err:
1148	return -EFAULT;
1149}
1150
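/*
 * Only the aperture BAR (BAR2) may be mmapped.  The region index is carried
 * in the upper bits of vm_pgoff: with 4 KiB pages, for example, BAR2 maps
 * at vm_pgoff = 2 << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT) = 2 << 28, and the
 * low bits select the page offset within the region, which is translated to
 * a host aperture pfn for remap_pfn_range().
 */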
1151static int intel_vgpu_mmap(struct mdev_device *mdev, struct vm_area_struct *vma)
1152{
1153	unsigned int index;
1154	u64 virtaddr;
1155	unsigned long req_size, pgoff, req_start;
1156	pgprot_t pg_prot;
1157	struct intel_vgpu *vgpu = mdev_get_drvdata(mdev);
1158
1159	index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT);
1160	if (index >= VFIO_PCI_ROM_REGION_INDEX)
1161		return -EINVAL;
1162
1163	if (vma->vm_end < vma->vm_start)
1164		return -EINVAL;
1165	if ((vma->vm_flags & VM_SHARED) == 0)
1166		return -EINVAL;
1167	if (index != VFIO_PCI_BAR2_REGION_INDEX)
1168		return -EINVAL;
1169
1170	pg_prot = vma->vm_page_prot;
1171	virtaddr = vma->vm_start;
1172	req_size = vma->vm_end - vma->vm_start;
1173	pgoff = vma->vm_pgoff &
1174		((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
1175	req_start = pgoff << PAGE_SHIFT;
1176
1177	if (!intel_vgpu_in_aperture(vgpu, req_start))
1178		return -EINVAL;
1179	if (req_start + req_size >
1180	    vgpu_aperture_offset(vgpu) + vgpu_aperture_sz(vgpu))
1181		return -EINVAL;
1182
1183	pgoff = (gvt_aperture_pa_base(vgpu->gvt) >> PAGE_SHIFT) + pgoff;
1184
1185	return remap_pfn_range(vma, virtaddr, pgoff, req_size, pg_prot);
1186}
1187
1188static int intel_vgpu_get_irq_count(struct intel_vgpu *vgpu, int type)
1189{
1190	if (type == VFIO_PCI_INTX_IRQ_INDEX || type == VFIO_PCI_MSI_IRQ_INDEX)
1191		return 1;
1192
1193	return 0;
1194}
1195
1196static int intel_vgpu_set_intx_mask(struct intel_vgpu *vgpu,
1197			unsigned int index, unsigned int start,
1198			unsigned int count, u32 flags,
1199			void *data)
1200{
1201	return 0;
1202}
1203
1204static int intel_vgpu_set_intx_unmask(struct intel_vgpu *vgpu,
1205			unsigned int index, unsigned int start,
1206			unsigned int count, u32 flags, void *data)
1207{
1208	return 0;
1209}
1210
1211static int intel_vgpu_set_intx_trigger(struct intel_vgpu *vgpu,
1212		unsigned int index, unsigned int start, unsigned int count,
1213		u32 flags, void *data)
1214{
1215	return 0;
1216}
1217
1218static int intel_vgpu_set_msi_trigger(struct intel_vgpu *vgpu,
1219		unsigned int index, unsigned int start, unsigned int count,
1220		u32 flags, void *data)
1221{
1222	struct eventfd_ctx *trigger;
1223
1224	if (flags & VFIO_IRQ_SET_DATA_EVENTFD) {
1225		int fd = *(int *)data;
1226
1227		trigger = eventfd_ctx_fdget(fd);
1228		if (IS_ERR(trigger)) {
1229			gvt_vgpu_err("eventfd_ctx_fdget failed\n");
1230			return PTR_ERR(trigger);
1231		}
1232		vgpu->vdev.msi_trigger = trigger;
1233	} else if ((flags & VFIO_IRQ_SET_DATA_NONE) && !count)
1234		intel_vgpu_release_msi_eventfd_ctx(vgpu);
1235
1236	return 0;
1237}
1238
1239static int intel_vgpu_set_irqs(struct intel_vgpu *vgpu, u32 flags,
1240		unsigned int index, unsigned int start, unsigned int count,
1241		void *data)
1242{
1243	int (*func)(struct intel_vgpu *vgpu, unsigned int index,
1244			unsigned int start, unsigned int count, u32 flags,
1245			void *data) = NULL;
1246
1247	switch (index) {
1248	case VFIO_PCI_INTX_IRQ_INDEX:
1249		switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) {
1250		case VFIO_IRQ_SET_ACTION_MASK:
1251			func = intel_vgpu_set_intx_mask;
1252			break;
1253		case VFIO_IRQ_SET_ACTION_UNMASK:
1254			func = intel_vgpu_set_intx_unmask;
1255			break;
1256		case VFIO_IRQ_SET_ACTION_TRIGGER:
1257			func = intel_vgpu_set_intx_trigger;
1258			break;
1259		}
1260		break;
1261	case VFIO_PCI_MSI_IRQ_INDEX:
1262		switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) {
1263		case VFIO_IRQ_SET_ACTION_MASK:
1264		case VFIO_IRQ_SET_ACTION_UNMASK:
1265			/* XXX Need masking support exported */
1266			break;
1267		case VFIO_IRQ_SET_ACTION_TRIGGER:
1268			func = intel_vgpu_set_msi_trigger;
1269			break;
1270		}
1271		break;
1272	}
1273
1274	if (!func)
1275		return -ENOTTY;
1276
1277	return func(vgpu, index, start, count, flags, data);
1278}
1279
1280static long intel_vgpu_ioctl(struct mdev_device *mdev, unsigned int cmd,
1281			     unsigned long arg)
1282{
1283	struct intel_vgpu *vgpu = mdev_get_drvdata(mdev);
1284	unsigned long minsz;
1285
1286	gvt_dbg_core("vgpu%d ioctl, cmd: %d\n", vgpu->id, cmd);
1287
1288	if (cmd == VFIO_DEVICE_GET_INFO) {
1289		struct vfio_device_info info;
1290
1291		minsz = offsetofend(struct vfio_device_info, num_irqs);
1292
1293		if (copy_from_user(&info, (void __user *)arg, minsz))
1294			return -EFAULT;
1295
1296		if (info.argsz < minsz)
1297			return -EINVAL;
1298
1299		info.flags = VFIO_DEVICE_FLAGS_PCI;
1300		info.flags |= VFIO_DEVICE_FLAGS_RESET;
1301		info.num_regions = VFIO_PCI_NUM_REGIONS +
1302				vgpu->vdev.num_regions;
1303		info.num_irqs = VFIO_PCI_NUM_IRQS;
1304
1305		return copy_to_user((void __user *)arg, &info, minsz) ?
1306			-EFAULT : 0;
1307
1308	} else if (cmd == VFIO_DEVICE_GET_REGION_INFO) {
1309		struct vfio_region_info info;
1310		struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
1311		unsigned int i;
1312		int ret;
1313		struct vfio_region_info_cap_sparse_mmap *sparse = NULL;
1314		int nr_areas = 1;
1315		int cap_type_id;
1316
1317		minsz = offsetofend(struct vfio_region_info, offset);
1318
1319		if (copy_from_user(&info, (void __user *)arg, minsz))
1320			return -EFAULT;
1321
1322		if (info.argsz < minsz)
1323			return -EINVAL;
1324
1325		switch (info.index) {
1326		case VFIO_PCI_CONFIG_REGION_INDEX:
1327			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
1328			info.size = vgpu->gvt->device_info.cfg_space_size;
1329			info.flags = VFIO_REGION_INFO_FLAG_READ |
1330				     VFIO_REGION_INFO_FLAG_WRITE;
1331			break;
1332		case VFIO_PCI_BAR0_REGION_INDEX:
1333			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
1334			info.size = vgpu->cfg_space.bar[info.index].size;
1335			if (!info.size) {
1336				info.flags = 0;
1337				break;
1338			}
1339
1340			info.flags = VFIO_REGION_INFO_FLAG_READ |
1341				     VFIO_REGION_INFO_FLAG_WRITE;
1342			break;
1343		case VFIO_PCI_BAR1_REGION_INDEX:
1344			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
1345			info.size = 0;
1346			info.flags = 0;
1347			break;
1348		case VFIO_PCI_BAR2_REGION_INDEX:
1349			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
1350			info.flags = VFIO_REGION_INFO_FLAG_CAPS |
1351					VFIO_REGION_INFO_FLAG_MMAP |
1352					VFIO_REGION_INFO_FLAG_READ |
1353					VFIO_REGION_INFO_FLAG_WRITE;
1354			info.size = gvt_aperture_sz(vgpu->gvt);
1355
1356			sparse = kzalloc(struct_size(sparse, areas, nr_areas),
1357					 GFP_KERNEL);
1358			if (!sparse)
1359				return -ENOMEM;
1360
1361			sparse->header.id = VFIO_REGION_INFO_CAP_SPARSE_MMAP;
1362			sparse->header.version = 1;
1363			sparse->nr_areas = nr_areas;
1364			cap_type_id = VFIO_REGION_INFO_CAP_SPARSE_MMAP;
1365			sparse->areas[0].offset =
1366					PAGE_ALIGN(vgpu_aperture_offset(vgpu));
1367			sparse->areas[0].size = vgpu_aperture_sz(vgpu);
1368			break;
1369
1370		case VFIO_PCI_BAR3_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX:
1371			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
1372			info.size = 0;
1373			info.flags = 0;
1374
1375			gvt_dbg_core("get region info bar:%d\n", info.index);
1376			break;
1377
1378		case VFIO_PCI_ROM_REGION_INDEX:
1379		case VFIO_PCI_VGA_REGION_INDEX:
1380			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
1381			info.size = 0;
1382			info.flags = 0;
1383
1384			gvt_dbg_core("get region info index:%d\n", info.index);
1385			break;
1386		default:
1387			{
1388				struct vfio_region_info_cap_type cap_type = {
1389					.header.id = VFIO_REGION_INFO_CAP_TYPE,
1390					.header.version = 1 };
1391
1392				if (info.index >= VFIO_PCI_NUM_REGIONS +
1393						vgpu->vdev.num_regions)
1394					return -EINVAL;
1395				info.index =
1396					array_index_nospec(info.index,
1397							VFIO_PCI_NUM_REGIONS +
1398							vgpu->vdev.num_regions);
1399
1400				i = info.index - VFIO_PCI_NUM_REGIONS;
1401
1402				info.offset =
1403					VFIO_PCI_INDEX_TO_OFFSET(info.index);
1404				info.size = vgpu->vdev.region[i].size;
1405				info.flags = vgpu->vdev.region[i].flags;
1406
1407				cap_type.type = vgpu->vdev.region[i].type;
1408				cap_type.subtype = vgpu->vdev.region[i].subtype;
1409
1410				ret = vfio_info_add_capability(&caps,
1411							&cap_type.header,
1412							sizeof(cap_type));
1413				if (ret)
1414					return ret;
1415			}
1416		}
1417
1418		if ((info.flags & VFIO_REGION_INFO_FLAG_CAPS) && sparse) {
1419			switch (cap_type_id) {
1420			case VFIO_REGION_INFO_CAP_SPARSE_MMAP:
1421				ret = vfio_info_add_capability(&caps,
1422					&sparse->header,
1423					struct_size(sparse, areas,
1424						    sparse->nr_areas));
1425				if (ret) {
1426					kfree(sparse);
1427					return ret;
1428				}
1429				break;
1430			default:
1431				kfree(sparse);
1432				return -EINVAL;
1433			}
1434		}
1435
1436		if (caps.size) {
1437			info.flags |= VFIO_REGION_INFO_FLAG_CAPS;
1438			if (info.argsz < sizeof(info) + caps.size) {
1439				info.argsz = sizeof(info) + caps.size;
1440				info.cap_offset = 0;
1441			} else {
1442				vfio_info_cap_shift(&caps, sizeof(info));
1443				if (copy_to_user((void __user *)arg +
1444						  sizeof(info), caps.buf,
1445						  caps.size)) {
1446					kfree(caps.buf);
1447					kfree(sparse);
1448					return -EFAULT;
1449				}
1450				info.cap_offset = sizeof(info);
1451			}
1452
1453			kfree(caps.buf);
1454		}
1455
1456		kfree(sparse);
1457		return copy_to_user((void __user *)arg, &info, minsz) ?
1458			-EFAULT : 0;
1459	} else if (cmd == VFIO_DEVICE_GET_IRQ_INFO) {
1460		struct vfio_irq_info info;
1461
1462		minsz = offsetofend(struct vfio_irq_info, count);
1463
1464		if (copy_from_user(&info, (void __user *)arg, minsz))
1465			return -EFAULT;
1466
1467		if (info.argsz < minsz || info.index >= VFIO_PCI_NUM_IRQS)
1468			return -EINVAL;
1469
1470		switch (info.index) {
1471		case VFIO_PCI_INTX_IRQ_INDEX:
1472		case VFIO_PCI_MSI_IRQ_INDEX:
1473			break;
1474		default:
1475			return -EINVAL;
1476		}
1477
1478		info.flags = VFIO_IRQ_INFO_EVENTFD;
1479
1480		info.count = intel_vgpu_get_irq_count(vgpu, info.index);
1481
1482		if (info.index == VFIO_PCI_INTX_IRQ_INDEX)
1483			info.flags |= (VFIO_IRQ_INFO_MASKABLE |
1484				       VFIO_IRQ_INFO_AUTOMASKED);
1485		else
1486			info.flags |= VFIO_IRQ_INFO_NORESIZE;
1487
1488		return copy_to_user((void __user *)arg, &info, minsz) ?
1489			-EFAULT : 0;
1490	} else if (cmd == VFIO_DEVICE_SET_IRQS) {
1491		struct vfio_irq_set hdr;
1492		u8 *data = NULL;
1493		int ret = 0;
1494		size_t data_size = 0;
1495
1496		minsz = offsetofend(struct vfio_irq_set, count);
1497
1498		if (copy_from_user(&hdr, (void __user *)arg, minsz))
1499			return -EFAULT;
1500
1501		if (!(hdr.flags & VFIO_IRQ_SET_DATA_NONE)) {
1502			int max = intel_vgpu_get_irq_count(vgpu, hdr.index);
1503
1504			ret = vfio_set_irqs_validate_and_prepare(&hdr, max,
1505						VFIO_PCI_NUM_IRQS, &data_size);
1506			if (ret) {
1507				gvt_vgpu_err("intel:vfio_set_irqs_validate_and_prepare failed\n");
1508				return -EINVAL;
1509			}
1510			if (data_size) {
1511				data = memdup_user((void __user *)(arg + minsz),
1512						   data_size);
1513				if (IS_ERR(data))
1514					return PTR_ERR(data);
1515			}
1516		}
1517
1518		ret = intel_vgpu_set_irqs(vgpu, hdr.flags, hdr.index,
1519					hdr.start, hdr.count, data);
1520		kfree(data);
1521
1522		return ret;
1523	} else if (cmd == VFIO_DEVICE_RESET) {
1524		intel_gvt_ops->vgpu_reset(vgpu);
1525		return 0;
1526	} else if (cmd == VFIO_DEVICE_QUERY_GFX_PLANE) {
1527		struct vfio_device_gfx_plane_info dmabuf;
1528		int ret = 0;
1529
1530		minsz = offsetofend(struct vfio_device_gfx_plane_info,
1531				    dmabuf_id);
1532		if (copy_from_user(&dmabuf, (void __user *)arg, minsz))
1533			return -EFAULT;
1534		if (dmabuf.argsz < minsz)
1535			return -EINVAL;
1536
1537		ret = intel_gvt_ops->vgpu_query_plane(vgpu, &dmabuf);
1538		if (ret != 0)
1539			return ret;
1540
1541		return copy_to_user((void __user *)arg, &dmabuf, minsz) ?
1542								-EFAULT : 0;
1543	} else if (cmd == VFIO_DEVICE_GET_GFX_DMABUF) {
1544		__u32 dmabuf_id;
1545		__s32 dmabuf_fd;
1546
1547		if (get_user(dmabuf_id, (__u32 __user *)arg))
1548			return -EFAULT;
1549
1550		dmabuf_fd = intel_gvt_ops->vgpu_get_dmabuf(vgpu, dmabuf_id);
1551		return dmabuf_fd;
1552
1553	}
1554
1555	return -ENOTTY;
1556}
1557
1558static ssize_t
1559vgpu_id_show(struct device *dev, struct device_attribute *attr,
1560	     char *buf)
1561{
1562	struct mdev_device *mdev = mdev_from_dev(dev);
1563
1564	if (mdev) {
1565		struct intel_vgpu *vgpu = (struct intel_vgpu *)
1566			mdev_get_drvdata(mdev);
1567		return sprintf(buf, "%d\n", vgpu->id);
1568	}
1569	return sprintf(buf, "\n");
1570}
1571
1572static DEVICE_ATTR_RO(vgpu_id);
1573
1574static struct attribute *intel_vgpu_attrs[] = {
1575	&dev_attr_vgpu_id.attr,
1576	NULL
1577};
1578
1579static const struct attribute_group intel_vgpu_group = {
1580	.name = "intel_vgpu",
1581	.attrs = intel_vgpu_attrs,
1582};
1583
1584static const struct attribute_group *intel_vgpu_groups[] = {
1585	&intel_vgpu_group,
1586	NULL,
1587};
1588
1589static struct mdev_parent_ops intel_vgpu_ops = {
1590	.mdev_attr_groups       = intel_vgpu_groups,
1591	.create			= intel_vgpu_create,
1592	.remove			= intel_vgpu_remove,
1593
1594	.open			= intel_vgpu_open,
1595	.release		= intel_vgpu_release,
1596
1597	.read			= intel_vgpu_read,
1598	.write			= intel_vgpu_write,
1599	.mmap			= intel_vgpu_mmap,
1600	.ioctl			= intel_vgpu_ioctl,
1601};
1602
1603static int kvmgt_host_init(struct device *dev, void *gvt, const void *ops)
1604{
1605	struct attribute **kvm_type_attrs;
1606	struct attribute_group **kvm_vgpu_type_groups;
1607
1608	intel_gvt_ops = ops;
1609	if (!intel_gvt_ops->get_gvt_attrs(&kvm_type_attrs,
1610			&kvm_vgpu_type_groups))
1611		return -EFAULT;
1612	intel_vgpu_ops.supported_type_groups = kvm_vgpu_type_groups;
1613
1614	return mdev_register_device(dev, &intel_vgpu_ops);
1615}
1616
1617static void kvmgt_host_exit(struct device *dev)
1618{
1619	mdev_unregister_device(dev);
1620}
1621
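/*
 * Write protection of guest pages that GVT shadows is driven through KVM's
 * page tracking: kvmgt_page_track_add()/kvmgt_page_track_remove() toggle
 * KVM_PAGE_TRACK_WRITE on the memslot for a gfn, and the local ptable hash
 * mirrors which gfns are currently protected so redundant requests are
 * ignored.
 */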
1622static int kvmgt_page_track_add(unsigned long handle, u64 gfn)
1623{
1624	struct kvmgt_guest_info *info;
1625	struct kvm *kvm;
1626	struct kvm_memory_slot *slot;
1627	int idx;
1628
1629	if (!handle_valid(handle))
1630		return -ESRCH;
1631
1632	info = (struct kvmgt_guest_info *)handle;
1633	kvm = info->kvm;
1634
1635	idx = srcu_read_lock(&kvm->srcu);
1636	slot = gfn_to_memslot(kvm, gfn);
1637	if (!slot) {
1638		srcu_read_unlock(&kvm->srcu, idx);
1639		return -EINVAL;
1640	}
1641
1642	spin_lock(&kvm->mmu_lock);
1643
1644	if (kvmgt_gfn_is_write_protected(info, gfn))
1645		goto out;
1646
1647	kvm_slot_page_track_add_page(kvm, slot, gfn, KVM_PAGE_TRACK_WRITE);
1648	kvmgt_protect_table_add(info, gfn);
1649
1650out:
1651	spin_unlock(&kvm->mmu_lock);
1652	srcu_read_unlock(&kvm->srcu, idx);
1653	return 0;
1654}
1655
1656static int kvmgt_page_track_remove(unsigned long handle, u64 gfn)
1657{
1658	struct kvmgt_guest_info *info;
1659	struct kvm *kvm;
1660	struct kvm_memory_slot *slot;
1661	int idx;
1662
1663	if (!handle_valid(handle))
1664		return 0;
1665
1666	info = (struct kvmgt_guest_info *)handle;
1667	kvm = info->kvm;
1668
1669	idx = srcu_read_lock(&kvm->srcu);
1670	slot = gfn_to_memslot(kvm, gfn);
1671	if (!slot) {
1672		srcu_read_unlock(&kvm->srcu, idx);
1673		return -EINVAL;
1674	}
1675
1676	spin_lock(&kvm->mmu_lock);
1677
1678	if (!kvmgt_gfn_is_write_protected(info, gfn))
1679		goto out;
1680
1681	kvm_slot_page_track_remove_page(kvm, slot, gfn, KVM_PAGE_TRACK_WRITE);
1682	kvmgt_protect_table_del(info, gfn);
1683
1684out:
1685	spin_unlock(&kvm->mmu_lock);
1686	srcu_read_unlock(&kvm->srcu, idx);
1687	return 0;
1688}
1689
1690static void kvmgt_page_track_write(struct kvm_vcpu *vcpu, gpa_t gpa,
1691		const u8 *val, int len,
1692		struct kvm_page_track_notifier_node *node)
1693{
1694	struct kvmgt_guest_info *info = container_of(node,
1695					struct kvmgt_guest_info, track_node);
1696
1697	if (kvmgt_gfn_is_write_protected(info, gpa_to_gfn(gpa)))
1698		intel_gvt_ops->write_protect_handler(info->vgpu, gpa,
1699						     (void *)val, len);
1700}
1701
1702static void kvmgt_page_track_flush_slot(struct kvm *kvm,
1703		struct kvm_memory_slot *slot,
1704		struct kvm_page_track_notifier_node *node)
1705{
1706	int i;
1707	gfn_t gfn;
1708	struct kvmgt_guest_info *info = container_of(node,
1709					struct kvmgt_guest_info, track_node);
1710
1711	spin_lock(&kvm->mmu_lock);
1712	for (i = 0; i < slot->npages; i++) {
1713		gfn = slot->base_gfn + i;
1714		if (kvmgt_gfn_is_write_protected(info, gfn)) {
1715			kvm_slot_page_track_remove_page(kvm, slot, gfn,
1716						KVM_PAGE_TRACK_WRITE);
1717			kvmgt_protect_table_del(info, gfn);
1718		}
1719	}
1720	spin_unlock(&kvm->mmu_lock);
1721}
1722
1723static bool __kvmgt_vgpu_exist(struct intel_vgpu *vgpu, struct kvm *kvm)
1724{
1725	struct intel_vgpu *itr;
1726	struct kvmgt_guest_info *info;
1727	int id;
1728	bool ret = false;
1729
1730	mutex_lock(&vgpu->gvt->lock);
1731	for_each_active_vgpu(vgpu->gvt, itr, id) {
1732		if (!handle_valid(itr->handle))
1733			continue;
1734
1735		info = (struct kvmgt_guest_info *)itr->handle;
1736		if (kvm && kvm == info->kvm) {
1737			ret = true;
1738			goto out;
1739		}
1740	}
1741out:
1742	mutex_unlock(&vgpu->gvt->lock);
1743	return ret;
1744}
1745
1746static int kvmgt_guest_init(struct mdev_device *mdev)
1747{
1748	struct kvmgt_guest_info *info;
1749	struct intel_vgpu *vgpu;
1750	struct kvm *kvm;
1751
1752	vgpu = mdev_get_drvdata(mdev);
1753	if (handle_valid(vgpu->handle))
1754		return -EEXIST;
1755
1756	kvm = vgpu->vdev.kvm;
1757	if (!kvm || kvm->mm != current->mm) {
1758		gvt_vgpu_err("KVM is required to use Intel vGPU\n");
1759		return -ESRCH;
1760	}
1761
1762	if (__kvmgt_vgpu_exist(vgpu, kvm))
1763		return -EEXIST;
1764
1765	info = vzalloc(sizeof(struct kvmgt_guest_info));
1766	if (!info)
1767		return -ENOMEM;
1768
1769	vgpu->handle = (unsigned long)info;
1770	info->vgpu = vgpu;
1771	info->kvm = kvm;
1772	kvm_get_kvm(info->kvm);
1773
1774	kvmgt_protect_table_init(info);
1775	gvt_cache_init(vgpu);
1776
1777	init_completion(&vgpu->vblank_done);
1778
1779	info->track_node.track_write = kvmgt_page_track_write;
1780	info->track_node.track_flush_slot = kvmgt_page_track_flush_slot;
1781	kvm_page_track_register_notifier(kvm, &info->track_node);
1782
1783	info->debugfs_cache_entries = debugfs_create_ulong(
1784						"kvmgt_nr_cache_entries",
1785						0444, vgpu->debugfs,
1786						&vgpu->vdev.nr_cache_entries);
1787	return 0;
1788}
1789
1790static bool kvmgt_guest_exit(struct kvmgt_guest_info *info)
1791{
1792	debugfs_remove(info->debugfs_cache_entries);
1793
1794	kvm_page_track_unregister_notifier(info->kvm, &info->track_node);
1795	kvm_put_kvm(info->kvm);
1796	kvmgt_protect_table_destroy(info);
1797	gvt_cache_destroy(info->vgpu);
1798	vfree(info);
1799
1800	return true;
1801}
1802
1803static int kvmgt_attach_vgpu(void *vgpu, unsigned long *handle)
1804{
1805	/* nothing to do here */
1806	return 0;
1807}
1808
1809static void kvmgt_detach_vgpu(void *p_vgpu)
1810{
1811	int i;
1812	struct intel_vgpu *vgpu = (struct intel_vgpu *)p_vgpu;
1813
1814	if (!vgpu->vdev.region)
1815		return;
1816
1817	for (i = 0; i < vgpu->vdev.num_regions; i++)
1818		if (vgpu->vdev.region[i].ops->release)
1819			vgpu->vdev.region[i].ops->release(vgpu,
1820					&vgpu->vdev.region[i]);
1821	vgpu->vdev.num_regions = 0;
1822	kfree(vgpu->vdev.region);
1823	vgpu->vdev.region = NULL;
1824}
1825
1826static int kvmgt_inject_msi(unsigned long handle, u32 addr, u16 data)
1827{
1828	struct kvmgt_guest_info *info;
1829	struct intel_vgpu *vgpu;
1830
1831	if (!handle_valid(handle))
1832		return -ESRCH;
1833
1834	info = (struct kvmgt_guest_info *)handle;
1835	vgpu = info->vgpu;
1836
	/*
	 * When the guest powers off, msi_trigger is set to NULL, but the
	 * vgpu's config space and MMIO registers are not restored to their
	 * defaults.  If this vgpu is reused by the next VM, one of its pipes
	 * may still be enabled, so once the vgpu becomes active it can
	 * receive a vblank interrupt injection request.  However, msi_trigger
	 * stays NULL until the guest enables MSI, so if msi_trigger is NULL,
	 * return success without injecting an interrupt into the guest.
	 */
1846	if (vgpu->vdev.msi_trigger == NULL)
1847		return 0;
1848
1849	if (eventfd_signal(vgpu->vdev.msi_trigger, 1) == 1)
1850		return 0;
1851
1852	return -EFAULT;
1853}
1854
1855static unsigned long kvmgt_gfn_to_pfn(unsigned long handle, unsigned long gfn)
1856{
1857	struct kvmgt_guest_info *info;
1858	kvm_pfn_t pfn;
1859
1860	if (!handle_valid(handle))
1861		return INTEL_GVT_INVALID_ADDR;
1862
1863	info = (struct kvmgt_guest_info *)handle;
1864
1865	pfn = gfn_to_pfn(info->kvm, gfn);
1866	if (is_error_noslot_pfn(pfn))
1867		return INTEL_GVT_INVALID_ADDR;
1868
1869	return pfn;
1870}
1871
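/*
 * Map a guest range for device DMA: on a cache miss the range is pinned
 * through VFIO and mapped with the DMA API, and the result is cached; a hit
 * of the same size only takes an extra reference.  A hit with a different
 * size is unmapped and remapped, since the same gfn may later be requested
 * with a different (e.g. huge-page) size.
 */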
1872static int kvmgt_dma_map_guest_page(unsigned long handle, unsigned long gfn,
1873		unsigned long size, dma_addr_t *dma_addr)
1874{
1875	struct kvmgt_guest_info *info;
1876	struct intel_vgpu *vgpu;
1877	struct gvt_dma *entry;
1878	int ret;
1879
1880	if (!handle_valid(handle))
1881		return -EINVAL;
1882
1883	info = (struct kvmgt_guest_info *)handle;
1884	vgpu = info->vgpu;
1885
1886	mutex_lock(&info->vgpu->vdev.cache_lock);
1887
1888	entry = __gvt_cache_find_gfn(info->vgpu, gfn);
1889	if (!entry) {
1890		ret = gvt_dma_map_page(vgpu, gfn, dma_addr, size);
1891		if (ret)
1892			goto err_unlock;
1893
1894		ret = __gvt_cache_add(info->vgpu, gfn, *dma_addr, size);
1895		if (ret)
1896			goto err_unmap;
1897	} else if (entry->size != size) {
1898		/* the same gfn with different size: unmap and re-map */
1899		gvt_dma_unmap_page(vgpu, gfn, entry->dma_addr, entry->size);
1900		__gvt_cache_remove_entry(vgpu, entry);
1901
1902		ret = gvt_dma_map_page(vgpu, gfn, dma_addr, size);
1903		if (ret)
1904			goto err_unlock;
1905
1906		ret = __gvt_cache_add(info->vgpu, gfn, *dma_addr, size);
1907		if (ret)
1908			goto err_unmap;
1909	} else {
1910		kref_get(&entry->ref);
1911		*dma_addr = entry->dma_addr;
1912	}
1913
1914	mutex_unlock(&info->vgpu->vdev.cache_lock);
1915	return 0;
1916
1917err_unmap:
1918	gvt_dma_unmap_page(vgpu, gfn, *dma_addr, size);
1919err_unlock:
1920	mutex_unlock(&info->vgpu->vdev.cache_lock);
1921	return ret;
1922}
1923
1924static int kvmgt_dma_pin_guest_page(unsigned long handle, dma_addr_t dma_addr)
1925{
1926	struct kvmgt_guest_info *info;
1927	struct gvt_dma *entry;
1928	int ret = 0;
1929
1930	if (!handle_valid(handle))
1931		return -ENODEV;
1932
1933	info = (struct kvmgt_guest_info *)handle;
1934
1935	mutex_lock(&info->vgpu->vdev.cache_lock);
1936	entry = __gvt_cache_find_dma_addr(info->vgpu, dma_addr);
1937	if (entry)
1938		kref_get(&entry->ref);
1939	else
1940		ret = -ENOMEM;
1941	mutex_unlock(&info->vgpu->vdev.cache_lock);
1942
1943	return ret;
1944}
1945
1946static void __gvt_dma_release(struct kref *ref)
1947{
1948	struct gvt_dma *entry = container_of(ref, typeof(*entry), ref);
1949
1950	gvt_dma_unmap_page(entry->vgpu, entry->gfn, entry->dma_addr,
1951			   entry->size);
1952	__gvt_cache_remove_entry(entry->vgpu, entry);
1953}
1954
1955static void kvmgt_dma_unmap_guest_page(unsigned long handle, dma_addr_t dma_addr)
1956{
1957	struct kvmgt_guest_info *info;
1958	struct gvt_dma *entry;
1959
1960	if (!handle_valid(handle))
1961		return;
1962
1963	info = (struct kvmgt_guest_info *)handle;
1964
1965	mutex_lock(&info->vgpu->vdev.cache_lock);
1966	entry = __gvt_cache_find_dma_addr(info->vgpu, dma_addr);
1967	if (entry)
1968		kref_put(&entry->ref, __gvt_dma_release);
1969	mutex_unlock(&info->vgpu->vdev.cache_lock);
1970}
1971
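/*
 * Guest memory is accessed through kvm_read_guest()/kvm_write_guest().
 * When called from a kernel thread (current->mm == NULL), the guest's mm is
 * temporarily adopted with use_mm() so the copy can resolve guest addresses.
 */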
1972static int kvmgt_rw_gpa(unsigned long handle, unsigned long gpa,
1973			void *buf, unsigned long len, bool write)
1974{
1975	struct kvmgt_guest_info *info;
1976	struct kvm *kvm;
1977	int idx, ret;
1978	bool kthread = current->mm == NULL;
1979
1980	if (!handle_valid(handle))
1981		return -ESRCH;
1982
1983	info = (struct kvmgt_guest_info *)handle;
1984	kvm = info->kvm;
1985
1986	if (kthread) {
1987		if (!mmget_not_zero(kvm->mm))
1988			return -EFAULT;
1989		use_mm(kvm->mm);
1990	}
1991
1992	idx = srcu_read_lock(&kvm->srcu);
1993	ret = write ? kvm_write_guest(kvm, gpa, buf, len) :
1994		      kvm_read_guest(kvm, gpa, buf, len);
1995	srcu_read_unlock(&kvm->srcu, idx);
1996
1997	if (kthread) {
1998		unuse_mm(kvm->mm);
1999		mmput(kvm->mm);
2000	}
2001
2002	return ret;
2003}
2004
2005static int kvmgt_read_gpa(unsigned long handle, unsigned long gpa,
2006			void *buf, unsigned long len)
2007{
2008	return kvmgt_rw_gpa(handle, gpa, buf, len, false);
2009}
2010
2011static int kvmgt_write_gpa(unsigned long handle, unsigned long gpa,
2012			void *buf, unsigned long len)
2013{
2014	return kvmgt_rw_gpa(handle, gpa, buf, len, true);
2015}
2016
2017static unsigned long kvmgt_virt_to_pfn(void *addr)
2018{
2019	return PFN_DOWN(__pa(addr));
2020}
2021
2022static bool kvmgt_is_valid_gfn(unsigned long handle, unsigned long gfn)
2023{
2024	struct kvmgt_guest_info *info;
2025	struct kvm *kvm;
2026	int idx;
2027	bool ret;
2028
2029	if (!handle_valid(handle))
2030		return false;
2031
2032	info = (struct kvmgt_guest_info *)handle;
2033	kvm = info->kvm;
2034
2035	idx = srcu_read_lock(&kvm->srcu);
2036	ret = kvm_is_visible_gfn(kvm, gfn);
2037	srcu_read_unlock(&kvm->srcu, idx);
2038
2039	return ret;
2040}
2041
2042static struct intel_gvt_mpt kvmgt_mpt = {
2043	.type = INTEL_GVT_HYPERVISOR_KVM,
2044	.host_init = kvmgt_host_init,
2045	.host_exit = kvmgt_host_exit,
2046	.attach_vgpu = kvmgt_attach_vgpu,
2047	.detach_vgpu = kvmgt_detach_vgpu,
2048	.inject_msi = kvmgt_inject_msi,
2049	.from_virt_to_mfn = kvmgt_virt_to_pfn,
2050	.enable_page_track = kvmgt_page_track_add,
2051	.disable_page_track = kvmgt_page_track_remove,
2052	.read_gpa = kvmgt_read_gpa,
2053	.write_gpa = kvmgt_write_gpa,
2054	.gfn_to_mfn = kvmgt_gfn_to_pfn,
2055	.dma_map_guest_page = kvmgt_dma_map_guest_page,
2056	.dma_unmap_guest_page = kvmgt_dma_unmap_guest_page,
2057	.dma_pin_guest_page = kvmgt_dma_pin_guest_page,
2058	.set_opregion = kvmgt_set_opregion,
2059	.set_edid = kvmgt_set_edid,
2060	.get_vfio_device = kvmgt_get_vfio_device,
2061	.put_vfio_device = kvmgt_put_vfio_device,
2062	.is_valid_gfn = kvmgt_is_valid_gfn,
2063};
2064
2065static int __init kvmgt_init(void)
2066{
2067	if (intel_gvt_register_hypervisor(&kvmgt_mpt) < 0)
2068		return -ENODEV;
2069	return 0;
2070}
2071
2072static void __exit kvmgt_exit(void)
2073{
2074	intel_gvt_unregister_hypervisor();
2075}
2076
2077module_init(kvmgt_init);
2078module_exit(kvmgt_exit);
2079
2080MODULE_LICENSE("GPL and additional rights");
2081MODULE_AUTHOR("Intel Corporation");
2082