vmm.c revision 261275
1221828Sgrehan/*-
2221828Sgrehan * Copyright (c) 2011 NetApp, Inc.
3221828Sgrehan * All rights reserved.
4221828Sgrehan *
5221828Sgrehan * Redistribution and use in source and binary forms, with or without
6221828Sgrehan * modification, are permitted provided that the following conditions
7221828Sgrehan * are met:
8221828Sgrehan * 1. Redistributions of source code must retain the above copyright
9221828Sgrehan *    notice, this list of conditions and the following disclaimer.
10221828Sgrehan * 2. Redistributions in binary form must reproduce the above copyright
11221828Sgrehan *    notice, this list of conditions and the following disclaimer in the
12221828Sgrehan *    documentation and/or other materials provided with the distribution.
13221828Sgrehan *
14221828Sgrehan * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
15221828Sgrehan * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16221828Sgrehan * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17221828Sgrehan * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
18221828Sgrehan * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19221828Sgrehan * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20221828Sgrehan * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21221828Sgrehan * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22221828Sgrehan * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23221828Sgrehan * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24221828Sgrehan * SUCH DAMAGE.
25221828Sgrehan *
26221828Sgrehan * $FreeBSD: stable/10/sys/amd64/vmm/vmm.c 261275 2014-01-29 21:23:37Z jhb $
27221828Sgrehan */
28221828Sgrehan
29221828Sgrehan#include <sys/cdefs.h>
30221828Sgrehan__FBSDID("$FreeBSD: stable/10/sys/amd64/vmm/vmm.c 261275 2014-01-29 21:23:37Z jhb $");
31221828Sgrehan
32221828Sgrehan#include <sys/param.h>
33234695Sgrehan#include <sys/systm.h>
34221828Sgrehan#include <sys/kernel.h>
35221828Sgrehan#include <sys/module.h>
36221828Sgrehan#include <sys/sysctl.h>
37221828Sgrehan#include <sys/malloc.h>
38221828Sgrehan#include <sys/pcpu.h>
39221828Sgrehan#include <sys/lock.h>
40221828Sgrehan#include <sys/mutex.h>
41221828Sgrehan#include <sys/proc.h>
42256072Sneel#include <sys/rwlock.h>
43221828Sgrehan#include <sys/sched.h>
44221828Sgrehan#include <sys/smp.h>
45221828Sgrehan#include <sys/systm.h>
46221828Sgrehan
47221828Sgrehan#include <vm/vm.h>
48256072Sneel#include <vm/vm_object.h>
49256072Sneel#include <vm/vm_page.h>
50256072Sneel#include <vm/pmap.h>
51256072Sneel#include <vm/vm_map.h>
52256072Sneel#include <vm/vm_extern.h>
53256072Sneel#include <vm/vm_param.h>
54221828Sgrehan
55261275Sjhb#include <machine/cpu.h>
56221828Sgrehan#include <machine/vm.h>
57221828Sgrehan#include <machine/pcb.h>
58241489Sneel#include <machine/smp.h>
59221914Sjhb#include <x86/apicreg.h>
60256072Sneel#include <machine/pmap.h>
61256072Sneel#include <machine/vmparam.h>
62221828Sgrehan
63221828Sgrehan#include <machine/vmm.h>
64261088Sjhb#include <machine/vmm_dev.h>
65261088Sjhb
66256072Sneel#include "vmm_ktr.h"
67242275Sneel#include "vmm_host.h"
68221828Sgrehan#include "vmm_mem.h"
69221828Sgrehan#include "vmm_util.h"
70261088Sjhb#include "vhpet.h"
71261088Sjhb#include "vioapic.h"
72221828Sgrehan#include "vlapic.h"
73221828Sgrehan#include "vmm_msr.h"
74221828Sgrehan#include "vmm_ipi.h"
75221828Sgrehan#include "vmm_stat.h"
76242065Sneel#include "vmm_lapic.h"
77221828Sgrehan
78221828Sgrehan#include "io/ppt.h"
79221828Sgrehan#include "io/iommu.h"
80221828Sgrehan
struct vlapic;

/*
 * Per-vcpu state.  'mtx' is a spin lock protecting 'state' (see the
 * vcpu_lock/vcpu_unlock accessors below and vcpu_set_state_locked()).
 */
struct vcpu {
	int		flags;
	enum vcpu_state	state;		/* IDLE, RUNNING, SLEEPING or FROZEN */
	struct mtx	mtx;		/* spin lock protecting 'state' */
	int		hostcpu;	/* host cpuid this vcpu last ran on */
	uint64_t	guest_msrs[VMM_MSR_NUM];	/* guest MSR values */
	struct vlapic	*vlapic;	/* virtual local APIC */
	int		 vcpuid;	/* index of this vcpu in vm->vcpu[] */
	struct savefpu	*guestfpu;	/* guest fpu state */
	void		*stats;		/* opaque vmm_stat buffer */
	struct vm_exit	exitinfo;	/* details of the last VM exit */
	enum x2apic_state x2apic_state;	/* set via vm_set_x2apic_state() */
	int		nmi_pending;	/* NMI queued for injection */
};
97221828Sgrehan
/* Accessors for the per-vcpu spin lock embedded in struct vcpu. */
#define	vcpu_lock_init(v)	mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN)
#define	vcpu_lock(v)		mtx_lock_spin(&((v)->mtx))
#define	vcpu_unlock(v)		mtx_unlock_spin(&((v)->mtx))
#define	vcpu_assert_locked(v)	mtx_assert(&((v)->mtx), MA_OWNED)
102241489Sneel
/*
 * A contiguous range of guest physical memory backed by a VM object.
 */
struct mem_seg {
	vm_paddr_t	gpa;		/* starting guest physical address */
	size_t		len;		/* length in bytes (page aligned) */
	boolean_t	wired;		/* TRUE once wired for pci passthru */
	vm_object_t	object;		/* backing memory object */
};
#define	VM_MAX_MEMORY_SEGMENTS	2
110221828Sgrehan
/*
 * Top-level state of a virtual machine.
 */
struct vm {
	void		*cookie;	/* processor-specific data */
	void		*iommu;		/* iommu-specific data */
	struct vhpet	*vhpet;		/* virtual HPET */
	struct vioapic	*vioapic;	/* virtual ioapic */
	struct vmspace	*vmspace;	/* guest's address space */
	struct vcpu	vcpu[VM_MAXCPU];	/* per-vcpu state */
	int		num_mem_segs;	/* valid entries in mem_segs[] */
	struct mem_seg	mem_segs[VM_MAX_MEMORY_SEGMENTS];
	char		name[VM_MAX_NAMELEN];	/* NUL-terminated vm name */

	/*
	 * Set of active vcpus.
	 * An active vcpu is one that has been started implicitly (BSP) or
	 * explicitly (AP) by sending it a startup ipi.
	 */
	cpuset_t	active_cpus;
};
129221828Sgrehan
/* Set once vmm_init() succeeds; vm_create() refuses to run otherwise. */
static int vmm_initialized;

/*
 * Table of hardware-specific operations (Intel VT-x or AMD SVM), selected
 * in vmm_init().  Every wrapper below degrades gracefully while 'ops' is
 * still NULL, i.e. before initialization has completed.
 */
static struct vmm_ops *ops;
#define	VMM_INIT()	(ops != NULL ? (*ops->init)() : 0)
#define	VMM_CLEANUP()	(ops != NULL ? (*ops->cleanup)() : 0)
#define	VMM_RESUME()	(ops != NULL ? (*ops->resume)() : 0)

#define	VMINIT(vm, pmap) (ops != NULL ? (*ops->vminit)(vm, pmap): NULL)
#define	VMRUN(vmi, vcpu, rip, pmap) \
	(ops != NULL ? (*ops->vmrun)(vmi, vcpu, rip, pmap) : ENXIO)
#define	VMCLEANUP(vmi)	(ops != NULL ? (*ops->vmcleanup)(vmi) : NULL)
#define	VMSPACE_ALLOC(min, max) \
	(ops != NULL ? (*ops->vmspace_alloc)(min, max) : NULL)
#define	VMSPACE_FREE(vmspace) \
	(ops != NULL ? (*ops->vmspace_free)(vmspace) : ENXIO)
#define	VMGETREG(vmi, vcpu, num, retval)		\
	(ops != NULL ? (*ops->vmgetreg)(vmi, vcpu, num, retval) : ENXIO)
#define	VMSETREG(vmi, vcpu, num, val)		\
	(ops != NULL ? (*ops->vmsetreg)(vmi, vcpu, num, val) : ENXIO)
#define	VMGETDESC(vmi, vcpu, num, desc)		\
	(ops != NULL ? (*ops->vmgetdesc)(vmi, vcpu, num, desc) : ENXIO)
#define	VMSETDESC(vmi, vcpu, num, desc)		\
	(ops != NULL ? (*ops->vmsetdesc)(vmi, vcpu, num, desc) : ENXIO)
#define	VMINJECT(vmi, vcpu, type, vec, ec, ecv)	\
	(ops != NULL ? (*ops->vminject)(vmi, vcpu, type, vec, ec, ecv) : ENXIO)
#define	VMGETCAP(vmi, vcpu, num, retval)	\
	(ops != NULL ? (*ops->vmgetcap)(vmi, vcpu, num, retval) : ENXIO)
#define	VMSETCAP(vmi, vcpu, num, val)		\
	(ops != NULL ? (*ops->vmsetcap)(vmi, vcpu, num, val) : ENXIO)

/* FPU emulation control: setting CR0.TS makes host FPU access trap. */
#define	fpu_start_emulating()	load_cr0(rcr0() | CR0_TS)
#define	fpu_stop_emulating()	clts()

static MALLOC_DEFINE(M_VM, "vm", "vm");
CTASSERT(VMM_MSR_NUM <= 64);	/* msr_mask can keep track of up to 64 msrs */

/* statistics */
static VMM_STAT(VCPU_TOTAL_RUNTIME, "vcpu total runtime");
168221828Sgrehan
/*
 * Release per-vcpu resources: the virtual local APIC, the statistics
 * buffer and the saved FPU area.  Inverse of vcpu_init().
 */
static void
vcpu_cleanup(struct vcpu *vcpu)
{
	vlapic_cleanup(vcpu->vlapic);
	vmm_stat_free(vcpu->stats);
	fpu_save_area_free(vcpu->guestfpu);
}
176221828Sgrehan
/*
 * Initialize vcpu 'vcpu_id' of 'vm': its lock, vlapic, x2apic mode, FPU
 * save area and statistics buffer.  Called once per vcpu from vm_create().
 */
static void
vcpu_init(struct vm *vm, uint32_t vcpu_id)
{
	struct vcpu *vcpu;

	vcpu = &vm->vcpu[vcpu_id];

	vcpu_lock_init(vcpu);
	vcpu->hostcpu = NOCPU;		/* not resident on any host cpu yet */
	vcpu->vcpuid = vcpu_id;
	vcpu->vlapic = vlapic_init(vm, vcpu_id);
	vm_set_x2apic_state(vm, vcpu_id, X2APIC_ENABLED);
	vcpu->guestfpu = fpu_save_area_alloc();
	fpu_save_area_reset(vcpu->guestfpu);
	vcpu->stats = vmm_stat_alloc();
}
193221828Sgrehan
194240894Sneelstruct vm_exit *
195240894Sneelvm_exitinfo(struct vm *vm, int cpuid)
196240894Sneel{
197240894Sneel	struct vcpu *vcpu;
198240894Sneel
199240894Sneel	if (cpuid < 0 || cpuid >= VM_MAXCPU)
200240894Sneel		panic("vm_exitinfo: invalid cpuid %d", cpuid);
201240894Sneel
202240894Sneel	vcpu = &vm->vcpu[cpuid];
203240894Sneel
204240894Sneel	return (&vcpu->exitinfo);
205240894Sneel}
206240894Sneel
/*
 * Installed as 'vmm_resume_p' in vmm_init(); forwards a host resume
 * notification to the hardware-specific handler via VMM_RESUME().
 */
static void
vmm_resume(void)
{
	VMM_RESUME();
}
212261275Sjhb
/*
 * One-time module initialization: set up host state and IPIs, initialize
 * the memory subsystem, pick the hardware backend (VT-x or SVM), set up
 * MSR handling and the resume hook, then call the backend's init routine.
 * Returns 0 on success or an errno value.
 */
static int
vmm_init(void)
{
	int error;

	vmm_host_state_init();
	vmm_ipi_init();

	error = vmm_mem_init();
	if (error)
		return (error);

	/* Select the hardware-specific backend for this CPU vendor. */
	if (vmm_is_intel())
		ops = &vmm_ops_intel;
	else if (vmm_is_amd())
		ops = &vmm_ops_amd;
	else
		return (ENXIO);

	vmm_msr_init();
	vmm_resume_p = vmm_resume;	/* hook for host suspend/resume */

	return (VMM_INIT());
}
237221828Sgrehan
/*
 * Module event handler.  On load: initialize the device node, the iommu
 * and the vmm core, marking the module usable only if all succeed.  On
 * unload: tear down in the reverse order; if the backend cleanup fails,
 * prevent any further VM creation by clearing 'vmm_initialized'.
 */
static int
vmm_handler(module_t mod, int what, void *arg)
{
	int error;

	switch (what) {
	case MOD_LOAD:
		vmmdev_init();
		iommu_init();
		error = vmm_init();
		if (error == 0)
			vmm_initialized = 1;
		break;
	case MOD_UNLOAD:
		error = vmmdev_cleanup();
		if (error == 0) {
			vmm_resume_p = NULL;
			iommu_cleanup();
			vmm_ipi_cleanup();
			error = VMM_CLEANUP();
			/*
			 * Something bad happened - prevent new
			 * VMs from being created
			 */
			if (error)
				vmm_initialized = 0;
		}
		break;
	default:
		error = 0;
		break;
	}
	return (error);
}
272221828Sgrehan
/* Kernel module glue: name, event handler, no extra argument. */
static moduledata_t vmm_kmod = {
	"vmm",
	vmm_handler,
	NULL
};

/*
 * vmm initialization has the following dependencies:
 *
 * - iommu initialization must happen after the pci passthru driver has had
 *   a chance to attach to any passthru devices (after SI_SUB_CONFIGURE).
 *
 * - VT-x initialization requires smp_rendezvous() and therefore must happen
 *   after SMP is fully functional (after SI_SUB_SMP).
 */
DECLARE_MODULE(vmm, vmm_kmod, SI_SUB_SMP + 1, SI_ORDER_ANY);
MODULE_VERSION(vmm, 1);

/* sysctl root for vmm tunables: hw.vmm.* */
SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW, NULL, NULL);
292221828Sgrehan
/*
 * Create a new virtual machine named 'name' and return it in '*retvm'.
 * Allocates the guest vmspace, the backend cookie, the virtual ioapic and
 * HPET, and initializes all vcpus; the BSP (vcpu 0) is marked active.
 * Returns 0 on success, ENXIO if the module failed to initialize, EINVAL
 * for a bad name, or ENOMEM if the vmspace cannot be allocated.
 */
int
vm_create(const char *name, struct vm **retvm)
{
	int i;
	struct vm *vm;
	struct vmspace *vmspace;

	const int BSP = 0;

	/*
	 * If vmm.ko could not be successfully initialized then don't attempt
	 * to create the virtual machine.
	 */
	if (!vmm_initialized)
		return (ENXIO);

	if (name == NULL || strlen(name) >= VM_MAX_NAMELEN)
		return (EINVAL);

	vmspace = VMSPACE_ALLOC(VM_MIN_ADDRESS, VM_MAXUSER_ADDRESS);
	if (vmspace == NULL)
		return (ENOMEM);

	vm = malloc(sizeof(struct vm), M_VM, M_WAITOK | M_ZERO);
	strcpy(vm->name, name);
	/*
	 * NOTE(review): the results of VMINIT/vioapic_init/vhpet_init are
	 * not checked here - presumably they cannot fail; confirm against
	 * the backend implementations.
	 */
	vm->cookie = VMINIT(vm, vmspace_pmap(vmspace));
	vm->vioapic = vioapic_init(vm);
	vm->vhpet = vhpet_init(vm);

	for (i = 0; i < VM_MAXCPU; i++) {
		vcpu_init(vm, i);
		guest_msrs_init(vm, i);
	}

	vm_activate_cpu(vm, BSP);	/* the BSP starts out active */
	vm->vmspace = vmspace;

	*retvm = vm;
	return (0);
}
333221828Sgrehan
/*
 * Free the guest memory backing 'seg' (if any) and clear the segment
 * descriptor so the slot can be reused.
 */
static void
vm_free_mem_seg(struct vm *vm, struct mem_seg *seg)
{

	if (seg->object != NULL)
		vmm_mem_free(vm->vmspace, seg->gpa, seg->len);

	bzero(seg, sizeof(*seg));
}
343241178Sneel
/*
 * Tear down a virtual machine: detach passthru devices, destroy the iommu
 * domain, the virtual HPET/ioapic, all memory segments and vcpus, then the
 * guest vmspace, the backend cookie and finally the vm structure itself.
 */
void
vm_destroy(struct vm *vm)
{
	int i;

	ppt_unassign_all(vm);

	if (vm->iommu != NULL)
		iommu_destroy_domain(vm->iommu);

	vhpet_cleanup(vm->vhpet);
	vioapic_cleanup(vm->vioapic);

	for (i = 0; i < vm->num_mem_segs; i++)
		vm_free_mem_seg(vm, &vm->mem_segs[i]);

	vm->num_mem_segs = 0;

	for (i = 0; i < VM_MAXCPU; i++)
		vcpu_cleanup(&vm->vcpu[i]);

	VMSPACE_FREE(vm->vmspace);

	VMCLEANUP(vm->cookie);

	free(vm, M_VM);
}
371221828Sgrehan
/*
 * Return the NUL-terminated name the vm was created with.
 */
const char *
vm_name(struct vm *vm)
{
	return (vm->name);
}
377221828Sgrehan
378221828Sgrehanint
379221828Sgrehanvm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
380221828Sgrehan{
381256072Sneel	vm_object_t obj;
382221828Sgrehan
383256072Sneel	if ((obj = vmm_mmio_alloc(vm->vmspace, gpa, len, hpa)) == NULL)
384256072Sneel		return (ENOMEM);
385256072Sneel	else
386256072Sneel		return (0);
387221828Sgrehan}
388221828Sgrehan
389221828Sgrehanint
390221828Sgrehanvm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len)
391221828Sgrehan{
392221828Sgrehan
393256072Sneel	vmm_mmio_free(vm->vmspace, gpa, len);
394256072Sneel	return (0);
395221828Sgrehan}
396221828Sgrehan
397256072Sneelboolean_t
398256072Sneelvm_mem_allocated(struct vm *vm, vm_paddr_t gpa)
399241041Sneel{
400241041Sneel	int i;
401241041Sneel	vm_paddr_t gpabase, gpalimit;
402241041Sneel
403241041Sneel	for (i = 0; i < vm->num_mem_segs; i++) {
404241041Sneel		gpabase = vm->mem_segs[i].gpa;
405241041Sneel		gpalimit = gpabase + vm->mem_segs[i].len;
406241041Sneel		if (gpa >= gpabase && gpa < gpalimit)
407256072Sneel			return (TRUE);		/* 'gpa' is regular memory */
408241041Sneel	}
409241041Sneel
410256072Sneel	if (ppt_is_mmio(vm, gpa))
411256072Sneel		return (TRUE);			/* 'gpa' is pci passthru mmio */
412256072Sneel
413256072Sneel	return (FALSE);
414241041Sneel}
415241041Sneel
/*
 * Allocate 'len' bytes of guest memory at guest physical address 'gpa'.
 * Both must be page aligned.  The range must be either entirely free or
 * entirely allocated already; a mix is an error.  Returns 0 on success,
 * EINVAL for bad alignment or a partially-allocated range, E2BIG when all
 * segment slots are in use, or ENOMEM if backing memory cannot be found.
 */
int
vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len)
{
	int available, allocated;
	struct mem_seg *seg;
	vm_object_t object;
	vm_paddr_t g;

	if ((gpa & PAGE_MASK) || (len & PAGE_MASK) || len == 0)
		return (EINVAL);

	/* Classify every page in the request as allocated or available. */
	available = allocated = 0;
	g = gpa;
	while (g < gpa + len) {
		if (vm_mem_allocated(vm, g))
			allocated++;
		else
			available++;

		g += PAGE_SIZE;
	}

	/*
	 * If there are some allocated and some available pages in the address
	 * range then it is an error.
	 */
	if (allocated && available)
		return (EINVAL);

	/*
	 * If the entire address range being requested has already been
	 * allocated then there isn't anything more to do.
	 */
	if (allocated && available == 0)
		return (0);

	if (vm->num_mem_segs >= VM_MAX_MEMORY_SEGMENTS)
		return (E2BIG);

	seg = &vm->mem_segs[vm->num_mem_segs];

	if ((object = vmm_mem_alloc(vm->vmspace, gpa, len)) == NULL)
		return (ENOMEM);

	seg->gpa = gpa;
	seg->len = len;
	seg->object = object;
	seg->wired = FALSE;	/* wired later only for pci passthru */

	vm->num_mem_segs++;

	return (0);
}
469256072Sneel
/*
 * Unwire every wired memory segment of the vm.  Used when the last pci
 * passthru device is detached and when vm_gpa_wire() has to roll back.
 */
static void
vm_gpa_unwire(struct vm *vm)
{
	int i, rv;
	struct mem_seg *seg;

	for (i = 0; i < vm->num_mem_segs; i++) {
		seg = &vm->mem_segs[i];
		if (!seg->wired)
			continue;

		rv = vm_map_unwire(&vm->vmspace->vm_map,
				   seg->gpa, seg->gpa + seg->len,
				   VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
		KASSERT(rv == KERN_SUCCESS, ("vm(%s) memory segment "
		    "%#lx/%ld could not be unwired: %d",
		    vm_name(vm), seg->gpa, seg->len, rv));

		seg->wired = FALSE;
	}
}

/*
 * Wire all of the vm's memory segments so their pages stay resident (a
 * prerequisite for iommu mappings).  On failure, any wiring already done
 * is undone and EAGAIN is returned; returns 0 on success.
 */
static int
vm_gpa_wire(struct vm *vm)
{
	int i, rv;
	struct mem_seg *seg;

	for (i = 0; i < vm->num_mem_segs; i++) {
		seg = &vm->mem_segs[i];
		if (seg->wired)
			continue;

		/* XXX rlimits? */
		rv = vm_map_wire(&vm->vmspace->vm_map,
				 seg->gpa, seg->gpa + seg->len,
				 VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
		if (rv != KERN_SUCCESS)
			break;

		seg->wired = TRUE;
	}

	if (i < vm->num_mem_segs) {
		/*
		 * Undo the wiring before returning an error.
		 */
		vm_gpa_unwire(vm);
		return (EAGAIN);
	}

	return (0);
}
523256072Sneel
/*
 * Add (map == TRUE) or remove (map == FALSE) iommu translations for all
 * of the vm's guest memory, one page at a time.  Mapping a page into the
 * vm's iommu domain removes it from the host domain and vice versa.  All
 * segments must already be wired (asserted below).
 */
static void
vm_iommu_modify(struct vm *vm, boolean_t map)
{
	int i, sz;
	vm_paddr_t gpa, hpa;
	struct mem_seg *seg;
	void *vp, *cookie, *host_domain;

	sz = PAGE_SIZE;
	host_domain = iommu_host_domain();

	for (i = 0; i < vm->num_mem_segs; i++) {
		seg = &vm->mem_segs[i];
		KASSERT(seg->wired, ("vm(%s) memory segment %#lx/%ld not wired",
		    vm_name(vm), seg->gpa, seg->len));

		gpa = seg->gpa;
		while (gpa < seg->gpa + seg->len) {
			/* Hold the page briefly to discover its hpa. */
			vp = vm_gpa_hold(vm, gpa, PAGE_SIZE, VM_PROT_WRITE,
					 &cookie);
			KASSERT(vp != NULL, ("vm(%s) could not map gpa %#lx",
			    vm_name(vm), gpa));

			vm_gpa_release(cookie);

			hpa = DMAP_TO_PHYS((uintptr_t)vp);
			if (map) {
				iommu_create_mapping(vm->iommu, gpa, hpa, sz);
				iommu_remove_mapping(host_domain, hpa, sz);
			} else {
				iommu_remove_mapping(vm->iommu, gpa, sz);
				iommu_create_mapping(host_domain, hpa, hpa, sz);
			}

			gpa += PAGE_SIZE;
		}
	}

	/*
	 * Invalidate the cached translations associated with the domain
	 * from which pages were removed.
	 */
	if (map)
		iommu_invalidate_tlb(host_domain);
	else
		iommu_invalidate_tlb(vm->iommu);
}
571256072Sneel
#define	vm_iommu_unmap(vm)	vm_iommu_modify((vm), FALSE)
#define	vm_iommu_map(vm)	vm_iommu_modify((vm), TRUE)

/*
 * Detach pci passthru device bus/slot/func from the vm.  When the last
 * passthru device goes away, the iommu translations are removed and the
 * guest memory is unwired again.  Returns 0 or an errno value.
 */
int
vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func)
{
	int error;

	error = ppt_unassign_device(vm, bus, slot, func);
	if (error)
		return (error);

	if (ppt_num_devices(vm) == 0) {
		vm_iommu_unmap(vm);
		vm_gpa_unwire(vm);
	}
	return (0);
}
590221828Sgrehan
/*
 * Attach pci passthru device bus/slot/func to the vm.  The first passthru
 * device triggers creation of the vm's iommu domain, wiring of all guest
 * memory and programming of the gpa->hpa translations.  Returns 0 or an
 * errno value (EAGAIN if the memory cannot be wired).
 */
int
vm_assign_pptdev(struct vm *vm, int bus, int slot, int func)
{
	int error;
	vm_paddr_t maxaddr;

	/*
	 * Virtual machines with pci passthru devices get special treatment:
	 * - the guest physical memory is wired
	 * - the iommu is programmed to do the 'gpa' to 'hpa' translation
	 *
	 * We need to do this before the first pci passthru device is attached.
	 */
	if (ppt_num_devices(vm) == 0) {
		KASSERT(vm->iommu == NULL,
		    ("vm_assign_pptdev: iommu must be NULL"));
		maxaddr = vmm_mem_maxaddr();
		vm->iommu = iommu_create_domain(maxaddr);

		error = vm_gpa_wire(vm);
		if (error)
			return (error);

		vm_iommu_map(vm);
	}

	error = ppt_assign_device(vm, bus, slot, func);
	return (error);
}
620221828Sgrehan
/*
 * Fault in and hold the page backing guest physical address 'gpa' with
 * protection 'reqprot'.  'len' must not cross a page boundary.  On
 * success, returns a host (direct-map) virtual address for the gpa and
 * stores an opaque cookie for vm_gpa_release(); returns NULL on failure.
 */
void *
vm_gpa_hold(struct vm *vm, vm_paddr_t gpa, size_t len, int reqprot,
	    void **cookie)
{
	int count, pageoff;
	vm_page_t m;

	pageoff = gpa & PAGE_MASK;
	if (len > PAGE_SIZE - pageoff)
		panic("vm_gpa_hold: invalid gpa/len: 0x%016lx/%lu", gpa, len);

	count = vm_fault_quick_hold_pages(&vm->vmspace->vm_map,
	    trunc_page(gpa), PAGE_SIZE, reqprot, &m, 1);

	if (count == 1) {
		*cookie = m;
		return ((void *)(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)) + pageoff));
	} else {
		*cookie = NULL;
		return (NULL);
	}
}
643221828Sgrehan
/*
 * Release the page hold acquired by vm_gpa_hold().  'cookie' is the
 * vm_page_t stashed by that function.
 */
void
vm_gpa_release(void *cookie)
{
	vm_page_t m = cookie;

	vm_page_lock(m);
	vm_page_unhold(m);
	vm_page_unlock(m);
}
653256072Sneel
654221828Sgrehanint
655221828Sgrehanvm_gpabase2memseg(struct vm *vm, vm_paddr_t gpabase,
656221828Sgrehan		  struct vm_memory_segment *seg)
657221828Sgrehan{
658221828Sgrehan	int i;
659221828Sgrehan
660221828Sgrehan	for (i = 0; i < vm->num_mem_segs; i++) {
661221828Sgrehan		if (gpabase == vm->mem_segs[i].gpa) {
662256072Sneel			seg->gpa = vm->mem_segs[i].gpa;
663256072Sneel			seg->len = vm->mem_segs[i].len;
664256072Sneel			seg->wired = vm->mem_segs[i].wired;
665221828Sgrehan			return (0);
666221828Sgrehan		}
667221828Sgrehan	}
668221828Sgrehan	return (-1);
669221828Sgrehan}
670221828Sgrehan
/*
 * Find the memory segment containing guest physical address 'gpa' and
 * return its backing VM object (with an extra reference, which the caller
 * must drop) and the offset of 'gpa' within it.  Returns 0 on success or
 * EINVAL when no segment covers the address.
 */
int
vm_get_memobj(struct vm *vm, vm_paddr_t gpa, size_t len,
	      vm_offset_t *offset, struct vm_object **object)
{
	int i;
	size_t seg_len;
	vm_paddr_t seg_gpa;
	vm_object_t seg_obj;

	for (i = 0; i < vm->num_mem_segs; i++) {
		if ((seg_obj = vm->mem_segs[i].object) == NULL)
			continue;

		seg_gpa = vm->mem_segs[i].gpa;
		seg_len = vm->mem_segs[i].len;

		if (gpa >= seg_gpa && gpa < seg_gpa + seg_len) {
			*offset = gpa - seg_gpa;
			*object = seg_obj;
			vm_object_reference(seg_obj);	/* caller must deref */
			return (0);
		}
	}

	return (EINVAL);
}
697256072Sneel
698256072Sneelint
699221828Sgrehanvm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval)
700221828Sgrehan{
701221828Sgrehan
702221828Sgrehan	if (vcpu < 0 || vcpu >= VM_MAXCPU)
703221828Sgrehan		return (EINVAL);
704221828Sgrehan
705221828Sgrehan	if (reg >= VM_REG_LAST)
706221828Sgrehan		return (EINVAL);
707221828Sgrehan
708221828Sgrehan	return (VMGETREG(vm->cookie, vcpu, reg, retval));
709221828Sgrehan}
710221828Sgrehan
711221828Sgrehanint
712221828Sgrehanvm_set_register(struct vm *vm, int vcpu, int reg, uint64_t val)
713221828Sgrehan{
714221828Sgrehan
715221828Sgrehan	if (vcpu < 0 || vcpu >= VM_MAXCPU)
716221828Sgrehan		return (EINVAL);
717221828Sgrehan
718221828Sgrehan	if (reg >= VM_REG_LAST)
719221828Sgrehan		return (EINVAL);
720221828Sgrehan
721221828Sgrehan	return (VMSETREG(vm->cookie, vcpu, reg, val));
722221828Sgrehan}
723221828Sgrehan
724221828Sgrehanstatic boolean_t
725221828Sgrehanis_descriptor_table(int reg)
726221828Sgrehan{
727221828Sgrehan
728221828Sgrehan	switch (reg) {
729221828Sgrehan	case VM_REG_GUEST_IDTR:
730221828Sgrehan	case VM_REG_GUEST_GDTR:
731221828Sgrehan		return (TRUE);
732221828Sgrehan	default:
733221828Sgrehan		return (FALSE);
734221828Sgrehan	}
735221828Sgrehan}
736221828Sgrehan
737221828Sgrehanstatic boolean_t
738221828Sgrehanis_segment_register(int reg)
739221828Sgrehan{
740221828Sgrehan
741221828Sgrehan	switch (reg) {
742221828Sgrehan	case VM_REG_GUEST_ES:
743221828Sgrehan	case VM_REG_GUEST_CS:
744221828Sgrehan	case VM_REG_GUEST_SS:
745221828Sgrehan	case VM_REG_GUEST_DS:
746221828Sgrehan	case VM_REG_GUEST_FS:
747221828Sgrehan	case VM_REG_GUEST_GS:
748221828Sgrehan	case VM_REG_GUEST_TR:
749221828Sgrehan	case VM_REG_GUEST_LDTR:
750221828Sgrehan		return (TRUE);
751221828Sgrehan	default:
752221828Sgrehan		return (FALSE);
753221828Sgrehan	}
754221828Sgrehan}
755221828Sgrehan
756221828Sgrehanint
757221828Sgrehanvm_get_seg_desc(struct vm *vm, int vcpu, int reg,
758221828Sgrehan		struct seg_desc *desc)
759221828Sgrehan{
760221828Sgrehan
761221828Sgrehan	if (vcpu < 0 || vcpu >= VM_MAXCPU)
762221828Sgrehan		return (EINVAL);
763221828Sgrehan
764221828Sgrehan	if (!is_segment_register(reg) && !is_descriptor_table(reg))
765221828Sgrehan		return (EINVAL);
766221828Sgrehan
767221828Sgrehan	return (VMGETDESC(vm->cookie, vcpu, reg, desc));
768221828Sgrehan}
769221828Sgrehan
770221828Sgrehanint
771221828Sgrehanvm_set_seg_desc(struct vm *vm, int vcpu, int reg,
772221828Sgrehan		struct seg_desc *desc)
773221828Sgrehan{
774221828Sgrehan	if (vcpu < 0 || vcpu >= VM_MAXCPU)
775221828Sgrehan		return (EINVAL);
776221828Sgrehan
777221828Sgrehan	if (!is_segment_register(reg) && !is_descriptor_table(reg))
778221828Sgrehan		return (EINVAL);
779221828Sgrehan
780221828Sgrehan	return (VMSETDESC(vm->cookie, vcpu, reg, desc));
781221828Sgrehan}
782221828Sgrehan
/*
 * Load the guest's FPU state onto the hardware before entering the guest.
 * CR0.TS is cleared around the restore and set again afterwards so that
 * any host FPU use while the guest state is live will trap.
 */
static void
restore_guest_fpustate(struct vcpu *vcpu)
{

	/* flush host state to the pcb */
	fpuexit(curthread);

	/* restore guest FPU state */
	fpu_stop_emulating();
	fpurestore(vcpu->guestfpu);

	/*
	 * The FPU is now "dirty" with the guest's state so turn on emulation
	 * to trap any access to the FPU by the host.
	 */
	fpu_start_emulating();
}

/*
 * Save the guest's FPU state after exiting the guest.  Expects CR0.TS to
 * still be set from restore_guest_fpustate(); panics otherwise.
 */
static void
save_guest_fpustate(struct vcpu *vcpu)
{

	if ((rcr0() & CR0_TS) == 0)
		panic("fpu emulation not enabled in host!");

	/* save guest FPU state */
	fpu_stop_emulating();
	fpusave(vcpu->guestfpu);
	fpu_start_emulating();
}
813221828Sgrehan
static VMM_STAT(VCPU_IDLE_TICKS, "number of ticks vcpu was idle");

/*
 * Transition the vcpu to 'newstate', enforcing the legal state machine
 * below.  The vcpu lock must be held.  Returns 0 on success or EBUSY when
 * the transition is not allowed.
 */
static int
vcpu_set_state_locked(struct vcpu *vcpu, enum vcpu_state newstate)
{
	int error;

	vcpu_assert_locked(vcpu);

	/*
	 * The following state transitions are allowed:
	 * IDLE -> FROZEN -> IDLE
	 * FROZEN -> RUNNING -> FROZEN
	 * FROZEN -> SLEEPING -> FROZEN
	 */
	switch (vcpu->state) {
	case VCPU_IDLE:
	case VCPU_RUNNING:
	case VCPU_SLEEPING:
		error = (newstate != VCPU_FROZEN);
		break;
	case VCPU_FROZEN:
		error = (newstate == VCPU_FROZEN);
		break;
	default:
		error = 1;
		break;
	}

	if (error == 0)
		vcpu->state = newstate;
	else
		error = EBUSY;

	return (error);
}
850256072Sneel
/*
 * Like vcpu_set_state() but the transition must succeed; panics on error.
 */
static void
vcpu_require_state(struct vm *vm, int vcpuid, enum vcpu_state newstate)
{
	int error;

	if ((error = vcpu_set_state(vm, vcpuid, newstate)) != 0)
		panic("Error %d setting state to %d\n", error, newstate);
}

/*
 * Locked variant of vcpu_require_state(); the vcpu lock must be held.
 */
static void
vcpu_require_state_locked(struct vcpu *vcpu, enum vcpu_state newstate)
{
	int error;

	if ((error = vcpu_set_state_locked(vcpu, newstate)) != 0)
		panic("Error %d setting state to %d", error, newstate);
}
868256072Sneel
/*
 * Emulate a guest 'hlt' by sleeping until the vcpu is ready to run.
 * Sleeps at most until the next guest apic timer tick (or 'hz' ticks when
 * the timer is disabled), and not at all if an NMI or interrupt is
 * already pending.  Always returns 0; '*retu' is not modified here.
 */
static int
vm_handle_hlt(struct vm *vm, int vcpuid, boolean_t *retu)
{
	struct vcpu *vcpu;
	int sleepticks, t;

	vcpu = &vm->vcpu[vcpuid];

	vcpu_lock(vcpu);

	/*
	 * Figure out the number of host ticks until the next apic
	 * timer interrupt in the guest.
	 */
	sleepticks = lapic_timer_tick(vm, vcpuid);

	/*
	 * If the guest local apic timer is disabled then sleep for
	 * a long time but not forever.
	 */
	if (sleepticks < 0)
		sleepticks = hz;

	/*
	 * Do a final check for pending NMI or interrupts before
	 * really putting this thread to sleep.
	 *
	 * These interrupts could have happened any time after we
	 * returned from VMRUN() and before we grabbed the vcpu lock.
	 */
	if (!vm_nmi_pending(vm, vcpuid) && lapic_pending_intr(vm, vcpuid) < 0) {
		if (sleepticks <= 0)
			panic("invalid sleepticks %d", sleepticks);
		t = ticks;
		vcpu_require_state_locked(vcpu, VCPU_SLEEPING);
		/* Sleep on the vcpu's spin lock; woken by an interrupt/ipi. */
		msleep_spin(vcpu, &vcpu->mtx, "vmidle", sleepticks);
		vcpu_require_state_locked(vcpu, VCPU_FROZEN);
		vmm_stat_incr(vm, vcpuid, VCPU_IDLE_TICKS, ticks - t);
	}
	vcpu_unlock(vcpu);

	return (0);
}
915256072Sneel
/*
 * Handle a nested page fault exit in the kernel.
 *
 * First try the cheap fix of emulating accessed/dirty bit updates in
 * the nested page tables; if that does not resolve the fault, fall
 * back to a full vm_fault() on the guest physical address.  Returns 0
 * on success (execution restarts at the faulting instruction) or
 * EFAULT if the fault could not be resolved.
 */
static int
vm_handle_paging(struct vm *vm, int vcpuid, boolean_t *retu)
{
	int rv, ftype;
	struct vm_map *map;
	struct vcpu *vcpu;
	struct vm_exit *vme;

	vcpu = &vm->vcpu[vcpuid];
	vme = &vcpu->exitinfo;

	ftype = vme->u.paging.fault_type;
	KASSERT(ftype == VM_PROT_READ ||
	    ftype == VM_PROT_WRITE || ftype == VM_PROT_EXECUTE,
	    ("vm_handle_paging: invalid fault_type %d", ftype));

	if (ftype == VM_PROT_READ || ftype == VM_PROT_WRITE) {
		/*
		 * A read/write fault may only need the accessed/dirty
		 * bits set in the nested page tables; try that before
		 * taking the expensive vm_fault() path.
		 */
		rv = pmap_emulate_accessed_dirty(vmspace_pmap(vm->vmspace),
		    vme->u.paging.gpa, ftype);
		if (rv == 0)
			goto done;
	}

	/* Fault in the page backing the guest physical address. */
	map = &vm->vmspace->vm_map;
	rv = vm_fault(map, vme->u.paging.gpa, ftype, VM_FAULT_NORMAL);

	VCPU_CTR3(vm, vcpuid, "vm_handle_paging rv = %d, gpa = %#lx, "
	    "ftype = %d", rv, vme->u.paging.gpa, ftype);

	if (rv != KERN_SUCCESS)
		return (EFAULT);
done:
	/* restart execution at the faulting instruction */
	vme->inst_length = 0;

	return (0);
}
953256072Sneel
/*
 * Handle an instruction-emulation exit.
 *
 * Fetch and decode the faulting guest instruction, then emulate it in
 * the kernel if the guest physical address falls inside one of the
 * in-kernel emulated devices (local apic, ioapic, hpet).  Any other
 * address sets '*retu' so the access is emulated in userland.
 * Returns 0 on success or EFAULT if fetch/decode fails.
 */
static int
vm_handle_inst_emul(struct vm *vm, int vcpuid, boolean_t *retu)
{
	struct vie *vie;
	struct vcpu *vcpu;
	struct vm_exit *vme;
	int error, inst_length;
	uint64_t rip, gla, gpa, cr3;
	mem_region_read_t mread;
	mem_region_write_t mwrite;

	vcpu = &vm->vcpu[vcpuid];
	vme = &vcpu->exitinfo;

	/* Exit details: faulting %rip, linear/physical addrs, guest %cr3. */
	rip = vme->rip;
	inst_length = vme->inst_length;

	gla = vme->u.inst_emul.gla;
	gpa = vme->u.inst_emul.gpa;
	cr3 = vme->u.inst_emul.cr3;
	vie = &vme->u.inst_emul.vie;

	vie_init(vie);

	/* Fetch, decode and emulate the faulting instruction */
	if (vmm_fetch_instruction(vm, vcpuid, rip, inst_length, cr3, vie) != 0)
		return (EFAULT);

	if (vmm_decode_instruction(vm, vcpuid, gla, vie) != 0)
		return (EFAULT);

	/* return to userland unless this is an in-kernel emulated device */
	if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) {
		mread = lapic_mmio_read;
		mwrite = lapic_mmio_write;
	} else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) {
		mread = vioapic_mmio_read;
		mwrite = vioapic_mmio_write;
	} else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) {
		mread = vhpet_mmio_read;
		mwrite = vhpet_mmio_write;
	} else {
		*retu = TRUE;
		return (0);
	}

	error = vmm_emulate_instruction(vm, vcpuid, gpa, vie, mread, mwrite, 0);

	/* return to userland to spin up the AP */
	if (error == 0 && vme->exitcode == VM_EXITCODE_SPINUP_AP)
		*retu = TRUE;

	return (error);
}
1008256072Sneel
/*
 * Run the vcpu named by 'vmrun' until an exit occurs that must be
 * handled in userland (or until an error is returned).
 *
 * Kernel-handleable exits (HLT, nested page fault, in-kernel device
 * emulation) are processed here and the guest is resumed at the next
 * instruction without returning to userland.  On return, the exit
 * information has been copied into vmrun->vm_exit.
 */
int
vm_run(struct vm *vm, struct vm_run *vmrun)
{
	int error, vcpuid;
	struct vcpu *vcpu;
	struct pcb *pcb;
	uint64_t tscval, rip;
	struct vm_exit *vme;
	boolean_t retu;
	pmap_t pmap;

	vcpuid = vmrun->cpuid;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		return (EINVAL);

	pmap = vmspace_pmap(vm->vmspace);
	vcpu = &vm->vcpu[vcpuid];
	vme = &vcpu->exitinfo;
	rip = vmrun->rip;
restart:
	/* No preemption while guest state is loaded on this host cpu. */
	critical_enter();

	KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active),
	    ("vm_run: absurd pm_active"));

	/* Timestamp the entry so total guest runtime can be accounted. */
	tscval = rdtsc();

	pcb = PCPU_GET(curpcb);
	set_pcb_flags(pcb, PCB_FULL_IRET);

	/* Swap host FPU/MSR state for the guest's before entry. */
	restore_guest_msrs(vm, vcpuid);
	restore_guest_fpustate(vcpu);

	/*
	 * Mark the vcpu RUNNING and record the host cpu so that
	 * vm_interrupt_hostcpu() can IPI us while in guest context.
	 */
	vcpu_require_state(vm, vcpuid, VCPU_RUNNING);
	vcpu->hostcpu = curcpu;
	error = VMRUN(vm->cookie, vcpuid, rip, pmap);
	vcpu->hostcpu = NOCPU;
	vcpu_require_state(vm, vcpuid, VCPU_FROZEN);

	save_guest_fpustate(vcpu);
	restore_host_msrs(vm, vcpuid);

	vmm_stat_incr(vm, vcpuid, VCPU_TOTAL_RUNTIME, rdtsc() - tscval);

	critical_exit();

	/* Dispatch exits that can be handled without leaving the kernel. */
	if (error == 0) {
		retu = FALSE;
		switch (vme->exitcode) {
		case VM_EXITCODE_HLT:
			error = vm_handle_hlt(vm, vcpuid, &retu);
			break;
		case VM_EXITCODE_PAGING:
			error = vm_handle_paging(vm, vcpuid, &retu);
			break;
		case VM_EXITCODE_INST_EMUL:
			error = vm_handle_inst_emul(vm, vcpuid, &retu);
			break;
		default:
			retu = TRUE;	/* handled in userland */
			break;
		}
	}

	/* Handled in the kernel: resume the guest past the exit. */
	if (error == 0 && retu == FALSE) {
		rip = vme->rip + vme->inst_length;
		goto restart;
	}

	/* copy the exit information */
	bcopy(vme, &vmrun->vm_exit, sizeof(struct vm_exit));
	return (error);
}
1083221828Sgrehan
1084221828Sgrehanint
1085221828Sgrehanvm_inject_event(struct vm *vm, int vcpuid, int type,
1086221828Sgrehan		int vector, uint32_t code, int code_valid)
1087221828Sgrehan{
1088221828Sgrehan	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1089221828Sgrehan		return (EINVAL);
1090221828Sgrehan
1091221828Sgrehan	if ((type > VM_EVENT_NONE && type < VM_EVENT_MAX) == 0)
1092221828Sgrehan		return (EINVAL);
1093221828Sgrehan
1094221828Sgrehan	if (vector < 0 || vector > 255)
1095221828Sgrehan		return (EINVAL);
1096221828Sgrehan
1097221828Sgrehan	return (VMINJECT(vm->cookie, vcpuid, type, vector, code, code_valid));
1098221828Sgrehan}
1099221828Sgrehan
/* Per-vcpu statistic counting NMIs injected into the guest. */
static VMM_STAT(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu");
1101241982Sneel
1102221828Sgrehanint
1103241982Sneelvm_inject_nmi(struct vm *vm, int vcpuid)
1104221828Sgrehan{
1105241982Sneel	struct vcpu *vcpu;
1106221828Sgrehan
1107241982Sneel	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1108221828Sgrehan		return (EINVAL);
1109221828Sgrehan
1110241982Sneel	vcpu = &vm->vcpu[vcpuid];
1111241982Sneel
1112241982Sneel	vcpu->nmi_pending = 1;
1113241982Sneel	vm_interrupt_hostcpu(vm, vcpuid);
1114241982Sneel	return (0);
1115221828Sgrehan}
1116221828Sgrehan
1117221828Sgrehanint
1118241982Sneelvm_nmi_pending(struct vm *vm, int vcpuid)
1119241982Sneel{
1120241982Sneel	struct vcpu *vcpu;
1121241982Sneel
1122241982Sneel	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1123241982Sneel		panic("vm_nmi_pending: invalid vcpuid %d", vcpuid);
1124241982Sneel
1125241982Sneel	vcpu = &vm->vcpu[vcpuid];
1126241982Sneel
1127241982Sneel	return (vcpu->nmi_pending);
1128241982Sneel}
1129241982Sneel
1130241982Sneelvoid
1131241982Sneelvm_nmi_clear(struct vm *vm, int vcpuid)
1132241982Sneel{
1133241982Sneel	struct vcpu *vcpu;
1134241982Sneel
1135241982Sneel	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1136241982Sneel		panic("vm_nmi_pending: invalid vcpuid %d", vcpuid);
1137241982Sneel
1138241982Sneel	vcpu = &vm->vcpu[vcpuid];
1139241982Sneel
1140241982Sneel	if (vcpu->nmi_pending == 0)
1141241982Sneel		panic("vm_nmi_clear: inconsistent nmi_pending state");
1142241982Sneel
1143241982Sneel	vcpu->nmi_pending = 0;
1144241982Sneel	vmm_stat_incr(vm, vcpuid, VCPU_NMI_COUNT, 1);
1145241982Sneel}
1146241982Sneel
1147241982Sneelint
1148221828Sgrehanvm_get_capability(struct vm *vm, int vcpu, int type, int *retval)
1149221828Sgrehan{
1150221828Sgrehan	if (vcpu < 0 || vcpu >= VM_MAXCPU)
1151221828Sgrehan		return (EINVAL);
1152221828Sgrehan
1153221828Sgrehan	if (type < 0 || type >= VM_CAP_MAX)
1154221828Sgrehan		return (EINVAL);
1155221828Sgrehan
1156221828Sgrehan	return (VMGETCAP(vm->cookie, vcpu, type, retval));
1157221828Sgrehan}
1158221828Sgrehan
1159221828Sgrehanint
1160221828Sgrehanvm_set_capability(struct vm *vm, int vcpu, int type, int val)
1161221828Sgrehan{
1162221828Sgrehan	if (vcpu < 0 || vcpu >= VM_MAXCPU)
1163221828Sgrehan		return (EINVAL);
1164221828Sgrehan
1165221828Sgrehan	if (type < 0 || type >= VM_CAP_MAX)
1166221828Sgrehan		return (EINVAL);
1167221828Sgrehan
1168221828Sgrehan	return (VMSETCAP(vm->cookie, vcpu, type, val));
1169221828Sgrehan}
1170221828Sgrehan
1171221828Sgrehanuint64_t *
1172221828Sgrehanvm_guest_msrs(struct vm *vm, int cpu)
1173221828Sgrehan{
1174221828Sgrehan	return (vm->vcpu[cpu].guest_msrs);
1175221828Sgrehan}
1176221828Sgrehan
1177221828Sgrehanstruct vlapic *
1178221828Sgrehanvm_lapic(struct vm *vm, int cpu)
1179221828Sgrehan{
1180221828Sgrehan	return (vm->vcpu[cpu].vlapic);
1181221828Sgrehan}
1182221828Sgrehan
1183261088Sjhbstruct vioapic *
1184261088Sjhbvm_ioapic(struct vm *vm)
1185261088Sjhb{
1186261088Sjhb
1187261088Sjhb	return (vm->vioapic);
1188261088Sjhb}
1189261088Sjhb
1190261088Sjhbstruct vhpet *
1191261088Sjhbvm_hpet(struct vm *vm)
1192261088Sjhb{
1193261088Sjhb
1194261088Sjhb	return (vm->vhpet);
1195261088Sjhb}
1196261088Sjhb
/*
 * Return TRUE if the PCI device identified by bus/slot/func appears in
 * one of the 'pptdevs*' kernel environment variables, i.e. it has been
 * designated as a PCI passthru device.
 */
boolean_t
vmm_is_pptdev(int bus, int slot, int func)
{
	int found, i, n;
	int b, s, f;
	char *val, *cp, *cp2;

	/*
	 * XXX
	 * The length of an environment variable is limited to 128 bytes which
	 * puts an upper limit on the number of passthru devices that may be
	 * specified using a single environment variable.
	 *
	 * Work around this by scanning multiple environment variable
	 * names instead of a single one - yuck!
	 */
	const char *names[] = { "pptdevs", "pptdevs2", "pptdevs3", NULL };

	/* set pptdevs="1/2/3 4/5/6 7/8/9 10/11/12" */
	found = 0;
	for (i = 0; names[i] != NULL && !found; i++) {
		/* getenv(9) returns a private copy; it is safe to modify. */
		cp = val = getenv(names[i]);
		while (cp != NULL && *cp != '\0') {
			/* Temporarily terminate this b/s/f entry at the space. */
			if ((cp2 = strchr(cp, ' ')) != NULL)
				*cp2 = '\0';

			n = sscanf(cp, "%d/%d/%d", &b, &s, &f);
			if (n == 3 && bus == b && slot == s && func == f) {
				found = 1;
				break;
			}

			/* Restore the separator and advance to the next entry. */
			if (cp2 != NULL)
				*cp2++ = ' ';

			cp = cp2;
		}
		/* Release the copy allocated by getenv(9). */
		freeenv(val);
	}
	return (found);
}
1238221828Sgrehan
1239221828Sgrehanvoid *
1240221828Sgrehanvm_iommu_domain(struct vm *vm)
1241221828Sgrehan{
1242221828Sgrehan
1243221828Sgrehan	return (vm->iommu);
1244221828Sgrehan}
1245221828Sgrehan
1246241489Sneelint
1247256072Sneelvcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state newstate)
1248221828Sgrehan{
1249241489Sneel	int error;
1250221828Sgrehan	struct vcpu *vcpu;
1251221828Sgrehan
1252221828Sgrehan	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1253221828Sgrehan		panic("vm_set_run_state: invalid vcpuid %d", vcpuid);
1254221828Sgrehan
1255221828Sgrehan	vcpu = &vm->vcpu[vcpuid];
1256221828Sgrehan
1257241489Sneel	vcpu_lock(vcpu);
1258256072Sneel	error = vcpu_set_state_locked(vcpu, newstate);
1259241489Sneel	vcpu_unlock(vcpu);
1260241489Sneel
1261241489Sneel	return (error);
1262221828Sgrehan}
1263221828Sgrehan
1264241489Sneelenum vcpu_state
1265249879Sgrehanvcpu_get_state(struct vm *vm, int vcpuid, int *hostcpu)
1266221828Sgrehan{
1267221828Sgrehan	struct vcpu *vcpu;
1268241489Sneel	enum vcpu_state state;
1269221828Sgrehan
1270221828Sgrehan	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1271221828Sgrehan		panic("vm_get_run_state: invalid vcpuid %d", vcpuid);
1272221828Sgrehan
1273221828Sgrehan	vcpu = &vm->vcpu[vcpuid];
1274221828Sgrehan
1275241489Sneel	vcpu_lock(vcpu);
1276241489Sneel	state = vcpu->state;
1277249879Sgrehan	if (hostcpu != NULL)
1278249879Sgrehan		*hostcpu = vcpu->hostcpu;
1279241489Sneel	vcpu_unlock(vcpu);
1280221828Sgrehan
1281241489Sneel	return (state);
1282221828Sgrehan}
1283221828Sgrehan
1284221828Sgrehanvoid
1285221828Sgrehanvm_activate_cpu(struct vm *vm, int vcpuid)
1286221828Sgrehan{
1287221828Sgrehan
1288221828Sgrehan	if (vcpuid >= 0 && vcpuid < VM_MAXCPU)
1289223621Sgrehan		CPU_SET(vcpuid, &vm->active_cpus);
1290221828Sgrehan}
1291221828Sgrehan
1292223621Sgrehancpuset_t
1293221828Sgrehanvm_active_cpus(struct vm *vm)
1294221828Sgrehan{
1295221828Sgrehan
1296221828Sgrehan	return (vm->active_cpus);
1297221828Sgrehan}
1298221828Sgrehan
1299221828Sgrehanvoid *
1300221828Sgrehanvcpu_stats(struct vm *vm, int vcpuid)
1301221828Sgrehan{
1302221828Sgrehan
1303221828Sgrehan	return (vm->vcpu[vcpuid].stats);
1304221828Sgrehan}
1305240922Sneel
1306240922Sneelint
1307240922Sneelvm_get_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state *state)
1308240922Sneel{
1309240922Sneel	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1310240922Sneel		return (EINVAL);
1311240922Sneel
1312240922Sneel	*state = vm->vcpu[vcpuid].x2apic_state;
1313240922Sneel
1314240922Sneel	return (0);
1315240922Sneel}
1316240922Sneel
1317240922Sneelint
1318240922Sneelvm_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state)
1319240922Sneel{
1320240922Sneel	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1321240922Sneel		return (EINVAL);
1322240922Sneel
1323248392Sneel	if (state >= X2APIC_STATE_LAST)
1324240922Sneel		return (EINVAL);
1325240922Sneel
1326240922Sneel	vm->vcpu[vcpuid].x2apic_state = state;
1327240922Sneel
1328240943Sneel	vlapic_set_x2apic_state(vm, vcpuid, state);
1329240943Sneel
1330240922Sneel	return (0);
1331240922Sneel}
1332241489Sneel
/*
 * Nudge a vcpu so it notices a newly pending interrupt/NMI: wake it if
 * it is sleeping in vm_handle_hlt(), or IPI the host cpu it is running
 * on to force a VM exit.  The vcpu lock serializes this against the
 * vcpu's own state transitions.
 */
void
vm_interrupt_hostcpu(struct vm *vm, int vcpuid)
{
	int hostcpu;
	struct vcpu *vcpu;

	vcpu = &vm->vcpu[vcpuid];

	vcpu_lock(vcpu);
	hostcpu = vcpu->hostcpu;
	if (hostcpu == NOCPU) {
		/* Not in guest context; wake it only if it is sleeping. */
		if (vcpu->state == VCPU_SLEEPING)
			wakeup_one(vcpu);
	} else {
		/* hostcpu is only set while the vcpu is RUNNING. */
		if (vcpu->state != VCPU_RUNNING)
			panic("invalid vcpu state %d", vcpu->state);
		/* Force a VM exit on the remote host cpu. */
		if (hostcpu != curcpu)
			ipi_cpu(hostcpu, vmm_ipinum);
	}
	vcpu_unlock(vcpu);
}
1354256072Sneel
1355256072Sneelstruct vmspace *
1356256072Sneelvm_get_vmspace(struct vm *vm)
1357256072Sneel{
1358256072Sneel
1359256072Sneel	return (vm->vmspace);
1360256072Sneel}
1361261088Sjhb
/*
 * Map an apic id to a vcpu id.
 *
 * XXX apic ids are assumed to be numerically identical to vcpu ids.
 */
int
vm_apicid2vcpuid(struct vm *vm, int apicid)
{
	int vcpuid;

	vcpuid = apicid;
	return (vcpuid);
}
1370