/*-
 * Copyright (c) 2011 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD: stable/10/sys/amd64/vmm/vmm.c 262349 2014-02-22 23:34:39Z jhb $
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: stable/10/sys/amd64/vmm/vmm.c 262349 2014-02-22 23:34:39Z jhb $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/sysctl.h>
#include <sys/malloc.h>
#include <sys/pcpu.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/smp.h>

#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_extern.h>
#include <vm/vm_param.h>

#include <machine/cpu.h>
#include <machine/vm.h>
#include <machine/pcb.h>
#include <machine/smp.h>
#include <x86/apicreg.h>
#include <machine/vmparam.h>

#include <machine/vmm.h>
#include <machine/vmm_dev.h>

#include "vmm_ktr.h"
#include "vmm_host.h"
#include "vmm_mem.h"
#include "vmm_util.h"
#include "vhpet.h"
#include "vioapic.h"
#include "vlapic.h"
#include "vmm_msr.h"
#include "vmm_ipi.h"
#include "vmm_stat.h"
#include "vmm_lapic.h"

#include "io/ppt.h"
#include "io/iommu.h"

struct vlapic;

struct vcpu {
	int		flags;
	enum vcpu_state	state;
	struct mtx	mtx;
	int		hostcpu;	/* host cpuid this vcpu last ran on */
	uint64_t	guest_msrs[VMM_MSR_NUM];
	struct vlapic	*vlapic;
	int		 vcpuid;
	struct savefpu	*guestfpu;	/* guest fpu state */
	void		*stats;
	struct vm_exit	exitinfo;
	enum x2apic_state x2apic_state;
	int		nmi_pending;
};

#define	vcpu_lock_init(v)	mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN)
#define	vcpu_lock(v)		mtx_lock_spin(&((v)->mtx))
#define	vcpu_unlock(v)		mtx_unlock_spin(&((v)->mtx))
#define	vcpu_assert_locked(v)	mtx_assert(&((v)->mtx), MA_OWNED)

struct mem_seg {
	vm_paddr_t	gpa;
	size_t		len;
	boolean_t	wired;
	vm_object_t	object;
};
#define	VM_MAX_MEMORY_SEGMENTS	2

struct vm {
	void		*cookie;	/* processor-specific data */
	void		*iommu;		/* iommu-specific data */
	struct vhpet	*vhpet;		/* virtual HPET */
	struct vioapic	*vioapic;	/* virtual ioapic */
	struct vmspace	*vmspace;	/* guest's address space */
	struct vcpu	vcpu[VM_MAXCPU];
	int		num_mem_segs;
	struct mem_seg	mem_segs[VM_MAX_MEMORY_SEGMENTS];
	char		name[VM_MAX_NAMELEN];

	/*
	 * Set of active vcpus.
	 * An active vcpu is one that has been started implicitly (BSP) or
	 * explicitly (AP) by sending it a startup ipi.
	 */
	cpuset_t	active_cpus;
};

static int vmm_initialized;

static struct vmm_ops *ops;
#define	VMM_INIT()	(ops != NULL ? (*ops->init)() : 0)
#define	VMM_CLEANUP()	(ops != NULL ? (*ops->cleanup)() : 0)
#define	VMM_RESUME()	(ops != NULL ? (*ops->resume)() : 0)

#define	VMINIT(vm, pmap) (ops != NULL ? (*ops->vminit)(vm, pmap) : NULL)
#define	VMRUN(vmi, vcpu, rip, pmap) \
	(ops != NULL ? (*ops->vmrun)(vmi, vcpu, rip, pmap) : ENXIO)
#define	VMCLEANUP(vmi)	(ops != NULL ? (*ops->vmcleanup)(vmi) : NULL)
#define	VMSPACE_ALLOC(min, max) \
	(ops != NULL ? (*ops->vmspace_alloc)(min, max) : NULL)
#define	VMSPACE_FREE(vmspace) \
	(ops != NULL ? (*ops->vmspace_free)(vmspace) : ENXIO)
#define	VMGETREG(vmi, vcpu, num, retval)		\
	(ops != NULL ? (*ops->vmgetreg)(vmi, vcpu, num, retval) : ENXIO)
#define	VMSETREG(vmi, vcpu, num, val)		\
	(ops != NULL ? (*ops->vmsetreg)(vmi, vcpu, num, val) : ENXIO)
#define	VMGETDESC(vmi, vcpu, num, desc)		\
	(ops != NULL ? (*ops->vmgetdesc)(vmi, vcpu, num, desc) : ENXIO)
#define	VMSETDESC(vmi, vcpu, num, desc)		\
	(ops != NULL ? (*ops->vmsetdesc)(vmi, vcpu, num, desc) : ENXIO)
#define	VMINJECT(vmi, vcpu, type, vec, ec, ecv)	\
	(ops != NULL ? (*ops->vminject)(vmi, vcpu, type, vec, ec, ecv) : ENXIO)
#define	VMGETCAP(vmi, vcpu, num, retval)	\
	(ops != NULL ? (*ops->vmgetcap)(vmi, vcpu, num, retval) : ENXIO)
#define	VMSETCAP(vmi, vcpu, num, val)		\
	(ops != NULL ? (*ops->vmsetcap)(vmi, vcpu, num, val) : ENXIO)
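
/*
 * Illustrative sketch (not part of the original file): each hardware
 * backend supplies a 'struct vmm_ops' that the macros above dispatch
 * through once vmm_init() has selected it.  Assuming field names as
 * declared in <machine/vmm.h>, the Intel backend would look roughly
 * like:
 *
 *	struct vmm_ops vmm_ops_intel = {
 *		.init		= vmx_init,
 *		.cleanup	= vmx_cleanup,
 *		.vminit		= vmx_vminit,
 *		.vmrun		= vmx_run,
 *		...
 *	};
 *
 * If 'ops' is still NULL (no supported CPU was found), every macro
 * degrades to a harmless default: 0, NULL or ENXIO.
 */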

#define	fpu_start_emulating()	load_cr0(rcr0() | CR0_TS)
#define	fpu_stop_emulating()	clts()

static MALLOC_DEFINE(M_VM, "vm", "vm");
CTASSERT(VMM_MSR_NUM <= 64);	/* msr_mask can keep track of up to 64 msrs */

/* statistics */
static VMM_STAT(VCPU_TOTAL_RUNTIME, "vcpu total runtime");

static void
vcpu_cleanup(struct vcpu *vcpu)
{
	vlapic_cleanup(vcpu->vlapic);
	vmm_stat_free(vcpu->stats);
	fpu_save_area_free(vcpu->guestfpu);
}

static void
vcpu_init(struct vm *vm, uint32_t vcpu_id)
{
	struct vcpu *vcpu;

	vcpu = &vm->vcpu[vcpu_id];

	vcpu_lock_init(vcpu);
	vcpu->hostcpu = NOCPU;
	vcpu->vcpuid = vcpu_id;
	vcpu->vlapic = vlapic_init(vm, vcpu_id);
	vm_set_x2apic_state(vm, vcpu_id, X2APIC_ENABLED);
	vcpu->guestfpu = fpu_save_area_alloc();
	fpu_save_area_reset(vcpu->guestfpu);
	vcpu->stats = vmm_stat_alloc();
}

struct vm_exit *
vm_exitinfo(struct vm *vm, int cpuid)
{
	struct vcpu *vcpu;

	if (cpuid < 0 || cpuid >= VM_MAXCPU)
		panic("vm_exitinfo: invalid cpuid %d", cpuid);

	vcpu = &vm->vcpu[cpuid];

	return (&vcpu->exitinfo);
}

static void
vmm_resume(void)
{
	VMM_RESUME();
}

static int
vmm_init(void)
{
	int error;

	vmm_host_state_init();
	vmm_ipi_init();

	error = vmm_mem_init();
	if (error)
		return (error);

	if (vmm_is_intel())
		ops = &vmm_ops_intel;
	else if (vmm_is_amd())
		ops = &vmm_ops_amd;
	else
		return (ENXIO);

	vmm_msr_init();
	vmm_resume_p = vmm_resume;

	return (VMM_INIT());
}

static int
vmm_handler(module_t mod, int what, void *arg)
{
	int error;

	switch (what) {
	case MOD_LOAD:
		vmmdev_init();
		iommu_init();
		error = vmm_init();
		if (error == 0)
			vmm_initialized = 1;
		break;
	case MOD_UNLOAD:
		error = vmmdev_cleanup();
		if (error == 0) {
			vmm_resume_p = NULL;
			iommu_cleanup();
			vmm_ipi_cleanup();
			error = VMM_CLEANUP();
			/*
			 * Something bad happened - prevent new
			 * VMs from being created
			 */
			if (error)
				vmm_initialized = 0;
		}
		break;
	default:
		error = 0;
		break;
	}
	return (error);
}

static moduledata_t vmm_kmod = {
	"vmm",
	vmm_handler,
	NULL
};

/*
 * vmm initialization has the following dependencies:
 *
 * - iommu initialization must happen after the pci passthru driver has had
 *   a chance to attach to any passthru devices (after SI_SUB_CONFIGURE).
 *
 * - VT-x initialization requires smp_rendezvous() and therefore must happen
 *   after SMP is fully functional (after SI_SUB_SMP).
 */
DECLARE_MODULE(vmm, vmm_kmod, SI_SUB_SMP + 1, SI_ORDER_ANY);
MODULE_VERSION(vmm, 1);

SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW, NULL, NULL);

int
vm_create(const char *name, struct vm **retvm)
{
	int i;
	struct vm *vm;
	struct vmspace *vmspace;

	const int BSP = 0;

	/*
	 * If vmm.ko could not be successfully initialized then don't attempt
	 * to create the virtual machine.
	 */
	if (!vmm_initialized)
		return (ENXIO);

	if (name == NULL || strlen(name) >= VM_MAX_NAMELEN)
		return (EINVAL);

	vmspace = VMSPACE_ALLOC(VM_MIN_ADDRESS, VM_MAXUSER_ADDRESS);
	if (vmspace == NULL)
		return (ENOMEM);

	vm = malloc(sizeof(struct vm), M_VM, M_WAITOK | M_ZERO);
	strcpy(vm->name, name);
	vm->cookie = VMINIT(vm, vmspace_pmap(vmspace));
	vm->vioapic = vioapic_init(vm);
	vm->vhpet = vhpet_init(vm);

	for (i = 0; i < VM_MAXCPU; i++) {
		vcpu_init(vm, i);
		guest_msrs_init(vm, i);
	}

	vm_activate_cpu(vm, BSP);
	vm->vmspace = vmspace;

	*retvm = vm;
	return (0);
}
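
/*
 * Illustrative sketch (not part of the original file): a minimal
 * lifecycle built from the interfaces in this file.  The VM name and
 * memory size are assumptions for illustration only.
 *
 *	struct vm *vm;
 *	int error;
 *
 *	error = vm_create("testvm", &vm);
 *	if (error == 0) {
 *		error = vm_malloc(vm, 0, 256 * 1024 * 1024);
 *		...
 *		vm_destroy(vm);
 *	}
 *
 * In practice these calls are driven from userland through the ioctls
 * in vmm_dev.c rather than invoked directly.
 */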

static void
vm_free_mem_seg(struct vm *vm, struct mem_seg *seg)
{

	if (seg->object != NULL)
		vmm_mem_free(vm->vmspace, seg->gpa, seg->len);

	bzero(seg, sizeof(*seg));
}

void
vm_destroy(struct vm *vm)
{
	int i;

	ppt_unassign_all(vm);

	if (vm->iommu != NULL)
		iommu_destroy_domain(vm->iommu);

	vhpet_cleanup(vm->vhpet);
	vioapic_cleanup(vm->vioapic);

	for (i = 0; i < vm->num_mem_segs; i++)
		vm_free_mem_seg(vm, &vm->mem_segs[i]);

	vm->num_mem_segs = 0;

	for (i = 0; i < VM_MAXCPU; i++)
		vcpu_cleanup(&vm->vcpu[i]);

	VMSPACE_FREE(vm->vmspace);

	VMCLEANUP(vm->cookie);

	free(vm, M_VM);
}

const char *
vm_name(struct vm *vm)
{
	return (vm->name);
}

int
vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
{
	vm_object_t obj;

	if ((obj = vmm_mmio_alloc(vm->vmspace, gpa, len, hpa)) == NULL)
		return (ENOMEM);
	else
		return (0);
}

int
vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len)
{

	vmm_mmio_free(vm->vmspace, gpa, len);
	return (0);
}

boolean_t
vm_mem_allocated(struct vm *vm, vm_paddr_t gpa)
{
	int i;
	vm_paddr_t gpabase, gpalimit;

	for (i = 0; i < vm->num_mem_segs; i++) {
		gpabase = vm->mem_segs[i].gpa;
		gpalimit = gpabase + vm->mem_segs[i].len;
		if (gpa >= gpabase && gpa < gpalimit)
			return (TRUE);		/* 'gpa' is regular memory */
	}

	if (ppt_is_mmio(vm, gpa))
		return (TRUE);			/* 'gpa' is pci passthru mmio */

	return (FALSE);
}

int
vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len)
{
	int available, allocated;
	struct mem_seg *seg;
	vm_object_t object;
	vm_paddr_t g;

	if ((gpa & PAGE_MASK) || (len & PAGE_MASK) || len == 0)
		return (EINVAL);

	available = allocated = 0;
	g = gpa;
	while (g < gpa + len) {
		if (vm_mem_allocated(vm, g))
			allocated++;
		else
			available++;

		g += PAGE_SIZE;
	}

	/*
	 * If there are some allocated and some available pages in the address
	 * range then it is an error.
	 */
	if (allocated && available)
		return (EINVAL);

	/*
	 * If the entire address range being requested has already been
	 * allocated then there isn't anything more to do.
	 */
	if (allocated && available == 0)
		return (0);

	if (vm->num_mem_segs >= VM_MAX_MEMORY_SEGMENTS)
		return (E2BIG);

	seg = &vm->mem_segs[vm->num_mem_segs];

	if ((object = vmm_mem_alloc(vm->vmspace, gpa, len)) == NULL)
		return (ENOMEM);

	seg->gpa = gpa;
	seg->len = len;
	seg->object = object;
	seg->wired = FALSE;

	vm->num_mem_segs++;

	return (0);
}
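
/*
 * Illustrative sketch (not part of the original file): the typical
 * bhyve-style layout is two segments, which is exactly what
 * VM_MAX_MEMORY_SEGMENTS allows.  The split addresses below are
 * assumptions for illustration only.
 *
 *	// low memory: [0, lowmem)
 *	error = vm_malloc(vm, 0, lowmem);
 *	// high memory: [4GB, 4GB + highmem)
 *	if (error == 0 && highmem != 0)
 *		error = vm_malloc(vm, 4UL * 1024 * 1024 * 1024, highmem);
 *
 * Both 'gpa' and 'len' must be page aligned; a request that partially
 * overlaps an existing segment fails with EINVAL, and a third segment
 * fails with E2BIG.
 */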

static void
vm_gpa_unwire(struct vm *vm)
{
	int i, rv;
	struct mem_seg *seg;

	for (i = 0; i < vm->num_mem_segs; i++) {
		seg = &vm->mem_segs[i];
		if (!seg->wired)
			continue;

		rv = vm_map_unwire(&vm->vmspace->vm_map,
				   seg->gpa, seg->gpa + seg->len,
				   VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
		KASSERT(rv == KERN_SUCCESS, ("vm(%s) memory segment "
		    "%#lx/%ld could not be unwired: %d",
		    vm_name(vm), seg->gpa, seg->len, rv));

		seg->wired = FALSE;
	}
}

static int
vm_gpa_wire(struct vm *vm)
{
	int i, rv;
	struct mem_seg *seg;

	for (i = 0; i < vm->num_mem_segs; i++) {
		seg = &vm->mem_segs[i];
		if (seg->wired)
			continue;

		/* XXX rlimits? */
		rv = vm_map_wire(&vm->vmspace->vm_map,
				 seg->gpa, seg->gpa + seg->len,
				 VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
		if (rv != KERN_SUCCESS)
			break;

		seg->wired = TRUE;
	}

	if (i < vm->num_mem_segs) {
		/*
		 * Undo the wiring before returning an error.
		 */
		vm_gpa_unwire(vm);
		return (EAGAIN);
	}

	return (0);
}

static void
vm_iommu_modify(struct vm *vm, boolean_t map)
{
	int i, sz;
	vm_paddr_t gpa, hpa;
	struct mem_seg *seg;
	void *vp, *cookie, *host_domain;

	sz = PAGE_SIZE;
	host_domain = iommu_host_domain();

	for (i = 0; i < vm->num_mem_segs; i++) {
		seg = &vm->mem_segs[i];
		KASSERT(seg->wired, ("vm(%s) memory segment %#lx/%ld not wired",
		    vm_name(vm), seg->gpa, seg->len));

		gpa = seg->gpa;
		while (gpa < seg->gpa + seg->len) {
			vp = vm_gpa_hold(vm, gpa, PAGE_SIZE, VM_PROT_WRITE,
					 &cookie);
			KASSERT(vp != NULL, ("vm(%s) could not map gpa %#lx",
			    vm_name(vm), gpa));

			vm_gpa_release(cookie);

			hpa = DMAP_TO_PHYS((uintptr_t)vp);
			if (map) {
				iommu_create_mapping(vm->iommu, gpa, hpa, sz);
				iommu_remove_mapping(host_domain, hpa, sz);
			} else {
				iommu_remove_mapping(vm->iommu, gpa, sz);
				iommu_create_mapping(host_domain, hpa, hpa, sz);
			}

			gpa += PAGE_SIZE;
		}
	}

	/*
	 * Invalidate the cached translations associated with the domain
	 * from which pages were removed.
	 */
	if (map)
		iommu_invalidate_tlb(host_domain);
	else
		iommu_invalidate_tlb(vm->iommu);
}

#define	vm_iommu_unmap(vm)	vm_iommu_modify((vm), FALSE)
#define	vm_iommu_map(vm)	vm_iommu_modify((vm), TRUE)

int
vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func)
{
	int error;

	error = ppt_unassign_device(vm, bus, slot, func);
	if (error)
		return (error);

	if (ppt_num_devices(vm) == 0) {
		vm_iommu_unmap(vm);
		vm_gpa_unwire(vm);
	}
	return (0);
}

int
vm_assign_pptdev(struct vm *vm, int bus, int slot, int func)
{
	int error;
	vm_paddr_t maxaddr;

	/*
	 * Virtual machines with pci passthru devices get special treatment:
	 * - the guest physical memory is wired
	 * - the iommu is programmed to do the 'gpa' to 'hpa' translation
	 *
	 * We need to do this before the first pci passthru device is attached.
	 */
	if (ppt_num_devices(vm) == 0) {
		KASSERT(vm->iommu == NULL,
		    ("vm_assign_pptdev: iommu must be NULL"));
		maxaddr = vmm_mem_maxaddr();
		vm->iommu = iommu_create_domain(maxaddr);

		error = vm_gpa_wire(vm);
		if (error)
			return (error);

		vm_iommu_map(vm);
	}

	error = ppt_assign_device(vm, bus, slot, func);
	return (error);
}

void *
vm_gpa_hold(struct vm *vm, vm_paddr_t gpa, size_t len, int reqprot,
	    void **cookie)
{
	int count, pageoff;
	vm_page_t m;

	pageoff = gpa & PAGE_MASK;
	if (len > PAGE_SIZE - pageoff)
		panic("vm_gpa_hold: invalid gpa/len: 0x%016lx/%lu", gpa, len);

	count = vm_fault_quick_hold_pages(&vm->vmspace->vm_map,
	    trunc_page(gpa), PAGE_SIZE, reqprot, &m, 1);

	if (count == 1) {
		*cookie = m;
		return ((void *)(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)) + pageoff));
	} else {
		*cookie = NULL;
		return (NULL);
	}
}

void
vm_gpa_release(void *cookie)
{
	vm_page_t m = cookie;

	vm_page_lock(m);
	vm_page_unhold(m);
	vm_page_unlock(m);
}
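
/*
 * Illustrative sketch (not part of the original file): vm_gpa_hold()
 * and vm_gpa_release() must be paired, and a single hold never spans
 * a page boundary.  A hedged example of reading a guest dword:
 *
 *	uint32_t val;
 *	void *cookie;
 *	void *hva;
 *
 *	hva = vm_gpa_hold(vm, gpa, sizeof(val), VM_PROT_READ, &cookie);
 *	if (hva != NULL) {
 *		bcopy(hva, &val, sizeof(val));
 *		vm_gpa_release(cookie);
 *	}
 *
 * The returned pointer is a direct-map (DMAP) address, so it is only
 * valid while the page is held.
 */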

int
vm_gpabase2memseg(struct vm *vm, vm_paddr_t gpabase,
		  struct vm_memory_segment *seg)
{
	int i;

	for (i = 0; i < vm->num_mem_segs; i++) {
		if (gpabase == vm->mem_segs[i].gpa) {
			seg->gpa = vm->mem_segs[i].gpa;
			seg->len = vm->mem_segs[i].len;
			seg->wired = vm->mem_segs[i].wired;
			return (0);
		}
	}
	return (-1);
}

int
vm_get_memobj(struct vm *vm, vm_paddr_t gpa, size_t len,
	      vm_offset_t *offset, struct vm_object **object)
{
	int i;
	size_t seg_len;
	vm_paddr_t seg_gpa;
	vm_object_t seg_obj;

	for (i = 0; i < vm->num_mem_segs; i++) {
		if ((seg_obj = vm->mem_segs[i].object) == NULL)
			continue;

		seg_gpa = vm->mem_segs[i].gpa;
		seg_len = vm->mem_segs[i].len;

		if (gpa >= seg_gpa && gpa < seg_gpa + seg_len) {
			*offset = gpa - seg_gpa;
			*object = seg_obj;
			vm_object_reference(seg_obj);
			return (0);
		}
	}

	return (EINVAL);
}

int
vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval)
{

	if (vcpu < 0 || vcpu >= VM_MAXCPU)
		return (EINVAL);

	if (reg >= VM_REG_LAST)
		return (EINVAL);

	return (VMGETREG(vm->cookie, vcpu, reg, retval));
}

int
vm_set_register(struct vm *vm, int vcpu, int reg, uint64_t val)
{

	if (vcpu < 0 || vcpu >= VM_MAXCPU)
		return (EINVAL);

	if (reg >= VM_REG_LAST)
		return (EINVAL);

	return (VMSETREG(vm->cookie, vcpu, reg, val));
}

static boolean_t
is_descriptor_table(int reg)
{

	switch (reg) {
	case VM_REG_GUEST_IDTR:
	case VM_REG_GUEST_GDTR:
		return (TRUE);
	default:
		return (FALSE);
	}
}

static boolean_t
is_segment_register(int reg)
{

	switch (reg) {
	case VM_REG_GUEST_ES:
	case VM_REG_GUEST_CS:
	case VM_REG_GUEST_SS:
	case VM_REG_GUEST_DS:
	case VM_REG_GUEST_FS:
	case VM_REG_GUEST_GS:
	case VM_REG_GUEST_TR:
	case VM_REG_GUEST_LDTR:
		return (TRUE);
	default:
		return (FALSE);
	}
}

int
vm_get_seg_desc(struct vm *vm, int vcpu, int reg,
		struct seg_desc *desc)
{

	if (vcpu < 0 || vcpu >= VM_MAXCPU)
		return (EINVAL);

	if (!is_segment_register(reg) && !is_descriptor_table(reg))
		return (EINVAL);

	return (VMGETDESC(vm->cookie, vcpu, reg, desc));
}

int
vm_set_seg_desc(struct vm *vm, int vcpu, int reg,
		struct seg_desc *desc)
{
	if (vcpu < 0 || vcpu >= VM_MAXCPU)
		return (EINVAL);

	if (!is_segment_register(reg) && !is_descriptor_table(reg))
		return (EINVAL);

	return (VMSETDESC(vm->cookie, vcpu, reg, desc));
}

static void
restore_guest_fpustate(struct vcpu *vcpu)
{

	/* flush host state to the pcb */
	fpuexit(curthread);

	/* restore guest FPU state */
	fpu_stop_emulating();
	fpurestore(vcpu->guestfpu);

	/*
	 * The FPU is now "dirty" with the guest's state so turn on emulation
	 * to trap any access to the FPU by the host.
	 */
	fpu_start_emulating();
}

static void
save_guest_fpustate(struct vcpu *vcpu)
{

	if ((rcr0() & CR0_TS) == 0)
		panic("fpu emulation not enabled in host!");

	/* save guest FPU state */
	fpu_stop_emulating();
	fpusave(vcpu->guestfpu);
	fpu_start_emulating();
}

static VMM_STAT(VCPU_IDLE_TICKS, "number of ticks vcpu was idle");

static int
vcpu_set_state_locked(struct vcpu *vcpu, enum vcpu_state newstate)
{
	int error;

	vcpu_assert_locked(vcpu);

	/*
	 * The following state transitions are allowed:
	 * IDLE -> FROZEN -> IDLE
	 * FROZEN -> RUNNING -> FROZEN
	 * FROZEN -> SLEEPING -> FROZEN
	 */
	switch (vcpu->state) {
	case VCPU_IDLE:
	case VCPU_RUNNING:
	case VCPU_SLEEPING:
		error = (newstate != VCPU_FROZEN);
		break;
	case VCPU_FROZEN:
		error = (newstate == VCPU_FROZEN);
		break;
	default:
		error = 1;
		break;
	}

	if (error == 0)
		vcpu->state = newstate;
	else
		error = EBUSY;

	return (error);
}
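
/*
 * Illustrative sketch (not part of the original file): the transition
 * diagram above means every consumer first freezes a vcpu, does its
 * work and then thaws it.  For example, the ioctl path in vmm_dev.c
 * brackets register accesses roughly like this:
 *
 *	error = vcpu_set_state(vm, vcpuid, VCPU_FROZEN);
 *	if (error == 0) {
 *		error = vm_get_register(vm, vcpuid, reg, &val);
 *		(void) vcpu_set_state(vm, vcpuid, VCPU_IDLE);
 *	}
 *
 * A vcpu that is RUNNING or SLEEPING refuses any state other than
 * FROZEN, so a concurrent freeze attempt fails with EBUSY.
 */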

static void
vcpu_require_state(struct vm *vm, int vcpuid, enum vcpu_state newstate)
{
	int error;

	if ((error = vcpu_set_state(vm, vcpuid, newstate)) != 0)
		panic("Error %d setting state to %d\n", error, newstate);
}

static void
vcpu_require_state_locked(struct vcpu *vcpu, enum vcpu_state newstate)
{
	int error;

	if ((error = vcpu_set_state_locked(vcpu, newstate)) != 0)
		panic("Error %d setting state to %d", error, newstate);
}

/*
 * Emulate a guest 'hlt' by sleeping until the vcpu is ready to run.
 */
static int
vm_handle_hlt(struct vm *vm, int vcpuid, boolean_t *retu)
{
	struct vcpu *vcpu;
	int sleepticks, t;

	vcpu = &vm->vcpu[vcpuid];

	vcpu_lock(vcpu);

	/*
	 * Figure out the number of host ticks until the next apic
	 * timer interrupt in the guest.
	 */
	sleepticks = lapic_timer_tick(vm, vcpuid);

	/*
	 * If the guest local apic timer is disabled then sleep for
	 * a long time but not forever.
	 */
	if (sleepticks < 0)
		sleepticks = hz;

	/*
	 * Do a final check for pending NMI or interrupts before
	 * really putting this thread to sleep.
	 *
	 * These interrupts could have happened any time after we
	 * returned from VMRUN() and before we grabbed the vcpu lock.
	 */
	if (!vm_nmi_pending(vm, vcpuid) && lapic_pending_intr(vm, vcpuid) < 0) {
		if (sleepticks <= 0)
			panic("invalid sleepticks %d", sleepticks);
		t = ticks;
		vcpu_require_state_locked(vcpu, VCPU_SLEEPING);
		msleep_spin(vcpu, &vcpu->mtx, "vmidle", sleepticks);
		vcpu_require_state_locked(vcpu, VCPU_FROZEN);
		vmm_stat_incr(vm, vcpuid, VCPU_IDLE_TICKS, ticks - t);
	}
	vcpu_unlock(vcpu);

	return (0);
}
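
/*
 * Illustrative note (not part of the original file): the sleep above
 * pairs with vm_interrupt_hostcpu() at the bottom of this file.  A
 * rough timeline when another thread injects an interrupt:
 *
 *	vcpu thread				injecting thread
 *	-----------				----------------
 *	vcpu_lock(vcpu)
 *	state = VCPU_SLEEPING
 *	msleep_spin(vcpu, ...)			vcpu_lock(vcpu)
 *						sees state == VCPU_SLEEPING
 *						wakeup_one(vcpu)
 *	wakes, state = VCPU_FROZEN		vcpu_unlock(vcpu)
 *
 * The final NMI/interrupt check under the vcpu lock closes the window
 * between VMRUN() returning and the vcpu actually going to sleep.
 */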

static int
vm_handle_paging(struct vm *vm, int vcpuid, boolean_t *retu)
{
	int rv, ftype;
	struct vm_map *map;
	struct vcpu *vcpu;
	struct vm_exit *vme;

	vcpu = &vm->vcpu[vcpuid];
	vme = &vcpu->exitinfo;

	ftype = vme->u.paging.fault_type;
	KASSERT(ftype == VM_PROT_READ ||
	    ftype == VM_PROT_WRITE || ftype == VM_PROT_EXECUTE,
	    ("vm_handle_paging: invalid fault_type %d", ftype));

	if (ftype == VM_PROT_READ || ftype == VM_PROT_WRITE) {
		rv = pmap_emulate_accessed_dirty(vmspace_pmap(vm->vmspace),
		    vme->u.paging.gpa, ftype);
		if (rv == 0)
			goto done;
	}

	map = &vm->vmspace->vm_map;
	rv = vm_fault(map, vme->u.paging.gpa, ftype, VM_FAULT_NORMAL);

	VCPU_CTR3(vm, vcpuid, "vm_handle_paging rv = %d, gpa = %#lx, "
	    "ftype = %d", rv, vme->u.paging.gpa, ftype);

	if (rv != KERN_SUCCESS)
		return (EFAULT);
done:
	/* restart execution at the faulting instruction */
	vme->inst_length = 0;

	return (0);
}

static int
vm_handle_inst_emul(struct vm *vm, int vcpuid, boolean_t *retu)
{
	struct vie *vie;
	struct vcpu *vcpu;
	struct vm_exit *vme;
	int error, inst_length;
	uint64_t rip, gla, gpa, cr3;
	mem_region_read_t mread;
	mem_region_write_t mwrite;

	vcpu = &vm->vcpu[vcpuid];
	vme = &vcpu->exitinfo;

	rip = vme->rip;
	inst_length = vme->inst_length;

	gla = vme->u.inst_emul.gla;
	gpa = vme->u.inst_emul.gpa;
	cr3 = vme->u.inst_emul.cr3;
	vie = &vme->u.inst_emul.vie;

	vie_init(vie);

	/* Fetch, decode and emulate the faulting instruction */
	if (vmm_fetch_instruction(vm, vcpuid, rip, inst_length, cr3, vie) != 0)
		return (EFAULT);

	if (vmm_decode_instruction(vm, vcpuid, gla, vie) != 0)
		return (EFAULT);

	/* return to userland unless this is an in-kernel emulated device */
	if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) {
		mread = lapic_mmio_read;
		mwrite = lapic_mmio_write;
	} else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) {
		mread = vioapic_mmio_read;
		mwrite = vioapic_mmio_write;
	} else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) {
		mread = vhpet_mmio_read;
		mwrite = vhpet_mmio_write;
	} else {
		*retu = TRUE;
		return (0);
	}

	error = vmm_emulate_instruction(vm, vcpuid, gpa, vie, mread, mwrite, 0);

	/* return to userland to spin up the AP */
	if (error == 0 && vme->exitcode == VM_EXITCODE_SPINUP_AP)
		*retu = TRUE;

	return (error);
}

int
vm_run(struct vm *vm, struct vm_run *vmrun)
{
	int error, vcpuid;
	struct vcpu *vcpu;
	struct pcb *pcb;
	uint64_t tscval, rip;
	struct vm_exit *vme;
	boolean_t retu;
	pmap_t pmap;

	vcpuid = vmrun->cpuid;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		return (EINVAL);

	pmap = vmspace_pmap(vm->vmspace);
	vcpu = &vm->vcpu[vcpuid];
	vme = &vcpu->exitinfo;
	rip = vmrun->rip;
restart:
	critical_enter();

	KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active),
	    ("vm_run: absurd pm_active"));

	tscval = rdtsc();

	pcb = PCPU_GET(curpcb);
	set_pcb_flags(pcb, PCB_FULL_IRET);

	restore_guest_msrs(vm, vcpuid);
	restore_guest_fpustate(vcpu);

	vcpu_require_state(vm, vcpuid, VCPU_RUNNING);
	vcpu->hostcpu = curcpu;
	error = VMRUN(vm->cookie, vcpuid, rip, pmap);
	vcpu->hostcpu = NOCPU;
	vcpu_require_state(vm, vcpuid, VCPU_FROZEN);

	save_guest_fpustate(vcpu);
	restore_host_msrs(vm, vcpuid);

	vmm_stat_incr(vm, vcpuid, VCPU_TOTAL_RUNTIME, rdtsc() - tscval);

	critical_exit();

	if (error == 0) {
		retu = FALSE;
		switch (vme->exitcode) {
		case VM_EXITCODE_HLT:
			error = vm_handle_hlt(vm, vcpuid, &retu);
			break;
		case VM_EXITCODE_PAGING:
			error = vm_handle_paging(vm, vcpuid, &retu);
			break;
		case VM_EXITCODE_INST_EMUL:
			error = vm_handle_inst_emul(vm, vcpuid, &retu);
			break;
		default:
			retu = TRUE;	/* handled in userland */
			break;
		}
	}

	if (error == 0 && retu == FALSE) {
		rip = vme->rip + vme->inst_length;
		goto restart;
	}

	/* copy the exit information */
	bcopy(vme, &vmrun->vm_exit, sizeof(struct vm_exit));
	return (error);
}
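
/*
 * Illustrative sketch (not part of the original file): vm_run() is
 * reached from userland through the VM_RUN ioctl on /dev/vmm/<name>
 * and only returns to userland when an exit must be handled there
 * (retu == TRUE) or on error.  A hedged outline of the userland side:
 *
 *	struct vm_run vmrun;
 *
 *	vmrun.cpuid = vcpu;
 *	vmrun.rip = rip;
 *	while (ioctl(vmfd, VM_RUN, &vmrun) == 0) {
 *		// handle vmrun.vm_exit (inout, unhandled mmio, ...)
 *		vmrun.rip = next_rip(&vmrun.vm_exit);
 *	}
 *
 * 'vmfd' and next_rip() are placeholders for illustration; the real
 * loop lives in bhyve's vm_loop().
 */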

int
vm_inject_event(struct vm *vm, int vcpuid, int type,
		int vector, uint32_t code, int code_valid)
{
	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		return (EINVAL);

	if ((type > VM_EVENT_NONE && type < VM_EVENT_MAX) == 0)
		return (EINVAL);

	if (vector < 0 || vector > 255)
		return (EINVAL);

	return (VMINJECT(vm->cookie, vcpuid, type, vector, code, code_valid));
}

static VMM_STAT(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu");

int
vm_inject_nmi(struct vm *vm, int vcpuid)
{
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		return (EINVAL);

	vcpu = &vm->vcpu[vcpuid];

	vcpu->nmi_pending = 1;
	vm_interrupt_hostcpu(vm, vcpuid);
	return (0);
}

int
vm_nmi_pending(struct vm *vm, int vcpuid)
{
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		panic("vm_nmi_pending: invalid vcpuid %d", vcpuid);

	vcpu = &vm->vcpu[vcpuid];

	return (vcpu->nmi_pending);
}

void
vm_nmi_clear(struct vm *vm, int vcpuid)
{
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		panic("vm_nmi_clear: invalid vcpuid %d", vcpuid);

	vcpu = &vm->vcpu[vcpuid];

	if (vcpu->nmi_pending == 0)
		panic("vm_nmi_clear: inconsistent nmi_pending state");

	vcpu->nmi_pending = 0;
	vmm_stat_incr(vm, vcpuid, VCPU_NMI_COUNT, 1);
}

int
vm_get_capability(struct vm *vm, int vcpu, int type, int *retval)
{
	if (vcpu < 0 || vcpu >= VM_MAXCPU)
		return (EINVAL);

	if (type < 0 || type >= VM_CAP_MAX)
		return (EINVAL);

	return (VMGETCAP(vm->cookie, vcpu, type, retval));
}

int
vm_set_capability(struct vm *vm, int vcpu, int type, int val)
{
	if (vcpu < 0 || vcpu >= VM_MAXCPU)
		return (EINVAL);

	if (type < 0 || type >= VM_CAP_MAX)
		return (EINVAL);

	return (VMSETCAP(vm->cookie, vcpu, type, val));
}

uint64_t *
vm_guest_msrs(struct vm *vm, int cpu)
{
	return (vm->vcpu[cpu].guest_msrs);
}

struct vlapic *
vm_lapic(struct vm *vm, int cpu)
{
	return (vm->vcpu[cpu].vlapic);
}

struct vioapic *
vm_ioapic(struct vm *vm)
{

	return (vm->vioapic);
}

struct vhpet *
vm_hpet(struct vm *vm)
{

	return (vm->vhpet);
}

boolean_t
vmm_is_pptdev(int bus, int slot, int func)
{
	int found, i, n;
	int b, s, f;
	char *val, *cp, *cp2;

	/*
	 * XXX
	 * The length of an environment variable is limited to 128 bytes which
	 * puts an upper limit on the number of passthru devices that may be
	 * specified using a single environment variable.
	 *
	 * Work around this by scanning multiple environment variable
	 * names instead of a single one - yuck!
	 */
	const char *names[] = { "pptdevs", "pptdevs2", "pptdevs3", NULL };

	/* set pptdevs="1/2/3 4/5/6 7/8/9 10/11/12" */
	found = 0;
	for (i = 0; names[i] != NULL && !found; i++) {
		cp = val = getenv(names[i]);
		while (cp != NULL && *cp != '\0') {
			if ((cp2 = strchr(cp, ' ')) != NULL)
				*cp2 = '\0';

			n = sscanf(cp, "%d/%d/%d", &b, &s, &f);
			if (n == 3 && bus == b && slot == s && func == f) {
				found = 1;
				break;
			}

			if (cp2 != NULL)
				*cp2++ = ' ';

			cp = cp2;
		}
		freeenv(val);
	}
	return (found);
}
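
/*
 * Illustrative sketch (not part of the original file): the tunables
 * scanned above are normally set from /boot/loader.conf.  The device
 * addresses here are made-up bus/slot/function examples:
 *
 *	pptdevs="2/0/0 4/1/0"
 *	pptdevs2="6/0/0"
 *
 * Devices listed this way are claimed by the ppt driver at boot
 * instead of their regular drivers, making them available for
 * passthru to a guest.
 */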

void *
vm_iommu_domain(struct vm *vm)
{

	return (vm->iommu);
}

int
vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state newstate)
{
	int error;
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		panic("vcpu_set_state: invalid vcpuid %d", vcpuid);

	vcpu = &vm->vcpu[vcpuid];

	vcpu_lock(vcpu);
	error = vcpu_set_state_locked(vcpu, newstate);
	vcpu_unlock(vcpu);

	return (error);
}

enum vcpu_state
vcpu_get_state(struct vm *vm, int vcpuid, int *hostcpu)
{
	struct vcpu *vcpu;
	enum vcpu_state state;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		panic("vcpu_get_state: invalid vcpuid %d", vcpuid);

	vcpu = &vm->vcpu[vcpuid];

	vcpu_lock(vcpu);
	state = vcpu->state;
	if (hostcpu != NULL)
		*hostcpu = vcpu->hostcpu;
	vcpu_unlock(vcpu);

	return (state);
}

void
vm_activate_cpu(struct vm *vm, int vcpuid)
{

	if (vcpuid >= 0 && vcpuid < VM_MAXCPU)
		CPU_SET(vcpuid, &vm->active_cpus);
}

cpuset_t
vm_active_cpus(struct vm *vm)
{

	return (vm->active_cpus);
}

void *
vcpu_stats(struct vm *vm, int vcpuid)
{

	return (vm->vcpu[vcpuid].stats);
}

int
vm_get_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state *state)
{
	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		return (EINVAL);

	*state = vm->vcpu[vcpuid].x2apic_state;

	return (0);
}

int
vm_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state)
{
	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		return (EINVAL);

	if (state >= X2APIC_STATE_LAST)
		return (EINVAL);

	vm->vcpu[vcpuid].x2apic_state = state;

	vlapic_set_x2apic_state(vm, vcpuid, state);

	return (0);
}

void
vm_interrupt_hostcpu(struct vm *vm, int vcpuid)
{
	int hostcpu;
	struct vcpu *vcpu;

	vcpu = &vm->vcpu[vcpuid];

	vcpu_lock(vcpu);
	hostcpu = vcpu->hostcpu;
	if (hostcpu == NOCPU) {
		if (vcpu->state == VCPU_SLEEPING)
			wakeup_one(vcpu);
	} else {
		if (vcpu->state != VCPU_RUNNING)
			panic("invalid vcpu state %d", vcpu->state);
		if (hostcpu != curcpu)
			ipi_cpu(hostcpu, vmm_ipinum);
	}
	vcpu_unlock(vcpu);
}

struct vmspace *
vm_get_vmspace(struct vm *vm)
{

	return (vm->vmspace);
}

int
vm_apicid2vcpuid(struct vm *vm, int apicid)
{
	/*
	 * XXX apic id is assumed to be numerically identical to vcpu id
	 */
	return (apicid);
}
1369