/*-
 * Copyright (c) 2011 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/sysctl.h>
#include <sys/malloc.h>
#include <sys/pcpu.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/smp.h>

#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_extern.h>
#include <vm/vm_param.h>

#include <machine/cpu.h>
#include <machine/vm.h>
#include <machine/pcb.h>
#include <machine/smp.h>
#include <x86/psl.h>
#include <x86/apicreg.h>
#include <machine/vmparam.h>

#include <machine/vmm.h>
#include <machine/vmm_dev.h>
#include <machine/vmm_instruction_emul.h>

#include "vmm_ioport.h"
#include "vmm_ktr.h"
#include "vmm_host.h"
#include "vmm_mem.h"
#include "vmm_util.h"
#include "vatpic.h"
#include "vatpit.h"
#include "vhpet.h"
#include "vioapic.h"
#include "vlapic.h"
#include "vmm_msr.h"
#include "vmm_ipi.h"
#include "vmm_stat.h"
#include "vmm_lapic.h"

#include "io/ppt.h"
#include "io/iommu.h"

struct vlapic;

/*
 * Initialization:
 * (a) allocated when vcpu is created
 * (i) initialized when vcpu is created and when it is reinitialized
 * (o) initialized the first time the vcpu is created
 * (x) initialized before use
 */
struct vcpu {
	struct mtx 	mtx;		/* (o) protects 'state' and 'hostcpu' */
	enum vcpu_state	state;		/* (o) vcpu state */
	int		hostcpu;	/* (o) vcpu's host cpu */
	struct vlapic	*vlapic;	/* (i) APIC device model */
	enum x2apic_state x2apic_state;	/* (i) APIC mode */
	uint64_t	exitintinfo;	/* (i) events pending at VM exit */
	int		nmi_pending;	/* (i) NMI pending */
	int		extint_pending;	/* (i) INTR pending */
	struct vm_exception exception;	/* (x) exception collateral */
	int	exception_pending;	/* (i) exception pending */
	struct savefpu	*guestfpu;	/* (a,i) guest fpu state */
	uint64_t	guest_xcr0;	/* (i) guest %xcr0 register */
	void		*stats;		/* (a,i) statistics */
	uint64_t guest_msrs[VMM_MSR_NUM]; /* (i) emulated MSRs */
	struct vm_exit	exitinfo;	/* (x) exit reason and collateral */
};

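/*
 * Accessors for the vcpu state lock. This is a spin mutex so code holding
 * it must not sleep; waits on vcpu state changes are therefore done with
 * msleep_spin() (see vcpu_set_state_locked() and vm_handle_hlt()).
 */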
#define	vcpu_lock_initialized(v) mtx_initialized(&((v)->mtx))
#define	vcpu_lock_init(v)	mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN)
#define	vcpu_lock(v)		mtx_lock_spin(&((v)->mtx))
#define	vcpu_unlock(v)		mtx_unlock_spin(&((v)->mtx))
#define	vcpu_assert_locked(v)	mtx_assert(&((v)->mtx), MA_OWNED)

struct mem_seg {
	vm_paddr_t	gpa;
	size_t		len;
	boolean_t	wired;
	vm_object_t	object;
};
#define	VM_MAX_MEMORY_SEGMENTS	2

/*
 * Initialization:
 * (o) initialized the first time the VM is created
 * (i) initialized when VM is created and when it is reinitialized
 * (x) initialized before use
 */
struct vm {
	void		*cookie;		/* (i) cpu-specific data */
	void		*iommu;			/* (x) iommu-specific data */
	struct vhpet	*vhpet;			/* (i) virtual HPET */
	struct vioapic	*vioapic;		/* (i) virtual ioapic */
	struct vatpic	*vatpic;		/* (i) virtual atpic */
	struct vatpit	*vatpit;		/* (i) virtual atpit */
	volatile cpuset_t active_cpus;		/* (i) active vcpus */
	int		suspend;		/* (i) stop VM execution */
	volatile cpuset_t suspended_cpus;	/* (i) suspended vcpus */
	volatile cpuset_t halted_cpus;		/* (x) cpus in a hard halt */
	cpuset_t	rendezvous_req_cpus;	/* (x) rendezvous requested */
	cpuset_t	rendezvous_done_cpus;	/* (x) rendezvous finished */
	void		*rendezvous_arg;	/* (x) rendezvous func/arg */
	vm_rendezvous_func_t rendezvous_func;
	struct mtx	rendezvous_mtx;		/* (o) rendezvous lock */
	int		num_mem_segs;		/* (o) guest memory segments */
	struct mem_seg	mem_segs[VM_MAX_MEMORY_SEGMENTS];
	struct vmspace	*vmspace;		/* (o) guest's address space */
	char		name[VM_MAX_NAMELEN];	/* (o) virtual machine name */
	struct vcpu	vcpu[VM_MAXCPU];	/* (i) guest vcpus */
};

static int vmm_initialized;

static struct vmm_ops *ops;
#define	VMM_INIT(num)	(ops != NULL ? (*ops->init)(num) : 0)
#define	VMM_CLEANUP()	(ops != NULL ? (*ops->cleanup)() : 0)
#define	VMM_RESUME()	(ops != NULL ? (*ops->resume)() : 0)

#define	VMINIT(vm, pmap) (ops != NULL ? (*ops->vminit)(vm, pmap): NULL)
#define	VMRUN(vmi, vcpu, rip, pmap, rptr, sptr) \
	(ops != NULL ? (*ops->vmrun)(vmi, vcpu, rip, pmap, rptr, sptr) : ENXIO)
#define	VMCLEANUP(vmi)	(ops != NULL ? (*ops->vmcleanup)(vmi) : NULL)
#define	VMSPACE_ALLOC(min, max) \
	(ops != NULL ? (*ops->vmspace_alloc)(min, max) : NULL)
#define	VMSPACE_FREE(vmspace) \
	(ops != NULL ? (*ops->vmspace_free)(vmspace) : ENXIO)
#define	VMGETREG(vmi, vcpu, num, retval)		\
	(ops != NULL ? (*ops->vmgetreg)(vmi, vcpu, num, retval) : ENXIO)
#define	VMSETREG(vmi, vcpu, num, val)		\
	(ops != NULL ? (*ops->vmsetreg)(vmi, vcpu, num, val) : ENXIO)
#define	VMGETDESC(vmi, vcpu, num, desc)		\
	(ops != NULL ? (*ops->vmgetdesc)(vmi, vcpu, num, desc) : ENXIO)
#define	VMSETDESC(vmi, vcpu, num, desc)		\
	(ops != NULL ? (*ops->vmsetdesc)(vmi, vcpu, num, desc) : ENXIO)
#define	VMGETCAP(vmi, vcpu, num, retval)	\
	(ops != NULL ? (*ops->vmgetcap)(vmi, vcpu, num, retval) : ENXIO)
#define	VMSETCAP(vmi, vcpu, num, val)		\
	(ops != NULL ? (*ops->vmsetcap)(vmi, vcpu, num, val) : ENXIO)
#define	VLAPIC_INIT(vmi, vcpu)			\
	(ops != NULL ? (*ops->vlapic_init)(vmi, vcpu) : NULL)
#define	VLAPIC_CLEANUP(vmi, vlapic)		\
	(ops != NULL ? (*ops->vlapic_cleanup)(vmi, vlapic) : NULL)

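/*
 * Setting CR0.TS makes the next FPU/SSE access trap with a #NM fault,
 * which is how stale host use of the FPU is detected while it still
 * holds guest state; clts() clears the bit and re-enables FPU access.
 */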
#define	fpu_start_emulating()	load_cr0(rcr0() | CR0_TS)
#define	fpu_stop_emulating()	clts()

static MALLOC_DEFINE(M_VM, "vm", "vm");
CTASSERT(VMM_MSR_NUM <= 64);	/* msr_mask can keep track of up to 64 msrs */

/* statistics */
static VMM_STAT(VCPU_TOTAL_RUNTIME, "vcpu total runtime");

SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW, NULL, NULL);

/*
 * Halt the guest if all vcpus are executing a HLT instruction with
 * interrupts disabled.
 */
static int halt_detection_enabled = 1;
TUNABLE_INT("hw.vmm.halt_detection", &halt_detection_enabled);
SYSCTL_INT(_hw_vmm, OID_AUTO, halt_detection, CTLFLAG_RDTUN,
    &halt_detection_enabled, 0,
    "Halt VM if all vcpus execute HLT with interrupts disabled");

static int vmm_ipinum;
SYSCTL_INT(_hw_vmm, OID_AUTO, ipinum, CTLFLAG_RD, &vmm_ipinum, 0,
    "IPI vector used for vcpu notifications");

static void
vcpu_cleanup(struct vm *vm, int i, bool destroy)
{
	struct vcpu *vcpu = &vm->vcpu[i];

	VLAPIC_CLEANUP(vm->cookie, vcpu->vlapic);
	if (destroy) {
		vmm_stat_free(vcpu->stats);
		fpu_save_area_free(vcpu->guestfpu);
	}
}

static void
vcpu_init(struct vm *vm, int vcpu_id, bool create)
{
	struct vcpu *vcpu;

	KASSERT(vcpu_id >= 0 && vcpu_id < VM_MAXCPU,
	    ("vcpu_init: invalid vcpu %d", vcpu_id));

	vcpu = &vm->vcpu[vcpu_id];

	if (create) {
		KASSERT(!vcpu_lock_initialized(vcpu), ("vcpu %d already "
		    "initialized", vcpu_id));
		vcpu_lock_init(vcpu);
		vcpu->state = VCPU_IDLE;
		vcpu->hostcpu = NOCPU;
		vcpu->guestfpu = fpu_save_area_alloc();
		vcpu->stats = vmm_stat_alloc();
	}

	vcpu->vlapic = VLAPIC_INIT(vm->cookie, vcpu_id);
	vm_set_x2apic_state(vm, vcpu_id, X2APIC_DISABLED);
	vcpu->exitintinfo = 0;
	vcpu->nmi_pending = 0;
	vcpu->extint_pending = 0;
	vcpu->exception_pending = 0;
	vcpu->guest_xcr0 = XFEATURE_ENABLED_X87;
	fpu_save_area_reset(vcpu->guestfpu);
	vmm_stat_init(vcpu->stats);
	guest_msrs_init(vm, vcpu_id);
}

struct vm_exit *
vm_exitinfo(struct vm *vm, int cpuid)
{
	struct vcpu *vcpu;

	if (cpuid < 0 || cpuid >= VM_MAXCPU)
		panic("vm_exitinfo: invalid cpuid %d", cpuid);

	vcpu = &vm->vcpu[cpuid];

	return (&vcpu->exitinfo);
}

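/*
 * Published via 'vmm_resume_p' (see vmm_init()); invoked when the host
 * resumes from a suspend so the hardware backend can restore the per-cpu
 * virtualization state.
 */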
static void
vmm_resume(void)
{
	VMM_RESUME();
}

static int
vmm_init(void)
{
	int error;

	vmm_host_state_init();

	vmm_ipinum = vmm_ipi_alloc();
	if (vmm_ipinum == 0)
		vmm_ipinum = IPI_AST;

	error = vmm_mem_init();
	if (error)
		return (error);

	if (vmm_is_intel())
		ops = &vmm_ops_intel;
	else if (vmm_is_amd())
		ops = &vmm_ops_amd;
	else
		return (ENXIO);

	vmm_msr_init();
	vmm_resume_p = vmm_resume;

	return (VMM_INIT(vmm_ipinum));
}

static int
vmm_handler(module_t mod, int what, void *arg)
{
	int error;

	switch (what) {
	case MOD_LOAD:
		vmmdev_init();
		if (ppt_avail_devices() > 0)
			iommu_init();
		error = vmm_init();
		if (error == 0)
			vmm_initialized = 1;
		break;
	case MOD_UNLOAD:
		error = vmmdev_cleanup();
		if (error == 0) {
			vmm_resume_p = NULL;
			iommu_cleanup();
			if (vmm_ipinum != IPI_AST)
				vmm_ipi_free(vmm_ipinum);
			error = VMM_CLEANUP();
			/*
			 * Something bad happened - prevent new
			 * VMs from being created
			 */
			if (error)
				vmm_initialized = 0;
		}
		break;
	default:
		error = 0;
		break;
	}
	return (error);
}

static moduledata_t vmm_kmod = {
	"vmm",
	vmm_handler,
	NULL
};

/*
 * vmm initialization has the following dependencies:
 *
 * - iommu initialization must happen after the pci passthru driver has had
 *   a chance to attach to any passthru devices (after SI_SUB_CONFIGURE).
 *
 * - VT-x initialization requires smp_rendezvous() and therefore must happen
 *   after SMP is fully functional (after SI_SUB_SMP).
 */
DECLARE_MODULE(vmm, vmm_kmod, SI_SUB_SMP + 1, SI_ORDER_ANY);
MODULE_VERSION(vmm, 1);

static void
vm_init(struct vm *vm, bool create)
{
	int i;

	vm->cookie = VMINIT(vm, vmspace_pmap(vm->vmspace));
	vm->iommu = NULL;
	vm->vioapic = vioapic_init(vm);
	vm->vhpet = vhpet_init(vm);
	vm->vatpic = vatpic_init(vm);
	vm->vatpit = vatpit_init(vm);

	CPU_ZERO(&vm->active_cpus);

	vm->suspend = 0;
	CPU_ZERO(&vm->suspended_cpus);

	for (i = 0; i < VM_MAXCPU; i++)
		vcpu_init(vm, i, create);
}

int
vm_create(const char *name, struct vm **retvm)
{
	struct vm *vm;
	struct vmspace *vmspace;

	/*
	 * If vmm.ko could not be successfully initialized then don't attempt
	 * to create the virtual machine.
	 */
	if (!vmm_initialized)
		return (ENXIO);

	if (name == NULL || strlen(name) >= VM_MAX_NAMELEN)
		return (EINVAL);

	vmspace = VMSPACE_ALLOC(VM_MIN_ADDRESS, VM_MAXUSER_ADDRESS);
	if (vmspace == NULL)
		return (ENOMEM);

	vm = malloc(sizeof(struct vm), M_VM, M_WAITOK | M_ZERO);
	strcpy(vm->name, name);
	vm->num_mem_segs = 0;
	vm->vmspace = vmspace;
	mtx_init(&vm->rendezvous_mtx, "vm rendezvous lock", 0, MTX_DEF);

	vm_init(vm, true);

	*retvm = vm;
	return (0);
}

static void
vm_free_mem_seg(struct vm *vm, struct mem_seg *seg)
{

	if (seg->object != NULL)
		vmm_mem_free(vm->vmspace, seg->gpa, seg->len);

	bzero(seg, sizeof(*seg));
}

static void
vm_cleanup(struct vm *vm, bool destroy)
{
	int i;

	ppt_unassign_all(vm);

	if (vm->iommu != NULL)
		iommu_destroy_domain(vm->iommu);

	vatpit_cleanup(vm->vatpit);
	vhpet_cleanup(vm->vhpet);
	vatpic_cleanup(vm->vatpic);
	vioapic_cleanup(vm->vioapic);

	for (i = 0; i < VM_MAXCPU; i++)
		vcpu_cleanup(vm, i, destroy);

	VMCLEANUP(vm->cookie);

	if (destroy) {
		for (i = 0; i < vm->num_mem_segs; i++)
			vm_free_mem_seg(vm, &vm->mem_segs[i]);

		vm->num_mem_segs = 0;

		VMSPACE_FREE(vm->vmspace);
		vm->vmspace = NULL;
	}
}

void
vm_destroy(struct vm *vm)
{
	vm_cleanup(vm, true);
	free(vm, M_VM);
}

int
vm_reinit(struct vm *vm)
{
	int error;

	/*
	 * A virtual machine can be reset only if all vcpus are suspended.
	 */
	if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) {
		vm_cleanup(vm, false);
		vm_init(vm, false);
		error = 0;
	} else {
		error = EBUSY;
	}

	return (error);
}

const char *
vm_name(struct vm *vm)
{
	return (vm->name);
}

int
vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
{
	vm_object_t obj;

	if ((obj = vmm_mmio_alloc(vm->vmspace, gpa, len, hpa)) == NULL)
		return (ENOMEM);
	else
		return (0);
}

int
vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len)
{

	vmm_mmio_free(vm->vmspace, gpa, len);
	return (0);
}

boolean_t
vm_mem_allocated(struct vm *vm, vm_paddr_t gpa)
{
	int i;
	vm_paddr_t gpabase, gpalimit;

	for (i = 0; i < vm->num_mem_segs; i++) {
		gpabase = vm->mem_segs[i].gpa;
		gpalimit = gpabase + vm->mem_segs[i].len;
		if (gpa >= gpabase && gpa < gpalimit)
			return (TRUE);		/* 'gpa' is regular memory */
	}

	if (ppt_is_mmio(vm, gpa))
		return (TRUE);			/* 'gpa' is pci passthru mmio */

	return (FALSE);
}

int
vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len)
{
	int available, allocated;
	struct mem_seg *seg;
	vm_object_t object;
	vm_paddr_t g;

	if ((gpa & PAGE_MASK) || (len & PAGE_MASK) || len == 0)
		return (EINVAL);

	available = allocated = 0;
	g = gpa;
	while (g < gpa + len) {
		if (vm_mem_allocated(vm, g))
			allocated++;
		else
			available++;

		g += PAGE_SIZE;
	}

	/*
	 * If there are some allocated and some available pages in the address
	 * range then it is an error.
	 */
	if (allocated && available)
		return (EINVAL);

	/*
	 * If the entire address range being requested has already been
	 * allocated then there isn't anything more to do.
	 */
	if (allocated && available == 0)
		return (0);

	if (vm->num_mem_segs >= VM_MAX_MEMORY_SEGMENTS)
		return (E2BIG);

	seg = &vm->mem_segs[vm->num_mem_segs];

	if ((object = vmm_mem_alloc(vm->vmspace, gpa, len)) == NULL)
		return (ENOMEM);

	seg->gpa = gpa;
	seg->len = len;
	seg->object = object;
	seg->wired = FALSE;

	vm->num_mem_segs++;

	return (0);
}

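/*
 * Return the first guest-physical address beyond the highest allocated
 * memory segment; used below to size the iommu domain for passthru devices.
 */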
static vm_paddr_t
vm_maxmem(struct vm *vm)
{
	int i;
	vm_paddr_t gpa, maxmem;

	maxmem = 0;
	for (i = 0; i < vm->num_mem_segs; i++) {
		gpa = vm->mem_segs[i].gpa + vm->mem_segs[i].len;
		if (gpa > maxmem)
			maxmem = gpa;
	}
	return (maxmem);
}

static void
vm_gpa_unwire(struct vm *vm)
{
	int i, rv;
	struct mem_seg *seg;

	for (i = 0; i < vm->num_mem_segs; i++) {
		seg = &vm->mem_segs[i];
		if (!seg->wired)
			continue;

		rv = vm_map_unwire(&vm->vmspace->vm_map,
				   seg->gpa, seg->gpa + seg->len,
				   VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
		KASSERT(rv == KERN_SUCCESS, ("vm(%s) memory segment "
		    "%#lx/%ld could not be unwired: %d",
		    vm_name(vm), seg->gpa, seg->len, rv));

		seg->wired = FALSE;
	}
}

static int
vm_gpa_wire(struct vm *vm)
{
	int i, rv;
	struct mem_seg *seg;

	for (i = 0; i < vm->num_mem_segs; i++) {
		seg = &vm->mem_segs[i];
		if (seg->wired)
			continue;

		/* XXX rlimits? */
		rv = vm_map_wire(&vm->vmspace->vm_map,
				 seg->gpa, seg->gpa + seg->len,
				 VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
		if (rv != KERN_SUCCESS)
			break;

		seg->wired = TRUE;
	}

	if (i < vm->num_mem_segs) {
		/*
		 * Undo the wiring before returning an error.
		 */
		vm_gpa_unwire(vm);
		return (EAGAIN);
	}

	return (0);
}

static void
vm_iommu_modify(struct vm *vm, boolean_t map)
{
	int i, sz;
	vm_paddr_t gpa, hpa;
	struct mem_seg *seg;
	void *vp, *cookie, *host_domain;

	sz = PAGE_SIZE;
	host_domain = iommu_host_domain();

	for (i = 0; i < vm->num_mem_segs; i++) {
		seg = &vm->mem_segs[i];
		KASSERT(seg->wired, ("vm(%s) memory segment %#lx/%ld not wired",
		    vm_name(vm), seg->gpa, seg->len));

		gpa = seg->gpa;
		while (gpa < seg->gpa + seg->len) {
			vp = vm_gpa_hold(vm, gpa, PAGE_SIZE, VM_PROT_WRITE,
					 &cookie);
			KASSERT(vp != NULL, ("vm(%s) could not map gpa %#lx",
			    vm_name(vm), gpa));

			vm_gpa_release(cookie);

			hpa = DMAP_TO_PHYS((uintptr_t)vp);
			if (map) {
				iommu_create_mapping(vm->iommu, gpa, hpa, sz);
				iommu_remove_mapping(host_domain, hpa, sz);
			} else {
				iommu_remove_mapping(vm->iommu, gpa, sz);
				iommu_create_mapping(host_domain, hpa, hpa, sz);
			}

			gpa += PAGE_SIZE;
		}
	}

	/*
	 * Invalidate the cached translations associated with the domain
	 * from which pages were removed.
	 */
	if (map)
		iommu_invalidate_tlb(host_domain);
	else
		iommu_invalidate_tlb(vm->iommu);
}

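/*
 * Mapping moves each wired page of guest memory from the host's iommu
 * domain into the VM's domain (gpa -> hpa); unmapping reverses this and
 * restores the host's identity (hpa -> hpa) mapping.
 */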
#define	vm_iommu_unmap(vm)	vm_iommu_modify((vm), FALSE)
#define	vm_iommu_map(vm)	vm_iommu_modify((vm), TRUE)

int
vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func)
{
	int error;

	error = ppt_unassign_device(vm, bus, slot, func);
	if (error)
		return (error);

	if (ppt_assigned_devices(vm) == 0) {
		vm_iommu_unmap(vm);
		vm_gpa_unwire(vm);
	}
	return (0);
}

int
vm_assign_pptdev(struct vm *vm, int bus, int slot, int func)
{
	int error;
	vm_paddr_t maxaddr;

	/*
	 * Virtual machines with pci passthru devices get special treatment:
	 * - the guest physical memory is wired
	 * - the iommu is programmed to do the 'gpa' to 'hpa' translation
	 *
	 * We need to do this before the first pci passthru device is attached.
	 */
	if (ppt_assigned_devices(vm) == 0) {
		KASSERT(vm->iommu == NULL,
		    ("vm_assign_pptdev: iommu must be NULL"));
		maxaddr = vm_maxmem(vm);
		vm->iommu = iommu_create_domain(maxaddr);

		error = vm_gpa_wire(vm);
		if (error)
			return (error);

		vm_iommu_map(vm);
	}

	error = ppt_assign_device(vm, bus, slot, func);
	return (error);
}

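/*
 * Hold a single page of guest memory and return a host (direct map)
 * pointer to it. A sketch of the typical calling pattern:
 *
 *	void *cookie, *ptr;
 *	ptr = vm_gpa_hold(vm, gpa, len, VM_PROT_READ, &cookie);
 *	if (ptr != NULL) {
 *		... access up to 'len' bytes at 'ptr' ...
 *		vm_gpa_release(cookie);
 *	}
 *
 * 'gpa + len' must not cross a page boundary.
 */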
void *
vm_gpa_hold(struct vm *vm, vm_paddr_t gpa, size_t len, int reqprot,
	    void **cookie)
{
	int count, pageoff;
	vm_page_t m;

	pageoff = gpa & PAGE_MASK;
	if (len > PAGE_SIZE - pageoff)
		panic("vm_gpa_hold: invalid gpa/len: 0x%016lx/%lu", gpa, len);

	count = vm_fault_quick_hold_pages(&vm->vmspace->vm_map,
	    trunc_page(gpa), PAGE_SIZE, reqprot, &m, 1);

	if (count == 1) {
		*cookie = m;
		return ((void *)(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)) + pageoff));
	} else {
		*cookie = NULL;
		return (NULL);
	}
}

void
vm_gpa_release(void *cookie)
{
	vm_page_t m = cookie;

	vm_page_lock(m);
	vm_page_unhold(m);
	vm_page_unlock(m);
}

int
vm_gpabase2memseg(struct vm *vm, vm_paddr_t gpabase,
		  struct vm_memory_segment *seg)
{
	int i;

	for (i = 0; i < vm->num_mem_segs; i++) {
		if (gpabase == vm->mem_segs[i].gpa) {
			seg->gpa = vm->mem_segs[i].gpa;
			seg->len = vm->mem_segs[i].len;
			seg->wired = vm->mem_segs[i].wired;
			return (0);
		}
	}
	return (-1);
}

int
vm_get_memobj(struct vm *vm, vm_paddr_t gpa, size_t len,
	      vm_offset_t *offset, struct vm_object **object)
{
	int i;
	size_t seg_len;
	vm_paddr_t seg_gpa;
	vm_object_t seg_obj;

	for (i = 0; i < vm->num_mem_segs; i++) {
		if ((seg_obj = vm->mem_segs[i].object) == NULL)
			continue;

		seg_gpa = vm->mem_segs[i].gpa;
		seg_len = vm->mem_segs[i].len;

		if (gpa >= seg_gpa && gpa < seg_gpa + seg_len) {
			*offset = gpa - seg_gpa;
			*object = seg_obj;
			vm_object_reference(seg_obj);
			return (0);
		}
	}

	return (EINVAL);
}

int
vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval)
{

	if (vcpu < 0 || vcpu >= VM_MAXCPU)
		return (EINVAL);

	if (reg >= VM_REG_LAST)
		return (EINVAL);

	return (VMGETREG(vm->cookie, vcpu, reg, retval));
}

int
vm_set_register(struct vm *vm, int vcpu, int reg, uint64_t val)
{

	if (vcpu < 0 || vcpu >= VM_MAXCPU)
		return (EINVAL);

	if (reg >= VM_REG_LAST)
		return (EINVAL);

	return (VMSETREG(vm->cookie, vcpu, reg, val));
}

static boolean_t
is_descriptor_table(int reg)
{

	switch (reg) {
	case VM_REG_GUEST_IDTR:
	case VM_REG_GUEST_GDTR:
		return (TRUE);
	default:
		return (FALSE);
	}
}

static boolean_t
is_segment_register(int reg)
{

	switch (reg) {
	case VM_REG_GUEST_ES:
	case VM_REG_GUEST_CS:
	case VM_REG_GUEST_SS:
	case VM_REG_GUEST_DS:
	case VM_REG_GUEST_FS:
	case VM_REG_GUEST_GS:
	case VM_REG_GUEST_TR:
	case VM_REG_GUEST_LDTR:
		return (TRUE);
	default:
		return (FALSE);
	}
}

int
vm_get_seg_desc(struct vm *vm, int vcpu, int reg,
		struct seg_desc *desc)
{

	if (vcpu < 0 || vcpu >= VM_MAXCPU)
		return (EINVAL);

	if (!is_segment_register(reg) && !is_descriptor_table(reg))
		return (EINVAL);

	return (VMGETDESC(vm->cookie, vcpu, reg, desc));
}

int
vm_set_seg_desc(struct vm *vm, int vcpu, int reg,
		struct seg_desc *desc)
{
	if (vcpu < 0 || vcpu >= VM_MAXCPU)
		return (EINVAL);

	if (!is_segment_register(reg) && !is_descriptor_table(reg))
		return (EINVAL);

	return (VMSETDESC(vm->cookie, vcpu, reg, desc));
}

static void
restore_guest_fpustate(struct vcpu *vcpu)
{

	/* flush host state to the pcb */
	fpuexit(curthread);

	/* restore guest FPU state */
	fpu_stop_emulating();
	fpurestore(vcpu->guestfpu);

	/* restore guest XCR0 if XSAVE is enabled in the host */
	if (rcr4() & CR4_XSAVE)
		load_xcr(0, vcpu->guest_xcr0);

	/*
	 * The FPU is now "dirty" with the guest's state so turn on emulation
	 * to trap any access to the FPU by the host.
	 */
	fpu_start_emulating();
}

static void
save_guest_fpustate(struct vcpu *vcpu)
{

	if ((rcr0() & CR0_TS) == 0)
		panic("fpu emulation not enabled in host!");

	/* save guest XCR0 and restore host XCR0 */
	if (rcr4() & CR4_XSAVE) {
		vcpu->guest_xcr0 = rxcr(0);
		load_xcr(0, vmm_get_host_xcr0());
	}

	/* save guest FPU state */
	fpu_stop_emulating();
	fpusave(vcpu->guestfpu);
	fpu_start_emulating();
}

static VMM_STAT(VCPU_IDLE_TICKS, "number of ticks vcpu was idle");

static int
vcpu_set_state_locked(struct vcpu *vcpu, enum vcpu_state newstate,
    bool from_idle)
{
	int error;

	vcpu_assert_locked(vcpu);

	/*
	 * State transitions from the vmmdev_ioctl() must always begin from
	 * the VCPU_IDLE state. This guarantees that there is only a single
	 * ioctl() operating on a vcpu at any point.
	 */
	if (from_idle) {
		while (vcpu->state != VCPU_IDLE)
			msleep_spin(&vcpu->state, &vcpu->mtx, "vmstat", hz);
	} else {
		KASSERT(vcpu->state != VCPU_IDLE, ("invalid transition from "
		    "vcpu idle state"));
	}

	if (vcpu->state == VCPU_RUNNING) {
		KASSERT(vcpu->hostcpu == curcpu, ("curcpu %d and hostcpu %d "
		    "mismatch for running vcpu", curcpu, vcpu->hostcpu));
	} else {
		KASSERT(vcpu->hostcpu == NOCPU, ("Invalid hostcpu %d for a "
		    "vcpu that is not running", vcpu->hostcpu));
	}

	/*
	 * The following state transitions are allowed:
	 * IDLE -> FROZEN -> IDLE
	 * FROZEN -> RUNNING -> FROZEN
	 * FROZEN -> SLEEPING -> FROZEN
	 */
	switch (vcpu->state) {
	case VCPU_IDLE:
	case VCPU_RUNNING:
	case VCPU_SLEEPING:
		error = (newstate != VCPU_FROZEN);
		break;
	case VCPU_FROZEN:
		error = (newstate == VCPU_FROZEN);
		break;
	default:
		error = 1;
		break;
	}

	if (error)
		return (EBUSY);

	vcpu->state = newstate;
	if (newstate == VCPU_RUNNING)
		vcpu->hostcpu = curcpu;
	else
		vcpu->hostcpu = NOCPU;

	if (newstate == VCPU_IDLE)
		wakeup(&vcpu->state);

	return (0);
}

static void
vcpu_require_state(struct vm *vm, int vcpuid, enum vcpu_state newstate)
{
	int error;

	if ((error = vcpu_set_state(vm, vcpuid, newstate, false)) != 0)
		panic("Error %d setting state to %d\n", error, newstate);
}

static void
vcpu_require_state_locked(struct vcpu *vcpu, enum vcpu_state newstate)
{
	int error;

	if ((error = vcpu_set_state_locked(vcpu, newstate, false)) != 0)
		panic("Error %d setting state to %d", error, newstate);
}

static void
vm_set_rendezvous_func(struct vm *vm, vm_rendezvous_func_t func)
{

	KASSERT(mtx_owned(&vm->rendezvous_mtx), ("rendezvous_mtx not locked"));

	/*
	 * Update 'rendezvous_func' and execute a write memory barrier to
	 * ensure that it is visible across all host cpus. This is not needed
	 * for correctness but it does ensure that all the vcpus will notice
	 * that the rendezvous is requested immediately.
	 */
	vm->rendezvous_func = func;
	wmb();
}

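/*
 * Log a tracepoint against the vcpu if one is specified, otherwise
 * against the VM as a whole.
 */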
1045266339Sjhb#define	RENDEZVOUS_CTR0(vm, vcpuid, fmt)				\
1046266339Sjhb	do {								\
1047266339Sjhb		if (vcpuid >= 0)					\
1048266339Sjhb			VCPU_CTR0(vm, vcpuid, fmt);			\
1049266339Sjhb		else							\
1050266339Sjhb			VM_CTR0(vm, fmt);				\
1051266339Sjhb	} while (0)
1052266339Sjhb
1053266339Sjhbstatic void
1054266339Sjhbvm_handle_rendezvous(struct vm *vm, int vcpuid)
1055266339Sjhb{
1056266339Sjhb
1057266339Sjhb	KASSERT(vcpuid == -1 || (vcpuid >= 0 && vcpuid < VM_MAXCPU),
1058266339Sjhb	    ("vm_handle_rendezvous: invalid vcpuid %d", vcpuid));
1059266339Sjhb
1060266339Sjhb	mtx_lock(&vm->rendezvous_mtx);
1061266339Sjhb	while (vm->rendezvous_func != NULL) {
1062266339Sjhb		/* 'rendezvous_req_cpus' must be a subset of 'active_cpus' */
1063266339Sjhb		CPU_AND(&vm->rendezvous_req_cpus, &vm->active_cpus);
1064266339Sjhb
1065266339Sjhb		if (vcpuid != -1 &&
1066266339Sjhb		    CPU_ISSET(vcpuid, &vm->rendezvous_req_cpus) &&
1067266339Sjhb		    !CPU_ISSET(vcpuid, &vm->rendezvous_done_cpus)) {
1068266339Sjhb			VCPU_CTR0(vm, vcpuid, "Calling rendezvous func");
1069266339Sjhb			(*vm->rendezvous_func)(vm, vcpuid, vm->rendezvous_arg);
1070266339Sjhb			CPU_SET(vcpuid, &vm->rendezvous_done_cpus);
1071266339Sjhb		}
1072266339Sjhb		if (CPU_CMP(&vm->rendezvous_req_cpus,
1073266339Sjhb		    &vm->rendezvous_done_cpus) == 0) {
1074266339Sjhb			VCPU_CTR0(vm, vcpuid, "Rendezvous completed");
1075266339Sjhb			vm_set_rendezvous_func(vm, NULL);
1076266339Sjhb			wakeup(&vm->rendezvous_func);
1077266339Sjhb			break;
1078266339Sjhb		}
1079266339Sjhb		RENDEZVOUS_CTR0(vm, vcpuid, "Wait for rendezvous completion");
1080266339Sjhb		mtx_sleep(&vm->rendezvous_func, &vm->rendezvous_mtx, 0,
1081266339Sjhb		    "vmrndv", 0);
1082266339Sjhb	}
1083266339Sjhb	mtx_unlock(&vm->rendezvous_mtx);
1084266339Sjhb}
1085266339Sjhb
1086256072Sneel/*
1087256072Sneel * Emulate a guest 'hlt' by sleeping until the vcpu is ready to run.
1088256072Sneel */
1089256072Sneelstatic int
1090262350Sjhbvm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled, bool *retu)
1091256072Sneel{
1092256072Sneel	struct vcpu *vcpu;
1093268935Sjhb	const char *wmesg;
1094268935Sjhb	int t, vcpu_halted, vm_halted;
1095256072Sneel
1096268935Sjhb	KASSERT(!CPU_ISSET(vcpuid, &vm->halted_cpus), ("vcpu already halted"));
1097268935Sjhb
1098256072Sneel	vcpu = &vm->vcpu[vcpuid];
1099268935Sjhb	vcpu_halted = 0;
1100268935Sjhb	vm_halted = 0;
1101256072Sneel
1102256072Sneel	vcpu_lock(vcpu);
1103268935Sjhb	while (1) {
1104268935Sjhb		/*
1105268935Sjhb		 * Do a final check for pending NMI or interrupts before
1106268935Sjhb		 * really putting this thread to sleep. Also check for
1107268935Sjhb		 * software events that would cause this vcpu to wakeup.
1108268935Sjhb		 *
1109268935Sjhb		 * These interrupts/events could have happened after the
1110268935Sjhb		 * vcpu returned from VMRUN() and before it acquired the
1111268935Sjhb		 * vcpu lock above.
1112268935Sjhb		 */
1113268935Sjhb		if (vm->rendezvous_func != NULL || vm->suspend)
1114268935Sjhb			break;
1115268935Sjhb		if (vm_nmi_pending(vm, vcpuid))
1116268935Sjhb			break;
1117268935Sjhb		if (!intr_disabled) {
1118268935Sjhb			if (vm_extint_pending(vm, vcpuid) ||
1119268935Sjhb			    vlapic_pending_intr(vcpu->vlapic, NULL)) {
1120268935Sjhb				break;
1121268935Sjhb			}
1122268935Sjhb		}
1123256072Sneel
1124270159Sgrehan		/* Don't go to sleep if the vcpu thread needs to yield */
1125270159Sgrehan		if (vcpu_should_yield(vm, vcpuid))
1126270159Sgrehan			break;
1127270159Sgrehan
1128268935Sjhb		/*
1129268935Sjhb		 * Some Linux guests implement "halt" by having all vcpus
1130268935Sjhb		 * execute HLT with interrupts disabled. 'halted_cpus' keeps
1131268935Sjhb		 * track of the vcpus that have entered this state. When all
1132268935Sjhb		 * vcpus enter the halted state the virtual machine is halted.
1133268935Sjhb		 */
1134268935Sjhb		if (intr_disabled) {
1135268935Sjhb			wmesg = "vmhalt";
1136268935Sjhb			VCPU_CTR0(vm, vcpuid, "Halted");
1137268935Sjhb			if (!vcpu_halted && halt_detection_enabled) {
1138268935Sjhb				vcpu_halted = 1;
1139268935Sjhb				CPU_SET_ATOMIC(vcpuid, &vm->halted_cpus);
1140268935Sjhb			}
1141268935Sjhb			if (CPU_CMP(&vm->halted_cpus, &vm->active_cpus) == 0) {
1142268935Sjhb				vm_halted = 1;
1143268935Sjhb				break;
1144268935Sjhb			}
1145268935Sjhb		} else {
1146268935Sjhb			wmesg = "vmidle";
1147268935Sjhb		}
1148268935Sjhb
1149256072Sneel		t = ticks;
1150256072Sneel		vcpu_require_state_locked(vcpu, VCPU_SLEEPING);
1151270159Sgrehan		/*
1152270159Sgrehan		 * XXX msleep_spin() cannot be interrupted by signals so
1153270159Sgrehan		 * wake up periodically to check pending signals.
1154270159Sgrehan		 */
1155270159Sgrehan		msleep_spin(vcpu, &vcpu->mtx, wmesg, hz);
1156256072Sneel		vcpu_require_state_locked(vcpu, VCPU_FROZEN);
1157256072Sneel		vmm_stat_incr(vm, vcpuid, VCPU_IDLE_TICKS, ticks - t);
1158256072Sneel	}
1159268935Sjhb
1160268935Sjhb	if (vcpu_halted)
1161268935Sjhb		CPU_CLR_ATOMIC(vcpuid, &vm->halted_cpus);
1162268935Sjhb
1163256072Sneel	vcpu_unlock(vcpu);
1164256072Sneel
1165268935Sjhb	if (vm_halted)
1166268935Sjhb		vm_suspend(vm, VM_SUSPEND_HALT);
1167266339Sjhb
1168256072Sneel	return (0);
1169256072Sneel}
1170256072Sneel
1171256072Sneelstatic int
1172262350Sjhbvm_handle_paging(struct vm *vm, int vcpuid, bool *retu)
1173256072Sneel{
1174256072Sneel	int rv, ftype;
1175256072Sneel	struct vm_map *map;
1176256072Sneel	struct vcpu *vcpu;
1177256072Sneel	struct vm_exit *vme;
1178256072Sneel
1179256072Sneel	vcpu = &vm->vcpu[vcpuid];
1180256072Sneel	vme = &vcpu->exitinfo;
1181256072Sneel
1182256072Sneel	ftype = vme->u.paging.fault_type;
1183256072Sneel	KASSERT(ftype == VM_PROT_READ ||
1184256072Sneel	    ftype == VM_PROT_WRITE || ftype == VM_PROT_EXECUTE,
1185256072Sneel	    ("vm_handle_paging: invalid fault_type %d", ftype));
1186256072Sneel
1187256072Sneel	if (ftype == VM_PROT_READ || ftype == VM_PROT_WRITE) {
1188256072Sneel		rv = pmap_emulate_accessed_dirty(vmspace_pmap(vm->vmspace),
1189256072Sneel		    vme->u.paging.gpa, ftype);
1190256072Sneel		if (rv == 0)
1191256072Sneel			goto done;
1192256072Sneel	}
1193256072Sneel
1194256072Sneel	map = &vm->vmspace->vm_map;
1195256072Sneel	rv = vm_fault(map, vme->u.paging.gpa, ftype, VM_FAULT_NORMAL);
1196256072Sneel
1197261088Sjhb	VCPU_CTR3(vm, vcpuid, "vm_handle_paging rv = %d, gpa = %#lx, "
1198261088Sjhb	    "ftype = %d", rv, vme->u.paging.gpa, ftype);
1199256072Sneel
1200256072Sneel	if (rv != KERN_SUCCESS)
1201256072Sneel		return (EFAULT);
1202256072Sneeldone:
1203256072Sneel	/* restart execution at the faulting instruction */
1204256072Sneel	vme->inst_length = 0;
1205256072Sneel
1206256072Sneel	return (0);
1207256072Sneel}
1208256072Sneel
1209256072Sneelstatic int
1210262350Sjhbvm_handle_inst_emul(struct vm *vm, int vcpuid, bool *retu)
1211256072Sneel{
1212256072Sneel	struct vie *vie;
1213256072Sneel	struct vcpu *vcpu;
1214256072Sneel	struct vm_exit *vme;
1215268976Sjhb	uint64_t gla, gpa;
1216268976Sjhb	struct vm_guest_paging *paging;
1217261088Sjhb	mem_region_read_t mread;
1218261088Sjhb	mem_region_write_t mwrite;
1219270159Sgrehan	enum vm_cpu_mode cpu_mode;
1220270159Sgrehan	int cs_d, error;
1221256072Sneel
1222256072Sneel	vcpu = &vm->vcpu[vcpuid];
1223256072Sneel	vme = &vcpu->exitinfo;
1224256072Sneel
1225256072Sneel	gla = vme->u.inst_emul.gla;
1226256072Sneel	gpa = vme->u.inst_emul.gpa;
1227270159Sgrehan	cs_d = vme->u.inst_emul.cs_d;
1228256072Sneel	vie = &vme->u.inst_emul.vie;
1229268976Sjhb	paging = &vme->u.inst_emul.paging;
1230270159Sgrehan	cpu_mode = paging->cpu_mode;
1231256072Sneel
1232256072Sneel	vie_init(vie);
1233256072Sneel
1234256072Sneel	/* Fetch, decode and emulate the faulting instruction */
1235268976Sjhb	error = vmm_fetch_instruction(vm, vcpuid, paging, vme->rip,
1236268976Sjhb	    vme->inst_length, vie);
1237268976Sjhb	if (error == 1)
1238268976Sjhb		return (0);		/* Resume guest to handle page fault */
1239268976Sjhb	else if (error == -1)
1240256072Sneel		return (EFAULT);
1241268976Sjhb	else if (error != 0)
1242268976Sjhb		panic("%s: vmm_fetch_instruction error %d", __func__, error);
1243256072Sneel
1244270159Sgrehan	if (vmm_decode_instruction(vm, vcpuid, gla, cpu_mode, cs_d, vie) != 0)
1245256072Sneel		return (EFAULT);
1246256072Sneel
1247261088Sjhb	/* return to userland unless this is an in-kernel emulated device */
1248261088Sjhb	if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) {
1249261088Sjhb		mread = lapic_mmio_read;
1250261088Sjhb		mwrite = lapic_mmio_write;
1251261088Sjhb	} else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) {
1252261088Sjhb		mread = vioapic_mmio_read;
1253261088Sjhb		mwrite = vioapic_mmio_write;
1254261088Sjhb	} else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) {
1255261088Sjhb		mread = vhpet_mmio_read;
1256261088Sjhb		mwrite = vhpet_mmio_write;
1257261088Sjhb	} else {
1258262350Sjhb		*retu = true;
1259256072Sneel		return (0);
1260256072Sneel	}
1261256072Sneel
1262270159Sgrehan	error = vmm_emulate_instruction(vm, vcpuid, gpa, vie, paging,
1263270159Sgrehan	    mread, mwrite, retu);
1264256072Sneel
1265256072Sneel	return (error);
1266256072Sneel}
1267256072Sneel
1268268935Sjhbstatic int
1269268935Sjhbvm_handle_suspend(struct vm *vm, int vcpuid, bool *retu)
1270268935Sjhb{
1271268935Sjhb	int i, done;
1272268935Sjhb	struct vcpu *vcpu;
1273268935Sjhb
1274268935Sjhb	done = 0;
1275268935Sjhb	vcpu = &vm->vcpu[vcpuid];
1276268935Sjhb
1277268935Sjhb	CPU_SET_ATOMIC(vcpuid, &vm->suspended_cpus);
1278268935Sjhb
1279268935Sjhb	/*
1280268935Sjhb	 * Wait until all 'active_cpus' have suspended themselves.
1281268935Sjhb	 *
1282268935Sjhb	 * Since a VM may be suspended at any time including when one or
1283268935Sjhb	 * more vcpus are doing a rendezvous we need to call the rendezvous
1284268935Sjhb	 * handler while we are waiting to prevent a deadlock.
1285268935Sjhb	 */
1286268935Sjhb	vcpu_lock(vcpu);
1287268935Sjhb	while (1) {
1288268935Sjhb		if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) {
1289268935Sjhb			VCPU_CTR0(vm, vcpuid, "All vcpus suspended");
1290268935Sjhb			break;
1291268935Sjhb		}
1292268935Sjhb
1293268935Sjhb		if (vm->rendezvous_func == NULL) {
1294268935Sjhb			VCPU_CTR0(vm, vcpuid, "Sleeping during suspend");
1295268935Sjhb			vcpu_require_state_locked(vcpu, VCPU_SLEEPING);
1296268935Sjhb			msleep_spin(vcpu, &vcpu->mtx, "vmsusp", hz);
1297268935Sjhb			vcpu_require_state_locked(vcpu, VCPU_FROZEN);
1298268935Sjhb		} else {
1299268935Sjhb			VCPU_CTR0(vm, vcpuid, "Rendezvous during suspend");
1300268935Sjhb			vcpu_unlock(vcpu);
1301268935Sjhb			vm_handle_rendezvous(vm, vcpuid);
1302268935Sjhb			vcpu_lock(vcpu);
1303268935Sjhb		}
1304268935Sjhb	}
1305268935Sjhb	vcpu_unlock(vcpu);
1306268935Sjhb
1307268935Sjhb	/*
1308268935Sjhb	 * Wakeup the other sleeping vcpus and return to userspace.
1309268935Sjhb	 */
1310268935Sjhb	for (i = 0; i < VM_MAXCPU; i++) {
1311268935Sjhb		if (CPU_ISSET(i, &vm->suspended_cpus)) {
1312268935Sjhb			vcpu_notify_event(vm, i, false);
1313268935Sjhb		}
1314268935Sjhb	}
1315268935Sjhb
1316268935Sjhb	*retu = true;
1317268935Sjhb	return (0);
1318268935Sjhb}
1319268935Sjhb
1320221828Sgrehanint
1321268935Sjhbvm_suspend(struct vm *vm, enum vm_suspend_how how)
1322268935Sjhb{
1323268935Sjhb	int i;
1324268935Sjhb
1325268935Sjhb	if (how <= VM_SUSPEND_NONE || how >= VM_SUSPEND_LAST)
1326268935Sjhb		return (EINVAL);
1327268935Sjhb
1328268935Sjhb	if (atomic_cmpset_int(&vm->suspend, 0, how) == 0) {
1329268935Sjhb		VM_CTR2(vm, "virtual machine already suspended %d/%d",
1330268935Sjhb		    vm->suspend, how);
1331268935Sjhb		return (EALREADY);
1332268935Sjhb	}
1333268935Sjhb
1334268935Sjhb	VM_CTR1(vm, "virtual machine successfully suspended %d", how);
1335268935Sjhb
1336268935Sjhb	/*
1337268935Sjhb	 * Notify all active vcpus that they are now suspended.
1338268935Sjhb	 */
1339268935Sjhb	for (i = 0; i < VM_MAXCPU; i++) {
1340268935Sjhb		if (CPU_ISSET(i, &vm->active_cpus))
1341268935Sjhb			vcpu_notify_event(vm, i, false);
1342268935Sjhb	}
1343268935Sjhb
1344268935Sjhb	return (0);
1345268935Sjhb}
1346268935Sjhb
1347268935Sjhbvoid
1348268935Sjhbvm_exit_suspended(struct vm *vm, int vcpuid, uint64_t rip)
1349268935Sjhb{
1350268935Sjhb	struct vm_exit *vmexit;
1351268935Sjhb
1352268935Sjhb	KASSERT(vm->suspend > VM_SUSPEND_NONE && vm->suspend < VM_SUSPEND_LAST,
1353268935Sjhb	    ("vm_exit_suspended: invalid suspend type %d", vm->suspend));
1354268935Sjhb
1355268935Sjhb	vmexit = vm_exitinfo(vm, vcpuid);
1356268935Sjhb	vmexit->rip = rip;
1357268935Sjhb	vmexit->inst_length = 0;
1358268935Sjhb	vmexit->exitcode = VM_EXITCODE_SUSPENDED;
1359268935Sjhb	vmexit->u.suspended.how = vm->suspend;
1360268935Sjhb}
1361268935Sjhb
1362270074Sgrehanvoid
1363270074Sgrehanvm_exit_rendezvous(struct vm *vm, int vcpuid, uint64_t rip)
1364270074Sgrehan{
1365270074Sgrehan	struct vm_exit *vmexit;
1366270074Sgrehan
1367270074Sgrehan	KASSERT(vm->rendezvous_func != NULL, ("rendezvous not in progress"));
1368270074Sgrehan
1369270074Sgrehan	vmexit = vm_exitinfo(vm, vcpuid);
1370270074Sgrehan	vmexit->rip = rip;
1371270074Sgrehan	vmexit->inst_length = 0;
1372270074Sgrehan	vmexit->exitcode = VM_EXITCODE_RENDEZVOUS;
1373270074Sgrehan	vmm_stat_incr(vm, vcpuid, VMEXIT_RENDEZVOUS, 1);
1374270074Sgrehan}
1375270074Sgrehan
1376270074Sgrehanvoid
1377270074Sgrehanvm_exit_astpending(struct vm *vm, int vcpuid, uint64_t rip)
1378270074Sgrehan{
1379270074Sgrehan	struct vm_exit *vmexit;
1380270074Sgrehan
1381270074Sgrehan	vmexit = vm_exitinfo(vm, vcpuid);
1382270074Sgrehan	vmexit->rip = rip;
1383270074Sgrehan	vmexit->inst_length = 0;
1384270074Sgrehan	vmexit->exitcode = VM_EXITCODE_BOGUS;
1385270074Sgrehan	vmm_stat_incr(vm, vcpuid, VMEXIT_ASTPENDING, 1);
1386270074Sgrehan}
1387270074Sgrehan
1388268935Sjhbint
1389221828Sgrehanvm_run(struct vm *vm, struct vm_run *vmrun)
1390221828Sgrehan{
1391256072Sneel	int error, vcpuid;
	struct vcpu *vcpu;
	struct pcb *pcb;
	uint64_t tscval, rip;
	struct vm_exit *vme;
	bool retu, intr_disabled;
	pmap_t pmap;
	void *rptr, *sptr;

	vcpuid = vmrun->cpuid;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		return (EINVAL);

	if (!CPU_ISSET(vcpuid, &vm->active_cpus))
		return (EINVAL);

	if (CPU_ISSET(vcpuid, &vm->suspended_cpus))
		return (EINVAL);

	rptr = &vm->rendezvous_func;
	sptr = &vm->suspend;
	pmap = vmspace_pmap(vm->vmspace);
	vcpu = &vm->vcpu[vcpuid];
	vme = &vcpu->exitinfo;
	rip = vmrun->rip;
restart:
	critical_enter();

	KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active),
	    ("vm_run: absurd pm_active"));

	tscval = rdtsc();

	pcb = PCPU_GET(curpcb);
	set_pcb_flags(pcb, PCB_FULL_IRET);

	restore_guest_msrs(vm, vcpuid);
	restore_guest_fpustate(vcpu);

	vcpu_require_state(vm, vcpuid, VCPU_RUNNING);
	error = VMRUN(vm->cookie, vcpuid, rip, pmap, rptr, sptr);
	vcpu_require_state(vm, vcpuid, VCPU_FROZEN);

	save_guest_fpustate(vcpu);
	restore_host_msrs(vm, vcpuid);

	vmm_stat_incr(vm, vcpuid, VCPU_TOTAL_RUNTIME, rdtsc() - tscval);

	critical_exit();

	if (error == 0) {
		retu = false;
		switch (vme->exitcode) {
		case VM_EXITCODE_SUSPENDED:
			error = vm_handle_suspend(vm, vcpuid, &retu);
			break;
		case VM_EXITCODE_IOAPIC_EOI:
			vioapic_process_eoi(vm, vcpuid,
			    vme->u.ioapic_eoi.vector);
			break;
		case VM_EXITCODE_RENDEZVOUS:
			vm_handle_rendezvous(vm, vcpuid);
			error = 0;
			break;
		case VM_EXITCODE_HLT:
			intr_disabled = ((vme->u.hlt.rflags & PSL_I) == 0);
			error = vm_handle_hlt(vm, vcpuid, intr_disabled, &retu);
			break;
		case VM_EXITCODE_PAGING:
			error = vm_handle_paging(vm, vcpuid, &retu);
			break;
		case VM_EXITCODE_INST_EMUL:
			error = vm_handle_inst_emul(vm, vcpuid, &retu);
			break;
		case VM_EXITCODE_INOUT:
		case VM_EXITCODE_INOUT_STR:
			error = vm_handle_inout(vm, vcpuid, vme, &retu);
			break;
		default:
			retu = true;	/* handled in userland */
			break;
		}
	}

	if (error == 0 && retu == false) {
		rip = vme->rip + vme->inst_length;
		goto restart;
	}

	/* copy the exit information */
	bcopy(vme, &vmrun->vm_exit, sizeof(struct vm_exit));
	return (error);
}
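
/*
 * Editor's note: an illustrative sketch of the contract above, not code
 * from this file. A userland monitor drives vm_run() through the VM_RUN
 * ioctl and only ever sees the exits that could not be retired in the
 * kernel (the 'retu = true' cases); all other exits restart the guest
 * transparently at 'rip + inst_length'. 'vmfd' and the dispatch body
 * are assumptions:
 *
 *	struct vm_run vmrun;
 *
 *	bzero(&vmrun, sizeof(vmrun));
 *	vmrun.cpuid = vcpuid;
 *	vmrun.rip = rip;
 *	while (ioctl(vmfd, VM_RUN, &vmrun) == 0) {
 *		switch (vmrun.vm_exit.exitcode) {
 *		case VM_EXITCODE_INOUT:
 *			(emulate the port access, then loop)
 *			break;
 *		...
 *		}
 *	}
 */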

int
vm_exit_intinfo(struct vm *vm, int vcpuid, uint64_t info)
{
	struct vcpu *vcpu;
	int type, vector;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		return (EINVAL);

	vcpu = &vm->vcpu[vcpuid];

	if (info & VM_INTINFO_VALID) {
		type = info & VM_INTINFO_TYPE;
		vector = info & 0xff;
		if (type == VM_INTINFO_NMI && vector != IDT_NMI)
			return (EINVAL);
		if (type == VM_INTINFO_HWEXCEPTION && vector >= 32)
			return (EINVAL);
		if (info & VM_INTINFO_RSVD)
			return (EINVAL);
	} else {
		info = 0;
	}
	VCPU_CTR2(vm, vcpuid, "%s: info1(%#lx)", __func__, info);
	vcpu->exitintinfo = info;
	return (0);
}
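
/*
 * Editor's note: an illustrative example of the encoding checked above,
 * derived from how this file packs the field: vector in bits 7:0, type
 * in VM_INTINFO_TYPE, an optional error code in bits 63:32 gated by
 * VM_INTINFO_DEL_ERRCODE, and VM_INTINFO_VALID as the valid bit. A #GP
 * with a zero error code that was mid-delivery at exit time would be
 * recorded as:
 *
 *	uint64_t info;
 *
 *	info = IDT_GP | VM_INTINFO_HWEXCEPTION | VM_INTINFO_DEL_ERRCODE |
 *	    VM_INTINFO_VALID;
 *	error = vm_exit_intinfo(vm, vcpuid, info);
 */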

enum exc_class {
	EXC_BENIGN,
	EXC_CONTRIBUTORY,
	EXC_PAGEFAULT
};

#define	IDT_VE	20	/* Virtualization Exception (Intel specific) */

static enum exc_class
exception_class(uint64_t info)
{
	int type, vector;

	KASSERT(info & VM_INTINFO_VALID, ("intinfo must be valid: %#lx", info));
	type = info & VM_INTINFO_TYPE;
	vector = info & 0xff;

	/* Table 6-4, "Interrupt and Exception Classes", Intel SDM, Vol 3 */
	switch (type) {
	case VM_INTINFO_HWINTR:
	case VM_INTINFO_SWINTR:
	case VM_INTINFO_NMI:
		return (EXC_BENIGN);
	default:
		/*
		 * Hardware exception.
		 *
		 * SVM and VT-x use identical type values to represent NMI,
		 * hardware interrupt and software interrupt.
		 *
		 * SVM uses type '3' for all exceptions. VT-x uses type '3'
		 * for exceptions except #BP and #OF, which use a type value
		 * of '5' or '6'. Therefore we don't check for explicit
		 * values of 'type' to classify 'intinfo' into a hardware
		 * exception.
		 */
		break;
	}

	switch (vector) {
	case IDT_PF:
	case IDT_VE:
		return (EXC_PAGEFAULT);
	case IDT_DE:
	case IDT_TS:
	case IDT_NP:
	case IDT_SS:
	case IDT_GP:
		return (EXC_CONTRIBUTORY);
	default:
		return (EXC_BENIGN);
	}
}

static int
nested_fault(struct vm *vm, int vcpuid, uint64_t info1, uint64_t info2,
    uint64_t *retinfo)
{
	enum exc_class exc1, exc2;
	int type1, vector1;

	KASSERT(info1 & VM_INTINFO_VALID, ("info1 %#lx is not valid", info1));
	KASSERT(info2 & VM_INTINFO_VALID, ("info2 %#lx is not valid", info2));

	/*
	 * If an exception occurs while attempting to call the double-fault
	 * handler, the processor enters shutdown mode (aka triple fault).
	 */
	type1 = info1 & VM_INTINFO_TYPE;
	vector1 = info1 & 0xff;
	if (type1 == VM_INTINFO_HWEXCEPTION && vector1 == IDT_DF) {
		VCPU_CTR2(vm, vcpuid, "triple fault: info1(%#lx), info2(%#lx)",
		    info1, info2);
		vm_suspend(vm, VM_SUSPEND_TRIPLEFAULT);
		*retinfo = 0;
		return (0);
	}

	/*
	 * Table 6-5 "Conditions for Generating a Double Fault", Intel SDM,
	 * Vol 3.
	 */
	exc1 = exception_class(info1);
	exc2 = exception_class(info2);
	if ((exc1 == EXC_CONTRIBUTORY && exc2 == EXC_CONTRIBUTORY) ||
	    (exc1 == EXC_PAGEFAULT && exc2 != EXC_BENIGN)) {
		/* Convert nested fault into a double fault. */
		*retinfo = IDT_DF;
		*retinfo |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION;
		*retinfo |= VM_INTINFO_DEL_ERRCODE;
	} else {
		/* Handle exceptions serially */
		*retinfo = info2;
	}
	return (1);
}
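
/*
 * Editor's note: a worked example of the classification above. If a #PF
 * (EXC_PAGEFAULT) was being delivered when a #GP (EXC_CONTRIBUTORY) was
 * raised, the second condition matches and the pair collapses into a #DF
 * with a zero error code. If the second event were instead a hardware
 * interrupt (EXC_BENIGN), neither condition matches and only 'info2' is
 * returned, per the serial case.
 */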

static uint64_t
vcpu_exception_intinfo(struct vcpu *vcpu)
{
	uint64_t info = 0;

	if (vcpu->exception_pending) {
		info = vcpu->exception.vector & 0xff;
		info |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION;
		if (vcpu->exception.error_code_valid) {
			info |= VM_INTINFO_DEL_ERRCODE;
			info |= (uint64_t)vcpu->exception.error_code << 32;
		}
	}
	return (info);
}

int
vm_entry_intinfo(struct vm *vm, int vcpuid, uint64_t *retinfo)
{
	struct vcpu *vcpu;
	uint64_t info1, info2;
	int valid;

	KASSERT(vcpuid >= 0 && vcpuid < VM_MAXCPU, ("invalid vcpu %d", vcpuid));

	vcpu = &vm->vcpu[vcpuid];

	info1 = vcpu->exitintinfo;
	vcpu->exitintinfo = 0;

	info2 = 0;
	if (vcpu->exception_pending) {
		info2 = vcpu_exception_intinfo(vcpu);
		vcpu->exception_pending = 0;
		VCPU_CTR2(vm, vcpuid, "Exception %d delivered: %#lx",
		    vcpu->exception.vector, info2);
	}

	if ((info1 & VM_INTINFO_VALID) && (info2 & VM_INTINFO_VALID)) {
		valid = nested_fault(vm, vcpuid, info1, info2, retinfo);
	} else if (info1 & VM_INTINFO_VALID) {
		*retinfo = info1;
		valid = 1;
	} else if (info2 & VM_INTINFO_VALID) {
		*retinfo = info2;
		valid = 1;
	} else {
		valid = 0;
	}

	if (valid) {
		VCPU_CTR4(vm, vcpuid, "%s: info1(%#lx), info2(%#lx), "
		    "retinfo(%#lx)", __func__, info1, info2, *retinfo);
	}

	return (valid);
}
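
/*
 * Editor's note: an illustrative sketch, not code from this file, of how
 * a hardware backend would consume the function above on its VM-entry
 * path (the destination field named here is an assumption and differs
 * between VT-x and SVM):
 *
 *	uint64_t intinfo;
 *
 *	if (vm_entry_intinfo(vm, vcpuid, &intinfo)) {
 *		(translate 'intinfo' into the VMCS entry interruption-
 *		 information field, or the VMCB EVENTINJ field, before
 *		 resuming the guest)
 *	}
 */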

int
vm_get_intinfo(struct vm *vm, int vcpuid, uint64_t *info1, uint64_t *info2)
{
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		return (EINVAL);

	vcpu = &vm->vcpu[vcpuid];
	*info1 = vcpu->exitintinfo;
	*info2 = vcpu_exception_intinfo(vcpu);
	return (0);
}

int
vm_inject_exception(struct vm *vm, int vcpuid, struct vm_exception *exception)
{
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		return (EINVAL);

	if (exception->vector < 0 || exception->vector >= 32)
		return (EINVAL);

	/*
	 * A double fault exception should never be injected directly into
	 * the guest. It is a derived exception that results from specific
	 * combinations of nested faults.
	 */
	if (exception->vector == IDT_DF)
		return (EINVAL);

	vcpu = &vm->vcpu[vcpuid];

	if (vcpu->exception_pending) {
		VCPU_CTR2(vm, vcpuid, "Unable to inject exception %d due to "
		    "pending exception %d", exception->vector,
		    vcpu->exception.vector);
		return (EBUSY);
	}

	vcpu->exception_pending = 1;
	vcpu->exception = *exception;
	VCPU_CTR1(vm, vcpuid, "Exception %d pending", exception->vector);
	return (0);
}
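
/*
 * Editor's note: an illustrative caller, not code from this file.
 * Queueing an invalid-opcode fault for vcpu 0 (#UD carries no error
 * code, so error_code_valid is left clear):
 *
 *	struct vm_exception exc;
 *
 *	bzero(&exc, sizeof(exc));
 *	exc.vector = IDT_UD;
 *	error = vm_inject_exception(vm, 0, &exc);
 *
 * A second call before the first exception is delivered fails with
 * EBUSY, per the check above.
 */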

void
vm_inject_fault(void *vmarg, int vcpuid, int vector, int errcode_valid,
    int errcode)
{
	struct vm_exception exception;
	struct vm_exit *vmexit;
	struct vm *vm;
	int error;

	vm = vmarg;

	exception.vector = vector;
	exception.error_code = errcode;
	exception.error_code_valid = errcode_valid;
	error = vm_inject_exception(vm, vcpuid, &exception);
	KASSERT(error == 0, ("vm_inject_exception error %d", error));

	/*
	 * A fault-like exception allows the instruction to be restarted
	 * after the exception handler returns.
	 *
	 * By setting the inst_length to 0 we ensure that the instruction
	 * pointer remains at the faulting instruction.
	 */
	vmexit = vm_exitinfo(vm, vcpuid);
	vmexit->inst_length = 0;
}

void
vm_inject_pf(void *vmarg, int vcpuid, int error_code, uint64_t cr2)
{
	struct vm *vm;
	int error;

	vm = vmarg;
	VCPU_CTR2(vm, vcpuid, "Injecting page fault: error_code %#x, cr2 %#lx",
	    error_code, cr2);

	error = vm_set_register(vm, vcpuid, VM_REG_GUEST_CR2, cr2);
	KASSERT(error == 0, ("vm_set_register(cr2) error %d", error));

	vm_inject_fault(vm, vcpuid, IDT_PF, 1, error_code);
}
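
/*
 * Editor's note: the ordering above deserves an example. %cr2 is written
 * before the #PF is queued, so when delivery happens on a later VM entry
 * the guest's handler already observes the faulting address in %cr2. A
 * page-table walker that fails a translation would typically call:
 *
 *	vm_inject_pf(vm, vcpuid, pf_error_code, faulting_gla);
 *
 * where 'pf_error_code' and 'faulting_gla' are hypothetical names for
 * the walker's state.
 */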

static VMM_STAT(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu");

int
vm_inject_nmi(struct vm *vm, int vcpuid)
{
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		return (EINVAL);

	vcpu = &vm->vcpu[vcpuid];

	vcpu->nmi_pending = 1;
	vcpu_notify_event(vm, vcpuid, false);
	return (0);
}

int
vm_nmi_pending(struct vm *vm, int vcpuid)
{
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		panic("vm_nmi_pending: invalid vcpuid %d", vcpuid);

	vcpu = &vm->vcpu[vcpuid];

	return (vcpu->nmi_pending);
}

void
vm_nmi_clear(struct vm *vm, int vcpuid)
{
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		panic("vm_nmi_clear: invalid vcpuid %d", vcpuid);

	vcpu = &vm->vcpu[vcpuid];

	if (vcpu->nmi_pending == 0)
		panic("vm_nmi_clear: inconsistent nmi_pending state");

	vcpu->nmi_pending = 0;
	vmm_stat_incr(vm, vcpuid, VCPU_NMI_COUNT, 1);
}
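
/*
 * Editor's note: an illustrative sketch of the pending/clear protocol
 * implemented by the three functions above. A hardware backend is
 * expected to poll and acknowledge on its VM-entry path:
 *
 *	if (vm_nmi_pending(vm, vcpuid)) {
 *		(inject the NMI, or request an NMI window)
 *		vm_nmi_clear(vm, vcpuid);
 *	}
 *
 * Clearing without a pending NMI is a panic by design, so the clear
 * must happen exactly once per injected NMI.
 */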

static VMM_STAT(VCPU_EXTINT_COUNT, "number of ExtINTs delivered to vcpu");

int
vm_inject_extint(struct vm *vm, int vcpuid)
{
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		return (EINVAL);

	vcpu = &vm->vcpu[vcpuid];

	vcpu->extint_pending = 1;
	vcpu_notify_event(vm, vcpuid, false);
	return (0);
}

int
vm_extint_pending(struct vm *vm, int vcpuid)
{
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		panic("vm_extint_pending: invalid vcpuid %d", vcpuid);

	vcpu = &vm->vcpu[vcpuid];

	return (vcpu->extint_pending);
}

void
vm_extint_clear(struct vm *vm, int vcpuid)
{
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		panic("vm_extint_clear: invalid vcpuid %d", vcpuid);

	vcpu = &vm->vcpu[vcpuid];

	if (vcpu->extint_pending == 0)
		panic("vm_extint_clear: inconsistent extint_pending state");

	vcpu->extint_pending = 0;
	vmm_stat_incr(vm, vcpuid, VCPU_EXTINT_COUNT, 1);
}

int
vm_get_capability(struct vm *vm, int vcpu, int type, int *retval)
{
	if (vcpu < 0 || vcpu >= VM_MAXCPU)
		return (EINVAL);

	if (type < 0 || type >= VM_CAP_MAX)
		return (EINVAL);

	return (VMGETCAP(vm->cookie, vcpu, type, retval));
}

int
vm_set_capability(struct vm *vm, int vcpu, int type, int val)
{
	if (vcpu < 0 || vcpu >= VM_MAXCPU)
		return (EINVAL);

	if (type < 0 || type >= VM_CAP_MAX)
		return (EINVAL);

	return (VMSETCAP(vm->cookie, vcpu, type, val));
}
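
/*
 * Editor's note: an illustrative caller, not code from this file.
 * Capabilities are integer toggles routed to the backend; enabling
 * HLT exiting for vcpu 0 would look like:
 *
 *	error = vm_set_capability(vm, 0, VM_CAP_HALT_EXIT, 1);
 *
 * VM_CAP_HALT_EXIT is assumed to be one of the vm_cap_type values below
 * VM_CAP_MAX; an error return means the backend does not support the
 * capability on this host.
 */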

uint64_t *
vm_guest_msrs(struct vm *vm, int cpu)
{
	return (vm->vcpu[cpu].guest_msrs);
}

struct vlapic *
vm_lapic(struct vm *vm, int cpu)
{
	return (vm->vcpu[cpu].vlapic);
}

struct vioapic *
vm_ioapic(struct vm *vm)
{

	return (vm->vioapic);
}

struct vhpet *
vm_hpet(struct vm *vm)
{

	return (vm->vhpet);
}

boolean_t
vmm_is_pptdev(int bus, int slot, int func)
{
	int found, i, n;
	int b, s, f;
	char *val, *cp, *cp2;

	/*
	 * XXX
	 * The length of an environment variable is limited to 128 bytes,
	 * which puts an upper limit on the number of passthru devices that
	 * may be specified using a single environment variable.
	 *
	 * Work around this by scanning multiple environment variable
	 * names instead of a single one - yuck!
	 */
	const char *names[] = { "pptdevs", "pptdevs2", "pptdevs3", NULL };

	/* set pptdevs="1/2/3 4/5/6 7/8/9 10/11/12" */
	found = 0;
	for (i = 0; names[i] != NULL && !found; i++) {
		cp = val = getenv(names[i]);
		while (cp != NULL && *cp != '\0') {
			if ((cp2 = strchr(cp, ' ')) != NULL)
				*cp2 = '\0';

			n = sscanf(cp, "%d/%d/%d", &b, &s, &f);
			if (n == 3 && bus == b && slot == s && func == f) {
				found = 1;
				break;
			}

			if (cp2 != NULL)
				*cp2++ = ' ';

			cp = cp2;
		}
		freeenv(val);
	}
	return (found);
}
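
/*
 * Editor's note: a configuration example for the scan above. Passthru
 * devices are listed as bus/slot/function triples separated by spaces,
 * spilling into the fallback variable names when a single value would
 * exceed the length limit, e.g. in loader.conf:
 *
 *	pptdevs="2/0/0 4/5/0"
 *	pptdevs2="6/0/0"
 */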

void *
vm_iommu_domain(struct vm *vm)
{

	return (vm->iommu);
}

int
vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state newstate,
    bool from_idle)
{
	int error;
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		panic("vcpu_set_state: invalid vcpuid %d", vcpuid);

	vcpu = &vm->vcpu[vcpuid];

	vcpu_lock(vcpu);
	error = vcpu_set_state_locked(vcpu, newstate, from_idle);
	vcpu_unlock(vcpu);

	return (error);
}

enum vcpu_state
vcpu_get_state(struct vm *vm, int vcpuid, int *hostcpu)
{
	struct vcpu *vcpu;
	enum vcpu_state state;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		panic("vcpu_get_state: invalid vcpuid %d", vcpuid);

	vcpu = &vm->vcpu[vcpuid];

	vcpu_lock(vcpu);
	state = vcpu->state;
	if (hostcpu != NULL)
		*hostcpu = vcpu->hostcpu;
	vcpu_unlock(vcpu);

	return (state);
}

int
vm_activate_cpu(struct vm *vm, int vcpuid)
{

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		return (EINVAL);

	if (CPU_ISSET(vcpuid, &vm->active_cpus))
		return (EBUSY);

	VCPU_CTR0(vm, vcpuid, "activated");
	CPU_SET_ATOMIC(vcpuid, &vm->active_cpus);
	return (0);
}

cpuset_t
vm_active_cpus(struct vm *vm)
{

	return (vm->active_cpus);
}

cpuset_t
vm_suspended_cpus(struct vm *vm)
{

	return (vm->suspended_cpus);
}

void *
vcpu_stats(struct vm *vm, int vcpuid)
{

	return (vm->vcpu[vcpuid].stats);
}

int
vm_get_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state *state)
{
	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		return (EINVAL);

	*state = vm->vcpu[vcpuid].x2apic_state;

	return (0);
}

int
vm_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state)
{
	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		return (EINVAL);

	if (state >= X2APIC_STATE_LAST)
		return (EINVAL);

	vm->vcpu[vcpuid].x2apic_state = state;

	vlapic_set_x2apic_state(vm, vcpuid, state);

	return (0);
}

/*
 * This function is called to ensure that a vcpu "sees" a pending event
 * as soon as possible:
 * - If the vcpu thread is sleeping then it is woken up.
 * - If the vcpu is running on a different host_cpu then an IPI will be directed
 *   to the host_cpu to cause the vcpu to trap into the hypervisor.
 */
void
vcpu_notify_event(struct vm *vm, int vcpuid, bool lapic_intr)
{
	int hostcpu;
	struct vcpu *vcpu;

	vcpu = &vm->vcpu[vcpuid];

	vcpu_lock(vcpu);
	hostcpu = vcpu->hostcpu;
	if (vcpu->state == VCPU_RUNNING) {
		KASSERT(hostcpu != NOCPU, ("vcpu running on invalid hostcpu"));
		if (hostcpu != curcpu) {
			if (lapic_intr) {
				vlapic_post_intr(vcpu->vlapic, hostcpu,
				    vmm_ipinum);
			} else {
				ipi_cpu(hostcpu, vmm_ipinum);
			}
		} else {
			/*
			 * If the 'vcpu' is running on 'curcpu' then it must
			 * be sending a notification to itself (e.g. SELF_IPI).
			 * The pending event will be picked up when the vcpu
			 * transitions back to guest context.
			 */
		}
	} else {
		KASSERT(hostcpu == NOCPU, ("vcpu state %d not consistent "
		    "with hostcpu %d", vcpu->state, hostcpu));
		if (vcpu->state == VCPU_SLEEPING)
			wakeup_one(vcpu);
	}
	vcpu_unlock(vcpu);
}

struct vmspace *
vm_get_vmspace(struct vm *vm)
{

	return (vm->vmspace);
}

int
vm_apicid2vcpuid(struct vm *vm, int apicid)
{
	/*
	 * XXX apic id is assumed to be numerically identical to vcpu id
	 */
	return (apicid);
}

void
vm_smp_rendezvous(struct vm *vm, int vcpuid, cpuset_t dest,
    vm_rendezvous_func_t func, void *arg)
{
	int i;

	/*
	 * Enforce that this function is called without any locks
	 */
	WITNESS_WARN(WARN_PANIC, NULL, "vm_smp_rendezvous");
	KASSERT(vcpuid == -1 || (vcpuid >= 0 && vcpuid < VM_MAXCPU),
	    ("vm_smp_rendezvous: invalid vcpuid %d", vcpuid));

restart:
	mtx_lock(&vm->rendezvous_mtx);
	if (vm->rendezvous_func != NULL) {
		/*
		 * If a rendezvous is already in progress then we need to
		 * call the rendezvous handler in case this 'vcpuid' is one
		 * of the targets of the rendezvous.
		 */
		RENDEZVOUS_CTR0(vm, vcpuid, "Rendezvous already in progress");
		mtx_unlock(&vm->rendezvous_mtx);
		vm_handle_rendezvous(vm, vcpuid);
		goto restart;
	}
	KASSERT(vm->rendezvous_func == NULL, ("vm_smp_rendezvous: previous "
	    "rendezvous is still in progress"));

	RENDEZVOUS_CTR0(vm, vcpuid, "Initiating rendezvous");
	vm->rendezvous_req_cpus = dest;
	CPU_ZERO(&vm->rendezvous_done_cpus);
	vm->rendezvous_arg = arg;
	vm_set_rendezvous_func(vm, func);
	mtx_unlock(&vm->rendezvous_mtx);

	/*
	 * Wake up any sleeping vcpus and trigger a VM-exit in any running
	 * vcpus so they handle the rendezvous as soon as possible.
	 */
	for (i = 0; i < VM_MAXCPU; i++) {
		if (CPU_ISSET(i, &dest))
			vcpu_notify_event(vm, i, false);
	}

	vm_handle_rendezvous(vm, vcpuid);
}
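
/*
 * Editor's note: an illustrative caller, not code from this file. The
 * callback runs once on every targeted vcpu; its shape is assumed to
 * match vm_rendezvous_func_t and 'sync_something' is hypothetical:
 *
 *	static void
 *	sync_something(struct vm *vm, int vcpuid, void *arg)
 *	{
 *		(per-vcpu work)
 *	}
 *
 *	cpuset_t dest;
 *
 *	CPU_ZERO(&dest);
 *	CPU_SET(0, &dest);
 *	CPU_SET(1, &dest);
 *	vm_smp_rendezvous(vm, vcpuid, dest, sync_something, NULL);
 */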

struct vatpic *
vm_atpic(struct vm *vm)
{
	return (vm->vatpic);
}

struct vatpit *
vm_atpit(struct vm *vm)
{
	return (vm->vatpit);
}

enum vm_reg_name
vm_segment_name(int seg)
{
	static enum vm_reg_name seg_names[] = {
		VM_REG_GUEST_ES,
		VM_REG_GUEST_CS,
		VM_REG_GUEST_SS,
		VM_REG_GUEST_DS,
		VM_REG_GUEST_FS,
		VM_REG_GUEST_GS
	};

	KASSERT(seg >= 0 && seg < nitems(seg_names),
	    ("%s: invalid segment encoding %d", __func__, seg));
	return (seg_names[seg]);
}

void
vm_copy_teardown(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo,
    int num_copyinfo)
{
	int idx;

	for (idx = 0; idx < num_copyinfo; idx++) {
		if (copyinfo[idx].cookie != NULL)
			vm_gpa_release(copyinfo[idx].cookie);
	}
	bzero(copyinfo, num_copyinfo * sizeof(struct vm_copyinfo));
}

int
vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
    uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo,
    int num_copyinfo)
{
	int error, idx, nused;
	size_t n, off, remaining;
	void *hva, *cookie;
	uint64_t gpa;

	bzero(copyinfo, sizeof(struct vm_copyinfo) * num_copyinfo);

	nused = 0;
	remaining = len;
	while (remaining > 0) {
		KASSERT(nused < num_copyinfo, ("insufficient vm_copyinfo"));
		error = vmm_gla2gpa(vm, vcpuid, paging, gla, prot, &gpa);
		if (error)
			return (error);
		off = gpa & PAGE_MASK;
		n = min(remaining, PAGE_SIZE - off);
		copyinfo[nused].gpa = gpa;
		copyinfo[nused].len = n;
		remaining -= n;
		gla += n;
		nused++;
	}

	for (idx = 0; idx < nused; idx++) {
		hva = vm_gpa_hold(vm, copyinfo[idx].gpa, copyinfo[idx].len,
		    prot, &cookie);
		if (hva == NULL)
			break;
		copyinfo[idx].hva = hva;
		copyinfo[idx].cookie = cookie;
	}

	if (idx != nused) {
		vm_copy_teardown(vm, vcpuid, copyinfo, num_copyinfo);
		return (-1);
	} else {
		return (0);
	}
}

void
vm_copyin(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, void *kaddr,
    size_t len)
{
	char *dst;
	int idx;

	dst = kaddr;
	idx = 0;
	while (len > 0) {
		bcopy(copyinfo[idx].hva, dst, copyinfo[idx].len);
		len -= copyinfo[idx].len;
		dst += copyinfo[idx].len;
		idx++;
	}
}

void
vm_copyout(struct vm *vm, int vcpuid, const void *kaddr,
    struct vm_copyinfo *copyinfo, size_t len)
{
	const char *src;
	int idx;

	src = kaddr;
	idx = 0;
	while (len > 0) {
		bcopy(src, copyinfo[idx].hva, copyinfo[idx].len);
		len -= copyinfo[idx].len;
		src += copyinfo[idx].len;
		idx++;
	}
}
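
/*
 * Editor's note: an illustrative sketch of the copy protocol implemented
 * above. A guest linear region is pinned once by vm_copy_setup(), moved
 * with vm_copyin()/vm_copyout(), and then released. Two vm_copyinfo
 * entries suffice here because a 64-byte access spans at most two pages;
 * 'paging' and 'gla' are assumed to come from the exit being emulated:
 *
 *	struct vm_copyinfo copyinfo[2];
 *	char buf[64];
 *
 *	if (vm_copy_setup(vm, vcpuid, paging, gla, sizeof(buf),
 *	    VM_PROT_READ, copyinfo, nitems(copyinfo)) == 0) {
 *		vm_copyin(vm, vcpuid, copyinfo, buf, sizeof(buf));
 *		vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
 *	}
 */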

/*
 * Return the amount of in-use and wired memory for the VM. Since
 * these are global stats, only return the values for vCPU 0.
 */
VMM_STAT_DECLARE(VMM_MEM_RESIDENT);
VMM_STAT_DECLARE(VMM_MEM_WIRED);

static void
vm_get_rescnt(struct vm *vm, int vcpu, struct vmm_stat_type *stat)
{

	if (vcpu == 0) {
		vmm_stat_set(vm, vcpu, VMM_MEM_RESIDENT,
		    PAGE_SIZE * vmspace_resident_count(vm->vmspace));
	}
}

static void
vm_get_wiredcnt(struct vm *vm, int vcpu, struct vmm_stat_type *stat)
{

	if (vcpu == 0) {
		vmm_stat_set(vm, vcpu, VMM_MEM_WIRED,
		    PAGE_SIZE * pmap_wired_count(vmspace_pmap(vm->vmspace)));
	}
}

VMM_STAT_FUNC(VMM_MEM_RESIDENT, "Resident memory", vm_get_rescnt);
VMM_STAT_FUNC(VMM_MEM_WIRED, "Wired memory", vm_get_wiredcnt);