/*-
 * Copyright (c) 2011 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD: stable/10/sys/amd64/vmm/vmm.c 306472 2016-09-30 01:42:29Z jhb $
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: stable/10/sys/amd64/vmm/vmm.c 306472 2016-09-30 01:42:29Z jhb $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/sysctl.h>
#include <sys/malloc.h>
#include <sys/pcpu.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/smp.h>

#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_extern.h>
#include <vm/vm_param.h>

#include <machine/cpu.h>
#include <machine/vm.h>
#include <machine/pcb.h>
#include <machine/smp.h>
#include <x86/psl.h>
#include <x86/apicreg.h>
#include <machine/vmparam.h>

#include <machine/vmm.h>
#include <machine/vmm_dev.h>
#include <machine/vmm_instruction_emul.h>

#include "vmm_ioport.h"
#include "vmm_ktr.h"
#include "vmm_host.h"
#include "vmm_mem.h"
#include "vmm_util.h"
#include "vatpic.h"
#include "vatpit.h"
#include "vhpet.h"
#include "vioapic.h"
#include "vlapic.h"
#include "vpmtmr.h"
#include "vrtc.h"
#include "vmm_ipi.h"
#include "vmm_stat.h"
#include "vmm_lapic.h"

#include "io/ppt.h"
#include "io/iommu.h"

struct vlapic;

/*
 * Initialization:
 * (a) allocated when vcpu is created
 * (i) initialized when vcpu is created and when it is reinitialized
 * (o) initialized the first time the vcpu is created
 * (x) initialized before use
 */
struct vcpu {
	struct mtx 	mtx;		/* (o) protects 'state' and 'hostcpu' */
	enum vcpu_state	state;		/* (o) vcpu state */
	int		hostcpu;	/* (o) vcpu's host cpu */
	int		reqidle;	/* (i) request vcpu to idle */
	struct vlapic	*vlapic;	/* (i) APIC device model */
	enum x2apic_state x2apic_state;	/* (i) APIC mode */
	uint64_t	exitintinfo;	/* (i) events pending at VM exit */
	int		nmi_pending;	/* (i) NMI pending */
	int		extint_pending;	/* (i) INTR pending */
	int	exception_pending;	/* (i) exception pending */
	int	exc_vector;		/* (x) exception collateral */
	int	exc_errcode_valid;
	uint32_t exc_errcode;
	struct savefpu	*guestfpu;	/* (a,i) guest fpu state */
	uint64_t	guest_xcr0;	/* (i) guest %xcr0 register */
	void		*stats;		/* (a,i) statistics */
	struct vm_exit	exitinfo;	/* (x) exit reason and collateral */
	uint64_t	nextrip;	/* (x) next instruction to execute */
};

#define	vcpu_lock_initialized(v) mtx_initialized(&((v)->mtx))
#define	vcpu_lock_init(v)	mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN)
#define	vcpu_lock(v)		mtx_lock_spin(&((v)->mtx))
#define	vcpu_unlock(v)		mtx_unlock_spin(&((v)->mtx))
#define	vcpu_assert_locked(v)	mtx_assert(&((v)->mtx), MA_OWNED)

struct mem_seg {
	size_t	len;
	bool	sysmem;
	struct vm_object *object;
};
#define	VM_MAX_MEMSEGS	2

struct mem_map {
	vm_paddr_t	gpa;
	size_t		len;
	vm_ooffset_t	segoff;
	int		segid;
	int		prot;
	int		flags;
};
#define	VM_MAX_MEMMAPS	4

/*
 * Initialization:
 * (o) initialized the first time the VM is created
 * (i) initialized when VM is created and when it is reinitialized
 * (x) initialized before use
 */
struct vm {
	void		*cookie;		/* (i) cpu-specific data */
	void		*iommu;			/* (x) iommu-specific data */
	struct vhpet	*vhpet;			/* (i) virtual HPET */
	struct vioapic	*vioapic;		/* (i) virtual ioapic */
	struct vatpic	*vatpic;		/* (i) virtual atpic */
	struct vatpit	*vatpit;		/* (i) virtual atpit */
	struct vpmtmr	*vpmtmr;		/* (i) virtual ACPI PM timer */
	struct vrtc	*vrtc;			/* (o) virtual RTC */
	volatile cpuset_t active_cpus;		/* (i) active vcpus */
	int		suspend;		/* (i) stop VM execution */
	volatile cpuset_t suspended_cpus; 	/* (i) suspended vcpus */
	volatile cpuset_t halted_cpus;		/* (x) cpus in a hard halt */
	cpuset_t	rendezvous_req_cpus;	/* (x) rendezvous requested */
	cpuset_t	rendezvous_done_cpus;	/* (x) rendezvous finished */
	void		*rendezvous_arg;	/* (x) rendezvous func/arg */
	vm_rendezvous_func_t rendezvous_func;
	struct mtx	rendezvous_mtx;		/* (o) rendezvous lock */
	struct mem_map	mem_maps[VM_MAX_MEMMAPS]; /* (i) guest address space */
	struct mem_seg	mem_segs[VM_MAX_MEMSEGS]; /* (o) guest memory regions */
	struct vmspace	*vmspace;		/* (o) guest's address space */
	char		name[VM_MAX_NAMELEN];	/* (o) virtual machine name */
	struct vcpu	vcpu[VM_MAXCPU];	/* (i) guest vcpus */
};

static int vmm_initialized;

static struct vmm_ops *ops;
#define	VMM_INIT(num)	(ops != NULL ? (*ops->init)(num) : 0)
#define	VMM_CLEANUP()	(ops != NULL ? (*ops->cleanup)() : 0)
#define	VMM_RESUME()	(ops != NULL ? (*ops->resume)() : 0)

#define	VMINIT(vm, pmap) (ops != NULL ? (*ops->vminit)(vm, pmap): NULL)
#define	VMRUN(vmi, vcpu, rip, pmap, evinfo) \
	(ops != NULL ? (*ops->vmrun)(vmi, vcpu, rip, pmap, evinfo) : ENXIO)
#define	VMCLEANUP(vmi)	(ops != NULL ? (*ops->vmcleanup)(vmi) : NULL)
#define	VMSPACE_ALLOC(min, max) \
	(ops != NULL ? (*ops->vmspace_alloc)(min, max) : NULL)
#define	VMSPACE_FREE(vmspace) \
	(ops != NULL ? (*ops->vmspace_free)(vmspace) : ENXIO)
#define	VMGETREG(vmi, vcpu, num, retval)		\
	(ops != NULL ? (*ops->vmgetreg)(vmi, vcpu, num, retval) : ENXIO)
#define	VMSETREG(vmi, vcpu, num, val)		\
	(ops != NULL ? (*ops->vmsetreg)(vmi, vcpu, num, val) : ENXIO)
#define	VMGETDESC(vmi, vcpu, num, desc)		\
	(ops != NULL ? (*ops->vmgetdesc)(vmi, vcpu, num, desc) : ENXIO)
#define	VMSETDESC(vmi, vcpu, num, desc)		\
	(ops != NULL ? (*ops->vmsetdesc)(vmi, vcpu, num, desc) : ENXIO)
#define	VMGETCAP(vmi, vcpu, num, retval)	\
	(ops != NULL ? (*ops->vmgetcap)(vmi, vcpu, num, retval) : ENXIO)
#define	VMSETCAP(vmi, vcpu, num, val)		\
	(ops != NULL ? (*ops->vmsetcap)(vmi, vcpu, num, val) : ENXIO)
#define	VLAPIC_INIT(vmi, vcpu)			\
	(ops != NULL ? (*ops->vlapic_init)(vmi, vcpu) : NULL)
#define	VLAPIC_CLEANUP(vmi, vlapic)		\
	(ops != NULL ? (*ops->vlapic_cleanup)(vmi, vlapic) : NULL)

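/*
 * Lazy FPU switching relies on the CR0.TS "task switched" bit: while the
 * guest's FPU state is loaded, TS is set so that any host FPU use traps
 * with #NM; clts() clears TS again around the save/restore paths.
 */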
#define	fpu_start_emulating()	load_cr0(rcr0() | CR0_TS)
#define	fpu_stop_emulating()	clts()

static MALLOC_DEFINE(M_VM, "vm", "vm");

/* statistics */
static VMM_STAT(VCPU_TOTAL_RUNTIME, "vcpu total runtime");

SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW, NULL, NULL);

/*
 * Halt the guest if all vcpus are executing a HLT instruction with
 * interrupts disabled.
 */
static int halt_detection_enabled = 1;
TUNABLE_INT("hw.vmm.halt_detection", &halt_detection_enabled);
SYSCTL_INT(_hw_vmm, OID_AUTO, halt_detection, CTLFLAG_RDTUN,
    &halt_detection_enabled, 0,
    "Halt VM if all vcpus execute HLT with interrupts disabled");

static int vmm_ipinum;
SYSCTL_INT(_hw_vmm, OID_AUTO, ipinum, CTLFLAG_RD, &vmm_ipinum, 0,
    "IPI vector used for vcpu notifications");

static int trace_guest_exceptions;
SYSCTL_INT(_hw_vmm, OID_AUTO, trace_guest_exceptions, CTLFLAG_RDTUN,
    &trace_guest_exceptions, 0,
    "Trap into hypervisor on all guest exceptions and reflect them back");

static void vm_free_memmap(struct vm *vm, int ident);
static bool sysmem_mapping(struct vm *vm, struct mem_map *mm);
static void vcpu_notify_event_locked(struct vcpu *vcpu, bool lapic_intr);

#ifdef KTR
static const char *
vcpu_state2str(enum vcpu_state state)
{

	switch (state) {
	case VCPU_IDLE:
		return ("idle");
	case VCPU_FROZEN:
		return ("frozen");
	case VCPU_RUNNING:
		return ("running");
	case VCPU_SLEEPING:
		return ("sleeping");
	default:
		return ("unknown");
	}
}
#endif

static void
vcpu_cleanup(struct vm *vm, int i, bool destroy)
{
	struct vcpu *vcpu = &vm->vcpu[i];

	VLAPIC_CLEANUP(vm->cookie, vcpu->vlapic);
	if (destroy) {
		vmm_stat_free(vcpu->stats);
		fpu_save_area_free(vcpu->guestfpu);
	}
}

static void
vcpu_init(struct vm *vm, int vcpu_id, bool create)
{
	struct vcpu *vcpu;

	KASSERT(vcpu_id >= 0 && vcpu_id < VM_MAXCPU,
	    ("vcpu_init: invalid vcpu %d", vcpu_id));

	vcpu = &vm->vcpu[vcpu_id];

	if (create) {
		KASSERT(!vcpu_lock_initialized(vcpu), ("vcpu %d already "
		    "initialized", vcpu_id));
		vcpu_lock_init(vcpu);
		vcpu->state = VCPU_IDLE;
		vcpu->hostcpu = NOCPU;
		vcpu->guestfpu = fpu_save_area_alloc();
		vcpu->stats = vmm_stat_alloc();
	}

	vcpu->vlapic = VLAPIC_INIT(vm->cookie, vcpu_id);
	vm_set_x2apic_state(vm, vcpu_id, X2APIC_DISABLED);
	vcpu->reqidle = 0;
	vcpu->exitintinfo = 0;
	vcpu->nmi_pending = 0;
	vcpu->extint_pending = 0;
	vcpu->exception_pending = 0;
	vcpu->guest_xcr0 = XFEATURE_ENABLED_X87;
	fpu_save_area_reset(vcpu->guestfpu);
	vmm_stat_init(vcpu->stats);
}

int
vcpu_trace_exceptions(struct vm *vm, int vcpuid)
{

	return (trace_guest_exceptions);
}

struct vm_exit *
vm_exitinfo(struct vm *vm, int cpuid)
{
	struct vcpu *vcpu;

	if (cpuid < 0 || cpuid >= VM_MAXCPU)
		panic("vm_exitinfo: invalid cpuid %d", cpuid);

	vcpu = &vm->vcpu[cpuid];

	return (&vcpu->exitinfo);
}

static void
vmm_resume(void)
{
	VMM_RESUME();
}

static int
vmm_init(void)
{
	int error;

	vmm_host_state_init();

	vmm_ipinum = vmm_ipi_alloc();
	if (vmm_ipinum == 0)
		vmm_ipinum = IPI_AST;

	error = vmm_mem_init();
	if (error)
		return (error);

	if (vmm_is_intel())
		ops = &vmm_ops_intel;
	else if (vmm_is_amd())
		ops = &vmm_ops_amd;
	else
		return (ENXIO);

	vmm_resume_p = vmm_resume;

	return (VMM_INIT(vmm_ipinum));
}

static int
vmm_handler(module_t mod, int what, void *arg)
{
	int error;

	switch (what) {
	case MOD_LOAD:
		vmmdev_init();
		error = vmm_init();
		if (error == 0)
			vmm_initialized = 1;
		break;
	case MOD_UNLOAD:
		error = vmmdev_cleanup();
		if (error == 0) {
			vmm_resume_p = NULL;
			iommu_cleanup();
			if (vmm_ipinum != IPI_AST)
				vmm_ipi_free(vmm_ipinum);
			error = VMM_CLEANUP();
			/*
			 * Something bad happened - prevent new
			 * VMs from being created
			 */
			if (error)
				vmm_initialized = 0;
		}
		break;
	default:
		error = 0;
		break;
	}
	return (error);
}

static moduledata_t vmm_kmod = {
	"vmm",
	vmm_handler,
	NULL
};

/*
 * vmm initialization has the following dependencies:
 *
 * - VT-x initialization requires smp_rendezvous() and therefore must happen
 *   after SMP is fully functional (after SI_SUB_SMP).
 */
DECLARE_MODULE(vmm, vmm_kmod, SI_SUB_SMP + 1, SI_ORDER_ANY);
MODULE_VERSION(vmm, 1);

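/*
 * Initialize (or reinitialize) VM-wide state. The device models tagged
 * (i) in 'struct vm' are recreated on every init; the virtual RTC,
 * tagged (o), is only created along with the VM itself.
 */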
static void
vm_init(struct vm *vm, bool create)
{
	int i;

	vm->cookie = VMINIT(vm, vmspace_pmap(vm->vmspace));
	vm->iommu = NULL;
	vm->vioapic = vioapic_init(vm);
	vm->vhpet = vhpet_init(vm);
	vm->vatpic = vatpic_init(vm);
	vm->vatpit = vatpit_init(vm);
	vm->vpmtmr = vpmtmr_init(vm);
	if (create)
		vm->vrtc = vrtc_init(vm);

	CPU_ZERO(&vm->active_cpus);

	vm->suspend = 0;
	CPU_ZERO(&vm->suspended_cpus);

	for (i = 0; i < VM_MAXCPU; i++)
		vcpu_init(vm, i, create);
}

int
vm_create(const char *name, struct vm **retvm)
{
	struct vm *vm;
	struct vmspace *vmspace;

	/*
	 * If vmm.ko could not be successfully initialized then don't attempt
	 * to create the virtual machine.
	 */
	if (!vmm_initialized)
		return (ENXIO);

	if (name == NULL || strlen(name) >= VM_MAX_NAMELEN)
		return (EINVAL);

	vmspace = VMSPACE_ALLOC(0, VM_MAXUSER_ADDRESS);
	if (vmspace == NULL)
		return (ENOMEM);

	vm = malloc(sizeof(struct vm), M_VM, M_WAITOK | M_ZERO);
	strcpy(vm->name, name);
	vm->vmspace = vmspace;
	mtx_init(&vm->rendezvous_mtx, "vm rendezvous lock", 0, MTX_DEF);

	vm_init(vm, true);

	*retvm = vm;
	return (0);
}

static void
vm_cleanup(struct vm *vm, bool destroy)
{
	struct mem_map *mm;
	int i;

	ppt_unassign_all(vm);

	if (vm->iommu != NULL)
		iommu_destroy_domain(vm->iommu);

	if (destroy)
		vrtc_cleanup(vm->vrtc);
	else
		vrtc_reset(vm->vrtc);
	vpmtmr_cleanup(vm->vpmtmr);
	vatpit_cleanup(vm->vatpit);
	vhpet_cleanup(vm->vhpet);
	vatpic_cleanup(vm->vatpic);
	vioapic_cleanup(vm->vioapic);

	for (i = 0; i < VM_MAXCPU; i++)
		vcpu_cleanup(vm, i, destroy);

	VMCLEANUP(vm->cookie);

	/*
	 * System memory is removed from the guest address space only when
	 * the VM is destroyed. This is because the mapping remains the same
	 * across VM reset.
	 *
	 * Device memory can be relocated by the guest (e.g. using PCI BARs)
	 * so those mappings are removed on a VM reset.
	 */
	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
		mm = &vm->mem_maps[i];
		if (destroy || !sysmem_mapping(vm, mm))
			vm_free_memmap(vm, i);
	}

	if (destroy) {
		for (i = 0; i < VM_MAX_MEMSEGS; i++)
			vm_free_memseg(vm, i);

		VMSPACE_FREE(vm->vmspace);
		vm->vmspace = NULL;
	}
}

void
vm_destroy(struct vm *vm)
{
	vm_cleanup(vm, true);
	free(vm, M_VM);
}

int
vm_reinit(struct vm *vm)
{
	int error;

	/*
	 * A virtual machine can be reset only if all vcpus are suspended.
	 */
	if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) {
		vm_cleanup(vm, false);
		vm_init(vm, false);
		error = 0;
	} else {
		error = EBUSY;
	}

	return (error);
}

const char *
vm_name(struct vm *vm)
{
	return (vm->name);
}

int
vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
{
	vm_object_t obj;

	if ((obj = vmm_mmio_alloc(vm->vmspace, gpa, len, hpa)) == NULL)
		return (ENOMEM);
	else
		return (0);
}

int
vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len)
{

	vmm_mmio_free(vm->vmspace, gpa, len);
	return (0);
}

/*
 * Return 'true' if 'gpa' is allocated in the guest address space.
 *
 * This function is called in the context of a running vcpu which acts as
 * an implicit lock on 'vm->mem_maps[]'.
 */
bool
vm_mem_allocated(struct vm *vm, int vcpuid, vm_paddr_t gpa)
{
	struct mem_map *mm;
	int i;

#ifdef INVARIANTS
	int hostcpu, state;
	state = vcpu_get_state(vm, vcpuid, &hostcpu);
	KASSERT(state == VCPU_RUNNING && hostcpu == curcpu,
	    ("%s: invalid vcpu state %d/%d", __func__, state, hostcpu));
#endif

	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
		mm = &vm->mem_maps[i];
		if (mm->len != 0 && gpa >= mm->gpa && gpa < mm->gpa + mm->len)
			return (true);		/* 'gpa' is sysmem or devmem */
	}

	if (ppt_is_mmio(vm, gpa))
		return (true);			/* 'gpa' is pci passthru mmio */

	return (false);
}

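/*
 * Create guest memory segment 'ident' backed by a 'len' byte VM object.
 * Recreating an existing segment with identical parameters reports
 * EEXIST; any other mismatch is EINVAL.
 */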
int
vm_alloc_memseg(struct vm *vm, int ident, size_t len, bool sysmem)
{
	struct mem_seg *seg;
	vm_object_t obj;

	if (ident < 0 || ident >= VM_MAX_MEMSEGS)
		return (EINVAL);

	if (len == 0 || (len & PAGE_MASK))
		return (EINVAL);

	seg = &vm->mem_segs[ident];
	if (seg->object != NULL) {
		if (seg->len == len && seg->sysmem == sysmem)
			return (EEXIST);
		else
			return (EINVAL);
	}

	obj = vm_object_allocate(OBJT_DEFAULT, len >> PAGE_SHIFT);
	if (obj == NULL)
		return (ENOMEM);

	seg->len = len;
	seg->object = obj;
	seg->sysmem = sysmem;
	return (0);
}

int
vm_get_memseg(struct vm *vm, int ident, size_t *len, bool *sysmem,
    vm_object_t *objptr)
{
	struct mem_seg *seg;

	if (ident < 0 || ident >= VM_MAX_MEMSEGS)
		return (EINVAL);

	seg = &vm->mem_segs[ident];
	if (len)
		*len = seg->len;
	if (sysmem)
		*sysmem = seg->sysmem;
	if (objptr)
		*objptr = seg->object;
	return (0);
}

void
vm_free_memseg(struct vm *vm, int ident)
{
	struct mem_seg *seg;

	KASSERT(ident >= 0 && ident < VM_MAX_MEMSEGS,
	    ("%s: invalid memseg ident %d", __func__, ident));

	seg = &vm->mem_segs[ident];
	if (seg->object != NULL) {
		vm_object_deallocate(seg->object);
		bzero(seg, sizeof(struct mem_seg));
	}
}

int
vm_mmap_memseg(struct vm *vm, vm_paddr_t gpa, int segid, vm_ooffset_t first,
    size_t len, int prot, int flags)
{
	struct mem_seg *seg;
	struct mem_map *m, *map;
	vm_ooffset_t last;
	int i, error;

	if (prot == 0 || (prot & ~(VM_PROT_ALL)) != 0)
		return (EINVAL);

	if (flags & ~VM_MEMMAP_F_WIRED)
		return (EINVAL);

	if (segid < 0 || segid >= VM_MAX_MEMSEGS)
		return (EINVAL);

	seg = &vm->mem_segs[segid];
	if (seg->object == NULL)
		return (EINVAL);

	last = first + len;
	if (first < 0 || first >= last || last > seg->len)
		return (EINVAL);

	if ((gpa | first | last) & PAGE_MASK)
		return (EINVAL);

	map = NULL;
	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
		m = &vm->mem_maps[i];
		if (m->len == 0) {
			map = m;
			break;
		}
	}

	if (map == NULL)
		return (ENOSPC);

	error = vm_map_find(&vm->vmspace->vm_map, seg->object, first, &gpa,
	    len, 0, VMFS_NO_SPACE, prot, prot, 0);
	if (error != KERN_SUCCESS)
		return (EFAULT);

	vm_object_reference(seg->object);

	if (flags & VM_MEMMAP_F_WIRED) {
		error = vm_map_wire(&vm->vmspace->vm_map, gpa, gpa + len,
		    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
		if (error != KERN_SUCCESS) {
			vm_map_remove(&vm->vmspace->vm_map, gpa, gpa + len);
			return (EFAULT);
		}
	}

	map->gpa = gpa;
	map->len = len;
	map->segoff = first;
	map->segid = segid;
	map->prot = prot;
	map->flags = flags;
	return (0);
}

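/*
 * Return the memory map entry with the lowest guest physical address
 * that is at least '*gpa'. Callers can enumerate all mappings by
 * starting at 0 and advancing '*gpa' past each returned range.
 */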
int
vm_mmap_getnext(struct vm *vm, vm_paddr_t *gpa, int *segid,
    vm_ooffset_t *segoff, size_t *len, int *prot, int *flags)
{
	struct mem_map *mm, *mmnext;
	int i;

	mmnext = NULL;
	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
		mm = &vm->mem_maps[i];
		if (mm->len == 0 || mm->gpa < *gpa)
			continue;
		if (mmnext == NULL || mm->gpa < mmnext->gpa)
			mmnext = mm;
	}

	if (mmnext != NULL) {
		*gpa = mmnext->gpa;
		if (segid)
			*segid = mmnext->segid;
		if (segoff)
			*segoff = mmnext->segoff;
		if (len)
			*len = mmnext->len;
		if (prot)
			*prot = mmnext->prot;
		if (flags)
			*flags = mmnext->flags;
		return (0);
	} else {
		return (ENOENT);
	}
}

static void
vm_free_memmap(struct vm *vm, int ident)
{
	struct mem_map *mm;
	int error;

	mm = &vm->mem_maps[ident];
	if (mm->len) {
		error = vm_map_remove(&vm->vmspace->vm_map, mm->gpa,
		    mm->gpa + mm->len);
		KASSERT(error == KERN_SUCCESS, ("%s: vm_map_remove error %d",
		    __func__, error));
		bzero(mm, sizeof(struct mem_map));
	}
}

static __inline bool
sysmem_mapping(struct vm *vm, struct mem_map *mm)
{

	if (mm->len != 0 && vm->mem_segs[mm->segid].sysmem)
		return (true);
	else
		return (false);
}

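/*
 * Return the address just past the highest system memory mapping; this
 * bounds the IOMMU domain created for PCI passthru devices.
 */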
static vm_paddr_t
sysmem_maxaddr(struct vm *vm)
{
	struct mem_map *mm;
	vm_paddr_t maxaddr;
	int i;

	maxaddr = 0;
	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
		mm = &vm->mem_maps[i];
		if (sysmem_mapping(vm, mm)) {
			if (maxaddr < mm->gpa + mm->len)
				maxaddr = mm->gpa + mm->len;
		}
	}
	return (maxaddr);
}

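/*
 * Map or unmap the guest's wired system memory in the VM's IOMMU
 * domain, page by page. Pages added to the guest domain are removed
 * from the host domain (and vice versa) so that a passthru device's
 * DMA is translated with guest physical addresses.
 */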
static void
vm_iommu_modify(struct vm *vm, boolean_t map)
{
	int i, sz;
	vm_paddr_t gpa, hpa;
	struct mem_map *mm;
	void *vp, *cookie, *host_domain;

	sz = PAGE_SIZE;
	host_domain = iommu_host_domain();

	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
		mm = &vm->mem_maps[i];
		if (!sysmem_mapping(vm, mm))
			continue;

		if (map) {
			KASSERT((mm->flags & VM_MEMMAP_F_IOMMU) == 0,
			    ("iommu map found invalid memmap %#lx/%#lx/%#x",
			    mm->gpa, mm->len, mm->flags));
			if ((mm->flags & VM_MEMMAP_F_WIRED) == 0)
				continue;
			mm->flags |= VM_MEMMAP_F_IOMMU;
		} else {
			if ((mm->flags & VM_MEMMAP_F_IOMMU) == 0)
				continue;
			mm->flags &= ~VM_MEMMAP_F_IOMMU;
			KASSERT((mm->flags & VM_MEMMAP_F_WIRED) != 0,
			    ("iommu unmap found invalid memmap %#lx/%#lx/%#x",
			    mm->gpa, mm->len, mm->flags));
		}

		gpa = mm->gpa;
		while (gpa < mm->gpa + mm->len) {
			vp = vm_gpa_hold(vm, -1, gpa, PAGE_SIZE, VM_PROT_WRITE,
					 &cookie);
			KASSERT(vp != NULL, ("vm(%s) could not map gpa %#lx",
			    vm_name(vm), gpa));

			vm_gpa_release(cookie);

			hpa = DMAP_TO_PHYS((uintptr_t)vp);
			if (map) {
				iommu_create_mapping(vm->iommu, gpa, hpa, sz);
				iommu_remove_mapping(host_domain, hpa, sz);
			} else {
				iommu_remove_mapping(vm->iommu, gpa, sz);
				iommu_create_mapping(host_domain, hpa, hpa, sz);
			}

			gpa += PAGE_SIZE;
		}
	}

	/*
	 * Invalidate the cached translations associated with the domain
	 * from which pages were removed.
	 */
	if (map)
		iommu_invalidate_tlb(host_domain);
	else
		iommu_invalidate_tlb(vm->iommu);
}

#define	vm_iommu_unmap(vm)	vm_iommu_modify((vm), FALSE)
#define	vm_iommu_map(vm)	vm_iommu_modify((vm), TRUE)

int
vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func)
{
	int error;

	error = ppt_unassign_device(vm, bus, slot, func);
	if (error)
		return (error);

	if (ppt_assigned_devices(vm) == 0)
		vm_iommu_unmap(vm);

	return (0);
}

int
vm_assign_pptdev(struct vm *vm, int bus, int slot, int func)
{
	int error;
	vm_paddr_t maxaddr;

	/* Set up the IOMMU to do the 'gpa' to 'hpa' translation */
	if (ppt_assigned_devices(vm) == 0) {
		KASSERT(vm->iommu == NULL,
		    ("vm_assign_pptdev: iommu must be NULL"));
		maxaddr = sysmem_maxaddr(vm);
		vm->iommu = iommu_create_domain(maxaddr);
		if (vm->iommu == NULL)
			return (ENXIO);
		vm_iommu_map(vm);
	}

	error = ppt_assign_device(vm, bus, slot, func);
	return (error);
}

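/*
 * Translate 'gpa' into a host kernel virtual address via the direct
 * map, wiring the backing page. The caller must pass the returned
 * cookie to vm_gpa_release() when done with the page.
 */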
void *
vm_gpa_hold(struct vm *vm, int vcpuid, vm_paddr_t gpa, size_t len, int reqprot,
	    void **cookie)
{
	int i, count, pageoff;
	struct mem_map *mm;
	vm_page_t m;
#ifdef INVARIANTS
907295124Sgrehan	/*
908295124Sgrehan	 * All vcpus are frozen by ioctls that modify the memory map
909295124Sgrehan	 * (e.g. VM_MMAP_MEMSEG). Therefore 'vm->memmap[]' stability is
910295124Sgrehan	 * guaranteed if at least one vcpu is in the VCPU_FROZEN state.
911295124Sgrehan	 */
	int state;
	KASSERT(vcpuid >= -1 && vcpuid < VM_MAXCPU, ("%s: invalid vcpuid %d",
	    __func__, vcpuid));
	for (i = 0; i < VM_MAXCPU; i++) {
		if (vcpuid != -1 && vcpuid != i)
			continue;
		state = vcpu_get_state(vm, i, NULL);
		KASSERT(state == VCPU_FROZEN, ("%s: invalid vcpu state %d",
		    __func__, state));
	}
#endif
	pageoff = gpa & PAGE_MASK;
	if (len > PAGE_SIZE - pageoff)
		panic("vm_gpa_hold: invalid gpa/len: 0x%016lx/%lu", gpa, len);

	count = 0;
	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
		mm = &vm->mem_maps[i];
		if (sysmem_mapping(vm, mm) && gpa >= mm->gpa &&
		    gpa < mm->gpa + mm->len) {
			count = vm_fault_quick_hold_pages(&vm->vmspace->vm_map,
			    trunc_page(gpa), PAGE_SIZE, reqprot, &m, 1);
			break;
		}
	}

	if (count == 1) {
		*cookie = m;
		return ((void *)(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)) + pageoff));
	} else {
		*cookie = NULL;
		return (NULL);
	}
}

void
vm_gpa_release(void *cookie)
{
	vm_page_t m = cookie;

	vm_page_lock(m);
	vm_page_unhold(m);
	vm_page_unlock(m);
}

int
vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval)
{

	if (vcpu < 0 || vcpu >= VM_MAXCPU)
		return (EINVAL);

	if (reg >= VM_REG_LAST)
		return (EINVAL);

	return (VMGETREG(vm->cookie, vcpu, reg, retval));
}

int
vm_set_register(struct vm *vm, int vcpuid, int reg, uint64_t val)
{
	struct vcpu *vcpu;
	int error;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		return (EINVAL);

	if (reg >= VM_REG_LAST)
		return (EINVAL);

	error = VMSETREG(vm->cookie, vcpuid, reg, val);
	if (error || reg != VM_REG_GUEST_RIP)
		return (error);

	/* Set 'nextrip' to match the value of %rip */
	VCPU_CTR1(vm, vcpuid, "Setting nextrip to %#lx", val);
	vcpu = &vm->vcpu[vcpuid];
	vcpu->nextrip = val;
	return (0);
}

static boolean_t
is_descriptor_table(int reg)
{

	switch (reg) {
	case VM_REG_GUEST_IDTR:
	case VM_REG_GUEST_GDTR:
		return (TRUE);
	default:
		return (FALSE);
	}
}

static boolean_t
is_segment_register(int reg)
{

	switch (reg) {
	case VM_REG_GUEST_ES:
	case VM_REG_GUEST_CS:
	case VM_REG_GUEST_SS:
	case VM_REG_GUEST_DS:
	case VM_REG_GUEST_FS:
	case VM_REG_GUEST_GS:
	case VM_REG_GUEST_TR:
	case VM_REG_GUEST_LDTR:
		return (TRUE);
	default:
		return (FALSE);
	}
}

int
vm_get_seg_desc(struct vm *vm, int vcpu, int reg,
		struct seg_desc *desc)
{

	if (vcpu < 0 || vcpu >= VM_MAXCPU)
		return (EINVAL);

	if (!is_segment_register(reg) && !is_descriptor_table(reg))
		return (EINVAL);

	return (VMGETDESC(vm->cookie, vcpu, reg, desc));
}

int
vm_set_seg_desc(struct vm *vm, int vcpu, int reg,
		struct seg_desc *desc)
{
	if (vcpu < 0 || vcpu >= VM_MAXCPU)
		return (EINVAL);

	if (!is_segment_register(reg) && !is_descriptor_table(reg))
		return (EINVAL);

	return (VMSETDESC(vm->cookie, vcpu, reg, desc));
}

static void
restore_guest_fpustate(struct vcpu *vcpu)
{

	/* flush host state to the pcb */
	fpuexit(curthread);

	/* restore guest FPU state */
	fpu_stop_emulating();
	fpurestore(vcpu->guestfpu);

	/* restore guest XCR0 if XSAVE is enabled in the host */
	if (rcr4() & CR4_XSAVE)
		load_xcr(0, vcpu->guest_xcr0);

	/*
	 * The FPU is now "dirty" with the guest's state so turn on emulation
	 * to trap any access to the FPU by the host.
	 */
	fpu_start_emulating();
}

static void
save_guest_fpustate(struct vcpu *vcpu)
{

	if ((rcr0() & CR0_TS) == 0)
		panic("fpu emulation not enabled in host!");

	/* save guest XCR0 and restore host XCR0 */
	if (rcr4() & CR4_XSAVE) {
		vcpu->guest_xcr0 = rxcr(0);
		load_xcr(0, vmm_get_host_xcr0());
	}

	/* save guest FPU state */
	fpu_stop_emulating();
	fpusave(vcpu->guestfpu);
	fpu_start_emulating();
}

static VMM_STAT(VCPU_IDLE_TICKS, "number of ticks vcpu was idle");

static int
vcpu_set_state_locked(struct vm *vm, int vcpuid, enum vcpu_state newstate,
    bool from_idle)
{
	struct vcpu *vcpu;
	int error;

	vcpu = &vm->vcpu[vcpuid];
	vcpu_assert_locked(vcpu);

	/*
	 * State transitions from the vmmdev_ioctl() must always begin from
	 * the VCPU_IDLE state. This guarantees that there is only a single
	 * ioctl() operating on a vcpu at any point.
	 */
	if (from_idle) {
		while (vcpu->state != VCPU_IDLE) {
			vcpu->reqidle = 1;
			vcpu_notify_event_locked(vcpu, false);
			VCPU_CTR1(vm, vcpuid, "vcpu state change from %s to "
			    "idle requested", vcpu_state2str(vcpu->state));
			msleep_spin(&vcpu->state, &vcpu->mtx, "vmstat", hz);
		}
	} else {
		KASSERT(vcpu->state != VCPU_IDLE, ("invalid transition from "
		    "vcpu idle state"));
	}

	if (vcpu->state == VCPU_RUNNING) {
		KASSERT(vcpu->hostcpu == curcpu, ("curcpu %d and hostcpu %d "
		    "mismatch for running vcpu", curcpu, vcpu->hostcpu));
	} else {
		KASSERT(vcpu->hostcpu == NOCPU, ("Invalid hostcpu %d for a "
		    "vcpu that is not running", vcpu->hostcpu));
	}

	/*
	 * The following state transitions are allowed:
	 * IDLE -> FROZEN -> IDLE
	 * FROZEN -> RUNNING -> FROZEN
	 * FROZEN -> SLEEPING -> FROZEN
	 */
	switch (vcpu->state) {
	case VCPU_IDLE:
	case VCPU_RUNNING:
	case VCPU_SLEEPING:
		error = (newstate != VCPU_FROZEN);
		break;
	case VCPU_FROZEN:
		error = (newstate == VCPU_FROZEN);
		break;
	default:
		error = 1;
		break;
	}

	if (error)
		return (EBUSY);

	VCPU_CTR2(vm, vcpuid, "vcpu state changed from %s to %s",
	    vcpu_state2str(vcpu->state), vcpu_state2str(newstate));

	vcpu->state = newstate;
	if (newstate == VCPU_RUNNING)
		vcpu->hostcpu = curcpu;
	else
		vcpu->hostcpu = NOCPU;

	if (newstate == VCPU_IDLE)
		wakeup(&vcpu->state);

	return (0);
}

static void
vcpu_require_state(struct vm *vm, int vcpuid, enum vcpu_state newstate)
{
	int error;

	if ((error = vcpu_set_state(vm, vcpuid, newstate, false)) != 0)
		panic("Error %d setting state to %d\n", error, newstate);
}

static void
vcpu_require_state_locked(struct vm *vm, int vcpuid, enum vcpu_state newstate)
{
	int error;

	if ((error = vcpu_set_state_locked(vm, vcpuid, newstate, false)) != 0)
		panic("Error %d setting state to %d", error, newstate);
}

static void
vm_set_rendezvous_func(struct vm *vm, vm_rendezvous_func_t func)
{

	KASSERT(mtx_owned(&vm->rendezvous_mtx), ("rendezvous_mtx not locked"));

	/*
	 * Update 'rendezvous_func' and execute a write memory barrier to
	 * ensure that it is visible across all host cpus. This is not needed
	 * for correctness but it does ensure that all the vcpus will notice
	 * that the rendezvous is requested immediately.
	 */
	vm->rendezvous_func = func;
	wmb();
}

#define	RENDEZVOUS_CTR0(vm, vcpuid, fmt)				\
	do {								\
		if (vcpuid >= 0)					\
			VCPU_CTR0(vm, vcpuid, fmt);			\
		else							\
			VM_CTR0(vm, fmt);				\
	} while (0)

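/*
 * Each vcpu named in 'rendezvous_req_cpus' runs 'rendezvous_func' and
 * then marks itself in 'rendezvous_done_cpus'. The vcpu that completes
 * the set clears the function pointer and wakes up the waiters.
 */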
static void
vm_handle_rendezvous(struct vm *vm, int vcpuid)
{

	KASSERT(vcpuid == -1 || (vcpuid >= 0 && vcpuid < VM_MAXCPU),
	    ("vm_handle_rendezvous: invalid vcpuid %d", vcpuid));

	mtx_lock(&vm->rendezvous_mtx);
	while (vm->rendezvous_func != NULL) {
		/* 'rendezvous_req_cpus' must be a subset of 'active_cpus' */
		CPU_AND(&vm->rendezvous_req_cpus, &vm->active_cpus);

		if (vcpuid != -1 &&
		    CPU_ISSET(vcpuid, &vm->rendezvous_req_cpus) &&
		    !CPU_ISSET(vcpuid, &vm->rendezvous_done_cpus)) {
			VCPU_CTR0(vm, vcpuid, "Calling rendezvous func");
			(*vm->rendezvous_func)(vm, vcpuid, vm->rendezvous_arg);
			CPU_SET(vcpuid, &vm->rendezvous_done_cpus);
		}
		if (CPU_CMP(&vm->rendezvous_req_cpus,
		    &vm->rendezvous_done_cpus) == 0) {
			VCPU_CTR0(vm, vcpuid, "Rendezvous completed");
			vm_set_rendezvous_func(vm, NULL);
			wakeup(&vm->rendezvous_func);
			break;
		}
		RENDEZVOUS_CTR0(vm, vcpuid, "Wait for rendezvous completion");
		mtx_sleep(&vm->rendezvous_func, &vm->rendezvous_mtx, 0,
		    "vmrndv", 0);
	}
	mtx_unlock(&vm->rendezvous_mtx);
}

/*
 * Emulate a guest 'hlt' by sleeping until the vcpu is ready to run.
 */
static int
vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled, bool *retu)
{
	struct vcpu *vcpu;
	const char *wmesg;
	int t, vcpu_halted, vm_halted;

	KASSERT(!CPU_ISSET(vcpuid, &vm->halted_cpus), ("vcpu already halted"));

	vcpu = &vm->vcpu[vcpuid];
	vcpu_halted = 0;
	vm_halted = 0;

	vcpu_lock(vcpu);
	while (1) {
		/*
		 * Do a final check for pending NMI or interrupts before
		 * really putting this thread to sleep. Also check for
		 * software events that would cause this vcpu to wake up.
		 *
		 * These interrupts/events could have happened after the
		 * vcpu returned from VMRUN() and before it acquired the
		 * vcpu lock above.
		 */
		if (vm->rendezvous_func != NULL || vm->suspend || vcpu->reqidle)
			break;
		if (vm_nmi_pending(vm, vcpuid))
			break;
		if (!intr_disabled) {
			if (vm_extint_pending(vm, vcpuid) ||
			    vlapic_pending_intr(vcpu->vlapic, NULL)) {
				break;
			}
		}

		/* Don't go to sleep if the vcpu thread needs to yield */
		if (vcpu_should_yield(vm, vcpuid))
			break;

		/*
		 * Some Linux guests implement "halt" by having all vcpus
		 * execute HLT with interrupts disabled. 'halted_cpus' keeps
		 * track of the vcpus that have entered this state. When all
		 * vcpus enter the halted state the virtual machine is halted.
		 */
		if (intr_disabled) {
			wmesg = "vmhalt";
			VCPU_CTR0(vm, vcpuid, "Halted");
			if (!vcpu_halted && halt_detection_enabled) {
				vcpu_halted = 1;
				CPU_SET_ATOMIC(vcpuid, &vm->halted_cpus);
			}
			if (CPU_CMP(&vm->halted_cpus, &vm->active_cpus) == 0) {
				vm_halted = 1;
				break;
			}
		} else {
			wmesg = "vmidle";
		}

		t = ticks;
		vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING);
		/*
		 * XXX msleep_spin() cannot be interrupted by signals so
		 * wake up periodically to check pending signals.
		 */
		msleep_spin(vcpu, &vcpu->mtx, wmesg, hz);
		vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN);
		vmm_stat_incr(vm, vcpuid, VCPU_IDLE_TICKS, ticks - t);
	}

	if (vcpu_halted)
		CPU_CLR_ATOMIC(vcpuid, &vm->halted_cpus);

	vcpu_unlock(vcpu);

	if (vm_halted)
		vm_suspend(vm, VM_SUSPEND_HALT);

	return (0);
}

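/*
 * Handle a nested page fault: first try to emulate accessed/dirty bit
 * updates in software; if that does not resolve it, fault the page in
 * through vm_fault() on the guest's vmspace.
 */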
static int
vm_handle_paging(struct vm *vm, int vcpuid, bool *retu)
{
	int rv, ftype;
	struct vm_map *map;
	struct vcpu *vcpu;
	struct vm_exit *vme;

	vcpu = &vm->vcpu[vcpuid];
	vme = &vcpu->exitinfo;

	KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d",
	    __func__, vme->inst_length));

	ftype = vme->u.paging.fault_type;
	KASSERT(ftype == VM_PROT_READ ||
	    ftype == VM_PROT_WRITE || ftype == VM_PROT_EXECUTE,
	    ("vm_handle_paging: invalid fault_type %d", ftype));

	if (ftype == VM_PROT_READ || ftype == VM_PROT_WRITE) {
		rv = pmap_emulate_accessed_dirty(vmspace_pmap(vm->vmspace),
		    vme->u.paging.gpa, ftype);
		if (rv == 0) {
			VCPU_CTR2(vm, vcpuid, "%s bit emulation for gpa %#lx",
			    ftype == VM_PROT_READ ? "accessed" : "dirty",
			    vme->u.paging.gpa);
			goto done;
		}
	}

	map = &vm->vmspace->vm_map;
	rv = vm_fault(map, vme->u.paging.gpa, ftype, VM_FAULT_NORMAL);

	VCPU_CTR3(vm, vcpuid, "vm_handle_paging rv = %d, gpa = %#lx, "
	    "ftype = %d", rv, vme->u.paging.gpa, ftype);

	if (rv != KERN_SUCCESS)
		return (EFAULT);
done:
	return (0);
}

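/*
 * Emulate an instruction that faulted while accessing device memory:
 * fetch and decode it if the exit did not already supply the bytes,
 * then dispatch to the in-kernel lapic/ioapic/hpet models or bounce to
 * userspace for any other address.
 */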
static int
vm_handle_inst_emul(struct vm *vm, int vcpuid, bool *retu)
{
	struct vie *vie;
	struct vcpu *vcpu;
	struct vm_exit *vme;
	uint64_t gla, gpa, cs_base;
	struct vm_guest_paging *paging;
	mem_region_read_t mread;
	mem_region_write_t mwrite;
	enum vm_cpu_mode cpu_mode;
	int cs_d, error, fault;

	vcpu = &vm->vcpu[vcpuid];
	vme = &vcpu->exitinfo;

	KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d",
	    __func__, vme->inst_length));

	gla = vme->u.inst_emul.gla;
	gpa = vme->u.inst_emul.gpa;
	cs_base = vme->u.inst_emul.cs_base;
	cs_d = vme->u.inst_emul.cs_d;
	vie = &vme->u.inst_emul.vie;
	paging = &vme->u.inst_emul.paging;
	cpu_mode = paging->cpu_mode;

	VCPU_CTR1(vm, vcpuid, "inst_emul fault accessing gpa %#lx", gpa);

	/* Fetch, decode and emulate the faulting instruction */
	if (vie->num_valid == 0) {
		error = vmm_fetch_instruction(vm, vcpuid, paging, vme->rip +
		    cs_base, VIE_INST_SIZE, vie, &fault);
	} else {
		/*
		 * The instruction bytes have already been copied into 'vie'
		 */
		error = fault = 0;
	}
	if (error || fault)
		return (error);

	if (vmm_decode_instruction(vm, vcpuid, gla, cpu_mode, cs_d, vie) != 0) {
		VCPU_CTR1(vm, vcpuid, "Error decoding instruction at %#lx",
		    vme->rip + cs_base);
		*retu = true;	    /* dump instruction bytes in userspace */
		return (0);
	}

	/*
	 * Update 'nextrip' based on the length of the emulated instruction.
	 */
	vme->inst_length = vie->num_processed;
	vcpu->nextrip += vie->num_processed;
	VCPU_CTR1(vm, vcpuid, "nextrip updated to %#lx after instruction "
	    "decoding", vcpu->nextrip);

	/* return to userland unless this is an in-kernel emulated device */
	if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) {
		mread = lapic_mmio_read;
		mwrite = lapic_mmio_write;
	} else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) {
		mread = vioapic_mmio_read;
		mwrite = vioapic_mmio_write;
	} else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) {
		mread = vhpet_mmio_read;
		mwrite = vhpet_mmio_write;
	} else {
		*retu = true;
		return (0);
	}

	error = vmm_emulate_instruction(vm, vcpuid, gpa, vie, paging,
	    mread, mwrite, retu);

	return (error);
}

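/*
 * Park this vcpu until every active vcpu has suspended itself,
 * servicing any rendezvous in the meantime to avoid deadlock, and
 * then return to userspace.
 */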
static int
vm_handle_suspend(struct vm *vm, int vcpuid, bool *retu)
{
	int i, done;
	struct vcpu *vcpu;

	done = 0;
	vcpu = &vm->vcpu[vcpuid];

	CPU_SET_ATOMIC(vcpuid, &vm->suspended_cpus);

	/*
	 * Wait until all 'active_cpus' have suspended themselves.
	 *
	 * Since a VM may be suspended at any time including when one or
	 * more vcpus are doing a rendezvous we need to call the rendezvous
	 * handler while we are waiting to prevent a deadlock.
	 */
	vcpu_lock(vcpu);
	while (1) {
		if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) {
			VCPU_CTR0(vm, vcpuid, "All vcpus suspended");
			break;
		}

		if (vm->rendezvous_func == NULL) {
			VCPU_CTR0(vm, vcpuid, "Sleeping during suspend");
			vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING);
			msleep_spin(vcpu, &vcpu->mtx, "vmsusp", hz);
			vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN);
		} else {
			VCPU_CTR0(vm, vcpuid, "Rendezvous during suspend");
			vcpu_unlock(vcpu);
			vm_handle_rendezvous(vm, vcpuid);
			vcpu_lock(vcpu);
		}
	}
	vcpu_unlock(vcpu);

	/*
	 * Wake up the other sleeping vcpus and return to userspace.
	 */
	for (i = 0; i < VM_MAXCPU; i++) {
		if (CPU_ISSET(i, &vm->suspended_cpus)) {
			vcpu_notify_event(vm, i, false);
		}
	}

	*retu = true;
	return (0);
}

1501284900Sneelstatic int
1502284900Sneelvm_handle_reqidle(struct vm *vm, int vcpuid, bool *retu)
1503284900Sneel{
1504284900Sneel	struct vcpu *vcpu = &vm->vcpu[vcpuid];
1505284900Sneel
1506284900Sneel	vcpu_lock(vcpu);
1507284900Sneel	KASSERT(vcpu->reqidle, ("invalid vcpu reqidle %d", vcpu->reqidle));
1508284900Sneel	vcpu->reqidle = 0;
1509284900Sneel	vcpu_unlock(vcpu);
1510284900Sneel	*retu = true;
1511284900Sneel	return (0);
1512284900Sneel}
1513284900Sneel
1514221828Sgrehanint
1515268935Sjhbvm_suspend(struct vm *vm, enum vm_suspend_how how)
1516268935Sjhb{
1517268935Sjhb	int i;
1518268935Sjhb
1519268935Sjhb	if (how <= VM_SUSPEND_NONE || how >= VM_SUSPEND_LAST)
1520268935Sjhb		return (EINVAL);
1521268935Sjhb
1522268935Sjhb	if (atomic_cmpset_int(&vm->suspend, 0, how) == 0) {
1523268935Sjhb		VM_CTR2(vm, "virtual machine already suspended %d/%d",
1524268935Sjhb		    vm->suspend, how);
1525268935Sjhb		return (EALREADY);
1526268935Sjhb	}
1527268935Sjhb
1528268935Sjhb	VM_CTR1(vm, "virtual machine successfully suspended %d", how);
1529268935Sjhb
1530268935Sjhb	/*
1531268935Sjhb	 * Notify all active vcpus that they are now suspended.
1532268935Sjhb	 */
1533268935Sjhb	for (i = 0; i < VM_MAXCPU; i++) {
1534268935Sjhb		if (CPU_ISSET(i, &vm->active_cpus))
1535268935Sjhb			vcpu_notify_event(vm, i, false);
1536268935Sjhb	}
1537268935Sjhb
1538268935Sjhb	return (0);
1539268935Sjhb}
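
/*
 * Example (illustrative): the triple-fault path further below calls
 * vm_suspend(vm, VM_SUSPEND_TRIPLEFAULT); other suspend reasons are
 * requested the same way, e.g. a reset request would use
 * vm_suspend(vm, VM_SUSPEND_RESET). Each active vcpu then exits to
 * userspace with VM_EXITCODE_SUSPENDED via vm_exit_suspended().
 */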

void
vm_exit_suspended(struct vm *vm, int vcpuid, uint64_t rip)
{
	struct vm_exit *vmexit;

	KASSERT(vm->suspend > VM_SUSPEND_NONE && vm->suspend < VM_SUSPEND_LAST,
	    ("vm_exit_suspended: invalid suspend type %d", vm->suspend));

	vmexit = vm_exitinfo(vm, vcpuid);
	vmexit->rip = rip;
	vmexit->inst_length = 0;
	vmexit->exitcode = VM_EXITCODE_SUSPENDED;
	vmexit->u.suspended.how = vm->suspend;
}

void
vm_exit_rendezvous(struct vm *vm, int vcpuid, uint64_t rip)
{
	struct vm_exit *vmexit;

	KASSERT(vm->rendezvous_func != NULL, ("rendezvous not in progress"));

	vmexit = vm_exitinfo(vm, vcpuid);
	vmexit->rip = rip;
	vmexit->inst_length = 0;
	vmexit->exitcode = VM_EXITCODE_RENDEZVOUS;
	vmm_stat_incr(vm, vcpuid, VMEXIT_RENDEZVOUS, 1);
}

void
vm_exit_reqidle(struct vm *vm, int vcpuid, uint64_t rip)
{
	struct vm_exit *vmexit;

	vmexit = vm_exitinfo(vm, vcpuid);
	vmexit->rip = rip;
	vmexit->inst_length = 0;
	vmexit->exitcode = VM_EXITCODE_REQIDLE;
	vmm_stat_incr(vm, vcpuid, VMEXIT_REQIDLE, 1);
}

void
vm_exit_astpending(struct vm *vm, int vcpuid, uint64_t rip)
{
	struct vm_exit *vmexit;

	vmexit = vm_exitinfo(vm, vcpuid);
	vmexit->rip = rip;
	vmexit->inst_length = 0;
	vmexit->exitcode = VM_EXITCODE_BOGUS;
	vmm_stat_incr(vm, vcpuid, VMEXIT_ASTPENDING, 1);
}

int
vm_run(struct vm *vm, struct vm_run *vmrun)
{
	struct vm_eventinfo evinfo;
	int error, vcpuid;
	struct vcpu *vcpu;
	struct pcb *pcb;
	uint64_t tscval;
	struct vm_exit *vme;
	bool retu, intr_disabled;
	pmap_t pmap;

	vcpuid = vmrun->cpuid;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		return (EINVAL);

	if (!CPU_ISSET(vcpuid, &vm->active_cpus))
		return (EINVAL);

	if (CPU_ISSET(vcpuid, &vm->suspended_cpus))
		return (EINVAL);

	pmap = vmspace_pmap(vm->vmspace);
	vcpu = &vm->vcpu[vcpuid];
	vme = &vcpu->exitinfo;
	evinfo.rptr = &vm->rendezvous_func;
	evinfo.sptr = &vm->suspend;
	evinfo.iptr = &vcpu->reqidle;
restart:
	critical_enter();

	KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active),
	    ("vm_run: absurd pm_active"));

	tscval = rdtsc();

	pcb = PCPU_GET(curpcb);
	set_pcb_flags(pcb, PCB_FULL_IRET);

	restore_guest_fpustate(vcpu);

	vcpu_require_state(vm, vcpuid, VCPU_RUNNING);
	error = VMRUN(vm->cookie, vcpuid, vcpu->nextrip, pmap, &evinfo);
	vcpu_require_state(vm, vcpuid, VCPU_FROZEN);

	save_guest_fpustate(vcpu);

	vmm_stat_incr(vm, vcpuid, VCPU_TOTAL_RUNTIME, rdtsc() - tscval);

	critical_exit();

	if (error == 0) {
		retu = false;
		vcpu->nextrip = vme->rip + vme->inst_length;
		switch (vme->exitcode) {
		case VM_EXITCODE_REQIDLE:
			error = vm_handle_reqidle(vm, vcpuid, &retu);
			break;
		case VM_EXITCODE_SUSPENDED:
			error = vm_handle_suspend(vm, vcpuid, &retu);
			break;
		case VM_EXITCODE_IOAPIC_EOI:
			vioapic_process_eoi(vm, vcpuid,
			    vme->u.ioapic_eoi.vector);
			break;
		case VM_EXITCODE_RENDEZVOUS:
			vm_handle_rendezvous(vm, vcpuid);
			error = 0;
			break;
		case VM_EXITCODE_HLT:
			intr_disabled = ((vme->u.hlt.rflags & PSL_I) == 0);
			error = vm_handle_hlt(vm, vcpuid, intr_disabled, &retu);
			break;
		case VM_EXITCODE_PAGING:
			error = vm_handle_paging(vm, vcpuid, &retu);
			break;
		case VM_EXITCODE_INST_EMUL:
			error = vm_handle_inst_emul(vm, vcpuid, &retu);
			break;
		case VM_EXITCODE_INOUT:
		case VM_EXITCODE_INOUT_STR:
			error = vm_handle_inout(vm, vcpuid, vme, &retu);
			break;
		case VM_EXITCODE_MONITOR:
		case VM_EXITCODE_MWAIT:
			vm_inject_ud(vm, vcpuid);
			break;
		default:
			retu = true;	/* handled in userland */
			break;
		}
	}

	if (error == 0 && retu == false)
		goto restart;

	VCPU_CTR2(vm, vcpuid, "retu %d/%d", error, vme->exitcode);

	/* copy the exit information */
	bcopy(vme, &vmrun->vm_exit, sizeof(struct vm_exit));
	return (error);
}
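
/*
 * Sketch of the caller shape (assumed, not defined here): the vmmdev
 * ioctl handler drives this loop from a userspace vcpu thread, roughly:
 *
 *	struct vm_run vmrun;
 *
 *	vmrun.cpuid = vcpuid;
 *	error = ioctl(vmfd, VM_RUN, &vmrun);
 *	// on return, vmrun.vm_exit holds the exit that needs userspace
 *	// attention (e.g. VM_EXITCODE_INOUT handled by a device model)
 */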

int
vm_restart_instruction(void *arg, int vcpuid)
{
	struct vm *vm;
	struct vcpu *vcpu;
	enum vcpu_state state;
	uint64_t rip;
	int error;

	vm = arg;
	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		return (EINVAL);

	vcpu = &vm->vcpu[vcpuid];
	state = vcpu_get_state(vm, vcpuid, NULL);
	if (state == VCPU_RUNNING) {
		/*
		 * When a vcpu is "running" the next instruction is determined
		 * by adding 'rip' and 'inst_length' in the vcpu's 'exitinfo'.
		 * Thus setting 'inst_length' to zero will cause the current
		 * instruction to be restarted.
		 */
		vcpu->exitinfo.inst_length = 0;
		VCPU_CTR1(vm, vcpuid, "restarting instruction at %#lx by "
		    "setting inst_length to zero", vcpu->exitinfo.rip);
	} else if (state == VCPU_FROZEN) {
		/*
		 * When a vcpu is "frozen" it is outside the critical section
		 * around VMRUN() and 'nextrip' points to the next instruction.
		 * Thus instruction restart is achieved by setting 'nextrip'
		 * to the vcpu's %rip.
		 */
		error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RIP, &rip);
		KASSERT(!error, ("%s: error %d getting rip", __func__, error));
		VCPU_CTR2(vm, vcpuid, "restarting instruction by updating "
		    "nextrip from %#lx to %#lx", vcpu->nextrip, rip);
		vcpu->nextrip = rip;
	} else {
		panic("%s: invalid state %d", __func__, state);
	}
	return (0);
}

int
vm_exit_intinfo(struct vm *vm, int vcpuid, uint64_t info)
{
	struct vcpu *vcpu;
	int type, vector;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		return (EINVAL);

	vcpu = &vm->vcpu[vcpuid];

	if (info & VM_INTINFO_VALID) {
		type = info & VM_INTINFO_TYPE;
		vector = info & 0xff;
		if (type == VM_INTINFO_NMI && vector != IDT_NMI)
			return (EINVAL);
		if (type == VM_INTINFO_HWEXCEPTION && vector >= 32)
			return (EINVAL);
		if (info & VM_INTINFO_RSVD)
			return (EINVAL);
	} else {
		info = 0;
	}
	VCPU_CTR2(vm, vcpuid, "%s: info1(%#lx)", __func__, info);
	vcpu->exitintinfo = info;
	return (0);
}

enum exc_class {
	EXC_BENIGN,
	EXC_CONTRIBUTORY,
	EXC_PAGEFAULT
};

#define	IDT_VE	20	/* Virtualization Exception (Intel specific) */

static enum exc_class
exception_class(uint64_t info)
{
	int type, vector;

	KASSERT(info & VM_INTINFO_VALID, ("intinfo must be valid: %#lx", info));
	type = info & VM_INTINFO_TYPE;
	vector = info & 0xff;

	/* Table 6-4, "Interrupt and Exception Classes", Intel SDM, Vol 3 */
	switch (type) {
	case VM_INTINFO_HWINTR:
	case VM_INTINFO_SWINTR:
	case VM_INTINFO_NMI:
		return (EXC_BENIGN);
	default:
		/*
		 * Hardware exception.
		 *
		 * SVM and VT-x use identical type values to represent NMI,
		 * hardware interrupt and software interrupt.
		 *
		 * SVM uses type '3' for all exceptions. VT-x uses type '3'
		 * for exceptions except #BP and #OF. #BP and #OF use a type
		 * value of '5' or '6'. Therefore we don't check for explicit
		 * values of 'type' to classify 'intinfo' into a hardware
		 * exception.
		 */
		break;
	}

	switch (vector) {
	case IDT_PF:
	case IDT_VE:
		return (EXC_PAGEFAULT);
	case IDT_DE:
	case IDT_TS:
	case IDT_NP:
	case IDT_SS:
	case IDT_GP:
		return (EXC_CONTRIBUTORY);
	default:
		return (EXC_BENIGN);
	}
}

static int
nested_fault(struct vm *vm, int vcpuid, uint64_t info1, uint64_t info2,
    uint64_t *retinfo)
{
	enum exc_class exc1, exc2;
	int type1, vector1;

	KASSERT(info1 & VM_INTINFO_VALID, ("info1 %#lx is not valid", info1));
	KASSERT(info2 & VM_INTINFO_VALID, ("info2 %#lx is not valid", info2));

	/*
	 * If an exception occurs while attempting to call the double-fault
	 * handler the processor enters shutdown mode (aka triple fault).
	 */
	type1 = info1 & VM_INTINFO_TYPE;
	vector1 = info1 & 0xff;
	if (type1 == VM_INTINFO_HWEXCEPTION && vector1 == IDT_DF) {
		VCPU_CTR2(vm, vcpuid, "triple fault: info1(%#lx), info2(%#lx)",
		    info1, info2);
		vm_suspend(vm, VM_SUSPEND_TRIPLEFAULT);
		*retinfo = 0;
		return (0);
	}

	/*
	 * Table 6-5 "Conditions for Generating a Double Fault", Intel SDM, Vol 3
	 */
	exc1 = exception_class(info1);
	exc2 = exception_class(info2);
	if ((exc1 == EXC_CONTRIBUTORY && exc2 == EXC_CONTRIBUTORY) ||
	    (exc1 == EXC_PAGEFAULT && exc2 != EXC_BENIGN)) {
		/* Convert nested fault into a double fault. */
		*retinfo = IDT_DF;
		*retinfo |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION;
		*retinfo |= VM_INTINFO_DEL_ERRCODE;
	} else {
		/* Handle exceptions serially */
		*retinfo = info2;
	}
	return (1);
}
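
/*
 * Worked example of the classification above: if a #GP (contributory)
 * is being delivered and a #PF is raised while doing so, the pair
 * matches neither double-fault condition, so the #PF is handled
 * serially. If instead delivery of a #PF (pagefault class) raises a
 * #GP, the pair collapses into a #DF with a zero error code.
 */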

static uint64_t
vcpu_exception_intinfo(struct vcpu *vcpu)
{
	uint64_t info = 0;

	if (vcpu->exception_pending) {
		info = vcpu->exc_vector & 0xff;
		info |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION;
		if (vcpu->exc_errcode_valid) {
			info |= VM_INTINFO_DEL_ERRCODE;
			info |= (uint64_t)vcpu->exc_errcode << 32;
		}
	}
	return (info);
}

int
vm_entry_intinfo(struct vm *vm, int vcpuid, uint64_t *retinfo)
{
	struct vcpu *vcpu;
	uint64_t info1, info2;
	int valid;

	KASSERT(vcpuid >= 0 && vcpuid < VM_MAXCPU, ("invalid vcpu %d", vcpuid));

	vcpu = &vm->vcpu[vcpuid];

	info1 = vcpu->exitintinfo;
	vcpu->exitintinfo = 0;

	info2 = 0;
	if (vcpu->exception_pending) {
		info2 = vcpu_exception_intinfo(vcpu);
		vcpu->exception_pending = 0;
		VCPU_CTR2(vm, vcpuid, "Exception %d delivered: %#lx",
		    vcpu->exc_vector, info2);
	}

	if ((info1 & VM_INTINFO_VALID) && (info2 & VM_INTINFO_VALID)) {
		valid = nested_fault(vm, vcpuid, info1, info2, retinfo);
	} else if (info1 & VM_INTINFO_VALID) {
		*retinfo = info1;
		valid = 1;
	} else if (info2 & VM_INTINFO_VALID) {
		*retinfo = info2;
		valid = 1;
	} else {
		valid = 0;
	}

	if (valid) {
		VCPU_CTR4(vm, vcpuid, "%s: info1(%#lx), info2(%#lx), "
		    "retinfo(%#lx)", __func__, info1, info2, *retinfo);
	}

	return (valid);
}

int
vm_get_intinfo(struct vm *vm, int vcpuid, uint64_t *info1, uint64_t *info2)
{
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		return (EINVAL);

	vcpu = &vm->vcpu[vcpuid];
	*info1 = vcpu->exitintinfo;
	*info2 = vcpu_exception_intinfo(vcpu);
	return (0);
}

int
vm_inject_exception(struct vm *vm, int vcpuid, int vector, int errcode_valid,
    uint32_t errcode, int restart_instruction)
{
	struct vcpu *vcpu;
	uint64_t regval;
	int error;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		return (EINVAL);

	if (vector < 0 || vector >= 32)
		return (EINVAL);

	/*
	 * A double fault exception should never be injected directly into
	 * the guest. It is a derived exception that results from specific
	 * combinations of nested faults.
	 */
	if (vector == IDT_DF)
		return (EINVAL);

	vcpu = &vm->vcpu[vcpuid];

	if (vcpu->exception_pending) {
		VCPU_CTR2(vm, vcpuid, "Unable to inject exception %d due to "
		    "pending exception %d", vector, vcpu->exc_vector);
		return (EBUSY);
	}

	if (errcode_valid) {
		/*
		 * Exceptions don't deliver an error code in real mode.
		 */
		error = vm_get_register(vm, vcpuid, VM_REG_GUEST_CR0, &regval);
		KASSERT(!error, ("%s: error %d getting CR0", __func__, error));
		if (!(regval & CR0_PE))
			errcode_valid = 0;
	}

	/*
	 * From section 26.6.1 "Interruptibility State" in Intel SDM:
	 *
	 * Event blocking by "STI" or "MOV SS" is cleared after guest executes
	 * one instruction or incurs an exception.
	 */
	error = vm_set_register(vm, vcpuid, VM_REG_GUEST_INTR_SHADOW, 0);
	KASSERT(error == 0, ("%s: error %d clearing interrupt shadow",
	    __func__, error));

	if (restart_instruction)
		vm_restart_instruction(vm, vcpuid);

	vcpu->exception_pending = 1;
	vcpu->exc_vector = vector;
	vcpu->exc_errcode = errcode;
	vcpu->exc_errcode_valid = errcode_valid;
	VCPU_CTR1(vm, vcpuid, "Exception %d pending", vector);
	return (0);
}
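
/*
 * Usage sketch: queueing a #GP(0) so that it is delivered on the next
 * entry and the faulting instruction is restarted afterwards:
 *
 *	error = vm_inject_exception(vm, vcpuid, IDT_GP, 1, 0, 1);
 *
 * vm_inject_fault() below is the convenience wrapper that always
 * requests the restart behavior.
 */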

void
vm_inject_fault(void *vmarg, int vcpuid, int vector, int errcode_valid,
    int errcode)
{
	struct vm *vm;
	int error, restart_instruction;

	vm = vmarg;
	restart_instruction = 1;

	error = vm_inject_exception(vm, vcpuid, vector, errcode_valid,
	    errcode, restart_instruction);
	KASSERT(error == 0, ("vm_inject_exception error %d", error));
}

void
vm_inject_pf(void *vmarg, int vcpuid, int error_code, uint64_t cr2)
{
	struct vm *vm;
	int error;

	vm = vmarg;
	VCPU_CTR2(vm, vcpuid, "Injecting page fault: error_code %#x, cr2 %#lx",
	    error_code, cr2);

	error = vm_set_register(vm, vcpuid, VM_REG_GUEST_CR2, cr2);
	KASSERT(error == 0, ("vm_set_register(cr2) error %d", error));

	vm_inject_fault(vm, vcpuid, IDT_PF, 1, error_code);
}

static VMM_STAT(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu");

int
vm_inject_nmi(struct vm *vm, int vcpuid)
{
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		return (EINVAL);

	vcpu = &vm->vcpu[vcpuid];

	vcpu->nmi_pending = 1;
	vcpu_notify_event(vm, vcpuid, false);
	return (0);
}

int
vm_nmi_pending(struct vm *vm, int vcpuid)
{
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		panic("vm_nmi_pending: invalid vcpuid %d", vcpuid);

	vcpu = &vm->vcpu[vcpuid];

	return (vcpu->nmi_pending);
}

void
vm_nmi_clear(struct vm *vm, int vcpuid)
{
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		panic("vm_nmi_clear: invalid vcpuid %d", vcpuid);

	vcpu = &vm->vcpu[vcpuid];

	if (vcpu->nmi_pending == 0)
		panic("vm_nmi_clear: inconsistent nmi_pending state");

	vcpu->nmi_pending = 0;
	vmm_stat_incr(vm, vcpuid, VCPU_NMI_COUNT, 1);
}

static VMM_STAT(VCPU_EXTINT_COUNT, "number of ExtINTs delivered to vcpu");

int
vm_inject_extint(struct vm *vm, int vcpuid)
{
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		return (EINVAL);

	vcpu = &vm->vcpu[vcpuid];

	vcpu->extint_pending = 1;
	vcpu_notify_event(vm, vcpuid, false);
	return (0);
}

int
vm_extint_pending(struct vm *vm, int vcpuid)
{
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		panic("vm_extint_pending: invalid vcpuid %d", vcpuid);

	vcpu = &vm->vcpu[vcpuid];

	return (vcpu->extint_pending);
}

void
vm_extint_clear(struct vm *vm, int vcpuid)
{
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		panic("vm_extint_clear: invalid vcpuid %d", vcpuid);

	vcpu = &vm->vcpu[vcpuid];

	if (vcpu->extint_pending == 0)
		panic("vm_extint_clear: inconsistent extint_pending state");

	vcpu->extint_pending = 0;
	vmm_stat_incr(vm, vcpuid, VCPU_EXTINT_COUNT, 1);
}
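
/*
 * Sketch of the consume side (it lives in the CPU backends, not in this
 * file): on each entry the backend polls and then acknowledges these
 * latches, roughly:
 *
 *	if (vm_nmi_pending(vm, vcpuid)) {
 *		// ... program the hardware to deliver an NMI ...
 *		vm_nmi_clear(vm, vcpuid);
 *	}
 *
 * The extint_pending latch is consumed the same way via
 * vm_extint_pending()/vm_extint_clear().
 */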

int
vm_get_capability(struct vm *vm, int vcpu, int type, int *retval)
{
	if (vcpu < 0 || vcpu >= VM_MAXCPU)
		return (EINVAL);

	if (type < 0 || type >= VM_CAP_MAX)
		return (EINVAL);

	return (VMGETCAP(vm->cookie, vcpu, type, retval));
}

int
vm_set_capability(struct vm *vm, int vcpu, int type, int val)
{
	if (vcpu < 0 || vcpu >= VM_MAXCPU)
		return (EINVAL);

	if (type < 0 || type >= VM_CAP_MAX)
		return (EINVAL);

	return (VMSETCAP(vm->cookie, vcpu, type, val));
}

struct vlapic *
vm_lapic(struct vm *vm, int cpu)
{
	return (vm->vcpu[cpu].vlapic);
}

struct vioapic *
vm_ioapic(struct vm *vm)
{

	return (vm->vioapic);
}

struct vhpet *
vm_hpet(struct vm *vm)
{

	return (vm->vhpet);
}

boolean_t
vmm_is_pptdev(int bus, int slot, int func)
{
	int found, i, n;
	int b, s, f;
	char *val, *cp, *cp2;

	/*
	 * XXX
	 * The length of an environment variable is limited to 128 bytes,
	 * which puts an upper limit on the number of passthru devices that
	 * may be specified using a single environment variable.
	 *
	 * Work around this by scanning multiple environment variable
	 * names instead of a single one - yuck!
	 */
	const char *names[] = { "pptdevs", "pptdevs2", "pptdevs3", NULL };

	/* set pptdevs="1/2/3 4/5/6 7/8/9 10/11/12" */
	found = 0;
	for (i = 0; names[i] != NULL && !found; i++) {
		cp = val = getenv(names[i]);
		while (cp != NULL && *cp != '\0') {
			if ((cp2 = strchr(cp, ' ')) != NULL)
				*cp2 = '\0';

			n = sscanf(cp, "%d/%d/%d", &b, &s, &f);
			if (n == 3 && bus == b && slot == s && func == f) {
				found = 1;
				break;
			}

			if (cp2 != NULL)
				*cp2++ = ' ';

			cp = cp2;
		}
		freeenv(val);
	}
	return (found);
}
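
/*
 * Example (loader.conf syntax; the bus/slot/function numbers are made
 * up): when one variable fills up, additional passthru devices simply
 * spill into the next name in the list scanned above:
 *
 *	pptdevs="2/0/0 3/0/0 4/0/0"
 *	pptdevs2="5/0/0 6/0/0"
 */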

void *
vm_iommu_domain(struct vm *vm)
{

	return (vm->iommu);
}

int
vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state newstate,
    bool from_idle)
{
	int error;
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		panic("vcpu_set_state: invalid vcpuid %d", vcpuid);

	vcpu = &vm->vcpu[vcpuid];

	vcpu_lock(vcpu);
	error = vcpu_set_state_locked(vm, vcpuid, newstate, from_idle);
	vcpu_unlock(vcpu);

	return (error);
}

enum vcpu_state
vcpu_get_state(struct vm *vm, int vcpuid, int *hostcpu)
{
	struct vcpu *vcpu;
	enum vcpu_state state;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		panic("vcpu_get_state: invalid vcpuid %d", vcpuid);

	vcpu = &vm->vcpu[vcpuid];

	vcpu_lock(vcpu);
	state = vcpu->state;
	if (hostcpu != NULL)
		*hostcpu = vcpu->hostcpu;
	vcpu_unlock(vcpu);

	return (state);
}

int
vm_activate_cpu(struct vm *vm, int vcpuid)
{

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		return (EINVAL);

	if (CPU_ISSET(vcpuid, &vm->active_cpus))
		return (EBUSY);

	VCPU_CTR0(vm, vcpuid, "activated");
	CPU_SET_ATOMIC(vcpuid, &vm->active_cpus);
	return (0);
}

cpuset_t
vm_active_cpus(struct vm *vm)
{

	return (vm->active_cpus);
}

cpuset_t
vm_suspended_cpus(struct vm *vm)
{

	return (vm->suspended_cpus);
}

void *
vcpu_stats(struct vm *vm, int vcpuid)
{

	return (vm->vcpu[vcpuid].stats);
}

int
vm_get_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state *state)
{
	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		return (EINVAL);

	*state = vm->vcpu[vcpuid].x2apic_state;

	return (0);
}

int
vm_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state)
{
	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		return (EINVAL);

	if (state >= X2APIC_STATE_LAST)
		return (EINVAL);

	vm->vcpu[vcpuid].x2apic_state = state;

	vlapic_set_x2apic_state(vm, vcpuid, state);

	return (0);
}

/*
 * This function is called to ensure that a vcpu "sees" a pending event
 * as soon as possible:
 * - If the vcpu thread is sleeping then it is woken up.
 * - If the vcpu is running on a different host_cpu then an IPI will be directed
 *   to the host_cpu to cause the vcpu to trap into the hypervisor.
 */
static void
vcpu_notify_event_locked(struct vcpu *vcpu, bool lapic_intr)
{
	int hostcpu;

	hostcpu = vcpu->hostcpu;
	if (vcpu->state == VCPU_RUNNING) {
		KASSERT(hostcpu != NOCPU, ("vcpu running on invalid hostcpu"));
		if (hostcpu != curcpu) {
			if (lapic_intr) {
				vlapic_post_intr(vcpu->vlapic, hostcpu,
				    vmm_ipinum);
			} else {
				ipi_cpu(hostcpu, vmm_ipinum);
			}
		} else {
			/*
			 * If the 'vcpu' is running on 'curcpu' then it must
			 * be sending a notification to itself (e.g. SELF_IPI).
			 * The pending event will be picked up when the vcpu
			 * transitions back to guest context.
			 */
		}
	} else {
		KASSERT(hostcpu == NOCPU, ("vcpu state %d not consistent "
		    "with hostcpu %d", vcpu->state, hostcpu));
		if (vcpu->state == VCPU_SLEEPING)
			wakeup_one(vcpu);
	}
}

void
vcpu_notify_event(struct vm *vm, int vcpuid, bool lapic_intr)
{
	struct vcpu *vcpu = &vm->vcpu[vcpuid];

	vcpu_lock(vcpu);
	vcpu_notify_event_locked(vcpu, lapic_intr);
	vcpu_unlock(vcpu);
}
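
/*
 * Usage sketch: producers in this file pair a state change with a
 * notification, e.g. vm_inject_nmi() above does
 *
 *	vcpu->nmi_pending = 1;
 *	vcpu_notify_event(vm, vcpuid, false);
 *
 * Passing 'lapic_intr' as true is used for local APIC interrupts,
 * letting vlapic_post_intr() use hardware posted interrupts when they
 * are available instead of a plain IPI.
 */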

struct vmspace *
vm_get_vmspace(struct vm *vm)
{

	return (vm->vmspace);
}

int
vm_apicid2vcpuid(struct vm *vm, int apicid)
{
	/*
	 * XXX apic id is assumed to be numerically identical to vcpu id
	 */
	return (apicid);
}

void
vm_smp_rendezvous(struct vm *vm, int vcpuid, cpuset_t dest,
    vm_rendezvous_func_t func, void *arg)
{
	int i;

	/*
	 * Enforce that this function is called without any locks
	 */
	WITNESS_WARN(WARN_PANIC, NULL, "vm_smp_rendezvous");
	KASSERT(vcpuid == -1 || (vcpuid >= 0 && vcpuid < VM_MAXCPU),
	    ("vm_smp_rendezvous: invalid vcpuid %d", vcpuid));

restart:
	mtx_lock(&vm->rendezvous_mtx);
	if (vm->rendezvous_func != NULL) {
		/*
		 * If a rendezvous is already in progress then we need to
		 * call the rendezvous handler in case this 'vcpuid' is one
		 * of the targets of the rendezvous.
		 */
		RENDEZVOUS_CTR0(vm, vcpuid, "Rendezvous already in progress");
		mtx_unlock(&vm->rendezvous_mtx);
		vm_handle_rendezvous(vm, vcpuid);
		goto restart;
	}
	KASSERT(vm->rendezvous_func == NULL, ("vm_smp_rendezvous: previous "
	    "rendezvous is still in progress"));

	RENDEZVOUS_CTR0(vm, vcpuid, "Initiating rendezvous");
	vm->rendezvous_req_cpus = dest;
	CPU_ZERO(&vm->rendezvous_done_cpus);
	vm->rendezvous_arg = arg;
	vm_set_rendezvous_func(vm, func);
	mtx_unlock(&vm->rendezvous_mtx);

	/*
	 * Wake up any sleeping vcpus and trigger a VM-exit in any running
	 * vcpus so they handle the rendezvous as soon as possible.
	 */
	for (i = 0; i < VM_MAXCPU; i++) {
		if (CPU_ISSET(i, &dest))
			vcpu_notify_event(vm, i, false);
	}

	vm_handle_rendezvous(vm, vcpuid);
}
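
/*
 * Sketch of a caller (the callback name is hypothetical; the signature
 * follows vm_rendezvous_func_t): run 'checkpoint_cb' once on every
 * active vcpu and return after all of them have executed it:
 *
 *	static void
 *	checkpoint_cb(struct vm *vm, int vcpuid, void *arg)
 *	{
 *		// per-vcpu work, runs with the vcpu out of guest context
 *	}
 *
 *	vm_smp_rendezvous(vm, vcpuid, vm_active_cpus(vm),
 *	    checkpoint_cb, NULL);
 */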

struct vatpic *
vm_atpic(struct vm *vm)
{
	return (vm->vatpic);
}

struct vatpit *
vm_atpit(struct vm *vm)
{
	return (vm->vatpit);
}

struct vpmtmr *
vm_pmtmr(struct vm *vm)
{

	return (vm->vpmtmr);
}

struct vrtc *
vm_rtc(struct vm *vm)
{

	return (vm->vrtc);
}

enum vm_reg_name
vm_segment_name(int seg)
{
	static enum vm_reg_name seg_names[] = {
		VM_REG_GUEST_ES,
		VM_REG_GUEST_CS,
		VM_REG_GUEST_SS,
		VM_REG_GUEST_DS,
		VM_REG_GUEST_FS,
		VM_REG_GUEST_GS
	};

	KASSERT(seg >= 0 && seg < nitems(seg_names),
	    ("%s: invalid segment encoding %d", __func__, seg));
	return (seg_names[seg]);
}

void
vm_copy_teardown(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo,
    int num_copyinfo)
{
	int idx;

	for (idx = 0; idx < num_copyinfo; idx++) {
		if (copyinfo[idx].cookie != NULL)
			vm_gpa_release(copyinfo[idx].cookie);
	}
	bzero(copyinfo, num_copyinfo * sizeof(struct vm_copyinfo));
}

int
vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
    uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo,
    int num_copyinfo, int *fault)
{
	int error, idx, nused;
	size_t n, off, remaining;
	void *hva, *cookie;
	uint64_t gpa;

	bzero(copyinfo, sizeof(struct vm_copyinfo) * num_copyinfo);

	nused = 0;
	remaining = len;
	while (remaining > 0) {
		KASSERT(nused < num_copyinfo, ("insufficient vm_copyinfo"));
		error = vm_gla2gpa(vm, vcpuid, paging, gla, prot, &gpa, fault);
		if (error || *fault)
			return (error);
		off = gpa & PAGE_MASK;
		n = min(remaining, PAGE_SIZE - off);
		copyinfo[nused].gpa = gpa;
		copyinfo[nused].len = n;
		remaining -= n;
		gla += n;
		nused++;
	}

	for (idx = 0; idx < nused; idx++) {
		hva = vm_gpa_hold(vm, vcpuid, copyinfo[idx].gpa,
		    copyinfo[idx].len, prot, &cookie);
		if (hva == NULL)
			break;
		copyinfo[idx].hva = hva;
		copyinfo[idx].cookie = cookie;
	}

	if (idx != nused) {
		vm_copy_teardown(vm, vcpuid, copyinfo, num_copyinfo);
		return (EFAULT);
	} else {
		*fault = 0;
		return (0);
	}
}

void
vm_copyin(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, void *kaddr,
    size_t len)
{
	char *dst;
	int idx;

	dst = kaddr;
	idx = 0;
	while (len > 0) {
		bcopy(copyinfo[idx].hva, dst, copyinfo[idx].len);
		len -= copyinfo[idx].len;
		dst += copyinfo[idx].len;
		idx++;
	}
}

void
vm_copyout(struct vm *vm, int vcpuid, const void *kaddr,
    struct vm_copyinfo *copyinfo, size_t len)
{
	const char *src;
	int idx;

	src = kaddr;
	idx = 0;
	while (len > 0) {
		bcopy(src, copyinfo[idx].hva, copyinfo[idx].len);
		len -= copyinfo[idx].len;
		src += copyinfo[idx].len;
		idx++;
	}
}
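
/*
 * Typical calling sequence (sizes are illustrative; two vm_copyinfo
 * entries cover a buffer that may straddle one page boundary):
 *
 *	struct vm_copyinfo copyinfo[2];
 *	int error, fault;
 *
 *	error = vm_copy_setup(vm, vcpuid, paging, gla, len, PROT_READ,
 *	    copyinfo, nitems(copyinfo), &fault);
 *	if (error == 0 && !fault) {
 *		vm_copyin(vm, vcpuid, copyinfo, buf, len);
 *		vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
 *	}
 */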

/*
 * Return the amount of in-use and wired memory for the VM. Since
 * these are global stats, only return the values for vCPU 0.
 */
VMM_STAT_DECLARE(VMM_MEM_RESIDENT);
VMM_STAT_DECLARE(VMM_MEM_WIRED);

static void
vm_get_rescnt(struct vm *vm, int vcpu, struct vmm_stat_type *stat)
{

	if (vcpu == 0) {
		vmm_stat_set(vm, vcpu, VMM_MEM_RESIDENT,
		    PAGE_SIZE * vmspace_resident_count(vm->vmspace));
	}
}

static void
vm_get_wiredcnt(struct vm *vm, int vcpu, struct vmm_stat_type *stat)
{

	if (vcpu == 0) {
		vmm_stat_set(vm, vcpu, VMM_MEM_WIRED,
		    PAGE_SIZE * pmap_wired_count(vmspace_pmap(vm->vmspace)));
	}
}

VMM_STAT_FUNC(VMM_MEM_RESIDENT, "Resident memory", vm_get_rescnt);
VMM_STAT_FUNC(VMM_MEM_WIRED, "Wired memory", vm_get_wiredcnt);