vmm.c revision 276429
1/*-
2 * Copyright (c) 2011 NetApp, Inc.
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 *
26 * $FreeBSD: stable/10/sys/amd64/vmm/vmm.c 276429 2014-12-30 22:22:46Z neel $
27 */
28
29#include <sys/cdefs.h>
30__FBSDID("$FreeBSD: stable/10/sys/amd64/vmm/vmm.c 276429 2014-12-30 22:22:46Z neel $");
31
32#include <sys/param.h>
33#include <sys/systm.h>
34#include <sys/kernel.h>
35#include <sys/module.h>
36#include <sys/sysctl.h>
37#include <sys/malloc.h>
38#include <sys/pcpu.h>
39#include <sys/lock.h>
40#include <sys/mutex.h>
41#include <sys/proc.h>
42#include <sys/rwlock.h>
43#include <sys/sched.h>
44#include <sys/smp.h>
45#include <sys/systm.h>
46
47#include <vm/vm.h>
48#include <vm/vm_object.h>
49#include <vm/vm_page.h>
50#include <vm/pmap.h>
51#include <vm/vm_map.h>
52#include <vm/vm_extern.h>
53#include <vm/vm_param.h>
54
55#include <machine/cpu.h>
56#include <machine/vm.h>
57#include <machine/pcb.h>
58#include <machine/smp.h>
59#include <x86/psl.h>
60#include <x86/apicreg.h>
61#include <machine/vmparam.h>
62
63#include <machine/vmm.h>
64#include <machine/vmm_dev.h>
65#include <machine/vmm_instruction_emul.h>
66
67#include "vmm_ioport.h"
68#include "vmm_ktr.h"
69#include "vmm_host.h"
70#include "vmm_mem.h"
71#include "vmm_util.h"
72#include "vatpic.h"
73#include "vatpit.h"
74#include "vhpet.h"
75#include "vioapic.h"
76#include "vlapic.h"
77#include "vpmtmr.h"
78#include "vmm_ipi.h"
79#include "vmm_stat.h"
80#include "vmm_lapic.h"
81
82#include "io/ppt.h"
83#include "io/iommu.h"
84
85struct vlapic;
86
87/*
88 * Initialization:
89 * (a) allocated when vcpu is created
90 * (i) initialized when vcpu is created and when it is reinitialized
91 * (o) initialized the first time the vcpu is created
92 * (x) initialized before use
93 */
94struct vcpu {
95	struct mtx 	mtx;		/* (o) protects 'state' and 'hostcpu' */
96	enum vcpu_state	state;		/* (o) vcpu state */
97	int		hostcpu;	/* (o) vcpu's host cpu */
98	struct vlapic	*vlapic;	/* (i) APIC device model */
99	enum x2apic_state x2apic_state;	/* (i) APIC mode */
100	uint64_t	exitintinfo;	/* (i) events pending at VM exit */
101	int		nmi_pending;	/* (i) NMI pending */
102	int		extint_pending;	/* (i) INTR pending */
103	struct vm_exception exception;	/* (x) exception collateral */
104	int	exception_pending;	/* (i) exception pending */
105	struct savefpu	*guestfpu;	/* (a,i) guest fpu state */
106	uint64_t	guest_xcr0;	/* (i) guest %xcr0 register */
107	void		*stats;		/* (a,i) statistics */
108	struct vm_exit	exitinfo;	/* (x) exit reason and collateral */
109};
110
111#define	vcpu_lock_initialized(v) mtx_initialized(&((v)->mtx))
112#define	vcpu_lock_init(v)	mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN)
113#define	vcpu_lock(v)		mtx_lock_spin(&((v)->mtx))
114#define	vcpu_unlock(v)		mtx_unlock_spin(&((v)->mtx))
115#define	vcpu_assert_locked(v)	mtx_assert(&((v)->mtx), MA_OWNED)
116
117struct mem_seg {
118	vm_paddr_t	gpa;
119	size_t		len;
120	boolean_t	wired;
121	vm_object_t	object;
122};
123#define	VM_MAX_MEMORY_SEGMENTS	2
124
125/*
126 * Initialization:
127 * (o) initialized the first time the VM is created
128 * (i) initialized when VM is created and when it is reinitialized
129 * (x) initialized before use
130 */
131struct vm {
132	void		*cookie;		/* (i) cpu-specific data */
133	void		*iommu;			/* (x) iommu-specific data */
134	struct vhpet	*vhpet;			/* (i) virtual HPET */
135	struct vioapic	*vioapic;		/* (i) virtual ioapic */
136	struct vatpic	*vatpic;		/* (i) virtual atpic */
137	struct vatpit	*vatpit;		/* (i) virtual atpit */
138	struct vpmtmr	*vpmtmr;		/* (i) virtual ACPI PM timer */
139	volatile cpuset_t active_cpus;		/* (i) active vcpus */
140	int		suspend;		/* (i) stop VM execution */
141	volatile cpuset_t suspended_cpus; 	/* (i) suspended vcpus */
142	volatile cpuset_t halted_cpus;		/* (x) cpus in a hard halt */
143	cpuset_t	rendezvous_req_cpus;	/* (x) rendezvous requested */
144	cpuset_t	rendezvous_done_cpus;	/* (x) rendezvous finished */
145	void		*rendezvous_arg;	/* (x) rendezvous func/arg */
146	vm_rendezvous_func_t rendezvous_func;
147	struct mtx	rendezvous_mtx;		/* (o) rendezvous lock */
148	int		num_mem_segs;		/* (o) guest memory segments */
149	struct mem_seg	mem_segs[VM_MAX_MEMORY_SEGMENTS];
150	struct vmspace	*vmspace;		/* (o) guest's address space */
151	char		name[VM_MAX_NAMELEN];	/* (o) virtual machine name */
152	struct vcpu	vcpu[VM_MAXCPU];	/* (i) guest vcpus */
153};
154
155static int vmm_initialized;
156
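/*
 * 'ops' points at the hardware-specific backend (VT-x or SVM) selected
 * in vmm_init(). The wrapper macros below degrade gracefully (0, NULL
 * or ENXIO) when no backend is installed.
 */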
157static struct vmm_ops *ops;
158#define	VMM_INIT(num)	(ops != NULL ? (*ops->init)(num) : 0)
159#define	VMM_CLEANUP()	(ops != NULL ? (*ops->cleanup)() : 0)
160#define	VMM_RESUME()	(ops != NULL ? (*ops->resume)() : 0)
161
162#define	VMINIT(vm, pmap) (ops != NULL ? (*ops->vminit)(vm, pmap): NULL)
163#define	VMRUN(vmi, vcpu, rip, pmap, rptr, sptr) \
164	(ops != NULL ? (*ops->vmrun)(vmi, vcpu, rip, pmap, rptr, sptr) : ENXIO)
165#define	VMCLEANUP(vmi)	(ops != NULL ? (*ops->vmcleanup)(vmi) : NULL)
166#define	VMSPACE_ALLOC(min, max) \
167	(ops != NULL ? (*ops->vmspace_alloc)(min, max) : NULL)
168#define	VMSPACE_FREE(vmspace) \
169	(ops != NULL ? (*ops->vmspace_free)(vmspace) : ENXIO)
170#define	VMGETREG(vmi, vcpu, num, retval)		\
171	(ops != NULL ? (*ops->vmgetreg)(vmi, vcpu, num, retval) : ENXIO)
172#define	VMSETREG(vmi, vcpu, num, val)		\
173	(ops != NULL ? (*ops->vmsetreg)(vmi, vcpu, num, val) : ENXIO)
174#define	VMGETDESC(vmi, vcpu, num, desc)		\
175	(ops != NULL ? (*ops->vmgetdesc)(vmi, vcpu, num, desc) : ENXIO)
176#define	VMSETDESC(vmi, vcpu, num, desc)		\
177	(ops != NULL ? (*ops->vmsetdesc)(vmi, vcpu, num, desc) : ENXIO)
178#define	VMGETCAP(vmi, vcpu, num, retval)	\
179	(ops != NULL ? (*ops->vmgetcap)(vmi, vcpu, num, retval) : ENXIO)
180#define	VMSETCAP(vmi, vcpu, num, val)		\
181	(ops != NULL ? (*ops->vmsetcap)(vmi, vcpu, num, val) : ENXIO)
182#define	VLAPIC_INIT(vmi, vcpu)			\
183	(ops != NULL ? (*ops->vlapic_init)(vmi, vcpu) : NULL)
184#define	VLAPIC_CLEANUP(vmi, vlapic)		\
185	(ops != NULL ? (*ops->vlapic_cleanup)(vmi, vlapic) : NULL)
186
187#define	fpu_start_emulating()	load_cr0(rcr0() | CR0_TS)
188#define	fpu_stop_emulating()	clts()
189
190static MALLOC_DEFINE(M_VM, "vm", "vm");
191
192/* statistics */
193static VMM_STAT(VCPU_TOTAL_RUNTIME, "vcpu total runtime");
194
195SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW, NULL, NULL);
196
197/*
198 * Halt the guest if all vcpus are executing a HLT instruction with
199 * interrupts disabled.
200 */
201static int halt_detection_enabled = 1;
202TUNABLE_INT("hw.vmm.halt_detection", &halt_detection_enabled);
203SYSCTL_INT(_hw_vmm, OID_AUTO, halt_detection, CTLFLAG_RDTUN,
204    &halt_detection_enabled, 0,
205    "Halt VM if all vcpus execute HLT with interrupts disabled");
206
207static int vmm_ipinum;
208SYSCTL_INT(_hw_vmm, OID_AUTO, ipinum, CTLFLAG_RD, &vmm_ipinum, 0,
209    "IPI vector used for vcpu notifications");
210
211static int trace_guest_exceptions;
212SYSCTL_INT(_hw_vmm, OID_AUTO, trace_guest_exceptions, CTLFLAG_RDTUN,
213    &trace_guest_exceptions, 0,
214    "Trap into hypervisor on all guest exceptions and reflect them back");
215
216static void
217vcpu_cleanup(struct vm *vm, int i, bool destroy)
218{
219	struct vcpu *vcpu = &vm->vcpu[i];
220
221	VLAPIC_CLEANUP(vm->cookie, vcpu->vlapic);
222	if (destroy) {
223		vmm_stat_free(vcpu->stats);
224		fpu_save_area_free(vcpu->guestfpu);
225	}
226}
227
228static void
229vcpu_init(struct vm *vm, int vcpu_id, bool create)
230{
231	struct vcpu *vcpu;
232
233	KASSERT(vcpu_id >= 0 && vcpu_id < VM_MAXCPU,
234	    ("vcpu_init: invalid vcpu %d", vcpu_id));
235
236	vcpu = &vm->vcpu[vcpu_id];
237
238	if (create) {
239		KASSERT(!vcpu_lock_initialized(vcpu), ("vcpu %d already "
240		    "initialized", vcpu_id));
241		vcpu_lock_init(vcpu);
242		vcpu->state = VCPU_IDLE;
243		vcpu->hostcpu = NOCPU;
244		vcpu->guestfpu = fpu_save_area_alloc();
245		vcpu->stats = vmm_stat_alloc();
246	}
247
248	vcpu->vlapic = VLAPIC_INIT(vm->cookie, vcpu_id);
249	vm_set_x2apic_state(vm, vcpu_id, X2APIC_DISABLED);
250	vcpu->exitintinfo = 0;
251	vcpu->nmi_pending = 0;
252	vcpu->extint_pending = 0;
253	vcpu->exception_pending = 0;
254	vcpu->guest_xcr0 = XFEATURE_ENABLED_X87;
255	fpu_save_area_reset(vcpu->guestfpu);
256	vmm_stat_init(vcpu->stats);
257}
258
259int
260vcpu_trace_exceptions(struct vm *vm, int vcpuid)
261{
262
263	return (trace_guest_exceptions);
264}
265
266struct vm_exit *
267vm_exitinfo(struct vm *vm, int cpuid)
268{
269	struct vcpu *vcpu;
270
271	if (cpuid < 0 || cpuid >= VM_MAXCPU)
272		panic("vm_exitinfo: invalid cpuid %d", cpuid);
273
274	vcpu = &vm->vcpu[cpuid];
275
276	return (&vcpu->exitinfo);
277}
278
279static void
280vmm_resume(void)
281{
282	VMM_RESUME();
283}
284
285static int
286vmm_init(void)
287{
288	int error;
289
290	vmm_host_state_init();
291
292	vmm_ipinum = vmm_ipi_alloc();
293	if (vmm_ipinum == 0)
294		vmm_ipinum = IPI_AST;
295
296	error = vmm_mem_init();
297	if (error)
298		return (error);
299
300	if (vmm_is_intel())
301		ops = &vmm_ops_intel;
302	else if (vmm_is_amd())
303		ops = &vmm_ops_amd;
304	else
305		return (ENXIO);
306
307	vmm_resume_p = vmm_resume;
308
309	return (VMM_INIT(vmm_ipinum));
310}
311
312static int
313vmm_handler(module_t mod, int what, void *arg)
314{
315	int error;
316
317	switch (what) {
318	case MOD_LOAD:
319		vmmdev_init();
320		if (ppt_avail_devices() > 0)
321			iommu_init();
322		error = vmm_init();
323		if (error == 0)
324			vmm_initialized = 1;
325		break;
326	case MOD_UNLOAD:
327		error = vmmdev_cleanup();
328		if (error == 0) {
329			vmm_resume_p = NULL;
330			iommu_cleanup();
331			if (vmm_ipinum != IPI_AST)
332				vmm_ipi_free(vmm_ipinum);
333			error = VMM_CLEANUP();
334			/*
335			 * Something bad happened - prevent new
336			 * VMs from being created
337			 */
338			if (error)
339				vmm_initialized = 0;
340		}
341		break;
342	default:
343		error = 0;
344		break;
345	}
346	return (error);
347}
348
349static moduledata_t vmm_kmod = {
350	"vmm",
351	vmm_handler,
352	NULL
353};
354
355/*
356 * vmm initialization has the following dependencies:
357 *
358 * - iommu initialization must happen after the pci passthru driver has had
359 *   a chance to attach to any passthru devices (after SI_SUB_CONFIGURE).
360 *
361 * - VT-x initialization requires smp_rendezvous() and therefore must happen
362 *   after SMP is fully functional (after SI_SUB_SMP).
363 */
364DECLARE_MODULE(vmm, vmm_kmod, SI_SUB_SMP + 1, SI_ORDER_ANY);
365MODULE_VERSION(vmm, 1);
366
367static void
368vm_init(struct vm *vm, bool create)
369{
370	int i;
371
372	vm->cookie = VMINIT(vm, vmspace_pmap(vm->vmspace));
373	vm->iommu = NULL;
374	vm->vioapic = vioapic_init(vm);
375	vm->vhpet = vhpet_init(vm);
376	vm->vatpic = vatpic_init(vm);
377	vm->vatpit = vatpit_init(vm);
378	vm->vpmtmr = vpmtmr_init(vm);
379
380	CPU_ZERO(&vm->active_cpus);
381
382	vm->suspend = 0;
383	CPU_ZERO(&vm->suspended_cpus);
384
385	for (i = 0; i < VM_MAXCPU; i++)
386		vcpu_init(vm, i, create);
387}
388
389int
390vm_create(const char *name, struct vm **retvm)
391{
392	struct vm *vm;
393	struct vmspace *vmspace;
394
395	/*
396	 * If vmm.ko could not be successfully initialized then don't attempt
397	 * to create the virtual machine.
398	 */
399	if (!vmm_initialized)
400		return (ENXIO);
401
402	if (name == NULL || strlen(name) >= VM_MAX_NAMELEN)
403		return (EINVAL);
404
405	vmspace = VMSPACE_ALLOC(0, VM_MAXUSER_ADDRESS);
406	if (vmspace == NULL)
407		return (ENOMEM);
408
409	vm = malloc(sizeof(struct vm), M_VM, M_WAITOK | M_ZERO);
410	strcpy(vm->name, name);
411	vm->num_mem_segs = 0;
412	vm->vmspace = vmspace;
413	mtx_init(&vm->rendezvous_mtx, "vm rendezvous lock", 0, MTX_DEF);
414
415	vm_init(vm, true);
416
417	*retvm = vm;
418	return (0);
419}
420
421static void
422vm_free_mem_seg(struct vm *vm, struct mem_seg *seg)
423{
424
425	if (seg->object != NULL)
426		vmm_mem_free(vm->vmspace, seg->gpa, seg->len);
427
428	bzero(seg, sizeof(*seg));
429}
430
431static void
432vm_cleanup(struct vm *vm, bool destroy)
433{
434	int i;
435
436	ppt_unassign_all(vm);
437
438	if (vm->iommu != NULL)
439		iommu_destroy_domain(vm->iommu);
440
441	vpmtmr_cleanup(vm->vpmtmr);
442	vatpit_cleanup(vm->vatpit);
443	vhpet_cleanup(vm->vhpet);
444	vatpic_cleanup(vm->vatpic);
445	vioapic_cleanup(vm->vioapic);
446
447	for (i = 0; i < VM_MAXCPU; i++)
448		vcpu_cleanup(vm, i, destroy);
449
450	VMCLEANUP(vm->cookie);
451
452	if (destroy) {
453		for (i = 0; i < vm->num_mem_segs; i++)
454			vm_free_mem_seg(vm, &vm->mem_segs[i]);
455
456		vm->num_mem_segs = 0;
457
458		VMSPACE_FREE(vm->vmspace);
459		vm->vmspace = NULL;
460	}
461}
462
463void
464vm_destroy(struct vm *vm)
465{
466	vm_cleanup(vm, true);
467	free(vm, M_VM);
468}
469
470int
471vm_reinit(struct vm *vm)
472{
473	int error;
474
475	/*
476	 * A virtual machine can be reset only if all vcpus are suspended.
477	 */
478	if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) {
479		vm_cleanup(vm, false);
480		vm_init(vm, false);
481		error = 0;
482	} else {
483		error = EBUSY;
484	}
485
486	return (error);
487}
488
489const char *
490vm_name(struct vm *vm)
491{
492	return (vm->name);
493}
494
495int
496vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
497{
498	vm_object_t obj;
499
500	if ((obj = vmm_mmio_alloc(vm->vmspace, gpa, len, hpa)) == NULL)
501		return (ENOMEM);
502	else
503		return (0);
504}
505
506int
507vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len)
508{
509
510	vmm_mmio_free(vm->vmspace, gpa, len);
511	return (0);
512}
513
514boolean_t
515vm_mem_allocated(struct vm *vm, vm_paddr_t gpa)
516{
517	int i;
518	vm_paddr_t gpabase, gpalimit;
519
520	for (i = 0; i < vm->num_mem_segs; i++) {
521		gpabase = vm->mem_segs[i].gpa;
522		gpalimit = gpabase + vm->mem_segs[i].len;
523		if (gpa >= gpabase && gpa < gpalimit)
524			return (TRUE);		/* 'gpa' is regular memory */
525	}
526
527	if (ppt_is_mmio(vm, gpa))
528		return (TRUE);			/* 'gpa' is pci passthru mmio */
529
530	return (FALSE);
531}
532
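/*
 * Back the guest physical range [gpa, gpa + len) with a new memory
 * segment. The range must be page-aligned; a range that is already
 * fully allocated is a no-op and a partially allocated range is an
 * error.
 */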
533int
534vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len)
535{
536	int available, allocated;
537	struct mem_seg *seg;
538	vm_object_t object;
539	vm_paddr_t g;
540
541	if ((gpa & PAGE_MASK) || (len & PAGE_MASK) || len == 0)
542		return (EINVAL);
543
544	available = allocated = 0;
545	g = gpa;
546	while (g < gpa + len) {
547		if (vm_mem_allocated(vm, g))
548			allocated++;
549		else
550			available++;
551
552		g += PAGE_SIZE;
553	}
554
555	/*
556	 * If there are some allocated and some available pages in the address
557	 * range then it is an error.
558	 */
559	if (allocated && available)
560		return (EINVAL);
561
562	/*
563	 * If the entire address range being requested has already been
564	 * allocated then there isn't anything more to do.
565	 */
566	if (allocated && available == 0)
567		return (0);
568
569	if (vm->num_mem_segs >= VM_MAX_MEMORY_SEGMENTS)
570		return (E2BIG);
571
572	seg = &vm->mem_segs[vm->num_mem_segs];
573
574	if ((object = vmm_mem_alloc(vm->vmspace, gpa, len)) == NULL)
575		return (ENOMEM);
576
577	seg->gpa = gpa;
578	seg->len = len;
579	seg->object = object;
580	seg->wired = FALSE;
581
582	vm->num_mem_segs++;
583
584	return (0);
585}
586
587static vm_paddr_t
588vm_maxmem(struct vm *vm)
589{
590	int i;
591	vm_paddr_t gpa, maxmem;
592
593	maxmem = 0;
594	for (i = 0; i < vm->num_mem_segs; i++) {
595		gpa = vm->mem_segs[i].gpa + vm->mem_segs[i].len;
596		if (gpa > maxmem)
597			maxmem = gpa;
598	}
599	return (maxmem);
600}
601
602static void
603vm_gpa_unwire(struct vm *vm)
604{
605	int i, rv;
606	struct mem_seg *seg;
607
608	for (i = 0; i < vm->num_mem_segs; i++) {
609		seg = &vm->mem_segs[i];
610		if (!seg->wired)
611			continue;
612
613		rv = vm_map_unwire(&vm->vmspace->vm_map,
614				   seg->gpa, seg->gpa + seg->len,
615				   VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
616		KASSERT(rv == KERN_SUCCESS, ("vm(%s) memory segment "
617		    "%#lx/%ld could not be unwired: %d",
618		    vm_name(vm), seg->gpa, seg->len, rv));
619
620		seg->wired = FALSE;
621	}
622}
623
624static int
625vm_gpa_wire(struct vm *vm)
626{
627	int i, rv;
628	struct mem_seg *seg;
629
630	for (i = 0; i < vm->num_mem_segs; i++) {
631		seg = &vm->mem_segs[i];
632		if (seg->wired)
633			continue;
634
635		/* XXX rlimits? */
636		rv = vm_map_wire(&vm->vmspace->vm_map,
637				 seg->gpa, seg->gpa + seg->len,
638				 VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
639		if (rv != KERN_SUCCESS)
640			break;
641
642		seg->wired = TRUE;
643	}
644
645	if (i < vm->num_mem_segs) {
646		/*
647		 * Undo the wiring before returning an error.
648		 */
649		vm_gpa_unwire(vm);
650		return (EAGAIN);
651	}
652
653	return (0);
654}
655
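/*
 * Establish ('map') or tear down (!'map') the gpa->hpa translations for
 * all wired guest memory in the VM's IOMMU domain, moving each page
 * between the host domain and the guest domain.
 */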
656static void
657vm_iommu_modify(struct vm *vm, boolean_t map)
658{
659	int i, sz;
660	vm_paddr_t gpa, hpa;
661	struct mem_seg *seg;
662	void *vp, *cookie, *host_domain;
663
664	sz = PAGE_SIZE;
665	host_domain = iommu_host_domain();
666
667	for (i = 0; i < vm->num_mem_segs; i++) {
668		seg = &vm->mem_segs[i];
669		KASSERT(seg->wired, ("vm(%s) memory segment %#lx/%ld not wired",
670		    vm_name(vm), seg->gpa, seg->len));
671
672		gpa = seg->gpa;
673		while (gpa < seg->gpa + seg->len) {
674			vp = vm_gpa_hold(vm, gpa, PAGE_SIZE, VM_PROT_WRITE,
675					 &cookie);
676			KASSERT(vp != NULL, ("vm(%s) could not map gpa %#lx",
677			    vm_name(vm), gpa));
678
679			vm_gpa_release(cookie);
680
681			hpa = DMAP_TO_PHYS((uintptr_t)vp);
682			if (map) {
683				iommu_create_mapping(vm->iommu, gpa, hpa, sz);
684				iommu_remove_mapping(host_domain, hpa, sz);
685			} else {
686				iommu_remove_mapping(vm->iommu, gpa, sz);
687				iommu_create_mapping(host_domain, hpa, hpa, sz);
688			}
689
690			gpa += PAGE_SIZE;
691		}
692	}
693
694	/*
695	 * Invalidate the cached translations associated with the domain
696	 * from which pages were removed.
697	 */
698	if (map)
699		iommu_invalidate_tlb(host_domain);
700	else
701		iommu_invalidate_tlb(vm->iommu);
702}
703
704#define	vm_iommu_unmap(vm)	vm_iommu_modify((vm), FALSE)
705#define	vm_iommu_map(vm)	vm_iommu_modify((vm), TRUE)
706
707int
708vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func)
709{
710	int error;
711
712	error = ppt_unassign_device(vm, bus, slot, func);
713	if (error)
714		return (error);
715
716	if (ppt_assigned_devices(vm) == 0) {
717		vm_iommu_unmap(vm);
718		vm_gpa_unwire(vm);
719	}
720	return (0);
721}
722
723int
724vm_assign_pptdev(struct vm *vm, int bus, int slot, int func)
725{
726	int error;
727	vm_paddr_t maxaddr;
728
729	/*
730	 * Virtual machines with pci passthru devices get special treatment:
731	 * - the guest physical memory is wired
732	 * - the iommu is programmed to do the 'gpa' to 'hpa' translation
733	 *
734	 * We need to do this before the first pci passthru device is attached.
735	 */
736	if (ppt_assigned_devices(vm) == 0) {
737		KASSERT(vm->iommu == NULL,
738		    ("vm_assign_pptdev: iommu must be NULL"));
739		maxaddr = vm_maxmem(vm);
740		vm->iommu = iommu_create_domain(maxaddr);
741
742		error = vm_gpa_wire(vm);
743		if (error)
744			return (error);
745
746		vm_iommu_map(vm);
747	}
748
749	error = ppt_assign_device(vm, bus, slot, func);
750	return (error);
751}
752
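/*
 * Fault in (if necessary) and hold a single page of guest physical
 * memory and return a pointer to it in the host's direct map. The hold
 * is dropped by passing '*cookie' to vm_gpa_release().
 */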
753void *
754vm_gpa_hold(struct vm *vm, vm_paddr_t gpa, size_t len, int reqprot,
755	    void **cookie)
756{
757	int count, pageoff;
758	vm_page_t m;
759
760	pageoff = gpa & PAGE_MASK;
761	if (len > PAGE_SIZE - pageoff)
762		panic("vm_gpa_hold: invalid gpa/len: 0x%016lx/%lu", gpa, len);
763
764	count = vm_fault_quick_hold_pages(&vm->vmspace->vm_map,
765	    trunc_page(gpa), PAGE_SIZE, reqprot, &m, 1);
766
767	if (count == 1) {
768		*cookie = m;
769		return ((void *)(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)) + pageoff));
770	} else {
771		*cookie = NULL;
772		return (NULL);
773	}
774}
775
776void
777vm_gpa_release(void *cookie)
778{
779	vm_page_t m = cookie;
780
781	vm_page_lock(m);
782	vm_page_unhold(m);
783	vm_page_unlock(m);
784}
785
786int
787vm_gpabase2memseg(struct vm *vm, vm_paddr_t gpabase,
788		  struct vm_memory_segment *seg)
789{
790	int i;
791
792	for (i = 0; i < vm->num_mem_segs; i++) {
793		if (gpabase == vm->mem_segs[i].gpa) {
794			seg->gpa = vm->mem_segs[i].gpa;
795			seg->len = vm->mem_segs[i].len;
796			seg->wired = vm->mem_segs[i].wired;
797			return (0);
798		}
799	}
800	return (-1);
801}
802
803int
804vm_get_memobj(struct vm *vm, vm_paddr_t gpa, size_t len,
805	      vm_offset_t *offset, struct vm_object **object)
806{
807	int i;
808	size_t seg_len;
809	vm_paddr_t seg_gpa;
810	vm_object_t seg_obj;
811
812	for (i = 0; i < vm->num_mem_segs; i++) {
813		if ((seg_obj = vm->mem_segs[i].object) == NULL)
814			continue;
815
816		seg_gpa = vm->mem_segs[i].gpa;
817		seg_len = vm->mem_segs[i].len;
818
819		if (gpa >= seg_gpa && gpa < seg_gpa + seg_len) {
820			*offset = gpa - seg_gpa;
821			*object = seg_obj;
822			vm_object_reference(seg_obj);
823			return (0);
824		}
825	}
826
827	return (EINVAL);
828}
829
830int
831vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval)
832{
833
834	if (vcpu < 0 || vcpu >= VM_MAXCPU)
835		return (EINVAL);
836
837	if (reg >= VM_REG_LAST)
838		return (EINVAL);
839
840	return (VMGETREG(vm->cookie, vcpu, reg, retval));
841}
842
843int
844vm_set_register(struct vm *vm, int vcpu, int reg, uint64_t val)
845{
846
847	if (vcpu < 0 || vcpu >= VM_MAXCPU)
848		return (EINVAL);
849
850	if (reg >= VM_REG_LAST)
851		return (EINVAL);
852
853	return (VMSETREG(vm->cookie, vcpu, reg, val));
854}
855
856static boolean_t
857is_descriptor_table(int reg)
858{
859
860	switch (reg) {
861	case VM_REG_GUEST_IDTR:
862	case VM_REG_GUEST_GDTR:
863		return (TRUE);
864	default:
865		return (FALSE);
866	}
867}
868
869static boolean_t
870is_segment_register(int reg)
871{
872
873	switch (reg) {
874	case VM_REG_GUEST_ES:
875	case VM_REG_GUEST_CS:
876	case VM_REG_GUEST_SS:
877	case VM_REG_GUEST_DS:
878	case VM_REG_GUEST_FS:
879	case VM_REG_GUEST_GS:
880	case VM_REG_GUEST_TR:
881	case VM_REG_GUEST_LDTR:
882		return (TRUE);
883	default:
884		return (FALSE);
885	}
886}
887
888int
889vm_get_seg_desc(struct vm *vm, int vcpu, int reg,
890		struct seg_desc *desc)
891{
892
893	if (vcpu < 0 || vcpu >= VM_MAXCPU)
894		return (EINVAL);
895
896	if (!is_segment_register(reg) && !is_descriptor_table(reg))
897		return (EINVAL);
898
899	return (VMGETDESC(vm->cookie, vcpu, reg, desc));
900}
901
902int
903vm_set_seg_desc(struct vm *vm, int vcpu, int reg,
904		struct seg_desc *desc)
905{
906	if (vcpu < 0 || vcpu >= VM_MAXCPU)
907		return (EINVAL);
908
909	if (!is_segment_register(reg) && !is_descriptor_table(reg))
910		return (EINVAL);
911
912	return (VMSETDESC(vm->cookie, vcpu, reg, desc));
913}
914
915static void
916restore_guest_fpustate(struct vcpu *vcpu)
917{
918
919	/* flush host state to the pcb */
920	fpuexit(curthread);
921
922	/* restore guest FPU state */
923	fpu_stop_emulating();
924	fpurestore(vcpu->guestfpu);
925
926	/* restore guest XCR0 if XSAVE is enabled in the host */
927	if (rcr4() & CR4_XSAVE)
928		load_xcr(0, vcpu->guest_xcr0);
929
930	/*
931	 * The FPU is now "dirty" with the guest's state so turn on emulation
932	 * to trap any access to the FPU by the host.
933	 */
934	fpu_start_emulating();
935}
936
937static void
938save_guest_fpustate(struct vcpu *vcpu)
939{
940
941	if ((rcr0() & CR0_TS) == 0)
942		panic("fpu emulation not enabled in host!");
943
944	/* save guest XCR0 and restore host XCR0 */
945	if (rcr4() & CR4_XSAVE) {
946		vcpu->guest_xcr0 = rxcr(0);
947		load_xcr(0, vmm_get_host_xcr0());
948	}
949
950	/* save guest FPU state */
951	fpu_stop_emulating();
952	fpusave(vcpu->guestfpu);
953	fpu_start_emulating();
954}
955
956static VMM_STAT(VCPU_IDLE_TICKS, "number of ticks vcpu was idle");
957
958static int
959vcpu_set_state_locked(struct vcpu *vcpu, enum vcpu_state newstate,
960    bool from_idle)
961{
962	int error;
963
964	vcpu_assert_locked(vcpu);
965
966	/*
967	 * State transitions from the vmmdev_ioctl() must always begin from
968	 * the VCPU_IDLE state. This guarantees that there is only a single
969	 * ioctl() operating on a vcpu at any point.
970	 */
971	if (from_idle) {
972		while (vcpu->state != VCPU_IDLE)
973			msleep_spin(&vcpu->state, &vcpu->mtx, "vmstat", hz);
974	} else {
975		KASSERT(vcpu->state != VCPU_IDLE, ("invalid transition from "
976		    "vcpu idle state"));
977	}
978
979	if (vcpu->state == VCPU_RUNNING) {
980		KASSERT(vcpu->hostcpu == curcpu, ("curcpu %d and hostcpu %d "
981		    "mismatch for running vcpu", curcpu, vcpu->hostcpu));
982	} else {
983		KASSERT(vcpu->hostcpu == NOCPU, ("Invalid hostcpu %d for a "
984		    "vcpu that is not running", vcpu->hostcpu));
985	}
986
987	/*
988	 * The following state transitions are allowed:
989	 * IDLE -> FROZEN -> IDLE
990	 * FROZEN -> RUNNING -> FROZEN
991	 * FROZEN -> SLEEPING -> FROZEN
992	 */
993	switch (vcpu->state) {
994	case VCPU_IDLE:
995	case VCPU_RUNNING:
996	case VCPU_SLEEPING:
997		error = (newstate != VCPU_FROZEN);
998		break;
999	case VCPU_FROZEN:
1000		error = (newstate == VCPU_FROZEN);
1001		break;
1002	default:
1003		error = 1;
1004		break;
1005	}
1006
1007	if (error)
1008		return (EBUSY);
1009
1010	vcpu->state = newstate;
1011	if (newstate == VCPU_RUNNING)
1012		vcpu->hostcpu = curcpu;
1013	else
1014		vcpu->hostcpu = NOCPU;
1015
1016	if (newstate == VCPU_IDLE)
1017		wakeup(&vcpu->state);
1018
1019	return (0);
1020}
1021
1022static void
1023vcpu_require_state(struct vm *vm, int vcpuid, enum vcpu_state newstate)
1024{
1025	int error;
1026
1027	if ((error = vcpu_set_state(vm, vcpuid, newstate, false)) != 0)
1028		panic("Error %d setting state to %d\n", error, newstate);
1029}
1030
1031static void
1032vcpu_require_state_locked(struct vcpu *vcpu, enum vcpu_state newstate)
1033{
1034	int error;
1035
1036	if ((error = vcpu_set_state_locked(vcpu, newstate, false)) != 0)
1037		panic("Error %d setting state to %d", error, newstate);
1038}
1039
1040static void
1041vm_set_rendezvous_func(struct vm *vm, vm_rendezvous_func_t func)
1042{
1043
1044	KASSERT(mtx_owned(&vm->rendezvous_mtx), ("rendezvous_mtx not locked"));
1045
1046	/*
1047	 * Update 'rendezvous_func' and execute a write memory barrier to
1048	 * ensure that it is visible across all host cpus. This is not needed
1049	 * for correctness but it does ensure that all the vcpus will notice
1050	 * that the rendezvous is requested immediately.
1051	 */
1052	vm->rendezvous_func = func;
1053	wmb();
1054}
1055
1056#define	RENDEZVOUS_CTR0(vm, vcpuid, fmt)				\
1057	do {								\
1058		if (vcpuid >= 0)					\
1059			VCPU_CTR0(vm, vcpuid, fmt);			\
1060		else							\
1061			VM_CTR0(vm, fmt);				\
1062	} while (0)
1063
1064static void
1065vm_handle_rendezvous(struct vm *vm, int vcpuid)
1066{
1067
1068	KASSERT(vcpuid == -1 || (vcpuid >= 0 && vcpuid < VM_MAXCPU),
1069	    ("vm_handle_rendezvous: invalid vcpuid %d", vcpuid));
1070
1071	mtx_lock(&vm->rendezvous_mtx);
1072	while (vm->rendezvous_func != NULL) {
1073		/* 'rendezvous_req_cpus' must be a subset of 'active_cpus' */
1074		CPU_AND(&vm->rendezvous_req_cpus, &vm->active_cpus);
1075
1076		if (vcpuid != -1 &&
1077		    CPU_ISSET(vcpuid, &vm->rendezvous_req_cpus) &&
1078		    !CPU_ISSET(vcpuid, &vm->rendezvous_done_cpus)) {
1079			VCPU_CTR0(vm, vcpuid, "Calling rendezvous func");
1080			(*vm->rendezvous_func)(vm, vcpuid, vm->rendezvous_arg);
1081			CPU_SET(vcpuid, &vm->rendezvous_done_cpus);
1082		}
1083		if (CPU_CMP(&vm->rendezvous_req_cpus,
1084		    &vm->rendezvous_done_cpus) == 0) {
1085			VCPU_CTR0(vm, vcpuid, "Rendezvous completed");
1086			vm_set_rendezvous_func(vm, NULL);
1087			wakeup(&vm->rendezvous_func);
1088			break;
1089		}
1090		RENDEZVOUS_CTR0(vm, vcpuid, "Wait for rendezvous completion");
1091		mtx_sleep(&vm->rendezvous_func, &vm->rendezvous_mtx, 0,
1092		    "vmrndv", 0);
1093	}
1094	mtx_unlock(&vm->rendezvous_mtx);
1095}
1096
1097/*
1098 * Emulate a guest 'hlt' by sleeping until the vcpu is ready to run.
1099 */
1100static int
1101vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled, bool *retu)
1102{
1103	struct vcpu *vcpu;
1104	const char *wmesg;
1105	int error, t, vcpu_halted, vm_halted;
1106
1107	KASSERT(!CPU_ISSET(vcpuid, &vm->halted_cpus), ("vcpu already halted"));
1108
1109	vcpu = &vm->vcpu[vcpuid];
1110	vcpu_halted = 0;
1111	vm_halted = 0;
1112
1113	/*
1114	 * The typical way to halt a cpu is to execute: "sti; hlt"
1115	 *
1116	 * STI sets RFLAGS.IF to enable interrupts. However, the processor
1117	 * remains in an "interrupt shadow" for an additional instruction
1118	 * following the STI. This guarantees that the "sti; hlt" sequence is
1119	 * atomic and a pending interrupt will be recognized after the HLT.
1120	 *
1121	 * After the HLT emulation is done the vcpu is no longer in an
1122	 * interrupt shadow and a pending interrupt can be injected on
1123	 * the next entry into the guest.
1124	 */
1125	error = vm_set_register(vm, vcpuid, VM_REG_GUEST_INTR_SHADOW, 0);
1126	KASSERT(error == 0, ("%s: error %d clearing interrupt shadow",
1127	    __func__, error));
1128
1129	vcpu_lock(vcpu);
1130	while (1) {
1131		/*
1132		 * Do a final check for pending NMI or interrupts before
1133		 * really putting this thread to sleep. Also check for
1134		 * software events that would cause this vcpu to wake up.
1135		 *
1136		 * These interrupts/events could have happened after the
1137		 * vcpu returned from VMRUN() and before it acquired the
1138		 * vcpu lock above.
1139		 */
1140		if (vm->rendezvous_func != NULL || vm->suspend)
1141			break;
1142		if (vm_nmi_pending(vm, vcpuid))
1143			break;
1144		if (!intr_disabled) {
1145			if (vm_extint_pending(vm, vcpuid) ||
1146			    vlapic_pending_intr(vcpu->vlapic, NULL)) {
1147				break;
1148			}
1149		}
1150
1151		/* Don't go to sleep if the vcpu thread needs to yield */
1152		if (vcpu_should_yield(vm, vcpuid))
1153			break;
1154
1155		/*
1156		 * Some Linux guests implement "halt" by having all vcpus
1157		 * execute HLT with interrupts disabled. 'halted_cpus' keeps
1158		 * track of the vcpus that have entered this state. When all
1159		 * vcpus enter the halted state the virtual machine is halted.
1160		 */
1161		if (intr_disabled) {
1162			wmesg = "vmhalt";
1163			VCPU_CTR0(vm, vcpuid, "Halted");
1164			if (!vcpu_halted && halt_detection_enabled) {
1165				vcpu_halted = 1;
1166				CPU_SET_ATOMIC(vcpuid, &vm->halted_cpus);
1167			}
1168			if (CPU_CMP(&vm->halted_cpus, &vm->active_cpus) == 0) {
1169				vm_halted = 1;
1170				break;
1171			}
1172		} else {
1173			wmesg = "vmidle";
1174		}
1175
1176		t = ticks;
1177		vcpu_require_state_locked(vcpu, VCPU_SLEEPING);
1178		/*
1179		 * XXX msleep_spin() cannot be interrupted by signals so
1180		 * wake up periodically to check pending signals.
1181		 */
1182		msleep_spin(vcpu, &vcpu->mtx, wmesg, hz);
1183		vcpu_require_state_locked(vcpu, VCPU_FROZEN);
1184		vmm_stat_incr(vm, vcpuid, VCPU_IDLE_TICKS, ticks - t);
1185	}
1186
1187	if (vcpu_halted)
1188		CPU_CLR_ATOMIC(vcpuid, &vm->halted_cpus);
1189
1190	vcpu_unlock(vcpu);
1191
1192	if (vm_halted)
1193		vm_suspend(vm, VM_SUSPEND_HALT);
1194
1195	return (0);
1196}
1197
1198static int
1199vm_handle_paging(struct vm *vm, int vcpuid, bool *retu)
1200{
1201	int rv, ftype;
1202	struct vm_map *map;
1203	struct vcpu *vcpu;
1204	struct vm_exit *vme;
1205
1206	vcpu = &vm->vcpu[vcpuid];
1207	vme = &vcpu->exitinfo;
1208
1209	ftype = vme->u.paging.fault_type;
1210	KASSERT(ftype == VM_PROT_READ ||
1211	    ftype == VM_PROT_WRITE || ftype == VM_PROT_EXECUTE,
1212	    ("vm_handle_paging: invalid fault_type %d", ftype));
1213
1214	if (ftype == VM_PROT_READ || ftype == VM_PROT_WRITE) {
1215		rv = pmap_emulate_accessed_dirty(vmspace_pmap(vm->vmspace),
1216		    vme->u.paging.gpa, ftype);
1217		if (rv == 0) {
1218			VCPU_CTR2(vm, vcpuid, "%s bit emulation for gpa %#lx",
1219			    ftype == VM_PROT_READ ? "accessed" : "dirty",
1220			    vme->u.paging.gpa);
1221			goto done;
1222		}
1223	}
1224
1225	map = &vm->vmspace->vm_map;
1226	rv = vm_fault(map, vme->u.paging.gpa, ftype, VM_FAULT_NORMAL);
1227
1228	VCPU_CTR3(vm, vcpuid, "vm_handle_paging rv = %d, gpa = %#lx, "
1229	    "ftype = %d", rv, vme->u.paging.gpa, ftype);
1230
1231	if (rv != KERN_SUCCESS)
1232		return (EFAULT);
1233done:
1234	/* restart execution at the faulting instruction */
1235	vme->inst_length = 0;
1236
1237	return (0);
1238}
1239
1240static int
1241vm_handle_inst_emul(struct vm *vm, int vcpuid, bool *retu)
1242{
1243	struct vie *vie;
1244	struct vcpu *vcpu;
1245	struct vm_exit *vme;
1246	uint64_t gla, gpa;
1247	struct vm_guest_paging *paging;
1248	mem_region_read_t mread;
1249	mem_region_write_t mwrite;
1250	enum vm_cpu_mode cpu_mode;
1251	int cs_d, error, length;
1252
1253	vcpu = &vm->vcpu[vcpuid];
1254	vme = &vcpu->exitinfo;
1255
1256	gla = vme->u.inst_emul.gla;
1257	gpa = vme->u.inst_emul.gpa;
1258	cs_d = vme->u.inst_emul.cs_d;
1259	vie = &vme->u.inst_emul.vie;
1260	paging = &vme->u.inst_emul.paging;
1261	cpu_mode = paging->cpu_mode;
1262
1263	VCPU_CTR1(vm, vcpuid, "inst_emul fault accessing gpa %#lx", gpa);
1264
1265	/* Fetch, decode and emulate the faulting instruction */
1266	if (vie->num_valid == 0) {
1267		/*
1268		 * If the instruction length is not known then assume a
1269		 * maximum size instruction.
1270		 */
1271		length = vme->inst_length ? vme->inst_length : VIE_INST_SIZE;
1272		error = vmm_fetch_instruction(vm, vcpuid, paging, vme->rip,
1273		    length, vie);
1274	} else {
1275		/*
1276		 * The instruction bytes have already been copied into 'vie'
1277		 */
1278		error = 0;
1279	}
1280	if (error == 1)
1281		return (0);		/* Resume guest to handle page fault */
1282	else if (error == -1)
1283		return (EFAULT);
1284	else if (error != 0)
1285		panic("%s: vmm_fetch_instruction error %d", __func__, error);
1286
1287	if (vmm_decode_instruction(vm, vcpuid, gla, cpu_mode, cs_d, vie) != 0)
1288		return (EFAULT);
1289
1290	/*
1291	 * If the instruction length is not specified then update it now.
1292	 */
1293	if (vme->inst_length == 0)
1294		vme->inst_length = vie->num_processed;
1295
1296	/* return to userland unless this is an in-kernel emulated device */
1297	if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) {
1298		mread = lapic_mmio_read;
1299		mwrite = lapic_mmio_write;
1300	} else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) {
1301		mread = vioapic_mmio_read;
1302		mwrite = vioapic_mmio_write;
1303	} else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) {
1304		mread = vhpet_mmio_read;
1305		mwrite = vhpet_mmio_write;
1306	} else {
1307		*retu = true;
1308		return (0);
1309	}
1310
1311	error = vmm_emulate_instruction(vm, vcpuid, gpa, vie, paging,
1312	    mread, mwrite, retu);
1313
1314	return (error);
1315}
1316
1317static int
1318vm_handle_suspend(struct vm *vm, int vcpuid, bool *retu)
1319{
1320	int i, done;
1321	struct vcpu *vcpu;
1322
1323	done = 0;
1324	vcpu = &vm->vcpu[vcpuid];
1325
1326	CPU_SET_ATOMIC(vcpuid, &vm->suspended_cpus);
1327
1328	/*
1329	 * Wait until all 'active_cpus' have suspended themselves.
1330	 *
1331	 * Since a VM may be suspended at any time including when one or
1332	 * more vcpus are doing a rendezvous we need to call the rendezvous
1333	 * handler while we are waiting to prevent a deadlock.
1334	 */
1335	vcpu_lock(vcpu);
1336	while (1) {
1337		if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) {
1338			VCPU_CTR0(vm, vcpuid, "All vcpus suspended");
1339			break;
1340		}
1341
1342		if (vm->rendezvous_func == NULL) {
1343			VCPU_CTR0(vm, vcpuid, "Sleeping during suspend");
1344			vcpu_require_state_locked(vcpu, VCPU_SLEEPING);
1345			msleep_spin(vcpu, &vcpu->mtx, "vmsusp", hz);
1346			vcpu_require_state_locked(vcpu, VCPU_FROZEN);
1347		} else {
1348			VCPU_CTR0(vm, vcpuid, "Rendezvous during suspend");
1349			vcpu_unlock(vcpu);
1350			vm_handle_rendezvous(vm, vcpuid);
1351			vcpu_lock(vcpu);
1352		}
1353	}
1354	vcpu_unlock(vcpu);
1355
1356	/*
1357	 * Wakeup the other sleeping vcpus and return to userspace.
1358	 */
1359	for (i = 0; i < VM_MAXCPU; i++) {
1360		if (CPU_ISSET(i, &vm->suspended_cpus)) {
1361			vcpu_notify_event(vm, i, false);
1362		}
1363	}
1364
1365	*retu = true;
1366	return (0);
1367}
1368
1369int
1370vm_suspend(struct vm *vm, enum vm_suspend_how how)
1371{
1372	int i;
1373
1374	if (how <= VM_SUSPEND_NONE || how >= VM_SUSPEND_LAST)
1375		return (EINVAL);
1376
1377	if (atomic_cmpset_int(&vm->suspend, 0, how) == 0) {
1378		VM_CTR2(vm, "virtual machine already suspended %d/%d",
1379		    vm->suspend, how);
1380		return (EALREADY);
1381	}
1382
1383	VM_CTR1(vm, "virtual machine successfully suspended %d", how);
1384
1385	/*
1386	 * Notify all active vcpus that they are now suspended.
1387	 */
1388	for (i = 0; i < VM_MAXCPU; i++) {
1389		if (CPU_ISSET(i, &vm->active_cpus))
1390			vcpu_notify_event(vm, i, false);
1391	}
1392
1393	return (0);
1394}
1395
1396void
1397vm_exit_suspended(struct vm *vm, int vcpuid, uint64_t rip)
1398{
1399	struct vm_exit *vmexit;
1400
1401	KASSERT(vm->suspend > VM_SUSPEND_NONE && vm->suspend < VM_SUSPEND_LAST,
1402	    ("vm_exit_suspended: invalid suspend type %d", vm->suspend));
1403
1404	vmexit = vm_exitinfo(vm, vcpuid);
1405	vmexit->rip = rip;
1406	vmexit->inst_length = 0;
1407	vmexit->exitcode = VM_EXITCODE_SUSPENDED;
1408	vmexit->u.suspended.how = vm->suspend;
1409}
1410
1411void
1412vm_exit_rendezvous(struct vm *vm, int vcpuid, uint64_t rip)
1413{
1414	struct vm_exit *vmexit;
1415
1416	KASSERT(vm->rendezvous_func != NULL, ("rendezvous not in progress"));
1417
1418	vmexit = vm_exitinfo(vm, vcpuid);
1419	vmexit->rip = rip;
1420	vmexit->inst_length = 0;
1421	vmexit->exitcode = VM_EXITCODE_RENDEZVOUS;
1422	vmm_stat_incr(vm, vcpuid, VMEXIT_RENDEZVOUS, 1);
1423}
1424
1425void
1426vm_exit_astpending(struct vm *vm, int vcpuid, uint64_t rip)
1427{
1428	struct vm_exit *vmexit;
1429
1430	vmexit = vm_exitinfo(vm, vcpuid);
1431	vmexit->rip = rip;
1432	vmexit->inst_length = 0;
1433	vmexit->exitcode = VM_EXITCODE_BOGUS;
1434	vmm_stat_incr(vm, vcpuid, VMEXIT_ASTPENDING, 1);
1435}
1436
1437int
1438vm_run(struct vm *vm, struct vm_run *vmrun)
1439{
1440	int error, vcpuid;
1441	struct vcpu *vcpu;
1442	struct pcb *pcb;
1443	uint64_t tscval, rip;
1444	struct vm_exit *vme;
1445	bool retu, intr_disabled;
1446	pmap_t pmap;
1447	void *rptr, *sptr;
1448
1449	vcpuid = vmrun->cpuid;
1450
1451	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1452		return (EINVAL);
1453
1454	if (!CPU_ISSET(vcpuid, &vm->active_cpus))
1455		return (EINVAL);
1456
1457	if (CPU_ISSET(vcpuid, &vm->suspended_cpus))
1458		return (EINVAL);
1459
1460	rptr = &vm->rendezvous_func;
1461	sptr = &vm->suspend;
1462	pmap = vmspace_pmap(vm->vmspace);
1463	vcpu = &vm->vcpu[vcpuid];
1464	vme = &vcpu->exitinfo;
1465	rip = vmrun->rip;
1466restart:
1467	critical_enter();
1468
1469	KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active),
1470	    ("vm_run: absurd pm_active"));
1471
1472	tscval = rdtsc();
1473
1474	pcb = PCPU_GET(curpcb);
1475	set_pcb_flags(pcb, PCB_FULL_IRET);
1476
1477	restore_guest_fpustate(vcpu);
1478
1479	vcpu_require_state(vm, vcpuid, VCPU_RUNNING);
1480	error = VMRUN(vm->cookie, vcpuid, rip, pmap, rptr, sptr);
1481	vcpu_require_state(vm, vcpuid, VCPU_FROZEN);
1482
1483	save_guest_fpustate(vcpu);
1484
1485	vmm_stat_incr(vm, vcpuid, VCPU_TOTAL_RUNTIME, rdtsc() - tscval);
1486
1487	critical_exit();
1488
1489	if (error == 0) {
1490		retu = false;
1491		switch (vme->exitcode) {
1492		case VM_EXITCODE_SUSPENDED:
1493			error = vm_handle_suspend(vm, vcpuid, &retu);
1494			break;
1495		case VM_EXITCODE_IOAPIC_EOI:
1496			vioapic_process_eoi(vm, vcpuid,
1497			    vme->u.ioapic_eoi.vector);
1498			break;
1499		case VM_EXITCODE_RENDEZVOUS:
1500			vm_handle_rendezvous(vm, vcpuid);
1501			error = 0;
1502			break;
1503		case VM_EXITCODE_HLT:
1504			intr_disabled = ((vme->u.hlt.rflags & PSL_I) == 0);
1505			error = vm_handle_hlt(vm, vcpuid, intr_disabled, &retu);
1506			break;
1507		case VM_EXITCODE_PAGING:
1508			error = vm_handle_paging(vm, vcpuid, &retu);
1509			break;
1510		case VM_EXITCODE_INST_EMUL:
1511			error = vm_handle_inst_emul(vm, vcpuid, &retu);
1512			break;
1513		case VM_EXITCODE_INOUT:
1514		case VM_EXITCODE_INOUT_STR:
1515			error = vm_handle_inout(vm, vcpuid, vme, &retu);
1516			break;
1517		case VM_EXITCODE_MONITOR:
1518		case VM_EXITCODE_MWAIT:
1519			vm_inject_ud(vm, vcpuid);
1520			break;
1521		default:
1522			retu = true;	/* handled in userland */
1523			break;
1524		}
1525	}
1526
1527	if (error == 0 && retu == false) {
1528		rip = vme->rip + vme->inst_length;
1529		goto restart;
1530	}
1531
1532	/* copy the exit information */
1533	bcopy(vme, &vmrun->vm_exit, sizeof(struct vm_exit));
1534	return (error);
1535}
1536
1537int
1538vm_exit_intinfo(struct vm *vm, int vcpuid, uint64_t info)
1539{
1540	struct vcpu *vcpu;
1541	int type, vector;
1542
1543	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1544		return (EINVAL);
1545
1546	vcpu = &vm->vcpu[vcpuid];
1547
1548	if (info & VM_INTINFO_VALID) {
1549		type = info & VM_INTINFO_TYPE;
1550		vector = info & 0xff;
1551		if (type == VM_INTINFO_NMI && vector != IDT_NMI)
1552			return (EINVAL);
1553		if (type == VM_INTINFO_HWEXCEPTION && vector >= 32)
1554			return (EINVAL);
1555		if (info & VM_INTINFO_RSVD)
1556			return (EINVAL);
1557	} else {
1558		info = 0;
1559	}
1560	VCPU_CTR2(vm, vcpuid, "%s: info1(%#lx)", __func__, info);
1561	vcpu->exitintinfo = info;
1562	return (0);
1563}
1564
1565enum exc_class {
1566	EXC_BENIGN,
1567	EXC_CONTRIBUTORY,
1568	EXC_PAGEFAULT
1569};
1570
1571#define	IDT_VE	20	/* Virtualization Exception (Intel specific) */
1572
1573static enum exc_class
1574exception_class(uint64_t info)
1575{
1576	int type, vector;
1577
1578	KASSERT(info & VM_INTINFO_VALID, ("intinfo must be valid: %#lx", info));
1579	type = info & VM_INTINFO_TYPE;
1580	vector = info & 0xff;
1581
1582	/* Table 6-4, "Interrupt and Exception Classes", Intel SDM, Vol 3 */
1583	switch (type) {
1584	case VM_INTINFO_HWINTR:
1585	case VM_INTINFO_SWINTR:
1586	case VM_INTINFO_NMI:
1587		return (EXC_BENIGN);
1588	default:
1589		/*
1590		 * Hardware exception.
1591		 *
1592		 * SVM and VT-x use identical type values to represent NMI,
1593		 * hardware interrupt and software interrupt.
1594		 *
1595		 * SVM uses type '3' for all exceptions. VT-x uses type '3'
1596		 * for exceptions except #BP and #OF. #BP and #OF use a type
1597		 * value of '5' or '6'. Therefore we don't check for explicit
1598		 * values of 'type' to classify 'intinfo' into a hardware
1599		 * exception.
1600		 */
1601		break;
1602	}
1603
1604	switch (vector) {
1605	case IDT_PF:
1606	case IDT_VE:
1607		return (EXC_PAGEFAULT);
1608	case IDT_DE:
1609	case IDT_TS:
1610	case IDT_NP:
1611	case IDT_SS:
1612	case IDT_GP:
1613		return (EXC_CONTRIBUTORY);
1614	default:
1615		return (EXC_BENIGN);
1616	}
1617}
1618
1619static int
1620nested_fault(struct vm *vm, int vcpuid, uint64_t info1, uint64_t info2,
1621    uint64_t *retinfo)
1622{
1623	enum exc_class exc1, exc2;
1624	int type1, vector1;
1625
1626	KASSERT(info1 & VM_INTINFO_VALID, ("info1 %#lx is not valid", info1));
1627	KASSERT(info2 & VM_INTINFO_VALID, ("info2 %#lx is not valid", info2));
1628
1629	/*
1630	 * If an exception occurs while attempting to call the double-fault
1631	 * handler the processor enters shutdown mode (aka triple fault).
1632	 */
1633	type1 = info1 & VM_INTINFO_TYPE;
1634	vector1 = info1 & 0xff;
1635	if (type1 == VM_INTINFO_HWEXCEPTION && vector1 == IDT_DF) {
1636		VCPU_CTR2(vm, vcpuid, "triple fault: info1(%#lx), info2(%#lx)",
1637		    info1, info2);
1638		vm_suspend(vm, VM_SUSPEND_TRIPLEFAULT);
1639		*retinfo = 0;
1640		return (0);
1641	}
1642
1643	/*
1644	 * Table 6-5 "Conditions for Generating a Double Fault", Intel SDM, Vol3
1645	 */
1646	exc1 = exception_class(info1);
1647	exc2 = exception_class(info2);
1648	if ((exc1 == EXC_CONTRIBUTORY && exc2 == EXC_CONTRIBUTORY) ||
1649	    (exc1 == EXC_PAGEFAULT && exc2 != EXC_BENIGN)) {
1650		/* Convert nested fault into a double fault. */
1651		*retinfo = IDT_DF;
1652		*retinfo |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION;
1653		*retinfo |= VM_INTINFO_DEL_ERRCODE;
1654	} else {
1655		/* Handle exceptions serially */
1656		*retinfo = info2;
1657	}
1658	return (1);
1659}
1660
1661static uint64_t
1662vcpu_exception_intinfo(struct vcpu *vcpu)
1663{
1664	uint64_t info = 0;
1665
1666	if (vcpu->exception_pending) {
1667		info = vcpu->exception.vector & 0xff;
1668		info |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION;
1669		if (vcpu->exception.error_code_valid) {
1670			info |= VM_INTINFO_DEL_ERRCODE;
1671			info |= (uint64_t)vcpu->exception.error_code << 32;
1672		}
1673	}
1674	return (info);
1675}
1676
1677int
1678vm_entry_intinfo(struct vm *vm, int vcpuid, uint64_t *retinfo)
1679{
1680	struct vcpu *vcpu;
1681	uint64_t info1, info2;
1682	int valid;
1683
1684	KASSERT(vcpuid >= 0 && vcpuid < VM_MAXCPU, ("invalid vcpu %d", vcpuid));
1685
1686	vcpu = &vm->vcpu[vcpuid];
1687
1688	info1 = vcpu->exitintinfo;
1689	vcpu->exitintinfo = 0;
1690
1691	info2 = 0;
1692	if (vcpu->exception_pending) {
1693		info2 = vcpu_exception_intinfo(vcpu);
1694		vcpu->exception_pending = 0;
1695		VCPU_CTR2(vm, vcpuid, "Exception %d delivered: %#lx",
1696		    vcpu->exception.vector, info2);
1697	}
1698
1699	if ((info1 & VM_INTINFO_VALID) && (info2 & VM_INTINFO_VALID)) {
1700		valid = nested_fault(vm, vcpuid, info1, info2, retinfo);
1701	} else if (info1 & VM_INTINFO_VALID) {
1702		*retinfo = info1;
1703		valid = 1;
1704	} else if (info2 & VM_INTINFO_VALID) {
1705		*retinfo = info2;
1706		valid = 1;
1707	} else {
1708		valid = 0;
1709	}
1710
1711	if (valid) {
1712		VCPU_CTR4(vm, vcpuid, "%s: info1(%#lx), info2(%#lx), "
1713		    "retinfo(%#lx)", __func__, info1, info2, *retinfo);
1714	}
1715
1716	return (valid);
1717}
1718
1719int
1720vm_get_intinfo(struct vm *vm, int vcpuid, uint64_t *info1, uint64_t *info2)
1721{
1722	struct vcpu *vcpu;
1723
1724	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1725		return (EINVAL);
1726
1727	vcpu = &vm->vcpu[vcpuid];
1728	*info1 = vcpu->exitintinfo;
1729	*info2 = vcpu_exception_intinfo(vcpu);
1730	return (0);
1731}
1732
1733int
1734vm_inject_exception(struct vm *vm, int vcpuid, struct vm_exception *exception)
1735{
1736	struct vcpu *vcpu;
1737
1738	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1739		return (EINVAL);
1740
1741	if (exception->vector < 0 || exception->vector >= 32)
1742		return (EINVAL);
1743
1744	/*
1745	 * A double fault exception should never be injected directly into
1746	 * the guest. It is a derived exception that results from specific
1747	 * combinations of nested faults.
1748	 */
1749	if (exception->vector == IDT_DF)
1750		return (EINVAL);
1751
1752	vcpu = &vm->vcpu[vcpuid];
1753
1754	if (vcpu->exception_pending) {
1755		VCPU_CTR2(vm, vcpuid, "Unable to inject exception %d due to "
1756		    "pending exception %d", exception->vector,
1757		    vcpu->exception.vector);
1758		return (EBUSY);
1759	}
1760
1761	vcpu->exception_pending = 1;
1762	vcpu->exception = *exception;
1763	VCPU_CTR1(vm, vcpuid, "Exception %d pending", exception->vector);
1764	return (0);
1765}
1766
1767void
1768vm_inject_fault(void *vmarg, int vcpuid, int vector, int errcode_valid,
1769    int errcode)
1770{
1771	struct vm_exception exception;
1772	struct vm_exit *vmexit;
1773	struct vm *vm;
1774	int error;
1775
1776	vm = vmarg;
1777
1778	exception.vector = vector;
1779	exception.error_code = errcode;
1780	exception.error_code_valid = errcode_valid;
1781	error = vm_inject_exception(vm, vcpuid, &exception);
1782	KASSERT(error == 0, ("vm_inject_exception error %d", error));
1783
1784	/*
1785	 * A fault-like exception allows the instruction to be restarted
1786	 * after the exception handler returns.
1787	 *
1788	 * By setting the inst_length to 0 we ensure that the instruction
1789	 * pointer remains at the faulting instruction.
1790	 */
1791	vmexit = vm_exitinfo(vm, vcpuid);
1792	vmexit->inst_length = 0;
1793}
1794
1795void
1796vm_inject_pf(void *vmarg, int vcpuid, int error_code, uint64_t cr2)
1797{
1798	struct vm *vm;
1799	int error;
1800
1801	vm = vmarg;
1802	VCPU_CTR2(vm, vcpuid, "Injecting page fault: error_code %#x, cr2 %#lx",
1803	    error_code, cr2);
1804
1805	error = vm_set_register(vm, vcpuid, VM_REG_GUEST_CR2, cr2);
1806	KASSERT(error == 0, ("vm_set_register(cr2) error %d", error));
1807
1808	vm_inject_fault(vm, vcpuid, IDT_PF, 1, error_code);
1809}
1810
1811static VMM_STAT(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu");
1812
1813int
1814vm_inject_nmi(struct vm *vm, int vcpuid)
1815{
1816	struct vcpu *vcpu;
1817
1818	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1819		return (EINVAL);
1820
1821	vcpu = &vm->vcpu[vcpuid];
1822
1823	vcpu->nmi_pending = 1;
1824	vcpu_notify_event(vm, vcpuid, false);
1825	return (0);
1826}
1827
1828int
1829vm_nmi_pending(struct vm *vm, int vcpuid)
1830{
1831	struct vcpu *vcpu;
1832
1833	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1834		panic("vm_nmi_pending: invalid vcpuid %d", vcpuid);
1835
1836	vcpu = &vm->vcpu[vcpuid];
1837
1838	return (vcpu->nmi_pending);
1839}
1840
1841void
1842vm_nmi_clear(struct vm *vm, int vcpuid)
1843{
1844	struct vcpu *vcpu;
1845
1846	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1847		panic("vm_nmi_clear: invalid vcpuid %d", vcpuid);
1848
1849	vcpu = &vm->vcpu[vcpuid];
1850
1851	if (vcpu->nmi_pending == 0)
1852		panic("vm_nmi_clear: inconsistent nmi_pending state");
1853
1854	vcpu->nmi_pending = 0;
1855	vmm_stat_incr(vm, vcpuid, VCPU_NMI_COUNT, 1);
1856}
1857
1858static VMM_STAT(VCPU_EXTINT_COUNT, "number of ExtINTs delivered to vcpu");
1859
1860int
1861vm_inject_extint(struct vm *vm, int vcpuid)
1862{
1863	struct vcpu *vcpu;
1864
1865	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1866		return (EINVAL);
1867
1868	vcpu = &vm->vcpu[vcpuid];
1869
1870	vcpu->extint_pending = 1;
1871	vcpu_notify_event(vm, vcpuid, false);
1872	return (0);
1873}
1874
1875int
1876vm_extint_pending(struct vm *vm, int vcpuid)
1877{
1878	struct vcpu *vcpu;
1879
1880	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1881		panic("vm_extint_pending: invalid vcpuid %d", vcpuid);
1882
1883	vcpu = &vm->vcpu[vcpuid];
1884
1885	return (vcpu->extint_pending);
1886}
1887
1888void
1889vm_extint_clear(struct vm *vm, int vcpuid)
1890{
1891	struct vcpu *vcpu;
1892
1893	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1894		panic("vm_extint_clear: invalid vcpuid %d", vcpuid);
1895
1896	vcpu = &vm->vcpu[vcpuid];
1897
1898	if (vcpu->extint_pending == 0)
1899		panic("vm_extint_clear: inconsistent extint_pending state");
1900
1901	vcpu->extint_pending = 0;
1902	vmm_stat_incr(vm, vcpuid, VCPU_EXTINT_COUNT, 1);
1903}
1904
1905int
1906vm_get_capability(struct vm *vm, int vcpu, int type, int *retval)
1907{
1908	if (vcpu < 0 || vcpu >= VM_MAXCPU)
1909		return (EINVAL);
1910
1911	if (type < 0 || type >= VM_CAP_MAX)
1912		return (EINVAL);
1913
1914	return (VMGETCAP(vm->cookie, vcpu, type, retval));
1915}
1916
1917int
1918vm_set_capability(struct vm *vm, int vcpu, int type, int val)
1919{
1920	if (vcpu < 0 || vcpu >= VM_MAXCPU)
1921		return (EINVAL);
1922
1923	if (type < 0 || type >= VM_CAP_MAX)
1924		return (EINVAL);
1925
1926	return (VMSETCAP(vm->cookie, vcpu, type, val));
1927}
1928
1929struct vlapic *
1930vm_lapic(struct vm *vm, int cpu)
1931{
1932	return (vm->vcpu[cpu].vlapic);
1933}
1934
1935struct vioapic *
1936vm_ioapic(struct vm *vm)
1937{
1938
1939	return (vm->vioapic);
1940}
1941
1942struct vhpet *
1943vm_hpet(struct vm *vm)
1944{
1945
1946	return (vm->vhpet);
1947}
1948
1949boolean_t
1950vmm_is_pptdev(int bus, int slot, int func)
1951{
1952	int found, i, n;
1953	int b, s, f;
1954	char *val, *cp, *cp2;
1955
1956	/*
1957	 * XXX
1958	 * The length of an environment variable is limited to 128 bytes which
1959	 * puts an upper limit on the number of passthru devices that may be
1960	 * specified using a single environment variable.
1961	 *
1962	 * Work around this by scanning multiple environment variable
1963	 * names instead of a single one - yuck!
1964	 */
1965	const char *names[] = { "pptdevs", "pptdevs2", "pptdevs3", NULL };
1966
1967	/* set pptdevs="1/2/3 4/5/6 7/8/9 10/11/12" */
1968	found = 0;
1969	for (i = 0; names[i] != NULL && !found; i++) {
1970		cp = val = getenv(names[i]);
1971		while (cp != NULL && *cp != '\0') {
1972			if ((cp2 = strchr(cp, ' ')) != NULL)
1973				*cp2 = '\0';
1974
1975			n = sscanf(cp, "%d/%d/%d", &b, &s, &f);
1976			if (n == 3 && bus == b && slot == s && func == f) {
1977				found = 1;
1978				break;
1979			}
1980
1981			if (cp2 != NULL)
1982				*cp2++ = ' ';
1983
1984			cp = cp2;
1985		}
1986		freeenv(val);
1987	}
1988	return (found);
1989}
1990
1991void *
1992vm_iommu_domain(struct vm *vm)
1993{
1994
1995	return (vm->iommu);
1996}
1997
1998int
1999vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state newstate,
2000    bool from_idle)
2001{
2002	int error;
2003	struct vcpu *vcpu;
2004
2005	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
2006		panic("vcpu_set_state: invalid vcpuid %d", vcpuid);
2007
2008	vcpu = &vm->vcpu[vcpuid];
2009
2010	vcpu_lock(vcpu);
2011	error = vcpu_set_state_locked(vcpu, newstate, from_idle);
2012	vcpu_unlock(vcpu);
2013
2014	return (error);
2015}
2016
2017enum vcpu_state
2018vcpu_get_state(struct vm *vm, int vcpuid, int *hostcpu)
2019{
2020	struct vcpu *vcpu;
2021	enum vcpu_state state;
2022
2023	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
2024		panic("vcpu_get_state: invalid vcpuid %d", vcpuid);
2025
2026	vcpu = &vm->vcpu[vcpuid];
2027
2028	vcpu_lock(vcpu);
2029	state = vcpu->state;
2030	if (hostcpu != NULL)
2031		*hostcpu = vcpu->hostcpu;
2032	vcpu_unlock(vcpu);
2033
2034	return (state);
2035}
2036
2037int
2038vm_activate_cpu(struct vm *vm, int vcpuid)
2039{
2040
2041	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
2042		return (EINVAL);
2043
2044	if (CPU_ISSET(vcpuid, &vm->active_cpus))
2045		return (EBUSY);
2046
2047	VCPU_CTR0(vm, vcpuid, "activated");
2048	CPU_SET_ATOMIC(vcpuid, &vm->active_cpus);
2049	return (0);
2050}
2051
2052cpuset_t
2053vm_active_cpus(struct vm *vm)
2054{
2055
2056	return (vm->active_cpus);
2057}
2058
2059cpuset_t
2060vm_suspended_cpus(struct vm *vm)
2061{
2062
2063	return (vm->suspended_cpus);
2064}
2065
2066void *
2067vcpu_stats(struct vm *vm, int vcpuid)
2068{
2069
2070	return (vm->vcpu[vcpuid].stats);
2071}
2072
2073int
2074vm_get_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state *state)
2075{
2076	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
2077		return (EINVAL);
2078
2079	*state = vm->vcpu[vcpuid].x2apic_state;
2080
2081	return (0);
2082}
2083
2084int
2085vm_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state)
2086{
2087	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
2088		return (EINVAL);
2089
2090	if (state >= X2APIC_STATE_LAST)
2091		return (EINVAL);
2092
2093	vm->vcpu[vcpuid].x2apic_state = state;
2094
2095	vlapic_set_x2apic_state(vm, vcpuid, state);
2096
2097	return (0);
2098}
2099
2100/*
2101 * This function is called to ensure that a vcpu "sees" a pending event
2102 * as soon as possible:
2103 * - If the vcpu thread is sleeping then it is woken up.
2104 * - If the vcpu is running on a different host_cpu then an IPI will be directed
2105 *   to the host_cpu to cause the vcpu to trap into the hypervisor.
2106 */
2107void
2108vcpu_notify_event(struct vm *vm, int vcpuid, bool lapic_intr)
2109{
2110	int hostcpu;
2111	struct vcpu *vcpu;
2112
2113	vcpu = &vm->vcpu[vcpuid];
2114
2115	vcpu_lock(vcpu);
2116	hostcpu = vcpu->hostcpu;
2117	if (vcpu->state == VCPU_RUNNING) {
2118		KASSERT(hostcpu != NOCPU, ("vcpu running on invalid hostcpu"));
2119		if (hostcpu != curcpu) {
2120			if (lapic_intr) {
2121				vlapic_post_intr(vcpu->vlapic, hostcpu,
2122				    vmm_ipinum);
2123			} else {
2124				ipi_cpu(hostcpu, vmm_ipinum);
2125			}
2126		} else {
2127			/*
2128			 * If the 'vcpu' is running on 'curcpu' then it must
2129			 * be sending a notification to itself (e.g. SELF_IPI).
2130			 * The pending event will be picked up when the vcpu
2131			 * transitions back to guest context.
2132			 */
2133		}
2134	} else {
2135		KASSERT(hostcpu == NOCPU, ("vcpu state %d not consistent "
2136		    "with hostcpu %d", vcpu->state, hostcpu));
2137		if (vcpu->state == VCPU_SLEEPING)
2138			wakeup_one(vcpu);
2139	}
2140	vcpu_unlock(vcpu);
2141}
2142
2143struct vmspace *
2144vm_get_vmspace(struct vm *vm)
2145{
2146
2147	return (vm->vmspace);
2148}
2149
2150int
2151vm_apicid2vcpuid(struct vm *vm, int apicid)
2152{
2153	/*
2154	 * XXX apic id is assumed to be numerically identical to vcpu id
2155	 */
2156	return (apicid);
2157}
2158
2159void
2160vm_smp_rendezvous(struct vm *vm, int vcpuid, cpuset_t dest,
2161    vm_rendezvous_func_t func, void *arg)
2162{
2163	int i;
2164
2165	/*
2166	 * Enforce that this function is called without any locks
2167	 */
2168	WITNESS_WARN(WARN_PANIC, NULL, "vm_smp_rendezvous");
2169	KASSERT(vcpuid == -1 || (vcpuid >= 0 && vcpuid < VM_MAXCPU),
2170	    ("vm_smp_rendezvous: invalid vcpuid %d", vcpuid));
2171
2172restart:
2173	mtx_lock(&vm->rendezvous_mtx);
2174	if (vm->rendezvous_func != NULL) {
2175		/*
2176		 * If a rendezvous is already in progress then we need to
2177		 * call the rendezvous handler in case this 'vcpuid' is one
2178		 * of the targets of the rendezvous.
2179		 */
2180		RENDEZVOUS_CTR0(vm, vcpuid, "Rendezvous already in progress");
2181		mtx_unlock(&vm->rendezvous_mtx);
2182		vm_handle_rendezvous(vm, vcpuid);
2183		goto restart;
2184	}
2185	KASSERT(vm->rendezvous_func == NULL, ("vm_smp_rendezvous: previous "
2186	    "rendezvous is still in progress"));
2187
2188	RENDEZVOUS_CTR0(vm, vcpuid, "Initiating rendezvous");
2189	vm->rendezvous_req_cpus = dest;
2190	CPU_ZERO(&vm->rendezvous_done_cpus);
2191	vm->rendezvous_arg = arg;
2192	vm_set_rendezvous_func(vm, func);
2193	mtx_unlock(&vm->rendezvous_mtx);
2194
2195	/*
2196	 * Wake up any sleeping vcpus and trigger a VM-exit in any running
2197	 * vcpus so they handle the rendezvous as soon as possible.
2198	 */
2199	for (i = 0; i < VM_MAXCPU; i++) {
2200		if (CPU_ISSET(i, &dest))
2201			vcpu_notify_event(vm, i, false);
2202	}
2203
2204	vm_handle_rendezvous(vm, vcpuid);
2205}
2206
2207struct vatpic *
2208vm_atpic(struct vm *vm)
2209{
2210	return (vm->vatpic);
2211}
2212
2213struct vatpit *
2214vm_atpit(struct vm *vm)
2215{
2216	return (vm->vatpit);
2217}
2218
2219struct vpmtmr *
2220vm_pmtmr(struct vm *vm)
2221{
2222
2223	return (vm->vpmtmr);
2224}
2225
2226enum vm_reg_name
2227vm_segment_name(int seg)
2228{
2229	static enum vm_reg_name seg_names[] = {
2230		VM_REG_GUEST_ES,
2231		VM_REG_GUEST_CS,
2232		VM_REG_GUEST_SS,
2233		VM_REG_GUEST_DS,
2234		VM_REG_GUEST_FS,
2235		VM_REG_GUEST_GS
2236	};
2237
2238	KASSERT(seg >= 0 && seg < nitems(seg_names),
2239	    ("%s: invalid segment encoding %d", __func__, seg));
2240	return (seg_names[seg]);
2241}
2242
2243void
2244vm_copy_teardown(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo,
2245    int num_copyinfo)
2246{
2247	int idx;
2248
2249	for (idx = 0; idx < num_copyinfo; idx++) {
2250		if (copyinfo[idx].cookie != NULL)
2251			vm_gpa_release(copyinfo[idx].cookie);
2252	}
2253	bzero(copyinfo, num_copyinfo * sizeof(struct vm_copyinfo));
2254}
2255
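/*
 * Translate the guest linear range [gla, gla + len) and hold the backing
 * pages, recording one entry per page in 'copyinfo' for use by
 * vm_copyin()/vm_copyout(). Returns 0 on success, a vmm_gla2gpa() error
 * on a translation failure, or -1 if a page could not be held.
 */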
2256int
2257vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
2258    uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo,
2259    int num_copyinfo)
2260{
2261	int error, idx, nused;
2262	size_t n, off, remaining;
2263	void *hva, *cookie;
2264	uint64_t gpa;
2265
2266	bzero(copyinfo, sizeof(struct vm_copyinfo) * num_copyinfo);
2267
2268	nused = 0;
2269	remaining = len;
2270	while (remaining > 0) {
2271		KASSERT(nused < num_copyinfo, ("insufficient vm_copyinfo"));
2272		error = vmm_gla2gpa(vm, vcpuid, paging, gla, prot, &gpa);
2273		if (error)
2274			return (error);
2275		off = gpa & PAGE_MASK;
2276		n = min(remaining, PAGE_SIZE - off);
2277		copyinfo[nused].gpa = gpa;
2278		copyinfo[nused].len = n;
2279		remaining -= n;
2280		gla += n;
2281		nused++;
2282	}
2283
2284	for (idx = 0; idx < nused; idx++) {
2285		hva = vm_gpa_hold(vm, copyinfo[idx].gpa, copyinfo[idx].len,
2286		    prot, &cookie);
2287		if (hva == NULL)
2288			break;
2289		copyinfo[idx].hva = hva;
2290		copyinfo[idx].cookie = cookie;
2291	}
2292
2293	if (idx != nused) {
2294		vm_copy_teardown(vm, vcpuid, copyinfo, num_copyinfo);
2295		return (-1);
2296	} else {
2297		return (0);
2298	}
2299}
2300
2301void
2302vm_copyin(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, void *kaddr,
2303    size_t len)
2304{
2305	char *dst;
2306	int idx;
2307
2308	dst = kaddr;
2309	idx = 0;
2310	while (len > 0) {
2311		bcopy(copyinfo[idx].hva, dst, copyinfo[idx].len);
2312		len -= copyinfo[idx].len;
2313		dst += copyinfo[idx].len;
2314		idx++;
2315	}
2316}
2317
2318void
2319vm_copyout(struct vm *vm, int vcpuid, const void *kaddr,
2320    struct vm_copyinfo *copyinfo, size_t len)
2321{
2322	const char *src;
2323	int idx;
2324
2325	src = kaddr;
2326	idx = 0;
2327	while (len > 0) {
2328		bcopy(src, copyinfo[idx].hva, copyinfo[idx].len);
2329		len -= copyinfo[idx].len;
2330		src += copyinfo[idx].len;
2331		idx++;
2332	}
2333}
2334
2335/*
2336 * Return the amount of in-use and wired memory for the VM. Since
2337 * these are global stats, only return the values for vCPU 0.
2338 */
2339VMM_STAT_DECLARE(VMM_MEM_RESIDENT);
2340VMM_STAT_DECLARE(VMM_MEM_WIRED);
2341
2342static void
2343vm_get_rescnt(struct vm *vm, int vcpu, struct vmm_stat_type *stat)
2344{
2345
2346	if (vcpu == 0) {
2347		vmm_stat_set(vm, vcpu, VMM_MEM_RESIDENT,
2348	       	    PAGE_SIZE * vmspace_resident_count(vm->vmspace));
2349	}
2350}
2351
2352static void
2353vm_get_wiredcnt(struct vm *vm, int vcpu, struct vmm_stat_type *stat)
2354{
2355
2356	if (vcpu == 0) {
2357		vmm_stat_set(vm, vcpu, VMM_MEM_WIRED,
2358	      	    PAGE_SIZE * pmap_wired_count(vmspace_pmap(vm->vmspace)));
2359	}
2360}
2361
2362VMM_STAT_FUNC(VMM_MEM_RESIDENT, "Resident memory", vm_get_rescnt);
2363VMM_STAT_FUNC(VMM_MEM_WIRED, "Wired memory", vm_get_wiredcnt);
2364