vmm.c revision 270074
1/*-
2 * Copyright (c) 2011 NetApp, Inc.
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 *
26 * $FreeBSD: stable/10/sys/amd64/vmm/vmm.c 270074 2014-08-17 01:23:52Z grehan $
27 */
28
29#include <sys/cdefs.h>
30__FBSDID("$FreeBSD: stable/10/sys/amd64/vmm/vmm.c 270074 2014-08-17 01:23:52Z grehan $");
31
32#include <sys/param.h>
33#include <sys/systm.h>
34#include <sys/kernel.h>
35#include <sys/module.h>
36#include <sys/sysctl.h>
37#include <sys/malloc.h>
38#include <sys/pcpu.h>
39#include <sys/lock.h>
40#include <sys/mutex.h>
41#include <sys/proc.h>
42#include <sys/rwlock.h>
43#include <sys/sched.h>
44#include <sys/smp.h>
46
47#include <vm/vm.h>
48#include <vm/vm_object.h>
49#include <vm/vm_page.h>
50#include <vm/pmap.h>
51#include <vm/vm_map.h>
52#include <vm/vm_extern.h>
53#include <vm/vm_param.h>
54
55#include <machine/cpu.h>
56#include <machine/vm.h>
57#include <machine/pcb.h>
58#include <machine/smp.h>
59#include <x86/psl.h>
60#include <x86/apicreg.h>
61#include <machine/vmparam.h>
62
63#include <machine/vmm.h>
64#include <machine/vmm_dev.h>
65#include <machine/vmm_instruction_emul.h>
66
67#include "vmm_ioport.h"
68#include "vmm_ktr.h"
69#include "vmm_host.h"
70#include "vmm_mem.h"
71#include "vmm_util.h"
72#include "vatpic.h"
73#include "vatpit.h"
74#include "vhpet.h"
75#include "vioapic.h"
76#include "vlapic.h"
77#include "vmm_msr.h"
78#include "vmm_ipi.h"
79#include "vmm_stat.h"
80#include "vmm_lapic.h"
81
82#include "io/ppt.h"
83#include "io/iommu.h"
84
85struct vlapic;
86
87/*
88 * Initialization:
89 * (a) allocated when vcpu is created
90 * (i) initialized when vcpu is created and when it is reinitialized
91 * (o) initialized the first time the vcpu is created
92 * (x) initialized before use
93 */
94struct vcpu {
95	struct mtx 	mtx;		/* (o) protects 'state' and 'hostcpu' */
96	enum vcpu_state	state;		/* (o) vcpu state */
97	int		hostcpu;	/* (o) vcpu's host cpu */
98	struct vlapic	*vlapic;	/* (i) APIC device model */
99	enum x2apic_state x2apic_state;	/* (i) APIC mode */
100	int		nmi_pending;	/* (i) NMI pending */
101	int		extint_pending;	/* (i) INTR pending */
102	struct vm_exception exception;	/* (x) exception collateral */
103	int	exception_pending;	/* (i) exception pending */
104	struct savefpu	*guestfpu;	/* (a,i) guest fpu state */
105	uint64_t	guest_xcr0;	/* (i) guest %xcr0 register */
106	void		*stats;		/* (a,i) statistics */
107	uint64_t guest_msrs[VMM_MSR_NUM]; /* (i) emulated MSRs */
108	struct vm_exit	exitinfo;	/* (x) exit reason and collateral */
109};
110
111#define	vcpu_lock_initialized(v) mtx_initialized(&((v)->mtx))
112#define	vcpu_lock_init(v)	mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN)
113#define	vcpu_lock(v)		mtx_lock_spin(&((v)->mtx))
114#define	vcpu_unlock(v)		mtx_unlock_spin(&((v)->mtx))
115#define	vcpu_assert_locked(v)	mtx_assert(&((v)->mtx), MA_OWNED)
116
117struct mem_seg {
118	vm_paddr_t	gpa;
119	size_t		len;
120	boolean_t	wired;
121	vm_object_t	object;
122};
123#define	VM_MAX_MEMORY_SEGMENTS	2
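/*
 * Two segments are enough for the usual guest layout created by the bhyve
 * userland (memory below the PCI hole and memory above it).  That split is
 * an assumption about the caller, noted here for orientation only; nothing
 * in this file enforces it.
 */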
124
125/*
126 * Initialization:
127 * (o) initialized the first time the VM is created
128 * (i) initialized when VM is created and when it is reinitialized
129 * (x) initialized before use
130 */
131struct vm {
132	void		*cookie;		/* (i) cpu-specific data */
133	void		*iommu;			/* (x) iommu-specific data */
134	struct vhpet	*vhpet;			/* (i) virtual HPET */
135	struct vioapic	*vioapic;		/* (i) virtual ioapic */
136	struct vatpic	*vatpic;		/* (i) virtual atpic */
137	struct vatpit	*vatpit;		/* (i) virtual atpit */
138	volatile cpuset_t active_cpus;		/* (i) active vcpus */
139	int		suspend;		/* (i) stop VM execution */
140	volatile cpuset_t suspended_cpus; 	/* (i) suspended vcpus */
141	volatile cpuset_t halted_cpus;		/* (x) cpus in a hard halt */
142	cpuset_t	rendezvous_req_cpus;	/* (x) rendezvous requested */
143	cpuset_t	rendezvous_done_cpus;	/* (x) rendezvous finished */
144	void		*rendezvous_arg;	/* (x) rendezvous func/arg */
145	vm_rendezvous_func_t rendezvous_func;
146	struct mtx	rendezvous_mtx;		/* (o) rendezvous lock */
147	int		num_mem_segs;		/* (o) guest memory segments */
148	struct mem_seg	mem_segs[VM_MAX_MEMORY_SEGMENTS];
149	struct vmspace	*vmspace;		/* (o) guest's address space */
150	char		name[VM_MAX_NAMELEN];	/* (o) virtual machine name */
151	struct vcpu	vcpu[VM_MAXCPU];	/* (i) guest vcpus */
152};
153
154static int vmm_initialized;
155
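/*
 * 'ops' points at the hardware backend (vmm_ops_intel for VT-x or
 * vmm_ops_amd for SVM), selected in vmm_init().  The wrapper macros below
 * fall back to a harmless default (NULL, 0 or ENXIO) when no backend was
 * initialized, so callers get an error instead of a NULL dereference.
 */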
156static struct vmm_ops *ops;
157#define	VMM_INIT(num)	(ops != NULL ? (*ops->init)(num) : 0)
158#define	VMM_CLEANUP()	(ops != NULL ? (*ops->cleanup)() : 0)
159#define	VMM_RESUME()	(ops != NULL ? (*ops->resume)() : 0)
160
161#define	VMINIT(vm, pmap) (ops != NULL ? (*ops->vminit)(vm, pmap): NULL)
162#define	VMRUN(vmi, vcpu, rip, pmap, rptr, sptr) \
163	(ops != NULL ? (*ops->vmrun)(vmi, vcpu, rip, pmap, rptr, sptr) : ENXIO)
164#define	VMCLEANUP(vmi)	(ops != NULL ? (*ops->vmcleanup)(vmi) : NULL)
165#define	VMSPACE_ALLOC(min, max) \
166	(ops != NULL ? (*ops->vmspace_alloc)(min, max) : NULL)
167#define	VMSPACE_FREE(vmspace) \
168	(ops != NULL ? (*ops->vmspace_free)(vmspace) : ENXIO)
169#define	VMGETREG(vmi, vcpu, num, retval)		\
170	(ops != NULL ? (*ops->vmgetreg)(vmi, vcpu, num, retval) : ENXIO)
171#define	VMSETREG(vmi, vcpu, num, val)		\
172	(ops != NULL ? (*ops->vmsetreg)(vmi, vcpu, num, val) : ENXIO)
173#define	VMGETDESC(vmi, vcpu, num, desc)		\
174	(ops != NULL ? (*ops->vmgetdesc)(vmi, vcpu, num, desc) : ENXIO)
175#define	VMSETDESC(vmi, vcpu, num, desc)		\
176	(ops != NULL ? (*ops->vmsetdesc)(vmi, vcpu, num, desc) : ENXIO)
177#define	VMGETCAP(vmi, vcpu, num, retval)	\
178	(ops != NULL ? (*ops->vmgetcap)(vmi, vcpu, num, retval) : ENXIO)
179#define	VMSETCAP(vmi, vcpu, num, val)		\
180	(ops != NULL ? (*ops->vmsetcap)(vmi, vcpu, num, val) : ENXIO)
181#define	VLAPIC_INIT(vmi, vcpu)			\
182	(ops != NULL ? (*ops->vlapic_init)(vmi, vcpu) : NULL)
183#define	VLAPIC_CLEANUP(vmi, vlapic)		\
184	(ops != NULL ? (*ops->vlapic_cleanup)(vmi, vlapic) : NULL)
185
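/*
 * Guest FPU state is protected with the CR0.TS trick: fpu_start_emulating()
 * sets CR0.TS so that any host use of the FPU traps (#NM) while guest state
 * is loaded, and fpu_stop_emulating() clears TS (clts) right before this
 * code touches the FPU itself.  See restore_guest_fpustate() and
 * save_guest_fpustate() below.
 */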
186#define	fpu_start_emulating()	load_cr0(rcr0() | CR0_TS)
187#define	fpu_stop_emulating()	clts()
188
189static MALLOC_DEFINE(M_VM, "vm", "vm");
190CTASSERT(VMM_MSR_NUM <= 64);	/* msr_mask can keep track of up to 64 msrs */
191
192/* statistics */
193static VMM_STAT(VCPU_TOTAL_RUNTIME, "vcpu total runtime");
194
195SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW, NULL, NULL);
196
197/*
198 * Halt the guest if all vcpus are executing a HLT instruction with
199 * interrupts disabled.
200 */
201static int halt_detection_enabled = 1;
202TUNABLE_INT("hw.vmm.halt_detection", &halt_detection_enabled);
203SYSCTL_INT(_hw_vmm, OID_AUTO, halt_detection, CTLFLAG_RDTUN,
204    &halt_detection_enabled, 0,
205    "Halt VM if all vcpus execute HLT with interrupts disabled");
206
207static int vmm_ipinum;
208SYSCTL_INT(_hw_vmm, OID_AUTO, ipinum, CTLFLAG_RD, &vmm_ipinum, 0,
209    "IPI vector used for vcpu notifications");
210
211static void
212vcpu_cleanup(struct vm *vm, int i, bool destroy)
213{
214	struct vcpu *vcpu = &vm->vcpu[i];
215
216	VLAPIC_CLEANUP(vm->cookie, vcpu->vlapic);
217	if (destroy) {
218		vmm_stat_free(vcpu->stats);
219		fpu_save_area_free(vcpu->guestfpu);
220	}
221}
222
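/*
 * vcpu_init() is used both when the vcpu is first created (create == true:
 * allocate the lock, FPU save area and stats) and when the VM is
 * reinitialized (create == false: only the per-boot state is reset),
 * mirroring the (a)/(o)/(i) annotations on struct vcpu above.
 */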
223static void
224vcpu_init(struct vm *vm, int vcpu_id, bool create)
225{
226	struct vcpu *vcpu;
227
228	KASSERT(vcpu_id >= 0 && vcpu_id < VM_MAXCPU,
229	    ("vcpu_init: invalid vcpu %d", vcpu_id));
230
231	vcpu = &vm->vcpu[vcpu_id];
232
233	if (create) {
234		KASSERT(!vcpu_lock_initialized(vcpu), ("vcpu %d already "
235		    "initialized", vcpu_id));
236		vcpu_lock_init(vcpu);
237		vcpu->state = VCPU_IDLE;
238		vcpu->hostcpu = NOCPU;
239		vcpu->guestfpu = fpu_save_area_alloc();
240		vcpu->stats = vmm_stat_alloc();
241	}
242
243	vcpu->vlapic = VLAPIC_INIT(vm->cookie, vcpu_id);
244	vm_set_x2apic_state(vm, vcpu_id, X2APIC_DISABLED);
245	vcpu->nmi_pending = 0;
246	vcpu->extint_pending = 0;
247	vcpu->exception_pending = 0;
248	vcpu->guest_xcr0 = XFEATURE_ENABLED_X87;
249	fpu_save_area_reset(vcpu->guestfpu);
250	vmm_stat_init(vcpu->stats);
251	guest_msrs_init(vm, vcpu_id);
252}
253
254struct vm_exit *
255vm_exitinfo(struct vm *vm, int cpuid)
256{
257	struct vcpu *vcpu;
258
259	if (cpuid < 0 || cpuid >= VM_MAXCPU)
260		panic("vm_exitinfo: invalid cpuid %d", cpuid);
261
262	vcpu = &vm->vcpu[cpuid];
263
264	return (&vcpu->exitinfo);
265}
266
267static void
268vmm_resume(void)
269{
270	VMM_RESUME();
271}
272
273static int
274vmm_init(void)
275{
276	int error;
277
278	vmm_host_state_init();
279
280	vmm_ipinum = vmm_ipi_alloc();
281	if (vmm_ipinum == 0)
282		vmm_ipinum = IPI_AST;
283
284	error = vmm_mem_init();
285	if (error)
286		return (error);
287
288	if (vmm_is_intel())
289		ops = &vmm_ops_intel;
290	else if (vmm_is_amd())
291		ops = &vmm_ops_amd;
292	else
293		return (ENXIO);
294
295	vmm_msr_init();
296	vmm_resume_p = vmm_resume;
297
298	return (VMM_INIT(vmm_ipinum));
299}
300
301static int
302vmm_handler(module_t mod, int what, void *arg)
303{
304	int error;
305
306	switch (what) {
307	case MOD_LOAD:
308		vmmdev_init();
309		if (ppt_avail_devices() > 0)
310			iommu_init();
311		error = vmm_init();
312		if (error == 0)
313			vmm_initialized = 1;
314		break;
315	case MOD_UNLOAD:
316		error = vmmdev_cleanup();
317		if (error == 0) {
318			vmm_resume_p = NULL;
319			iommu_cleanup();
320			if (vmm_ipinum != IPI_AST)
321				vmm_ipi_free(vmm_ipinum);
322			error = VMM_CLEANUP();
323			/*
324			 * Something bad happened - prevent new
325			 * VMs from being created
326			 */
327			if (error)
328				vmm_initialized = 0;
329		}
330		break;
331	default:
332		error = 0;
333		break;
334	}
335	return (error);
336}
337
338static moduledata_t vmm_kmod = {
339	"vmm",
340	vmm_handler,
341	NULL
342};
343
344/*
345 * vmm initialization has the following dependencies:
346 *
347 * - iommu initialization must happen after the pci passthru driver has had
348 *   a chance to attach to any passthru devices (after SI_SUB_CONFIGURE).
349 *
350 * - VT-x initialization requires smp_rendezvous() and therefore must happen
351 *   after SMP is fully functional (after SI_SUB_SMP).
352 */
353DECLARE_MODULE(vmm, vmm_kmod, SI_SUB_SMP + 1, SI_ORDER_ANY);
354MODULE_VERSION(vmm, 1);
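/*
 * The module is normally brought in with "kldload vmm" or vmm_load="YES" in
 * loader.conf; vmmdev_init() in the MOD_LOAD path above provides the /dev/vmm
 * control devices used by bhyve(8) and bhyvectl(8).  The device naming itself
 * is handled in vmm_dev.c and is mentioned here only for orientation.
 */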
355
356static void
357vm_init(struct vm *vm, bool create)
358{
359	int i;
360
361	vm->cookie = VMINIT(vm, vmspace_pmap(vm->vmspace));
362	vm->iommu = NULL;
363	vm->vioapic = vioapic_init(vm);
364	vm->vhpet = vhpet_init(vm);
365	vm->vatpic = vatpic_init(vm);
366	vm->vatpit = vatpit_init(vm);
367
368	CPU_ZERO(&vm->active_cpus);
369
370	vm->suspend = 0;
371	CPU_ZERO(&vm->suspended_cpus);
372
373	for (i = 0; i < VM_MAXCPU; i++)
374		vcpu_init(vm, i, create);
375}
376
377int
378vm_create(const char *name, struct vm **retvm)
379{
380	struct vm *vm;
381	struct vmspace *vmspace;
382
383	/*
384	 * If vmm.ko could not be successfully initialized then don't attempt
385	 * to create the virtual machine.
386	 */
387	if (!vmm_initialized)
388		return (ENXIO);
389
390	if (name == NULL || strlen(name) >= VM_MAX_NAMELEN)
391		return (EINVAL);
392
393	vmspace = VMSPACE_ALLOC(VM_MIN_ADDRESS, VM_MAXUSER_ADDRESS);
394	if (vmspace == NULL)
395		return (ENOMEM);
396
397	vm = malloc(sizeof(struct vm), M_VM, M_WAITOK | M_ZERO);
398	strcpy(vm->name, name);
399	vm->num_mem_segs = 0;
400	vm->vmspace = vmspace;
401	mtx_init(&vm->rendezvous_mtx, "vm rendezvous lock", 0, MTX_DEF);
402
403	vm_init(vm, true);
404
405	*retvm = vm;
406	return (0);
407}
408
409static void
410vm_free_mem_seg(struct vm *vm, struct mem_seg *seg)
411{
412
413	if (seg->object != NULL)
414		vmm_mem_free(vm->vmspace, seg->gpa, seg->len);
415
416	bzero(seg, sizeof(*seg));
417}
418
419static void
420vm_cleanup(struct vm *vm, bool destroy)
421{
422	int i;
423
424	ppt_unassign_all(vm);
425
426	if (vm->iommu != NULL)
427		iommu_destroy_domain(vm->iommu);
428
429	vatpit_cleanup(vm->vatpit);
430	vhpet_cleanup(vm->vhpet);
431	vatpic_cleanup(vm->vatpic);
432	vioapic_cleanup(vm->vioapic);
433
434	for (i = 0; i < VM_MAXCPU; i++)
435		vcpu_cleanup(vm, i, destroy);
436
437	VMCLEANUP(vm->cookie);
438
439	if (destroy) {
440		for (i = 0; i < vm->num_mem_segs; i++)
441			vm_free_mem_seg(vm, &vm->mem_segs[i]);
442
443		vm->num_mem_segs = 0;
444
445		VMSPACE_FREE(vm->vmspace);
446		vm->vmspace = NULL;
447	}
448}
449
450void
451vm_destroy(struct vm *vm)
452{
453	vm_cleanup(vm, true);
454	free(vm, M_VM);
455}
456
457int
458vm_reinit(struct vm *vm)
459{
460	int error;
461
462	/*
463	 * A virtual machine can be reset only if all vcpus are suspended.
464	 */
465	if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) {
466		vm_cleanup(vm, false);
467		vm_init(vm, false);
468		error = 0;
469	} else {
470		error = EBUSY;
471	}
472
473	return (error);
474}
475
476const char *
477vm_name(struct vm *vm)
478{
479	return (vm->name);
480}
481
482int
483vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
484{
485	vm_object_t obj;
486
487	if ((obj = vmm_mmio_alloc(vm->vmspace, gpa, len, hpa)) == NULL)
488		return (ENOMEM);
489	else
490		return (0);
491}
492
493int
494vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len)
495{
496
497	vmm_mmio_free(vm->vmspace, gpa, len);
498	return (0);
499}
500
501boolean_t
502vm_mem_allocated(struct vm *vm, vm_paddr_t gpa)
503{
504	int i;
505	vm_paddr_t gpabase, gpalimit;
506
507	for (i = 0; i < vm->num_mem_segs; i++) {
508		gpabase = vm->mem_segs[i].gpa;
509		gpalimit = gpabase + vm->mem_segs[i].len;
510		if (gpa >= gpabase && gpa < gpalimit)
511			return (TRUE);		/* 'gpa' is regular memory */
512	}
513
514	if (ppt_is_mmio(vm, gpa))
515		return (TRUE);			/* 'gpa' is pci passthru mmio */
516
517	return (FALSE);
518}
519
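/*
 * vm_malloc() adds a guest memory segment: 'gpa' and 'len' must be
 * page-aligned, the range must be either entirely unallocated or entirely
 * allocated already (a partial overlap is an error), and at most
 * VM_MAX_MEMORY_SEGMENTS segments are supported.  The backing pages come
 * from a vm_object allocated by vmm_mem_alloc() and are not wired until a
 * passthru device is assigned (see vm_gpa_wire()).
 */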
520int
521vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len)
522{
523	int available, allocated;
524	struct mem_seg *seg;
525	vm_object_t object;
526	vm_paddr_t g;
527
528	if ((gpa & PAGE_MASK) || (len & PAGE_MASK) || len == 0)
529		return (EINVAL);
530
531	available = allocated = 0;
532	g = gpa;
533	while (g < gpa + len) {
534		if (vm_mem_allocated(vm, g))
535			allocated++;
536		else
537			available++;
538
539		g += PAGE_SIZE;
540	}
541
542	/*
543	 * If there are some allocated and some available pages in the address
544	 * range then it is an error.
545	 */
546	if (allocated && available)
547		return (EINVAL);
548
549	/*
550	 * If the entire address range being requested has already been
551	 * allocated then there isn't anything more to do.
552	 */
553	if (allocated && available == 0)
554		return (0);
555
556	if (vm->num_mem_segs >= VM_MAX_MEMORY_SEGMENTS)
557		return (E2BIG);
558
559	seg = &vm->mem_segs[vm->num_mem_segs];
560
561	if ((object = vmm_mem_alloc(vm->vmspace, gpa, len)) == NULL)
562		return (ENOMEM);
563
564	seg->gpa = gpa;
565	seg->len = len;
566	seg->object = object;
567	seg->wired = FALSE;
568
569	vm->num_mem_segs++;
570
571	return (0);
572}
573
574static void
575vm_gpa_unwire(struct vm *vm)
576{
577	int i, rv;
578	struct mem_seg *seg;
579
580	for (i = 0; i < vm->num_mem_segs; i++) {
581		seg = &vm->mem_segs[i];
582		if (!seg->wired)
583			continue;
584
585		rv = vm_map_unwire(&vm->vmspace->vm_map,
586				   seg->gpa, seg->gpa + seg->len,
587				   VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
588		KASSERT(rv == KERN_SUCCESS, ("vm(%s) memory segment "
589		    "%#lx/%ld could not be unwired: %d",
590		    vm_name(vm), seg->gpa, seg->len, rv));
591
592		seg->wired = FALSE;
593	}
594}
595
596static int
597vm_gpa_wire(struct vm *vm)
598{
599	int i, rv;
600	struct mem_seg *seg;
601
602	for (i = 0; i < vm->num_mem_segs; i++) {
603		seg = &vm->mem_segs[i];
604		if (seg->wired)
605			continue;
606
607		/* XXX rlimits? */
608		rv = vm_map_wire(&vm->vmspace->vm_map,
609				 seg->gpa, seg->gpa + seg->len,
610				 VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
611		if (rv != KERN_SUCCESS)
612			break;
613
614		seg->wired = TRUE;
615	}
616
617	if (i < vm->num_mem_segs) {
618		/*
619		 * Undo the wiring before returning an error.
620		 */
621		vm_gpa_unwire(vm);
622		return (EAGAIN);
623	}
624
625	return (0);
626}
627
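/*
 * For PCI passthru every (wired) guest page is handed to the IOMMU: with
 * map == TRUE the gpa->hpa translation is installed in the VM's domain and
 * the page's identity mapping is removed from the host domain; map == FALSE
 * reverses this.  The TLB of whichever domain lost entries is invalidated
 * at the end.
 */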
628static void
629vm_iommu_modify(struct vm *vm, boolean_t map)
630{
631	int i, sz;
632	vm_paddr_t gpa, hpa;
633	struct mem_seg *seg;
634	void *vp, *cookie, *host_domain;
635
636	sz = PAGE_SIZE;
637	host_domain = iommu_host_domain();
638
639	for (i = 0; i < vm->num_mem_segs; i++) {
640		seg = &vm->mem_segs[i];
641		KASSERT(seg->wired, ("vm(%s) memory segment %#lx/%ld not wired",
642		    vm_name(vm), seg->gpa, seg->len));
643
644		gpa = seg->gpa;
645		while (gpa < seg->gpa + seg->len) {
646			vp = vm_gpa_hold(vm, gpa, PAGE_SIZE, VM_PROT_WRITE,
647					 &cookie);
648			KASSERT(vp != NULL, ("vm(%s) could not map gpa %#lx",
649			    vm_name(vm), gpa));
650
651			vm_gpa_release(cookie);
652
653			hpa = DMAP_TO_PHYS((uintptr_t)vp);
654			if (map) {
655				iommu_create_mapping(vm->iommu, gpa, hpa, sz);
656				iommu_remove_mapping(host_domain, hpa, sz);
657			} else {
658				iommu_remove_mapping(vm->iommu, gpa, sz);
659				iommu_create_mapping(host_domain, hpa, hpa, sz);
660			}
661
662			gpa += PAGE_SIZE;
663		}
664	}
665
666	/*
667	 * Invalidate the cached translations associated with the domain
668	 * from which pages were removed.
669	 */
670	if (map)
671		iommu_invalidate_tlb(host_domain);
672	else
673		iommu_invalidate_tlb(vm->iommu);
674}
675
676#define	vm_iommu_unmap(vm)	vm_iommu_modify((vm), FALSE)
677#define	vm_iommu_map(vm)	vm_iommu_modify((vm), TRUE)
678
679int
680vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func)
681{
682	int error;
683
684	error = ppt_unassign_device(vm, bus, slot, func);
685	if (error)
686		return (error);
687
688	if (ppt_assigned_devices(vm) == 0) {
689		vm_iommu_unmap(vm);
690		vm_gpa_unwire(vm);
691	}
692	return (0);
693}
694
695int
696vm_assign_pptdev(struct vm *vm, int bus, int slot, int func)
697{
698	int error;
699	vm_paddr_t maxaddr;
700
701	/*
702	 * Virtual machines with pci passthru devices get special treatment:
703	 * - the guest physical memory is wired
704	 * - the iommu is programmed to do the 'gpa' to 'hpa' translation
705	 *
706	 * We need to do this before the first pci passthru device is attached.
707	 */
708	if (ppt_assigned_devices(vm) == 0) {
709		KASSERT(vm->iommu == NULL,
710		    ("vm_assign_pptdev: iommu must be NULL"));
711		maxaddr = vmm_mem_maxaddr();
712		vm->iommu = iommu_create_domain(maxaddr);
713
714		error = vm_gpa_wire(vm);
715		if (error)
716			return (error);
717
718		vm_iommu_map(vm);
719	}
720
721	error = ppt_assign_device(vm, bus, slot, func);
722	return (error);
723}
724
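/*
 * vm_gpa_hold() faults in and holds the page backing 'gpa' (the requested
 * range may not cross a page boundary) and returns a direct-map (DMAP)
 * pointer into it.  The pointer remains usable until the returned cookie is
 * passed to vm_gpa_release().
 */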
725void *
726vm_gpa_hold(struct vm *vm, vm_paddr_t gpa, size_t len, int reqprot,
727	    void **cookie)
728{
729	int count, pageoff;
730	vm_page_t m;
731
732	pageoff = gpa & PAGE_MASK;
733	if (len > PAGE_SIZE - pageoff)
734		panic("vm_gpa_hold: invalid gpa/len: 0x%016lx/%lu", gpa, len);
735
736	count = vm_fault_quick_hold_pages(&vm->vmspace->vm_map,
737	    trunc_page(gpa), PAGE_SIZE, reqprot, &m, 1);
738
739	if (count == 1) {
740		*cookie = m;
741		return ((void *)(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)) + pageoff));
742	} else {
743		*cookie = NULL;
744		return (NULL);
745	}
746}
747
748void
749vm_gpa_release(void *cookie)
750{
751	vm_page_t m = cookie;
752
753	vm_page_lock(m);
754	vm_page_unhold(m);
755	vm_page_unlock(m);
756}
757
758int
759vm_gpabase2memseg(struct vm *vm, vm_paddr_t gpabase,
760		  struct vm_memory_segment *seg)
761{
762	int i;
763
764	for (i = 0; i < vm->num_mem_segs; i++) {
765		if (gpabase == vm->mem_segs[i].gpa) {
766			seg->gpa = vm->mem_segs[i].gpa;
767			seg->len = vm->mem_segs[i].len;
768			seg->wired = vm->mem_segs[i].wired;
769			return (0);
770		}
771	}
772	return (-1);
773}
774
775int
776vm_get_memobj(struct vm *vm, vm_paddr_t gpa, size_t len,
777	      vm_offset_t *offset, struct vm_object **object)
778{
779	int i;
780	size_t seg_len;
781	vm_paddr_t seg_gpa;
782	vm_object_t seg_obj;
783
784	for (i = 0; i < vm->num_mem_segs; i++) {
785		if ((seg_obj = vm->mem_segs[i].object) == NULL)
786			continue;
787
788		seg_gpa = vm->mem_segs[i].gpa;
789		seg_len = vm->mem_segs[i].len;
790
791		if (gpa >= seg_gpa && gpa < seg_gpa + seg_len) {
792			*offset = gpa - seg_gpa;
793			*object = seg_obj;
794			vm_object_reference(seg_obj);
795			return (0);
796		}
797	}
798
799	return (EINVAL);
800}
801
802int
803vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval)
804{
805
806	if (vcpu < 0 || vcpu >= VM_MAXCPU)
807		return (EINVAL);
808
809	if (reg >= VM_REG_LAST)
810		return (EINVAL);
811
812	return (VMGETREG(vm->cookie, vcpu, reg, retval));
813}
814
815int
816vm_set_register(struct vm *vm, int vcpu, int reg, uint64_t val)
817{
818
819	if (vcpu < 0 || vcpu >= VM_MAXCPU)
820		return (EINVAL);
821
822	if (reg >= VM_REG_LAST)
823		return (EINVAL);
824
825	return (VMSETREG(vm->cookie, vcpu, reg, val));
826}
827
828static boolean_t
829is_descriptor_table(int reg)
830{
831
832	switch (reg) {
833	case VM_REG_GUEST_IDTR:
834	case VM_REG_GUEST_GDTR:
835		return (TRUE);
836	default:
837		return (FALSE);
838	}
839}
840
841static boolean_t
842is_segment_register(int reg)
843{
844
845	switch (reg) {
846	case VM_REG_GUEST_ES:
847	case VM_REG_GUEST_CS:
848	case VM_REG_GUEST_SS:
849	case VM_REG_GUEST_DS:
850	case VM_REG_GUEST_FS:
851	case VM_REG_GUEST_GS:
852	case VM_REG_GUEST_TR:
853	case VM_REG_GUEST_LDTR:
854		return (TRUE);
855	default:
856		return (FALSE);
857	}
858}
859
860int
861vm_get_seg_desc(struct vm *vm, int vcpu, int reg,
862		struct seg_desc *desc)
863{
864
865	if (vcpu < 0 || vcpu >= VM_MAXCPU)
866		return (EINVAL);
867
868	if (!is_segment_register(reg) && !is_descriptor_table(reg))
869		return (EINVAL);
870
871	return (VMGETDESC(vm->cookie, vcpu, reg, desc));
872}
873
874int
875vm_set_seg_desc(struct vm *vm, int vcpu, int reg,
876		struct seg_desc *desc)
877{
878	if (vcpu < 0 || vcpu >= VM_MAXCPU)
879		return (EINVAL);
880
881	if (!is_segment_register(reg) && !is_descriptor_table(reg))
882		return (EINVAL);
883
884	return (VMSETDESC(vm->cookie, vcpu, reg, desc));
885}
886
887static void
888restore_guest_fpustate(struct vcpu *vcpu)
889{
890
891	/* flush host state to the pcb */
892	fpuexit(curthread);
893
894	/* restore guest FPU state */
895	fpu_stop_emulating();
896	fpurestore(vcpu->guestfpu);
897
898	/* restore guest XCR0 if XSAVE is enabled in the host */
899	if (rcr4() & CR4_XSAVE)
900		load_xcr(0, vcpu->guest_xcr0);
901
902	/*
903	 * The FPU is now "dirty" with the guest's state so turn on emulation
904	 * to trap any access to the FPU by the host.
905	 */
906	fpu_start_emulating();
907}
908
909static void
910save_guest_fpustate(struct vcpu *vcpu)
911{
912
913	if ((rcr0() & CR0_TS) == 0)
914		panic("fpu emulation not enabled in host!");
915
916	/* save guest XCR0 and restore host XCR0 */
917	if (rcr4() & CR4_XSAVE) {
918		vcpu->guest_xcr0 = rxcr(0);
919		load_xcr(0, vmm_get_host_xcr0());
920	}
921
922	/* save guest FPU state */
923	fpu_stop_emulating();
924	fpusave(vcpu->guestfpu);
925	fpu_start_emulating();
926}
927
928static VMM_STAT(VCPU_IDLE_TICKS, "number of ticks vcpu was idle");
929
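/*
 * Example of the transitions enforced below: a vm_run ioctl moves a vcpu
 * IDLE -> FROZEN on entry, FROZEN -> RUNNING around VMRUN(), back to FROZEN
 * on exit and FROZEN -> IDLE when the ioctl returns, while HLT emulation in
 * vm_handle_hlt() uses FROZEN -> SLEEPING -> FROZEN to wait for an
 * interrupt.
 */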
930static int
931vcpu_set_state_locked(struct vcpu *vcpu, enum vcpu_state newstate,
932    bool from_idle)
933{
934	int error;
935
936	vcpu_assert_locked(vcpu);
937
938	/*
939	 * State transitions from the vmmdev_ioctl() must always begin from
940	 * the VCPU_IDLE state. This guarantees that there is only a single
941	 * ioctl() operating on a vcpu at any point.
942	 */
943	if (from_idle) {
944		while (vcpu->state != VCPU_IDLE)
945			msleep_spin(&vcpu->state, &vcpu->mtx, "vmstat", hz);
946	} else {
947		KASSERT(vcpu->state != VCPU_IDLE, ("invalid transition from "
948		    "vcpu idle state"));
949	}
950
951	if (vcpu->state == VCPU_RUNNING) {
952		KASSERT(vcpu->hostcpu == curcpu, ("curcpu %d and hostcpu %d "
953		    "mismatch for running vcpu", curcpu, vcpu->hostcpu));
954	} else {
955		KASSERT(vcpu->hostcpu == NOCPU, ("Invalid hostcpu %d for a "
956		    "vcpu that is not running", vcpu->hostcpu));
957	}
958
959	/*
960	 * The following state transitions are allowed:
961	 * IDLE -> FROZEN -> IDLE
962	 * FROZEN -> RUNNING -> FROZEN
963	 * FROZEN -> SLEEPING -> FROZEN
964	 */
965	switch (vcpu->state) {
966	case VCPU_IDLE:
967	case VCPU_RUNNING:
968	case VCPU_SLEEPING:
969		error = (newstate != VCPU_FROZEN);
970		break;
971	case VCPU_FROZEN:
972		error = (newstate == VCPU_FROZEN);
973		break;
974	default:
975		error = 1;
976		break;
977	}
978
979	if (error)
980		return (EBUSY);
981
982	vcpu->state = newstate;
983	if (newstate == VCPU_RUNNING)
984		vcpu->hostcpu = curcpu;
985	else
986		vcpu->hostcpu = NOCPU;
987
988	if (newstate == VCPU_IDLE)
989		wakeup(&vcpu->state);
990
991	return (0);
992}
993
994static void
995vcpu_require_state(struct vm *vm, int vcpuid, enum vcpu_state newstate)
996{
997	int error;
998
999	if ((error = vcpu_set_state(vm, vcpuid, newstate, false)) != 0)
1000		panic("Error %d setting state to %d\n", error, newstate);
1001}
1002
1003static void
1004vcpu_require_state_locked(struct vcpu *vcpu, enum vcpu_state newstate)
1005{
1006	int error;
1007
1008	if ((error = vcpu_set_state_locked(vcpu, newstate, false)) != 0)
1009		panic("Error %d setting state to %d", error, newstate);
1010}
1011
1012static void
1013vm_set_rendezvous_func(struct vm *vm, vm_rendezvous_func_t func)
1014{
1015
1016	KASSERT(mtx_owned(&vm->rendezvous_mtx), ("rendezvous_mtx not locked"));
1017
1018	/*
1019	 * Update 'rendezvous_func' and execute a write memory barrier to
1020	 * ensure that it is visible across all host cpus. This is not needed
1021	 * for correctness but it does ensure that all the vcpus will notice
1022	 * that the rendezvous is requested immediately.
1023	 */
1024	vm->rendezvous_func = func;
1025	wmb();
1026}
1027
1028#define	RENDEZVOUS_CTR0(vm, vcpuid, fmt)				\
1029	do {								\
1030		if (vcpuid >= 0)					\
1031			VCPU_CTR0(vm, vcpuid, fmt);			\
1032		else							\
1033			VM_CTR0(vm, fmt);				\
1034	} while (0)
1035
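/*
 * Rendezvous protocol: vm_smp_rendezvous() publishes 'rendezvous_req_cpus',
 * 'rendezvous_arg' and 'rendezvous_func' and notifies the target vcpus.
 * Each vcpu that observes the request calls vm_handle_rendezvous(), runs the
 * callback exactly once and records itself in 'rendezvous_done_cpus'; once
 * the done set matches the requested set the function pointer is cleared and
 * all waiters are woken up.
 */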
1036static void
1037vm_handle_rendezvous(struct vm *vm, int vcpuid)
1038{
1039
1040	KASSERT(vcpuid == -1 || (vcpuid >= 0 && vcpuid < VM_MAXCPU),
1041	    ("vm_handle_rendezvous: invalid vcpuid %d", vcpuid));
1042
1043	mtx_lock(&vm->rendezvous_mtx);
1044	while (vm->rendezvous_func != NULL) {
1045		/* 'rendezvous_req_cpus' must be a subset of 'active_cpus' */
1046		CPU_AND(&vm->rendezvous_req_cpus, &vm->active_cpus);
1047
1048		if (vcpuid != -1 &&
1049		    CPU_ISSET(vcpuid, &vm->rendezvous_req_cpus) &&
1050		    !CPU_ISSET(vcpuid, &vm->rendezvous_done_cpus)) {
1051			VCPU_CTR0(vm, vcpuid, "Calling rendezvous func");
1052			(*vm->rendezvous_func)(vm, vcpuid, vm->rendezvous_arg);
1053			CPU_SET(vcpuid, &vm->rendezvous_done_cpus);
1054		}
1055		if (CPU_CMP(&vm->rendezvous_req_cpus,
1056		    &vm->rendezvous_done_cpus) == 0) {
1057			VCPU_CTR0(vm, vcpuid, "Rendezvous completed");
1058			vm_set_rendezvous_func(vm, NULL);
1059			wakeup(&vm->rendezvous_func);
1060			break;
1061		}
1062		RENDEZVOUS_CTR0(vm, vcpuid, "Wait for rendezvous completion");
1063		mtx_sleep(&vm->rendezvous_func, &vm->rendezvous_mtx, 0,
1064		    "vmrndv", 0);
1065	}
1066	mtx_unlock(&vm->rendezvous_mtx);
1067}
1068
1069/*
1070 * Emulate a guest 'hlt' by sleeping until the vcpu is ready to run.
1071 */
1072static int
1073vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled, bool *retu)
1074{
1075	struct vcpu *vcpu;
1076	const char *wmesg;
1077	int t, vcpu_halted, vm_halted;
1078
1079	KASSERT(!CPU_ISSET(vcpuid, &vm->halted_cpus), ("vcpu already halted"));
1080
1081	vcpu = &vm->vcpu[vcpuid];
1082	vcpu_halted = 0;
1083	vm_halted = 0;
1084
1085	vcpu_lock(vcpu);
1086	while (1) {
1087		/*
1088		 * Do a final check for pending NMI or interrupts before
1089		 * really putting this thread to sleep. Also check for
1090		 * software events that would cause this vcpu to wakeup.
1091		 *
1092		 * These interrupts/events could have happened after the
1093		 * vcpu returned from VMRUN() and before it acquired the
1094		 * vcpu lock above.
1095		 */
1096		if (vm->rendezvous_func != NULL || vm->suspend)
1097			break;
1098		if (vm_nmi_pending(vm, vcpuid))
1099			break;
1100		if (!intr_disabled) {
1101			if (vm_extint_pending(vm, vcpuid) ||
1102			    vlapic_pending_intr(vcpu->vlapic, NULL)) {
1103				break;
1104			}
1105		}
1106
1107		/*
1108		 * Some Linux guests implement "halt" by having all vcpus
1109		 * execute HLT with interrupts disabled. 'halted_cpus' keeps
1110		 * track of the vcpus that have entered this state. When all
1111		 * vcpus enter the halted state the virtual machine is halted.
1112		 */
1113		if (intr_disabled) {
1114			wmesg = "vmhalt";
1115			VCPU_CTR0(vm, vcpuid, "Halted");
1116			if (!vcpu_halted && halt_detection_enabled) {
1117				vcpu_halted = 1;
1118				CPU_SET_ATOMIC(vcpuid, &vm->halted_cpus);
1119			}
1120			if (CPU_CMP(&vm->halted_cpus, &vm->active_cpus) == 0) {
1121				vm_halted = 1;
1122				break;
1123			}
1124		} else {
1125			wmesg = "vmidle";
1126		}
1127
1128		t = ticks;
1129		vcpu_require_state_locked(vcpu, VCPU_SLEEPING);
1130		msleep_spin(vcpu, &vcpu->mtx, wmesg, 0);
1131		vcpu_require_state_locked(vcpu, VCPU_FROZEN);
1132		vmm_stat_incr(vm, vcpuid, VCPU_IDLE_TICKS, ticks - t);
1133	}
1134
1135	if (vcpu_halted)
1136		CPU_CLR_ATOMIC(vcpuid, &vm->halted_cpus);
1137
1138	vcpu_unlock(vcpu);
1139
1140	if (vm_halted)
1141		vm_suspend(vm, VM_SUSPEND_HALT);
1142
1143	return (0);
1144}
1145
1146static int
1147vm_handle_paging(struct vm *vm, int vcpuid, bool *retu)
1148{
1149	int rv, ftype;
1150	struct vm_map *map;
1151	struct vcpu *vcpu;
1152	struct vm_exit *vme;
1153
1154	vcpu = &vm->vcpu[vcpuid];
1155	vme = &vcpu->exitinfo;
1156
1157	ftype = vme->u.paging.fault_type;
1158	KASSERT(ftype == VM_PROT_READ ||
1159	    ftype == VM_PROT_WRITE || ftype == VM_PROT_EXECUTE,
1160	    ("vm_handle_paging: invalid fault_type %d", ftype));
1161
1162	if (ftype == VM_PROT_READ || ftype == VM_PROT_WRITE) {
1163		rv = pmap_emulate_accessed_dirty(vmspace_pmap(vm->vmspace),
1164		    vme->u.paging.gpa, ftype);
1165		if (rv == 0)
1166			goto done;
1167	}
1168
1169	map = &vm->vmspace->vm_map;
1170	rv = vm_fault(map, vme->u.paging.gpa, ftype, VM_FAULT_NORMAL);
1171
1172	VCPU_CTR3(vm, vcpuid, "vm_handle_paging rv = %d, gpa = %#lx, "
1173	    "ftype = %d", rv, vme->u.paging.gpa, ftype);
1174
1175	if (rv != KERN_SUCCESS)
1176		return (EFAULT);
1177done:
1178	/* restart execution at the faulting instruction */
1179	vme->inst_length = 0;
1180
1181	return (0);
1182}
1183
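/*
 * MMIO instruction emulation: fetch and decode the instruction that caused
 * the fault, then emulate it against one of the in-kernel device models
 * (local APIC page, IOAPIC or HPET).  Accesses to any other gpa are bounced
 * to userland by setting '*retu'.
 */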
1184static int
1185vm_handle_inst_emul(struct vm *vm, int vcpuid, bool *retu)
1186{
1187	struct vie *vie;
1188	struct vcpu *vcpu;
1189	struct vm_exit *vme;
1190	uint64_t gla, gpa;
1191	struct vm_guest_paging *paging;
1192	mem_region_read_t mread;
1193	mem_region_write_t mwrite;
1194	int error;
1195
1196	vcpu = &vm->vcpu[vcpuid];
1197	vme = &vcpu->exitinfo;
1198
1199	gla = vme->u.inst_emul.gla;
1200	gpa = vme->u.inst_emul.gpa;
1201	vie = &vme->u.inst_emul.vie;
1202	paging = &vme->u.inst_emul.paging;
1203
1204	vie_init(vie);
1205
1206	/* Fetch, decode and emulate the faulting instruction */
1207	error = vmm_fetch_instruction(vm, vcpuid, paging, vme->rip,
1208	    vme->inst_length, vie);
1209	if (error == 1)
1210		return (0);		/* Resume guest to handle page fault */
1211	else if (error == -1)
1212		return (EFAULT);
1213	else if (error != 0)
1214		panic("%s: vmm_fetch_instruction error %d", __func__, error);
1215
1216	if (vmm_decode_instruction(vm, vcpuid, gla, paging->cpu_mode, vie) != 0)
1217		return (EFAULT);
1218
1219	/* return to userland unless this is an in-kernel emulated device */
1220	if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) {
1221		mread = lapic_mmio_read;
1222		mwrite = lapic_mmio_write;
1223	} else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) {
1224		mread = vioapic_mmio_read;
1225		mwrite = vioapic_mmio_write;
1226	} else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) {
1227		mread = vhpet_mmio_read;
1228		mwrite = vhpet_mmio_write;
1229	} else {
1230		*retu = true;
1231		return (0);
1232	}
1233
1234	error = vmm_emulate_instruction(vm, vcpuid, gpa, vie, mread, mwrite,
1235	    retu);
1236
1237	return (error);
1238}
1239
1240static int
1241vm_handle_suspend(struct vm *vm, int vcpuid, bool *retu)
1242{
1243	int i, done;
1244	struct vcpu *vcpu;
1245
1246	done = 0;
1247	vcpu = &vm->vcpu[vcpuid];
1248
1249	CPU_SET_ATOMIC(vcpuid, &vm->suspended_cpus);
1250
1251	/*
1252	 * Wait until all 'active_cpus' have suspended themselves.
1253	 *
1254	 * Since a VM may be suspended at any time including when one or
1255	 * more vcpus are doing a rendezvous we need to call the rendezvous
1256	 * handler while we are waiting to prevent a deadlock.
1257	 */
1258	vcpu_lock(vcpu);
1259	while (1) {
1260		if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) {
1261			VCPU_CTR0(vm, vcpuid, "All vcpus suspended");
1262			break;
1263		}
1264
1265		if (vm->rendezvous_func == NULL) {
1266			VCPU_CTR0(vm, vcpuid, "Sleeping during suspend");
1267			vcpu_require_state_locked(vcpu, VCPU_SLEEPING);
1268			msleep_spin(vcpu, &vcpu->mtx, "vmsusp", hz);
1269			vcpu_require_state_locked(vcpu, VCPU_FROZEN);
1270		} else {
1271			VCPU_CTR0(vm, vcpuid, "Rendezvous during suspend");
1272			vcpu_unlock(vcpu);
1273			vm_handle_rendezvous(vm, vcpuid);
1274			vcpu_lock(vcpu);
1275		}
1276	}
1277	vcpu_unlock(vcpu);
1278
1279	/*
1280	 * Wakeup the other sleeping vcpus and return to userspace.
1281	 */
1282	for (i = 0; i < VM_MAXCPU; i++) {
1283		if (CPU_ISSET(i, &vm->suspended_cpus)) {
1284			vcpu_notify_event(vm, i, false);
1285		}
1286	}
1287
1288	*retu = true;
1289	return (0);
1290}
1291
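/*
 * vm_suspend() records the suspend reason (only the first request wins, via
 * the atomic cmpset) and pokes every active vcpu; each vcpu then exits with
 * VM_EXITCODE_SUSPENDED and parks itself in vm_handle_suspend() until all
 * active vcpus have done the same.
 */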
1292int
1293vm_suspend(struct vm *vm, enum vm_suspend_how how)
1294{
1295	int i;
1296
1297	if (how <= VM_SUSPEND_NONE || how >= VM_SUSPEND_LAST)
1298		return (EINVAL);
1299
1300	if (atomic_cmpset_int(&vm->suspend, 0, how) == 0) {
1301		VM_CTR2(vm, "virtual machine already suspended %d/%d",
1302		    vm->suspend, how);
1303		return (EALREADY);
1304	}
1305
1306	VM_CTR1(vm, "virtual machine successfully suspended %d", how);
1307
1308	/*
1309	 * Notify all active vcpus that they are now suspended.
1310	 */
1311	for (i = 0; i < VM_MAXCPU; i++) {
1312		if (CPU_ISSET(i, &vm->active_cpus))
1313			vcpu_notify_event(vm, i, false);
1314	}
1315
1316	return (0);
1317}
1318
1319void
1320vm_exit_suspended(struct vm *vm, int vcpuid, uint64_t rip)
1321{
1322	struct vm_exit *vmexit;
1323
1324	KASSERT(vm->suspend > VM_SUSPEND_NONE && vm->suspend < VM_SUSPEND_LAST,
1325	    ("vm_exit_suspended: invalid suspend type %d", vm->suspend));
1326
1327	vmexit = vm_exitinfo(vm, vcpuid);
1328	vmexit->rip = rip;
1329	vmexit->inst_length = 0;
1330	vmexit->exitcode = VM_EXITCODE_SUSPENDED;
1331	vmexit->u.suspended.how = vm->suspend;
1332}
1333
1334void
1335vm_exit_rendezvous(struct vm *vm, int vcpuid, uint64_t rip)
1336{
1337	struct vm_exit *vmexit;
1338
1339	KASSERT(vm->rendezvous_func != NULL, ("rendezvous not in progress"));
1340
1341	vmexit = vm_exitinfo(vm, vcpuid);
1342	vmexit->rip = rip;
1343	vmexit->inst_length = 0;
1344	vmexit->exitcode = VM_EXITCODE_RENDEZVOUS;
1345	vmm_stat_incr(vm, vcpuid, VMEXIT_RENDEZVOUS, 1);
1346}
1347
1348void
1349vm_exit_astpending(struct vm *vm, int vcpuid, uint64_t rip)
1350{
1351	struct vm_exit *vmexit;
1352
1353	vmexit = vm_exitinfo(vm, vcpuid);
1354	vmexit->rip = rip;
1355	vmexit->inst_length = 0;
1356	vmexit->exitcode = VM_EXITCODE_BOGUS;
1357	vmm_stat_incr(vm, vcpuid, VMEXIT_ASTPENDING, 1);
1358}
1359
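/*
 * Main vcpu run loop: load the guest MSRs and FPU state, call the backend's
 * VMRUN() inside a critical section and handle the exit in the kernel when
 * possible (suspend, IOAPIC EOI, rendezvous, HLT, nested page faults,
 * instruction emulation, in/out).  The loop only returns to userland when an
 * exit handler sets 'retu' or an error occurs.  'rptr' and 'sptr' let the
 * backend poll for pending rendezvous and suspend requests while the guest
 * is running.
 */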
1360int
1361vm_run(struct vm *vm, struct vm_run *vmrun)
1362{
1363	int error, vcpuid;
1364	struct vcpu *vcpu;
1365	struct pcb *pcb;
1366	uint64_t tscval, rip;
1367	struct vm_exit *vme;
1368	bool retu, intr_disabled;
1369	pmap_t pmap;
1370	void *rptr, *sptr;
1371
1372	vcpuid = vmrun->cpuid;
1373
1374	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1375		return (EINVAL);
1376
1377	if (!CPU_ISSET(vcpuid, &vm->active_cpus))
1378		return (EINVAL);
1379
1380	if (CPU_ISSET(vcpuid, &vm->suspended_cpus))
1381		return (EINVAL);
1382
1383	rptr = &vm->rendezvous_func;
1384	sptr = &vm->suspend;
1385	pmap = vmspace_pmap(vm->vmspace);
1386	vcpu = &vm->vcpu[vcpuid];
1387	vme = &vcpu->exitinfo;
1388	rip = vmrun->rip;
1389restart:
1390	critical_enter();
1391
1392	KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active),
1393	    ("vm_run: absurd pm_active"));
1394
1395	tscval = rdtsc();
1396
1397	pcb = PCPU_GET(curpcb);
1398	set_pcb_flags(pcb, PCB_FULL_IRET);
1399
1400	restore_guest_msrs(vm, vcpuid);
1401	restore_guest_fpustate(vcpu);
1402
1403	vcpu_require_state(vm, vcpuid, VCPU_RUNNING);
1404	error = VMRUN(vm->cookie, vcpuid, rip, pmap, rptr, sptr);
1405	vcpu_require_state(vm, vcpuid, VCPU_FROZEN);
1406
1407	save_guest_fpustate(vcpu);
1408	restore_host_msrs(vm, vcpuid);
1409
1410	vmm_stat_incr(vm, vcpuid, VCPU_TOTAL_RUNTIME, rdtsc() - tscval);
1411
1412	critical_exit();
1413
1414	if (error == 0) {
1415		retu = false;
1416		switch (vme->exitcode) {
1417		case VM_EXITCODE_SUSPENDED:
1418			error = vm_handle_suspend(vm, vcpuid, &retu);
1419			break;
1420		case VM_EXITCODE_IOAPIC_EOI:
1421			vioapic_process_eoi(vm, vcpuid,
1422			    vme->u.ioapic_eoi.vector);
1423			break;
1424		case VM_EXITCODE_RENDEZVOUS:
1425			vm_handle_rendezvous(vm, vcpuid);
1426			error = 0;
1427			break;
1428		case VM_EXITCODE_HLT:
1429			intr_disabled = ((vme->u.hlt.rflags & PSL_I) == 0);
1430			error = vm_handle_hlt(vm, vcpuid, intr_disabled, &retu);
1431			break;
1432		case VM_EXITCODE_PAGING:
1433			error = vm_handle_paging(vm, vcpuid, &retu);
1434			break;
1435		case VM_EXITCODE_INST_EMUL:
1436			error = vm_handle_inst_emul(vm, vcpuid, &retu);
1437			break;
1438		case VM_EXITCODE_INOUT:
1439		case VM_EXITCODE_INOUT_STR:
1440			error = vm_handle_inout(vm, vcpuid, vme, &retu);
1441			break;
1442		default:
1443			retu = true;	/* handled in userland */
1444			break;
1445		}
1446	}
1447
1448	if (error == 0 && retu == false) {
1449		rip = vme->rip + vme->inst_length;
1450		goto restart;
1451	}
1452
1453	/* copy the exit information */
1454	bcopy(vme, &vmrun->vm_exit, sizeof(struct vm_exit));
1455	return (error);
1456}
1457
1458int
1459vm_inject_exception(struct vm *vm, int vcpuid, struct vm_exception *exception)
1460{
1461	struct vcpu *vcpu;
1462
1463	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1464		return (EINVAL);
1465
1466	if (exception->vector < 0 || exception->vector >= 32)
1467		return (EINVAL);
1468
1469	vcpu = &vm->vcpu[vcpuid];
1470
1471	if (vcpu->exception_pending) {
1472		VCPU_CTR2(vm, vcpuid, "Unable to inject exception %d due to "
1473		    "pending exception %d", exception->vector,
1474		    vcpu->exception.vector);
1475		return (EBUSY);
1476	}
1477
1478	vcpu->exception_pending = 1;
1479	vcpu->exception = *exception;
1480	VCPU_CTR1(vm, vcpuid, "Exception %d pending", exception->vector);
1481	return (0);
1482}
1483
1484int
1485vm_exception_pending(struct vm *vm, int vcpuid, struct vm_exception *exception)
1486{
1487	struct vcpu *vcpu;
1488	int pending;
1489
1490	KASSERT(vcpuid >= 0 && vcpuid < VM_MAXCPU, ("invalid vcpu %d", vcpuid));
1491
1492	vcpu = &vm->vcpu[vcpuid];
1493	pending = vcpu->exception_pending;
1494	if (pending) {
1495		vcpu->exception_pending = 0;
1496		*exception = vcpu->exception;
1497		VCPU_CTR1(vm, vcpuid, "Exception %d delivered",
1498		    exception->vector);
1499	}
1500	return (pending);
1501}
1502
1503static void
1504vm_inject_fault(struct vm *vm, int vcpuid, struct vm_exception *exception)
1505{
1506	struct vm_exit *vmexit;
1507	int error;
1508
1509	error = vm_inject_exception(vm, vcpuid, exception);
1510	KASSERT(error == 0, ("vm_inject_exception error %d", error));
1511
1512	/*
1513	 * A fault-like exception allows the instruction to be restarted
1514	 * after the exception handler returns.
1515	 *
1516	 * By setting the inst_length to 0 we ensure that the instruction
1517	 * pointer remains at the faulting instruction.
1518	 */
1519	vmexit = vm_exitinfo(vm, vcpuid);
1520	vmexit->inst_length = 0;
1521}
1522
1523void
1524vm_inject_pf(struct vm *vm, int vcpuid, int error_code, uint64_t cr2)
1525{
1526	struct vm_exception pf = {
1527		.vector = IDT_PF,
1528		.error_code_valid = 1,
1529		.error_code = error_code
1530	};
1531	int error;
1532
1533	VCPU_CTR2(vm, vcpuid, "Injecting page fault: error_code %#x, cr2 %#lx",
1534	    error_code, cr2);
1535
1536	error = vm_set_register(vm, vcpuid, VM_REG_GUEST_CR2, cr2);
1537	KASSERT(error == 0, ("vm_set_register(cr2) error %d", error));
1538
1539	vm_inject_fault(vm, vcpuid, &pf);
1540}
1541
1542void
1543vm_inject_gp(struct vm *vm, int vcpuid)
1544{
1545	struct vm_exception gpf = {
1546		.vector = IDT_GP,
1547		.error_code_valid = 1,
1548		.error_code = 0
1549	};
1550
1551	vm_inject_fault(vm, vcpuid, &gpf);
1552}
1553
1554void
1555vm_inject_ud(struct vm *vm, int vcpuid)
1556{
1557	struct vm_exception udf = {
1558		.vector = IDT_UD,
1559		.error_code_valid = 0
1560	};
1561
1562	vm_inject_fault(vm, vcpuid, &udf);
1563}
1564
1565static VMM_STAT(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu");
1566
1567int
1568vm_inject_nmi(struct vm *vm, int vcpuid)
1569{
1570	struct vcpu *vcpu;
1571
1572	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1573		return (EINVAL);
1574
1575	vcpu = &vm->vcpu[vcpuid];
1576
1577	vcpu->nmi_pending = 1;
1578	vcpu_notify_event(vm, vcpuid, false);
1579	return (0);
1580}
1581
1582int
1583vm_nmi_pending(struct vm *vm, int vcpuid)
1584{
1585	struct vcpu *vcpu;
1586
1587	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1588		panic("vm_nmi_pending: invalid vcpuid %d", vcpuid);
1589
1590	vcpu = &vm->vcpu[vcpuid];
1591
1592	return (vcpu->nmi_pending);
1593}
1594
1595void
1596vm_nmi_clear(struct vm *vm, int vcpuid)
1597{
1598	struct vcpu *vcpu;
1599
1600	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1601		panic("vm_nmi_pending: invalid vcpuid %d", vcpuid);
1602
1603	vcpu = &vm->vcpu[vcpuid];
1604
1605	if (vcpu->nmi_pending == 0)
1606		panic("vm_nmi_clear: inconsistent nmi_pending state");
1607
1608	vcpu->nmi_pending = 0;
1609	vmm_stat_incr(vm, vcpuid, VCPU_NMI_COUNT, 1);
1610}
1611
1612static VMM_STAT(VCPU_EXTINT_COUNT, "number of ExtINTs delivered to vcpu");
1613
1614int
1615vm_inject_extint(struct vm *vm, int vcpuid)
1616{
1617	struct vcpu *vcpu;
1618
1619	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1620		return (EINVAL);
1621
1622	vcpu = &vm->vcpu[vcpuid];
1623
1624	vcpu->extint_pending = 1;
1625	vcpu_notify_event(vm, vcpuid, false);
1626	return (0);
1627}
1628
1629int
1630vm_extint_pending(struct vm *vm, int vcpuid)
1631{
1632	struct vcpu *vcpu;
1633
1634	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1635		panic("vm_extint_pending: invalid vcpuid %d", vcpuid);
1636
1637	vcpu = &vm->vcpu[vcpuid];
1638
1639	return (vcpu->extint_pending);
1640}
1641
1642void
1643vm_extint_clear(struct vm *vm, int vcpuid)
1644{
1645	struct vcpu *vcpu;
1646
1647	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1648		panic("vm_extint_pending: invalid vcpuid %d", vcpuid);
1649
1650	vcpu = &vm->vcpu[vcpuid];
1651
1652	if (vcpu->extint_pending == 0)
1653		panic("vm_extint_clear: inconsistent extint_pending state");
1654
1655	vcpu->extint_pending = 0;
1656	vmm_stat_incr(vm, vcpuid, VCPU_EXTINT_COUNT, 1);
1657}
1658
1659int
1660vm_get_capability(struct vm *vm, int vcpu, int type, int *retval)
1661{
1662	if (vcpu < 0 || vcpu >= VM_MAXCPU)
1663		return (EINVAL);
1664
1665	if (type < 0 || type >= VM_CAP_MAX)
1666		return (EINVAL);
1667
1668	return (VMGETCAP(vm->cookie, vcpu, type, retval));
1669}
1670
1671int
1672vm_set_capability(struct vm *vm, int vcpu, int type, int val)
1673{
1674	if (vcpu < 0 || vcpu >= VM_MAXCPU)
1675		return (EINVAL);
1676
1677	if (type < 0 || type >= VM_CAP_MAX)
1678		return (EINVAL);
1679
1680	return (VMSETCAP(vm->cookie, vcpu, type, val));
1681}
1682
1683uint64_t *
1684vm_guest_msrs(struct vm *vm, int cpu)
1685{
1686	return (vm->vcpu[cpu].guest_msrs);
1687}
1688
1689struct vlapic *
1690vm_lapic(struct vm *vm, int cpu)
1691{
1692	return (vm->vcpu[cpu].vlapic);
1693}
1694
1695struct vioapic *
1696vm_ioapic(struct vm *vm)
1697{
1698
1699	return (vm->vioapic);
1700}
1701
1702struct vhpet *
1703vm_hpet(struct vm *vm)
1704{
1705
1706	return (vm->vhpet);
1707}
1708
1709boolean_t
1710vmm_is_pptdev(int bus, int slot, int func)
1711{
1712	int found, i, n;
1713	int b, s, f;
1714	char *val, *cp, *cp2;
1715
1716	/*
1717	 * XXX
1718	 * The length of an environment variable is limited to 128 bytes which
1719	 * puts an upper limit on the number of passthru devices that may be
1720	 * specified using a single environment variable.
1721	 *
1722	 * Work around this by scanning multiple environment variable
1723	 * names instead of a single one - yuck!
1724	 */
1725	const char *names[] = { "pptdevs", "pptdevs2", "pptdevs3", NULL };
1726
1727	/* set pptdevs="1/2/3 4/5/6 7/8/9 10/11/12" */
1728	found = 0;
1729	for (i = 0; names[i] != NULL && !found; i++) {
1730		cp = val = getenv(names[i]);
1731		while (cp != NULL && *cp != '\0') {
1732			if ((cp2 = strchr(cp, ' ')) != NULL)
1733				*cp2 = '\0';
1734
1735			n = sscanf(cp, "%d/%d/%d", &b, &s, &f);
1736			if (n == 3 && bus == b && slot == s && func == f) {
1737				found = 1;
1738				break;
1739			}
1740
1741			if (cp2 != NULL)
1742				*cp2++ = ' ';
1743
1744			cp = cp2;
1745		}
1746		freeenv(val);
1747	}
1748	return (found);
1749}
1750
1751void *
1752vm_iommu_domain(struct vm *vm)
1753{
1754
1755	return (vm->iommu);
1756}
1757
1758int
1759vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state newstate,
1760    bool from_idle)
1761{
1762	int error;
1763	struct vcpu *vcpu;
1764
1765	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1766		panic("vm_set_run_state: invalid vcpuid %d", vcpuid);
1767
1768	vcpu = &vm->vcpu[vcpuid];
1769
1770	vcpu_lock(vcpu);
1771	error = vcpu_set_state_locked(vcpu, newstate, from_idle);
1772	vcpu_unlock(vcpu);
1773
1774	return (error);
1775}
1776
1777enum vcpu_state
1778vcpu_get_state(struct vm *vm, int vcpuid, int *hostcpu)
1779{
1780	struct vcpu *vcpu;
1781	enum vcpu_state state;
1782
1783	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1784		panic("vm_get_run_state: invalid vcpuid %d", vcpuid);
1785
1786	vcpu = &vm->vcpu[vcpuid];
1787
1788	vcpu_lock(vcpu);
1789	state = vcpu->state;
1790	if (hostcpu != NULL)
1791		*hostcpu = vcpu->hostcpu;
1792	vcpu_unlock(vcpu);
1793
1794	return (state);
1795}
1796
1797int
1798vm_activate_cpu(struct vm *vm, int vcpuid)
1799{
1800
1801	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1802		return (EINVAL);
1803
1804	if (CPU_ISSET(vcpuid, &vm->active_cpus))
1805		return (EBUSY);
1806
1807	VCPU_CTR0(vm, vcpuid, "activated");
1808	CPU_SET_ATOMIC(vcpuid, &vm->active_cpus);
1809	return (0);
1810}
1811
1812cpuset_t
1813vm_active_cpus(struct vm *vm)
1814{
1815
1816	return (vm->active_cpus);
1817}
1818
1819cpuset_t
1820vm_suspended_cpus(struct vm *vm)
1821{
1822
1823	return (vm->suspended_cpus);
1824}
1825
1826void *
1827vcpu_stats(struct vm *vm, int vcpuid)
1828{
1829
1830	return (vm->vcpu[vcpuid].stats);
1831}
1832
1833int
1834vm_get_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state *state)
1835{
1836	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1837		return (EINVAL);
1838
1839	*state = vm->vcpu[vcpuid].x2apic_state;
1840
1841	return (0);
1842}
1843
1844int
1845vm_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state)
1846{
1847	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1848		return (EINVAL);
1849
1850	if (state >= X2APIC_STATE_LAST)
1851		return (EINVAL);
1852
1853	vm->vcpu[vcpuid].x2apic_state = state;
1854
1855	vlapic_set_x2apic_state(vm, vcpuid, state);
1856
1857	return (0);
1858}
1859
1860/*
1861 * This function is called to ensure that a vcpu "sees" a pending event
1862 * as soon as possible:
1863 * - If the vcpu thread is sleeping then it is woken up.
1864 * - If the vcpu is running on a different host_cpu then an IPI will be directed
1865 *   to the host_cpu to cause the vcpu to trap into the hypervisor.
1866 */
1867void
1868vcpu_notify_event(struct vm *vm, int vcpuid, bool lapic_intr)
1869{
1870	int hostcpu;
1871	struct vcpu *vcpu;
1872
1873	vcpu = &vm->vcpu[vcpuid];
1874
1875	vcpu_lock(vcpu);
1876	hostcpu = vcpu->hostcpu;
1877	if (vcpu->state == VCPU_RUNNING) {
1878		KASSERT(hostcpu != NOCPU, ("vcpu running on invalid hostcpu"));
1879		if (hostcpu != curcpu) {
1880			if (lapic_intr) {
1881				vlapic_post_intr(vcpu->vlapic, hostcpu,
1882				    vmm_ipinum);
1883			} else {
1884				ipi_cpu(hostcpu, vmm_ipinum);
1885			}
1886		} else {
1887			/*
1888			 * If the 'vcpu' is running on 'curcpu' then it must
1889			 * be sending a notification to itself (e.g. SELF_IPI).
1890			 * The pending event will be picked up when the vcpu
1891			 * transitions back to guest context.
1892			 */
1893		}
1894	} else {
1895		KASSERT(hostcpu == NOCPU, ("vcpu state %d not consistent "
1896		    "with hostcpu %d", vcpu->state, hostcpu));
1897		if (vcpu->state == VCPU_SLEEPING)
1898			wakeup_one(vcpu);
1899	}
1900	vcpu_unlock(vcpu);
1901}
1902
1903struct vmspace *
1904vm_get_vmspace(struct vm *vm)
1905{
1906
1907	return (vm->vmspace);
1908}
1909
1910int
1911vm_apicid2vcpuid(struct vm *vm, int apicid)
1912{
1913	/*
1914	 * XXX apic id is assumed to be numerically identical to vcpu id
1915	 */
1916	return (apicid);
1917}
1918
1919void
1920vm_smp_rendezvous(struct vm *vm, int vcpuid, cpuset_t dest,
1921    vm_rendezvous_func_t func, void *arg)
1922{
1923	int i;
1924
1925	/*
1926	 * Enforce that this function is called without any locks
1927	 */
1928	WITNESS_WARN(WARN_PANIC, NULL, "vm_smp_rendezvous");
1929	KASSERT(vcpuid == -1 || (vcpuid >= 0 && vcpuid < VM_MAXCPU),
1930	    ("vm_smp_rendezvous: invalid vcpuid %d", vcpuid));
1931
1932restart:
1933	mtx_lock(&vm->rendezvous_mtx);
1934	if (vm->rendezvous_func != NULL) {
1935		/*
1936		 * If a rendezvous is already in progress then we need to
1937		 * call the rendezvous handler in case this 'vcpuid' is one
1938		 * of the targets of the rendezvous.
1939		 */
1940		RENDEZVOUS_CTR0(vm, vcpuid, "Rendezvous already in progress");
1941		mtx_unlock(&vm->rendezvous_mtx);
1942		vm_handle_rendezvous(vm, vcpuid);
1943		goto restart;
1944	}
1945	KASSERT(vm->rendezvous_func == NULL, ("vm_smp_rendezvous: previous "
1946	    "rendezvous is still in progress"));
1947
1948	RENDEZVOUS_CTR0(vm, vcpuid, "Initiating rendezvous");
1949	vm->rendezvous_req_cpus = dest;
1950	CPU_ZERO(&vm->rendezvous_done_cpus);
1951	vm->rendezvous_arg = arg;
1952	vm_set_rendezvous_func(vm, func);
1953	mtx_unlock(&vm->rendezvous_mtx);
1954
1955	/*
1956	 * Wake up any sleeping vcpus and trigger a VM-exit in any running
1957	 * vcpus so they handle the rendezvous as soon as possible.
1958	 */
1959	for (i = 0; i < VM_MAXCPU; i++) {
1960		if (CPU_ISSET(i, &dest))
1961			vcpu_notify_event(vm, i, false);
1962	}
1963
1964	vm_handle_rendezvous(vm, vcpuid);
1965}
1966
1967struct vatpic *
1968vm_atpic(struct vm *vm)
1969{
1970	return (vm->vatpic);
1971}
1972
1973struct vatpit *
1974vm_atpit(struct vm *vm)
1975{
1976	return (vm->vatpit);
1977}
1978
1979enum vm_reg_name
1980vm_segment_name(int seg)
1981{
1982	static enum vm_reg_name seg_names[] = {
1983		VM_REG_GUEST_ES,
1984		VM_REG_GUEST_CS,
1985		VM_REG_GUEST_SS,
1986		VM_REG_GUEST_DS,
1987		VM_REG_GUEST_FS,
1988		VM_REG_GUEST_GS
1989	};
1990
1991	KASSERT(seg >= 0 && seg < nitems(seg_names),
1992	    ("%s: invalid segment encoding %d", __func__, seg));
1993	return (seg_names[seg]);
1994}
1995
1996
1997/*
1998 * Return the amount of in-use and wired memory for the VM. Since
1999 * these are global stats, only return the values for vCPU 0.
2000 */
2001VMM_STAT_DECLARE(VMM_MEM_RESIDENT);
2002VMM_STAT_DECLARE(VMM_MEM_WIRED);
2003
2004static void
2005vm_get_rescnt(struct vm *vm, int vcpu, struct vmm_stat_type *stat)
2006{
2007
2008	if (vcpu == 0) {
2009		vmm_stat_set(vm, vcpu, VMM_MEM_RESIDENT,
2010	       	    PAGE_SIZE * vmspace_resident_count(vm->vmspace));
2011	}
2012}
2013
2014static void
2015vm_get_wiredcnt(struct vm *vm, int vcpu, struct vmm_stat_type *stat)
2016{
2017
2018	if (vcpu == 0) {
2019		vmm_stat_set(vm, vcpu, VMM_MEM_WIRED,
2020	      	    PAGE_SIZE * pmap_wired_count(vmspace_pmap(vm->vmspace)));
2021	}
2022}
2023
2024VMM_STAT_FUNC(VMM_MEM_RESIDENT, "Resident memory", vm_get_rescnt);
2025VMM_STAT_FUNC(VMM_MEM_WIRED, "Wired memory", vm_get_wiredcnt);
2026