vmm.c revision 268976
1/*-
2 * Copyright (c) 2011 NetApp, Inc.
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 *
26 * $FreeBSD: stable/10/sys/amd64/vmm/vmm.c 268976 2014-07-22 04:39:16Z jhb $
27 */
28
29#include <sys/cdefs.h>
30__FBSDID("$FreeBSD: stable/10/sys/amd64/vmm/vmm.c 268976 2014-07-22 04:39:16Z jhb $");
31
32#include <sys/param.h>
33#include <sys/systm.h>
34#include <sys/kernel.h>
35#include <sys/module.h>
36#include <sys/sysctl.h>
37#include <sys/malloc.h>
38#include <sys/pcpu.h>
39#include <sys/lock.h>
40#include <sys/mutex.h>
41#include <sys/proc.h>
42#include <sys/rwlock.h>
43#include <sys/sched.h>
44#include <sys/smp.h>
45#include <sys/systm.h>
46
47#include <vm/vm.h>
48#include <vm/vm_object.h>
49#include <vm/vm_page.h>
50#include <vm/pmap.h>
51#include <vm/vm_map.h>
52#include <vm/vm_extern.h>
53#include <vm/vm_param.h>
54
55#include <machine/cpu.h>
56#include <machine/vm.h>
57#include <machine/pcb.h>
58#include <machine/smp.h>
59#include <x86/psl.h>
60#include <x86/apicreg.h>
61#include <machine/vmparam.h>
62
63#include <machine/vmm.h>
64#include <machine/vmm_dev.h>
65#include <machine/vmm_instruction_emul.h>
66
67#include "vmm_ioport.h"
68#include "vmm_ktr.h"
69#include "vmm_host.h"
70#include "vmm_mem.h"
71#include "vmm_util.h"
72#include "vatpic.h"
73#include "vatpit.h"
74#include "vhpet.h"
75#include "vioapic.h"
76#include "vlapic.h"
77#include "vmm_msr.h"
78#include "vmm_ipi.h"
79#include "vmm_stat.h"
80#include "vmm_lapic.h"
81
82#include "io/ppt.h"
83#include "io/iommu.h"
84
85struct vlapic;
86
87struct vcpu {
88	int		flags;
89	enum vcpu_state	state;
90	struct mtx	mtx;
91	int		hostcpu;	/* host cpuid this vcpu last ran on */
92	uint64_t	guest_msrs[VMM_MSR_NUM];
93	struct vlapic	*vlapic;
94	int		 vcpuid;
95	struct savefpu	*guestfpu;	/* guest fpu state */
96	uint64_t	guest_xcr0;
97	void		*stats;
98	struct vm_exit	exitinfo;
99	enum x2apic_state x2apic_state;
100	int		nmi_pending;
101	int		extint_pending;
102	struct vm_exception exception;
103	int		exception_pending;
104};
105
106#define	vcpu_lock_init(v)	mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN)
107#define	vcpu_lock(v)		mtx_lock_spin(&((v)->mtx))
108#define	vcpu_unlock(v)		mtx_unlock_spin(&((v)->mtx))
109#define	vcpu_assert_locked(v)	mtx_assert(&((v)->mtx), MA_OWNED)
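/*
 * Note: the per-vcpu lock is a spin mutex; it protects the vcpu state
 * machine and pending-event flags and is used with msleep_spin() below.
 * The usual pattern in this file looks like (illustrative sketch):
 *
 *	vcpu_lock(vcpu);
 *	... examine or update vcpu->state, pending events, etc ...
 *	vcpu_unlock(vcpu);
 */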
110
111struct mem_seg {
112	vm_paddr_t	gpa;
113	size_t		len;
114	boolean_t	wired;
115	vm_object_t	object;
116};
117#define	VM_MAX_MEMORY_SEGMENTS	2
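/*
 * Note: guest memory is described by at most two segments. In practice
 * this typically corresponds to one segment below 4GB and, for larger
 * guests, a second segment starting at 4GB -- an assumption about how
 * userspace usually lays out guest memory, not something enforced here.
 */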
118
119struct vm {
120	void		*cookie;	/* processor-specific data */
121	void		*iommu;		/* iommu-specific data */
122	struct vhpet	*vhpet;		/* virtual HPET */
123	struct vioapic	*vioapic;	/* virtual ioapic */
124	struct vatpic	*vatpic;	/* virtual atpic */
125	struct vatpit	*vatpit;	/* virtual atpit */
126	struct vmspace	*vmspace;	/* guest's address space */
127	struct vcpu	vcpu[VM_MAXCPU];
128	int		num_mem_segs;
129	struct mem_seg	mem_segs[VM_MAX_MEMORY_SEGMENTS];
130	char		name[VM_MAX_NAMELEN];
131
132	/*
133	 * Set of active vcpus.
134	 * An active vcpu is one that has been started implicitly (BSP) or
135	 * explicitly (AP) by sending it a startup ipi.
136	 */
137	volatile cpuset_t active_cpus;
138
139	struct mtx	rendezvous_mtx;
140	cpuset_t	rendezvous_req_cpus;
141	cpuset_t	rendezvous_done_cpus;
142	void		*rendezvous_arg;
143	vm_rendezvous_func_t rendezvous_func;
144
145	int		suspend;
146	volatile cpuset_t suspended_cpus;
147
148	volatile cpuset_t halted_cpus;
149};
150
151static int vmm_initialized;
152
153static struct vmm_ops *ops;
154#define	VMM_INIT(num)	(ops != NULL ? (*ops->init)(num) : 0)
155#define	VMM_CLEANUP()	(ops != NULL ? (*ops->cleanup)() : 0)
156#define	VMM_RESUME()	(ops != NULL ? (*ops->resume)() : 0)
157
158#define	VMINIT(vm, pmap) (ops != NULL ? (*ops->vminit)(vm, pmap): NULL)
159#define	VMRUN(vmi, vcpu, rip, pmap, rptr, sptr) \
160	(ops != NULL ? (*ops->vmrun)(vmi, vcpu, rip, pmap, rptr, sptr) : ENXIO)
161#define	VMCLEANUP(vmi)	(ops != NULL ? (*ops->vmcleanup)(vmi) : NULL)
162#define	VMSPACE_ALLOC(min, max) \
163	(ops != NULL ? (*ops->vmspace_alloc)(min, max) : NULL)
164#define	VMSPACE_FREE(vmspace) \
165	(ops != NULL ? (*ops->vmspace_free)(vmspace) : ENXIO)
166#define	VMGETREG(vmi, vcpu, num, retval)		\
167	(ops != NULL ? (*ops->vmgetreg)(vmi, vcpu, num, retval) : ENXIO)
168#define	VMSETREG(vmi, vcpu, num, val)		\
169	(ops != NULL ? (*ops->vmsetreg)(vmi, vcpu, num, val) : ENXIO)
170#define	VMGETDESC(vmi, vcpu, num, desc)		\
171	(ops != NULL ? (*ops->vmgetdesc)(vmi, vcpu, num, desc) : ENXIO)
172#define	VMSETDESC(vmi, vcpu, num, desc)		\
173	(ops != NULL ? (*ops->vmsetdesc)(vmi, vcpu, num, desc) : ENXIO)
174#define	VMGETCAP(vmi, vcpu, num, retval)	\
175	(ops != NULL ? (*ops->vmgetcap)(vmi, vcpu, num, retval) : ENXIO)
176#define	VMSETCAP(vmi, vcpu, num, val)		\
177	(ops != NULL ? (*ops->vmsetcap)(vmi, vcpu, num, val) : ENXIO)
178#define	VLAPIC_INIT(vmi, vcpu)			\
179	(ops != NULL ? (*ops->vlapic_init)(vmi, vcpu) : NULL)
180#define	VLAPIC_CLEANUP(vmi, vlapic)		\
181	(ops != NULL ? (*ops->vlapic_cleanup)(vmi, vlapic) : NULL)
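/*
 * The macros above dispatch into the hardware-specific backend
 * ('vmm_ops_intel' for VT-x or 'vmm_ops_amd' for SVM) selected in
 * vmm_init(). If no backend has been initialized the wrappers fail
 * gracefully, typically by returning ENXIO or NULL.
 */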
182
183#define	fpu_start_emulating()	load_cr0(rcr0() | CR0_TS)
184#define	fpu_stop_emulating()	clts()
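/*
 * fpu_start_emulating() sets CR0.TS so that any host FPU access traps
 * with #NM while the guest's FPU state is loaded; fpu_stop_emulating()
 * clears CR0.TS via clts before the FPU is touched. See
 * restore_guest_fpustate() and save_guest_fpustate() for the pairing.
 */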
185
186static MALLOC_DEFINE(M_VM, "vm", "vm");
187CTASSERT(VMM_MSR_NUM <= 64);	/* msr_mask can keep track of up to 64 msrs */
188
189/* statistics */
190static VMM_STAT(VCPU_TOTAL_RUNTIME, "vcpu total runtime");
191
192SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW, NULL, NULL);
193
194/*
195 * Halt the guest if all vcpus are executing a HLT instruction with
196 * interrupts disabled.
197 */
198static int halt_detection_enabled = 1;
199TUNABLE_INT("hw.vmm.halt_detection", &halt_detection_enabled);
200SYSCTL_INT(_hw_vmm, OID_AUTO, halt_detection, CTLFLAG_RDTUN,
201    &halt_detection_enabled, 0,
202    "Halt VM if all vcpus execute HLT with interrupts disabled");
203
204static int vmm_ipinum;
205SYSCTL_INT(_hw_vmm, OID_AUTO, ipinum, CTLFLAG_RD, &vmm_ipinum, 0,
206    "IPI vector used for vcpu notifications");
207
208static void
209vcpu_cleanup(struct vm *vm, int i)
210{
211	struct vcpu *vcpu = &vm->vcpu[i];
212
213	VLAPIC_CLEANUP(vm->cookie, vcpu->vlapic);
214	vmm_stat_free(vcpu->stats);
215	fpu_save_area_free(vcpu->guestfpu);
216}
217
218static void
219vcpu_init(struct vm *vm, uint32_t vcpu_id)
220{
221	struct vcpu *vcpu;
222
223	vcpu = &vm->vcpu[vcpu_id];
224
225	vcpu_lock_init(vcpu);
226	vcpu->hostcpu = NOCPU;
227	vcpu->vcpuid = vcpu_id;
228	vcpu->vlapic = VLAPIC_INIT(vm->cookie, vcpu_id);
229	vm_set_x2apic_state(vm, vcpu_id, X2APIC_DISABLED);
230	vcpu->guest_xcr0 = XFEATURE_ENABLED_X87;
231	vcpu->guestfpu = fpu_save_area_alloc();
232	fpu_save_area_reset(vcpu->guestfpu);
233	vcpu->stats = vmm_stat_alloc();
234}
235
236struct vm_exit *
237vm_exitinfo(struct vm *vm, int cpuid)
238{
239	struct vcpu *vcpu;
240
241	if (cpuid < 0 || cpuid >= VM_MAXCPU)
242		panic("vm_exitinfo: invalid cpuid %d", cpuid);
243
244	vcpu = &vm->vcpu[cpuid];
245
246	return (&vcpu->exitinfo);
247}
248
249static void
250vmm_resume(void)
251{
252	VMM_RESUME();
253}
254
255static int
256vmm_init(void)
257{
258	int error;
259
260	vmm_host_state_init();
261
262	vmm_ipinum = vmm_ipi_alloc();
263	if (vmm_ipinum == 0)
264		vmm_ipinum = IPI_AST;
265
266	error = vmm_mem_init();
267	if (error)
268		return (error);
269
270	if (vmm_is_intel())
271		ops = &vmm_ops_intel;
272	else if (vmm_is_amd())
273		ops = &vmm_ops_amd;
274	else
275		return (ENXIO);
276
277	vmm_msr_init();
278	vmm_resume_p = vmm_resume;
279
280	return (VMM_INIT(vmm_ipinum));
281}
282
283static int
284vmm_handler(module_t mod, int what, void *arg)
285{
286	int error;
287
288	switch (what) {
289	case MOD_LOAD:
290		vmmdev_init();
291		if (ppt_avail_devices() > 0)
292			iommu_init();
293		error = vmm_init();
294		if (error == 0)
295			vmm_initialized = 1;
296		break;
297	case MOD_UNLOAD:
298		error = vmmdev_cleanup();
299		if (error == 0) {
300			vmm_resume_p = NULL;
301			iommu_cleanup();
302			if (vmm_ipinum != IPI_AST)
303				vmm_ipi_free(vmm_ipinum);
304			error = VMM_CLEANUP();
305			/*
306			 * Something bad happened - prevent new
307			 * VMs from being created
308			 */
309			if (error)
310				vmm_initialized = 0;
311		}
312		break;
313	default:
314		error = 0;
315		break;
316	}
317	return (error);
318}
319
320static moduledata_t vmm_kmod = {
321	"vmm",
322	vmm_handler,
323	NULL
324};
325
326/*
327 * vmm initialization has the following dependencies:
328 *
329 * - iommu initialization must happen after the pci passthru driver has had
330 *   a chance to attach to any passthru devices (after SI_SUB_CONFIGURE).
331 *
332 * - VT-x initialization requires smp_rendezvous() and therefore must happen
333 *   after SMP is fully functional (after SI_SUB_SMP).
334 */
335DECLARE_MODULE(vmm, vmm_kmod, SI_SUB_SMP + 1, SI_ORDER_ANY);
336MODULE_VERSION(vmm, 1);
337
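/*
 * Create a new virtual machine. This allocates the guest address space,
 * instantiates the in-kernel virtual devices (ioapic, hpet, atpic, atpit),
 * initializes all VM_MAXCPU vcpus and marks the BSP (vcpu 0) active.
 * Guest memory is not allocated here; it is added later via vm_malloc().
 */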
338int
339vm_create(const char *name, struct vm **retvm)
340{
341	int i;
342	struct vm *vm;
343	struct vmspace *vmspace;
344
345	const int BSP = 0;
346
347	/*
348	 * If vmm.ko could not be successfully initialized then don't attempt
349	 * to create the virtual machine.
350	 */
351	if (!vmm_initialized)
352		return (ENXIO);
353
354	if (name == NULL || strlen(name) >= VM_MAX_NAMELEN)
355		return (EINVAL);
356
357	vmspace = VMSPACE_ALLOC(VM_MIN_ADDRESS, VM_MAXUSER_ADDRESS);
358	if (vmspace == NULL)
359		return (ENOMEM);
360
361	vm = malloc(sizeof(struct vm), M_VM, M_WAITOK | M_ZERO);
362	strcpy(vm->name, name);
363	vm->vmspace = vmspace;
364	mtx_init(&vm->rendezvous_mtx, "vm rendezvous lock", 0, MTX_DEF);
365	vm->cookie = VMINIT(vm, vmspace_pmap(vmspace));
366	vm->vioapic = vioapic_init(vm);
367	vm->vhpet = vhpet_init(vm);
368	vm->vatpic = vatpic_init(vm);
369	vm->vatpit = vatpit_init(vm);
370
371	for (i = 0; i < VM_MAXCPU; i++) {
372		vcpu_init(vm, i);
373		guest_msrs_init(vm, i);
374	}
375
376	vm_activate_cpu(vm, BSP);
377
378	*retvm = vm;
379	return (0);
380}
381
382static void
383vm_free_mem_seg(struct vm *vm, struct mem_seg *seg)
384{
385
386	if (seg->object != NULL)
387		vmm_mem_free(vm->vmspace, seg->gpa, seg->len);
388
389	bzero(seg, sizeof(*seg));
390}
391
392void
393vm_destroy(struct vm *vm)
394{
395	int i;
396
397	ppt_unassign_all(vm);
398
399	if (vm->iommu != NULL)
400		iommu_destroy_domain(vm->iommu);
401
402	vatpit_cleanup(vm->vatpit);
403	vhpet_cleanup(vm->vhpet);
404	vatpic_cleanup(vm->vatpic);
405	vioapic_cleanup(vm->vioapic);
406
407	for (i = 0; i < vm->num_mem_segs; i++)
408		vm_free_mem_seg(vm, &vm->mem_segs[i]);
409
410	vm->num_mem_segs = 0;
411
412	for (i = 0; i < VM_MAXCPU; i++)
413		vcpu_cleanup(vm, i);
414
415	VMSPACE_FREE(vm->vmspace);
416
417	VMCLEANUP(vm->cookie);
418
419	free(vm, M_VM);
420}
421
422const char *
423vm_name(struct vm *vm)
424{
425	return (vm->name);
426}
427
428int
429vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
430{
431	vm_object_t obj;
432
433	if ((obj = vmm_mmio_alloc(vm->vmspace, gpa, len, hpa)) == NULL)
434		return (ENOMEM);
435	else
436		return (0);
437}
438
439int
440vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len)
441{
442
443	vmm_mmio_free(vm->vmspace, gpa, len);
444	return (0);
445}
446
447boolean_t
448vm_mem_allocated(struct vm *vm, vm_paddr_t gpa)
449{
450	int i;
451	vm_paddr_t gpabase, gpalimit;
452
453	for (i = 0; i < vm->num_mem_segs; i++) {
454		gpabase = vm->mem_segs[i].gpa;
455		gpalimit = gpabase + vm->mem_segs[i].len;
456		if (gpa >= gpabase && gpa < gpalimit)
457			return (TRUE);		/* 'gpa' is regular memory */
458	}
459
460	if (ppt_is_mmio(vm, gpa))
461		return (TRUE);			/* 'gpa' is pci passthru mmio */
462
463	return (FALSE);
464}
465
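/*
 * Allocate a region of guest physical memory. 'gpa' and 'len' must be
 * page aligned, and the range must be either entirely unallocated (a new
 * segment is created) or entirely allocated already (in which case this
 * is a no-op). At most VM_MAX_MEMORY_SEGMENTS segments may be created.
 */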
466int
467vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len)
468{
469	int available, allocated;
470	struct mem_seg *seg;
471	vm_object_t object;
472	vm_paddr_t g;
473
474	if ((gpa & PAGE_MASK) || (len & PAGE_MASK) || len == 0)
475		return (EINVAL);
476
477	available = allocated = 0;
478	g = gpa;
479	while (g < gpa + len) {
480		if (vm_mem_allocated(vm, g))
481			allocated++;
482		else
483			available++;
484
485		g += PAGE_SIZE;
486	}
487
488	/*
489	 * If there are some allocated and some available pages in the address
490	 * range then it is an error.
491	 */
492	if (allocated && available)
493		return (EINVAL);
494
495	/*
496	 * If the entire address range being requested has already been
497	 * allocated then there isn't anything more to do.
498	 */
499	if (allocated && available == 0)
500		return (0);
501
502	if (vm->num_mem_segs >= VM_MAX_MEMORY_SEGMENTS)
503		return (E2BIG);
504
505	seg = &vm->mem_segs[vm->num_mem_segs];
506
507	if ((object = vmm_mem_alloc(vm->vmspace, gpa, len)) == NULL)
508		return (ENOMEM);
509
510	seg->gpa = gpa;
511	seg->len = len;
512	seg->object = object;
513	seg->wired = FALSE;
514
515	vm->num_mem_segs++;
516
517	return (0);
518}
519
520static void
521vm_gpa_unwire(struct vm *vm)
522{
523	int i, rv;
524	struct mem_seg *seg;
525
526	for (i = 0; i < vm->num_mem_segs; i++) {
527		seg = &vm->mem_segs[i];
528		if (!seg->wired)
529			continue;
530
531		rv = vm_map_unwire(&vm->vmspace->vm_map,
532				   seg->gpa, seg->gpa + seg->len,
533				   VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
534		KASSERT(rv == KERN_SUCCESS, ("vm(%s) memory segment "
535		    "%#lx/%ld could not be unwired: %d",
536		    vm_name(vm), seg->gpa, seg->len, rv));
537
538		seg->wired = FALSE;
539	}
540}
541
542static int
543vm_gpa_wire(struct vm *vm)
544{
545	int i, rv;
546	struct mem_seg *seg;
547
548	for (i = 0; i < vm->num_mem_segs; i++) {
549		seg = &vm->mem_segs[i];
550		if (seg->wired)
551			continue;
552
553		/* XXX rlimits? */
554		rv = vm_map_wire(&vm->vmspace->vm_map,
555				 seg->gpa, seg->gpa + seg->len,
556				 VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
557		if (rv != KERN_SUCCESS)
558			break;
559
560		seg->wired = TRUE;
561	}
562
563	if (i < vm->num_mem_segs) {
564		/*
565		 * Undo the wiring before returning an error.
566		 */
567		vm_gpa_unwire(vm);
568		return (EAGAIN);
569	}
570
571	return (0);
572}
573
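/*
 * Walk every wired memory segment and either install (map == TRUE) or
 * remove (map == FALSE) the gpa->hpa translations in the VM's iommu
 * domain. A page mapped into the VM's domain is removed from the host
 * domain and vice versa, so a page is never mapped in both domains at
 * the same time.
 */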
574static void
575vm_iommu_modify(struct vm *vm, boolean_t map)
576{
577	int i, sz;
578	vm_paddr_t gpa, hpa;
579	struct mem_seg *seg;
580	void *vp, *cookie, *host_domain;
581
582	sz = PAGE_SIZE;
583	host_domain = iommu_host_domain();
584
585	for (i = 0; i < vm->num_mem_segs; i++) {
586		seg = &vm->mem_segs[i];
587		KASSERT(seg->wired, ("vm(%s) memory segment %#lx/%ld not wired",
588		    vm_name(vm), seg->gpa, seg->len));
589
590		gpa = seg->gpa;
591		while (gpa < seg->gpa + seg->len) {
592			vp = vm_gpa_hold(vm, gpa, PAGE_SIZE, VM_PROT_WRITE,
593					 &cookie);
594			KASSERT(vp != NULL, ("vm(%s) could not map gpa %#lx",
595			    vm_name(vm), gpa));
596
597			vm_gpa_release(cookie);
598
599			hpa = DMAP_TO_PHYS((uintptr_t)vp);
600			if (map) {
601				iommu_create_mapping(vm->iommu, gpa, hpa, sz);
602				iommu_remove_mapping(host_domain, hpa, sz);
603			} else {
604				iommu_remove_mapping(vm->iommu, gpa, sz);
605				iommu_create_mapping(host_domain, hpa, hpa, sz);
606			}
607
608			gpa += PAGE_SIZE;
609		}
610	}
611
612	/*
613	 * Invalidate the cached translations associated with the domain
614	 * from which pages were removed.
615	 */
616	if (map)
617		iommu_invalidate_tlb(host_domain);
618	else
619		iommu_invalidate_tlb(vm->iommu);
620}
621
622#define	vm_iommu_unmap(vm)	vm_iommu_modify((vm), FALSE)
623#define	vm_iommu_map(vm)	vm_iommu_modify((vm), TRUE)
624
625int
626vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func)
627{
628	int error;
629
630	error = ppt_unassign_device(vm, bus, slot, func);
631	if (error)
632		return (error);
633
634	if (ppt_assigned_devices(vm) == 0) {
635		vm_iommu_unmap(vm);
636		vm_gpa_unwire(vm);
637	}
638	return (0);
639}
640
641int
642vm_assign_pptdev(struct vm *vm, int bus, int slot, int func)
643{
644	int error;
645	vm_paddr_t maxaddr;
646
647	/*
648	 * Virtual machines with pci passthru devices get special treatment:
649	 * - the guest physical memory is wired
650	 * - the iommu is programmed to do the 'gpa' to 'hpa' translation
651	 *
652	 * We need to do this before the first pci passthru device is attached.
653	 */
654	if (ppt_assigned_devices(vm) == 0) {
655		KASSERT(vm->iommu == NULL,
656		    ("vm_assign_pptdev: iommu must be NULL"));
657		maxaddr = vmm_mem_maxaddr();
658		vm->iommu = iommu_create_domain(maxaddr);
659
660		error = vm_gpa_wire(vm);
661		if (error)
662			return (error);
663
664		vm_iommu_map(vm);
665	}
666
667	error = ppt_assign_device(vm, bus, slot, func);
668	return (error);
669}
670
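/*
 * Wire a single guest physical page and return a host (direct map)
 * pointer to it. The 'gpa/len' range must not cross a page boundary.
 * The caller must release the page when done, e.g. (illustrative):
 *
 *	void *cookie;
 *	void *hva;
 *
 *	hva = vm_gpa_hold(vm, gpa, len, VM_PROT_READ, &cookie);
 *	if (hva != NULL) {
 *		... access up to 'len' bytes at 'hva' ...
 *		vm_gpa_release(cookie);
 *	}
 */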
671void *
672vm_gpa_hold(struct vm *vm, vm_paddr_t gpa, size_t len, int reqprot,
673	    void **cookie)
674{
675	int count, pageoff;
676	vm_page_t m;
677
678	pageoff = gpa & PAGE_MASK;
679	if (len > PAGE_SIZE - pageoff)
680		panic("vm_gpa_hold: invalid gpa/len: 0x%016lx/%lu", gpa, len);
681
682	count = vm_fault_quick_hold_pages(&vm->vmspace->vm_map,
683	    trunc_page(gpa), PAGE_SIZE, reqprot, &m, 1);
684
685	if (count == 1) {
686		*cookie = m;
687		return ((void *)(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)) + pageoff));
688	} else {
689		*cookie = NULL;
690		return (NULL);
691	}
692}
693
694void
695vm_gpa_release(void *cookie)
696{
697	vm_page_t m = cookie;
698
699	vm_page_lock(m);
700	vm_page_unhold(m);
701	vm_page_unlock(m);
702}
703
704int
705vm_gpabase2memseg(struct vm *vm, vm_paddr_t gpabase,
706		  struct vm_memory_segment *seg)
707{
708	int i;
709
710	for (i = 0; i < vm->num_mem_segs; i++) {
711		if (gpabase == vm->mem_segs[i].gpa) {
712			seg->gpa = vm->mem_segs[i].gpa;
713			seg->len = vm->mem_segs[i].len;
714			seg->wired = vm->mem_segs[i].wired;
715			return (0);
716		}
717	}
718	return (-1);
719}
720
721int
722vm_get_memobj(struct vm *vm, vm_paddr_t gpa, size_t len,
723	      vm_offset_t *offset, struct vm_object **object)
724{
725	int i;
726	size_t seg_len;
727	vm_paddr_t seg_gpa;
728	vm_object_t seg_obj;
729
730	for (i = 0; i < vm->num_mem_segs; i++) {
731		if ((seg_obj = vm->mem_segs[i].object) == NULL)
732			continue;
733
734		seg_gpa = vm->mem_segs[i].gpa;
735		seg_len = vm->mem_segs[i].len;
736
737		if (gpa >= seg_gpa && gpa < seg_gpa + seg_len) {
738			*offset = gpa - seg_gpa;
739			*object = seg_obj;
740			vm_object_reference(seg_obj);
741			return (0);
742		}
743	}
744
745	return (EINVAL);
746}
747
748int
749vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval)
750{
751
752	if (vcpu < 0 || vcpu >= VM_MAXCPU)
753		return (EINVAL);
754
755	if (reg >= VM_REG_LAST)
756		return (EINVAL);
757
758	return (VMGETREG(vm->cookie, vcpu, reg, retval));
759}
760
761int
762vm_set_register(struct vm *vm, int vcpu, int reg, uint64_t val)
763{
764
765	if (vcpu < 0 || vcpu >= VM_MAXCPU)
766		return (EINVAL);
767
768	if (reg >= VM_REG_LAST)
769		return (EINVAL);
770
771	return (VMSETREG(vm->cookie, vcpu, reg, val));
772}
773
774static boolean_t
775is_descriptor_table(int reg)
776{
777
778	switch (reg) {
779	case VM_REG_GUEST_IDTR:
780	case VM_REG_GUEST_GDTR:
781		return (TRUE);
782	default:
783		return (FALSE);
784	}
785}
786
787static boolean_t
788is_segment_register(int reg)
789{
790
791	switch (reg) {
792	case VM_REG_GUEST_ES:
793	case VM_REG_GUEST_CS:
794	case VM_REG_GUEST_SS:
795	case VM_REG_GUEST_DS:
796	case VM_REG_GUEST_FS:
797	case VM_REG_GUEST_GS:
798	case VM_REG_GUEST_TR:
799	case VM_REG_GUEST_LDTR:
800		return (TRUE);
801	default:
802		return (FALSE);
803	}
804}
805
806int
807vm_get_seg_desc(struct vm *vm, int vcpu, int reg,
808		struct seg_desc *desc)
809{
810
811	if (vcpu < 0 || vcpu >= VM_MAXCPU)
812		return (EINVAL);
813
814	if (!is_segment_register(reg) && !is_descriptor_table(reg))
815		return (EINVAL);
816
817	return (VMGETDESC(vm->cookie, vcpu, reg, desc));
818}
819
820int
821vm_set_seg_desc(struct vm *vm, int vcpu, int reg,
822		struct seg_desc *desc)
823{
824	if (vcpu < 0 || vcpu >= VM_MAXCPU)
825		return (EINVAL);
826
827	if (!is_segment_register(reg) && !is_descriptor_table(reg))
828		return (EINVAL);
829
830	return (VMSETDESC(vm->cookie, vcpu, reg, desc));
831}
832
833static void
834restore_guest_fpustate(struct vcpu *vcpu)
835{
836
837	/* flush host state to the pcb */
838	fpuexit(curthread);
839
840	/* restore guest FPU state */
841	fpu_stop_emulating();
842	fpurestore(vcpu->guestfpu);
843
844	/* restore guest XCR0 if XSAVE is enabled in the host */
845	if (rcr4() & CR4_XSAVE)
846		load_xcr(0, vcpu->guest_xcr0);
847
848	/*
849	 * The FPU is now "dirty" with the guest's state so turn on emulation
850	 * to trap any access to the FPU by the host.
851	 */
852	fpu_start_emulating();
853}
854
855static void
856save_guest_fpustate(struct vcpu *vcpu)
857{
858
859	if ((rcr0() & CR0_TS) == 0)
860		panic("fpu emulation not enabled in host!");
861
862	/* save guest XCR0 and restore host XCR0 */
863	if (rcr4() & CR4_XSAVE) {
864		vcpu->guest_xcr0 = rxcr(0);
865		load_xcr(0, vmm_get_host_xcr0());
866	}
867
868	/* save guest FPU state */
869	fpu_stop_emulating();
870	fpusave(vcpu->guestfpu);
871	fpu_start_emulating();
872}
873
874static VMM_STAT(VCPU_IDLE_TICKS, "number of ticks vcpu was idle");
875
876static int
877vcpu_set_state_locked(struct vcpu *vcpu, enum vcpu_state newstate,
878    bool from_idle)
879{
880	int error;
881
882	vcpu_assert_locked(vcpu);
883
884	/*
885	 * State transitions from the vmmdev_ioctl() must always begin from
886	 * the VCPU_IDLE state. This guarantees that there is only a single
887	 * ioctl() operating on a vcpu at any point.
888	 */
889	if (from_idle) {
890		while (vcpu->state != VCPU_IDLE)
891			msleep_spin(&vcpu->state, &vcpu->mtx, "vmstat", hz);
892	} else {
893		KASSERT(vcpu->state != VCPU_IDLE, ("invalid transition from "
894		    "vcpu idle state"));
895	}
896
897	if (vcpu->state == VCPU_RUNNING) {
898		KASSERT(vcpu->hostcpu == curcpu, ("curcpu %d and hostcpu %d "
899		    "mismatch for running vcpu", curcpu, vcpu->hostcpu));
900	} else {
901		KASSERT(vcpu->hostcpu == NOCPU, ("Invalid hostcpu %d for a "
902		    "vcpu that is not running", vcpu->hostcpu));
903	}
904
905	/*
906	 * The following state transitions are allowed:
907	 * IDLE -> FROZEN -> IDLE
908	 * FROZEN -> RUNNING -> FROZEN
909	 * FROZEN -> SLEEPING -> FROZEN
910	 */
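	/*
	 * For example, the vmmdev ioctl path moves a vcpu IDLE -> FROZEN
	 * before operating on it and back to IDLE afterwards, while
	 * vm_run() toggles FROZEN -> RUNNING around VMRUN() and
	 * vm_handle_hlt() uses FROZEN -> SLEEPING while waiting for an
	 * interrupt.
	 */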
911	switch (vcpu->state) {
912	case VCPU_IDLE:
913	case VCPU_RUNNING:
914	case VCPU_SLEEPING:
915		error = (newstate != VCPU_FROZEN);
916		break;
917	case VCPU_FROZEN:
918		error = (newstate == VCPU_FROZEN);
919		break;
920	default:
921		error = 1;
922		break;
923	}
924
925	if (error)
926		return (EBUSY);
927
928	vcpu->state = newstate;
929	if (newstate == VCPU_RUNNING)
930		vcpu->hostcpu = curcpu;
931	else
932		vcpu->hostcpu = NOCPU;
933
934	if (newstate == VCPU_IDLE)
935		wakeup(&vcpu->state);
936
937	return (0);
938}
939
940static void
941vcpu_require_state(struct vm *vm, int vcpuid, enum vcpu_state newstate)
942{
943	int error;
944
945	if ((error = vcpu_set_state(vm, vcpuid, newstate, false)) != 0)
946		panic("Error %d setting state to %d\n", error, newstate);
947}
948
949static void
950vcpu_require_state_locked(struct vcpu *vcpu, enum vcpu_state newstate)
951{
952	int error;
953
954	if ((error = vcpu_set_state_locked(vcpu, newstate, false)) != 0)
955		panic("Error %d setting state to %d", error, newstate);
956}
957
958static void
959vm_set_rendezvous_func(struct vm *vm, vm_rendezvous_func_t func)
960{
961
962	KASSERT(mtx_owned(&vm->rendezvous_mtx), ("rendezvous_mtx not locked"));
963
964	/*
965	 * Update 'rendezvous_func' and execute a write memory barrier to
966	 * ensure that it is visible across all host cpus. This is not needed
967	 * for correctness but it does ensure that all the vcpus will notice
968	 * that the rendezvous is requested immediately.
969	 */
970	vm->rendezvous_func = func;
971	wmb();
972}
973
974#define	RENDEZVOUS_CTR0(vm, vcpuid, fmt)				\
975	do {								\
976		if (vcpuid >= 0)					\
977			VCPU_CTR0(vm, vcpuid, fmt);			\
978		else							\
979			VM_CTR0(vm, fmt);				\
980	} while (0)
981
982static void
983vm_handle_rendezvous(struct vm *vm, int vcpuid)
984{
985
986	KASSERT(vcpuid == -1 || (vcpuid >= 0 && vcpuid < VM_MAXCPU),
987	    ("vm_handle_rendezvous: invalid vcpuid %d", vcpuid));
988
989	mtx_lock(&vm->rendezvous_mtx);
990	while (vm->rendezvous_func != NULL) {
991		/* 'rendezvous_req_cpus' must be a subset of 'active_cpus' */
992		CPU_AND(&vm->rendezvous_req_cpus, &vm->active_cpus);
993
994		if (vcpuid != -1 &&
995		    CPU_ISSET(vcpuid, &vm->rendezvous_req_cpus) &&
996		    !CPU_ISSET(vcpuid, &vm->rendezvous_done_cpus)) {
997			VCPU_CTR0(vm, vcpuid, "Calling rendezvous func");
998			(*vm->rendezvous_func)(vm, vcpuid, vm->rendezvous_arg);
999			CPU_SET(vcpuid, &vm->rendezvous_done_cpus);
1000		}
1001		if (CPU_CMP(&vm->rendezvous_req_cpus,
1002		    &vm->rendezvous_done_cpus) == 0) {
1003			VCPU_CTR0(vm, vcpuid, "Rendezvous completed");
1004			vm_set_rendezvous_func(vm, NULL);
1005			wakeup(&vm->rendezvous_func);
1006			break;
1007		}
1008		RENDEZVOUS_CTR0(vm, vcpuid, "Wait for rendezvous completion");
1009		mtx_sleep(&vm->rendezvous_func, &vm->rendezvous_mtx, 0,
1010		    "vmrndv", 0);
1011	}
1012	mtx_unlock(&vm->rendezvous_mtx);
1013}
1014
1015/*
1016 * Emulate a guest 'hlt' by sleeping until the vcpu is ready to run.
1017 */
1018static int
1019vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled, bool *retu)
1020{
1021	struct vcpu *vcpu;
1022	const char *wmesg;
1023	int t, vcpu_halted, vm_halted;
1024
1025	KASSERT(!CPU_ISSET(vcpuid, &vm->halted_cpus), ("vcpu already halted"));
1026
1027	vcpu = &vm->vcpu[vcpuid];
1028	vcpu_halted = 0;
1029	vm_halted = 0;
1030
1031	vcpu_lock(vcpu);
1032	while (1) {
1033		/*
1034		 * Do a final check for pending NMI or interrupts before
1035		 * really putting this thread to sleep. Also check for
1036		 * software events that would cause this vcpu to wakeup.
1037		 *
1038		 * These interrupts/events could have happened after the
1039		 * vcpu returned from VMRUN() and before it acquired the
1040		 * vcpu lock above.
1041		 */
1042		if (vm->rendezvous_func != NULL || vm->suspend)
1043			break;
1044		if (vm_nmi_pending(vm, vcpuid))
1045			break;
1046		if (!intr_disabled) {
1047			if (vm_extint_pending(vm, vcpuid) ||
1048			    vlapic_pending_intr(vcpu->vlapic, NULL)) {
1049				break;
1050			}
1051		}
1052
1053		/*
1054		 * Some Linux guests implement "halt" by having all vcpus
1055		 * execute HLT with interrupts disabled. 'halted_cpus' keeps
1056		 * track of the vcpus that have entered this state. When all
1057		 * vcpus enter the halted state the virtual machine is halted.
1058		 */
1059		if (intr_disabled) {
1060			wmesg = "vmhalt";
1061			VCPU_CTR0(vm, vcpuid, "Halted");
1062			if (!vcpu_halted && halt_detection_enabled) {
1063				vcpu_halted = 1;
1064				CPU_SET_ATOMIC(vcpuid, &vm->halted_cpus);
1065			}
1066			if (CPU_CMP(&vm->halted_cpus, &vm->active_cpus) == 0) {
1067				vm_halted = 1;
1068				break;
1069			}
1070		} else {
1071			wmesg = "vmidle";
1072		}
1073
1074		t = ticks;
1075		vcpu_require_state_locked(vcpu, VCPU_SLEEPING);
1076		msleep_spin(vcpu, &vcpu->mtx, wmesg, 0);
1077		vcpu_require_state_locked(vcpu, VCPU_FROZEN);
1078		vmm_stat_incr(vm, vcpuid, VCPU_IDLE_TICKS, ticks - t);
1079	}
1080
1081	if (vcpu_halted)
1082		CPU_CLR_ATOMIC(vcpuid, &vm->halted_cpus);
1083
1084	vcpu_unlock(vcpu);
1085
1086	if (vm_halted)
1087		vm_suspend(vm, VM_SUSPEND_HALT);
1088
1089	return (0);
1090}
1091
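/*
 * Handle a nested page fault taken while running the guest. Accessed and
 * dirty bit updates are tried first via pmap_emulate_accessed_dirty();
 * anything else is resolved by faulting the page into the guest's
 * vmspace with vm_fault(). On success the faulting instruction is
 * restarted.
 */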
1092static int
1093vm_handle_paging(struct vm *vm, int vcpuid, bool *retu)
1094{
1095	int rv, ftype;
1096	struct vm_map *map;
1097	struct vcpu *vcpu;
1098	struct vm_exit *vme;
1099
1100	vcpu = &vm->vcpu[vcpuid];
1101	vme = &vcpu->exitinfo;
1102
1103	ftype = vme->u.paging.fault_type;
1104	KASSERT(ftype == VM_PROT_READ ||
1105	    ftype == VM_PROT_WRITE || ftype == VM_PROT_EXECUTE,
1106	    ("vm_handle_paging: invalid fault_type %d", ftype));
1107
1108	if (ftype == VM_PROT_READ || ftype == VM_PROT_WRITE) {
1109		rv = pmap_emulate_accessed_dirty(vmspace_pmap(vm->vmspace),
1110		    vme->u.paging.gpa, ftype);
1111		if (rv == 0)
1112			goto done;
1113	}
1114
1115	map = &vm->vmspace->vm_map;
1116	rv = vm_fault(map, vme->u.paging.gpa, ftype, VM_FAULT_NORMAL);
1117
1118	VCPU_CTR3(vm, vcpuid, "vm_handle_paging rv = %d, gpa = %#lx, "
1119	    "ftype = %d", rv, vme->u.paging.gpa, ftype);
1120
1121	if (rv != KERN_SUCCESS)
1122		return (EFAULT);
1123done:
1124	/* restart execution at the faulting instruction */
1125	vme->inst_length = 0;
1126
1127	return (0);
1128}
1129
1130static int
1131vm_handle_inst_emul(struct vm *vm, int vcpuid, bool *retu)
1132{
1133	struct vie *vie;
1134	struct vcpu *vcpu;
1135	struct vm_exit *vme;
1136	uint64_t gla, gpa;
1137	struct vm_guest_paging *paging;
1138	mem_region_read_t mread;
1139	mem_region_write_t mwrite;
1140	int error;
1141
1142	vcpu = &vm->vcpu[vcpuid];
1143	vme = &vcpu->exitinfo;
1144
1145	gla = vme->u.inst_emul.gla;
1146	gpa = vme->u.inst_emul.gpa;
1147	vie = &vme->u.inst_emul.vie;
1148	paging = &vme->u.inst_emul.paging;
1149
1150	vie_init(vie);
1151
1152	/* Fetch, decode and emulate the faulting instruction */
1153	error = vmm_fetch_instruction(vm, vcpuid, paging, vme->rip,
1154	    vme->inst_length, vie);
1155	if (error == 1)
1156		return (0);		/* Resume guest to handle page fault */
1157	else if (error == -1)
1158		return (EFAULT);
1159	else if (error != 0)
1160		panic("%s: vmm_fetch_instruction error %d", __func__, error);
1161
1162	if (vmm_decode_instruction(vm, vcpuid, gla, paging->cpu_mode, vie) != 0)
1163		return (EFAULT);
1164
1165	/* return to userland unless this is an in-kernel emulated device */
1166	if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) {
1167		mread = lapic_mmio_read;
1168		mwrite = lapic_mmio_write;
1169	} else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) {
1170		mread = vioapic_mmio_read;
1171		mwrite = vioapic_mmio_write;
1172	} else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) {
1173		mread = vhpet_mmio_read;
1174		mwrite = vhpet_mmio_write;
1175	} else {
1176		*retu = true;
1177		return (0);
1178	}
1179
1180	error = vmm_emulate_instruction(vm, vcpuid, gpa, vie, mread, mwrite,
1181	    retu);
1182
1183	return (error);
1184}
1185
1186static int
1187vm_handle_suspend(struct vm *vm, int vcpuid, bool *retu)
1188{
1189	int i, done;
1190	struct vcpu *vcpu;
1191
1192	done = 0;
1193	vcpu = &vm->vcpu[vcpuid];
1194
1195	CPU_SET_ATOMIC(vcpuid, &vm->suspended_cpus);
1196
1197	/*
1198	 * Wait until all 'active_cpus' have suspended themselves.
1199	 *
1200	 * Since a VM may be suspended at any time including when one or
1201	 * more vcpus are doing a rendezvous we need to call the rendezvous
1202	 * handler while we are waiting to prevent a deadlock.
1203	 */
1204	vcpu_lock(vcpu);
1205	while (1) {
1206		if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) {
1207			VCPU_CTR0(vm, vcpuid, "All vcpus suspended");
1208			break;
1209		}
1210
1211		if (vm->rendezvous_func == NULL) {
1212			VCPU_CTR0(vm, vcpuid, "Sleeping during suspend");
1213			vcpu_require_state_locked(vcpu, VCPU_SLEEPING);
1214			msleep_spin(vcpu, &vcpu->mtx, "vmsusp", hz);
1215			vcpu_require_state_locked(vcpu, VCPU_FROZEN);
1216		} else {
1217			VCPU_CTR0(vm, vcpuid, "Rendezvous during suspend");
1218			vcpu_unlock(vcpu);
1219			vm_handle_rendezvous(vm, vcpuid);
1220			vcpu_lock(vcpu);
1221		}
1222	}
1223	vcpu_unlock(vcpu);
1224
1225	/*
1226	 * Wakeup the other sleeping vcpus and return to userspace.
1227	 */
1228	for (i = 0; i < VM_MAXCPU; i++) {
1229		if (CPU_ISSET(i, &vm->suspended_cpus)) {
1230			vcpu_notify_event(vm, i, false);
1231		}
1232	}
1233
1234	*retu = true;
1235	return (0);
1236}
1237
1238int
1239vm_suspend(struct vm *vm, enum vm_suspend_how how)
1240{
1241	int i;
1242
1243	if (how <= VM_SUSPEND_NONE || how >= VM_SUSPEND_LAST)
1244		return (EINVAL);
1245
1246	if (atomic_cmpset_int(&vm->suspend, 0, how) == 0) {
1247		VM_CTR2(vm, "virtual machine already suspended %d/%d",
1248		    vm->suspend, how);
1249		return (EALREADY);
1250	}
1251
1252	VM_CTR1(vm, "virtual machine successfully suspended %d", how);
1253
1254	/*
1255	 * Notify all active vcpus that they are now suspended.
1256	 */
1257	for (i = 0; i < VM_MAXCPU; i++) {
1258		if (CPU_ISSET(i, &vm->active_cpus))
1259			vcpu_notify_event(vm, i, false);
1260	}
1261
1262	return (0);
1263}
1264
1265void
1266vm_exit_suspended(struct vm *vm, int vcpuid, uint64_t rip)
1267{
1268	struct vm_exit *vmexit;
1269
1270	KASSERT(vm->suspend > VM_SUSPEND_NONE && vm->suspend < VM_SUSPEND_LAST,
1271	    ("vm_exit_suspended: invalid suspend type %d", vm->suspend));
1272
1273	vmexit = vm_exitinfo(vm, vcpuid);
1274	vmexit->rip = rip;
1275	vmexit->inst_length = 0;
1276	vmexit->exitcode = VM_EXITCODE_SUSPENDED;
1277	vmexit->u.suspended.how = vm->suspend;
1278}
1279
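/*
 * Run a vcpu until an exit that must be handled in userspace occurs.
 * Each iteration loads the guest MSR and FPU state, calls into the
 * hardware backend via VMRUN() inside a critical section, and then
 * tries to handle the exit in the kernel. Only when 'retu' is set (or
 * on error) is the exit information copied back to the caller, which is
 * typically the userspace hypervisor looping on the VM_RUN ioctl.
 */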
1280int
1281vm_run(struct vm *vm, struct vm_run *vmrun)
1282{
1283	int error, vcpuid;
1284	struct vcpu *vcpu;
1285	struct pcb *pcb;
1286	uint64_t tscval, rip;
1287	struct vm_exit *vme;
1288	bool retu, intr_disabled;
1289	pmap_t pmap;
1290	void *rptr, *sptr;
1291
1292	vcpuid = vmrun->cpuid;
1293
1294	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1295		return (EINVAL);
1296
1297	rptr = &vm->rendezvous_func;
1298	sptr = &vm->suspend;
1299	pmap = vmspace_pmap(vm->vmspace);
1300	vcpu = &vm->vcpu[vcpuid];
1301	vme = &vcpu->exitinfo;
1302	rip = vmrun->rip;
1303restart:
1304	critical_enter();
1305
1306	KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active),
1307	    ("vm_run: absurd pm_active"));
1308
1309	tscval = rdtsc();
1310
1311	pcb = PCPU_GET(curpcb);
1312	set_pcb_flags(pcb, PCB_FULL_IRET);
1313
1314	restore_guest_msrs(vm, vcpuid);
1315	restore_guest_fpustate(vcpu);
1316
1317	vcpu_require_state(vm, vcpuid, VCPU_RUNNING);
1318	error = VMRUN(vm->cookie, vcpuid, rip, pmap, rptr, sptr);
1319	vcpu_require_state(vm, vcpuid, VCPU_FROZEN);
1320
1321	save_guest_fpustate(vcpu);
1322	restore_host_msrs(vm, vcpuid);
1323
1324	vmm_stat_incr(vm, vcpuid, VCPU_TOTAL_RUNTIME, rdtsc() - tscval);
1325
1326	critical_exit();
1327
1328	if (error == 0) {
1329		retu = false;
1330		switch (vme->exitcode) {
1331		case VM_EXITCODE_SUSPENDED:
1332			error = vm_handle_suspend(vm, vcpuid, &retu);
1333			break;
1334		case VM_EXITCODE_IOAPIC_EOI:
1335			vioapic_process_eoi(vm, vcpuid,
1336			    vme->u.ioapic_eoi.vector);
1337			break;
1338		case VM_EXITCODE_RENDEZVOUS:
1339			vm_handle_rendezvous(vm, vcpuid);
1340			error = 0;
1341			break;
1342		case VM_EXITCODE_HLT:
1343			intr_disabled = ((vme->u.hlt.rflags & PSL_I) == 0);
1344			error = vm_handle_hlt(vm, vcpuid, intr_disabled, &retu);
1345			break;
1346		case VM_EXITCODE_PAGING:
1347			error = vm_handle_paging(vm, vcpuid, &retu);
1348			break;
1349		case VM_EXITCODE_INST_EMUL:
1350			error = vm_handle_inst_emul(vm, vcpuid, &retu);
1351			break;
1352		case VM_EXITCODE_INOUT:
1353		case VM_EXITCODE_INOUT_STR:
1354			error = vm_handle_inout(vm, vcpuid, vme, &retu);
1355			break;
1356		default:
1357			retu = true;	/* handled in userland */
1358			break;
1359		}
1360	}
1361
1362	if (error == 0 && retu == false) {
1363		rip = vme->rip + vme->inst_length;
1364		goto restart;
1365	}
1366
1367	/* copy the exit information */
1368	bcopy(vme, &vmrun->vm_exit, sizeof(struct vm_exit));
1369	return (error);
1370}
1371
1372int
1373vm_inject_exception(struct vm *vm, int vcpuid, struct vm_exception *exception)
1374{
1375	struct vcpu *vcpu;
1376
1377	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1378		return (EINVAL);
1379
1380	if (exception->vector < 0 || exception->vector >= 32)
1381		return (EINVAL);
1382
1383	vcpu = &vm->vcpu[vcpuid];
1384
1385	if (vcpu->exception_pending) {
1386		VCPU_CTR2(vm, vcpuid, "Unable to inject exception %d due to "
1387		    "pending exception %d", exception->vector,
1388		    vcpu->exception.vector);
1389		return (EBUSY);
1390	}
1391
1392	vcpu->exception_pending = 1;
1393	vcpu->exception = *exception;
1394	VCPU_CTR1(vm, vcpuid, "Exception %d pending", exception->vector);
1395	return (0);
1396}
1397
1398int
1399vm_exception_pending(struct vm *vm, int vcpuid, struct vm_exception *exception)
1400{
1401	struct vcpu *vcpu;
1402	int pending;
1403
1404	KASSERT(vcpuid >= 0 && vcpuid < VM_MAXCPU, ("invalid vcpu %d", vcpuid));
1405
1406	vcpu = &vm->vcpu[vcpuid];
1407	pending = vcpu->exception_pending;
1408	if (pending) {
1409		vcpu->exception_pending = 0;
1410		*exception = vcpu->exception;
1411		VCPU_CTR1(vm, vcpuid, "Exception %d delivered",
1412		    exception->vector);
1413	}
1414	return (pending);
1415}
1416
1417static void
1418vm_inject_fault(struct vm *vm, int vcpuid, struct vm_exception *exception)
1419{
1420	struct vm_exit *vmexit;
1421	int error;
1422
1423	error = vm_inject_exception(vm, vcpuid, exception);
1424	KASSERT(error == 0, ("vm_inject_exception error %d", error));
1425
1426	/*
1427	 * A fault-like exception allows the instruction to be restarted
1428	 * after the exception handler returns.
1429	 *
1430	 * By setting the inst_length to 0 we ensure that the instruction
1431	 * pointer remains at the faulting instruction.
1432	 */
1433	vmexit = vm_exitinfo(vm, vcpuid);
1434	vmexit->inst_length = 0;
1435}
1436
1437void
1438vm_inject_pf(struct vm *vm, int vcpuid, int error_code, uint64_t cr2)
1439{
1440	struct vm_exception pf = {
1441		.vector = IDT_PF,
1442		.error_code_valid = 1,
1443		.error_code = error_code
1444	};
1445	int error;
1446
1447	VCPU_CTR2(vm, vcpuid, "Injecting page fault: error_code %#x, cr2 %#lx",
1448	    error_code, cr2);
1449
1450	error = vm_set_register(vm, vcpuid, VM_REG_GUEST_CR2, cr2);
1451	KASSERT(error == 0, ("vm_set_register(cr2) error %d", error));
1452
1453	vm_inject_fault(vm, vcpuid, &pf);
1454}
1455
1456void
1457vm_inject_gp(struct vm *vm, int vcpuid)
1458{
1459	struct vm_exception gpf = {
1460		.vector = IDT_GP,
1461		.error_code_valid = 1,
1462		.error_code = 0
1463	};
1464
1465	vm_inject_fault(vm, vcpuid, &gpf);
1466}
1467
1468void
1469vm_inject_ud(struct vm *vm, int vcpuid)
1470{
1471	struct vm_exception udf = {
1472		.vector = IDT_UD,
1473		.error_code_valid = 0
1474	};
1475
1476	vm_inject_fault(vm, vcpuid, &udf);
1477}
1478
1479static VMM_STAT(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu");
1480
1481int
1482vm_inject_nmi(struct vm *vm, int vcpuid)
1483{
1484	struct vcpu *vcpu;
1485
1486	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1487		return (EINVAL);
1488
1489	vcpu = &vm->vcpu[vcpuid];
1490
1491	vcpu->nmi_pending = 1;
1492	vcpu_notify_event(vm, vcpuid, false);
1493	return (0);
1494}
1495
1496int
1497vm_nmi_pending(struct vm *vm, int vcpuid)
1498{
1499	struct vcpu *vcpu;
1500
1501	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1502		panic("vm_nmi_pending: invalid vcpuid %d", vcpuid);
1503
1504	vcpu = &vm->vcpu[vcpuid];
1505
1506	return (vcpu->nmi_pending);
1507}
1508
1509void
1510vm_nmi_clear(struct vm *vm, int vcpuid)
1511{
1512	struct vcpu *vcpu;
1513
1514	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1515		panic("vm_nmi_clear: invalid vcpuid %d", vcpuid);
1516
1517	vcpu = &vm->vcpu[vcpuid];
1518
1519	if (vcpu->nmi_pending == 0)
1520		panic("vm_nmi_clear: inconsistent nmi_pending state");
1521
1522	vcpu->nmi_pending = 0;
1523	vmm_stat_incr(vm, vcpuid, VCPU_NMI_COUNT, 1);
1524}
1525
1526static VMM_STAT(VCPU_EXTINT_COUNT, "number of ExtINTs delivered to vcpu");
1527
1528int
1529vm_inject_extint(struct vm *vm, int vcpuid)
1530{
1531	struct vcpu *vcpu;
1532
1533	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1534		return (EINVAL);
1535
1536	vcpu = &vm->vcpu[vcpuid];
1537
1538	vcpu->extint_pending = 1;
1539	vcpu_notify_event(vm, vcpuid, false);
1540	return (0);
1541}
1542
1543int
1544vm_extint_pending(struct vm *vm, int vcpuid)
1545{
1546	struct vcpu *vcpu;
1547
1548	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1549		panic("vm_extint_pending: invalid vcpuid %d", vcpuid);
1550
1551	vcpu = &vm->vcpu[vcpuid];
1552
1553	return (vcpu->extint_pending);
1554}
1555
1556void
1557vm_extint_clear(struct vm *vm, int vcpuid)
1558{
1559	struct vcpu *vcpu;
1560
1561	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1562		panic("vm_extint_clear: invalid vcpuid %d", vcpuid);
1563
1564	vcpu = &vm->vcpu[vcpuid];
1565
1566	if (vcpu->extint_pending == 0)
1567		panic("vm_extint_clear: inconsistent extint_pending state");
1568
1569	vcpu->extint_pending = 0;
1570	vmm_stat_incr(vm, vcpuid, VCPU_EXTINT_COUNT, 1);
1571}
1572
1573int
1574vm_get_capability(struct vm *vm, int vcpu, int type, int *retval)
1575{
1576	if (vcpu < 0 || vcpu >= VM_MAXCPU)
1577		return (EINVAL);
1578
1579	if (type < 0 || type >= VM_CAP_MAX)
1580		return (EINVAL);
1581
1582	return (VMGETCAP(vm->cookie, vcpu, type, retval));
1583}
1584
1585int
1586vm_set_capability(struct vm *vm, int vcpu, int type, int val)
1587{
1588	if (vcpu < 0 || vcpu >= VM_MAXCPU)
1589		return (EINVAL);
1590
1591	if (type < 0 || type >= VM_CAP_MAX)
1592		return (EINVAL);
1593
1594	return (VMSETCAP(vm->cookie, vcpu, type, val));
1595}
1596
1597uint64_t *
1598vm_guest_msrs(struct vm *vm, int cpu)
1599{
1600	return (vm->vcpu[cpu].guest_msrs);
1601}
1602
1603struct vlapic *
1604vm_lapic(struct vm *vm, int cpu)
1605{
1606	return (vm->vcpu[cpu].vlapic);
1607}
1608
1609struct vioapic *
1610vm_ioapic(struct vm *vm)
1611{
1612
1613	return (vm->vioapic);
1614}
1615
1616struct vhpet *
1617vm_hpet(struct vm *vm)
1618{
1619
1620	return (vm->vhpet);
1621}
1622
1623boolean_t
1624vmm_is_pptdev(int bus, int slot, int func)
1625{
1626	int found, i, n;
1627	int b, s, f;
1628	char *val, *cp, *cp2;
1629
1630	/*
1631	 * XXX
1632	 * The length of an environment variable is limited to 128 bytes which
1633	 * puts an upper limit on the number of passthru devices that may be
1634	 * specified using a single environment variable.
1635	 *
1636	 * Work around this by scanning multiple environment variable
1637	 * names instead of a single one - yuck!
1638	 */
1639	const char *names[] = { "pptdevs", "pptdevs2", "pptdevs3", NULL };
1640
1641	/* set pptdevs="1/2/3 4/5/6 7/8/9 10/11/12" */
1642	found = 0;
1643	for (i = 0; names[i] != NULL && !found; i++) {
1644		cp = val = getenv(names[i]);
1645		while (cp != NULL && *cp != '\0') {
1646			if ((cp2 = strchr(cp, ' ')) != NULL)
1647				*cp2 = '\0';
1648
1649			n = sscanf(cp, "%d/%d/%d", &b, &s, &f);
1650			if (n == 3 && bus == b && slot == s && func == f) {
1651				found = 1;
1652				break;
1653			}
1654
1655			if (cp2 != NULL)
1656				*cp2++ = ' ';
1657
1658			cp = cp2;
1659		}
1660		freeenv(val);
1661	}
1662	return (found);
1663}
1664
1665void *
1666vm_iommu_domain(struct vm *vm)
1667{
1668
1669	return (vm->iommu);
1670}
1671
1672int
1673vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state newstate,
1674    bool from_idle)
1675{
1676	int error;
1677	struct vcpu *vcpu;
1678
1679	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1680		panic("vm_set_run_state: invalid vcpuid %d", vcpuid);
1681
1682	vcpu = &vm->vcpu[vcpuid];
1683
1684	vcpu_lock(vcpu);
1685	error = vcpu_set_state_locked(vcpu, newstate, from_idle);
1686	vcpu_unlock(vcpu);
1687
1688	return (error);
1689}
1690
1691enum vcpu_state
1692vcpu_get_state(struct vm *vm, int vcpuid, int *hostcpu)
1693{
1694	struct vcpu *vcpu;
1695	enum vcpu_state state;
1696
1697	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1698		panic("vm_get_run_state: invalid vcpuid %d", vcpuid);
1699
1700	vcpu = &vm->vcpu[vcpuid];
1701
1702	vcpu_lock(vcpu);
1703	state = vcpu->state;
1704	if (hostcpu != NULL)
1705		*hostcpu = vcpu->hostcpu;
1706	vcpu_unlock(vcpu);
1707
1708	return (state);
1709}
1710
1711void
1712vm_activate_cpu(struct vm *vm, int vcpuid)
1713{
1714
1715	KASSERT(vcpuid >= 0 && vcpuid < VM_MAXCPU,
1716	    ("vm_activate_cpu: invalid vcpuid %d", vcpuid));
1717	KASSERT(!CPU_ISSET(vcpuid, &vm->active_cpus),
1718	    ("vm_activate_cpu: vcpuid %d is already active", vcpuid));
1719
1720	VCPU_CTR0(vm, vcpuid, "activated");
1721	CPU_SET_ATOMIC(vcpuid, &vm->active_cpus);
1722}
1723
1724cpuset_t
1725vm_active_cpus(struct vm *vm)
1726{
1727
1728	return (vm->active_cpus);
1729}
1730
1731void *
1732vcpu_stats(struct vm *vm, int vcpuid)
1733{
1734
1735	return (vm->vcpu[vcpuid].stats);
1736}
1737
1738int
1739vm_get_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state *state)
1740{
1741	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1742		return (EINVAL);
1743
1744	*state = vm->vcpu[vcpuid].x2apic_state;
1745
1746	return (0);
1747}
1748
1749int
1750vm_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state)
1751{
1752	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1753		return (EINVAL);
1754
1755	if (state >= X2APIC_STATE_LAST)
1756		return (EINVAL);
1757
1758	vm->vcpu[vcpuid].x2apic_state = state;
1759
1760	vlapic_set_x2apic_state(vm, vcpuid, state);
1761
1762	return (0);
1763}
1764
1765/*
1766 * This function is called to ensure that a vcpu "sees" a pending event
1767 * as soon as possible:
1768 * - If the vcpu thread is sleeping then it is woken up.
1769 * - If the vcpu is running on a different host_cpu then an IPI will be directed
1770 *   to the host_cpu to cause the vcpu to trap into the hypervisor.
1771 */
1772void
1773vcpu_notify_event(struct vm *vm, int vcpuid, bool lapic_intr)
1774{
1775	int hostcpu;
1776	struct vcpu *vcpu;
1777
1778	vcpu = &vm->vcpu[vcpuid];
1779
1780	vcpu_lock(vcpu);
1781	hostcpu = vcpu->hostcpu;
1782	if (vcpu->state == VCPU_RUNNING) {
1783		KASSERT(hostcpu != NOCPU, ("vcpu running on invalid hostcpu"));
1784		if (hostcpu != curcpu) {
1785			if (lapic_intr) {
1786				vlapic_post_intr(vcpu->vlapic, hostcpu,
1787				    vmm_ipinum);
1788			} else {
1789				ipi_cpu(hostcpu, vmm_ipinum);
1790			}
1791		} else {
1792			/*
1793			 * If the 'vcpu' is running on 'curcpu' then it must
1794			 * be sending a notification to itself (e.g. SELF_IPI).
1795			 * The pending event will be picked up when the vcpu
1796			 * transitions back to guest context.
1797			 */
1798		}
1799	} else {
1800		KASSERT(hostcpu == NOCPU, ("vcpu state %d not consistent "
1801		    "with hostcpu %d", vcpu->state, hostcpu));
1802		if (vcpu->state == VCPU_SLEEPING)
1803			wakeup_one(vcpu);
1804	}
1805	vcpu_unlock(vcpu);
1806}
1807
1808struct vmspace *
1809vm_get_vmspace(struct vm *vm)
1810{
1811
1812	return (vm->vmspace);
1813}
1814
1815int
1816vm_apicid2vcpuid(struct vm *vm, int apicid)
1817{
1818	/*
1819	 * XXX apic id is assumed to be numerically identical to vcpu id
1820	 */
1821	return (apicid);
1822}
1823
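/*
 * Execute 'func(vm, vcpuid, arg)' on every vcpu in 'dest' at a point
 * where it is not running guest code. The initiating vcpu publishes the
 * request, kicks the target vcpus, and then participates in the
 * rendezvous itself via vm_handle_rendezvous(). If a rendezvous is
 * already in progress the caller first helps complete it.
 */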
1824void
1825vm_smp_rendezvous(struct vm *vm, int vcpuid, cpuset_t dest,
1826    vm_rendezvous_func_t func, void *arg)
1827{
1828	int i;
1829
1830	/*
1831	 * Enforce that this function is called without any locks
1832	 */
1833	WITNESS_WARN(WARN_PANIC, NULL, "vm_smp_rendezvous");
1834	KASSERT(vcpuid == -1 || (vcpuid >= 0 && vcpuid < VM_MAXCPU),
1835	    ("vm_smp_rendezvous: invalid vcpuid %d", vcpuid));
1836
1837restart:
1838	mtx_lock(&vm->rendezvous_mtx);
1839	if (vm->rendezvous_func != NULL) {
1840		/*
1841		 * If a rendezvous is already in progress then we need to
1842		 * call the rendezvous handler in case this 'vcpuid' is one
1843		 * of the targets of the rendezvous.
1844		 */
1845		RENDEZVOUS_CTR0(vm, vcpuid, "Rendezvous already in progress");
1846		mtx_unlock(&vm->rendezvous_mtx);
1847		vm_handle_rendezvous(vm, vcpuid);
1848		goto restart;
1849	}
1850	KASSERT(vm->rendezvous_func == NULL, ("vm_smp_rendezvous: previous "
1851	    "rendezvous is still in progress"));
1852
1853	RENDEZVOUS_CTR0(vm, vcpuid, "Initiating rendezvous");
1854	vm->rendezvous_req_cpus = dest;
1855	CPU_ZERO(&vm->rendezvous_done_cpus);
1856	vm->rendezvous_arg = arg;
1857	vm_set_rendezvous_func(vm, func);
1858	mtx_unlock(&vm->rendezvous_mtx);
1859
1860	/*
1861	 * Wake up any sleeping vcpus and trigger a VM-exit in any running
1862	 * vcpus so they handle the rendezvous as soon as possible.
1863	 */
1864	for (i = 0; i < VM_MAXCPU; i++) {
1865		if (CPU_ISSET(i, &dest))
1866			vcpu_notify_event(vm, i, false);
1867	}
1868
1869	vm_handle_rendezvous(vm, vcpuid);
1870}
1871
1872struct vatpic *
1873vm_atpic(struct vm *vm)
1874{
1875	return (vm->vatpic);
1876}
1877
1878struct vatpit *
1879vm_atpit(struct vm *vm)
1880{
1881	return (vm->vatpit);
1882}
1883
1884enum vm_reg_name
1885vm_segment_name(int seg)
1886{
1887	static enum vm_reg_name seg_names[] = {
1888		VM_REG_GUEST_ES,
1889		VM_REG_GUEST_CS,
1890		VM_REG_GUEST_SS,
1891		VM_REG_GUEST_DS,
1892		VM_REG_GUEST_FS,
1893		VM_REG_GUEST_GS
1894	};
1895
1896	KASSERT(seg >= 0 && seg < nitems(seg_names),
1897	    ("%s: invalid segment encoding %d", __func__, seg));
1898	return (seg_names[seg]);
1899}
1900