vmm.c revision 266393
1215976Sjmallett/*-
2215976Sjmallett * Copyright (c) 2011 NetApp, Inc.
3215976Sjmallett * All rights reserved.
4215976Sjmallett *
5215976Sjmallett * Redistribution and use in source and binary forms, with or without
6215976Sjmallett * modification, are permitted provided that the following conditions
7215976Sjmallett * are met:
8215976Sjmallett * 1. Redistributions of source code must retain the above copyright
9215976Sjmallett *    notice, this list of conditions and the following disclaimer.
10215976Sjmallett * 2. Redistributions in binary form must reproduce the above copyright
11215976Sjmallett *    notice, this list of conditions and the following disclaimer in the
12215976Sjmallett *    documentation and/or other materials provided with the distribution.
13215976Sjmallett *
14215976Sjmallett * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
15215976Sjmallett * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16215976Sjmallett * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17215976Sjmallett * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
18215976Sjmallett * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19215976Sjmallett * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20215976Sjmallett * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21215976Sjmallett * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22215976Sjmallett * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23215976Sjmallett * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24215976Sjmallett * SUCH DAMAGE.
25215976Sjmallett *
26215976Sjmallett * $FreeBSD: stable/10/sys/amd64/vmm/vmm.c 266393 2014-05-18 04:33:24Z jhb $
27215976Sjmallett */
28215976Sjmallett
29215976Sjmallett#include <sys/cdefs.h>
30215976Sjmallett__FBSDID("$FreeBSD: stable/10/sys/amd64/vmm/vmm.c 266393 2014-05-18 04:33:24Z jhb $");
31215976Sjmallett
32215976Sjmallett#include <sys/param.h>
33215976Sjmallett#include <sys/systm.h>
34215976Sjmallett#include <sys/kernel.h>
35215976Sjmallett#include <sys/module.h>
36215976Sjmallett#include <sys/sysctl.h>
37215976Sjmallett#include <sys/malloc.h>
38215976Sjmallett#include <sys/pcpu.h>
39215976Sjmallett#include <sys/lock.h>
40215976Sjmallett#include <sys/mutex.h>
41215976Sjmallett#include <sys/proc.h>
42215976Sjmallett#include <sys/rwlock.h>
43215976Sjmallett#include <sys/sched.h>
44215976Sjmallett#include <sys/smp.h>
45215976Sjmallett#include <sys/systm.h>
46215976Sjmallett
47215976Sjmallett#include <vm/vm.h>
48215976Sjmallett#include <vm/vm_object.h>
49215976Sjmallett#include <vm/vm_page.h>
50215976Sjmallett#include <vm/pmap.h>
51215976Sjmallett#include <vm/vm_map.h>
52215976Sjmallett#include <vm/vm_extern.h>
53215976Sjmallett#include <vm/vm_param.h>
54215976Sjmallett
55215976Sjmallett#include <machine/cpu.h>
56215976Sjmallett#include <machine/vm.h>
57215976Sjmallett#include <machine/pcb.h>
58215976Sjmallett#include <machine/smp.h>
59215976Sjmallett#include <x86/psl.h>
60215976Sjmallett#include <x86/apicreg.h>
61215976Sjmallett#include <machine/vmparam.h>
62215976Sjmallett
63215976Sjmallett#include <machine/vmm.h>
64215976Sjmallett#include <machine/vmm_dev.h>
65215976Sjmallett
66215976Sjmallett#include "vmm_ktr.h"
67215976Sjmallett#include "vmm_host.h"
68215976Sjmallett#include "vmm_mem.h"
69215976Sjmallett#include "vmm_util.h"
70215976Sjmallett#include "vhpet.h"
71215976Sjmallett#include "vioapic.h"
72215976Sjmallett#include "vlapic.h"
73215976Sjmallett#include "vmm_msr.h"
74215976Sjmallett#include "vmm_ipi.h"
75215976Sjmallett#include "vmm_stat.h"
76215976Sjmallett#include "vmm_lapic.h"
77215976Sjmallett
78215976Sjmallett#include "io/ppt.h"
79215976Sjmallett#include "io/iommu.h"
80215976Sjmallett
81215976Sjmallettstruct vlapic;
82215976Sjmallett
83215976Sjmallettstruct vcpu {
84215976Sjmallett	int		flags;
85215976Sjmallett	enum vcpu_state	state;
86215976Sjmallett	struct mtx	mtx;
87215976Sjmallett	int		hostcpu;	/* host cpuid this vcpu last ran on */
88215976Sjmallett	uint64_t	guest_msrs[VMM_MSR_NUM];
89215976Sjmallett	struct vlapic	*vlapic;
90215976Sjmallett	int		 vcpuid;
91215976Sjmallett	struct savefpu	*guestfpu;	/* guest fpu state */
92215976Sjmallett	void		*stats;
93215976Sjmallett	struct vm_exit	exitinfo;
94215976Sjmallett	enum x2apic_state x2apic_state;
95215976Sjmallett	int		nmi_pending;
96215976Sjmallett};
97215976Sjmallett
/*
 * Thin wrappers around the per-vcpu spin mutex.
 */
#define	vcpu_lock_init(v)	mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN)
#define	vcpu_lock(v)		mtx_lock_spin(&((v)->mtx))
#define	vcpu_unlock(v)		mtx_unlock_spin(&((v)->mtx))
#define	vcpu_assert_locked(v)	mtx_assert(&((v)->mtx), MA_OWNED)
102215976Sjmallett
103215976Sjmallettstruct mem_seg {
104215976Sjmallett	vm_paddr_t	gpa;
105215976Sjmallett	size_t		len;
106215976Sjmallett	boolean_t	wired;
107215976Sjmallett	vm_object_t	object;
108215976Sjmallett};
109215976Sjmallett#define	VM_MAX_MEMORY_SEGMENTS	2
110215976Sjmallett
111215976Sjmallettstruct vm {
112215976Sjmallett	void		*cookie;	/* processor-specific data */
113215976Sjmallett	void		*iommu;		/* iommu-specific data */
114215976Sjmallett	struct vhpet	*vhpet;		/* virtual HPET */
115215976Sjmallett	struct vioapic	*vioapic;	/* virtual ioapic */
116215976Sjmallett	struct vmspace	*vmspace;	/* guest's address space */
117215976Sjmallett	struct vcpu	vcpu[VM_MAXCPU];
118215976Sjmallett	int		num_mem_segs;
119215976Sjmallett	struct mem_seg	mem_segs[VM_MAX_MEMORY_SEGMENTS];
120215976Sjmallett	char		name[VM_MAX_NAMELEN];
121215976Sjmallett
122215976Sjmallett	/*
123215976Sjmallett	 * Set of active vcpus.
124215976Sjmallett	 * An active vcpu is one that has been started implicitly (BSP) or
125215976Sjmallett	 * explicitly (AP) by sending it a startup ipi.
126215976Sjmallett	 */
127215976Sjmallett	volatile cpuset_t active_cpus;
128215976Sjmallett
129215976Sjmallett	struct mtx	rendezvous_mtx;
130215976Sjmallett	cpuset_t	rendezvous_req_cpus;
131215976Sjmallett	cpuset_t	rendezvous_done_cpus;
132215976Sjmallett	void		*rendezvous_arg;
133215976Sjmallett	vm_rendezvous_func_t rendezvous_func;
134215976Sjmallett};
135215976Sjmallett
/* Non-zero once vmm_init() has completed successfully. */
static int vmm_initialized;

/*
 * Processor-specific backend chosen by vmm_init().  Every dispatch macro
 * below evaluates to a fixed fallback value while 'ops' is still NULL.
 */
static struct vmm_ops *ops;

/* Module-wide hooks. */
#define	VMM_INIT(num)	(ops != NULL ? (*ops->init)(num) : 0)
#define	VMM_CLEANUP()	(ops != NULL ? (*ops->cleanup)() : 0)
#define	VMM_RESUME()	(ops != NULL ? (*ops->resume)() : 0)

/* Per-VM hooks. */
#define	VMINIT(vm, pmap)						\
	(ops != NULL ? (*ops->vminit)(vm, pmap) : NULL)
#define	VMRUN(vmi, vcpu, rip, pmap, rptr)				\
	(ops != NULL ? (*ops->vmrun)(vmi, vcpu, rip, pmap, rptr) : ENXIO)
#define	VMCLEANUP(vmi)							\
	(ops != NULL ? (*ops->vmcleanup)(vmi) : NULL)
#define	VMSPACE_ALLOC(min, max)						\
	(ops != NULL ? (*ops->vmspace_alloc)(min, max) : NULL)
#define	VMSPACE_FREE(vmspace)						\
	(ops != NULL ? (*ops->vmspace_free)(vmspace) : ENXIO)

/* Per-vcpu register, descriptor, event-injection and capability hooks. */
#define	VMGETREG(vmi, vcpu, num, retval)				\
	(ops != NULL ? (*ops->vmgetreg)(vmi, vcpu, num, retval) : ENXIO)
#define	VMSETREG(vmi, vcpu, num, val)					\
	(ops != NULL ? (*ops->vmsetreg)(vmi, vcpu, num, val) : ENXIO)
#define	VMGETDESC(vmi, vcpu, num, desc)					\
	(ops != NULL ? (*ops->vmgetdesc)(vmi, vcpu, num, desc) : ENXIO)
#define	VMSETDESC(vmi, vcpu, num, desc)					\
	(ops != NULL ? (*ops->vmsetdesc)(vmi, vcpu, num, desc) : ENXIO)
#define	VMINJECT(vmi, vcpu, type, vec, ec, ecv)				\
	(ops != NULL ? (*ops->vminject)(vmi, vcpu, type, vec, ec, ecv) : ENXIO)
#define	VMGETCAP(vmi, vcpu, num, retval)				\
	(ops != NULL ? (*ops->vmgetcap)(vmi, vcpu, num, retval) : ENXIO)
#define	VMSETCAP(vmi, vcpu, num, val)					\
	(ops != NULL ? (*ops->vmsetcap)(vmi, vcpu, num, val) : ENXIO)

/* Virtual local APIC hooks. */
#define	VLAPIC_INIT(vmi, vcpu)						\
	(ops != NULL ? (*ops->vlapic_init)(vmi, vcpu) : NULL)
#define	VLAPIC_CLEANUP(vmi, vlapic)					\
	(ops != NULL ? (*ops->vlapic_cleanup)(vmi, vlapic) : NULL)
169215976Sjmallett
/* Setting CR0.TS turns FPU trapping on; clts() clears it again. */
#define	fpu_start_emulating()	load_cr0(rcr0() | CR0_TS)
#define	fpu_stop_emulating()	clts()
172215976Sjmallett
173215976Sjmallettstatic MALLOC_DEFINE(M_VM, "vm", "vm");
174215976SjmallettCTASSERT(VMM_MSR_NUM <= 64);	/* msr_mask can keep track of up to 64 msrs */
175215976Sjmallett
176215976Sjmallett/* statistics */
177215976Sjmallettstatic VMM_STAT(VCPU_TOTAL_RUNTIME, "vcpu total runtime");
178215976Sjmallett
179215976SjmallettSYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW, NULL, NULL);
180215976Sjmallett
181215976Sjmallettstatic int vmm_ipinum;
182215976SjmallettSYSCTL_INT(_hw_vmm, OID_AUTO, ipinum, CTLFLAG_RD, &vmm_ipinum, 0,
183215976Sjmallett    "IPI vector used for vcpu notifications");
184215976Sjmallett
185215976Sjmallettstatic void vm_deactivate_cpu(struct vm *vm, int vcpuid);
186215976Sjmallett
187215976Sjmallettstatic void
188215976Sjmallettvcpu_cleanup(struct vm *vm, int i)
189215976Sjmallett{
190215976Sjmallett	struct vcpu *vcpu = &vm->vcpu[i];
191215976Sjmallett
192215976Sjmallett	VLAPIC_CLEANUP(vm->cookie, vcpu->vlapic);
193215976Sjmallett	vmm_stat_free(vcpu->stats);
194215976Sjmallett	fpu_save_area_free(vcpu->guestfpu);
195215976Sjmallett}
196215976Sjmallett
197215976Sjmallettstatic void
198215976Sjmallettvcpu_init(struct vm *vm, uint32_t vcpu_id)
199215976Sjmallett{
200215976Sjmallett	struct vcpu *vcpu;
201215976Sjmallett
202215976Sjmallett	vcpu = &vm->vcpu[vcpu_id];
203215976Sjmallett
204215976Sjmallett	vcpu_lock_init(vcpu);
205215976Sjmallett	vcpu->hostcpu = NOCPU;
206215976Sjmallett	vcpu->vcpuid = vcpu_id;
207215976Sjmallett	vcpu->vlapic = VLAPIC_INIT(vm->cookie, vcpu_id);
208215976Sjmallett	vm_set_x2apic_state(vm, vcpu_id, X2APIC_ENABLED);
209215976Sjmallett	vcpu->guestfpu = fpu_save_area_alloc();
210215976Sjmallett	fpu_save_area_reset(vcpu->guestfpu);
211215976Sjmallett	vcpu->stats = vmm_stat_alloc();
212215976Sjmallett}
213215976Sjmallett
214215976Sjmallettstruct vm_exit *
215215976Sjmallettvm_exitinfo(struct vm *vm, int cpuid)
216215976Sjmallett{
217215976Sjmallett	struct vcpu *vcpu;
218215976Sjmallett
219215976Sjmallett	if (cpuid < 0 || cpuid >= VM_MAXCPU)
220215976Sjmallett		panic("vm_exitinfo: invalid cpuid %d", cpuid);
221215976Sjmallett
222215976Sjmallett	vcpu = &vm->vcpu[cpuid];
223215976Sjmallett
224215976Sjmallett	return (&vcpu->exitinfo);
225215976Sjmallett}
226215976Sjmallett
/*
 * Resume callback installed in 'vmm_resume_p' by vmm_init(); forwards to the
 * processor-specific backend.
 */
static void
vmm_resume(void)
{

	VMM_RESUME();
}
232215976Sjmallett
233215976Sjmallettstatic int
234215976Sjmallettvmm_init(void)
235215976Sjmallett{
236215976Sjmallett	int error;
237215976Sjmallett
238215976Sjmallett	vmm_host_state_init();
239215976Sjmallett
240215976Sjmallett	vmm_ipinum = vmm_ipi_alloc();
241215976Sjmallett	if (vmm_ipinum == 0)
242215976Sjmallett		vmm_ipinum = IPI_AST;
243215976Sjmallett
244215976Sjmallett	error = vmm_mem_init();
245215976Sjmallett	if (error)
246215976Sjmallett		return (error);
247215976Sjmallett
248215976Sjmallett	if (vmm_is_intel())
249215976Sjmallett		ops = &vmm_ops_intel;
250215976Sjmallett	else if (vmm_is_amd())
251215976Sjmallett		ops = &vmm_ops_amd;
252215976Sjmallett	else
253215976Sjmallett		return (ENXIO);
254215976Sjmallett
255215976Sjmallett	vmm_msr_init();
256215976Sjmallett	vmm_resume_p = vmm_resume;
257215976Sjmallett
258215976Sjmallett	return (VMM_INIT(vmm_ipinum));
259215976Sjmallett}
260215976Sjmallett
261215976Sjmallettstatic int
262215976Sjmallettvmm_handler(module_t mod, int what, void *arg)
263215976Sjmallett{
264215976Sjmallett	int error;
265215976Sjmallett
266215976Sjmallett	switch (what) {
267215976Sjmallett	case MOD_LOAD:
268215976Sjmallett		vmmdev_init();
269215976Sjmallett		iommu_init();
270215976Sjmallett		error = vmm_init();
271215976Sjmallett		if (error == 0)
272215976Sjmallett			vmm_initialized = 1;
273215976Sjmallett		break;
274215976Sjmallett	case MOD_UNLOAD:
275215976Sjmallett		error = vmmdev_cleanup();
276215976Sjmallett		if (error == 0) {
277215976Sjmallett			vmm_resume_p = NULL;
278215976Sjmallett			iommu_cleanup();
279215976Sjmallett			if (vmm_ipinum != IPI_AST)
280215976Sjmallett				vmm_ipi_free(vmm_ipinum);
281215976Sjmallett			error = VMM_CLEANUP();
282215976Sjmallett			/*
283215976Sjmallett			 * Something bad happened - prevent new
284215976Sjmallett			 * VMs from being created
285215976Sjmallett			 */
286215976Sjmallett			if (error)
287215976Sjmallett				vmm_initialized = 0;
288215976Sjmallett		}
289215976Sjmallett		break;
290215976Sjmallett	default:
291215976Sjmallett		error = 0;
292215976Sjmallett		break;
293215976Sjmallett	}
294215976Sjmallett	return (error);
295215976Sjmallett}
296215976Sjmallett
297215976Sjmallettstatic moduledata_t vmm_kmod = {
298215976Sjmallett	"vmm",
299215976Sjmallett	vmm_handler,
300215976Sjmallett	NULL
301215976Sjmallett};
302215976Sjmallett
303215976Sjmallett/*
304215976Sjmallett * vmm initialization has the following dependencies:
305215976Sjmallett *
306215976Sjmallett * - iommu initialization must happen after the pci passthru driver has had
307215976Sjmallett *   a chance to attach to any passthru devices (after SI_SUB_CONFIGURE).
308215976Sjmallett *
309215976Sjmallett * - VT-x initialization requires smp_rendezvous() and therefore must happen
310215976Sjmallett *   after SMP is fully functional (after SI_SUB_SMP).
311215976Sjmallett */
312215976SjmallettDECLARE_MODULE(vmm, vmm_kmod, SI_SUB_SMP + 1, SI_ORDER_ANY);
313215976SjmallettMODULE_VERSION(vmm, 1);
314215976Sjmallett
315215976Sjmallettint
316215976Sjmallettvm_create(const char *name, struct vm **retvm)
317215976Sjmallett{
318215976Sjmallett	int i;
319215976Sjmallett	struct vm *vm;
320215976Sjmallett	struct vmspace *vmspace;
321215976Sjmallett
322215976Sjmallett	const int BSP = 0;
323215976Sjmallett
324215976Sjmallett	/*
325215976Sjmallett	 * If vmm.ko could not be successfully initialized then don't attempt
326215976Sjmallett	 * to create the virtual machine.
327215976Sjmallett	 */
328215976Sjmallett	if (!vmm_initialized)
329215976Sjmallett		return (ENXIO);
330215976Sjmallett
331215976Sjmallett	if (name == NULL || strlen(name) >= VM_MAX_NAMELEN)
332215976Sjmallett		return (EINVAL);
333215976Sjmallett
334215976Sjmallett	vmspace = VMSPACE_ALLOC(VM_MIN_ADDRESS, VM_MAXUSER_ADDRESS);
335215976Sjmallett	if (vmspace == NULL)
336215976Sjmallett		return (ENOMEM);
337215976Sjmallett
338215976Sjmallett	vm = malloc(sizeof(struct vm), M_VM, M_WAITOK | M_ZERO);
339215976Sjmallett	strcpy(vm->name, name);
340215976Sjmallett	vm->vmspace = vmspace;
341215976Sjmallett	mtx_init(&vm->rendezvous_mtx, "vm rendezvous lock", 0, MTX_DEF);
342215976Sjmallett	vm->cookie = VMINIT(vm, vmspace_pmap(vmspace));
343215976Sjmallett	vm->vioapic = vioapic_init(vm);
344215976Sjmallett	vm->vhpet = vhpet_init(vm);
345215976Sjmallett
346215976Sjmallett	for (i = 0; i < VM_MAXCPU; i++) {
347215976Sjmallett		vcpu_init(vm, i);
348215976Sjmallett		guest_msrs_init(vm, i);
349215976Sjmallett	}
350215976Sjmallett
351215976Sjmallett	vm_activate_cpu(vm, BSP);
352215976Sjmallett
353215976Sjmallett	*retvm = vm;
354215976Sjmallett	return (0);
355215976Sjmallett}
356215976Sjmallett
357215976Sjmallettstatic void
358215976Sjmallettvm_free_mem_seg(struct vm *vm, struct mem_seg *seg)
359215976Sjmallett{
360215976Sjmallett
361215976Sjmallett	if (seg->object != NULL)
362215976Sjmallett		vmm_mem_free(vm->vmspace, seg->gpa, seg->len);
363
364	bzero(seg, sizeof(*seg));
365}
366
367void
368vm_destroy(struct vm *vm)
369{
370	int i;
371
372	ppt_unassign_all(vm);
373
374	if (vm->iommu != NULL)
375		iommu_destroy_domain(vm->iommu);
376
377	vhpet_cleanup(vm->vhpet);
378	vioapic_cleanup(vm->vioapic);
379
380	for (i = 0; i < vm->num_mem_segs; i++)
381		vm_free_mem_seg(vm, &vm->mem_segs[i]);
382
383	vm->num_mem_segs = 0;
384
385	for (i = 0; i < VM_MAXCPU; i++)
386		vcpu_cleanup(vm, i);
387
388	VMSPACE_FREE(vm->vmspace);
389
390	VMCLEANUP(vm->cookie);
391
392	free(vm, M_VM);
393}
394
395const char *
396vm_name(struct vm *vm)
397{
398	return (vm->name);
399}
400
401int
402vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
403{
404	vm_object_t obj;
405
406	if ((obj = vmm_mmio_alloc(vm->vmspace, gpa, len, hpa)) == NULL)
407		return (ENOMEM);
408	else
409		return (0);
410}
411
412int
413vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len)
414{
415
416	vmm_mmio_free(vm->vmspace, gpa, len);
417	return (0);
418}
419
420boolean_t
421vm_mem_allocated(struct vm *vm, vm_paddr_t gpa)
422{
423	int i;
424	vm_paddr_t gpabase, gpalimit;
425
426	for (i = 0; i < vm->num_mem_segs; i++) {
427		gpabase = vm->mem_segs[i].gpa;
428		gpalimit = gpabase + vm->mem_segs[i].len;
429		if (gpa >= gpabase && gpa < gpalimit)
430			return (TRUE);		/* 'gpa' is regular memory */
431	}
432
433	if (ppt_is_mmio(vm, gpa))
434		return (TRUE);			/* 'gpa' is pci passthru mmio */
435
436	return (FALSE);
437}
438
439int
440vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len)
441{
442	int available, allocated;
443	struct mem_seg *seg;
444	vm_object_t object;
445	vm_paddr_t g;
446
447	if ((gpa & PAGE_MASK) || (len & PAGE_MASK) || len == 0)
448		return (EINVAL);
449
450	available = allocated = 0;
451	g = gpa;
452	while (g < gpa + len) {
453		if (vm_mem_allocated(vm, g))
454			allocated++;
455		else
456			available++;
457
458		g += PAGE_SIZE;
459	}
460
461	/*
462	 * If there are some allocated and some available pages in the address
463	 * range then it is an error.
464	 */
465	if (allocated && available)
466		return (EINVAL);
467
468	/*
469	 * If the entire address range being requested has already been
470	 * allocated then there isn't anything more to do.
471	 */
472	if (allocated && available == 0)
473		return (0);
474
475	if (vm->num_mem_segs >= VM_MAX_MEMORY_SEGMENTS)
476		return (E2BIG);
477
478	seg = &vm->mem_segs[vm->num_mem_segs];
479
480	if ((object = vmm_mem_alloc(vm->vmspace, gpa, len)) == NULL)
481		return (ENOMEM);
482
483	seg->gpa = gpa;
484	seg->len = len;
485	seg->object = object;
486	seg->wired = FALSE;
487
488	vm->num_mem_segs++;
489
490	return (0);
491}
492
493static void
494vm_gpa_unwire(struct vm *vm)
495{
496	int i, rv;
497	struct mem_seg *seg;
498
499	for (i = 0; i < vm->num_mem_segs; i++) {
500		seg = &vm->mem_segs[i];
501		if (!seg->wired)
502			continue;
503
504		rv = vm_map_unwire(&vm->vmspace->vm_map,
505				   seg->gpa, seg->gpa + seg->len,
506				   VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
507		KASSERT(rv == KERN_SUCCESS, ("vm(%s) memory segment "
508		    "%#lx/%ld could not be unwired: %d",
509		    vm_name(vm), seg->gpa, seg->len, rv));
510
511		seg->wired = FALSE;
512	}
513}
514
515static int
516vm_gpa_wire(struct vm *vm)
517{
518	int i, rv;
519	struct mem_seg *seg;
520
521	for (i = 0; i < vm->num_mem_segs; i++) {
522		seg = &vm->mem_segs[i];
523		if (seg->wired)
524			continue;
525
526		/* XXX rlimits? */
527		rv = vm_map_wire(&vm->vmspace->vm_map,
528				 seg->gpa, seg->gpa + seg->len,
529				 VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
530		if (rv != KERN_SUCCESS)
531			break;
532
533		seg->wired = TRUE;
534	}
535
536	if (i < vm->num_mem_segs) {
537		/*
538		 * Undo the wiring before returning an error.
539		 */
540		vm_gpa_unwire(vm);
541		return (EAGAIN);
542	}
543
544	return (0);
545}
546
547static void
548vm_iommu_modify(struct vm *vm, boolean_t map)
549{
550	int i, sz;
551	vm_paddr_t gpa, hpa;
552	struct mem_seg *seg;
553	void *vp, *cookie, *host_domain;
554
555	sz = PAGE_SIZE;
556	host_domain = iommu_host_domain();
557
558	for (i = 0; i < vm->num_mem_segs; i++) {
559		seg = &vm->mem_segs[i];
560		KASSERT(seg->wired, ("vm(%s) memory segment %#lx/%ld not wired",
561		    vm_name(vm), seg->gpa, seg->len));
562
563		gpa = seg->gpa;
564		while (gpa < seg->gpa + seg->len) {
565			vp = vm_gpa_hold(vm, gpa, PAGE_SIZE, VM_PROT_WRITE,
566					 &cookie);
567			KASSERT(vp != NULL, ("vm(%s) could not map gpa %#lx",
568			    vm_name(vm), gpa));
569
570			vm_gpa_release(cookie);
571
572			hpa = DMAP_TO_PHYS((uintptr_t)vp);
573			if (map) {
574				iommu_create_mapping(vm->iommu, gpa, hpa, sz);
575				iommu_remove_mapping(host_domain, hpa, sz);
576			} else {
577				iommu_remove_mapping(vm->iommu, gpa, sz);
578				iommu_create_mapping(host_domain, hpa, hpa, sz);
579			}
580
581			gpa += PAGE_SIZE;
582		}
583	}
584
585	/*
586	 * Invalidate the cached translations associated with the domain
587	 * from which pages were removed.
588	 */
589	if (map)
590		iommu_invalidate_tlb(host_domain);
591	else
592		iommu_invalidate_tlb(vm->iommu);
593}
594
/* Direction-specific shorthands for vm_iommu_modify(). */
#define	vm_iommu_unmap(vm)	vm_iommu_modify((vm), FALSE)
#define	vm_iommu_map(vm)	vm_iommu_modify((vm), TRUE)
597
/*
 * Detach the passthru device bus/slot/func from 'vm'.  Once the last
 * passthru device is gone the guest memory is removed from the VM's iommu
 * domain and unwired.
 *
 * Returns the error from ppt_unassign_device(), or 0.
 */
int
vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func)
{
	int rc;

	rc = ppt_unassign_device(vm, bus, slot, func);
	if (rc != 0)
		return (rc);

	if (ppt_num_devices(vm) != 0)
		return (0);

	vm_iommu_unmap(vm);
	vm_gpa_unwire(vm);
	return (0);
}
613
614int
615vm_assign_pptdev(struct vm *vm, int bus, int slot, int func)
616{
617	int error;
618	vm_paddr_t maxaddr;
619
620	/*
621	 * Virtual machines with pci passthru devices get special treatment:
622	 * - the guest physical memory is wired
623	 * - the iommu is programmed to do the 'gpa' to 'hpa' translation
624	 *
625	 * We need to do this before the first pci passthru device is attached.
626	 */
627	if (ppt_num_devices(vm) == 0) {
628		KASSERT(vm->iommu == NULL,
629		    ("vm_assign_pptdev: iommu must be NULL"));
630		maxaddr = vmm_mem_maxaddr();
631		vm->iommu = iommu_create_domain(maxaddr);
632
633		error = vm_gpa_wire(vm);
634		if (error)
635			return (error);
636
637		vm_iommu_map(vm);
638	}
639
640	error = ppt_assign_device(vm, bus, slot, func);
641	return (error);
642}
643
644void *
645vm_gpa_hold(struct vm *vm, vm_paddr_t gpa, size_t len, int reqprot,
646	    void **cookie)
647{
648	int count, pageoff;
649	vm_page_t m;
650
651	pageoff = gpa & PAGE_MASK;
652	if (len > PAGE_SIZE - pageoff)
653		panic("vm_gpa_hold: invalid gpa/len: 0x%016lx/%lu", gpa, len);
654
655	count = vm_fault_quick_hold_pages(&vm->vmspace->vm_map,
656	    trunc_page(gpa), PAGE_SIZE, reqprot, &m, 1);
657
658	if (count == 1) {
659		*cookie = m;
660		return ((void *)(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)) + pageoff));
661	} else {
662		*cookie = NULL;
663		return (NULL);
664	}
665}
666
667void
668vm_gpa_release(void *cookie)
669{
670	vm_page_t m = cookie;
671
672	vm_page_lock(m);
673	vm_page_unhold(m);
674	vm_page_unlock(m);
675}
676
677int
678vm_gpabase2memseg(struct vm *vm, vm_paddr_t gpabase,
679		  struct vm_memory_segment *seg)
680{
681	int i;
682
683	for (i = 0; i < vm->num_mem_segs; i++) {
684		if (gpabase == vm->mem_segs[i].gpa) {
685			seg->gpa = vm->mem_segs[i].gpa;
686			seg->len = vm->mem_segs[i].len;
687			seg->wired = vm->mem_segs[i].wired;
688			return (0);
689		}
690	}
691	return (-1);
692}
693
694int
695vm_get_memobj(struct vm *vm, vm_paddr_t gpa, size_t len,
696	      vm_offset_t *offset, struct vm_object **object)
697{
698	int i;
699	size_t seg_len;
700	vm_paddr_t seg_gpa;
701	vm_object_t seg_obj;
702
703	for (i = 0; i < vm->num_mem_segs; i++) {
704		if ((seg_obj = vm->mem_segs[i].object) == NULL)
705			continue;
706
707		seg_gpa = vm->mem_segs[i].gpa;
708		seg_len = vm->mem_segs[i].len;
709
710		if (gpa >= seg_gpa && gpa < seg_gpa + seg_len) {
711			*offset = gpa - seg_gpa;
712			*object = seg_obj;
713			vm_object_reference(seg_obj);
714			return (0);
715		}
716	}
717
718	return (EINVAL);
719}
720
721int
722vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval)
723{
724
725	if (vcpu < 0 || vcpu >= VM_MAXCPU)
726		return (EINVAL);
727
728	if (reg >= VM_REG_LAST)
729		return (EINVAL);
730
731	return (VMGETREG(vm->cookie, vcpu, reg, retval));
732}
733
734int
735vm_set_register(struct vm *vm, int vcpu, int reg, uint64_t val)
736{
737
738	if (vcpu < 0 || vcpu >= VM_MAXCPU)
739		return (EINVAL);
740
741	if (reg >= VM_REG_LAST)
742		return (EINVAL);
743
744	return (VMSETREG(vm->cookie, vcpu, reg, val));
745}
746
747static boolean_t
748is_descriptor_table(int reg)
749{
750
751	switch (reg) {
752	case VM_REG_GUEST_IDTR:
753	case VM_REG_GUEST_GDTR:
754		return (TRUE);
755	default:
756		return (FALSE);
757	}
758}
759
760static boolean_t
761is_segment_register(int reg)
762{
763
764	switch (reg) {
765	case VM_REG_GUEST_ES:
766	case VM_REG_GUEST_CS:
767	case VM_REG_GUEST_SS:
768	case VM_REG_GUEST_DS:
769	case VM_REG_GUEST_FS:
770	case VM_REG_GUEST_GS:
771	case VM_REG_GUEST_TR:
772	case VM_REG_GUEST_LDTR:
773		return (TRUE);
774	default:
775		return (FALSE);
776	}
777}
778
779int
780vm_get_seg_desc(struct vm *vm, int vcpu, int reg,
781		struct seg_desc *desc)
782{
783
784	if (vcpu < 0 || vcpu >= VM_MAXCPU)
785		return (EINVAL);
786
787	if (!is_segment_register(reg) && !is_descriptor_table(reg))
788		return (EINVAL);
789
790	return (VMGETDESC(vm->cookie, vcpu, reg, desc));
791}
792
793int
794vm_set_seg_desc(struct vm *vm, int vcpu, int reg,
795		struct seg_desc *desc)
796{
797	if (vcpu < 0 || vcpu >= VM_MAXCPU)
798		return (EINVAL);
799
800	if (!is_segment_register(reg) && !is_descriptor_table(reg))
801		return (EINVAL);
802
803	return (VMSETDESC(vm->cookie, vcpu, reg, desc));
804}
805
806static void
807restore_guest_fpustate(struct vcpu *vcpu)
808{
809
810	/* flush host state to the pcb */
811	fpuexit(curthread);
812
813	/* restore guest FPU state */
814	fpu_stop_emulating();
815	fpurestore(vcpu->guestfpu);
816
817	/*
818	 * The FPU is now "dirty" with the guest's state so turn on emulation
819	 * to trap any access to the FPU by the host.
820	 */
821	fpu_start_emulating();
822}
823
824static void
825save_guest_fpustate(struct vcpu *vcpu)
826{
827
828	if ((rcr0() & CR0_TS) == 0)
829		panic("fpu emulation not enabled in host!");
830
831	/* save guest FPU state */
832	fpu_stop_emulating();
833	fpusave(vcpu->guestfpu);
834	fpu_start_emulating();
835}
836
837static VMM_STAT(VCPU_IDLE_TICKS, "number of ticks vcpu was idle");
838
839static int
840vcpu_set_state_locked(struct vcpu *vcpu, enum vcpu_state newstate,
841    bool from_idle)
842{
843	int error;
844
845	vcpu_assert_locked(vcpu);
846
847	/*
848	 * State transitions from the vmmdev_ioctl() must always begin from
849	 * the VCPU_IDLE state. This guarantees that there is only a single
850	 * ioctl() operating on a vcpu at any point.
851	 */
852	if (from_idle) {
853		while (vcpu->state != VCPU_IDLE)
854			msleep_spin(&vcpu->state, &vcpu->mtx, "vmstat", hz);
855	} else {
856		KASSERT(vcpu->state != VCPU_IDLE, ("invalid transition from "
857		    "vcpu idle state"));
858	}
859
860	if (vcpu->state == VCPU_RUNNING) {
861		KASSERT(vcpu->hostcpu == curcpu, ("curcpu %d and hostcpu %d "
862		    "mismatch for running vcpu", curcpu, vcpu->hostcpu));
863	} else {
864		KASSERT(vcpu->hostcpu == NOCPU, ("Invalid hostcpu %d for a "
865		    "vcpu that is not running", vcpu->hostcpu));
866	}
867
868	/*
869	 * The following state transitions are allowed:
870	 * IDLE -> FROZEN -> IDLE
871	 * FROZEN -> RUNNING -> FROZEN
872	 * FROZEN -> SLEEPING -> FROZEN
873	 */
874	switch (vcpu->state) {
875	case VCPU_IDLE:
876	case VCPU_RUNNING:
877	case VCPU_SLEEPING:
878		error = (newstate != VCPU_FROZEN);
879		break;
880	case VCPU_FROZEN:
881		error = (newstate == VCPU_FROZEN);
882		break;
883	default:
884		error = 1;
885		break;
886	}
887
888	if (error)
889		return (EBUSY);
890
891	vcpu->state = newstate;
892	if (newstate == VCPU_RUNNING)
893		vcpu->hostcpu = curcpu;
894	else
895		vcpu->hostcpu = NOCPU;
896
897	if (newstate == VCPU_IDLE)
898		wakeup(&vcpu->state);
899
900	return (0);
901}
902
903static void
904vcpu_require_state(struct vm *vm, int vcpuid, enum vcpu_state newstate)
905{
906	int error;
907
908	if ((error = vcpu_set_state(vm, vcpuid, newstate, false)) != 0)
909		panic("Error %d setting state to %d\n", error, newstate);
910}
911
912static void
913vcpu_require_state_locked(struct vcpu *vcpu, enum vcpu_state newstate)
914{
915	int error;
916
917	if ((error = vcpu_set_state_locked(vcpu, newstate, false)) != 0)
918		panic("Error %d setting state to %d", error, newstate);
919}
920
921static void
922vm_set_rendezvous_func(struct vm *vm, vm_rendezvous_func_t func)
923{
924
925	KASSERT(mtx_owned(&vm->rendezvous_mtx), ("rendezvous_mtx not locked"));
926
927	/*
928	 * Update 'rendezvous_func' and execute a write memory barrier to
929	 * ensure that it is visible across all host cpus. This is not needed
930	 * for correctness but it does ensure that all the vcpus will notice
931	 * that the rendezvous is requested immediately.
932	 */
933	vm->rendezvous_func = func;
934	wmb();
935}
936
/*
 * Trace helper for the rendezvous code: emits a per-vcpu trace record when
 * 'vcpuid' is non-negative and a per-vm record otherwise.
 */
#define	RENDEZVOUS_CTR0(vm, vcpuid, fmt)				\
	do {								\
		if ((vcpuid) >= 0)					\
			VCPU_CTR0(vm, vcpuid, fmt);			\
		else							\
			VM_CTR0(vm, fmt);				\
	} while (0)
944
945static void
946vm_handle_rendezvous(struct vm *vm, int vcpuid)
947{
948
949	KASSERT(vcpuid == -1 || (vcpuid >= 0 && vcpuid < VM_MAXCPU),
950	    ("vm_handle_rendezvous: invalid vcpuid %d", vcpuid));
951
952	mtx_lock(&vm->rendezvous_mtx);
953	while (vm->rendezvous_func != NULL) {
954		/* 'rendezvous_req_cpus' must be a subset of 'active_cpus' */
955		CPU_AND(&vm->rendezvous_req_cpus, &vm->active_cpus);
956
957		if (vcpuid != -1 &&
958		    CPU_ISSET(vcpuid, &vm->rendezvous_req_cpus) &&
959		    !CPU_ISSET(vcpuid, &vm->rendezvous_done_cpus)) {
960			VCPU_CTR0(vm, vcpuid, "Calling rendezvous func");
961			(*vm->rendezvous_func)(vm, vcpuid, vm->rendezvous_arg);
962			CPU_SET(vcpuid, &vm->rendezvous_done_cpus);
963		}
964		if (CPU_CMP(&vm->rendezvous_req_cpus,
965		    &vm->rendezvous_done_cpus) == 0) {
966			VCPU_CTR0(vm, vcpuid, "Rendezvous completed");
967			vm_set_rendezvous_func(vm, NULL);
968			wakeup(&vm->rendezvous_func);
969			break;
970		}
971		RENDEZVOUS_CTR0(vm, vcpuid, "Wait for rendezvous completion");
972		mtx_sleep(&vm->rendezvous_func, &vm->rendezvous_mtx, 0,
973		    "vmrndv", 0);
974	}
975	mtx_unlock(&vm->rendezvous_mtx);
976}
977
978/*
979 * Emulate a guest 'hlt' by sleeping until the vcpu is ready to run.
980 */
981static int
982vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled, bool *retu)
983{
984	struct vm_exit *vmexit;
985	struct vcpu *vcpu;
986	int t, timo, spindown;
987
988	vcpu = &vm->vcpu[vcpuid];
989	spindown = 0;
990
991	vcpu_lock(vcpu);
992
993	/*
994	 * Do a final check for pending NMI or interrupts before
995	 * really putting this thread to sleep.
996	 *
997	 * These interrupts could have happened any time after we
998	 * returned from VMRUN() and before we grabbed the vcpu lock.
999	 */
1000	if (!vm_nmi_pending(vm, vcpuid) &&
1001	    (intr_disabled || !vlapic_pending_intr(vcpu->vlapic, NULL))) {
1002		t = ticks;
1003		vcpu_require_state_locked(vcpu, VCPU_SLEEPING);
1004		if (vlapic_enabled(vcpu->vlapic)) {
1005			/*
1006			 * XXX msleep_spin() is not interruptible so use the
1007			 * 'timo' to put an upper bound on the sleep time.
1008			 */
1009			timo = hz;
1010			msleep_spin(vcpu, &vcpu->mtx, "vmidle", timo);
1011		} else {
1012			/*
1013			 * Spindown the vcpu if the apic is disabled and it
1014			 * had entered the halted state.
1015			 */
1016			spindown = 1;
1017		}
1018		vcpu_require_state_locked(vcpu, VCPU_FROZEN);
1019		vmm_stat_incr(vm, vcpuid, VCPU_IDLE_TICKS, ticks - t);
1020	}
1021	vcpu_unlock(vcpu);
1022
1023	/*
1024	 * Since 'vm_deactivate_cpu()' grabs a sleep mutex we must call it
1025	 * outside the confines of the vcpu spinlock.
1026	 */
1027	if (spindown) {
1028		*retu = true;
1029		vmexit = vm_exitinfo(vm, vcpuid);
1030		vmexit->exitcode = VM_EXITCODE_SPINDOWN_CPU;
1031		vm_deactivate_cpu(vm, vcpuid);
1032		VCPU_CTR0(vm, vcpuid, "spinning down cpu");
1033	}
1034
1035	return (0);
1036}
1037
1038static int
1039vm_handle_paging(struct vm *vm, int vcpuid, bool *retu)
1040{
1041	int rv, ftype;
1042	struct vm_map *map;
1043	struct vcpu *vcpu;
1044	struct vm_exit *vme;
1045
1046	vcpu = &vm->vcpu[vcpuid];
1047	vme = &vcpu->exitinfo;
1048
1049	ftype = vme->u.paging.fault_type;
1050	KASSERT(ftype == VM_PROT_READ ||
1051	    ftype == VM_PROT_WRITE || ftype == VM_PROT_EXECUTE,
1052	    ("vm_handle_paging: invalid fault_type %d", ftype));
1053
1054	if (ftype == VM_PROT_READ || ftype == VM_PROT_WRITE) {
1055		rv = pmap_emulate_accessed_dirty(vmspace_pmap(vm->vmspace),
1056		    vme->u.paging.gpa, ftype);
1057		if (rv == 0)
1058			goto done;
1059	}
1060
1061	map = &vm->vmspace->vm_map;
1062	rv = vm_fault(map, vme->u.paging.gpa, ftype, VM_FAULT_NORMAL);
1063
1064	VCPU_CTR3(vm, vcpuid, "vm_handle_paging rv = %d, gpa = %#lx, "
1065	    "ftype = %d", rv, vme->u.paging.gpa, ftype);
1066
1067	if (rv != KERN_SUCCESS)
1068		return (EFAULT);
1069done:
1070	/* restart execution at the faulting instruction */
1071	vme->inst_length = 0;
1072
1073	return (0);
1074}
1075
1076static int
1077vm_handle_inst_emul(struct vm *vm, int vcpuid, bool *retu)
1078{
1079	struct vie *vie;
1080	struct vcpu *vcpu;
1081	struct vm_exit *vme;
1082	int error, inst_length;
1083	uint64_t rip, gla, gpa, cr3;
1084	mem_region_read_t mread;
1085	mem_region_write_t mwrite;
1086
1087	vcpu = &vm->vcpu[vcpuid];
1088	vme = &vcpu->exitinfo;
1089
1090	rip = vme->rip;
1091	inst_length = vme->inst_length;
1092
1093	gla = vme->u.inst_emul.gla;
1094	gpa = vme->u.inst_emul.gpa;
1095	cr3 = vme->u.inst_emul.cr3;
1096	vie = &vme->u.inst_emul.vie;
1097
1098	vie_init(vie);
1099
1100	/* Fetch, decode and emulate the faulting instruction */
1101	if (vmm_fetch_instruction(vm, vcpuid, rip, inst_length, cr3, vie) != 0)
1102		return (EFAULT);
1103
1104	if (vmm_decode_instruction(vm, vcpuid, gla, vie) != 0)
1105		return (EFAULT);
1106
1107	/* return to userland unless this is an in-kernel emulated device */
1108	if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) {
1109		mread = lapic_mmio_read;
1110		mwrite = lapic_mmio_write;
1111	} else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) {
1112		mread = vioapic_mmio_read;
1113		mwrite = vioapic_mmio_write;
1114	} else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) {
1115		mread = vhpet_mmio_read;
1116		mwrite = vhpet_mmio_write;
1117	} else {
1118		*retu = true;
1119		return (0);
1120	}
1121
1122	error = vmm_emulate_instruction(vm, vcpuid, gpa, vie, mread, mwrite,
1123	    retu);
1124
1125	return (error);
1126}
1127
1128int
1129vm_run(struct vm *vm, struct vm_run *vmrun)
1130{
1131	int error, vcpuid;
1132	struct vcpu *vcpu;
1133	struct pcb *pcb;
1134	uint64_t tscval, rip;
1135	struct vm_exit *vme;
1136	bool retu, intr_disabled;
1137	pmap_t pmap;
1138
1139	vcpuid = vmrun->cpuid;
1140
1141	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1142		return (EINVAL);
1143
1144	pmap = vmspace_pmap(vm->vmspace);
1145	vcpu = &vm->vcpu[vcpuid];
1146	vme = &vcpu->exitinfo;
1147	rip = vmrun->rip;
1148restart:
1149	critical_enter();
1150
1151	KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active),
1152	    ("vm_run: absurd pm_active"));
1153
1154	tscval = rdtsc();
1155
1156	pcb = PCPU_GET(curpcb);
1157	set_pcb_flags(pcb, PCB_FULL_IRET);
1158
1159	restore_guest_msrs(vm, vcpuid);
1160	restore_guest_fpustate(vcpu);
1161
1162	vcpu_require_state(vm, vcpuid, VCPU_RUNNING);
1163	error = VMRUN(vm->cookie, vcpuid, rip, pmap, &vm->rendezvous_func);
1164	vcpu_require_state(vm, vcpuid, VCPU_FROZEN);
1165
1166	save_guest_fpustate(vcpu);
1167	restore_host_msrs(vm, vcpuid);
1168
1169	vmm_stat_incr(vm, vcpuid, VCPU_TOTAL_RUNTIME, rdtsc() - tscval);
1170
1171	critical_exit();
1172
1173	if (error == 0) {
1174		retu = false;
1175		switch (vme->exitcode) {
1176		case VM_EXITCODE_IOAPIC_EOI:
1177			vioapic_process_eoi(vm, vcpuid,
1178			    vme->u.ioapic_eoi.vector);
1179			break;
1180		case VM_EXITCODE_RENDEZVOUS:
1181			vm_handle_rendezvous(vm, vcpuid);
1182			error = 0;
1183			break;
1184		case VM_EXITCODE_HLT:
1185			intr_disabled = ((vme->u.hlt.rflags & PSL_I) == 0);
1186			error = vm_handle_hlt(vm, vcpuid, intr_disabled, &retu);
1187			break;
1188		case VM_EXITCODE_PAGING:
1189			error = vm_handle_paging(vm, vcpuid, &retu);
1190			break;
1191		case VM_EXITCODE_INST_EMUL:
1192			error = vm_handle_inst_emul(vm, vcpuid, &retu);
1193			break;
1194		default:
1195			retu = true;	/* handled in userland */
1196			break;
1197		}
1198	}
1199
1200	if (error == 0 && retu == false) {
1201		rip = vme->rip + vme->inst_length;
1202		goto restart;
1203	}
1204
1205	/* copy the exit information */
1206	bcopy(vme, &vmrun->vm_exit, sizeof(struct vm_exit));
1207	return (error);
1208}
1209
1210int
1211vm_inject_event(struct vm *vm, int vcpuid, int type,
1212		int vector, uint32_t code, int code_valid)
1213{
1214	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1215		return (EINVAL);
1216
1217	if ((type > VM_EVENT_NONE && type < VM_EVENT_MAX) == 0)
1218		return (EINVAL);
1219
1220	if (vector < 0 || vector > 255)
1221		return (EINVAL);
1222
1223	return (VMINJECT(vm->cookie, vcpuid, type, vector, code, code_valid));
1224}
1225
/* Per-vcpu statistic, bumped by vm_nmi_clear() each time a pending NMI is cleared. */
static VMM_STAT(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu");
1227
1228int
1229vm_inject_nmi(struct vm *vm, int vcpuid)
1230{
1231	struct vcpu *vcpu;
1232
1233	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1234		return (EINVAL);
1235
1236	vcpu = &vm->vcpu[vcpuid];
1237
1238	vcpu->nmi_pending = 1;
1239	vcpu_notify_event(vm, vcpuid, false);
1240	return (0);
1241}
1242
1243int
1244vm_nmi_pending(struct vm *vm, int vcpuid)
1245{
1246	struct vcpu *vcpu;
1247
1248	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1249		panic("vm_nmi_pending: invalid vcpuid %d", vcpuid);
1250
1251	vcpu = &vm->vcpu[vcpuid];
1252
1253	return (vcpu->nmi_pending);
1254}
1255
1256void
1257vm_nmi_clear(struct vm *vm, int vcpuid)
1258{
1259	struct vcpu *vcpu;
1260
1261	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1262		panic("vm_nmi_pending: invalid vcpuid %d", vcpuid);
1263
1264	vcpu = &vm->vcpu[vcpuid];
1265
1266	if (vcpu->nmi_pending == 0)
1267		panic("vm_nmi_clear: inconsistent nmi_pending state");
1268
1269	vcpu->nmi_pending = 0;
1270	vmm_stat_incr(vm, vcpuid, VCPU_NMI_COUNT, 1);
1271}
1272
1273int
1274vm_get_capability(struct vm *vm, int vcpu, int type, int *retval)
1275{
1276	if (vcpu < 0 || vcpu >= VM_MAXCPU)
1277		return (EINVAL);
1278
1279	if (type < 0 || type >= VM_CAP_MAX)
1280		return (EINVAL);
1281
1282	return (VMGETCAP(vm->cookie, vcpu, type, retval));
1283}
1284
1285int
1286vm_set_capability(struct vm *vm, int vcpu, int type, int val)
1287{
1288	if (vcpu < 0 || vcpu >= VM_MAXCPU)
1289		return (EINVAL);
1290
1291	if (type < 0 || type >= VM_CAP_MAX)
1292		return (EINVAL);
1293
1294	return (VMSETCAP(vm->cookie, vcpu, type, val));
1295}
1296
1297uint64_t *
1298vm_guest_msrs(struct vm *vm, int cpu)
1299{
1300	return (vm->vcpu[cpu].guest_msrs);
1301}
1302
1303struct vlapic *
1304vm_lapic(struct vm *vm, int cpu)
1305{
1306	return (vm->vcpu[cpu].vlapic);
1307}
1308
1309struct vioapic *
1310vm_ioapic(struct vm *vm)
1311{
1312
1313	return (vm->vioapic);
1314}
1315
1316struct vhpet *
1317vm_hpet(struct vm *vm)
1318{
1319
1320	return (vm->vhpet);
1321}
1322
1323boolean_t
1324vmm_is_pptdev(int bus, int slot, int func)
1325{
1326	int found, i, n;
1327	int b, s, f;
1328	char *val, *cp, *cp2;
1329
1330	/*
1331	 * XXX
1332	 * The length of an environment variable is limited to 128 bytes which
1333	 * puts an upper limit on the number of passthru devices that may be
1334	 * specified using a single environment variable.
1335	 *
1336	 * Work around this by scanning multiple environment variable
1337	 * names instead of a single one - yuck!
1338	 */
1339	const char *names[] = { "pptdevs", "pptdevs2", "pptdevs3", NULL };
1340
1341	/* set pptdevs="1/2/3 4/5/6 7/8/9 10/11/12" */
1342	found = 0;
1343	for (i = 0; names[i] != NULL && !found; i++) {
1344		cp = val = getenv(names[i]);
1345		while (cp != NULL && *cp != '\0') {
1346			if ((cp2 = strchr(cp, ' ')) != NULL)
1347				*cp2 = '\0';
1348
1349			n = sscanf(cp, "%d/%d/%d", &b, &s, &f);
1350			if (n == 3 && bus == b && slot == s && func == f) {
1351				found = 1;
1352				break;
1353			}
1354
1355			if (cp2 != NULL)
1356				*cp2++ = ' ';
1357
1358			cp = cp2;
1359		}
1360		freeenv(val);
1361	}
1362	return (found);
1363}
1364
1365void *
1366vm_iommu_domain(struct vm *vm)
1367{
1368
1369	return (vm->iommu);
1370}
1371
1372int
1373vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state newstate,
1374    bool from_idle)
1375{
1376	int error;
1377	struct vcpu *vcpu;
1378
1379	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1380		panic("vm_set_run_state: invalid vcpuid %d", vcpuid);
1381
1382	vcpu = &vm->vcpu[vcpuid];
1383
1384	vcpu_lock(vcpu);
1385	error = vcpu_set_state_locked(vcpu, newstate, from_idle);
1386	vcpu_unlock(vcpu);
1387
1388	return (error);
1389}
1390
1391enum vcpu_state
1392vcpu_get_state(struct vm *vm, int vcpuid, int *hostcpu)
1393{
1394	struct vcpu *vcpu;
1395	enum vcpu_state state;
1396
1397	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1398		panic("vm_get_run_state: invalid vcpuid %d", vcpuid);
1399
1400	vcpu = &vm->vcpu[vcpuid];
1401
1402	vcpu_lock(vcpu);
1403	state = vcpu->state;
1404	if (hostcpu != NULL)
1405		*hostcpu = vcpu->hostcpu;
1406	vcpu_unlock(vcpu);
1407
1408	return (state);
1409}
1410
1411void
1412vm_activate_cpu(struct vm *vm, int vcpuid)
1413{
1414
1415	KASSERT(vcpuid >= 0 && vcpuid < VM_MAXCPU,
1416	    ("vm_activate_cpu: invalid vcpuid %d", vcpuid));
1417	KASSERT(!CPU_ISSET(vcpuid, &vm->active_cpus),
1418	    ("vm_activate_cpu: vcpuid %d is already active", vcpuid));
1419
1420	VCPU_CTR0(vm, vcpuid, "activated");
1421	CPU_SET_ATOMIC(vcpuid, &vm->active_cpus);
1422}
1423
1424static void
1425vm_deactivate_cpu(struct vm *vm, int vcpuid)
1426{
1427
1428	KASSERT(vcpuid >= 0 && vcpuid < VM_MAXCPU,
1429	    ("vm_deactivate_cpu: invalid vcpuid %d", vcpuid));
1430	KASSERT(CPU_ISSET(vcpuid, &vm->active_cpus),
1431	    ("vm_deactivate_cpu: vcpuid %d is not active", vcpuid));
1432
1433	VCPU_CTR0(vm, vcpuid, "deactivated");
1434	CPU_CLR_ATOMIC(vcpuid, &vm->active_cpus);
1435
1436	/*
1437	 * If a vcpu rendezvous is in progress then it could be blocked
1438	 * on 'vcpuid' - unblock it before disappearing forever.
1439	 */
1440	mtx_lock(&vm->rendezvous_mtx);
1441	if (vm->rendezvous_func != NULL) {
1442		VCPU_CTR0(vm, vcpuid, "unblock rendezvous after deactivation");
1443		wakeup(&vm->rendezvous_func);
1444	}
1445	mtx_unlock(&vm->rendezvous_mtx);
1446}
1447
1448cpuset_t
1449vm_active_cpus(struct vm *vm)
1450{
1451
1452	return (vm->active_cpus);
1453}
1454
1455void *
1456vcpu_stats(struct vm *vm, int vcpuid)
1457{
1458
1459	return (vm->vcpu[vcpuid].stats);
1460}
1461
1462int
1463vm_get_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state *state)
1464{
1465	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1466		return (EINVAL);
1467
1468	*state = vm->vcpu[vcpuid].x2apic_state;
1469
1470	return (0);
1471}
1472
1473int
1474vm_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state)
1475{
1476	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1477		return (EINVAL);
1478
1479	if (state >= X2APIC_STATE_LAST)
1480		return (EINVAL);
1481
1482	vm->vcpu[vcpuid].x2apic_state = state;
1483
1484	vlapic_set_x2apic_state(vm, vcpuid, state);
1485
1486	return (0);
1487}
1488
1489/*
1490 * This function is called to ensure that a vcpu "sees" a pending event
1491 * as soon as possible:
1492 * - If the vcpu thread is sleeping then it is woken up.
1493 * - If the vcpu is running on a different host_cpu then an IPI will be directed
1494 *   to the host_cpu to cause the vcpu to trap into the hypervisor.
1495 */
1496void
1497vcpu_notify_event(struct vm *vm, int vcpuid, bool lapic_intr)
1498{
1499	int hostcpu;
1500	struct vcpu *vcpu;
1501
1502	vcpu = &vm->vcpu[vcpuid];
1503
1504	vcpu_lock(vcpu);
1505	hostcpu = vcpu->hostcpu;
1506	if (vcpu->state == VCPU_RUNNING) {
1507		KASSERT(hostcpu != NOCPU, ("vcpu running on invalid hostcpu"));
1508		if (hostcpu != curcpu) {
1509			if (lapic_intr) {
1510				vlapic_post_intr(vcpu->vlapic, hostcpu,
1511				    vmm_ipinum);
1512			} else {
1513				ipi_cpu(hostcpu, vmm_ipinum);
1514			}
1515		} else {
1516			/*
1517			 * If the 'vcpu' is running on 'curcpu' then it must
1518			 * be sending a notification to itself (e.g. SELF_IPI).
1519			 * The pending event will be picked up when the vcpu
1520			 * transitions back to guest context.
1521			 */
1522		}
1523	} else {
1524		KASSERT(hostcpu == NOCPU, ("vcpu state %d not consistent "
1525		    "with hostcpu %d", vcpu->state, hostcpu));
1526		if (vcpu->state == VCPU_SLEEPING)
1527			wakeup_one(vcpu);
1528	}
1529	vcpu_unlock(vcpu);
1530}
1531
1532struct vmspace *
1533vm_get_vmspace(struct vm *vm)
1534{
1535
1536	return (vm->vmspace);
1537}
1538
/*
 * Translate a local APIC id into a vcpu id.  'vm' is not consulted.
 */
int
vm_apicid2vcpuid(struct vm *vm, int apicid)
{
	int vcpuid;

	/*
	 * XXX the apic id is assumed to be numerically identical to the
	 * vcpu id, so this is an identity mapping.
	 */
	vcpuid = apicid;

	return (vcpuid);
}
1547
1548void
1549vm_smp_rendezvous(struct vm *vm, int vcpuid, cpuset_t dest,
1550    vm_rendezvous_func_t func, void *arg)
1551{
1552	int i;
1553
1554	/*
1555	 * Enforce that this function is called without any locks
1556	 */
1557	WITNESS_WARN(WARN_PANIC, NULL, "vm_smp_rendezvous");
1558	KASSERT(vcpuid == -1 || (vcpuid >= 0 && vcpuid < VM_MAXCPU),
1559	    ("vm_smp_rendezvous: invalid vcpuid %d", vcpuid));
1560
1561restart:
1562	mtx_lock(&vm->rendezvous_mtx);
1563	if (vm->rendezvous_func != NULL) {
1564		/*
1565		 * If a rendezvous is already in progress then we need to
1566		 * call the rendezvous handler in case this 'vcpuid' is one
1567		 * of the targets of the rendezvous.
1568		 */
1569		RENDEZVOUS_CTR0(vm, vcpuid, "Rendezvous already in progress");
1570		mtx_unlock(&vm->rendezvous_mtx);
1571		vm_handle_rendezvous(vm, vcpuid);
1572		goto restart;
1573	}
1574	KASSERT(vm->rendezvous_func == NULL, ("vm_smp_rendezvous: previous "
1575	    "rendezvous is still in progress"));
1576
1577	RENDEZVOUS_CTR0(vm, vcpuid, "Initiating rendezvous");
1578	vm->rendezvous_req_cpus = dest;
1579	CPU_ZERO(&vm->rendezvous_done_cpus);
1580	vm->rendezvous_arg = arg;
1581	vm_set_rendezvous_func(vm, func);
1582	mtx_unlock(&vm->rendezvous_mtx);
1583
1584	/*
1585	 * Wake up any sleeping vcpus and trigger a VM-exit in any running
1586	 * vcpus so they handle the rendezvous as soon as possible.
1587	 */
1588	for (i = 0; i < VM_MAXCPU; i++) {
1589		if (CPU_ISSET(i, &dest))
1590			vcpu_notify_event(vm, i, false);
1591	}
1592
1593	vm_handle_rendezvous(vm, vcpuid);
1594}
1595