vmm.c revision 267447
1/*-
2 * Copyright (c) 2011 NetApp, Inc.
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 *
26 * $FreeBSD: stable/10/sys/amd64/vmm/vmm.c 267447 2014-06-13 19:10:40Z jhb $
27 */
28
29#include <sys/cdefs.h>
30__FBSDID("$FreeBSD: stable/10/sys/amd64/vmm/vmm.c 267447 2014-06-13 19:10:40Z jhb $");
31
32#include <sys/param.h>
33#include <sys/systm.h>
34#include <sys/kernel.h>
35#include <sys/module.h>
36#include <sys/sysctl.h>
37#include <sys/malloc.h>
38#include <sys/pcpu.h>
39#include <sys/lock.h>
40#include <sys/mutex.h>
41#include <sys/proc.h>
42#include <sys/rwlock.h>
43#include <sys/sched.h>
44#include <sys/smp.h>
45#include <sys/systm.h>
46
47#include <vm/vm.h>
48#include <vm/vm_object.h>
49#include <vm/vm_page.h>
50#include <vm/pmap.h>
51#include <vm/vm_map.h>
52#include <vm/vm_extern.h>
53#include <vm/vm_param.h>
54
55#include <machine/cpu.h>
56#include <machine/vm.h>
57#include <machine/pcb.h>
58#include <machine/smp.h>
59#include <x86/psl.h>
60#include <x86/apicreg.h>
61#include <machine/vmparam.h>
62
63#include <machine/vmm.h>
64#include <machine/vmm_dev.h>
65
66#include "vmm_ktr.h"
67#include "vmm_host.h"
68#include "vmm_mem.h"
69#include "vmm_util.h"
70#include "vhpet.h"
71#include "vioapic.h"
72#include "vlapic.h"
73#include "vmm_msr.h"
74#include "vmm_ipi.h"
75#include "vmm_stat.h"
76#include "vmm_lapic.h"
77
78#include "io/ppt.h"
79#include "io/iommu.h"
80
81struct vlapic;
82
83struct vcpu {
84	int		flags;
85	enum vcpu_state	state;
86	struct mtx	mtx;
87	int		hostcpu;	/* host cpuid this vcpu last ran on */
88	uint64_t	guest_msrs[VMM_MSR_NUM];
89	struct vlapic	*vlapic;
90	int		 vcpuid;
91	struct savefpu	*guestfpu;	/* guest fpu state */
92	uint64_t	guest_xcr0;
93	void		*stats;
94	struct vm_exit	exitinfo;
95	enum x2apic_state x2apic_state;
96	int		nmi_pending;
97	struct vm_exception exception;
98	int		exception_pending;
99};
100
101#define	vcpu_lock_init(v)	mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN)
102#define	vcpu_lock(v)		mtx_lock_spin(&((v)->mtx))
103#define	vcpu_unlock(v)		mtx_unlock_spin(&((v)->mtx))
104#define	vcpu_assert_locked(v)	mtx_assert(&((v)->mtx), MA_OWNED)
105
106struct mem_seg {
107	vm_paddr_t	gpa;
108	size_t		len;
109	boolean_t	wired;
110	vm_object_t	object;
111};
112#define	VM_MAX_MEMORY_SEGMENTS	2
113
114struct vm {
115	void		*cookie;	/* processor-specific data */
116	void		*iommu;		/* iommu-specific data */
117	struct vhpet	*vhpet;		/* virtual HPET */
118	struct vioapic	*vioapic;	/* virtual ioapic */
119	struct vmspace	*vmspace;	/* guest's address space */
120	struct vcpu	vcpu[VM_MAXCPU];
121	int		num_mem_segs;
122	struct mem_seg	mem_segs[VM_MAX_MEMORY_SEGMENTS];
123	char		name[VM_MAX_NAMELEN];
124
125	/*
126	 * Set of active vcpus.
127	 * An active vcpu is one that has been started implicitly (BSP) or
128	 * explicitly (AP) by sending it a startup ipi.
129	 */
130	volatile cpuset_t active_cpus;
131
132	struct mtx	rendezvous_mtx;
133	cpuset_t	rendezvous_req_cpus;
134	cpuset_t	rendezvous_done_cpus;
135	void		*rendezvous_arg;
136	vm_rendezvous_func_t rendezvous_func;
137};
138
139static int vmm_initialized;
140
141static struct vmm_ops *ops;
142#define	VMM_INIT(num)	(ops != NULL ? (*ops->init)(num) : 0)
143#define	VMM_CLEANUP()	(ops != NULL ? (*ops->cleanup)() : 0)
144#define	VMM_RESUME()	(ops != NULL ? (*ops->resume)() : 0)
145
146#define	VMINIT(vm, pmap) (ops != NULL ? (*ops->vminit)(vm, pmap): NULL)
147#define	VMRUN(vmi, vcpu, rip, pmap, rptr) \
148	(ops != NULL ? (*ops->vmrun)(vmi, vcpu, rip, pmap, rptr) : ENXIO)
149#define	VMCLEANUP(vmi)	(ops != NULL ? (*ops->vmcleanup)(vmi) : NULL)
150#define	VMSPACE_ALLOC(min, max) \
151	(ops != NULL ? (*ops->vmspace_alloc)(min, max) : NULL)
152#define	VMSPACE_FREE(vmspace) \
153	(ops != NULL ? (*ops->vmspace_free)(vmspace) : ENXIO)
154#define	VMGETREG(vmi, vcpu, num, retval)		\
155	(ops != NULL ? (*ops->vmgetreg)(vmi, vcpu, num, retval) : ENXIO)
156#define	VMSETREG(vmi, vcpu, num, val)		\
157	(ops != NULL ? (*ops->vmsetreg)(vmi, vcpu, num, val) : ENXIO)
158#define	VMGETDESC(vmi, vcpu, num, desc)		\
159	(ops != NULL ? (*ops->vmgetdesc)(vmi, vcpu, num, desc) : ENXIO)
160#define	VMSETDESC(vmi, vcpu, num, desc)		\
161	(ops != NULL ? (*ops->vmsetdesc)(vmi, vcpu, num, desc) : ENXIO)
162#define	VMGETCAP(vmi, vcpu, num, retval)	\
163	(ops != NULL ? (*ops->vmgetcap)(vmi, vcpu, num, retval) : ENXIO)
164#define	VMSETCAP(vmi, vcpu, num, val)		\
165	(ops != NULL ? (*ops->vmsetcap)(vmi, vcpu, num, val) : ENXIO)
166#define	VLAPIC_INIT(vmi, vcpu)			\
167	(ops != NULL ? (*ops->vlapic_init)(vmi, vcpu) : NULL)
168#define	VLAPIC_CLEANUP(vmi, vlapic)		\
169	(ops != NULL ? (*ops->vlapic_cleanup)(vmi, vlapic) : NULL)
170
171#define	fpu_start_emulating()	load_cr0(rcr0() | CR0_TS)
172#define	fpu_stop_emulating()	clts()
173
174static MALLOC_DEFINE(M_VM, "vm", "vm");
175CTASSERT(VMM_MSR_NUM <= 64);	/* msr_mask can keep track of up to 64 msrs */
176
177/* statistics */
178static VMM_STAT(VCPU_TOTAL_RUNTIME, "vcpu total runtime");
179
180SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW, NULL, NULL);
181
182static int vmm_ipinum;
183SYSCTL_INT(_hw_vmm, OID_AUTO, ipinum, CTLFLAG_RD, &vmm_ipinum, 0,
184    "IPI vector used for vcpu notifications");
185
186static void vm_deactivate_cpu(struct vm *vm, int vcpuid);
187
188static void
189vcpu_cleanup(struct vm *vm, int i)
190{
191	struct vcpu *vcpu = &vm->vcpu[i];
192
193	VLAPIC_CLEANUP(vm->cookie, vcpu->vlapic);
194	vmm_stat_free(vcpu->stats);
195	fpu_save_area_free(vcpu->guestfpu);
196}
197
198static void
199vcpu_init(struct vm *vm, uint32_t vcpu_id)
200{
201	struct vcpu *vcpu;
202
203	vcpu = &vm->vcpu[vcpu_id];
204
205	vcpu_lock_init(vcpu);
206	vcpu->hostcpu = NOCPU;
207	vcpu->vcpuid = vcpu_id;
208	vcpu->vlapic = VLAPIC_INIT(vm->cookie, vcpu_id);
209	vm_set_x2apic_state(vm, vcpu_id, X2APIC_DISABLED);
210	vcpu->guest_xcr0 = XFEATURE_ENABLED_X87;
211	vcpu->guestfpu = fpu_save_area_alloc();
212	fpu_save_area_reset(vcpu->guestfpu);
213	vcpu->stats = vmm_stat_alloc();
214}
215
216struct vm_exit *
217vm_exitinfo(struct vm *vm, int cpuid)
218{
219	struct vcpu *vcpu;
220
221	if (cpuid < 0 || cpuid >= VM_MAXCPU)
222		panic("vm_exitinfo: invalid cpuid %d", cpuid);
223
224	vcpu = &vm->vcpu[cpuid];
225
226	return (&vcpu->exitinfo);
227}
228
229static void
230vmm_resume(void)
231{
232	VMM_RESUME();
233}
234
235static int
236vmm_init(void)
237{
238	int error;
239
240	vmm_host_state_init();
241
242	vmm_ipinum = vmm_ipi_alloc();
243	if (vmm_ipinum == 0)
244		vmm_ipinum = IPI_AST;
245
246	error = vmm_mem_init();
247	if (error)
248		return (error);
249
250	if (vmm_is_intel())
251		ops = &vmm_ops_intel;
252	else if (vmm_is_amd())
253		ops = &vmm_ops_amd;
254	else
255		return (ENXIO);
256
257	vmm_msr_init();
258	vmm_resume_p = vmm_resume;
259
260	return (VMM_INIT(vmm_ipinum));
261}
262
263static int
264vmm_handler(module_t mod, int what, void *arg)
265{
266	int error;
267
268	switch (what) {
269	case MOD_LOAD:
270		vmmdev_init();
271		if (ppt_avail_devices() > 0)
272			iommu_init();
273		error = vmm_init();
274		if (error == 0)
275			vmm_initialized = 1;
276		break;
277	case MOD_UNLOAD:
278		error = vmmdev_cleanup();
279		if (error == 0) {
280			vmm_resume_p = NULL;
281			iommu_cleanup();
282			if (vmm_ipinum != IPI_AST)
283				vmm_ipi_free(vmm_ipinum);
284			error = VMM_CLEANUP();
285			/*
286			 * Something bad happened - prevent new
287			 * VMs from being created
288			 */
289			if (error)
290				vmm_initialized = 0;
291		}
292		break;
293	default:
294		error = 0;
295		break;
296	}
297	return (error);
298}
299
300static moduledata_t vmm_kmod = {
301	"vmm",
302	vmm_handler,
303	NULL
304};
305
306/*
307 * vmm initialization has the following dependencies:
308 *
309 * - iommu initialization must happen after the pci passthru driver has had
310 *   a chance to attach to any passthru devices (after SI_SUB_CONFIGURE).
311 *
312 * - VT-x initialization requires smp_rendezvous() and therefore must happen
313 *   after SMP is fully functional (after SI_SUB_SMP).
314 */
315DECLARE_MODULE(vmm, vmm_kmod, SI_SUB_SMP + 1, SI_ORDER_ANY);
316MODULE_VERSION(vmm, 1);
317
318int
319vm_create(const char *name, struct vm **retvm)
320{
321	int i;
322	struct vm *vm;
323	struct vmspace *vmspace;
324
325	const int BSP = 0;
326
327	/*
328	 * If vmm.ko could not be successfully initialized then don't attempt
329	 * to create the virtual machine.
330	 */
331	if (!vmm_initialized)
332		return (ENXIO);
333
334	if (name == NULL || strlen(name) >= VM_MAX_NAMELEN)
335		return (EINVAL);
336
337	vmspace = VMSPACE_ALLOC(VM_MIN_ADDRESS, VM_MAXUSER_ADDRESS);
338	if (vmspace == NULL)
339		return (ENOMEM);
340
341	vm = malloc(sizeof(struct vm), M_VM, M_WAITOK | M_ZERO);
342	strcpy(vm->name, name);
343	vm->vmspace = vmspace;
344	mtx_init(&vm->rendezvous_mtx, "vm rendezvous lock", 0, MTX_DEF);
345	vm->cookie = VMINIT(vm, vmspace_pmap(vmspace));
346	vm->vioapic = vioapic_init(vm);
347	vm->vhpet = vhpet_init(vm);
348
349	for (i = 0; i < VM_MAXCPU; i++) {
350		vcpu_init(vm, i);
351		guest_msrs_init(vm, i);
352	}
353
354	vm_activate_cpu(vm, BSP);
355
356	*retvm = vm;
357	return (0);
358}
359
360static void
361vm_free_mem_seg(struct vm *vm, struct mem_seg *seg)
362{
363
364	if (seg->object != NULL)
365		vmm_mem_free(vm->vmspace, seg->gpa, seg->len);
366
367	bzero(seg, sizeof(*seg));
368}
369
370void
371vm_destroy(struct vm *vm)
372{
373	int i;
374
375	ppt_unassign_all(vm);
376
377	if (vm->iommu != NULL)
378		iommu_destroy_domain(vm->iommu);
379
380	vhpet_cleanup(vm->vhpet);
381	vioapic_cleanup(vm->vioapic);
382
383	for (i = 0; i < vm->num_mem_segs; i++)
384		vm_free_mem_seg(vm, &vm->mem_segs[i]);
385
386	vm->num_mem_segs = 0;
387
388	for (i = 0; i < VM_MAXCPU; i++)
389		vcpu_cleanup(vm, i);
390
391	VMSPACE_FREE(vm->vmspace);
392
393	VMCLEANUP(vm->cookie);
394
395	free(vm, M_VM);
396}
397
398const char *
399vm_name(struct vm *vm)
400{
401	return (vm->name);
402}
403
404int
405vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
406{
407	vm_object_t obj;
408
409	if ((obj = vmm_mmio_alloc(vm->vmspace, gpa, len, hpa)) == NULL)
410		return (ENOMEM);
411	else
412		return (0);
413}
414
415int
416vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len)
417{
418
419	vmm_mmio_free(vm->vmspace, gpa, len);
420	return (0);
421}
422
423boolean_t
424vm_mem_allocated(struct vm *vm, vm_paddr_t gpa)
425{
426	int i;
427	vm_paddr_t gpabase, gpalimit;
428
429	for (i = 0; i < vm->num_mem_segs; i++) {
430		gpabase = vm->mem_segs[i].gpa;
431		gpalimit = gpabase + vm->mem_segs[i].len;
432		if (gpa >= gpabase && gpa < gpalimit)
433			return (TRUE);		/* 'gpa' is regular memory */
434	}
435
436	if (ppt_is_mmio(vm, gpa))
437		return (TRUE);			/* 'gpa' is pci passthru mmio */
438
439	return (FALSE);
440}
441
442int
443vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len)
444{
445	int available, allocated;
446	struct mem_seg *seg;
447	vm_object_t object;
448	vm_paddr_t g;
449
450	if ((gpa & PAGE_MASK) || (len & PAGE_MASK) || len == 0)
451		return (EINVAL);
452
453	available = allocated = 0;
454	g = gpa;
455	while (g < gpa + len) {
456		if (vm_mem_allocated(vm, g))
457			allocated++;
458		else
459			available++;
460
461		g += PAGE_SIZE;
462	}
463
464	/*
465	 * If there are some allocated and some available pages in the address
466	 * range then it is an error.
467	 */
468	if (allocated && available)
469		return (EINVAL);
470
471	/*
472	 * If the entire address range being requested has already been
473	 * allocated then there isn't anything more to do.
474	 */
475	if (allocated && available == 0)
476		return (0);
477
478	if (vm->num_mem_segs >= VM_MAX_MEMORY_SEGMENTS)
479		return (E2BIG);
480
481	seg = &vm->mem_segs[vm->num_mem_segs];
482
483	if ((object = vmm_mem_alloc(vm->vmspace, gpa, len)) == NULL)
484		return (ENOMEM);
485
486	seg->gpa = gpa;
487	seg->len = len;
488	seg->object = object;
489	seg->wired = FALSE;
490
491	vm->num_mem_segs++;
492
493	return (0);
494}
495
496static void
497vm_gpa_unwire(struct vm *vm)
498{
499	int i, rv;
500	struct mem_seg *seg;
501
502	for (i = 0; i < vm->num_mem_segs; i++) {
503		seg = &vm->mem_segs[i];
504		if (!seg->wired)
505			continue;
506
507		rv = vm_map_unwire(&vm->vmspace->vm_map,
508				   seg->gpa, seg->gpa + seg->len,
509				   VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
510		KASSERT(rv == KERN_SUCCESS, ("vm(%s) memory segment "
511		    "%#lx/%ld could not be unwired: %d",
512		    vm_name(vm), seg->gpa, seg->len, rv));
513
514		seg->wired = FALSE;
515	}
516}
517
518static int
519vm_gpa_wire(struct vm *vm)
520{
521	int i, rv;
522	struct mem_seg *seg;
523
524	for (i = 0; i < vm->num_mem_segs; i++) {
525		seg = &vm->mem_segs[i];
526		if (seg->wired)
527			continue;
528
529		/* XXX rlimits? */
530		rv = vm_map_wire(&vm->vmspace->vm_map,
531				 seg->gpa, seg->gpa + seg->len,
532				 VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
533		if (rv != KERN_SUCCESS)
534			break;
535
536		seg->wired = TRUE;
537	}
538
539	if (i < vm->num_mem_segs) {
540		/*
541		 * Undo the wiring before returning an error.
542		 */
543		vm_gpa_unwire(vm);
544		return (EAGAIN);
545	}
546
547	return (0);
548}
549
550static void
551vm_iommu_modify(struct vm *vm, boolean_t map)
552{
553	int i, sz;
554	vm_paddr_t gpa, hpa;
555	struct mem_seg *seg;
556	void *vp, *cookie, *host_domain;
557
558	sz = PAGE_SIZE;
559	host_domain = iommu_host_domain();
560
561	for (i = 0; i < vm->num_mem_segs; i++) {
562		seg = &vm->mem_segs[i];
563		KASSERT(seg->wired, ("vm(%s) memory segment %#lx/%ld not wired",
564		    vm_name(vm), seg->gpa, seg->len));
565
566		gpa = seg->gpa;
567		while (gpa < seg->gpa + seg->len) {
568			vp = vm_gpa_hold(vm, gpa, PAGE_SIZE, VM_PROT_WRITE,
569					 &cookie);
570			KASSERT(vp != NULL, ("vm(%s) could not map gpa %#lx",
571			    vm_name(vm), gpa));
572
573			vm_gpa_release(cookie);
574
575			hpa = DMAP_TO_PHYS((uintptr_t)vp);
576			if (map) {
577				iommu_create_mapping(vm->iommu, gpa, hpa, sz);
578				iommu_remove_mapping(host_domain, hpa, sz);
579			} else {
580				iommu_remove_mapping(vm->iommu, gpa, sz);
581				iommu_create_mapping(host_domain, hpa, hpa, sz);
582			}
583
584			gpa += PAGE_SIZE;
585		}
586	}
587
588	/*
589	 * Invalidate the cached translations associated with the domain
590	 * from which pages were removed.
591	 */
592	if (map)
593		iommu_invalidate_tlb(host_domain);
594	else
595		iommu_invalidate_tlb(vm->iommu);
596}
597
598#define	vm_iommu_unmap(vm)	vm_iommu_modify((vm), FALSE)
599#define	vm_iommu_map(vm)	vm_iommu_modify((vm), TRUE)
600
601int
602vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func)
603{
604	int error;
605
606	error = ppt_unassign_device(vm, bus, slot, func);
607	if (error)
608		return (error);
609
610	if (ppt_assigned_devices(vm) == 0) {
611		vm_iommu_unmap(vm);
612		vm_gpa_unwire(vm);
613	}
614	return (0);
615}
616
617int
618vm_assign_pptdev(struct vm *vm, int bus, int slot, int func)
619{
620	int error;
621	vm_paddr_t maxaddr;
622
623	/*
624	 * Virtual machines with pci passthru devices get special treatment:
625	 * - the guest physical memory is wired
626	 * - the iommu is programmed to do the 'gpa' to 'hpa' translation
627	 *
628	 * We need to do this before the first pci passthru device is attached.
629	 */
630	if (ppt_assigned_devices(vm) == 0) {
631		KASSERT(vm->iommu == NULL,
632		    ("vm_assign_pptdev: iommu must be NULL"));
633		maxaddr = vmm_mem_maxaddr();
634		vm->iommu = iommu_create_domain(maxaddr);
635
636		error = vm_gpa_wire(vm);
637		if (error)
638			return (error);
639
640		vm_iommu_map(vm);
641	}
642
643	error = ppt_assign_device(vm, bus, slot, func);
644	return (error);
645}
646
647void *
648vm_gpa_hold(struct vm *vm, vm_paddr_t gpa, size_t len, int reqprot,
649	    void **cookie)
650{
651	int count, pageoff;
652	vm_page_t m;
653
654	pageoff = gpa & PAGE_MASK;
655	if (len > PAGE_SIZE - pageoff)
656		panic("vm_gpa_hold: invalid gpa/len: 0x%016lx/%lu", gpa, len);
657
658	count = vm_fault_quick_hold_pages(&vm->vmspace->vm_map,
659	    trunc_page(gpa), PAGE_SIZE, reqprot, &m, 1);
660
661	if (count == 1) {
662		*cookie = m;
663		return ((void *)(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)) + pageoff));
664	} else {
665		*cookie = NULL;
666		return (NULL);
667	}
668}
669
670void
671vm_gpa_release(void *cookie)
672{
673	vm_page_t m = cookie;
674
675	vm_page_lock(m);
676	vm_page_unhold(m);
677	vm_page_unlock(m);
678}
679
680int
681vm_gpabase2memseg(struct vm *vm, vm_paddr_t gpabase,
682		  struct vm_memory_segment *seg)
683{
684	int i;
685
686	for (i = 0; i < vm->num_mem_segs; i++) {
687		if (gpabase == vm->mem_segs[i].gpa) {
688			seg->gpa = vm->mem_segs[i].gpa;
689			seg->len = vm->mem_segs[i].len;
690			seg->wired = vm->mem_segs[i].wired;
691			return (0);
692		}
693	}
694	return (-1);
695}
696
697int
698vm_get_memobj(struct vm *vm, vm_paddr_t gpa, size_t len,
699	      vm_offset_t *offset, struct vm_object **object)
700{
701	int i;
702	size_t seg_len;
703	vm_paddr_t seg_gpa;
704	vm_object_t seg_obj;
705
706	for (i = 0; i < vm->num_mem_segs; i++) {
707		if ((seg_obj = vm->mem_segs[i].object) == NULL)
708			continue;
709
710		seg_gpa = vm->mem_segs[i].gpa;
711		seg_len = vm->mem_segs[i].len;
712
713		if (gpa >= seg_gpa && gpa < seg_gpa + seg_len) {
714			*offset = gpa - seg_gpa;
715			*object = seg_obj;
716			vm_object_reference(seg_obj);
717			return (0);
718		}
719	}
720
721	return (EINVAL);
722}
723
724int
725vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval)
726{
727
728	if (vcpu < 0 || vcpu >= VM_MAXCPU)
729		return (EINVAL);
730
731	if (reg >= VM_REG_LAST)
732		return (EINVAL);
733
734	return (VMGETREG(vm->cookie, vcpu, reg, retval));
735}
736
737int
738vm_set_register(struct vm *vm, int vcpu, int reg, uint64_t val)
739{
740
741	if (vcpu < 0 || vcpu >= VM_MAXCPU)
742		return (EINVAL);
743
744	if (reg >= VM_REG_LAST)
745		return (EINVAL);
746
747	return (VMSETREG(vm->cookie, vcpu, reg, val));
748}
749
750static boolean_t
751is_descriptor_table(int reg)
752{
753
754	switch (reg) {
755	case VM_REG_GUEST_IDTR:
756	case VM_REG_GUEST_GDTR:
757		return (TRUE);
758	default:
759		return (FALSE);
760	}
761}
762
763static boolean_t
764is_segment_register(int reg)
765{
766
767	switch (reg) {
768	case VM_REG_GUEST_ES:
769	case VM_REG_GUEST_CS:
770	case VM_REG_GUEST_SS:
771	case VM_REG_GUEST_DS:
772	case VM_REG_GUEST_FS:
773	case VM_REG_GUEST_GS:
774	case VM_REG_GUEST_TR:
775	case VM_REG_GUEST_LDTR:
776		return (TRUE);
777	default:
778		return (FALSE);
779	}
780}
781
782int
783vm_get_seg_desc(struct vm *vm, int vcpu, int reg,
784		struct seg_desc *desc)
785{
786
787	if (vcpu < 0 || vcpu >= VM_MAXCPU)
788		return (EINVAL);
789
790	if (!is_segment_register(reg) && !is_descriptor_table(reg))
791		return (EINVAL);
792
793	return (VMGETDESC(vm->cookie, vcpu, reg, desc));
794}
795
796int
797vm_set_seg_desc(struct vm *vm, int vcpu, int reg,
798		struct seg_desc *desc)
799{
800	if (vcpu < 0 || vcpu >= VM_MAXCPU)
801		return (EINVAL);
802
803	if (!is_segment_register(reg) && !is_descriptor_table(reg))
804		return (EINVAL);
805
806	return (VMSETDESC(vm->cookie, vcpu, reg, desc));
807}
808
809static void
810restore_guest_fpustate(struct vcpu *vcpu)
811{
812
813	/* flush host state to the pcb */
814	fpuexit(curthread);
815
816	/* restore guest FPU state */
817	fpu_stop_emulating();
818	fpurestore(vcpu->guestfpu);
819
820	/* restore guest XCR0 if XSAVE is enabled in the host */
821	if (rcr4() & CR4_XSAVE)
822		load_xcr(0, vcpu->guest_xcr0);
823
824	/*
825	 * The FPU is now "dirty" with the guest's state so turn on emulation
826	 * to trap any access to the FPU by the host.
827	 */
828	fpu_start_emulating();
829}
830
831static void
832save_guest_fpustate(struct vcpu *vcpu)
833{
834
835	if ((rcr0() & CR0_TS) == 0)
836		panic("fpu emulation not enabled in host!");
837
838	/* save guest XCR0 and restore host XCR0 */
839	if (rcr4() & CR4_XSAVE) {
840		vcpu->guest_xcr0 = rxcr(0);
841		load_xcr(0, vmm_get_host_xcr0());
842	}
843
844	/* save guest FPU state */
845	fpu_stop_emulating();
846	fpusave(vcpu->guestfpu);
847	fpu_start_emulating();
848}
849
850static VMM_STAT(VCPU_IDLE_TICKS, "number of ticks vcpu was idle");
851
852static int
853vcpu_set_state_locked(struct vcpu *vcpu, enum vcpu_state newstate,
854    bool from_idle)
855{
856	int error;
857
858	vcpu_assert_locked(vcpu);
859
860	/*
861	 * State transitions from the vmmdev_ioctl() must always begin from
862	 * the VCPU_IDLE state. This guarantees that there is only a single
863	 * ioctl() operating on a vcpu at any point.
864	 */
865	if (from_idle) {
866		while (vcpu->state != VCPU_IDLE)
867			msleep_spin(&vcpu->state, &vcpu->mtx, "vmstat", hz);
868	} else {
869		KASSERT(vcpu->state != VCPU_IDLE, ("invalid transition from "
870		    "vcpu idle state"));
871	}
872
873	if (vcpu->state == VCPU_RUNNING) {
874		KASSERT(vcpu->hostcpu == curcpu, ("curcpu %d and hostcpu %d "
875		    "mismatch for running vcpu", curcpu, vcpu->hostcpu));
876	} else {
877		KASSERT(vcpu->hostcpu == NOCPU, ("Invalid hostcpu %d for a "
878		    "vcpu that is not running", vcpu->hostcpu));
879	}
880
881	/*
882	 * The following state transitions are allowed:
883	 * IDLE -> FROZEN -> IDLE
884	 * FROZEN -> RUNNING -> FROZEN
885	 * FROZEN -> SLEEPING -> FROZEN
886	 */
887	switch (vcpu->state) {
888	case VCPU_IDLE:
889	case VCPU_RUNNING:
890	case VCPU_SLEEPING:
891		error = (newstate != VCPU_FROZEN);
892		break;
893	case VCPU_FROZEN:
894		error = (newstate == VCPU_FROZEN);
895		break;
896	default:
897		error = 1;
898		break;
899	}
900
901	if (error)
902		return (EBUSY);
903
904	vcpu->state = newstate;
905	if (newstate == VCPU_RUNNING)
906		vcpu->hostcpu = curcpu;
907	else
908		vcpu->hostcpu = NOCPU;
909
910	if (newstate == VCPU_IDLE)
911		wakeup(&vcpu->state);
912
913	return (0);
914}
915
916static void
917vcpu_require_state(struct vm *vm, int vcpuid, enum vcpu_state newstate)
918{
919	int error;
920
921	if ((error = vcpu_set_state(vm, vcpuid, newstate, false)) != 0)
922		panic("Error %d setting state to %d\n", error, newstate);
923}
924
925static void
926vcpu_require_state_locked(struct vcpu *vcpu, enum vcpu_state newstate)
927{
928	int error;
929
930	if ((error = vcpu_set_state_locked(vcpu, newstate, false)) != 0)
931		panic("Error %d setting state to %d", error, newstate);
932}
933
934static void
935vm_set_rendezvous_func(struct vm *vm, vm_rendezvous_func_t func)
936{
937
938	KASSERT(mtx_owned(&vm->rendezvous_mtx), ("rendezvous_mtx not locked"));
939
940	/*
941	 * Update 'rendezvous_func' and execute a write memory barrier to
942	 * ensure that it is visible across all host cpus. This is not needed
943	 * for correctness but it does ensure that all the vcpus will notice
944	 * that the rendezvous is requested immediately.
945	 */
946	vm->rendezvous_func = func;
947	wmb();
948}
949
950#define	RENDEZVOUS_CTR0(vm, vcpuid, fmt)				\
951	do {								\
952		if (vcpuid >= 0)					\
953			VCPU_CTR0(vm, vcpuid, fmt);			\
954		else							\
955			VM_CTR0(vm, fmt);				\
956	} while (0)
957
958static void
959vm_handle_rendezvous(struct vm *vm, int vcpuid)
960{
961
962	KASSERT(vcpuid == -1 || (vcpuid >= 0 && vcpuid < VM_MAXCPU),
963	    ("vm_handle_rendezvous: invalid vcpuid %d", vcpuid));
964
965	mtx_lock(&vm->rendezvous_mtx);
966	while (vm->rendezvous_func != NULL) {
967		/* 'rendezvous_req_cpus' must be a subset of 'active_cpus' */
968		CPU_AND(&vm->rendezvous_req_cpus, &vm->active_cpus);
969
970		if (vcpuid != -1 &&
971		    CPU_ISSET(vcpuid, &vm->rendezvous_req_cpus) &&
972		    !CPU_ISSET(vcpuid, &vm->rendezvous_done_cpus)) {
973			VCPU_CTR0(vm, vcpuid, "Calling rendezvous func");
974			(*vm->rendezvous_func)(vm, vcpuid, vm->rendezvous_arg);
975			CPU_SET(vcpuid, &vm->rendezvous_done_cpus);
976		}
977		if (CPU_CMP(&vm->rendezvous_req_cpus,
978		    &vm->rendezvous_done_cpus) == 0) {
979			VCPU_CTR0(vm, vcpuid, "Rendezvous completed");
980			vm_set_rendezvous_func(vm, NULL);
981			wakeup(&vm->rendezvous_func);
982			break;
983		}
984		RENDEZVOUS_CTR0(vm, vcpuid, "Wait for rendezvous completion");
985		mtx_sleep(&vm->rendezvous_func, &vm->rendezvous_mtx, 0,
986		    "vmrndv", 0);
987	}
988	mtx_unlock(&vm->rendezvous_mtx);
989}
990
991/*
992 * Emulate a guest 'hlt' by sleeping until the vcpu is ready to run.
993 */
994static int
995vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled, bool *retu)
996{
997	struct vm_exit *vmexit;
998	struct vcpu *vcpu;
999	int t, timo, spindown;
1000
1001	vcpu = &vm->vcpu[vcpuid];
1002	spindown = 0;
1003
1004	vcpu_lock(vcpu);
1005
1006	/*
1007	 * Do a final check for pending NMI or interrupts before
1008	 * really putting this thread to sleep.
1009	 *
1010	 * These interrupts could have happened any time after we
1011	 * returned from VMRUN() and before we grabbed the vcpu lock.
1012	 */
1013	if (!vm_nmi_pending(vm, vcpuid) &&
1014	    (intr_disabled || !vlapic_pending_intr(vcpu->vlapic, NULL))) {
1015		t = ticks;
1016		vcpu_require_state_locked(vcpu, VCPU_SLEEPING);
1017		if (vlapic_enabled(vcpu->vlapic)) {
1018			/*
1019			 * XXX msleep_spin() is not interruptible so use the
1020			 * 'timo' to put an upper bound on the sleep time.
1021			 */
1022			timo = hz;
1023			msleep_spin(vcpu, &vcpu->mtx, "vmidle", timo);
1024		} else {
1025			/*
1026			 * Spindown the vcpu if the apic is disabled and it
1027			 * had entered the halted state.
1028			 */
1029			spindown = 1;
1030		}
1031		vcpu_require_state_locked(vcpu, VCPU_FROZEN);
1032		vmm_stat_incr(vm, vcpuid, VCPU_IDLE_TICKS, ticks - t);
1033	}
1034	vcpu_unlock(vcpu);
1035
1036	/*
1037	 * Since 'vm_deactivate_cpu()' grabs a sleep mutex we must call it
1038	 * outside the confines of the vcpu spinlock.
1039	 */
1040	if (spindown) {
1041		*retu = true;
1042		vmexit = vm_exitinfo(vm, vcpuid);
1043		vmexit->exitcode = VM_EXITCODE_SPINDOWN_CPU;
1044		vm_deactivate_cpu(vm, vcpuid);
1045		VCPU_CTR0(vm, vcpuid, "spinning down cpu");
1046	}
1047
1048	return (0);
1049}
1050
1051static int
1052vm_handle_paging(struct vm *vm, int vcpuid, bool *retu)
1053{
1054	int rv, ftype;
1055	struct vm_map *map;
1056	struct vcpu *vcpu;
1057	struct vm_exit *vme;
1058
1059	vcpu = &vm->vcpu[vcpuid];
1060	vme = &vcpu->exitinfo;
1061
1062	ftype = vme->u.paging.fault_type;
1063	KASSERT(ftype == VM_PROT_READ ||
1064	    ftype == VM_PROT_WRITE || ftype == VM_PROT_EXECUTE,
1065	    ("vm_handle_paging: invalid fault_type %d", ftype));
1066
1067	if (ftype == VM_PROT_READ || ftype == VM_PROT_WRITE) {
1068		rv = pmap_emulate_accessed_dirty(vmspace_pmap(vm->vmspace),
1069		    vme->u.paging.gpa, ftype);
1070		if (rv == 0)
1071			goto done;
1072	}
1073
1074	map = &vm->vmspace->vm_map;
1075	rv = vm_fault(map, vme->u.paging.gpa, ftype, VM_FAULT_NORMAL);
1076
1077	VCPU_CTR3(vm, vcpuid, "vm_handle_paging rv = %d, gpa = %#lx, "
1078	    "ftype = %d", rv, vme->u.paging.gpa, ftype);
1079
1080	if (rv != KERN_SUCCESS)
1081		return (EFAULT);
1082done:
1083	/* restart execution at the faulting instruction */
1084	vme->inst_length = 0;
1085
1086	return (0);
1087}
1088
1089static int
1090vm_handle_inst_emul(struct vm *vm, int vcpuid, bool *retu)
1091{
1092	struct vie *vie;
1093	struct vcpu *vcpu;
1094	struct vm_exit *vme;
1095	int error, inst_length;
1096	uint64_t rip, gla, gpa, cr3;
1097	enum vie_cpu_mode cpu_mode;
1098	enum vie_paging_mode paging_mode;
1099	mem_region_read_t mread;
1100	mem_region_write_t mwrite;
1101
1102	vcpu = &vm->vcpu[vcpuid];
1103	vme = &vcpu->exitinfo;
1104
1105	rip = vme->rip;
1106	inst_length = vme->inst_length;
1107
1108	gla = vme->u.inst_emul.gla;
1109	gpa = vme->u.inst_emul.gpa;
1110	cr3 = vme->u.inst_emul.cr3;
1111	cpu_mode = vme->u.inst_emul.cpu_mode;
1112	paging_mode = vme->u.inst_emul.paging_mode;
1113	vie = &vme->u.inst_emul.vie;
1114
1115	vie_init(vie);
1116
1117	/* Fetch, decode and emulate the faulting instruction */
1118	if (vmm_fetch_instruction(vm, vcpuid, rip, inst_length, cr3,
1119	    paging_mode, vie) != 0)
1120		return (EFAULT);
1121
1122	if (vmm_decode_instruction(vm, vcpuid, gla, cpu_mode, vie) != 0)
1123		return (EFAULT);
1124
1125	/* return to userland unless this is an in-kernel emulated device */
1126	if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) {
1127		mread = lapic_mmio_read;
1128		mwrite = lapic_mmio_write;
1129	} else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) {
1130		mread = vioapic_mmio_read;
1131		mwrite = vioapic_mmio_write;
1132	} else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) {
1133		mread = vhpet_mmio_read;
1134		mwrite = vhpet_mmio_write;
1135	} else {
1136		*retu = true;
1137		return (0);
1138	}
1139
1140	error = vmm_emulate_instruction(vm, vcpuid, gpa, vie, mread, mwrite,
1141	    retu);
1142
1143	return (error);
1144}
1145
1146int
1147vm_run(struct vm *vm, struct vm_run *vmrun)
1148{
1149	int error, vcpuid;
1150	struct vcpu *vcpu;
1151	struct pcb *pcb;
1152	uint64_t tscval, rip;
1153	struct vm_exit *vme;
1154	bool retu, intr_disabled;
1155	pmap_t pmap;
1156
1157	vcpuid = vmrun->cpuid;
1158
1159	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1160		return (EINVAL);
1161
1162	pmap = vmspace_pmap(vm->vmspace);
1163	vcpu = &vm->vcpu[vcpuid];
1164	vme = &vcpu->exitinfo;
1165	rip = vmrun->rip;
1166restart:
1167	critical_enter();
1168
1169	KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active),
1170	    ("vm_run: absurd pm_active"));
1171
1172	tscval = rdtsc();
1173
1174	pcb = PCPU_GET(curpcb);
1175	set_pcb_flags(pcb, PCB_FULL_IRET);
1176
1177	restore_guest_msrs(vm, vcpuid);
1178	restore_guest_fpustate(vcpu);
1179
1180	vcpu_require_state(vm, vcpuid, VCPU_RUNNING);
1181	error = VMRUN(vm->cookie, vcpuid, rip, pmap, &vm->rendezvous_func);
1182	vcpu_require_state(vm, vcpuid, VCPU_FROZEN);
1183
1184	save_guest_fpustate(vcpu);
1185	restore_host_msrs(vm, vcpuid);
1186
1187	vmm_stat_incr(vm, vcpuid, VCPU_TOTAL_RUNTIME, rdtsc() - tscval);
1188
1189	critical_exit();
1190
1191	if (error == 0) {
1192		retu = false;
1193		switch (vme->exitcode) {
1194		case VM_EXITCODE_IOAPIC_EOI:
1195			vioapic_process_eoi(vm, vcpuid,
1196			    vme->u.ioapic_eoi.vector);
1197			break;
1198		case VM_EXITCODE_RENDEZVOUS:
1199			vm_handle_rendezvous(vm, vcpuid);
1200			error = 0;
1201			break;
1202		case VM_EXITCODE_HLT:
1203			intr_disabled = ((vme->u.hlt.rflags & PSL_I) == 0);
1204			error = vm_handle_hlt(vm, vcpuid, intr_disabled, &retu);
1205			break;
1206		case VM_EXITCODE_PAGING:
1207			error = vm_handle_paging(vm, vcpuid, &retu);
1208			break;
1209		case VM_EXITCODE_INST_EMUL:
1210			error = vm_handle_inst_emul(vm, vcpuid, &retu);
1211			break;
1212		default:
1213			retu = true;	/* handled in userland */
1214			break;
1215		}
1216	}
1217
1218	if (error == 0 && retu == false) {
1219		rip = vme->rip + vme->inst_length;
1220		goto restart;
1221	}
1222
1223	/* copy the exit information */
1224	bcopy(vme, &vmrun->vm_exit, sizeof(struct vm_exit));
1225	return (error);
1226}
1227
1228int
1229vm_inject_exception(struct vm *vm, int vcpuid, struct vm_exception *exception)
1230{
1231	struct vcpu *vcpu;
1232
1233	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1234		return (EINVAL);
1235
1236	if (exception->vector < 0 || exception->vector >= 32)
1237		return (EINVAL);
1238
1239	vcpu = &vm->vcpu[vcpuid];
1240
1241	if (vcpu->exception_pending) {
1242		VCPU_CTR2(vm, vcpuid, "Unable to inject exception %d due to "
1243		    "pending exception %d", exception->vector,
1244		    vcpu->exception.vector);
1245		return (EBUSY);
1246	}
1247
1248	vcpu->exception_pending = 1;
1249	vcpu->exception = *exception;
1250	VCPU_CTR1(vm, vcpuid, "Exception %d pending", exception->vector);
1251	return (0);
1252}
1253
1254int
1255vm_exception_pending(struct vm *vm, int vcpuid, struct vm_exception *exception)
1256{
1257	struct vcpu *vcpu;
1258	int pending;
1259
1260	KASSERT(vcpuid >= 0 && vcpuid < VM_MAXCPU, ("invalid vcpu %d", vcpuid));
1261
1262	vcpu = &vm->vcpu[vcpuid];
1263	pending = vcpu->exception_pending;
1264	if (pending) {
1265		vcpu->exception_pending = 0;
1266		*exception = vcpu->exception;
1267		VCPU_CTR1(vm, vcpuid, "Exception %d delivered",
1268		    exception->vector);
1269	}
1270	return (pending);
1271}
1272
1273static void
1274vm_inject_fault(struct vm *vm, int vcpuid, struct vm_exception *exception)
1275{
1276	struct vm_exit *vmexit;
1277	int error;
1278
1279	error = vm_inject_exception(vm, vcpuid, exception);
1280	KASSERT(error == 0, ("vm_inject_exception error %d", error));
1281
1282	/*
1283	 * A fault-like exception allows the instruction to be restarted
1284	 * after the exception handler returns.
1285	 *
1286	 * By setting the inst_length to 0 we ensure that the instruction
1287	 * pointer remains at the faulting instruction.
1288	 */
1289	vmexit = vm_exitinfo(vm, vcpuid);
1290	vmexit->inst_length = 0;
1291}
1292
1293void
1294vm_inject_gp(struct vm *vm, int vcpuid)
1295{
1296	struct vm_exception gpf = {
1297		.vector = IDT_GP,
1298		.error_code_valid = 1,
1299		.error_code = 0
1300	};
1301
1302	vm_inject_fault(vm, vcpuid, &gpf);
1303}
1304
1305void
1306vm_inject_ud(struct vm *vm, int vcpuid)
1307{
1308	struct vm_exception udf = {
1309		.vector = IDT_UD,
1310		.error_code_valid = 0
1311	};
1312
1313	vm_inject_fault(vm, vcpuid, &udf);
1314}
1315
1316static VMM_STAT(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu");
1317
1318int
1319vm_inject_nmi(struct vm *vm, int vcpuid)
1320{
1321	struct vcpu *vcpu;
1322
1323	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1324		return (EINVAL);
1325
1326	vcpu = &vm->vcpu[vcpuid];
1327
1328	vcpu->nmi_pending = 1;
1329	vcpu_notify_event(vm, vcpuid, false);
1330	return (0);
1331}
1332
1333int
1334vm_nmi_pending(struct vm *vm, int vcpuid)
1335{
1336	struct vcpu *vcpu;
1337
1338	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1339		panic("vm_nmi_pending: invalid vcpuid %d", vcpuid);
1340
1341	vcpu = &vm->vcpu[vcpuid];
1342
1343	return (vcpu->nmi_pending);
1344}
1345
1346void
1347vm_nmi_clear(struct vm *vm, int vcpuid)
1348{
1349	struct vcpu *vcpu;
1350
1351	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1352		panic("vm_nmi_pending: invalid vcpuid %d", vcpuid);
1353
1354	vcpu = &vm->vcpu[vcpuid];
1355
1356	if (vcpu->nmi_pending == 0)
1357		panic("vm_nmi_clear: inconsistent nmi_pending state");
1358
1359	vcpu->nmi_pending = 0;
1360	vmm_stat_incr(vm, vcpuid, VCPU_NMI_COUNT, 1);
1361}
1362
1363int
1364vm_get_capability(struct vm *vm, int vcpu, int type, int *retval)
1365{
1366	if (vcpu < 0 || vcpu >= VM_MAXCPU)
1367		return (EINVAL);
1368
1369	if (type < 0 || type >= VM_CAP_MAX)
1370		return (EINVAL);
1371
1372	return (VMGETCAP(vm->cookie, vcpu, type, retval));
1373}
1374
1375int
1376vm_set_capability(struct vm *vm, int vcpu, int type, int val)
1377{
1378	if (vcpu < 0 || vcpu >= VM_MAXCPU)
1379		return (EINVAL);
1380
1381	if (type < 0 || type >= VM_CAP_MAX)
1382		return (EINVAL);
1383
1384	return (VMSETCAP(vm->cookie, vcpu, type, val));
1385}
1386
1387uint64_t *
1388vm_guest_msrs(struct vm *vm, int cpu)
1389{
1390	return (vm->vcpu[cpu].guest_msrs);
1391}
1392
1393struct vlapic *
1394vm_lapic(struct vm *vm, int cpu)
1395{
1396	return (vm->vcpu[cpu].vlapic);
1397}
1398
1399struct vioapic *
1400vm_ioapic(struct vm *vm)
1401{
1402
1403	return (vm->vioapic);
1404}
1405
1406struct vhpet *
1407vm_hpet(struct vm *vm)
1408{
1409
1410	return (vm->vhpet);
1411}
1412
1413boolean_t
1414vmm_is_pptdev(int bus, int slot, int func)
1415{
1416	int found, i, n;
1417	int b, s, f;
1418	char *val, *cp, *cp2;
1419
1420	/*
1421	 * XXX
1422	 * The length of an environment variable is limited to 128 bytes which
1423	 * puts an upper limit on the number of passthru devices that may be
1424	 * specified using a single environment variable.
1425	 *
1426	 * Work around this by scanning multiple environment variable
1427	 * names instead of a single one - yuck!
1428	 */
1429	const char *names[] = { "pptdevs", "pptdevs2", "pptdevs3", NULL };
1430
1431	/* set pptdevs="1/2/3 4/5/6 7/8/9 10/11/12" */
1432	found = 0;
1433	for (i = 0; names[i] != NULL && !found; i++) {
1434		cp = val = getenv(names[i]);
1435		while (cp != NULL && *cp != '\0') {
1436			if ((cp2 = strchr(cp, ' ')) != NULL)
1437				*cp2 = '\0';
1438
1439			n = sscanf(cp, "%d/%d/%d", &b, &s, &f);
1440			if (n == 3 && bus == b && slot == s && func == f) {
1441				found = 1;
1442				break;
1443			}
1444
1445			if (cp2 != NULL)
1446				*cp2++ = ' ';
1447
1448			cp = cp2;
1449		}
1450		freeenv(val);
1451	}
1452	return (found);
1453}
1454
1455void *
1456vm_iommu_domain(struct vm *vm)
1457{
1458
1459	return (vm->iommu);
1460}
1461
1462int
1463vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state newstate,
1464    bool from_idle)
1465{
1466	int error;
1467	struct vcpu *vcpu;
1468
1469	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1470		panic("vm_set_run_state: invalid vcpuid %d", vcpuid);
1471
1472	vcpu = &vm->vcpu[vcpuid];
1473
1474	vcpu_lock(vcpu);
1475	error = vcpu_set_state_locked(vcpu, newstate, from_idle);
1476	vcpu_unlock(vcpu);
1477
1478	return (error);
1479}
1480
1481enum vcpu_state
1482vcpu_get_state(struct vm *vm, int vcpuid, int *hostcpu)
1483{
1484	struct vcpu *vcpu;
1485	enum vcpu_state state;
1486
1487	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1488		panic("vm_get_run_state: invalid vcpuid %d", vcpuid);
1489
1490	vcpu = &vm->vcpu[vcpuid];
1491
1492	vcpu_lock(vcpu);
1493	state = vcpu->state;
1494	if (hostcpu != NULL)
1495		*hostcpu = vcpu->hostcpu;
1496	vcpu_unlock(vcpu);
1497
1498	return (state);
1499}
1500
1501void
1502vm_activate_cpu(struct vm *vm, int vcpuid)
1503{
1504
1505	KASSERT(vcpuid >= 0 && vcpuid < VM_MAXCPU,
1506	    ("vm_activate_cpu: invalid vcpuid %d", vcpuid));
1507	KASSERT(!CPU_ISSET(vcpuid, &vm->active_cpus),
1508	    ("vm_activate_cpu: vcpuid %d is already active", vcpuid));
1509
1510	VCPU_CTR0(vm, vcpuid, "activated");
1511	CPU_SET_ATOMIC(vcpuid, &vm->active_cpus);
1512}
1513
1514static void
1515vm_deactivate_cpu(struct vm *vm, int vcpuid)
1516{
1517
1518	KASSERT(vcpuid >= 0 && vcpuid < VM_MAXCPU,
1519	    ("vm_deactivate_cpu: invalid vcpuid %d", vcpuid));
1520	KASSERT(CPU_ISSET(vcpuid, &vm->active_cpus),
1521	    ("vm_deactivate_cpu: vcpuid %d is not active", vcpuid));
1522
1523	VCPU_CTR0(vm, vcpuid, "deactivated");
1524	CPU_CLR_ATOMIC(vcpuid, &vm->active_cpus);
1525
1526	/*
1527	 * If a vcpu rendezvous is in progress then it could be blocked
1528	 * on 'vcpuid' - unblock it before disappearing forever.
1529	 */
1530	mtx_lock(&vm->rendezvous_mtx);
1531	if (vm->rendezvous_func != NULL) {
1532		VCPU_CTR0(vm, vcpuid, "unblock rendezvous after deactivation");
1533		wakeup(&vm->rendezvous_func);
1534	}
1535	mtx_unlock(&vm->rendezvous_mtx);
1536}
1537
1538cpuset_t
1539vm_active_cpus(struct vm *vm)
1540{
1541
1542	return (vm->active_cpus);
1543}
1544
1545void *
1546vcpu_stats(struct vm *vm, int vcpuid)
1547{
1548
1549	return (vm->vcpu[vcpuid].stats);
1550}
1551
1552int
1553vm_get_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state *state)
1554{
1555	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1556		return (EINVAL);
1557
1558	*state = vm->vcpu[vcpuid].x2apic_state;
1559
1560	return (0);
1561}
1562
1563int
1564vm_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state)
1565{
1566	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1567		return (EINVAL);
1568
1569	if (state >= X2APIC_STATE_LAST)
1570		return (EINVAL);
1571
1572	vm->vcpu[vcpuid].x2apic_state = state;
1573
1574	vlapic_set_x2apic_state(vm, vcpuid, state);
1575
1576	return (0);
1577}
1578
1579/*
1580 * This function is called to ensure that a vcpu "sees" a pending event
1581 * as soon as possible:
1582 * - If the vcpu thread is sleeping then it is woken up.
1583 * - If the vcpu is running on a different host_cpu then an IPI will be directed
1584 *   to the host_cpu to cause the vcpu to trap into the hypervisor.
1585 */
1586void
1587vcpu_notify_event(struct vm *vm, int vcpuid, bool lapic_intr)
1588{
1589	int hostcpu;
1590	struct vcpu *vcpu;
1591
1592	vcpu = &vm->vcpu[vcpuid];
1593
1594	vcpu_lock(vcpu);
1595	hostcpu = vcpu->hostcpu;
1596	if (vcpu->state == VCPU_RUNNING) {
1597		KASSERT(hostcpu != NOCPU, ("vcpu running on invalid hostcpu"));
1598		if (hostcpu != curcpu) {
1599			if (lapic_intr) {
1600				vlapic_post_intr(vcpu->vlapic, hostcpu,
1601				    vmm_ipinum);
1602			} else {
1603				ipi_cpu(hostcpu, vmm_ipinum);
1604			}
1605		} else {
1606			/*
1607			 * If the 'vcpu' is running on 'curcpu' then it must
1608			 * be sending a notification to itself (e.g. SELF_IPI).
1609			 * The pending event will be picked up when the vcpu
1610			 * transitions back to guest context.
1611			 */
1612		}
1613	} else {
1614		KASSERT(hostcpu == NOCPU, ("vcpu state %d not consistent "
1615		    "with hostcpu %d", vcpu->state, hostcpu));
1616		if (vcpu->state == VCPU_SLEEPING)
1617			wakeup_one(vcpu);
1618	}
1619	vcpu_unlock(vcpu);
1620}
1621
1622struct vmspace *
1623vm_get_vmspace(struct vm *vm)
1624{
1625
1626	return (vm->vmspace);
1627}
1628
1629int
1630vm_apicid2vcpuid(struct vm *vm, int apicid)
1631{
1632	/*
1633	 * XXX apic id is assumed to be numerically identical to vcpu id
1634	 */
1635	return (apicid);
1636}
1637
1638void
1639vm_smp_rendezvous(struct vm *vm, int vcpuid, cpuset_t dest,
1640    vm_rendezvous_func_t func, void *arg)
1641{
1642	int i;
1643
1644	/*
1645	 * Enforce that this function is called without any locks
1646	 */
1647	WITNESS_WARN(WARN_PANIC, NULL, "vm_smp_rendezvous");
1648	KASSERT(vcpuid == -1 || (vcpuid >= 0 && vcpuid < VM_MAXCPU),
1649	    ("vm_smp_rendezvous: invalid vcpuid %d", vcpuid));
1650
1651restart:
1652	mtx_lock(&vm->rendezvous_mtx);
1653	if (vm->rendezvous_func != NULL) {
1654		/*
1655		 * If a rendezvous is already in progress then we need to
1656		 * call the rendezvous handler in case this 'vcpuid' is one
1657		 * of the targets of the rendezvous.
1658		 */
1659		RENDEZVOUS_CTR0(vm, vcpuid, "Rendezvous already in progress");
1660		mtx_unlock(&vm->rendezvous_mtx);
1661		vm_handle_rendezvous(vm, vcpuid);
1662		goto restart;
1663	}
1664	KASSERT(vm->rendezvous_func == NULL, ("vm_smp_rendezvous: previous "
1665	    "rendezvous is still in progress"));
1666
1667	RENDEZVOUS_CTR0(vm, vcpuid, "Initiating rendezvous");
1668	vm->rendezvous_req_cpus = dest;
1669	CPU_ZERO(&vm->rendezvous_done_cpus);
1670	vm->rendezvous_arg = arg;
1671	vm_set_rendezvous_func(vm, func);
1672	mtx_unlock(&vm->rendezvous_mtx);
1673
1674	/*
1675	 * Wake up any sleeping vcpus and trigger a VM-exit in any running
1676	 * vcpus so they handle the rendezvous as soon as possible.
1677	 */
1678	for (i = 0; i < VM_MAXCPU; i++) {
1679		if (CPU_ISSET(i, &dest))
1680			vcpu_notify_event(vm, i, false);
1681	}
1682
1683	vm_handle_rendezvous(vm, vcpuid);
1684}
1685