vmm.c revision 267399
1/*-
2 * Copyright (c) 2011 NetApp, Inc.
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 *
26 * $FreeBSD: stable/10/sys/amd64/vmm/vmm.c 267399 2014-06-12 15:20:59Z jhb $
27 */
28
29#include <sys/cdefs.h>
30__FBSDID("$FreeBSD: stable/10/sys/amd64/vmm/vmm.c 267399 2014-06-12 15:20:59Z jhb $");
31
32#include <sys/param.h>
33#include <sys/systm.h>
34#include <sys/kernel.h>
35#include <sys/module.h>
36#include <sys/sysctl.h>
37#include <sys/malloc.h>
38#include <sys/pcpu.h>
39#include <sys/lock.h>
40#include <sys/mutex.h>
41#include <sys/proc.h>
42#include <sys/rwlock.h>
43#include <sys/sched.h>
44#include <sys/smp.h>
45#include <sys/systm.h>
46
47#include <vm/vm.h>
48#include <vm/vm_object.h>
49#include <vm/vm_page.h>
50#include <vm/pmap.h>
51#include <vm/vm_map.h>
52#include <vm/vm_extern.h>
53#include <vm/vm_param.h>
54
55#include <machine/cpu.h>
56#include <machine/vm.h>
57#include <machine/pcb.h>
58#include <machine/smp.h>
59#include <x86/psl.h>
60#include <x86/apicreg.h>
61#include <machine/vmparam.h>
62
63#include <machine/vmm.h>
64#include <machine/vmm_dev.h>
65
66#include "vmm_ktr.h"
67#include "vmm_host.h"
68#include "vmm_mem.h"
69#include "vmm_util.h"
70#include "vhpet.h"
71#include "vioapic.h"
72#include "vlapic.h"
73#include "vmm_msr.h"
74#include "vmm_ipi.h"
75#include "vmm_stat.h"
76#include "vmm_lapic.h"
77
78#include "io/ppt.h"
79#include "io/iommu.h"
80
81struct vlapic;
82
83struct vcpu {
84	int		flags;
85	enum vcpu_state	state;
86	struct mtx	mtx;
87	int		hostcpu;	/* host cpuid this vcpu last ran on */
88	uint64_t	guest_msrs[VMM_MSR_NUM];
89	struct vlapic	*vlapic;
90	int		 vcpuid;
91	struct savefpu	*guestfpu;	/* guest fpu state */
92	void		*stats;
93	struct vm_exit	exitinfo;
94	enum x2apic_state x2apic_state;
95	int		nmi_pending;
96};
97
/* Helpers for the per-vcpu spin mutex. */
#define	vcpu_lock_init(v)	mtx_init(&(v)->mtx, "vcpu lock", 0, MTX_SPIN)
#define	vcpu_lock(v)		mtx_lock_spin(&(v)->mtx)
#define	vcpu_unlock(v)		mtx_unlock_spin(&(v)->mtx)
#define	vcpu_assert_locked(v)	mtx_assert(&(v)->mtx, MA_OWNED)
102
103struct mem_seg {
104	vm_paddr_t	gpa;
105	size_t		len;
106	boolean_t	wired;
107	vm_object_t	object;
108};
109#define	VM_MAX_MEMORY_SEGMENTS	2
110
111struct vm {
112	void		*cookie;	/* processor-specific data */
113	void		*iommu;		/* iommu-specific data */
114	struct vhpet	*vhpet;		/* virtual HPET */
115	struct vioapic	*vioapic;	/* virtual ioapic */
116	struct vmspace	*vmspace;	/* guest's address space */
117	struct vcpu	vcpu[VM_MAXCPU];
118	int		num_mem_segs;
119	struct mem_seg	mem_segs[VM_MAX_MEMORY_SEGMENTS];
120	char		name[VM_MAX_NAMELEN];
121
122	/*
123	 * Set of active vcpus.
124	 * An active vcpu is one that has been started implicitly (BSP) or
125	 * explicitly (AP) by sending it a startup ipi.
126	 */
127	volatile cpuset_t active_cpus;
128
129	struct mtx	rendezvous_mtx;
130	cpuset_t	rendezvous_req_cpus;
131	cpuset_t	rendezvous_done_cpus;
132	void		*rendezvous_arg;
133	vm_rendezvous_func_t rendezvous_func;
134};
135
/* Set to 1 by the MOD_LOAD handler once vmm_init() has succeeded. */
static int vmm_initialized;

/*
 * Processor-specific backend chosen in vmm_init().  Every wrapper below
 * falls back to a fixed value while 'ops' is still NULL.
 */
static struct vmm_ops *ops;

/* Module-wide operations. */
#define	VMM_INIT(num)	(ops == NULL ? 0 : (*ops->init)(num))
#define	VMM_CLEANUP()	(ops == NULL ? 0 : (*ops->cleanup)())
#define	VMM_RESUME()	(ops == NULL ? 0 : (*ops->resume)())

/* Per virtual machine operations. */
#define	VMINIT(vm, pmap) (ops == NULL ? NULL : (*ops->vminit)(vm, pmap))
#define	VMRUN(vmi, vcpu, rip, pmap, rptr) \
	(ops == NULL ? ENXIO : (*ops->vmrun)(vmi, vcpu, rip, pmap, rptr))
#define	VMCLEANUP(vmi)	(ops == NULL ? NULL : (*ops->vmcleanup)(vmi))
#define	VMSPACE_ALLOC(min, max) \
	(ops == NULL ? NULL : (*ops->vmspace_alloc)(min, max))
#define	VMSPACE_FREE(vmspace) \
	(ops == NULL ? ENXIO : (*ops->vmspace_free)(vmspace))

/* Per vcpu register, descriptor, event and capability accessors. */
#define	VMGETREG(vmi, vcpu, num, retval)		\
	(ops == NULL ? ENXIO : (*ops->vmgetreg)(vmi, vcpu, num, retval))
#define	VMSETREG(vmi, vcpu, num, val)		\
	(ops == NULL ? ENXIO : (*ops->vmsetreg)(vmi, vcpu, num, val))
#define	VMGETDESC(vmi, vcpu, num, desc)		\
	(ops == NULL ? ENXIO : (*ops->vmgetdesc)(vmi, vcpu, num, desc))
#define	VMSETDESC(vmi, vcpu, num, desc)		\
	(ops == NULL ? ENXIO : (*ops->vmsetdesc)(vmi, vcpu, num, desc))
#define	VMINJECT(vmi, vcpu, type, vec, ec, ecv)	\
	(ops == NULL ? ENXIO : (*ops->vminject)(vmi, vcpu, type, vec, ec, ecv))
#define	VMGETCAP(vmi, vcpu, num, retval)	\
	(ops == NULL ? ENXIO : (*ops->vmgetcap)(vmi, vcpu, num, retval))
#define	VMSETCAP(vmi, vcpu, num, val)		\
	(ops == NULL ? ENXIO : (*ops->vmsetcap)(vmi, vcpu, num, val))

/* Virtual local APIC life cycle. */
#define	VLAPIC_INIT(vmi, vcpu)			\
	(ops == NULL ? NULL : (*ops->vlapic_init)(vmi, vcpu))
#define	VLAPIC_CLEANUP(vmi, vlapic)		\
	(ops == NULL ? NULL : (*ops->vlapic_cleanup)(vmi, vlapic))
169
170#define	fpu_start_emulating()	load_cr0(rcr0() | CR0_TS)
171#define	fpu_stop_emulating()	clts()
172
173static MALLOC_DEFINE(M_VM, "vm", "vm");
174CTASSERT(VMM_MSR_NUM <= 64);	/* msr_mask can keep track of up to 64 msrs */
175
176/* statistics */
177static VMM_STAT(VCPU_TOTAL_RUNTIME, "vcpu total runtime");
178
179SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW, NULL, NULL);
180
181static int vmm_ipinum;
182SYSCTL_INT(_hw_vmm, OID_AUTO, ipinum, CTLFLAG_RD, &vmm_ipinum, 0,
183    "IPI vector used for vcpu notifications");
184
185static void vm_deactivate_cpu(struct vm *vm, int vcpuid);
186
187static void
188vcpu_cleanup(struct vm *vm, int i)
189{
190	struct vcpu *vcpu = &vm->vcpu[i];
191
192	VLAPIC_CLEANUP(vm->cookie, vcpu->vlapic);
193	vmm_stat_free(vcpu->stats);
194	fpu_save_area_free(vcpu->guestfpu);
195}
196
197static void
198vcpu_init(struct vm *vm, uint32_t vcpu_id)
199{
200	struct vcpu *vcpu;
201
202	vcpu = &vm->vcpu[vcpu_id];
203
204	vcpu_lock_init(vcpu);
205	vcpu->hostcpu = NOCPU;
206	vcpu->vcpuid = vcpu_id;
207	vcpu->vlapic = VLAPIC_INIT(vm->cookie, vcpu_id);
208	vm_set_x2apic_state(vm, vcpu_id, X2APIC_ENABLED);
209	vcpu->guestfpu = fpu_save_area_alloc();
210	fpu_save_area_reset(vcpu->guestfpu);
211	vcpu->stats = vmm_stat_alloc();
212}
213
214struct vm_exit *
215vm_exitinfo(struct vm *vm, int cpuid)
216{
217	struct vcpu *vcpu;
218
219	if (cpuid < 0 || cpuid >= VM_MAXCPU)
220		panic("vm_exitinfo: invalid cpuid %d", cpuid);
221
222	vcpu = &vm->vcpu[cpuid];
223
224	return (&vcpu->exitinfo);
225}
226
227static void
228vmm_resume(void)
229{
230	VMM_RESUME();
231}
232
233static int
234vmm_init(void)
235{
236	int error;
237
238	vmm_host_state_init();
239
240	vmm_ipinum = vmm_ipi_alloc();
241	if (vmm_ipinum == 0)
242		vmm_ipinum = IPI_AST;
243
244	error = vmm_mem_init();
245	if (error)
246		return (error);
247
248	if (vmm_is_intel())
249		ops = &vmm_ops_intel;
250	else if (vmm_is_amd())
251		ops = &vmm_ops_amd;
252	else
253		return (ENXIO);
254
255	vmm_msr_init();
256	vmm_resume_p = vmm_resume;
257
258	return (VMM_INIT(vmm_ipinum));
259}
260
261static int
262vmm_handler(module_t mod, int what, void *arg)
263{
264	int error;
265
266	switch (what) {
267	case MOD_LOAD:
268		vmmdev_init();
269		if (ppt_avail_devices() > 0)
270			iommu_init();
271		error = vmm_init();
272		if (error == 0)
273			vmm_initialized = 1;
274		break;
275	case MOD_UNLOAD:
276		error = vmmdev_cleanup();
277		if (error == 0) {
278			vmm_resume_p = NULL;
279			iommu_cleanup();
280			if (vmm_ipinum != IPI_AST)
281				vmm_ipi_free(vmm_ipinum);
282			error = VMM_CLEANUP();
283			/*
284			 * Something bad happened - prevent new
285			 * VMs from being created
286			 */
287			if (error)
288				vmm_initialized = 0;
289		}
290		break;
291	default:
292		error = 0;
293		break;
294	}
295	return (error);
296}
297
298static moduledata_t vmm_kmod = {
299	"vmm",
300	vmm_handler,
301	NULL
302};
303
304/*
305 * vmm initialization has the following dependencies:
306 *
307 * - iommu initialization must happen after the pci passthru driver has had
308 *   a chance to attach to any passthru devices (after SI_SUB_CONFIGURE).
309 *
310 * - VT-x initialization requires smp_rendezvous() and therefore must happen
311 *   after SMP is fully functional (after SI_SUB_SMP).
312 */
313DECLARE_MODULE(vmm, vmm_kmod, SI_SUB_SMP + 1, SI_ORDER_ANY);
314MODULE_VERSION(vmm, 1);
315
316int
317vm_create(const char *name, struct vm **retvm)
318{
319	int i;
320	struct vm *vm;
321	struct vmspace *vmspace;
322
323	const int BSP = 0;
324
325	/*
326	 * If vmm.ko could not be successfully initialized then don't attempt
327	 * to create the virtual machine.
328	 */
329	if (!vmm_initialized)
330		return (ENXIO);
331
332	if (name == NULL || strlen(name) >= VM_MAX_NAMELEN)
333		return (EINVAL);
334
335	vmspace = VMSPACE_ALLOC(VM_MIN_ADDRESS, VM_MAXUSER_ADDRESS);
336	if (vmspace == NULL)
337		return (ENOMEM);
338
339	vm = malloc(sizeof(struct vm), M_VM, M_WAITOK | M_ZERO);
340	strcpy(vm->name, name);
341	vm->vmspace = vmspace;
342	mtx_init(&vm->rendezvous_mtx, "vm rendezvous lock", 0, MTX_DEF);
343	vm->cookie = VMINIT(vm, vmspace_pmap(vmspace));
344	vm->vioapic = vioapic_init(vm);
345	vm->vhpet = vhpet_init(vm);
346
347	for (i = 0; i < VM_MAXCPU; i++) {
348		vcpu_init(vm, i);
349		guest_msrs_init(vm, i);
350	}
351
352	vm_activate_cpu(vm, BSP);
353
354	*retvm = vm;
355	return (0);
356}
357
358static void
359vm_free_mem_seg(struct vm *vm, struct mem_seg *seg)
360{
361
362	if (seg->object != NULL)
363		vmm_mem_free(vm->vmspace, seg->gpa, seg->len);
364
365	bzero(seg, sizeof(*seg));
366}
367
368void
369vm_destroy(struct vm *vm)
370{
371	int i;
372
373	ppt_unassign_all(vm);
374
375	if (vm->iommu != NULL)
376		iommu_destroy_domain(vm->iommu);
377
378	vhpet_cleanup(vm->vhpet);
379	vioapic_cleanup(vm->vioapic);
380
381	for (i = 0; i < vm->num_mem_segs; i++)
382		vm_free_mem_seg(vm, &vm->mem_segs[i]);
383
384	vm->num_mem_segs = 0;
385
386	for (i = 0; i < VM_MAXCPU; i++)
387		vcpu_cleanup(vm, i);
388
389	VMSPACE_FREE(vm->vmspace);
390
391	VMCLEANUP(vm->cookie);
392
393	free(vm, M_VM);
394}
395
396const char *
397vm_name(struct vm *vm)
398{
399	return (vm->name);
400}
401
402int
403vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
404{
405	vm_object_t obj;
406
407	if ((obj = vmm_mmio_alloc(vm->vmspace, gpa, len, hpa)) == NULL)
408		return (ENOMEM);
409	else
410		return (0);
411}
412
413int
414vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len)
415{
416
417	vmm_mmio_free(vm->vmspace, gpa, len);
418	return (0);
419}
420
421boolean_t
422vm_mem_allocated(struct vm *vm, vm_paddr_t gpa)
423{
424	int i;
425	vm_paddr_t gpabase, gpalimit;
426
427	for (i = 0; i < vm->num_mem_segs; i++) {
428		gpabase = vm->mem_segs[i].gpa;
429		gpalimit = gpabase + vm->mem_segs[i].len;
430		if (gpa >= gpabase && gpa < gpalimit)
431			return (TRUE);		/* 'gpa' is regular memory */
432	}
433
434	if (ppt_is_mmio(vm, gpa))
435		return (TRUE);			/* 'gpa' is pci passthru mmio */
436
437	return (FALSE);
438}
439
440int
441vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len)
442{
443	int available, allocated;
444	struct mem_seg *seg;
445	vm_object_t object;
446	vm_paddr_t g;
447
448	if ((gpa & PAGE_MASK) || (len & PAGE_MASK) || len == 0)
449		return (EINVAL);
450
451	available = allocated = 0;
452	g = gpa;
453	while (g < gpa + len) {
454		if (vm_mem_allocated(vm, g))
455			allocated++;
456		else
457			available++;
458
459		g += PAGE_SIZE;
460	}
461
462	/*
463	 * If there are some allocated and some available pages in the address
464	 * range then it is an error.
465	 */
466	if (allocated && available)
467		return (EINVAL);
468
469	/*
470	 * If the entire address range being requested has already been
471	 * allocated then there isn't anything more to do.
472	 */
473	if (allocated && available == 0)
474		return (0);
475
476	if (vm->num_mem_segs >= VM_MAX_MEMORY_SEGMENTS)
477		return (E2BIG);
478
479	seg = &vm->mem_segs[vm->num_mem_segs];
480
481	if ((object = vmm_mem_alloc(vm->vmspace, gpa, len)) == NULL)
482		return (ENOMEM);
483
484	seg->gpa = gpa;
485	seg->len = len;
486	seg->object = object;
487	seg->wired = FALSE;
488
489	vm->num_mem_segs++;
490
491	return (0);
492}
493
494static void
495vm_gpa_unwire(struct vm *vm)
496{
497	int i, rv;
498	struct mem_seg *seg;
499
500	for (i = 0; i < vm->num_mem_segs; i++) {
501		seg = &vm->mem_segs[i];
502		if (!seg->wired)
503			continue;
504
505		rv = vm_map_unwire(&vm->vmspace->vm_map,
506				   seg->gpa, seg->gpa + seg->len,
507				   VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
508		KASSERT(rv == KERN_SUCCESS, ("vm(%s) memory segment "
509		    "%#lx/%ld could not be unwired: %d",
510		    vm_name(vm), seg->gpa, seg->len, rv));
511
512		seg->wired = FALSE;
513	}
514}
515
516static int
517vm_gpa_wire(struct vm *vm)
518{
519	int i, rv;
520	struct mem_seg *seg;
521
522	for (i = 0; i < vm->num_mem_segs; i++) {
523		seg = &vm->mem_segs[i];
524		if (seg->wired)
525			continue;
526
527		/* XXX rlimits? */
528		rv = vm_map_wire(&vm->vmspace->vm_map,
529				 seg->gpa, seg->gpa + seg->len,
530				 VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
531		if (rv != KERN_SUCCESS)
532			break;
533
534		seg->wired = TRUE;
535	}
536
537	if (i < vm->num_mem_segs) {
538		/*
539		 * Undo the wiring before returning an error.
540		 */
541		vm_gpa_unwire(vm);
542		return (EAGAIN);
543	}
544
545	return (0);
546}
547
548static void
549vm_iommu_modify(struct vm *vm, boolean_t map)
550{
551	int i, sz;
552	vm_paddr_t gpa, hpa;
553	struct mem_seg *seg;
554	void *vp, *cookie, *host_domain;
555
556	sz = PAGE_SIZE;
557	host_domain = iommu_host_domain();
558
559	for (i = 0; i < vm->num_mem_segs; i++) {
560		seg = &vm->mem_segs[i];
561		KASSERT(seg->wired, ("vm(%s) memory segment %#lx/%ld not wired",
562		    vm_name(vm), seg->gpa, seg->len));
563
564		gpa = seg->gpa;
565		while (gpa < seg->gpa + seg->len) {
566			vp = vm_gpa_hold(vm, gpa, PAGE_SIZE, VM_PROT_WRITE,
567					 &cookie);
568			KASSERT(vp != NULL, ("vm(%s) could not map gpa %#lx",
569			    vm_name(vm), gpa));
570
571			vm_gpa_release(cookie);
572
573			hpa = DMAP_TO_PHYS((uintptr_t)vp);
574			if (map) {
575				iommu_create_mapping(vm->iommu, gpa, hpa, sz);
576				iommu_remove_mapping(host_domain, hpa, sz);
577			} else {
578				iommu_remove_mapping(vm->iommu, gpa, sz);
579				iommu_create_mapping(host_domain, hpa, hpa, sz);
580			}
581
582			gpa += PAGE_SIZE;
583		}
584	}
585
586	/*
587	 * Invalidate the cached translations associated with the domain
588	 * from which pages were removed.
589	 */
590	if (map)
591		iommu_invalidate_tlb(host_domain);
592	else
593		iommu_invalidate_tlb(vm->iommu);
594}
595
596#define	vm_iommu_unmap(vm)	vm_iommu_modify((vm), FALSE)
597#define	vm_iommu_map(vm)	vm_iommu_modify((vm), TRUE)
598
/*
 * Detach the passthru device bus/slot/func from 'vm'.  When it was the
 * last one, the guest memory is removed from the iommu and unwired.
 *
 * Returns 0 on success or the error from ppt_unassign_device().
 */
int
vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func)
{
	int rc;

	rc = ppt_unassign_device(vm, bus, slot, func);
	if (rc != 0)
		return (rc);

	if (ppt_assigned_devices(vm) != 0)
		return (0);

	/* No passthru devices left. */
	vm_iommu_unmap(vm);
	vm_gpa_unwire(vm);
	return (0);
}
614
615int
616vm_assign_pptdev(struct vm *vm, int bus, int slot, int func)
617{
618	int error;
619	vm_paddr_t maxaddr;
620
621	/*
622	 * Virtual machines with pci passthru devices get special treatment:
623	 * - the guest physical memory is wired
624	 * - the iommu is programmed to do the 'gpa' to 'hpa' translation
625	 *
626	 * We need to do this before the first pci passthru device is attached.
627	 */
628	if (ppt_assigned_devices(vm) == 0) {
629		KASSERT(vm->iommu == NULL,
630		    ("vm_assign_pptdev: iommu must be NULL"));
631		maxaddr = vmm_mem_maxaddr();
632		vm->iommu = iommu_create_domain(maxaddr);
633
634		error = vm_gpa_wire(vm);
635		if (error)
636			return (error);
637
638		vm_iommu_map(vm);
639	}
640
641	error = ppt_assign_device(vm, bus, slot, func);
642	return (error);
643}
644
645void *
646vm_gpa_hold(struct vm *vm, vm_paddr_t gpa, size_t len, int reqprot,
647	    void **cookie)
648{
649	int count, pageoff;
650	vm_page_t m;
651
652	pageoff = gpa & PAGE_MASK;
653	if (len > PAGE_SIZE - pageoff)
654		panic("vm_gpa_hold: invalid gpa/len: 0x%016lx/%lu", gpa, len);
655
656	count = vm_fault_quick_hold_pages(&vm->vmspace->vm_map,
657	    trunc_page(gpa), PAGE_SIZE, reqprot, &m, 1);
658
659	if (count == 1) {
660		*cookie = m;
661		return ((void *)(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)) + pageoff));
662	} else {
663		*cookie = NULL;
664		return (NULL);
665	}
666}
667
668void
669vm_gpa_release(void *cookie)
670{
671	vm_page_t m = cookie;
672
673	vm_page_lock(m);
674	vm_page_unhold(m);
675	vm_page_unlock(m);
676}
677
678int
679vm_gpabase2memseg(struct vm *vm, vm_paddr_t gpabase,
680		  struct vm_memory_segment *seg)
681{
682	int i;
683
684	for (i = 0; i < vm->num_mem_segs; i++) {
685		if (gpabase == vm->mem_segs[i].gpa) {
686			seg->gpa = vm->mem_segs[i].gpa;
687			seg->len = vm->mem_segs[i].len;
688			seg->wired = vm->mem_segs[i].wired;
689			return (0);
690		}
691	}
692	return (-1);
693}
694
695int
696vm_get_memobj(struct vm *vm, vm_paddr_t gpa, size_t len,
697	      vm_offset_t *offset, struct vm_object **object)
698{
699	int i;
700	size_t seg_len;
701	vm_paddr_t seg_gpa;
702	vm_object_t seg_obj;
703
704	for (i = 0; i < vm->num_mem_segs; i++) {
705		if ((seg_obj = vm->mem_segs[i].object) == NULL)
706			continue;
707
708		seg_gpa = vm->mem_segs[i].gpa;
709		seg_len = vm->mem_segs[i].len;
710
711		if (gpa >= seg_gpa && gpa < seg_gpa + seg_len) {
712			*offset = gpa - seg_gpa;
713			*object = seg_obj;
714			vm_object_reference(seg_obj);
715			return (0);
716		}
717	}
718
719	return (EINVAL);
720}
721
722int
723vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval)
724{
725
726	if (vcpu < 0 || vcpu >= VM_MAXCPU)
727		return (EINVAL);
728
729	if (reg >= VM_REG_LAST)
730		return (EINVAL);
731
732	return (VMGETREG(vm->cookie, vcpu, reg, retval));
733}
734
735int
736vm_set_register(struct vm *vm, int vcpu, int reg, uint64_t val)
737{
738
739	if (vcpu < 0 || vcpu >= VM_MAXCPU)
740		return (EINVAL);
741
742	if (reg >= VM_REG_LAST)
743		return (EINVAL);
744
745	return (VMSETREG(vm->cookie, vcpu, reg, val));
746}
747
748static boolean_t
749is_descriptor_table(int reg)
750{
751
752	switch (reg) {
753	case VM_REG_GUEST_IDTR:
754	case VM_REG_GUEST_GDTR:
755		return (TRUE);
756	default:
757		return (FALSE);
758	}
759}
760
761static boolean_t
762is_segment_register(int reg)
763{
764
765	switch (reg) {
766	case VM_REG_GUEST_ES:
767	case VM_REG_GUEST_CS:
768	case VM_REG_GUEST_SS:
769	case VM_REG_GUEST_DS:
770	case VM_REG_GUEST_FS:
771	case VM_REG_GUEST_GS:
772	case VM_REG_GUEST_TR:
773	case VM_REG_GUEST_LDTR:
774		return (TRUE);
775	default:
776		return (FALSE);
777	}
778}
779
780int
781vm_get_seg_desc(struct vm *vm, int vcpu, int reg,
782		struct seg_desc *desc)
783{
784
785	if (vcpu < 0 || vcpu >= VM_MAXCPU)
786		return (EINVAL);
787
788	if (!is_segment_register(reg) && !is_descriptor_table(reg))
789		return (EINVAL);
790
791	return (VMGETDESC(vm->cookie, vcpu, reg, desc));
792}
793
794int
795vm_set_seg_desc(struct vm *vm, int vcpu, int reg,
796		struct seg_desc *desc)
797{
798	if (vcpu < 0 || vcpu >= VM_MAXCPU)
799		return (EINVAL);
800
801	if (!is_segment_register(reg) && !is_descriptor_table(reg))
802		return (EINVAL);
803
804	return (VMSETDESC(vm->cookie, vcpu, reg, desc));
805}
806
807static void
808restore_guest_fpustate(struct vcpu *vcpu)
809{
810
811	/* flush host state to the pcb */
812	fpuexit(curthread);
813
814	/* restore guest FPU state */
815	fpu_stop_emulating();
816	fpurestore(vcpu->guestfpu);
817
818	/*
819	 * The FPU is now "dirty" with the guest's state so turn on emulation
820	 * to trap any access to the FPU by the host.
821	 */
822	fpu_start_emulating();
823}
824
825static void
826save_guest_fpustate(struct vcpu *vcpu)
827{
828
829	if ((rcr0() & CR0_TS) == 0)
830		panic("fpu emulation not enabled in host!");
831
832	/* save guest FPU state */
833	fpu_stop_emulating();
834	fpusave(vcpu->guestfpu);
835	fpu_start_emulating();
836}
837
838static VMM_STAT(VCPU_IDLE_TICKS, "number of ticks vcpu was idle");
839
840static int
841vcpu_set_state_locked(struct vcpu *vcpu, enum vcpu_state newstate,
842    bool from_idle)
843{
844	int error;
845
846	vcpu_assert_locked(vcpu);
847
848	/*
849	 * State transitions from the vmmdev_ioctl() must always begin from
850	 * the VCPU_IDLE state. This guarantees that there is only a single
851	 * ioctl() operating on a vcpu at any point.
852	 */
853	if (from_idle) {
854		while (vcpu->state != VCPU_IDLE)
855			msleep_spin(&vcpu->state, &vcpu->mtx, "vmstat", hz);
856	} else {
857		KASSERT(vcpu->state != VCPU_IDLE, ("invalid transition from "
858		    "vcpu idle state"));
859	}
860
861	if (vcpu->state == VCPU_RUNNING) {
862		KASSERT(vcpu->hostcpu == curcpu, ("curcpu %d and hostcpu %d "
863		    "mismatch for running vcpu", curcpu, vcpu->hostcpu));
864	} else {
865		KASSERT(vcpu->hostcpu == NOCPU, ("Invalid hostcpu %d for a "
866		    "vcpu that is not running", vcpu->hostcpu));
867	}
868
869	/*
870	 * The following state transitions are allowed:
871	 * IDLE -> FROZEN -> IDLE
872	 * FROZEN -> RUNNING -> FROZEN
873	 * FROZEN -> SLEEPING -> FROZEN
874	 */
875	switch (vcpu->state) {
876	case VCPU_IDLE:
877	case VCPU_RUNNING:
878	case VCPU_SLEEPING:
879		error = (newstate != VCPU_FROZEN);
880		break;
881	case VCPU_FROZEN:
882		error = (newstate == VCPU_FROZEN);
883		break;
884	default:
885		error = 1;
886		break;
887	}
888
889	if (error)
890		return (EBUSY);
891
892	vcpu->state = newstate;
893	if (newstate == VCPU_RUNNING)
894		vcpu->hostcpu = curcpu;
895	else
896		vcpu->hostcpu = NOCPU;
897
898	if (newstate == VCPU_IDLE)
899		wakeup(&vcpu->state);
900
901	return (0);
902}
903
904static void
905vcpu_require_state(struct vm *vm, int vcpuid, enum vcpu_state newstate)
906{
907	int error;
908
909	if ((error = vcpu_set_state(vm, vcpuid, newstate, false)) != 0)
910		panic("Error %d setting state to %d\n", error, newstate);
911}
912
913static void
914vcpu_require_state_locked(struct vcpu *vcpu, enum vcpu_state newstate)
915{
916	int error;
917
918	if ((error = vcpu_set_state_locked(vcpu, newstate, false)) != 0)
919		panic("Error %d setting state to %d", error, newstate);
920}
921
922static void
923vm_set_rendezvous_func(struct vm *vm, vm_rendezvous_func_t func)
924{
925
926	KASSERT(mtx_owned(&vm->rendezvous_mtx), ("rendezvous_mtx not locked"));
927
928	/*
929	 * Update 'rendezvous_func' and execute a write memory barrier to
930	 * ensure that it is visible across all host cpus. This is not needed
931	 * for correctness but it does ensure that all the vcpus will notice
932	 * that the rendezvous is requested immediately.
933	 */
934	vm->rendezvous_func = func;
935	wmb();
936}
937
/*
 * Emit a trace record for the rendezvous code, which may run without a
 * vcpu context (vcpuid == -1): a negative 'vcpuid' selects the per-vm
 * trace macro, anything else the per-vcpu one.
 */
#define	RENDEZVOUS_CTR0(vm, vcpuid, fmt)				\
	do {								\
		if ((vcpuid) < 0)					\
			VM_CTR0(vm, fmt);				\
		else							\
			VCPU_CTR0(vm, vcpuid, fmt);			\
	} while (0)
945
946static void
947vm_handle_rendezvous(struct vm *vm, int vcpuid)
948{
949
950	KASSERT(vcpuid == -1 || (vcpuid >= 0 && vcpuid < VM_MAXCPU),
951	    ("vm_handle_rendezvous: invalid vcpuid %d", vcpuid));
952
953	mtx_lock(&vm->rendezvous_mtx);
954	while (vm->rendezvous_func != NULL) {
955		/* 'rendezvous_req_cpus' must be a subset of 'active_cpus' */
956		CPU_AND(&vm->rendezvous_req_cpus, &vm->active_cpus);
957
958		if (vcpuid != -1 &&
959		    CPU_ISSET(vcpuid, &vm->rendezvous_req_cpus) &&
960		    !CPU_ISSET(vcpuid, &vm->rendezvous_done_cpus)) {
961			VCPU_CTR0(vm, vcpuid, "Calling rendezvous func");
962			(*vm->rendezvous_func)(vm, vcpuid, vm->rendezvous_arg);
963			CPU_SET(vcpuid, &vm->rendezvous_done_cpus);
964		}
965		if (CPU_CMP(&vm->rendezvous_req_cpus,
966		    &vm->rendezvous_done_cpus) == 0) {
967			VCPU_CTR0(vm, vcpuid, "Rendezvous completed");
968			vm_set_rendezvous_func(vm, NULL);
969			wakeup(&vm->rendezvous_func);
970			break;
971		}
972		RENDEZVOUS_CTR0(vm, vcpuid, "Wait for rendezvous completion");
973		mtx_sleep(&vm->rendezvous_func, &vm->rendezvous_mtx, 0,
974		    "vmrndv", 0);
975	}
976	mtx_unlock(&vm->rendezvous_mtx);
977}
978
979/*
980 * Emulate a guest 'hlt' by sleeping until the vcpu is ready to run.
981 */
982static int
983vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled, bool *retu)
984{
985	struct vm_exit *vmexit;
986	struct vcpu *vcpu;
987	int t, timo, spindown;
988
989	vcpu = &vm->vcpu[vcpuid];
990	spindown = 0;
991
992	vcpu_lock(vcpu);
993
994	/*
995	 * Do a final check for pending NMI or interrupts before
996	 * really putting this thread to sleep.
997	 *
998	 * These interrupts could have happened any time after we
999	 * returned from VMRUN() and before we grabbed the vcpu lock.
1000	 */
1001	if (!vm_nmi_pending(vm, vcpuid) &&
1002	    (intr_disabled || !vlapic_pending_intr(vcpu->vlapic, NULL))) {
1003		t = ticks;
1004		vcpu_require_state_locked(vcpu, VCPU_SLEEPING);
1005		if (vlapic_enabled(vcpu->vlapic)) {
1006			/*
1007			 * XXX msleep_spin() is not interruptible so use the
1008			 * 'timo' to put an upper bound on the sleep time.
1009			 */
1010			timo = hz;
1011			msleep_spin(vcpu, &vcpu->mtx, "vmidle", timo);
1012		} else {
1013			/*
1014			 * Spindown the vcpu if the apic is disabled and it
1015			 * had entered the halted state.
1016			 */
1017			spindown = 1;
1018		}
1019		vcpu_require_state_locked(vcpu, VCPU_FROZEN);
1020		vmm_stat_incr(vm, vcpuid, VCPU_IDLE_TICKS, ticks - t);
1021	}
1022	vcpu_unlock(vcpu);
1023
1024	/*
1025	 * Since 'vm_deactivate_cpu()' grabs a sleep mutex we must call it
1026	 * outside the confines of the vcpu spinlock.
1027	 */
1028	if (spindown) {
1029		*retu = true;
1030		vmexit = vm_exitinfo(vm, vcpuid);
1031		vmexit->exitcode = VM_EXITCODE_SPINDOWN_CPU;
1032		vm_deactivate_cpu(vm, vcpuid);
1033		VCPU_CTR0(vm, vcpuid, "spinning down cpu");
1034	}
1035
1036	return (0);
1037}
1038
1039static int
1040vm_handle_paging(struct vm *vm, int vcpuid, bool *retu)
1041{
1042	int rv, ftype;
1043	struct vm_map *map;
1044	struct vcpu *vcpu;
1045	struct vm_exit *vme;
1046
1047	vcpu = &vm->vcpu[vcpuid];
1048	vme = &vcpu->exitinfo;
1049
1050	ftype = vme->u.paging.fault_type;
1051	KASSERT(ftype == VM_PROT_READ ||
1052	    ftype == VM_PROT_WRITE || ftype == VM_PROT_EXECUTE,
1053	    ("vm_handle_paging: invalid fault_type %d", ftype));
1054
1055	if (ftype == VM_PROT_READ || ftype == VM_PROT_WRITE) {
1056		rv = pmap_emulate_accessed_dirty(vmspace_pmap(vm->vmspace),
1057		    vme->u.paging.gpa, ftype);
1058		if (rv == 0)
1059			goto done;
1060	}
1061
1062	map = &vm->vmspace->vm_map;
1063	rv = vm_fault(map, vme->u.paging.gpa, ftype, VM_FAULT_NORMAL);
1064
1065	VCPU_CTR3(vm, vcpuid, "vm_handle_paging rv = %d, gpa = %#lx, "
1066	    "ftype = %d", rv, vme->u.paging.gpa, ftype);
1067
1068	if (rv != KERN_SUCCESS)
1069		return (EFAULT);
1070done:
1071	/* restart execution at the faulting instruction */
1072	vme->inst_length = 0;
1073
1074	return (0);
1075}
1076
1077static int
1078vm_handle_inst_emul(struct vm *vm, int vcpuid, bool *retu)
1079{
1080	struct vie *vie;
1081	struct vcpu *vcpu;
1082	struct vm_exit *vme;
1083	int error, inst_length;
1084	uint64_t rip, gla, gpa, cr3;
1085	enum vie_cpu_mode cpu_mode;
1086	enum vie_paging_mode paging_mode;
1087	mem_region_read_t mread;
1088	mem_region_write_t mwrite;
1089
1090	vcpu = &vm->vcpu[vcpuid];
1091	vme = &vcpu->exitinfo;
1092
1093	rip = vme->rip;
1094	inst_length = vme->inst_length;
1095
1096	gla = vme->u.inst_emul.gla;
1097	gpa = vme->u.inst_emul.gpa;
1098	cr3 = vme->u.inst_emul.cr3;
1099	cpu_mode = vme->u.inst_emul.cpu_mode;
1100	paging_mode = vme->u.inst_emul.paging_mode;
1101	vie = &vme->u.inst_emul.vie;
1102
1103	vie_init(vie);
1104
1105	/* Fetch, decode and emulate the faulting instruction */
1106	if (vmm_fetch_instruction(vm, vcpuid, rip, inst_length, cr3,
1107	    paging_mode, vie) != 0)
1108		return (EFAULT);
1109
1110	if (vmm_decode_instruction(vm, vcpuid, gla, cpu_mode, vie) != 0)
1111		return (EFAULT);
1112
1113	/* return to userland unless this is an in-kernel emulated device */
1114	if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) {
1115		mread = lapic_mmio_read;
1116		mwrite = lapic_mmio_write;
1117	} else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) {
1118		mread = vioapic_mmio_read;
1119		mwrite = vioapic_mmio_write;
1120	} else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) {
1121		mread = vhpet_mmio_read;
1122		mwrite = vhpet_mmio_write;
1123	} else {
1124		*retu = true;
1125		return (0);
1126	}
1127
1128	error = vmm_emulate_instruction(vm, vcpuid, gpa, vie, mread, mwrite,
1129	    retu);
1130
1131	return (error);
1132}
1133
1134int
1135vm_run(struct vm *vm, struct vm_run *vmrun)
1136{
1137	int error, vcpuid;
1138	struct vcpu *vcpu;
1139	struct pcb *pcb;
1140	uint64_t tscval, rip;
1141	struct vm_exit *vme;
1142	bool retu, intr_disabled;
1143	pmap_t pmap;
1144
1145	vcpuid = vmrun->cpuid;
1146
1147	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1148		return (EINVAL);
1149
1150	pmap = vmspace_pmap(vm->vmspace);
1151	vcpu = &vm->vcpu[vcpuid];
1152	vme = &vcpu->exitinfo;
1153	rip = vmrun->rip;
1154restart:
1155	critical_enter();
1156
1157	KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active),
1158	    ("vm_run: absurd pm_active"));
1159
1160	tscval = rdtsc();
1161
1162	pcb = PCPU_GET(curpcb);
1163	set_pcb_flags(pcb, PCB_FULL_IRET);
1164
1165	restore_guest_msrs(vm, vcpuid);
1166	restore_guest_fpustate(vcpu);
1167
1168	vcpu_require_state(vm, vcpuid, VCPU_RUNNING);
1169	error = VMRUN(vm->cookie, vcpuid, rip, pmap, &vm->rendezvous_func);
1170	vcpu_require_state(vm, vcpuid, VCPU_FROZEN);
1171
1172	save_guest_fpustate(vcpu);
1173	restore_host_msrs(vm, vcpuid);
1174
1175	vmm_stat_incr(vm, vcpuid, VCPU_TOTAL_RUNTIME, rdtsc() - tscval);
1176
1177	critical_exit();
1178
1179	if (error == 0) {
1180		retu = false;
1181		switch (vme->exitcode) {
1182		case VM_EXITCODE_IOAPIC_EOI:
1183			vioapic_process_eoi(vm, vcpuid,
1184			    vme->u.ioapic_eoi.vector);
1185			break;
1186		case VM_EXITCODE_RENDEZVOUS:
1187			vm_handle_rendezvous(vm, vcpuid);
1188			error = 0;
1189			break;
1190		case VM_EXITCODE_HLT:
1191			intr_disabled = ((vme->u.hlt.rflags & PSL_I) == 0);
1192			error = vm_handle_hlt(vm, vcpuid, intr_disabled, &retu);
1193			break;
1194		case VM_EXITCODE_PAGING:
1195			error = vm_handle_paging(vm, vcpuid, &retu);
1196			break;
1197		case VM_EXITCODE_INST_EMUL:
1198			error = vm_handle_inst_emul(vm, vcpuid, &retu);
1199			break;
1200		default:
1201			retu = true;	/* handled in userland */
1202			break;
1203		}
1204	}
1205
1206	if (error == 0 && retu == false) {
1207		rip = vme->rip + vme->inst_length;
1208		goto restart;
1209	}
1210
1211	/* copy the exit information */
1212	bcopy(vme, &vmrun->vm_exit, sizeof(struct vm_exit));
1213	return (error);
1214}
1215
1216int
1217vm_inject_event(struct vm *vm, int vcpuid, int type,
1218		int vector, uint32_t code, int code_valid)
1219{
1220	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1221		return (EINVAL);
1222
1223	if ((type > VM_EVENT_NONE && type < VM_EVENT_MAX) == 0)
1224		return (EINVAL);
1225
1226	if (vector < 0 || vector > 255)
1227		return (EINVAL);
1228
1229	return (VMINJECT(vm->cookie, vcpuid, type, vector, code, code_valid));
1230}
1231
/*
 * Statistic bumped by one in vm_nmi_clear() each time a pending NMI is
 * cleared for a vcpu.
 */
static VMM_STAT(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu");
1233
1234int
1235vm_inject_nmi(struct vm *vm, int vcpuid)
1236{
1237	struct vcpu *vcpu;
1238
1239	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1240		return (EINVAL);
1241
1242	vcpu = &vm->vcpu[vcpuid];
1243
1244	vcpu->nmi_pending = 1;
1245	vcpu_notify_event(vm, vcpuid, false);
1246	return (0);
1247}
1248
1249int
1250vm_nmi_pending(struct vm *vm, int vcpuid)
1251{
1252	struct vcpu *vcpu;
1253
1254	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1255		panic("vm_nmi_pending: invalid vcpuid %d", vcpuid);
1256
1257	vcpu = &vm->vcpu[vcpuid];
1258
1259	return (vcpu->nmi_pending);
1260}
1261
1262void
1263vm_nmi_clear(struct vm *vm, int vcpuid)
1264{
1265	struct vcpu *vcpu;
1266
1267	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1268		panic("vm_nmi_pending: invalid vcpuid %d", vcpuid);
1269
1270	vcpu = &vm->vcpu[vcpuid];
1271
1272	if (vcpu->nmi_pending == 0)
1273		panic("vm_nmi_clear: inconsistent nmi_pending state");
1274
1275	vcpu->nmi_pending = 0;
1276	vmm_stat_incr(vm, vcpuid, VCPU_NMI_COUNT, 1);
1277}
1278
1279int
1280vm_get_capability(struct vm *vm, int vcpu, int type, int *retval)
1281{
1282	if (vcpu < 0 || vcpu >= VM_MAXCPU)
1283		return (EINVAL);
1284
1285	if (type < 0 || type >= VM_CAP_MAX)
1286		return (EINVAL);
1287
1288	return (VMGETCAP(vm->cookie, vcpu, type, retval));
1289}
1290
1291int
1292vm_set_capability(struct vm *vm, int vcpu, int type, int val)
1293{
1294	if (vcpu < 0 || vcpu >= VM_MAXCPU)
1295		return (EINVAL);
1296
1297	if (type < 0 || type >= VM_CAP_MAX)
1298		return (EINVAL);
1299
1300	return (VMSETCAP(vm->cookie, vcpu, type, val));
1301}
1302
1303uint64_t *
1304vm_guest_msrs(struct vm *vm, int cpu)
1305{
1306	return (vm->vcpu[cpu].guest_msrs);
1307}
1308
1309struct vlapic *
1310vm_lapic(struct vm *vm, int cpu)
1311{
1312	return (vm->vcpu[cpu].vlapic);
1313}
1314
1315struct vioapic *
1316vm_ioapic(struct vm *vm)
1317{
1318
1319	return (vm->vioapic);
1320}
1321
1322struct vhpet *
1323vm_hpet(struct vm *vm)
1324{
1325
1326	return (vm->vhpet);
1327}
1328
1329boolean_t
1330vmm_is_pptdev(int bus, int slot, int func)
1331{
1332	int found, i, n;
1333	int b, s, f;
1334	char *val, *cp, *cp2;
1335
1336	/*
1337	 * XXX
1338	 * The length of an environment variable is limited to 128 bytes which
1339	 * puts an upper limit on the number of passthru devices that may be
1340	 * specified using a single environment variable.
1341	 *
1342	 * Work around this by scanning multiple environment variable
1343	 * names instead of a single one - yuck!
1344	 */
1345	const char *names[] = { "pptdevs", "pptdevs2", "pptdevs3", NULL };
1346
1347	/* set pptdevs="1/2/3 4/5/6 7/8/9 10/11/12" */
1348	found = 0;
1349	for (i = 0; names[i] != NULL && !found; i++) {
1350		cp = val = getenv(names[i]);
1351		while (cp != NULL && *cp != '\0') {
1352			if ((cp2 = strchr(cp, ' ')) != NULL)
1353				*cp2 = '\0';
1354
1355			n = sscanf(cp, "%d/%d/%d", &b, &s, &f);
1356			if (n == 3 && bus == b && slot == s && func == f) {
1357				found = 1;
1358				break;
1359			}
1360
1361			if (cp2 != NULL)
1362				*cp2++ = ' ';
1363
1364			cp = cp2;
1365		}
1366		freeenv(val);
1367	}
1368	return (found);
1369}
1370
1371void *
1372vm_iommu_domain(struct vm *vm)
1373{
1374
1375	return (vm->iommu);
1376}
1377
1378int
1379vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state newstate,
1380    bool from_idle)
1381{
1382	int error;
1383	struct vcpu *vcpu;
1384
1385	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1386		panic("vm_set_run_state: invalid vcpuid %d", vcpuid);
1387
1388	vcpu = &vm->vcpu[vcpuid];
1389
1390	vcpu_lock(vcpu);
1391	error = vcpu_set_state_locked(vcpu, newstate, from_idle);
1392	vcpu_unlock(vcpu);
1393
1394	return (error);
1395}
1396
1397enum vcpu_state
1398vcpu_get_state(struct vm *vm, int vcpuid, int *hostcpu)
1399{
1400	struct vcpu *vcpu;
1401	enum vcpu_state state;
1402
1403	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1404		panic("vm_get_run_state: invalid vcpuid %d", vcpuid);
1405
1406	vcpu = &vm->vcpu[vcpuid];
1407
1408	vcpu_lock(vcpu);
1409	state = vcpu->state;
1410	if (hostcpu != NULL)
1411		*hostcpu = vcpu->hostcpu;
1412	vcpu_unlock(vcpu);
1413
1414	return (state);
1415}
1416
1417void
1418vm_activate_cpu(struct vm *vm, int vcpuid)
1419{
1420
1421	KASSERT(vcpuid >= 0 && vcpuid < VM_MAXCPU,
1422	    ("vm_activate_cpu: invalid vcpuid %d", vcpuid));
1423	KASSERT(!CPU_ISSET(vcpuid, &vm->active_cpus),
1424	    ("vm_activate_cpu: vcpuid %d is already active", vcpuid));
1425
1426	VCPU_CTR0(vm, vcpuid, "activated");
1427	CPU_SET_ATOMIC(vcpuid, &vm->active_cpus);
1428}
1429
1430static void
1431vm_deactivate_cpu(struct vm *vm, int vcpuid)
1432{
1433
1434	KASSERT(vcpuid >= 0 && vcpuid < VM_MAXCPU,
1435	    ("vm_deactivate_cpu: invalid vcpuid %d", vcpuid));
1436	KASSERT(CPU_ISSET(vcpuid, &vm->active_cpus),
1437	    ("vm_deactivate_cpu: vcpuid %d is not active", vcpuid));
1438
1439	VCPU_CTR0(vm, vcpuid, "deactivated");
1440	CPU_CLR_ATOMIC(vcpuid, &vm->active_cpus);
1441
1442	/*
1443	 * If a vcpu rendezvous is in progress then it could be blocked
1444	 * on 'vcpuid' - unblock it before disappearing forever.
1445	 */
1446	mtx_lock(&vm->rendezvous_mtx);
1447	if (vm->rendezvous_func != NULL) {
1448		VCPU_CTR0(vm, vcpuid, "unblock rendezvous after deactivation");
1449		wakeup(&vm->rendezvous_func);
1450	}
1451	mtx_unlock(&vm->rendezvous_mtx);
1452}
1453
1454cpuset_t
1455vm_active_cpus(struct vm *vm)
1456{
1457
1458	return (vm->active_cpus);
1459}
1460
1461void *
1462vcpu_stats(struct vm *vm, int vcpuid)
1463{
1464
1465	return (vm->vcpu[vcpuid].stats);
1466}
1467
1468int
1469vm_get_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state *state)
1470{
1471	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1472		return (EINVAL);
1473
1474	*state = vm->vcpu[vcpuid].x2apic_state;
1475
1476	return (0);
1477}
1478
1479int
1480vm_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state)
1481{
1482	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1483		return (EINVAL);
1484
1485	if (state >= X2APIC_STATE_LAST)
1486		return (EINVAL);
1487
1488	vm->vcpu[vcpuid].x2apic_state = state;
1489
1490	vlapic_set_x2apic_state(vm, vcpuid, state);
1491
1492	return (0);
1493}
1494
1495/*
1496 * This function is called to ensure that a vcpu "sees" a pending event
1497 * as soon as possible:
1498 * - If the vcpu thread is sleeping then it is woken up.
1499 * - If the vcpu is running on a different host_cpu then an IPI will be directed
1500 *   to the host_cpu to cause the vcpu to trap into the hypervisor.
1501 */
1502void
1503vcpu_notify_event(struct vm *vm, int vcpuid, bool lapic_intr)
1504{
1505	int hostcpu;
1506	struct vcpu *vcpu;
1507
1508	vcpu = &vm->vcpu[vcpuid];
1509
1510	vcpu_lock(vcpu);
1511	hostcpu = vcpu->hostcpu;
1512	if (vcpu->state == VCPU_RUNNING) {
1513		KASSERT(hostcpu != NOCPU, ("vcpu running on invalid hostcpu"));
1514		if (hostcpu != curcpu) {
1515			if (lapic_intr) {
1516				vlapic_post_intr(vcpu->vlapic, hostcpu,
1517				    vmm_ipinum);
1518			} else {
1519				ipi_cpu(hostcpu, vmm_ipinum);
1520			}
1521		} else {
1522			/*
1523			 * If the 'vcpu' is running on 'curcpu' then it must
1524			 * be sending a notification to itself (e.g. SELF_IPI).
1525			 * The pending event will be picked up when the vcpu
1526			 * transitions back to guest context.
1527			 */
1528		}
1529	} else {
1530		KASSERT(hostcpu == NOCPU, ("vcpu state %d not consistent "
1531		    "with hostcpu %d", vcpu->state, hostcpu));
1532		if (vcpu->state == VCPU_SLEEPING)
1533			wakeup_one(vcpu);
1534	}
1535	vcpu_unlock(vcpu);
1536}
1537
1538struct vmspace *
1539vm_get_vmspace(struct vm *vm)
1540{
1541
1542	return (vm->vmspace);
1543}
1544
/*
 * Translate an APIC id into a vcpu id.
 *
 * XXX the two are assumed to be numerically identical, so 'apicid' is
 * returned unchanged and 'vm' is not consulted.
 */
int
vm_apicid2vcpuid(struct vm *vm, int apicid)
{
	int vcpuid;

	vcpuid = apicid;
	return (vcpuid);
}
1553
1554void
1555vm_smp_rendezvous(struct vm *vm, int vcpuid, cpuset_t dest,
1556    vm_rendezvous_func_t func, void *arg)
1557{
1558	int i;
1559
1560	/*
1561	 * Enforce that this function is called without any locks
1562	 */
1563	WITNESS_WARN(WARN_PANIC, NULL, "vm_smp_rendezvous");
1564	KASSERT(vcpuid == -1 || (vcpuid >= 0 && vcpuid < VM_MAXCPU),
1565	    ("vm_smp_rendezvous: invalid vcpuid %d", vcpuid));
1566
1567restart:
1568	mtx_lock(&vm->rendezvous_mtx);
1569	if (vm->rendezvous_func != NULL) {
1570		/*
1571		 * If a rendezvous is already in progress then we need to
1572		 * call the rendezvous handler in case this 'vcpuid' is one
1573		 * of the targets of the rendezvous.
1574		 */
1575		RENDEZVOUS_CTR0(vm, vcpuid, "Rendezvous already in progress");
1576		mtx_unlock(&vm->rendezvous_mtx);
1577		vm_handle_rendezvous(vm, vcpuid);
1578		goto restart;
1579	}
1580	KASSERT(vm->rendezvous_func == NULL, ("vm_smp_rendezvous: previous "
1581	    "rendezvous is still in progress"));
1582
1583	RENDEZVOUS_CTR0(vm, vcpuid, "Initiating rendezvous");
1584	vm->rendezvous_req_cpus = dest;
1585	CPU_ZERO(&vm->rendezvous_done_cpus);
1586	vm->rendezvous_arg = arg;
1587	vm_set_rendezvous_func(vm, func);
1588	mtx_unlock(&vm->rendezvous_mtx);
1589
1590	/*
1591	 * Wake up any sleeping vcpus and trigger a VM-exit in any running
1592	 * vcpus so they handle the rendezvous as soon as possible.
1593	 */
1594	for (i = 0; i < VM_MAXCPU; i++) {
1595		if (CPU_ISSET(i, &dest))
1596			vcpu_notify_event(vm, i, false);
1597	}
1598
1599	vm_handle_rendezvous(vm, vcpuid);
1600}
1601