vmm.c revision 267070
1/*-
2 * Copyright (c) 2011 NetApp, Inc.
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 *
26 * $FreeBSD: stable/10/sys/amd64/vmm/vmm.c 267070 2014-06-04 17:57:48Z jhb $
27 */
28
29#include <sys/cdefs.h>
30__FBSDID("$FreeBSD: stable/10/sys/amd64/vmm/vmm.c 267070 2014-06-04 17:57:48Z jhb $");
31
32#include <sys/param.h>
33#include <sys/systm.h>
34#include <sys/kernel.h>
35#include <sys/module.h>
36#include <sys/sysctl.h>
37#include <sys/malloc.h>
38#include <sys/pcpu.h>
39#include <sys/lock.h>
40#include <sys/mutex.h>
41#include <sys/proc.h>
42#include <sys/rwlock.h>
43#include <sys/sched.h>
44#include <sys/smp.h>
46
47#include <vm/vm.h>
48#include <vm/vm_object.h>
49#include <vm/vm_page.h>
50#include <vm/pmap.h>
51#include <vm/vm_map.h>
52#include <vm/vm_extern.h>
53#include <vm/vm_param.h>
54
55#include <machine/cpu.h>
56#include <machine/vm.h>
57#include <machine/pcb.h>
58#include <machine/smp.h>
59#include <x86/psl.h>
60#include <x86/apicreg.h>
61#include <machine/vmparam.h>
62
63#include <machine/vmm.h>
64#include <machine/vmm_dev.h>
65
66#include "vmm_ktr.h"
67#include "vmm_host.h"
68#include "vmm_mem.h"
69#include "vmm_util.h"
70#include "vhpet.h"
71#include "vioapic.h"
72#include "vlapic.h"
73#include "vmm_msr.h"
74#include "vmm_ipi.h"
75#include "vmm_stat.h"
76#include "vmm_lapic.h"
77
78#include "io/ppt.h"
79#include "io/iommu.h"
80
81struct vlapic;
82
83struct vcpu {
84	int		flags;
85	enum vcpu_state	state;
86	struct mtx	mtx;
87	int		hostcpu;	/* host cpuid this vcpu last ran on */
88	uint64_t	guest_msrs[VMM_MSR_NUM];
89	struct vlapic	*vlapic;
90	int		 vcpuid;
91	struct savefpu	*guestfpu;	/* guest fpu state */
92	void		*stats;
93	struct vm_exit	exitinfo;
94	enum x2apic_state x2apic_state;
95	int		nmi_pending;
96};
97
98#define	vcpu_lock_init(v)	mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN)
99#define	vcpu_lock(v)		mtx_lock_spin(&((v)->mtx))
100#define	vcpu_unlock(v)		mtx_unlock_spin(&((v)->mtx))
101#define	vcpu_assert_locked(v)	mtx_assert(&((v)->mtx), MA_OWNED)
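/*
 * The vcpu lock is a spin mutex, so code holding it may not acquire a
 * sleepable lock; sleeps under it use msleep_spin() (see vm_handle_hlt()
 * and vcpu_set_state_locked()), and anything that needs a regular mutex,
 * e.g. vm_deactivate_cpu(), must run after vcpu_unlock().
 */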
102
103struct mem_seg {
104	vm_paddr_t	gpa;
105	size_t		len;
106	boolean_t	wired;
107	vm_object_t	object;
108};
109#define	VM_MAX_MEMORY_SEGMENTS	2
110
111struct vm {
112	void		*cookie;	/* processor-specific data */
113	void		*iommu;		/* iommu-specific data */
114	struct vhpet	*vhpet;		/* virtual HPET */
115	struct vioapic	*vioapic;	/* virtual ioapic */
116	struct vmspace	*vmspace;	/* guest's address space */
117	struct vcpu	vcpu[VM_MAXCPU];
118	int		num_mem_segs;
119	struct mem_seg	mem_segs[VM_MAX_MEMORY_SEGMENTS];
120	char		name[VM_MAX_NAMELEN];
121
122	/*
123	 * Set of active vcpus.
124	 * An active vcpu is one that has been started implicitly (BSP) or
125	 * explicitly (AP) by sending it a startup ipi.
126	 */
127	volatile cpuset_t active_cpus;
128
129	struct mtx	rendezvous_mtx;
130	cpuset_t	rendezvous_req_cpus;
131	cpuset_t	rendezvous_done_cpus;
132	void		*rendezvous_arg;
133	vm_rendezvous_func_t rendezvous_func;
134};
135
136static int vmm_initialized;
137
138static struct vmm_ops *ops;
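/*
 * 'ops' points at the hardware-specific backend (vmm_ops_intel or
 * vmm_ops_amd, selected in vmm_init()). The VMM_INIT(), VMRUN(),
 * VMGETREG() etc. macros below dispatch through it and degrade to
 * ENXIO, NULL or 0 while no backend has been selected.
 */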
139#define	VMM_INIT(num)	(ops != NULL ? (*ops->init)(num) : 0)
140#define	VMM_CLEANUP()	(ops != NULL ? (*ops->cleanup)() : 0)
141#define	VMM_RESUME()	(ops != NULL ? (*ops->resume)() : 0)
142
143#define	VMINIT(vm, pmap) (ops != NULL ? (*ops->vminit)(vm, pmap): NULL)
144#define	VMRUN(vmi, vcpu, rip, pmap, rptr) \
145	(ops != NULL ? (*ops->vmrun)(vmi, vcpu, rip, pmap, rptr) : ENXIO)
146#define	VMCLEANUP(vmi)	(ops != NULL ? (*ops->vmcleanup)(vmi) : NULL)
147#define	VMSPACE_ALLOC(min, max) \
148	(ops != NULL ? (*ops->vmspace_alloc)(min, max) : NULL)
149#define	VMSPACE_FREE(vmspace) \
150	(ops != NULL ? (*ops->vmspace_free)(vmspace) : ENXIO)
151#define	VMGETREG(vmi, vcpu, num, retval)		\
152	(ops != NULL ? (*ops->vmgetreg)(vmi, vcpu, num, retval) : ENXIO)
153#define	VMSETREG(vmi, vcpu, num, val)		\
154	(ops != NULL ? (*ops->vmsetreg)(vmi, vcpu, num, val) : ENXIO)
155#define	VMGETDESC(vmi, vcpu, num, desc)		\
156	(ops != NULL ? (*ops->vmgetdesc)(vmi, vcpu, num, desc) : ENXIO)
157#define	VMSETDESC(vmi, vcpu, num, desc)		\
158	(ops != NULL ? (*ops->vmsetdesc)(vmi, vcpu, num, desc) : ENXIO)
159#define	VMINJECT(vmi, vcpu, type, vec, ec, ecv)	\
160	(ops != NULL ? (*ops->vminject)(vmi, vcpu, type, vec, ec, ecv) : ENXIO)
161#define	VMGETCAP(vmi, vcpu, num, retval)	\
162	(ops != NULL ? (*ops->vmgetcap)(vmi, vcpu, num, retval) : ENXIO)
163#define	VMSETCAP(vmi, vcpu, num, val)		\
164	(ops != NULL ? (*ops->vmsetcap)(vmi, vcpu, num, val) : ENXIO)
165#define	VLAPIC_INIT(vmi, vcpu)			\
166	(ops != NULL ? (*ops->vlapic_init)(vmi, vcpu) : NULL)
167#define	VLAPIC_CLEANUP(vmi, vlapic)		\
168	(ops != NULL ? (*ops->vlapic_cleanup)(vmi, vlapic) : NULL)
169
170#define	fpu_start_emulating()	load_cr0(rcr0() | CR0_TS)
171#define	fpu_stop_emulating()	clts()
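/*
 * fpu_start_emulating() sets CR0.TS so the next FPU access traps with #NM;
 * fpu_stop_emulating() clears it with clts. This is how the guest FPU state
 * loaded by restore_guest_fpustate() is protected from stray host FPU use
 * until save_guest_fpustate() runs.
 */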
172
173static MALLOC_DEFINE(M_VM, "vm", "vm");
174CTASSERT(VMM_MSR_NUM <= 64);	/* msr_mask can keep track of up to 64 msrs */
175
176/* statistics */
177static VMM_STAT(VCPU_TOTAL_RUNTIME, "vcpu total runtime");
178
179SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW, NULL, NULL);
180
181static int vmm_ipinum;
182SYSCTL_INT(_hw_vmm, OID_AUTO, ipinum, CTLFLAG_RD, &vmm_ipinum, 0,
183    "IPI vector used for vcpu notifications");
184
185static void vm_deactivate_cpu(struct vm *vm, int vcpuid);
186
187static void
188vcpu_cleanup(struct vm *vm, int i)
189{
190	struct vcpu *vcpu = &vm->vcpu[i];
191
192	VLAPIC_CLEANUP(vm->cookie, vcpu->vlapic);
193	vmm_stat_free(vcpu->stats);
194	fpu_save_area_free(vcpu->guestfpu);
195}
196
197static void
198vcpu_init(struct vm *vm, uint32_t vcpu_id)
199{
200	struct vcpu *vcpu;
201
202	vcpu = &vm->vcpu[vcpu_id];
203
204	vcpu_lock_init(vcpu);
205	vcpu->hostcpu = NOCPU;
206	vcpu->vcpuid = vcpu_id;
207	vcpu->vlapic = VLAPIC_INIT(vm->cookie, vcpu_id);
208	vm_set_x2apic_state(vm, vcpu_id, X2APIC_ENABLED);
209	vcpu->guestfpu = fpu_save_area_alloc();
210	fpu_save_area_reset(vcpu->guestfpu);
211	vcpu->stats = vmm_stat_alloc();
212}
213
214struct vm_exit *
215vm_exitinfo(struct vm *vm, int cpuid)
216{
217	struct vcpu *vcpu;
218
219	if (cpuid < 0 || cpuid >= VM_MAXCPU)
220		panic("vm_exitinfo: invalid cpuid %d", cpuid);
221
222	vcpu = &vm->vcpu[cpuid];
223
224	return (&vcpu->exitinfo);
225}
226
227static void
228vmm_resume(void)
229{
230	VMM_RESUME();
231}
232
233static int
234vmm_init(void)
235{
236	int error;
237
238	vmm_host_state_init();
239
240	vmm_ipinum = vmm_ipi_alloc();
241	if (vmm_ipinum == 0)
242		vmm_ipinum = IPI_AST;
243
244	error = vmm_mem_init();
245	if (error)
246		return (error);
247
248	if (vmm_is_intel())
249		ops = &vmm_ops_intel;
250	else if (vmm_is_amd())
251		ops = &vmm_ops_amd;
252	else
253		return (ENXIO);
254
255	vmm_msr_init();
256	vmm_resume_p = vmm_resume;
257
258	return (VMM_INIT(vmm_ipinum));
259}
260
261static int
262vmm_handler(module_t mod, int what, void *arg)
263{
264	int error;
265
266	switch (what) {
267	case MOD_LOAD:
268		vmmdev_init();
269		if (ppt_avail_devices() > 0)
270			iommu_init();
271		error = vmm_init();
272		if (error == 0)
273			vmm_initialized = 1;
274		break;
275	case MOD_UNLOAD:
276		error = vmmdev_cleanup();
277		if (error == 0) {
278			vmm_resume_p = NULL;
279			iommu_cleanup();
280			if (vmm_ipinum != IPI_AST)
281				vmm_ipi_free(vmm_ipinum);
282			error = VMM_CLEANUP();
283			/*
284			 * Something bad happened - prevent new
285			 * VMs from being created
286			 */
287			if (error)
288				vmm_initialized = 0;
289		}
290		break;
291	default:
292		error = 0;
293		break;
294	}
295	return (error);
296}
297
298static moduledata_t vmm_kmod = {
299	"vmm",
300	vmm_handler,
301	NULL
302};
303
304/*
305 * vmm initialization has the following dependencies:
306 *
307 * - iommu initialization must happen after the pci passthru driver has had
308 *   a chance to attach to any passthru devices (after SI_SUB_CONFIGURE).
309 *
310 * - VT-x initialization requires smp_rendezvous() and therefore must happen
311 *   after SMP is fully functional (after SI_SUB_SMP).
312 */
313DECLARE_MODULE(vmm, vmm_kmod, SI_SUB_SMP + 1, SI_ORDER_ANY);
314MODULE_VERSION(vmm, 1);
315
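/*
 * Create a virtual machine. The guest physical address space spans
 * [VM_MIN_ADDRESS, VM_MAXUSER_ADDRESS) and all VM_MAXCPU vcpus are
 * initialized up front, but only the BSP (vcpu 0) is marked active;
 * the APs become active later when they are sent a startup IPI.
 */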
316int
317vm_create(const char *name, struct vm **retvm)
318{
319	int i;
320	struct vm *vm;
321	struct vmspace *vmspace;
322
323	const int BSP = 0;
324
325	/*
326	 * If vmm.ko could not be successfully initialized then don't attempt
327	 * to create the virtual machine.
328	 */
329	if (!vmm_initialized)
330		return (ENXIO);
331
332	if (name == NULL || strlen(name) >= VM_MAX_NAMELEN)
333		return (EINVAL);
334
335	vmspace = VMSPACE_ALLOC(VM_MIN_ADDRESS, VM_MAXUSER_ADDRESS);
336	if (vmspace == NULL)
337		return (ENOMEM);
338
339	vm = malloc(sizeof(struct vm), M_VM, M_WAITOK | M_ZERO);
340	strcpy(vm->name, name);
341	vm->vmspace = vmspace;
342	mtx_init(&vm->rendezvous_mtx, "vm rendezvous lock", 0, MTX_DEF);
343	vm->cookie = VMINIT(vm, vmspace_pmap(vmspace));
344	vm->vioapic = vioapic_init(vm);
345	vm->vhpet = vhpet_init(vm);
346
347	for (i = 0; i < VM_MAXCPU; i++) {
348		vcpu_init(vm, i);
349		guest_msrs_init(vm, i);
350	}
351
352	vm_activate_cpu(vm, BSP);
353
354	*retvm = vm;
355	return (0);
356}
357
358static void
359vm_free_mem_seg(struct vm *vm, struct mem_seg *seg)
360{
361
362	if (seg->object != NULL)
363		vmm_mem_free(vm->vmspace, seg->gpa, seg->len);
364
365	bzero(seg, sizeof(*seg));
366}
367
368void
369vm_destroy(struct vm *vm)
370{
371	int i;
372
373	ppt_unassign_all(vm);
374
375	if (vm->iommu != NULL)
376		iommu_destroy_domain(vm->iommu);
377
378	vhpet_cleanup(vm->vhpet);
379	vioapic_cleanup(vm->vioapic);
380
381	for (i = 0; i < vm->num_mem_segs; i++)
382		vm_free_mem_seg(vm, &vm->mem_segs[i]);
383
384	vm->num_mem_segs = 0;
385
386	for (i = 0; i < VM_MAXCPU; i++)
387		vcpu_cleanup(vm, i);
388
389	VMSPACE_FREE(vm->vmspace);
390
391	VMCLEANUP(vm->cookie);
392
393	free(vm, M_VM);
394}
395
396const char *
397vm_name(struct vm *vm)
398{
399	return (vm->name);
400}
401
402int
403vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
404{
405	vm_object_t obj;
406
407	if ((obj = vmm_mmio_alloc(vm->vmspace, gpa, len, hpa)) == NULL)
408		return (ENOMEM);
409	else
410		return (0);
411}
412
413int
414vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len)
415{
416
417	vmm_mmio_free(vm->vmspace, gpa, len);
418	return (0);
419}
420
421boolean_t
422vm_mem_allocated(struct vm *vm, vm_paddr_t gpa)
423{
424	int i;
425	vm_paddr_t gpabase, gpalimit;
426
427	for (i = 0; i < vm->num_mem_segs; i++) {
428		gpabase = vm->mem_segs[i].gpa;
429		gpalimit = gpabase + vm->mem_segs[i].len;
430		if (gpa >= gpabase && gpa < gpalimit)
431			return (TRUE);		/* 'gpa' is regular memory */
432	}
433
434	if (ppt_is_mmio(vm, gpa))
435		return (TRUE);			/* 'gpa' is pci passthru mmio */
436
437	return (FALSE);
438}
439
440int
441vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len)
442{
443	int available, allocated;
444	struct mem_seg *seg;
445	vm_object_t object;
446	vm_paddr_t g;
447
448	if ((gpa & PAGE_MASK) || (len & PAGE_MASK) || len == 0)
449		return (EINVAL);
450
451	available = allocated = 0;
452	g = gpa;
453	while (g < gpa + len) {
454		if (vm_mem_allocated(vm, g))
455			allocated++;
456		else
457			available++;
458
459		g += PAGE_SIZE;
460	}
461
462	/*
463	 * If there are some allocated and some available pages in the address
464	 * range then it is an error.
465	 */
466	if (allocated && available)
467		return (EINVAL);
468
469	/*
470	 * If the entire address range being requested has already been
471	 * allocated then there isn't anything more to do.
472	 */
473	if (allocated && available == 0)
474		return (0);
475
476	if (vm->num_mem_segs >= VM_MAX_MEMORY_SEGMENTS)
477		return (E2BIG);
478
479	seg = &vm->mem_segs[vm->num_mem_segs];
480
481	if ((object = vmm_mem_alloc(vm->vmspace, gpa, len)) == NULL)
482		return (ENOMEM);
483
484	seg->gpa = gpa;
485	seg->len = len;
486	seg->object = object;
487	seg->wired = FALSE;
488
489	vm->num_mem_segs++;
490
491	return (0);
492}
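/*
 * With VM_MAX_MEMORY_SEGMENTS == 2 a guest is limited to two contiguous,
 * page-aligned chunks of guest physical memory. An illustrative (not
 * prescriptive) use is one segment below 4GB and one above it:
 *
 *	error = vm_malloc(vm, 0, lowmem_size);
 *	if (error == 0 && highmem_size != 0)
 *		error = vm_malloc(vm, 0x100000000UL, highmem_size);
 *
 * where 'lowmem_size' and 'highmem_size' are caller-chosen, page-aligned
 * sizes.
 */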
493
494static void
495vm_gpa_unwire(struct vm *vm)
496{
497	int i, rv;
498	struct mem_seg *seg;
499
500	for (i = 0; i < vm->num_mem_segs; i++) {
501		seg = &vm->mem_segs[i];
502		if (!seg->wired)
503			continue;
504
505		rv = vm_map_unwire(&vm->vmspace->vm_map,
506				   seg->gpa, seg->gpa + seg->len,
507				   VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
508		KASSERT(rv == KERN_SUCCESS, ("vm(%s) memory segment "
509		    "%#lx/%ld could not be unwired: %d",
510		    vm_name(vm), seg->gpa, seg->len, rv));
511
512		seg->wired = FALSE;
513	}
514}
515
516static int
517vm_gpa_wire(struct vm *vm)
518{
519	int i, rv;
520	struct mem_seg *seg;
521
522	for (i = 0; i < vm->num_mem_segs; i++) {
523		seg = &vm->mem_segs[i];
524		if (seg->wired)
525			continue;
526
527		/* XXX rlimits? */
528		rv = vm_map_wire(&vm->vmspace->vm_map,
529				 seg->gpa, seg->gpa + seg->len,
530				 VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
531		if (rv != KERN_SUCCESS)
532			break;
533
534		seg->wired = TRUE;
535	}
536
537	if (i < vm->num_mem_segs) {
538		/*
539		 * Undo the wiring before returning an error.
540		 */
541		vm_gpa_unwire(vm);
542		return (EAGAIN);
543	}
544
545	return (0);
546}
547
548static void
549vm_iommu_modify(struct vm *vm, boolean_t map)
550{
551	int i, sz;
552	vm_paddr_t gpa, hpa;
553	struct mem_seg *seg;
554	void *vp, *cookie, *host_domain;
555
556	sz = PAGE_SIZE;
557	host_domain = iommu_host_domain();
558
559	for (i = 0; i < vm->num_mem_segs; i++) {
560		seg = &vm->mem_segs[i];
561		KASSERT(seg->wired, ("vm(%s) memory segment %#lx/%ld not wired",
562		    vm_name(vm), seg->gpa, seg->len));
563
564		gpa = seg->gpa;
565		while (gpa < seg->gpa + seg->len) {
566			vp = vm_gpa_hold(vm, gpa, PAGE_SIZE, VM_PROT_WRITE,
567					 &cookie);
568			KASSERT(vp != NULL, ("vm(%s) could not map gpa %#lx",
569			    vm_name(vm), gpa));
570
571			vm_gpa_release(cookie);
572
573			hpa = DMAP_TO_PHYS((uintptr_t)vp);
574			if (map) {
575				iommu_create_mapping(vm->iommu, gpa, hpa, sz);
576				iommu_remove_mapping(host_domain, hpa, sz);
577			} else {
578				iommu_remove_mapping(vm->iommu, gpa, sz);
579				iommu_create_mapping(host_domain, hpa, hpa, sz);
580			}
581
582			gpa += PAGE_SIZE;
583		}
584	}
585
586	/*
587	 * Invalidate the cached translations associated with the domain
588	 * from which pages were removed.
589	 */
590	if (map)
591		iommu_invalidate_tlb(host_domain);
592	else
593		iommu_invalidate_tlb(vm->iommu);
594}
595
596#define	vm_iommu_unmap(vm)	vm_iommu_modify((vm), FALSE)
597#define	vm_iommu_map(vm)	vm_iommu_modify((vm), TRUE)
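/*
 * Mapping moves every page of guest memory from the host iommu domain into
 * the VM's domain so that passthru DMA to a guest physical address reaches
 * the correct host page; unmapping reverses this. The TLB of whichever
 * domain lost pages is invalidated at the end of vm_iommu_modify().
 */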
598
599int
600vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func)
601{
602	int error;
603
604	error = ppt_unassign_device(vm, bus, slot, func);
605	if (error)
606		return (error);
607
608	if (ppt_assigned_devices(vm) == 0) {
609		vm_iommu_unmap(vm);
610		vm_gpa_unwire(vm);
611	}
612	return (0);
613}
614
615int
616vm_assign_pptdev(struct vm *vm, int bus, int slot, int func)
617{
618	int error;
619	vm_paddr_t maxaddr;
620
621	/*
622	 * Virtual machines with pci passthru devices get special treatment:
623	 * - the guest physical memory is wired
624	 * - the iommu is programmed to do the 'gpa' to 'hpa' translation
625	 *
626	 * We need to do this before the first pci passthru device is attached.
627	 */
628	if (ppt_assigned_devices(vm) == 0) {
629		KASSERT(vm->iommu == NULL,
630		    ("vm_assign_pptdev: iommu must be NULL"));
631		maxaddr = vmm_mem_maxaddr();
632		vm->iommu = iommu_create_domain(maxaddr);
633
634		error = vm_gpa_wire(vm);
635		if (error)
636			return (error);
637
638		vm_iommu_map(vm);
639	}
640
641	error = ppt_assign_device(vm, bus, slot, func);
642	return (error);
643}
644
645void *
646vm_gpa_hold(struct vm *vm, vm_paddr_t gpa, size_t len, int reqprot,
647	    void **cookie)
648{
649	int count, pageoff;
650	vm_page_t m;
651
652	pageoff = gpa & PAGE_MASK;
653	if (len > PAGE_SIZE - pageoff)
654		panic("vm_gpa_hold: invalid gpa/len: 0x%016lx/%lu", gpa, len);
655
656	count = vm_fault_quick_hold_pages(&vm->vmspace->vm_map,
657	    trunc_page(gpa), PAGE_SIZE, reqprot, &m, 1);
658
659	if (count == 1) {
660		*cookie = m;
661		return ((void *)(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)) + pageoff));
662	} else {
663		*cookie = NULL;
664		return (NULL);
665	}
666}
667
668void
669vm_gpa_release(void *cookie)
670{
671	vm_page_t m = cookie;
672
673	vm_page_lock(m);
674	vm_page_unhold(m);
675	vm_page_unlock(m);
676}
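/*
 * vm_gpa_hold() holds exactly one page (requests may not cross a page
 * boundary, see the panic above) and returns a pointer into the direct map;
 * every successful hold must be paired with a vm_gpa_release() of the
 * returned cookie, as done in vm_iommu_modify() above.
 */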
677
678int
679vm_gpabase2memseg(struct vm *vm, vm_paddr_t gpabase,
680		  struct vm_memory_segment *seg)
681{
682	int i;
683
684	for (i = 0; i < vm->num_mem_segs; i++) {
685		if (gpabase == vm->mem_segs[i].gpa) {
686			seg->gpa = vm->mem_segs[i].gpa;
687			seg->len = vm->mem_segs[i].len;
688			seg->wired = vm->mem_segs[i].wired;
689			return (0);
690		}
691	}
692	return (-1);
693}
694
695int
696vm_get_memobj(struct vm *vm, vm_paddr_t gpa, size_t len,
697	      vm_offset_t *offset, struct vm_object **object)
698{
699	int i;
700	size_t seg_len;
701	vm_paddr_t seg_gpa;
702	vm_object_t seg_obj;
703
704	for (i = 0; i < vm->num_mem_segs; i++) {
705		if ((seg_obj = vm->mem_segs[i].object) == NULL)
706			continue;
707
708		seg_gpa = vm->mem_segs[i].gpa;
709		seg_len = vm->mem_segs[i].len;
710
711		if (gpa >= seg_gpa && gpa < seg_gpa + seg_len) {
712			*offset = gpa - seg_gpa;
713			*object = seg_obj;
714			vm_object_reference(seg_obj);
715			return (0);
716		}
717	}
718
719	return (EINVAL);
720}
721
722int
723vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval)
724{
725
726	if (vcpu < 0 || vcpu >= VM_MAXCPU)
727		return (EINVAL);
728
729	if (reg >= VM_REG_LAST)
730		return (EINVAL);
731
732	return (VMGETREG(vm->cookie, vcpu, reg, retval));
733}
734
735int
736vm_set_register(struct vm *vm, int vcpu, int reg, uint64_t val)
737{
738
739	if (vcpu < 0 || vcpu >= VM_MAXCPU)
740		return (EINVAL);
741
742	if (reg >= VM_REG_LAST)
743		return (EINVAL);
744
745	return (VMSETREG(vm->cookie, vcpu, reg, val));
746}
747
748static boolean_t
749is_descriptor_table(int reg)
750{
751
752	switch (reg) {
753	case VM_REG_GUEST_IDTR:
754	case VM_REG_GUEST_GDTR:
755		return (TRUE);
756	default:
757		return (FALSE);
758	}
759}
760
761static boolean_t
762is_segment_register(int reg)
763{
764
765	switch (reg) {
766	case VM_REG_GUEST_ES:
767	case VM_REG_GUEST_CS:
768	case VM_REG_GUEST_SS:
769	case VM_REG_GUEST_DS:
770	case VM_REG_GUEST_FS:
771	case VM_REG_GUEST_GS:
772	case VM_REG_GUEST_TR:
773	case VM_REG_GUEST_LDTR:
774		return (TRUE);
775	default:
776		return (FALSE);
777	}
778}
779
780int
781vm_get_seg_desc(struct vm *vm, int vcpu, int reg,
782		struct seg_desc *desc)
783{
784
785	if (vcpu < 0 || vcpu >= VM_MAXCPU)
786		return (EINVAL);
787
788	if (!is_segment_register(reg) && !is_descriptor_table(reg))
789		return (EINVAL);
790
791	return (VMGETDESC(vm->cookie, vcpu, reg, desc));
792}
793
794int
795vm_set_seg_desc(struct vm *vm, int vcpu, int reg,
796		struct seg_desc *desc)
797{
798	if (vcpu < 0 || vcpu >= VM_MAXCPU)
799		return (EINVAL);
800
801	if (!is_segment_register(reg) && !is_descriptor_table(reg))
802		return (EINVAL);
803
804	return (VMSETDESC(vm->cookie, vcpu, reg, desc));
805}
806
807static void
808restore_guest_fpustate(struct vcpu *vcpu)
809{
810
811	/* flush host state to the pcb */
812	fpuexit(curthread);
813
814	/* restore guest FPU state */
815	fpu_stop_emulating();
816	fpurestore(vcpu->guestfpu);
817
818	/*
819	 * The FPU is now "dirty" with the guest's state so turn on emulation
820	 * to trap any access to the FPU by the host.
821	 */
822	fpu_start_emulating();
823}
824
825static void
826save_guest_fpustate(struct vcpu *vcpu)
827{
828
829	if ((rcr0() & CR0_TS) == 0)
830		panic("fpu emulation not enabled in host!");
831
832	/* save guest FPU state */
833	fpu_stop_emulating();
834	fpusave(vcpu->guestfpu);
835	fpu_start_emulating();
836}
837
838static VMM_STAT(VCPU_IDLE_TICKS, "number of ticks vcpu was idle");
839
840static int
841vcpu_set_state_locked(struct vcpu *vcpu, enum vcpu_state newstate,
842    bool from_idle)
843{
844	int error;
845
846	vcpu_assert_locked(vcpu);
847
848	/*
849	 * State transitions from the vmmdev_ioctl() must always begin from
850	 * the VCPU_IDLE state. This guarantees that there is only a single
851	 * ioctl() operating on a vcpu at any point.
852	 */
853	if (from_idle) {
854		while (vcpu->state != VCPU_IDLE)
855			msleep_spin(&vcpu->state, &vcpu->mtx, "vmstat", hz);
856	} else {
857		KASSERT(vcpu->state != VCPU_IDLE, ("invalid transition from "
858		    "vcpu idle state"));
859	}
860
861	if (vcpu->state == VCPU_RUNNING) {
862		KASSERT(vcpu->hostcpu == curcpu, ("curcpu %d and hostcpu %d "
863		    "mismatch for running vcpu", curcpu, vcpu->hostcpu));
864	} else {
865		KASSERT(vcpu->hostcpu == NOCPU, ("Invalid hostcpu %d for a "
866		    "vcpu that is not running", vcpu->hostcpu));
867	}
868
869	/*
870	 * The following state transitions are allowed:
871	 * IDLE -> FROZEN -> IDLE
872	 * FROZEN -> RUNNING -> FROZEN
873	 * FROZEN -> SLEEPING -> FROZEN
874	 */
875	switch (vcpu->state) {
876	case VCPU_IDLE:
877	case VCPU_RUNNING:
878	case VCPU_SLEEPING:
879		error = (newstate != VCPU_FROZEN);
880		break;
881	case VCPU_FROZEN:
882		error = (newstate == VCPU_FROZEN);
883		break;
884	default:
885		error = 1;
886		break;
887	}
888
889	if (error)
890		return (EBUSY);
891
892	vcpu->state = newstate;
893	if (newstate == VCPU_RUNNING)
894		vcpu->hostcpu = curcpu;
895	else
896		vcpu->hostcpu = NOCPU;
897
898	if (newstate == VCPU_IDLE)
899		wakeup(&vcpu->state);
900
901	return (0);
902}
903
904static void
905vcpu_require_state(struct vm *vm, int vcpuid, enum vcpu_state newstate)
906{
907	int error;
908
909	if ((error = vcpu_set_state(vm, vcpuid, newstate, false)) != 0)
910		panic("Error %d setting state to %d\n", error, newstate);
911}
912
913static void
914vcpu_require_state_locked(struct vcpu *vcpu, enum vcpu_state newstate)
915{
916	int error;
917
918	if ((error = vcpu_set_state_locked(vcpu, newstate, false)) != 0)
919		panic("Error %d setting state to %d", error, newstate);
920}
921
922static void
923vm_set_rendezvous_func(struct vm *vm, vm_rendezvous_func_t func)
924{
925
926	KASSERT(mtx_owned(&vm->rendezvous_mtx), ("rendezvous_mtx not locked"));
927
928	/*
929	 * Update 'rendezvous_func' and execute a write memory barrier to
930	 * ensure that it is visible across all host cpus. This is not needed
931	 * for correctness but it does ensure that all the vcpus will notice
932	 * the requested rendezvous immediately.
933	 */
934	vm->rendezvous_func = func;
935	wmb();
936}
937
938#define	RENDEZVOUS_CTR0(vm, vcpuid, fmt)				\
939	do {								\
940		if (vcpuid >= 0)					\
941			VCPU_CTR0(vm, vcpuid, fmt);			\
942		else							\
943			VM_CTR0(vm, fmt);				\
944	} while (0)
945
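/*
 * Drive an in-progress rendezvous on behalf of 'vcpuid' (-1 for a caller
 * that is not itself a vcpu): run the rendezvous function for this vcpu if
 * it is a target, then either wake the waiters once 'rendezvous_done_cpus'
 * matches 'rendezvous_req_cpus' or sleep until the remaining vcpus check in.
 */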
946static void
947vm_handle_rendezvous(struct vm *vm, int vcpuid)
948{
949
950	KASSERT(vcpuid == -1 || (vcpuid >= 0 && vcpuid < VM_MAXCPU),
951	    ("vm_handle_rendezvous: invalid vcpuid %d", vcpuid));
952
953	mtx_lock(&vm->rendezvous_mtx);
954	while (vm->rendezvous_func != NULL) {
955		/* 'rendezvous_req_cpus' must be a subset of 'active_cpus' */
956		CPU_AND(&vm->rendezvous_req_cpus, &vm->active_cpus);
957
958		if (vcpuid != -1 &&
959		    CPU_ISSET(vcpuid, &vm->rendezvous_req_cpus) &&
960		    !CPU_ISSET(vcpuid, &vm->rendezvous_done_cpus)) {
961			VCPU_CTR0(vm, vcpuid, "Calling rendezvous func");
962			(*vm->rendezvous_func)(vm, vcpuid, vm->rendezvous_arg);
963			CPU_SET(vcpuid, &vm->rendezvous_done_cpus);
964		}
965		if (CPU_CMP(&vm->rendezvous_req_cpus,
966		    &vm->rendezvous_done_cpus) == 0) {
967			VCPU_CTR0(vm, vcpuid, "Rendezvous completed");
968			vm_set_rendezvous_func(vm, NULL);
969			wakeup(&vm->rendezvous_func);
970			break;
971		}
972		RENDEZVOUS_CTR0(vm, vcpuid, "Wait for rendezvous completion");
973		mtx_sleep(&vm->rendezvous_func, &vm->rendezvous_mtx, 0,
974		    "vmrndv", 0);
975	}
976	mtx_unlock(&vm->rendezvous_mtx);
977}
978
979/*
980 * Emulate a guest 'hlt' by sleeping until the vcpu is ready to run.
981 */
982static int
983vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled, bool *retu)
984{
985	struct vm_exit *vmexit;
986	struct vcpu *vcpu;
987	int t, timo, spindown;
988
989	vcpu = &vm->vcpu[vcpuid];
990	spindown = 0;
991
992	vcpu_lock(vcpu);
993
994	/*
995	 * Do a final check for pending NMI or interrupts before
996	 * really putting this thread to sleep.
997	 *
998	 * These interrupts could have happened any time after we
999	 * returned from VMRUN() and before we grabbed the vcpu lock.
1000	 */
1001	if (!vm_nmi_pending(vm, vcpuid) &&
1002	    (intr_disabled || !vlapic_pending_intr(vcpu->vlapic, NULL))) {
1003		t = ticks;
1004		vcpu_require_state_locked(vcpu, VCPU_SLEEPING);
1005		if (vlapic_enabled(vcpu->vlapic)) {
1006			/*
1007			 * XXX msleep_spin() is not interruptible so use the
1008			 * 'timo' to put an upper bound on the sleep time.
1009			 */
1010			timo = hz;
1011			msleep_spin(vcpu, &vcpu->mtx, "vmidle", timo);
1012		} else {
1013			/*
1014			 * Spin down the vcpu if the apic is disabled and it
1015			 * has entered the halted state.
1016			 */
1017			spindown = 1;
1018		}
1019		vcpu_require_state_locked(vcpu, VCPU_FROZEN);
1020		vmm_stat_incr(vm, vcpuid, VCPU_IDLE_TICKS, ticks - t);
1021	}
1022	vcpu_unlock(vcpu);
1023
1024	/*
1025	 * Since 'vm_deactivate_cpu()' grabs a sleep mutex we must call it
1026	 * outside the confines of the vcpu spinlock.
1027	 */
1028	if (spindown) {
1029		*retu = true;
1030		vmexit = vm_exitinfo(vm, vcpuid);
1031		vmexit->exitcode = VM_EXITCODE_SPINDOWN_CPU;
1032		vm_deactivate_cpu(vm, vcpuid);
1033		VCPU_CTR0(vm, vcpuid, "spinning down cpu");
1034	}
1035
1036	return (0);
1037}
1038
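/*
 * Handle a guest paging exit (a fault on a guest physical address): try to
 * emulate accessed/dirty bit updates in the pmap first and fall back to a
 * full vm_fault() on the guest vmspace. The exit's inst_length is zeroed so
 * the guest resumes at the instruction that faulted.
 */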
1039static int
1040vm_handle_paging(struct vm *vm, int vcpuid, bool *retu)
1041{
1042	int rv, ftype;
1043	struct vm_map *map;
1044	struct vcpu *vcpu;
1045	struct vm_exit *vme;
1046
1047	vcpu = &vm->vcpu[vcpuid];
1048	vme = &vcpu->exitinfo;
1049
1050	ftype = vme->u.paging.fault_type;
1051	KASSERT(ftype == VM_PROT_READ ||
1052	    ftype == VM_PROT_WRITE || ftype == VM_PROT_EXECUTE,
1053	    ("vm_handle_paging: invalid fault_type %d", ftype));
1054
1055	if (ftype == VM_PROT_READ || ftype == VM_PROT_WRITE) {
1056		rv = pmap_emulate_accessed_dirty(vmspace_pmap(vm->vmspace),
1057		    vme->u.paging.gpa, ftype);
1058		if (rv == 0)
1059			goto done;
1060	}
1061
1062	map = &vm->vmspace->vm_map;
1063	rv = vm_fault(map, vme->u.paging.gpa, ftype, VM_FAULT_NORMAL);
1064
1065	VCPU_CTR3(vm, vcpuid, "vm_handle_paging rv = %d, gpa = %#lx, "
1066	    "ftype = %d", rv, vme->u.paging.gpa, ftype);
1067
1068	if (rv != KERN_SUCCESS)
1069		return (EFAULT);
1070done:
1071	/* restart execution at the faulting instruction */
1072	vme->inst_length = 0;
1073
1074	return (0);
1075}
1076
1077static int
1078vm_handle_inst_emul(struct vm *vm, int vcpuid, bool *retu)
1079{
1080	struct vie *vie;
1081	struct vcpu *vcpu;
1082	struct vm_exit *vme;
1083	int error, inst_length;
1084	uint64_t rip, gla, gpa, cr3;
1085	mem_region_read_t mread;
1086	mem_region_write_t mwrite;
1087
1088	vcpu = &vm->vcpu[vcpuid];
1089	vme = &vcpu->exitinfo;
1090
1091	rip = vme->rip;
1092	inst_length = vme->inst_length;
1093
1094	gla = vme->u.inst_emul.gla;
1095	gpa = vme->u.inst_emul.gpa;
1096	cr3 = vme->u.inst_emul.cr3;
1097	vie = &vme->u.inst_emul.vie;
1098
1099	vie_init(vie);
1100
1101	/* Fetch, decode and emulate the faulting instruction */
1102	if (vmm_fetch_instruction(vm, vcpuid, rip, inst_length, cr3, vie) != 0)
1103		return (EFAULT);
1104
1105	if (vmm_decode_instruction(vm, vcpuid, gla, vie) != 0)
1106		return (EFAULT);
1107
1108	/* return to userland unless this is an in-kernel emulated device */
1109	if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) {
1110		mread = lapic_mmio_read;
1111		mwrite = lapic_mmio_write;
1112	} else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) {
1113		mread = vioapic_mmio_read;
1114		mwrite = vioapic_mmio_write;
1115	} else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) {
1116		mread = vhpet_mmio_read;
1117		mwrite = vhpet_mmio_write;
1118	} else {
1119		*retu = true;
1120		return (0);
1121	}
1122
1123	error = vmm_emulate_instruction(vm, vcpuid, gpa, vie, mread, mwrite,
1124	    retu);
1125
1126	return (error);
1127}
1128
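/*
 * Run loop for a single vcpu: guest MSR and FPU state are swapped in, the
 * guest is entered via VMRUN() inside a critical section, and the exit is
 * then handled. Exits that can be completed in the kernel (IOAPIC EOI,
 * rendezvous, HLT, paging, instruction emulation) loop back to 'restart'
 * with the guest %rip advanced by the exit's inst_length; everything else
 * sets 'retu' and returns the exit information to userland.
 */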
1129int
1130vm_run(struct vm *vm, struct vm_run *vmrun)
1131{
1132	int error, vcpuid;
1133	struct vcpu *vcpu;
1134	struct pcb *pcb;
1135	uint64_t tscval, rip;
1136	struct vm_exit *vme;
1137	bool retu, intr_disabled;
1138	pmap_t pmap;
1139
1140	vcpuid = vmrun->cpuid;
1141
1142	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1143		return (EINVAL);
1144
1145	pmap = vmspace_pmap(vm->vmspace);
1146	vcpu = &vm->vcpu[vcpuid];
1147	vme = &vcpu->exitinfo;
1148	rip = vmrun->rip;
1149restart:
1150	critical_enter();
1151
1152	KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active),
1153	    ("vm_run: absurd pm_active"));
1154
1155	tscval = rdtsc();
1156
1157	pcb = PCPU_GET(curpcb);
1158	set_pcb_flags(pcb, PCB_FULL_IRET);
1159
1160	restore_guest_msrs(vm, vcpuid);
1161	restore_guest_fpustate(vcpu);
1162
1163	vcpu_require_state(vm, vcpuid, VCPU_RUNNING);
1164	error = VMRUN(vm->cookie, vcpuid, rip, pmap, &vm->rendezvous_func);
1165	vcpu_require_state(vm, vcpuid, VCPU_FROZEN);
1166
1167	save_guest_fpustate(vcpu);
1168	restore_host_msrs(vm, vcpuid);
1169
1170	vmm_stat_incr(vm, vcpuid, VCPU_TOTAL_RUNTIME, rdtsc() - tscval);
1171
1172	critical_exit();
1173
1174	if (error == 0) {
1175		retu = false;
1176		switch (vme->exitcode) {
1177		case VM_EXITCODE_IOAPIC_EOI:
1178			vioapic_process_eoi(vm, vcpuid,
1179			    vme->u.ioapic_eoi.vector);
1180			break;
1181		case VM_EXITCODE_RENDEZVOUS:
1182			vm_handle_rendezvous(vm, vcpuid);
1183			error = 0;
1184			break;
1185		case VM_EXITCODE_HLT:
1186			intr_disabled = ((vme->u.hlt.rflags & PSL_I) == 0);
1187			error = vm_handle_hlt(vm, vcpuid, intr_disabled, &retu);
1188			break;
1189		case VM_EXITCODE_PAGING:
1190			error = vm_handle_paging(vm, vcpuid, &retu);
1191			break;
1192		case VM_EXITCODE_INST_EMUL:
1193			error = vm_handle_inst_emul(vm, vcpuid, &retu);
1194			break;
1195		default:
1196			retu = true;	/* handled in userland */
1197			break;
1198		}
1199	}
1200
1201	if (error == 0 && retu == false) {
1202		rip = vme->rip + vme->inst_length;
1203		goto restart;
1204	}
1205
1206	/* copy the exit information */
1207	bcopy(vme, &vmrun->vm_exit, sizeof(struct vm_exit));
1208	return (error);
1209}
1210
1211int
1212vm_inject_event(struct vm *vm, int vcpuid, int type,
1213		int vector, uint32_t code, int code_valid)
1214{
1215	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1216		return (EINVAL);
1217
1218	if ((type > VM_EVENT_NONE && type < VM_EVENT_MAX) == 0)
1219		return (EINVAL);
1220
1221	if (vector < 0 || vector > 255)
1222		return (EINVAL);
1223
1224	return (VMINJECT(vm->cookie, vcpuid, type, vector, code, code_valid));
1225}
1226
1227static VMM_STAT(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu");
1228
1229int
1230vm_inject_nmi(struct vm *vm, int vcpuid)
1231{
1232	struct vcpu *vcpu;
1233
1234	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1235		return (EINVAL);
1236
1237	vcpu = &vm->vcpu[vcpuid];
1238
1239	vcpu->nmi_pending = 1;
1240	vcpu_notify_event(vm, vcpuid, false);
1241	return (0);
1242}
1243
1244int
1245vm_nmi_pending(struct vm *vm, int vcpuid)
1246{
1247	struct vcpu *vcpu;
1248
1249	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1250		panic("vm_nmi_pending: invalid vcpuid %d", vcpuid);
1251
1252	vcpu = &vm->vcpu[vcpuid];
1253
1254	return (vcpu->nmi_pending);
1255}
1256
1257void
1258vm_nmi_clear(struct vm *vm, int vcpuid)
1259{
1260	struct vcpu *vcpu;
1261
1262	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1263		panic("vm_nmi_clear: invalid vcpuid %d", vcpuid);
1264
1265	vcpu = &vm->vcpu[vcpuid];
1266
1267	if (vcpu->nmi_pending == 0)
1268		panic("vm_nmi_clear: inconsistent nmi_pending state");
1269
1270	vcpu->nmi_pending = 0;
1271	vmm_stat_incr(vm, vcpuid, VCPU_NMI_COUNT, 1);
1272}
1273
1274int
1275vm_get_capability(struct vm *vm, int vcpu, int type, int *retval)
1276{
1277	if (vcpu < 0 || vcpu >= VM_MAXCPU)
1278		return (EINVAL);
1279
1280	if (type < 0 || type >= VM_CAP_MAX)
1281		return (EINVAL);
1282
1283	return (VMGETCAP(vm->cookie, vcpu, type, retval));
1284}
1285
1286int
1287vm_set_capability(struct vm *vm, int vcpu, int type, int val)
1288{
1289	if (vcpu < 0 || vcpu >= VM_MAXCPU)
1290		return (EINVAL);
1291
1292	if (type < 0 || type >= VM_CAP_MAX)
1293		return (EINVAL);
1294
1295	return (VMSETCAP(vm->cookie, vcpu, type, val));
1296}
1297
1298uint64_t *
1299vm_guest_msrs(struct vm *vm, int cpu)
1300{
1301	return (vm->vcpu[cpu].guest_msrs);
1302}
1303
1304struct vlapic *
1305vm_lapic(struct vm *vm, int cpu)
1306{
1307	return (vm->vcpu[cpu].vlapic);
1308}
1309
1310struct vioapic *
1311vm_ioapic(struct vm *vm)
1312{
1313
1314	return (vm->vioapic);
1315}
1316
1317struct vhpet *
1318vm_hpet(struct vm *vm)
1319{
1320
1321	return (vm->vhpet);
1322}
1323
1324boolean_t
1325vmm_is_pptdev(int bus, int slot, int func)
1326{
1327	int found, i, n;
1328	int b, s, f;
1329	char *val, *cp, *cp2;
1330
1331	/*
1332	 * XXX
1333	 * The length of an environment variable is limited to 128 bytes which
1334	 * puts an upper limit on the number of passthru devices that may be
1335	 * specified using a single environment variable.
1336	 *
1337	 * Work around this by scanning multiple environment variable
1338	 * names instead of a single one - yuck!
1339	 */
1340	const char *names[] = { "pptdevs", "pptdevs2", "pptdevs3", NULL };
1341
1342	/* set pptdevs="1/2/3 4/5/6 7/8/9 10/11/12" */
1343	found = 0;
1344	for (i = 0; names[i] != NULL && !found; i++) {
1345		cp = val = getenv(names[i]);
1346		while (cp != NULL && *cp != '\0') {
1347			if ((cp2 = strchr(cp, ' ')) != NULL)
1348				*cp2 = '\0';
1349
1350			n = sscanf(cp, "%d/%d/%d", &b, &s, &f);
1351			if (n == 3 && bus == b && slot == s && func == f) {
1352				found = 1;
1353				break;
1354			}
1355
1356			if (cp2 != NULL)
1357				*cp2++ = ' ';
1358
1359			cp = cp2;
1360		}
1361		freeenv(val);
1362	}
1363	return (found);
1364}
1365
1366void *
1367vm_iommu_domain(struct vm *vm)
1368{
1369
1370	return (vm->iommu);
1371}
1372
1373int
1374vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state newstate,
1375    bool from_idle)
1376{
1377	int error;
1378	struct vcpu *vcpu;
1379
1380	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1381		panic("vcpu_set_state: invalid vcpuid %d", vcpuid);
1382
1383	vcpu = &vm->vcpu[vcpuid];
1384
1385	vcpu_lock(vcpu);
1386	error = vcpu_set_state_locked(vcpu, newstate, from_idle);
1387	vcpu_unlock(vcpu);
1388
1389	return (error);
1390}
1391
1392enum vcpu_state
1393vcpu_get_state(struct vm *vm, int vcpuid, int *hostcpu)
1394{
1395	struct vcpu *vcpu;
1396	enum vcpu_state state;
1397
1398	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1399		panic("vcpu_get_state: invalid vcpuid %d", vcpuid);
1400
1401	vcpu = &vm->vcpu[vcpuid];
1402
1403	vcpu_lock(vcpu);
1404	state = vcpu->state;
1405	if (hostcpu != NULL)
1406		*hostcpu = vcpu->hostcpu;
1407	vcpu_unlock(vcpu);
1408
1409	return (state);
1410}
1411
1412void
1413vm_activate_cpu(struct vm *vm, int vcpuid)
1414{
1415
1416	KASSERT(vcpuid >= 0 && vcpuid < VM_MAXCPU,
1417	    ("vm_activate_cpu: invalid vcpuid %d", vcpuid));
1418	KASSERT(!CPU_ISSET(vcpuid, &vm->active_cpus),
1419	    ("vm_activate_cpu: vcpuid %d is already active", vcpuid));
1420
1421	VCPU_CTR0(vm, vcpuid, "activated");
1422	CPU_SET_ATOMIC(vcpuid, &vm->active_cpus);
1423}
1424
1425static void
1426vm_deactivate_cpu(struct vm *vm, int vcpuid)
1427{
1428
1429	KASSERT(vcpuid >= 0 && vcpuid < VM_MAXCPU,
1430	    ("vm_deactivate_cpu: invalid vcpuid %d", vcpuid));
1431	KASSERT(CPU_ISSET(vcpuid, &vm->active_cpus),
1432	    ("vm_deactivate_cpu: vcpuid %d is not active", vcpuid));
1433
1434	VCPU_CTR0(vm, vcpuid, "deactivated");
1435	CPU_CLR_ATOMIC(vcpuid, &vm->active_cpus);
1436
1437	/*
1438	 * If a vcpu rendezvous is in progress then it could be blocked
1439	 * on 'vcpuid' - unblock it before disappearing forever.
1440	 */
1441	mtx_lock(&vm->rendezvous_mtx);
1442	if (vm->rendezvous_func != NULL) {
1443		VCPU_CTR0(vm, vcpuid, "unblock rendezvous after deactivation");
1444		wakeup(&vm->rendezvous_func);
1445	}
1446	mtx_unlock(&vm->rendezvous_mtx);
1447}
1448
1449cpuset_t
1450vm_active_cpus(struct vm *vm)
1451{
1452
1453	return (vm->active_cpus);
1454}
1455
1456void *
1457vcpu_stats(struct vm *vm, int vcpuid)
1458{
1459
1460	return (vm->vcpu[vcpuid].stats);
1461}
1462
1463int
1464vm_get_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state *state)
1465{
1466	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1467		return (EINVAL);
1468
1469	*state = vm->vcpu[vcpuid].x2apic_state;
1470
1471	return (0);
1472}
1473
1474int
1475vm_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state)
1476{
1477	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1478		return (EINVAL);
1479
1480	if (state >= X2APIC_STATE_LAST)
1481		return (EINVAL);
1482
1483	vm->vcpu[vcpuid].x2apic_state = state;
1484
1485	vlapic_set_x2apic_state(vm, vcpuid, state);
1486
1487	return (0);
1488}
1489
1490/*
1491 * This function is called to ensure that a vcpu "sees" a pending event
1492 * as soon as possible:
1493 * - If the vcpu thread is sleeping then it is woken up.
1494 * - If the vcpu is running on a different host_cpu then an IPI will be directed
1495 *   to the host_cpu to cause the vcpu to trap into the hypervisor.
1496 */
1497void
1498vcpu_notify_event(struct vm *vm, int vcpuid, bool lapic_intr)
1499{
1500	int hostcpu;
1501	struct vcpu *vcpu;
1502
1503	vcpu = &vm->vcpu[vcpuid];
1504
1505	vcpu_lock(vcpu);
1506	hostcpu = vcpu->hostcpu;
1507	if (vcpu->state == VCPU_RUNNING) {
1508		KASSERT(hostcpu != NOCPU, ("vcpu running on invalid hostcpu"));
1509		if (hostcpu != curcpu) {
1510			if (lapic_intr) {
1511				vlapic_post_intr(vcpu->vlapic, hostcpu,
1512				    vmm_ipinum);
1513			} else {
1514				ipi_cpu(hostcpu, vmm_ipinum);
1515			}
1516		} else {
1517			/*
1518			 * If the 'vcpu' is running on 'curcpu' then it must
1519			 * be sending a notification to itself (e.g. SELF_IPI).
1520			 * The pending event will be picked up when the vcpu
1521			 * transitions back to guest context.
1522			 */
1523		}
1524	} else {
1525		KASSERT(hostcpu == NOCPU, ("vcpu state %d not consistent "
1526		    "with hostcpu %d", vcpu->state, hostcpu));
1527		if (vcpu->state == VCPU_SLEEPING)
1528			wakeup_one(vcpu);
1529	}
1530	vcpu_unlock(vcpu);
1531}
1532
1533struct vmspace *
1534vm_get_vmspace(struct vm *vm)
1535{
1536
1537	return (vm->vmspace);
1538}
1539
1540int
1541vm_apicid2vcpuid(struct vm *vm, int apicid)
1542{
1543	/*
1544	 * XXX apic id is assumed to be numerically identical to vcpu id
1545	 */
1546	return (apicid);
1547}
1548
1549void
1550vm_smp_rendezvous(struct vm *vm, int vcpuid, cpuset_t dest,
1551    vm_rendezvous_func_t func, void *arg)
1552{
1553	int i;
1554
1555	/*
1556	 * Enforce that this function is called without any locks
1557	 */
1558	WITNESS_WARN(WARN_PANIC, NULL, "vm_smp_rendezvous");
1559	KASSERT(vcpuid == -1 || (vcpuid >= 0 && vcpuid < VM_MAXCPU),
1560	    ("vm_smp_rendezvous: invalid vcpuid %d", vcpuid));
1561
1562restart:
1563	mtx_lock(&vm->rendezvous_mtx);
1564	if (vm->rendezvous_func != NULL) {
1565		/*
1566		 * If a rendezvous is already in progress then we need to
1567		 * call the rendezvous handler in case this 'vcpuid' is one
1568		 * of the targets of the rendezvous.
1569		 */
1570		RENDEZVOUS_CTR0(vm, vcpuid, "Rendezvous already in progress");
1571		mtx_unlock(&vm->rendezvous_mtx);
1572		vm_handle_rendezvous(vm, vcpuid);
1573		goto restart;
1574	}
1575	KASSERT(vm->rendezvous_func == NULL, ("vm_smp_rendezvous: previous "
1576	    "rendezvous is still in progress"));
1577
1578	RENDEZVOUS_CTR0(vm, vcpuid, "Initiating rendezvous");
1579	vm->rendezvous_req_cpus = dest;
1580	CPU_ZERO(&vm->rendezvous_done_cpus);
1581	vm->rendezvous_arg = arg;
1582	vm_set_rendezvous_func(vm, func);
1583	mtx_unlock(&vm->rendezvous_mtx);
1584
1585	/*
1586	 * Wake up any sleeping vcpus and trigger a VM-exit in any running
1587	 * vcpus so they handle the rendezvous as soon as possible.
1588	 */
1589	for (i = 0; i < VM_MAXCPU; i++) {
1590		if (CPU_ISSET(i, &dest))
1591			vcpu_notify_event(vm, i, false);
1592	}
1593
1594	vm_handle_rendezvous(vm, vcpuid);
1595}
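/*
 * Illustrative use of the rendezvous API; the callback name and body are
 * hypothetical, only the signatures come from this file:
 *
 *	static void
 *	example_rendezvous_cb(struct vm *vm, int vcpuid, void *arg)
 *	{
 *		(per-vcpu work goes here; called once for each target vcpu)
 *	}
 *
 *	vm_smp_rendezvous(vm, vcpuid, vm_active_cpus(vm),
 *	    example_rendezvous_cb, NULL);
 *
 * The initiator may be a vcpu thread (vcpuid >= 0) or a non-vcpu context
 * (vcpuid == -1); vm_smp_rendezvous() does not return until the rendezvous
 * has completed.
 */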
1596