vmm.c revision 266339
1/*-
2 * Copyright (c) 2011 NetApp, Inc.
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 *
26 * $FreeBSD: stable/10/sys/amd64/vmm/vmm.c 266339 2014-05-17 19:11:08Z jhb $
27 */
28
29#include <sys/cdefs.h>
30__FBSDID("$FreeBSD: stable/10/sys/amd64/vmm/vmm.c 266339 2014-05-17 19:11:08Z jhb $");
31
32#include <sys/param.h>
33#include <sys/systm.h>
34#include <sys/kernel.h>
35#include <sys/module.h>
36#include <sys/sysctl.h>
37#include <sys/malloc.h>
38#include <sys/pcpu.h>
39#include <sys/lock.h>
40#include <sys/mutex.h>
41#include <sys/proc.h>
42#include <sys/rwlock.h>
43#include <sys/sched.h>
44#include <sys/smp.h>
45#include <sys/systm.h>
46
47#include <vm/vm.h>
48#include <vm/vm_object.h>
49#include <vm/vm_page.h>
50#include <vm/pmap.h>
51#include <vm/vm_map.h>
52#include <vm/vm_extern.h>
53#include <vm/vm_param.h>
54
55#include <machine/cpu.h>
56#include <machine/vm.h>
57#include <machine/pcb.h>
58#include <machine/smp.h>
59#include <x86/psl.h>
60#include <x86/apicreg.h>
61#include <machine/vmparam.h>
62
63#include <machine/vmm.h>
64#include <machine/vmm_dev.h>
65
66#include "vmm_ktr.h"
67#include "vmm_host.h"
68#include "vmm_mem.h"
69#include "vmm_util.h"
70#include "vhpet.h"
71#include "vioapic.h"
72#include "vlapic.h"
73#include "vmm_msr.h"
74#include "vmm_ipi.h"
75#include "vmm_stat.h"
76#include "vmm_lapic.h"
77
78#include "io/ppt.h"
79#include "io/iommu.h"
80
81struct vlapic;
82
83struct vcpu {
84	int		flags;
85	enum vcpu_state	state;
86	struct mtx	mtx;
87	int		hostcpu;	/* host cpuid this vcpu last ran on */
88	uint64_t	guest_msrs[VMM_MSR_NUM];
89	struct vlapic	*vlapic;
90	int		 vcpuid;
91	struct savefpu	*guestfpu;	/* guest fpu state */
92	void		*stats;
93	struct vm_exit	exitinfo;
94	enum x2apic_state x2apic_state;
95	int		nmi_pending;
96};
97
/* Wrappers around the spin mutex embedded in 'struct vcpu'. */
#define	vcpu_lock_init(v)	mtx_init(&(v)->mtx, "vcpu lock", 0, MTX_SPIN)
#define	vcpu_lock(v)		mtx_lock_spin(&(v)->mtx)
#define	vcpu_unlock(v)		mtx_unlock_spin(&(v)->mtx)
#define	vcpu_assert_locked(v)	mtx_assert(&(v)->mtx, MA_OWNED)
102
103struct mem_seg {
104	vm_paddr_t	gpa;
105	size_t		len;
106	boolean_t	wired;
107	vm_object_t	object;
108};
109#define	VM_MAX_MEMORY_SEGMENTS	2
110
111struct vm {
112	void		*cookie;	/* processor-specific data */
113	void		*iommu;		/* iommu-specific data */
114	struct vhpet	*vhpet;		/* virtual HPET */
115	struct vioapic	*vioapic;	/* virtual ioapic */
116	struct vmspace	*vmspace;	/* guest's address space */
117	struct vcpu	vcpu[VM_MAXCPU];
118	int		num_mem_segs;
119	struct mem_seg	mem_segs[VM_MAX_MEMORY_SEGMENTS];
120	char		name[VM_MAX_NAMELEN];
121
122	/*
123	 * Set of active vcpus.
124	 * An active vcpu is one that has been started implicitly (BSP) or
125	 * explicitly (AP) by sending it a startup ipi.
126	 */
127	volatile cpuset_t active_cpus;
128
129	struct mtx	rendezvous_mtx;
130	cpuset_t	rendezvous_req_cpus;
131	cpuset_t	rendezvous_done_cpus;
132	void		*rendezvous_arg;
133	vm_rendezvous_func_t rendezvous_func;
134};
135
/* Set to 1 by vmm_handler() once vmm_init() succeeded; tested by vm_create(). */
static int vmm_initialized;

/*
 * Processor-specific backend chosen in vmm_init().  Each wrapper below
 * tolerates a NULL 'ops' and then evaluates to the value written after
 * the ':' instead of calling into the backend.
 */
static struct vmm_ops *ops;

/* Module-wide backend hooks. */
#define	VMM_INIT(num)	(ops != NULL ? (*ops->init)(num) : 0)
#define	VMM_CLEANUP()	(ops != NULL ? (*ops->cleanup)() : 0)
#define	VMM_RESUME()	(ops != NULL ? (*ops->resume)() : 0)

/* Per-VM backend hooks. */
#define	VMINIT(vm, pmap) (ops != NULL ? (*ops->vminit)(vm, pmap): NULL)
#define	VMRUN(vmi, vcpu, rip, pmap, rptr)				\
	(ops != NULL ? (*ops->vmrun)(vmi, vcpu, rip, pmap, rptr) : ENXIO)
#define	VMCLEANUP(vmi)	(ops != NULL ? (*ops->vmcleanup)(vmi) : NULL)
#define	VMSPACE_ALLOC(min, max)						\
	(ops != NULL ? (*ops->vmspace_alloc)(min, max) : NULL)
#define	VMSPACE_FREE(vmspace)						\
	(ops != NULL ? (*ops->vmspace_free)(vmspace) : ENXIO)

/* Per-vcpu register, descriptor, event and capability hooks. */
#define	VMGETREG(vmi, vcpu, num, retval)				\
	(ops != NULL ? (*ops->vmgetreg)(vmi, vcpu, num, retval) : ENXIO)
#define	VMSETREG(vmi, vcpu, num, val)					\
	(ops != NULL ? (*ops->vmsetreg)(vmi, vcpu, num, val) : ENXIO)
#define	VMGETDESC(vmi, vcpu, num, desc)					\
	(ops != NULL ? (*ops->vmgetdesc)(vmi, vcpu, num, desc) : ENXIO)
#define	VMSETDESC(vmi, vcpu, num, desc)					\
	(ops != NULL ? (*ops->vmsetdesc)(vmi, vcpu, num, desc) : ENXIO)
#define	VMINJECT(vmi, vcpu, type, vec, ec, ecv)				\
	(ops != NULL ? (*ops->vminject)(vmi, vcpu, type, vec, ec, ecv) : ENXIO)
#define	VMGETCAP(vmi, vcpu, num, retval)				\
	(ops != NULL ? (*ops->vmgetcap)(vmi, vcpu, num, retval) : ENXIO)
#define	VMSETCAP(vmi, vcpu, num, val)					\
	(ops != NULL ? (*ops->vmsetcap)(vmi, vcpu, num, val) : ENXIO)

/* Virtual local APIC lifecycle hooks. */
#define	VLAPIC_INIT(vmi, vcpu)						\
	(ops != NULL ? (*ops->vlapic_init)(vmi, vcpu) : NULL)
#define	VLAPIC_CLEANUP(vmi, vlapic)					\
	(ops != NULL ? (*ops->vlapic_cleanup)(vmi, vlapic) : NULL)
169
170#define	fpu_start_emulating()	load_cr0(rcr0() | CR0_TS)
171#define	fpu_stop_emulating()	clts()
172
173static MALLOC_DEFINE(M_VM, "vm", "vm");
174CTASSERT(VMM_MSR_NUM <= 64);	/* msr_mask can keep track of up to 64 msrs */
175
176/* statistics */
177static VMM_STAT(VCPU_TOTAL_RUNTIME, "vcpu total runtime");
178
179SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW, NULL, NULL);
180
181static int vmm_ipinum;
182SYSCTL_INT(_hw_vmm, OID_AUTO, ipinum, CTLFLAG_RD, &vmm_ipinum, 0,
183    "IPI vector used for vcpu notifications");
184
185static void vm_deactivate_cpu(struct vm *vm, int vcpuid);
186
187static void
188vcpu_cleanup(struct vm *vm, int i)
189{
190	struct vcpu *vcpu = &vm->vcpu[i];
191
192	VLAPIC_CLEANUP(vm->cookie, vcpu->vlapic);
193	vmm_stat_free(vcpu->stats);
194	fpu_save_area_free(vcpu->guestfpu);
195}
196
197static void
198vcpu_init(struct vm *vm, uint32_t vcpu_id)
199{
200	struct vcpu *vcpu;
201
202	vcpu = &vm->vcpu[vcpu_id];
203
204	vcpu_lock_init(vcpu);
205	vcpu->hostcpu = NOCPU;
206	vcpu->vcpuid = vcpu_id;
207	vcpu->vlapic = VLAPIC_INIT(vm->cookie, vcpu_id);
208	vm_set_x2apic_state(vm, vcpu_id, X2APIC_ENABLED);
209	vcpu->guestfpu = fpu_save_area_alloc();
210	fpu_save_area_reset(vcpu->guestfpu);
211	vcpu->stats = vmm_stat_alloc();
212}
213
214struct vm_exit *
215vm_exitinfo(struct vm *vm, int cpuid)
216{
217	struct vcpu *vcpu;
218
219	if (cpuid < 0 || cpuid >= VM_MAXCPU)
220		panic("vm_exitinfo: invalid cpuid %d", cpuid);
221
222	vcpu = &vm->vcpu[cpuid];
223
224	return (&vcpu->exitinfo);
225}
226
/*
 * Resume callback published through 'vmm_resume_p' by vmm_init(); it
 * simply forwards to the backend's resume hook (no-op without a backend).
 */
static void
vmm_resume(void)
{

	VMM_RESUME();
}
232
233static int
234vmm_init(void)
235{
236	int error;
237
238	vmm_host_state_init();
239
240	vmm_ipinum = vmm_ipi_alloc();
241	if (vmm_ipinum == 0)
242		vmm_ipinum = IPI_AST;
243
244	error = vmm_mem_init();
245	if (error)
246		return (error);
247
248	if (vmm_is_intel())
249		ops = &vmm_ops_intel;
250	else if (vmm_is_amd())
251		ops = &vmm_ops_amd;
252	else
253		return (ENXIO);
254
255	vmm_msr_init();
256	vmm_resume_p = vmm_resume;
257
258	return (VMM_INIT(vmm_ipinum));
259}
260
261static int
262vmm_handler(module_t mod, int what, void *arg)
263{
264	int error;
265
266	switch (what) {
267	case MOD_LOAD:
268		vmmdev_init();
269		iommu_init();
270		error = vmm_init();
271		if (error == 0)
272			vmm_initialized = 1;
273		break;
274	case MOD_UNLOAD:
275		error = vmmdev_cleanup();
276		if (error == 0) {
277			vmm_resume_p = NULL;
278			iommu_cleanup();
279			if (vmm_ipinum != IPI_AST)
280				vmm_ipi_free(vmm_ipinum);
281			error = VMM_CLEANUP();
282			/*
283			 * Something bad happened - prevent new
284			 * VMs from being created
285			 */
286			if (error)
287				vmm_initialized = 0;
288		}
289		break;
290	default:
291		error = 0;
292		break;
293	}
294	return (error);
295}
296
297static moduledata_t vmm_kmod = {
298	"vmm",
299	vmm_handler,
300	NULL
301};
302
303/*
304 * vmm initialization has the following dependencies:
305 *
306 * - iommu initialization must happen after the pci passthru driver has had
307 *   a chance to attach to any passthru devices (after SI_SUB_CONFIGURE).
308 *
309 * - VT-x initialization requires smp_rendezvous() and therefore must happen
310 *   after SMP is fully functional (after SI_SUB_SMP).
311 */
312DECLARE_MODULE(vmm, vmm_kmod, SI_SUB_SMP + 1, SI_ORDER_ANY);
313MODULE_VERSION(vmm, 1);
314
315int
316vm_create(const char *name, struct vm **retvm)
317{
318	int i;
319	struct vm *vm;
320	struct vmspace *vmspace;
321
322	const int BSP = 0;
323
324	/*
325	 * If vmm.ko could not be successfully initialized then don't attempt
326	 * to create the virtual machine.
327	 */
328	if (!vmm_initialized)
329		return (ENXIO);
330
331	if (name == NULL || strlen(name) >= VM_MAX_NAMELEN)
332		return (EINVAL);
333
334	vmspace = VMSPACE_ALLOC(VM_MIN_ADDRESS, VM_MAXUSER_ADDRESS);
335	if (vmspace == NULL)
336		return (ENOMEM);
337
338	vm = malloc(sizeof(struct vm), M_VM, M_WAITOK | M_ZERO);
339	strcpy(vm->name, name);
340	vm->vmspace = vmspace;
341	mtx_init(&vm->rendezvous_mtx, "vm rendezvous lock", 0, MTX_DEF);
342	vm->cookie = VMINIT(vm, vmspace_pmap(vmspace));
343	vm->vioapic = vioapic_init(vm);
344	vm->vhpet = vhpet_init(vm);
345
346	for (i = 0; i < VM_MAXCPU; i++) {
347		vcpu_init(vm, i);
348		guest_msrs_init(vm, i);
349	}
350
351	vm_activate_cpu(vm, BSP);
352
353	*retvm = vm;
354	return (0);
355}
356
357static void
358vm_free_mem_seg(struct vm *vm, struct mem_seg *seg)
359{
360
361	if (seg->object != NULL)
362		vmm_mem_free(vm->vmspace, seg->gpa, seg->len);
363
364	bzero(seg, sizeof(*seg));
365}
366
367void
368vm_destroy(struct vm *vm)
369{
370	int i;
371
372	ppt_unassign_all(vm);
373
374	if (vm->iommu != NULL)
375		iommu_destroy_domain(vm->iommu);
376
377	vhpet_cleanup(vm->vhpet);
378	vioapic_cleanup(vm->vioapic);
379
380	for (i = 0; i < vm->num_mem_segs; i++)
381		vm_free_mem_seg(vm, &vm->mem_segs[i]);
382
383	vm->num_mem_segs = 0;
384
385	for (i = 0; i < VM_MAXCPU; i++)
386		vcpu_cleanup(vm, i);
387
388	VMSPACE_FREE(vm->vmspace);
389
390	VMCLEANUP(vm->cookie);
391
392	free(vm, M_VM);
393}
394
395const char *
396vm_name(struct vm *vm)
397{
398	return (vm->name);
399}
400
401int
402vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
403{
404	vm_object_t obj;
405
406	if ((obj = vmm_mmio_alloc(vm->vmspace, gpa, len, hpa)) == NULL)
407		return (ENOMEM);
408	else
409		return (0);
410}
411
412int
413vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len)
414{
415
416	vmm_mmio_free(vm->vmspace, gpa, len);
417	return (0);
418}
419
420boolean_t
421vm_mem_allocated(struct vm *vm, vm_paddr_t gpa)
422{
423	int i;
424	vm_paddr_t gpabase, gpalimit;
425
426	for (i = 0; i < vm->num_mem_segs; i++) {
427		gpabase = vm->mem_segs[i].gpa;
428		gpalimit = gpabase + vm->mem_segs[i].len;
429		if (gpa >= gpabase && gpa < gpalimit)
430			return (TRUE);		/* 'gpa' is regular memory */
431	}
432
433	if (ppt_is_mmio(vm, gpa))
434		return (TRUE);			/* 'gpa' is pci passthru mmio */
435
436	return (FALSE);
437}
438
439int
440vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len)
441{
442	int available, allocated;
443	struct mem_seg *seg;
444	vm_object_t object;
445	vm_paddr_t g;
446
447	if ((gpa & PAGE_MASK) || (len & PAGE_MASK) || len == 0)
448		return (EINVAL);
449
450	available = allocated = 0;
451	g = gpa;
452	while (g < gpa + len) {
453		if (vm_mem_allocated(vm, g))
454			allocated++;
455		else
456			available++;
457
458		g += PAGE_SIZE;
459	}
460
461	/*
462	 * If there are some allocated and some available pages in the address
463	 * range then it is an error.
464	 */
465	if (allocated && available)
466		return (EINVAL);
467
468	/*
469	 * If the entire address range being requested has already been
470	 * allocated then there isn't anything more to do.
471	 */
472	if (allocated && available == 0)
473		return (0);
474
475	if (vm->num_mem_segs >= VM_MAX_MEMORY_SEGMENTS)
476		return (E2BIG);
477
478	seg = &vm->mem_segs[vm->num_mem_segs];
479
480	if ((object = vmm_mem_alloc(vm->vmspace, gpa, len)) == NULL)
481		return (ENOMEM);
482
483	seg->gpa = gpa;
484	seg->len = len;
485	seg->object = object;
486	seg->wired = FALSE;
487
488	vm->num_mem_segs++;
489
490	return (0);
491}
492
493static void
494vm_gpa_unwire(struct vm *vm)
495{
496	int i, rv;
497	struct mem_seg *seg;
498
499	for (i = 0; i < vm->num_mem_segs; i++) {
500		seg = &vm->mem_segs[i];
501		if (!seg->wired)
502			continue;
503
504		rv = vm_map_unwire(&vm->vmspace->vm_map,
505				   seg->gpa, seg->gpa + seg->len,
506				   VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
507		KASSERT(rv == KERN_SUCCESS, ("vm(%s) memory segment "
508		    "%#lx/%ld could not be unwired: %d",
509		    vm_name(vm), seg->gpa, seg->len, rv));
510
511		seg->wired = FALSE;
512	}
513}
514
515static int
516vm_gpa_wire(struct vm *vm)
517{
518	int i, rv;
519	struct mem_seg *seg;
520
521	for (i = 0; i < vm->num_mem_segs; i++) {
522		seg = &vm->mem_segs[i];
523		if (seg->wired)
524			continue;
525
526		/* XXX rlimits? */
527		rv = vm_map_wire(&vm->vmspace->vm_map,
528				 seg->gpa, seg->gpa + seg->len,
529				 VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
530		if (rv != KERN_SUCCESS)
531			break;
532
533		seg->wired = TRUE;
534	}
535
536	if (i < vm->num_mem_segs) {
537		/*
538		 * Undo the wiring before returning an error.
539		 */
540		vm_gpa_unwire(vm);
541		return (EAGAIN);
542	}
543
544	return (0);
545}
546
547static void
548vm_iommu_modify(struct vm *vm, boolean_t map)
549{
550	int i, sz;
551	vm_paddr_t gpa, hpa;
552	struct mem_seg *seg;
553	void *vp, *cookie, *host_domain;
554
555	sz = PAGE_SIZE;
556	host_domain = iommu_host_domain();
557
558	for (i = 0; i < vm->num_mem_segs; i++) {
559		seg = &vm->mem_segs[i];
560		KASSERT(seg->wired, ("vm(%s) memory segment %#lx/%ld not wired",
561		    vm_name(vm), seg->gpa, seg->len));
562
563		gpa = seg->gpa;
564		while (gpa < seg->gpa + seg->len) {
565			vp = vm_gpa_hold(vm, gpa, PAGE_SIZE, VM_PROT_WRITE,
566					 &cookie);
567			KASSERT(vp != NULL, ("vm(%s) could not map gpa %#lx",
568			    vm_name(vm), gpa));
569
570			vm_gpa_release(cookie);
571
572			hpa = DMAP_TO_PHYS((uintptr_t)vp);
573			if (map) {
574				iommu_create_mapping(vm->iommu, gpa, hpa, sz);
575				iommu_remove_mapping(host_domain, hpa, sz);
576			} else {
577				iommu_remove_mapping(vm->iommu, gpa, sz);
578				iommu_create_mapping(host_domain, hpa, hpa, sz);
579			}
580
581			gpa += PAGE_SIZE;
582		}
583	}
584
585	/*
586	 * Invalidate the cached translations associated with the domain
587	 * from which pages were removed.
588	 */
589	if (map)
590		iommu_invalidate_tlb(host_domain);
591	else
592		iommu_invalidate_tlb(vm->iommu);
593}
594
595#define	vm_iommu_unmap(vm)	vm_iommu_modify((vm), FALSE)
596#define	vm_iommu_map(vm)	vm_iommu_modify((vm), TRUE)
597
/*
 * Detach passthru device bus/slot/func from 'vm'.  When it was the last
 * one, the iommu mappings are undone and guest memory is unwired.
 * Returns the error from ppt_unassign_device(), or 0.
 */
int
vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func)
{
	int rc;

	if ((rc = ppt_unassign_device(vm, bus, slot, func)) != 0)
		return (rc);

	if (ppt_num_devices(vm) != 0)
		return (0);

	/* no passthru devices left */
	vm_iommu_unmap(vm);
	vm_gpa_unwire(vm);
	return (0);
}
613
614int
615vm_assign_pptdev(struct vm *vm, int bus, int slot, int func)
616{
617	int error;
618	vm_paddr_t maxaddr;
619
620	/*
621	 * Virtual machines with pci passthru devices get special treatment:
622	 * - the guest physical memory is wired
623	 * - the iommu is programmed to do the 'gpa' to 'hpa' translation
624	 *
625	 * We need to do this before the first pci passthru device is attached.
626	 */
627	if (ppt_num_devices(vm) == 0) {
628		KASSERT(vm->iommu == NULL,
629		    ("vm_assign_pptdev: iommu must be NULL"));
630		maxaddr = vmm_mem_maxaddr();
631		vm->iommu = iommu_create_domain(maxaddr);
632
633		error = vm_gpa_wire(vm);
634		if (error)
635			return (error);
636
637		vm_iommu_map(vm);
638	}
639
640	error = ppt_assign_device(vm, bus, slot, func);
641	return (error);
642}
643
644void *
645vm_gpa_hold(struct vm *vm, vm_paddr_t gpa, size_t len, int reqprot,
646	    void **cookie)
647{
648	int count, pageoff;
649	vm_page_t m;
650
651	pageoff = gpa & PAGE_MASK;
652	if (len > PAGE_SIZE - pageoff)
653		panic("vm_gpa_hold: invalid gpa/len: 0x%016lx/%lu", gpa, len);
654
655	count = vm_fault_quick_hold_pages(&vm->vmspace->vm_map,
656	    trunc_page(gpa), PAGE_SIZE, reqprot, &m, 1);
657
658	if (count == 1) {
659		*cookie = m;
660		return ((void *)(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)) + pageoff));
661	} else {
662		*cookie = NULL;
663		return (NULL);
664	}
665}
666
667void
668vm_gpa_release(void *cookie)
669{
670	vm_page_t m = cookie;
671
672	vm_page_lock(m);
673	vm_page_unhold(m);
674	vm_page_unlock(m);
675}
676
677int
678vm_gpabase2memseg(struct vm *vm, vm_paddr_t gpabase,
679		  struct vm_memory_segment *seg)
680{
681	int i;
682
683	for (i = 0; i < vm->num_mem_segs; i++) {
684		if (gpabase == vm->mem_segs[i].gpa) {
685			seg->gpa = vm->mem_segs[i].gpa;
686			seg->len = vm->mem_segs[i].len;
687			seg->wired = vm->mem_segs[i].wired;
688			return (0);
689		}
690	}
691	return (-1);
692}
693
694int
695vm_get_memobj(struct vm *vm, vm_paddr_t gpa, size_t len,
696	      vm_offset_t *offset, struct vm_object **object)
697{
698	int i;
699	size_t seg_len;
700	vm_paddr_t seg_gpa;
701	vm_object_t seg_obj;
702
703	for (i = 0; i < vm->num_mem_segs; i++) {
704		if ((seg_obj = vm->mem_segs[i].object) == NULL)
705			continue;
706
707		seg_gpa = vm->mem_segs[i].gpa;
708		seg_len = vm->mem_segs[i].len;
709
710		if (gpa >= seg_gpa && gpa < seg_gpa + seg_len) {
711			*offset = gpa - seg_gpa;
712			*object = seg_obj;
713			vm_object_reference(seg_obj);
714			return (0);
715		}
716	}
717
718	return (EINVAL);
719}
720
721int
722vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval)
723{
724
725	if (vcpu < 0 || vcpu >= VM_MAXCPU)
726		return (EINVAL);
727
728	if (reg >= VM_REG_LAST)
729		return (EINVAL);
730
731	return (VMGETREG(vm->cookie, vcpu, reg, retval));
732}
733
734int
735vm_set_register(struct vm *vm, int vcpu, int reg, uint64_t val)
736{
737
738	if (vcpu < 0 || vcpu >= VM_MAXCPU)
739		return (EINVAL);
740
741	if (reg >= VM_REG_LAST)
742		return (EINVAL);
743
744	return (VMSETREG(vm->cookie, vcpu, reg, val));
745}
746
747static boolean_t
748is_descriptor_table(int reg)
749{
750
751	switch (reg) {
752	case VM_REG_GUEST_IDTR:
753	case VM_REG_GUEST_GDTR:
754		return (TRUE);
755	default:
756		return (FALSE);
757	}
758}
759
760static boolean_t
761is_segment_register(int reg)
762{
763
764	switch (reg) {
765	case VM_REG_GUEST_ES:
766	case VM_REG_GUEST_CS:
767	case VM_REG_GUEST_SS:
768	case VM_REG_GUEST_DS:
769	case VM_REG_GUEST_FS:
770	case VM_REG_GUEST_GS:
771	case VM_REG_GUEST_TR:
772	case VM_REG_GUEST_LDTR:
773		return (TRUE);
774	default:
775		return (FALSE);
776	}
777}
778
779int
780vm_get_seg_desc(struct vm *vm, int vcpu, int reg,
781		struct seg_desc *desc)
782{
783
784	if (vcpu < 0 || vcpu >= VM_MAXCPU)
785		return (EINVAL);
786
787	if (!is_segment_register(reg) && !is_descriptor_table(reg))
788		return (EINVAL);
789
790	return (VMGETDESC(vm->cookie, vcpu, reg, desc));
791}
792
793int
794vm_set_seg_desc(struct vm *vm, int vcpu, int reg,
795		struct seg_desc *desc)
796{
797	if (vcpu < 0 || vcpu >= VM_MAXCPU)
798		return (EINVAL);
799
800	if (!is_segment_register(reg) && !is_descriptor_table(reg))
801		return (EINVAL);
802
803	return (VMSETDESC(vm->cookie, vcpu, reg, desc));
804}
805
806static void
807restore_guest_fpustate(struct vcpu *vcpu)
808{
809
810	/* flush host state to the pcb */
811	fpuexit(curthread);
812
813	/* restore guest FPU state */
814	fpu_stop_emulating();
815	fpurestore(vcpu->guestfpu);
816
817	/*
818	 * The FPU is now "dirty" with the guest's state so turn on emulation
819	 * to trap any access to the FPU by the host.
820	 */
821	fpu_start_emulating();
822}
823
824static void
825save_guest_fpustate(struct vcpu *vcpu)
826{
827
828	if ((rcr0() & CR0_TS) == 0)
829		panic("fpu emulation not enabled in host!");
830
831	/* save guest FPU state */
832	fpu_stop_emulating();
833	fpusave(vcpu->guestfpu);
834	fpu_start_emulating();
835}
836
837static VMM_STAT(VCPU_IDLE_TICKS, "number of ticks vcpu was idle");
838
839static int
840vcpu_set_state_locked(struct vcpu *vcpu, enum vcpu_state newstate)
841{
842	int error;
843
844	vcpu_assert_locked(vcpu);
845
846	/*
847	 * The following state transitions are allowed:
848	 * IDLE -> FROZEN -> IDLE
849	 * FROZEN -> RUNNING -> FROZEN
850	 * FROZEN -> SLEEPING -> FROZEN
851	 */
852	switch (vcpu->state) {
853	case VCPU_IDLE:
854	case VCPU_RUNNING:
855	case VCPU_SLEEPING:
856		error = (newstate != VCPU_FROZEN);
857		break;
858	case VCPU_FROZEN:
859		error = (newstate == VCPU_FROZEN);
860		break;
861	default:
862		error = 1;
863		break;
864	}
865
866	if (error == 0)
867		vcpu->state = newstate;
868	else
869		error = EBUSY;
870
871	return (error);
872}
873
874static void
875vcpu_require_state(struct vm *vm, int vcpuid, enum vcpu_state newstate)
876{
877	int error;
878
879	if ((error = vcpu_set_state(vm, vcpuid, newstate)) != 0)
880		panic("Error %d setting state to %d\n", error, newstate);
881}
882
883static void
884vcpu_require_state_locked(struct vcpu *vcpu, enum vcpu_state newstate)
885{
886	int error;
887
888	if ((error = vcpu_set_state_locked(vcpu, newstate)) != 0)
889		panic("Error %d setting state to %d", error, newstate);
890}
891
892static void
893vm_set_rendezvous_func(struct vm *vm, vm_rendezvous_func_t func)
894{
895
896	KASSERT(mtx_owned(&vm->rendezvous_mtx), ("rendezvous_mtx not locked"));
897
898	/*
899	 * Update 'rendezvous_func' and execute a write memory barrier to
900	 * ensure that it is visible across all host cpus. This is not needed
901	 * for correctness but it does ensure that all the vcpus will notice
902	 * that the rendezvous is requested immediately.
903	 */
904	vm->rendezvous_func = func;
905	wmb();
906}
907
/*
 * Trace helper for rendezvous code: a negative 'vcpuid' means there is no
 * vcpu context, so the message is logged per-vm instead of per-vcpu.
 */
#define	RENDEZVOUS_CTR0(vm, vcpuid, fmt)				\
	do {								\
		if ((vcpuid) < 0)					\
			VM_CTR0(vm, fmt);				\
		else							\
			VCPU_CTR0(vm, vcpuid, fmt);			\
	} while (0)
915
916static void
917vm_handle_rendezvous(struct vm *vm, int vcpuid)
918{
919
920	KASSERT(vcpuid == -1 || (vcpuid >= 0 && vcpuid < VM_MAXCPU),
921	    ("vm_handle_rendezvous: invalid vcpuid %d", vcpuid));
922
923	mtx_lock(&vm->rendezvous_mtx);
924	while (vm->rendezvous_func != NULL) {
925		/* 'rendezvous_req_cpus' must be a subset of 'active_cpus' */
926		CPU_AND(&vm->rendezvous_req_cpus, &vm->active_cpus);
927
928		if (vcpuid != -1 &&
929		    CPU_ISSET(vcpuid, &vm->rendezvous_req_cpus) &&
930		    !CPU_ISSET(vcpuid, &vm->rendezvous_done_cpus)) {
931			VCPU_CTR0(vm, vcpuid, "Calling rendezvous func");
932			(*vm->rendezvous_func)(vm, vcpuid, vm->rendezvous_arg);
933			CPU_SET(vcpuid, &vm->rendezvous_done_cpus);
934		}
935		if (CPU_CMP(&vm->rendezvous_req_cpus,
936		    &vm->rendezvous_done_cpus) == 0) {
937			VCPU_CTR0(vm, vcpuid, "Rendezvous completed");
938			vm_set_rendezvous_func(vm, NULL);
939			wakeup(&vm->rendezvous_func);
940			break;
941		}
942		RENDEZVOUS_CTR0(vm, vcpuid, "Wait for rendezvous completion");
943		mtx_sleep(&vm->rendezvous_func, &vm->rendezvous_mtx, 0,
944		    "vmrndv", 0);
945	}
946	mtx_unlock(&vm->rendezvous_mtx);
947}
948
949/*
950 * Emulate a guest 'hlt' by sleeping until the vcpu is ready to run.
951 */
952static int
953vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled, bool *retu)
954{
955	struct vm_exit *vmexit;
956	struct vcpu *vcpu;
957	int t, timo, spindown;
958
959	vcpu = &vm->vcpu[vcpuid];
960	spindown = 0;
961
962	vcpu_lock(vcpu);
963
964	/*
965	 * Do a final check for pending NMI or interrupts before
966	 * really putting this thread to sleep.
967	 *
968	 * These interrupts could have happened any time after we
969	 * returned from VMRUN() and before we grabbed the vcpu lock.
970	 */
971	if (!vm_nmi_pending(vm, vcpuid) &&
972	    (intr_disabled || !vlapic_pending_intr(vcpu->vlapic, NULL))) {
973		t = ticks;
974		vcpu_require_state_locked(vcpu, VCPU_SLEEPING);
975		if (vlapic_enabled(vcpu->vlapic)) {
976			/*
977			 * XXX msleep_spin() is not interruptible so use the
978			 * 'timo' to put an upper bound on the sleep time.
979			 */
980			timo = hz;
981			msleep_spin(vcpu, &vcpu->mtx, "vmidle", timo);
982		} else {
983			/*
984			 * Spindown the vcpu if the apic is disabled and it
985			 * had entered the halted state.
986			 */
987			spindown = 1;
988		}
989		vcpu_require_state_locked(vcpu, VCPU_FROZEN);
990		vmm_stat_incr(vm, vcpuid, VCPU_IDLE_TICKS, ticks - t);
991	}
992	vcpu_unlock(vcpu);
993
994	/*
995	 * Since 'vm_deactivate_cpu()' grabs a sleep mutex we must call it
996	 * outside the confines of the vcpu spinlock.
997	 */
998	if (spindown) {
999		*retu = true;
1000		vmexit = vm_exitinfo(vm, vcpuid);
1001		vmexit->exitcode = VM_EXITCODE_SPINDOWN_CPU;
1002		vm_deactivate_cpu(vm, vcpuid);
1003		VCPU_CTR0(vm, vcpuid, "spinning down cpu");
1004	}
1005
1006	return (0);
1007}
1008
1009static int
1010vm_handle_paging(struct vm *vm, int vcpuid, bool *retu)
1011{
1012	int rv, ftype;
1013	struct vm_map *map;
1014	struct vcpu *vcpu;
1015	struct vm_exit *vme;
1016
1017	vcpu = &vm->vcpu[vcpuid];
1018	vme = &vcpu->exitinfo;
1019
1020	ftype = vme->u.paging.fault_type;
1021	KASSERT(ftype == VM_PROT_READ ||
1022	    ftype == VM_PROT_WRITE || ftype == VM_PROT_EXECUTE,
1023	    ("vm_handle_paging: invalid fault_type %d", ftype));
1024
1025	if (ftype == VM_PROT_READ || ftype == VM_PROT_WRITE) {
1026		rv = pmap_emulate_accessed_dirty(vmspace_pmap(vm->vmspace),
1027		    vme->u.paging.gpa, ftype);
1028		if (rv == 0)
1029			goto done;
1030	}
1031
1032	map = &vm->vmspace->vm_map;
1033	rv = vm_fault(map, vme->u.paging.gpa, ftype, VM_FAULT_NORMAL);
1034
1035	VCPU_CTR3(vm, vcpuid, "vm_handle_paging rv = %d, gpa = %#lx, "
1036	    "ftype = %d", rv, vme->u.paging.gpa, ftype);
1037
1038	if (rv != KERN_SUCCESS)
1039		return (EFAULT);
1040done:
1041	/* restart execution at the faulting instruction */
1042	vme->inst_length = 0;
1043
1044	return (0);
1045}
1046
1047static int
1048vm_handle_inst_emul(struct vm *vm, int vcpuid, bool *retu)
1049{
1050	struct vie *vie;
1051	struct vcpu *vcpu;
1052	struct vm_exit *vme;
1053	int error, inst_length;
1054	uint64_t rip, gla, gpa, cr3;
1055	mem_region_read_t mread;
1056	mem_region_write_t mwrite;
1057
1058	vcpu = &vm->vcpu[vcpuid];
1059	vme = &vcpu->exitinfo;
1060
1061	rip = vme->rip;
1062	inst_length = vme->inst_length;
1063
1064	gla = vme->u.inst_emul.gla;
1065	gpa = vme->u.inst_emul.gpa;
1066	cr3 = vme->u.inst_emul.cr3;
1067	vie = &vme->u.inst_emul.vie;
1068
1069	vie_init(vie);
1070
1071	/* Fetch, decode and emulate the faulting instruction */
1072	if (vmm_fetch_instruction(vm, vcpuid, rip, inst_length, cr3, vie) != 0)
1073		return (EFAULT);
1074
1075	if (vmm_decode_instruction(vm, vcpuid, gla, vie) != 0)
1076		return (EFAULT);
1077
1078	/* return to userland unless this is an in-kernel emulated device */
1079	if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) {
1080		mread = lapic_mmio_read;
1081		mwrite = lapic_mmio_write;
1082	} else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) {
1083		mread = vioapic_mmio_read;
1084		mwrite = vioapic_mmio_write;
1085	} else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) {
1086		mread = vhpet_mmio_read;
1087		mwrite = vhpet_mmio_write;
1088	} else {
1089		*retu = true;
1090		return (0);
1091	}
1092
1093	error = vmm_emulate_instruction(vm, vcpuid, gpa, vie, mread, mwrite,
1094	    retu);
1095
1096	return (error);
1097}
1098
1099int
1100vm_run(struct vm *vm, struct vm_run *vmrun)
1101{
1102	int error, vcpuid;
1103	struct vcpu *vcpu;
1104	struct pcb *pcb;
1105	uint64_t tscval, rip;
1106	struct vm_exit *vme;
1107	bool retu, intr_disabled;
1108	pmap_t pmap;
1109
1110	vcpuid = vmrun->cpuid;
1111
1112	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1113		return (EINVAL);
1114
1115	pmap = vmspace_pmap(vm->vmspace);
1116	vcpu = &vm->vcpu[vcpuid];
1117	vme = &vcpu->exitinfo;
1118	rip = vmrun->rip;
1119restart:
1120	critical_enter();
1121
1122	KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active),
1123	    ("vm_run: absurd pm_active"));
1124
1125	tscval = rdtsc();
1126
1127	pcb = PCPU_GET(curpcb);
1128	set_pcb_flags(pcb, PCB_FULL_IRET);
1129
1130	restore_guest_msrs(vm, vcpuid);
1131	restore_guest_fpustate(vcpu);
1132
1133	vcpu_require_state(vm, vcpuid, VCPU_RUNNING);
1134	vcpu->hostcpu = curcpu;
1135	error = VMRUN(vm->cookie, vcpuid, rip, pmap, &vm->rendezvous_func);
1136	vcpu->hostcpu = NOCPU;
1137	vcpu_require_state(vm, vcpuid, VCPU_FROZEN);
1138
1139	save_guest_fpustate(vcpu);
1140	restore_host_msrs(vm, vcpuid);
1141
1142	vmm_stat_incr(vm, vcpuid, VCPU_TOTAL_RUNTIME, rdtsc() - tscval);
1143
1144	critical_exit();
1145
1146	if (error == 0) {
1147		retu = false;
1148		switch (vme->exitcode) {
1149		case VM_EXITCODE_IOAPIC_EOI:
1150			vioapic_process_eoi(vm, vcpuid,
1151			    vme->u.ioapic_eoi.vector);
1152			break;
1153		case VM_EXITCODE_RENDEZVOUS:
1154			vm_handle_rendezvous(vm, vcpuid);
1155			error = 0;
1156			break;
1157		case VM_EXITCODE_HLT:
1158			intr_disabled = ((vme->u.hlt.rflags & PSL_I) == 0);
1159			error = vm_handle_hlt(vm, vcpuid, intr_disabled, &retu);
1160			break;
1161		case VM_EXITCODE_PAGING:
1162			error = vm_handle_paging(vm, vcpuid, &retu);
1163			break;
1164		case VM_EXITCODE_INST_EMUL:
1165			error = vm_handle_inst_emul(vm, vcpuid, &retu);
1166			break;
1167		default:
1168			retu = true;	/* handled in userland */
1169			break;
1170		}
1171	}
1172
1173	if (error == 0 && retu == false) {
1174		rip = vme->rip + vme->inst_length;
1175		goto restart;
1176	}
1177
1178	/* copy the exit information */
1179	bcopy(vme, &vmrun->vm_exit, sizeof(struct vm_exit));
1180	return (error);
1181}
1182
1183int
1184vm_inject_event(struct vm *vm, int vcpuid, int type,
1185		int vector, uint32_t code, int code_valid)
1186{
1187	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1188		return (EINVAL);
1189
1190	if ((type > VM_EVENT_NONE && type < VM_EVENT_MAX) == 0)
1191		return (EINVAL);
1192
1193	if (vector < 0 || vector > 255)
1194		return (EINVAL);
1195
1196	return (VMINJECT(vm->cookie, vcpuid, type, vector, code, code_valid));
1197}
1198
1199static VMM_STAT(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu");
1200
1201int
1202vm_inject_nmi(struct vm *vm, int vcpuid)
1203{
1204	struct vcpu *vcpu;
1205
1206	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1207		return (EINVAL);
1208
1209	vcpu = &vm->vcpu[vcpuid];
1210
1211	vcpu->nmi_pending = 1;
1212	vcpu_notify_event(vm, vcpuid, false);
1213	return (0);
1214}
1215
1216int
1217vm_nmi_pending(struct vm *vm, int vcpuid)
1218{
1219	struct vcpu *vcpu;
1220
1221	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1222		panic("vm_nmi_pending: invalid vcpuid %d", vcpuid);
1223
1224	vcpu = &vm->vcpu[vcpuid];
1225
1226	return (vcpu->nmi_pending);
1227}
1228
1229void
1230vm_nmi_clear(struct vm *vm, int vcpuid)
1231{
1232	struct vcpu *vcpu;
1233
1234	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1235		panic("vm_nmi_pending: invalid vcpuid %d", vcpuid);
1236
1237	vcpu = &vm->vcpu[vcpuid];
1238
1239	if (vcpu->nmi_pending == 0)
1240		panic("vm_nmi_clear: inconsistent nmi_pending state");
1241
1242	vcpu->nmi_pending = 0;
1243	vmm_stat_incr(vm, vcpuid, VCPU_NMI_COUNT, 1);
1244}
1245
1246int
1247vm_get_capability(struct vm *vm, int vcpu, int type, int *retval)
1248{
1249	if (vcpu < 0 || vcpu >= VM_MAXCPU)
1250		return (EINVAL);
1251
1252	if (type < 0 || type >= VM_CAP_MAX)
1253		return (EINVAL);
1254
1255	return (VMGETCAP(vm->cookie, vcpu, type, retval));
1256}
1257
1258int
1259vm_set_capability(struct vm *vm, int vcpu, int type, int val)
1260{
1261	if (vcpu < 0 || vcpu >= VM_MAXCPU)
1262		return (EINVAL);
1263
1264	if (type < 0 || type >= VM_CAP_MAX)
1265		return (EINVAL);
1266
1267	return (VMSETCAP(vm->cookie, vcpu, type, val));
1268}
1269
1270uint64_t *
1271vm_guest_msrs(struct vm *vm, int cpu)
1272{
1273	return (vm->vcpu[cpu].guest_msrs);
1274}
1275
1276struct vlapic *
1277vm_lapic(struct vm *vm, int cpu)
1278{
1279	return (vm->vcpu[cpu].vlapic);
1280}
1281
1282struct vioapic *
1283vm_ioapic(struct vm *vm)
1284{
1285
1286	return (vm->vioapic);
1287}
1288
1289struct vhpet *
1290vm_hpet(struct vm *vm)
1291{
1292
1293	return (vm->vhpet);
1294}
1295
1296boolean_t
1297vmm_is_pptdev(int bus, int slot, int func)
1298{
1299	int found, i, n;
1300	int b, s, f;
1301	char *val, *cp, *cp2;
1302
1303	/*
1304	 * XXX
1305	 * The length of an environment variable is limited to 128 bytes which
1306	 * puts an upper limit on the number of passthru devices that may be
1307	 * specified using a single environment variable.
1308	 *
1309	 * Work around this by scanning multiple environment variable
1310	 * names instead of a single one - yuck!
1311	 */
1312	const char *names[] = { "pptdevs", "pptdevs2", "pptdevs3", NULL };
1313
1314	/* set pptdevs="1/2/3 4/5/6 7/8/9 10/11/12" */
1315	found = 0;
1316	for (i = 0; names[i] != NULL && !found; i++) {
1317		cp = val = getenv(names[i]);
1318		while (cp != NULL && *cp != '\0') {
1319			if ((cp2 = strchr(cp, ' ')) != NULL)
1320				*cp2 = '\0';
1321
1322			n = sscanf(cp, "%d/%d/%d", &b, &s, &f);
1323			if (n == 3 && bus == b && slot == s && func == f) {
1324				found = 1;
1325				break;
1326			}
1327
1328			if (cp2 != NULL)
1329				*cp2++ = ' ';
1330
1331			cp = cp2;
1332		}
1333		freeenv(val);
1334	}
1335	return (found);
1336}
1337
1338void *
1339vm_iommu_domain(struct vm *vm)
1340{
1341
1342	return (vm->iommu);
1343}
1344
1345int
1346vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state newstate)
1347{
1348	int error;
1349	struct vcpu *vcpu;
1350
1351	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1352		panic("vm_set_run_state: invalid vcpuid %d", vcpuid);
1353
1354	vcpu = &vm->vcpu[vcpuid];
1355
1356	vcpu_lock(vcpu);
1357	error = vcpu_set_state_locked(vcpu, newstate);
1358	vcpu_unlock(vcpu);
1359
1360	return (error);
1361}
1362
1363enum vcpu_state
1364vcpu_get_state(struct vm *vm, int vcpuid, int *hostcpu)
1365{
1366	struct vcpu *vcpu;
1367	enum vcpu_state state;
1368
1369	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1370		panic("vm_get_run_state: invalid vcpuid %d", vcpuid);
1371
1372	vcpu = &vm->vcpu[vcpuid];
1373
1374	vcpu_lock(vcpu);
1375	state = vcpu->state;
1376	if (hostcpu != NULL)
1377		*hostcpu = vcpu->hostcpu;
1378	vcpu_unlock(vcpu);
1379
1380	return (state);
1381}
1382
1383void
1384vm_activate_cpu(struct vm *vm, int vcpuid)
1385{
1386
1387	KASSERT(vcpuid >= 0 && vcpuid < VM_MAXCPU,
1388	    ("vm_activate_cpu: invalid vcpuid %d", vcpuid));
1389	KASSERT(!CPU_ISSET(vcpuid, &vm->active_cpus),
1390	    ("vm_activate_cpu: vcpuid %d is already active", vcpuid));
1391
1392	VCPU_CTR0(vm, vcpuid, "activated");
1393	CPU_SET_ATOMIC(vcpuid, &vm->active_cpus);
1394}
1395
1396static void
1397vm_deactivate_cpu(struct vm *vm, int vcpuid)
1398{
1399
1400	KASSERT(vcpuid >= 0 && vcpuid < VM_MAXCPU,
1401	    ("vm_deactivate_cpu: invalid vcpuid %d", vcpuid));
1402	KASSERT(CPU_ISSET(vcpuid, &vm->active_cpus),
1403	    ("vm_deactivate_cpu: vcpuid %d is not active", vcpuid));
1404
1405	VCPU_CTR0(vm, vcpuid, "deactivated");
1406	CPU_CLR_ATOMIC(vcpuid, &vm->active_cpus);
1407
1408	/*
1409	 * If a vcpu rendezvous is in progress then it could be blocked
1410	 * on 'vcpuid' - unblock it before disappearing forever.
1411	 */
1412	mtx_lock(&vm->rendezvous_mtx);
1413	if (vm->rendezvous_func != NULL) {
1414		VCPU_CTR0(vm, vcpuid, "unblock rendezvous after deactivation");
1415		wakeup(&vm->rendezvous_func);
1416	}
1417	mtx_unlock(&vm->rendezvous_mtx);
1418}
1419
1420cpuset_t
1421vm_active_cpus(struct vm *vm)
1422{
1423
1424	return (vm->active_cpus);
1425}
1426
1427void *
1428vcpu_stats(struct vm *vm, int vcpuid)
1429{
1430
1431	return (vm->vcpu[vcpuid].stats);
1432}
1433
1434int
1435vm_get_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state *state)
1436{
1437	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1438		return (EINVAL);
1439
1440	*state = vm->vcpu[vcpuid].x2apic_state;
1441
1442	return (0);
1443}
1444
1445int
1446vm_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state)
1447{
1448	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1449		return (EINVAL);
1450
1451	if (state >= X2APIC_STATE_LAST)
1452		return (EINVAL);
1453
1454	vm->vcpu[vcpuid].x2apic_state = state;
1455
1456	vlapic_set_x2apic_state(vm, vcpuid, state);
1457
1458	return (0);
1459}
1460
1461/*
1462 * This function is called to ensure that a vcpu "sees" a pending event
1463 * as soon as possible:
1464 * - If the vcpu thread is sleeping then it is woken up.
1465 * - If the vcpu is running on a different host_cpu then an IPI will be directed
1466 *   to the host_cpu to cause the vcpu to trap into the hypervisor.
1467 */
1468void
1469vcpu_notify_event(struct vm *vm, int vcpuid, bool lapic_intr)
1470{
1471	int hostcpu;
1472	struct vcpu *vcpu;
1473
1474	vcpu = &vm->vcpu[vcpuid];
1475
1476	vcpu_lock(vcpu);
1477	hostcpu = vcpu->hostcpu;
1478	if (hostcpu == NOCPU) {
1479		if (vcpu->state == VCPU_SLEEPING)
1480			wakeup_one(vcpu);
1481	} else {
1482		if (vcpu->state != VCPU_RUNNING)
1483			panic("invalid vcpu state %d", vcpu->state);
1484		if (hostcpu != curcpu) {
1485			if (lapic_intr)
1486				vlapic_post_intr(vcpu->vlapic, hostcpu,
1487				    vmm_ipinum);
1488			else
1489				ipi_cpu(hostcpu, vmm_ipinum);
1490		}
1491	}
1492	vcpu_unlock(vcpu);
1493}
1494
1495struct vmspace *
1496vm_get_vmspace(struct vm *vm)
1497{
1498
1499	return (vm->vmspace);
1500}
1501
/*
 * Translate an APIC id into a vcpu id.
 *
 * 'vm' is not consulted; the value of 'apicid' is returned unchanged.
 */
int
vm_apicid2vcpuid(struct vm *vm, int apicid)
{
	int vcpuid;

	/*
	 * XXX the apic id is assumed to be numerically identical to the
	 * vcpu id, so the mapping is the identity.
	 */
	vcpuid = apicid;
	return (vcpuid);
}
1510
1511void
1512vm_smp_rendezvous(struct vm *vm, int vcpuid, cpuset_t dest,
1513    vm_rendezvous_func_t func, void *arg)
1514{
1515	int i;
1516
1517	/*
1518	 * Enforce that this function is called without any locks
1519	 */
1520	WITNESS_WARN(WARN_PANIC, NULL, "vm_smp_rendezvous");
1521	KASSERT(vcpuid == -1 || (vcpuid >= 0 && vcpuid < VM_MAXCPU),
1522	    ("vm_smp_rendezvous: invalid vcpuid %d", vcpuid));
1523
1524restart:
1525	mtx_lock(&vm->rendezvous_mtx);
1526	if (vm->rendezvous_func != NULL) {
1527		/*
1528		 * If a rendezvous is already in progress then we need to
1529		 * call the rendezvous handler in case this 'vcpuid' is one
1530		 * of the targets of the rendezvous.
1531		 */
1532		RENDEZVOUS_CTR0(vm, vcpuid, "Rendezvous already in progress");
1533		mtx_unlock(&vm->rendezvous_mtx);
1534		vm_handle_rendezvous(vm, vcpuid);
1535		goto restart;
1536	}
1537	KASSERT(vm->rendezvous_func == NULL, ("vm_smp_rendezvous: previous "
1538	    "rendezvous is still in progress"));
1539
1540	RENDEZVOUS_CTR0(vm, vcpuid, "Initiating rendezvous");
1541	vm->rendezvous_req_cpus = dest;
1542	CPU_ZERO(&vm->rendezvous_done_cpus);
1543	vm->rendezvous_arg = arg;
1544	vm_set_rendezvous_func(vm, func);
1545	mtx_unlock(&vm->rendezvous_mtx);
1546
1547	/*
1548	 * Wake up any sleeping vcpus and trigger a VM-exit in any running
1549	 * vcpus so they handle the rendezvous as soon as possible.
1550	 */
1551	for (i = 0; i < VM_MAXCPU; i++) {
1552		if (CPU_ISSET(i, &dest))
1553			vcpu_notify_event(vm, i, false);
1554	}
1555
1556	vm_handle_rendezvous(vm, vcpuid);
1557}
1558