/*-
 * Copyright (c) 2011 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD: stable/10/sys/amd64/vmm/vmm.c 270070 2014-08-17 00:52:07Z grehan $
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: stable/10/sys/amd64/vmm/vmm.c 270070 2014-08-17 00:52:07Z grehan $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/sysctl.h>
#include <sys/malloc.h>
#include <sys/pcpu.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/systm.h>

#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_extern.h>
#include <vm/vm_param.h>

#include <machine/cpu.h>
#include <machine/vm.h>
#include <machine/pcb.h>
#include <machine/smp.h>
#include <x86/psl.h>
#include <x86/apicreg.h>
#include <machine/vmparam.h>

#include <machine/vmm.h>
#include <machine/vmm_dev.h>
#include <machine/vmm_instruction_emul.h>

#include "vmm_ioport.h"
#include "vmm_ktr.h"
#include "vmm_host.h"
#include "vmm_mem.h"
#include "vmm_util.h"
#include "vatpic.h"
#include "vatpit.h"
#include "vhpet.h"
#include "vioapic.h"
#include "vlapic.h"
#include "vmm_msr.h"
#include "vmm_ipi.h"
#include "vmm_stat.h"
#include "vmm_lapic.h"

#include "io/ppt.h"
#include "io/iommu.h"

struct vlapic;

struct vcpu {
	int		flags;
	enum vcpu_state	state;
	struct mtx	mtx;
	int		hostcpu;	/* host cpuid this vcpu last ran on */
	uint64_t	guest_msrs[VMM_MSR_NUM];
	struct vlapic	*vlapic;
	int		 vcpuid;
	struct savefpu	*guestfpu;	/* guest fpu state */
	uint64_t	guest_xcr0;
	void		*stats;
	struct vm_exit	exitinfo;
	enum x2apic_state x2apic_state;
	int		nmi_pending;
	int		extint_pending;
	struct vm_exception exception;
	int		exception_pending;
};

#define	vcpu_lock_init(v)	mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN)
#define	vcpu_lock(v)		mtx_lock_spin(&((v)->mtx))
#define	vcpu_unlock(v)		mtx_unlock_spin(&((v)->mtx))
#define	vcpu_assert_locked(v)	mtx_assert(&((v)->mtx), MA_OWNED)

struct mem_seg {
	vm_paddr_t	gpa;
	size_t		len;
	boolean_t	wired;
	vm_object_t	object;
};
#define	VM_MAX_MEMORY_SEGMENTS	2

struct vm {
	void		*cookie;	/* processor-specific data */
	void		*iommu;		/* iommu-specific data */
	struct vhpet	*vhpet;		/* virtual HPET */
	struct vioapic	*vioapic;	/* virtual ioapic */
	struct vatpic	*vatpic;	/* virtual atpic */
	struct vatpit	*vatpit;	/* virtual atpit */
	struct vmspace	*vmspace;	/* guest's address space */
	struct vcpu	vcpu[VM_MAXCPU];
	int		num_mem_segs;
	struct mem_seg	mem_segs[VM_MAX_MEMORY_SEGMENTS];
	char		name[VM_MAX_NAMELEN];

	/*
	 * Set of active vcpus.
	 * An active vcpu is one that has been started implicitly (BSP) or
	 * explicitly (AP) by sending it a startup ipi.
	 */
	volatile cpuset_t active_cpus;

	struct mtx	rendezvous_mtx;
	cpuset_t	rendezvous_req_cpus;
	cpuset_t	rendezvous_done_cpus;
	void		*rendezvous_arg;
	vm_rendezvous_func_t rendezvous_func;

	int		suspend;
	volatile cpuset_t suspended_cpus;

	volatile cpuset_t halted_cpus;
};

static int vmm_initialized;

static struct vmm_ops *ops;
#define	VMM_INIT(num)	(ops != NULL ? (*ops->init)(num) : 0)
#define	VMM_CLEANUP()	(ops != NULL ? (*ops->cleanup)() : 0)
#define	VMM_RESUME()	(ops != NULL ? (*ops->resume)() : 0)

#define	VMINIT(vm, pmap) (ops != NULL ? (*ops->vminit)(vm, pmap): NULL)
#define	VMRUN(vmi, vcpu, rip, pmap, rptr, sptr) \
	(ops != NULL ? (*ops->vmrun)(vmi, vcpu, rip, pmap, rptr, sptr) : ENXIO)
#define	VMCLEANUP(vmi)	(ops != NULL ? (*ops->vmcleanup)(vmi) : NULL)
#define	VMSPACE_ALLOC(min, max) \
	(ops != NULL ? (*ops->vmspace_alloc)(min, max) : NULL)
#define	VMSPACE_FREE(vmspace) \
	(ops != NULL ? (*ops->vmspace_free)(vmspace) : ENXIO)
#define	VMGETREG(vmi, vcpu, num, retval)		\
	(ops != NULL ? (*ops->vmgetreg)(vmi, vcpu, num, retval) : ENXIO)
#define	VMSETREG(vmi, vcpu, num, val)		\
	(ops != NULL ? (*ops->vmsetreg)(vmi, vcpu, num, val) : ENXIO)
#define	VMGETDESC(vmi, vcpu, num, desc)		\
	(ops != NULL ? (*ops->vmgetdesc)(vmi, vcpu, num, desc) : ENXIO)
#define	VMSETDESC(vmi, vcpu, num, desc)		\
	(ops != NULL ? (*ops->vmsetdesc)(vmi, vcpu, num, desc) : ENXIO)
#define	VMGETCAP(vmi, vcpu, num, retval)	\
	(ops != NULL ? (*ops->vmgetcap)(vmi, vcpu, num, retval) : ENXIO)
#define	VMSETCAP(vmi, vcpu, num, val)		\
	(ops != NULL ? (*ops->vmsetcap)(vmi, vcpu, num, val) : ENXIO)
#define	VLAPIC_INIT(vmi, vcpu)			\
	(ops != NULL ? (*ops->vlapic_init)(vmi, vcpu) : NULL)
#define	VLAPIC_CLEANUP(vmi, vlapic)		\
	(ops != NULL ? (*ops->vlapic_cleanup)(vmi, vlapic) : NULL)

#define	fpu_start_emulating()	load_cr0(rcr0() | CR0_TS)
#define	fpu_stop_emulating()	clts()

static MALLOC_DEFINE(M_VM, "vm", "vm");
CTASSERT(VMM_MSR_NUM <= 64);	/* msr_mask can keep track of up to 64 msrs */

/* statistics */
static VMM_STAT(VCPU_TOTAL_RUNTIME, "vcpu total runtime");

SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW, NULL, NULL);

/*
 * Halt the guest if all vcpus are executing a HLT instruction with
 * interrupts disabled.
 */
static int halt_detection_enabled = 1;
TUNABLE_INT("hw.vmm.halt_detection", &halt_detection_enabled);
SYSCTL_INT(_hw_vmm, OID_AUTO, halt_detection, CTLFLAG_RDTUN,
    &halt_detection_enabled, 0,
    "Halt VM if all vcpus execute HLT with interrupts disabled");

static int vmm_ipinum;
SYSCTL_INT(_hw_vmm, OID_AUTO, ipinum, CTLFLAG_RD, &vmm_ipinum, 0,
    "IPI vector used for vcpu notifications");

static void
vcpu_cleanup(struct vm *vm, int i)
{
	struct vcpu *vcpu = &vm->vcpu[i];

	VLAPIC_CLEANUP(vm->cookie, vcpu->vlapic);
	vmm_stat_free(vcpu->stats);
	fpu_save_area_free(vcpu->guestfpu);
}

static void
vcpu_init(struct vm *vm, uint32_t vcpu_id)
{
	struct vcpu *vcpu;

	vcpu = &vm->vcpu[vcpu_id];

	vcpu_lock_init(vcpu);
	vcpu->hostcpu = NOCPU;
	vcpu->vcpuid = vcpu_id;
	vcpu->vlapic = VLAPIC_INIT(vm->cookie, vcpu_id);
	vm_set_x2apic_state(vm, vcpu_id, X2APIC_DISABLED);
	vcpu->guest_xcr0 = XFEATURE_ENABLED_X87;
	vcpu->guestfpu = fpu_save_area_alloc();
	fpu_save_area_reset(vcpu->guestfpu);
	vcpu->stats = vmm_stat_alloc();
}

struct vm_exit *
vm_exitinfo(struct vm *vm, int cpuid)
{
	struct vcpu *vcpu;

	if (cpuid < 0 || cpuid >= VM_MAXCPU)
		panic("vm_exitinfo: invalid cpuid %d", cpuid);

	vcpu = &vm->vcpu[cpuid];

	return (&vcpu->exitinfo);
}

static void
vmm_resume(void)
{
	VMM_RESUME();
}

static int
vmm_init(void)
{
	int error;

	vmm_host_state_init();

	vmm_ipinum = vmm_ipi_alloc();
	if (vmm_ipinum == 0)
		vmm_ipinum = IPI_AST;

	error = vmm_mem_init();
	if (error)
		return (error);

	if (vmm_is_intel())
		ops = &vmm_ops_intel;
	else if (vmm_is_amd())
		ops = &vmm_ops_amd;
	else
		return (ENXIO);

	vmm_msr_init();
	vmm_resume_p = vmm_resume;

	return (VMM_INIT(vmm_ipinum));
}

static int
vmm_handler(module_t mod, int what, void *arg)
{
	int error;

	switch (what) {
	case MOD_LOAD:
		vmmdev_init();
		if (ppt_avail_devices() > 0)
			iommu_init();
		error = vmm_init();
		if (error == 0)
			vmm_initialized = 1;
		break;
	case MOD_UNLOAD:
		error = vmmdev_cleanup();
		if (error == 0) {
			vmm_resume_p = NULL;
			iommu_cleanup();
			if (vmm_ipinum != IPI_AST)
				vmm_ipi_free(vmm_ipinum);
			error = VMM_CLEANUP();
			/*
			 * Something bad happened - prevent new
			 * VMs from being created
			 */
			if (error)
				vmm_initialized = 0;
		}
		break;
	default:
		error = 0;
		break;
	}
	return (error);
}

static moduledata_t vmm_kmod = {
	"vmm",
	vmm_handler,
	NULL
};

/*
 * vmm initialization has the following dependencies:
 *
 * - iommu initialization must happen after the pci passthru driver has had
 *   a chance to attach to any passthru devices (after SI_SUB_CONFIGURE).
 *
 * - VT-x initialization requires smp_rendezvous() and therefore must happen
 *   after SMP is fully functional (after SI_SUB_SMP).
 */
DECLARE_MODULE(vmm, vmm_kmod, SI_SUB_SMP + 1, SI_ORDER_ANY);
MODULE_VERSION(vmm, 1);

int
vm_create(const char *name, struct vm **retvm)
{
	int i;
	struct vm *vm;
	struct vmspace *vmspace;

	/*
	 * If vmm.ko could not be successfully initialized then don't attempt
	 * to create the virtual machine.
	 */
	if (!vmm_initialized)
		return (ENXIO);

	if (name == NULL || strlen(name) >= VM_MAX_NAMELEN)
		return (EINVAL);

	vmspace = VMSPACE_ALLOC(VM_MIN_ADDRESS, VM_MAXUSER_ADDRESS);
	if (vmspace == NULL)
		return (ENOMEM);

	vm = malloc(sizeof(struct vm), M_VM, M_WAITOK | M_ZERO);
	strcpy(vm->name, name);
	vm->vmspace = vmspace;
	mtx_init(&vm->rendezvous_mtx, "vm rendezvous lock", 0, MTX_DEF);
	vm->cookie = VMINIT(vm, vmspace_pmap(vmspace));
	vm->vioapic = vioapic_init(vm);
	vm->vhpet = vhpet_init(vm);
	vm->vatpic = vatpic_init(vm);
	vm->vatpit = vatpit_init(vm);

	for (i = 0; i < VM_MAXCPU; i++) {
		vcpu_init(vm, i);
		guest_msrs_init(vm, i);
	}

	*retvm = vm;
	return (0);
}

static void
vm_free_mem_seg(struct vm *vm, struct mem_seg *seg)
{

	if (seg->object != NULL)
		vmm_mem_free(vm->vmspace, seg->gpa, seg->len);

	bzero(seg, sizeof(*seg));
}

void
vm_destroy(struct vm *vm)
{
	int i;

	ppt_unassign_all(vm);

	if (vm->iommu != NULL)
		iommu_destroy_domain(vm->iommu);

	vatpit_cleanup(vm->vatpit);
	vhpet_cleanup(vm->vhpet);
	vatpic_cleanup(vm->vatpic);
	vioapic_cleanup(vm->vioapic);

	for (i = 0; i < vm->num_mem_segs; i++)
		vm_free_mem_seg(vm, &vm->mem_segs[i]);

	vm->num_mem_segs = 0;

	for (i = 0; i < VM_MAXCPU; i++)
		vcpu_cleanup(vm, i);

	VMSPACE_FREE(vm->vmspace);

	VMCLEANUP(vm->cookie);

	free(vm, M_VM);
}

const char *
vm_name(struct vm *vm)
{
	return (vm->name);
}

int
vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
{
	vm_object_t obj;

	if ((obj = vmm_mmio_alloc(vm->vmspace, gpa, len, hpa)) == NULL)
		return (ENOMEM);
	else
		return (0);
}

int
vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len)
{

	vmm_mmio_free(vm->vmspace, gpa, len);
	return (0);
}

boolean_t
vm_mem_allocated(struct vm *vm, vm_paddr_t gpa)
{
	int i;
	vm_paddr_t gpabase, gpalimit;

	for (i = 0; i < vm->num_mem_segs; i++) {
		gpabase = vm->mem_segs[i].gpa;
		gpalimit = gpabase + vm->mem_segs[i].len;
		if (gpa >= gpabase && gpa < gpalimit)
			return (TRUE);		/* 'gpa' is regular memory */
	}

	if (ppt_is_mmio(vm, gpa))
		return (TRUE);			/* 'gpa' is pci passthru mmio */

	return (FALSE);
}

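/*
 * Allocate a guest memory segment of 'len' bytes starting at guest physical
 * address 'gpa'. The range must be page-aligned and must not partially
 * overlap an existing segment; a range that is already fully allocated is
 * accepted as a no-op.
 */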
int
vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len)
{
	int available, allocated;
	struct mem_seg *seg;
	vm_object_t object;
	vm_paddr_t g;

	if ((gpa & PAGE_MASK) || (len & PAGE_MASK) || len == 0)
		return (EINVAL);

	available = allocated = 0;
	g = gpa;
	while (g < gpa + len) {
		if (vm_mem_allocated(vm, g))
			allocated++;
		else
			available++;

		g += PAGE_SIZE;
	}

	/*
	 * If there are some allocated and some available pages in the address
	 * range then it is an error.
	 */
	if (allocated && available)
		return (EINVAL);

	/*
	 * If the entire address range being requested has already been
	 * allocated then there isn't anything more to do.
	 */
	if (allocated && available == 0)
		return (0);

	if (vm->num_mem_segs >= VM_MAX_MEMORY_SEGMENTS)
		return (E2BIG);

	seg = &vm->mem_segs[vm->num_mem_segs];

	if ((object = vmm_mem_alloc(vm->vmspace, gpa, len)) == NULL)
		return (ENOMEM);

	seg->gpa = gpa;
	seg->len = len;
	seg->object = object;
	seg->wired = FALSE;

	vm->num_mem_segs++;

	return (0);
}

static void
vm_gpa_unwire(struct vm *vm)
{
	int i, rv;
	struct mem_seg *seg;

	for (i = 0; i < vm->num_mem_segs; i++) {
		seg = &vm->mem_segs[i];
		if (!seg->wired)
			continue;

		rv = vm_map_unwire(&vm->vmspace->vm_map,
				   seg->gpa, seg->gpa + seg->len,
				   VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
		KASSERT(rv == KERN_SUCCESS, ("vm(%s) memory segment "
		    "%#lx/%ld could not be unwired: %d",
		    vm_name(vm), seg->gpa, seg->len, rv));

		seg->wired = FALSE;
	}
}

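/*
 * Wire every guest memory segment so that the gpa->hpa mapping stays stable
 * while pci passthru (iommu) mappings reference it. On failure the segments
 * wired so far are unwound via vm_gpa_unwire().
 */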
static int
vm_gpa_wire(struct vm *vm)
{
	int i, rv;
	struct mem_seg *seg;

	for (i = 0; i < vm->num_mem_segs; i++) {
		seg = &vm->mem_segs[i];
		if (seg->wired)
			continue;

		/* XXX rlimits? */
		rv = vm_map_wire(&vm->vmspace->vm_map,
				 seg->gpa, seg->gpa + seg->len,
				 VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
		if (rv != KERN_SUCCESS)
			break;

		seg->wired = TRUE;
	}

	if (i < vm->num_mem_segs) {
		/*
		 * Undo the wiring before returning an error.
		 */
		vm_gpa_unwire(vm);
		return (EAGAIN);
	}

	return (0);
}

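/*
 * Add (map == TRUE) or remove (map == FALSE) iommu mappings for every page
 * of wired guest memory, moving each page between the host iommu domain and
 * the VM's iommu domain.
 */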
static void
vm_iommu_modify(struct vm *vm, boolean_t map)
{
	int i, sz;
	vm_paddr_t gpa, hpa;
	struct mem_seg *seg;
	void *vp, *cookie, *host_domain;

	sz = PAGE_SIZE;
	host_domain = iommu_host_domain();

	for (i = 0; i < vm->num_mem_segs; i++) {
		seg = &vm->mem_segs[i];
		KASSERT(seg->wired, ("vm(%s) memory segment %#lx/%ld not wired",
		    vm_name(vm), seg->gpa, seg->len));

		gpa = seg->gpa;
		while (gpa < seg->gpa + seg->len) {
			vp = vm_gpa_hold(vm, gpa, PAGE_SIZE, VM_PROT_WRITE,
					 &cookie);
			KASSERT(vp != NULL, ("vm(%s) could not map gpa %#lx",
			    vm_name(vm), gpa));

			vm_gpa_release(cookie);

			hpa = DMAP_TO_PHYS((uintptr_t)vp);
			if (map) {
				iommu_create_mapping(vm->iommu, gpa, hpa, sz);
				iommu_remove_mapping(host_domain, hpa, sz);
			} else {
				iommu_remove_mapping(vm->iommu, gpa, sz);
				iommu_create_mapping(host_domain, hpa, hpa, sz);
			}

			gpa += PAGE_SIZE;
		}
	}

	/*
	 * Invalidate the cached translations associated with the domain
	 * from which pages were removed.
	 */
	if (map)
		iommu_invalidate_tlb(host_domain);
	else
		iommu_invalidate_tlb(vm->iommu);
}

#define	vm_iommu_unmap(vm)	vm_iommu_modify((vm), FALSE)
#define	vm_iommu_map(vm)	vm_iommu_modify((vm), TRUE)

int
vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func)
{
	int error;

	error = ppt_unassign_device(vm, bus, slot, func);
	if (error)
		return (error);

	if (ppt_assigned_devices(vm) == 0) {
		vm_iommu_unmap(vm);
		vm_gpa_unwire(vm);
	}
	return (0);
}

int
vm_assign_pptdev(struct vm *vm, int bus, int slot, int func)
{
	int error;
	vm_paddr_t maxaddr;

	/*
	 * Virtual machines with pci passthru devices get special treatment:
	 * - the guest physical memory is wired
	 * - the iommu is programmed to do the 'gpa' to 'hpa' translation
	 *
	 * We need to do this before the first pci passthru device is attached.
	 */
	if (ppt_assigned_devices(vm) == 0) {
		KASSERT(vm->iommu == NULL,
		    ("vm_assign_pptdev: iommu must be NULL"));
		maxaddr = vmm_mem_maxaddr();
		vm->iommu = iommu_create_domain(maxaddr);

		error = vm_gpa_wire(vm);
		if (error)
			return (error);

		vm_iommu_map(vm);
	}

	error = ppt_assign_device(vm, bus, slot, func);
	return (error);
}

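/*
 * Hold the page backing guest physical address 'gpa' and return a host
 * kernel virtual address for it via the direct map. 'len' must not cross a
 * page boundary. The caller must release the page with vm_gpa_release().
 */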
void *
vm_gpa_hold(struct vm *vm, vm_paddr_t gpa, size_t len, int reqprot,
	    void **cookie)
{
	int count, pageoff;
	vm_page_t m;

	pageoff = gpa & PAGE_MASK;
	if (len > PAGE_SIZE - pageoff)
		panic("vm_gpa_hold: invalid gpa/len: 0x%016lx/%lu", gpa, len);

	count = vm_fault_quick_hold_pages(&vm->vmspace->vm_map,
	    trunc_page(gpa), PAGE_SIZE, reqprot, &m, 1);

	if (count == 1) {
		*cookie = m;
		return ((void *)(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)) + pageoff));
	} else {
		*cookie = NULL;
		return (NULL);
	}
}

void
vm_gpa_release(void *cookie)
{
	vm_page_t m = cookie;

	vm_page_lock(m);
	vm_page_unhold(m);
	vm_page_unlock(m);
}

int
vm_gpabase2memseg(struct vm *vm, vm_paddr_t gpabase,
		  struct vm_memory_segment *seg)
{
	int i;

	for (i = 0; i < vm->num_mem_segs; i++) {
		if (gpabase == vm->mem_segs[i].gpa) {
			seg->gpa = vm->mem_segs[i].gpa;
			seg->len = vm->mem_segs[i].len;
			seg->wired = vm->mem_segs[i].wired;
			return (0);
		}
	}
	return (-1);
}

int
vm_get_memobj(struct vm *vm, vm_paddr_t gpa, size_t len,
	      vm_offset_t *offset, struct vm_object **object)
{
	int i;
	size_t seg_len;
	vm_paddr_t seg_gpa;
	vm_object_t seg_obj;

	for (i = 0; i < vm->num_mem_segs; i++) {
		if ((seg_obj = vm->mem_segs[i].object) == NULL)
			continue;

		seg_gpa = vm->mem_segs[i].gpa;
		seg_len = vm->mem_segs[i].len;

		if (gpa >= seg_gpa && gpa < seg_gpa + seg_len) {
			*offset = gpa - seg_gpa;
			*object = seg_obj;
			vm_object_reference(seg_obj);
			return (0);
		}
	}

	return (EINVAL);
}

int
vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval)
{

	if (vcpu < 0 || vcpu >= VM_MAXCPU)
		return (EINVAL);

	if (reg >= VM_REG_LAST)
		return (EINVAL);

	return (VMGETREG(vm->cookie, vcpu, reg, retval));
}

int
vm_set_register(struct vm *vm, int vcpu, int reg, uint64_t val)
{

	if (vcpu < 0 || vcpu >= VM_MAXCPU)
		return (EINVAL);

	if (reg >= VM_REG_LAST)
		return (EINVAL);

	return (VMSETREG(vm->cookie, vcpu, reg, val));
}

static boolean_t
is_descriptor_table(int reg)
{

	switch (reg) {
	case VM_REG_GUEST_IDTR:
	case VM_REG_GUEST_GDTR:
		return (TRUE);
	default:
		return (FALSE);
	}
}

static boolean_t
is_segment_register(int reg)
{

	switch (reg) {
	case VM_REG_GUEST_ES:
	case VM_REG_GUEST_CS:
	case VM_REG_GUEST_SS:
	case VM_REG_GUEST_DS:
	case VM_REG_GUEST_FS:
	case VM_REG_GUEST_GS:
	case VM_REG_GUEST_TR:
	case VM_REG_GUEST_LDTR:
		return (TRUE);
	default:
		return (FALSE);
	}
}

int
vm_get_seg_desc(struct vm *vm, int vcpu, int reg,
		struct seg_desc *desc)
{

	if (vcpu < 0 || vcpu >= VM_MAXCPU)
		return (EINVAL);

	if (!is_segment_register(reg) && !is_descriptor_table(reg))
		return (EINVAL);

	return (VMGETDESC(vm->cookie, vcpu, reg, desc));
}

int
vm_set_seg_desc(struct vm *vm, int vcpu, int reg,
		struct seg_desc *desc)
{
	if (vcpu < 0 || vcpu >= VM_MAXCPU)
		return (EINVAL);

	if (!is_segment_register(reg) && !is_descriptor_table(reg))
		return (EINVAL);

	return (VMSETDESC(vm->cookie, vcpu, reg, desc));
}

static void
restore_guest_fpustate(struct vcpu *vcpu)
{

	/* flush host state to the pcb */
	fpuexit(curthread);

	/* restore guest FPU state */
	fpu_stop_emulating();
	fpurestore(vcpu->guestfpu);

	/* restore guest XCR0 if XSAVE is enabled in the host */
	if (rcr4() & CR4_XSAVE)
		load_xcr(0, vcpu->guest_xcr0);

	/*
	 * The FPU is now "dirty" with the guest's state so turn on emulation
	 * to trap any access to the FPU by the host.
	 */
	fpu_start_emulating();
}

static void
save_guest_fpustate(struct vcpu *vcpu)
{

	if ((rcr0() & CR0_TS) == 0)
		panic("fpu emulation not enabled in host!");

	/* save guest XCR0 and restore host XCR0 */
	if (rcr4() & CR4_XSAVE) {
		vcpu->guest_xcr0 = rxcr(0);
		load_xcr(0, vmm_get_host_xcr0());
	}

	/* save guest FPU state */
	fpu_stop_emulating();
	fpusave(vcpu->guestfpu);
	fpu_start_emulating();
}

static VMM_STAT(VCPU_IDLE_TICKS, "number of ticks vcpu was idle");

static int
vcpu_set_state_locked(struct vcpu *vcpu, enum vcpu_state newstate,
    bool from_idle)
{
	int error;

	vcpu_assert_locked(vcpu);

	/*
	 * State transitions from the vmmdev_ioctl() must always begin from
	 * the VCPU_IDLE state. This guarantees that there is only a single
	 * ioctl() operating on a vcpu at any point.
	 */
	if (from_idle) {
		while (vcpu->state != VCPU_IDLE)
			msleep_spin(&vcpu->state, &vcpu->mtx, "vmstat", hz);
	} else {
		KASSERT(vcpu->state != VCPU_IDLE, ("invalid transition from "
		    "vcpu idle state"));
	}

	if (vcpu->state == VCPU_RUNNING) {
		KASSERT(vcpu->hostcpu == curcpu, ("curcpu %d and hostcpu %d "
		    "mismatch for running vcpu", curcpu, vcpu->hostcpu));
	} else {
		KASSERT(vcpu->hostcpu == NOCPU, ("Invalid hostcpu %d for a "
		    "vcpu that is not running", vcpu->hostcpu));
	}

	/*
	 * The following state transitions are allowed:
	 * IDLE -> FROZEN -> IDLE
	 * FROZEN -> RUNNING -> FROZEN
	 * FROZEN -> SLEEPING -> FROZEN
	 */
	switch (vcpu->state) {
	case VCPU_IDLE:
	case VCPU_RUNNING:
	case VCPU_SLEEPING:
		error = (newstate != VCPU_FROZEN);
		break;
	case VCPU_FROZEN:
		error = (newstate == VCPU_FROZEN);
		break;
	default:
		error = 1;
		break;
	}

	if (error)
		return (EBUSY);

	vcpu->state = newstate;
	if (newstate == VCPU_RUNNING)
		vcpu->hostcpu = curcpu;
	else
		vcpu->hostcpu = NOCPU;

	if (newstate == VCPU_IDLE)
		wakeup(&vcpu->state);

	return (0);
}

static void
vcpu_require_state(struct vm *vm, int vcpuid, enum vcpu_state newstate)
{
	int error;

	if ((error = vcpu_set_state(vm, vcpuid, newstate, false)) != 0)
		panic("Error %d setting state to %d\n", error, newstate);
}

static void
vcpu_require_state_locked(struct vcpu *vcpu, enum vcpu_state newstate)
{
	int error;

	if ((error = vcpu_set_state_locked(vcpu, newstate, false)) != 0)
		panic("Error %d setting state to %d", error, newstate);
}

static void
vm_set_rendezvous_func(struct vm *vm, vm_rendezvous_func_t func)
{

	KASSERT(mtx_owned(&vm->rendezvous_mtx), ("rendezvous_mtx not locked"));

	/*
	 * Update 'rendezvous_func' and execute a write memory barrier to
	 * ensure that it is visible across all host cpus. This is not needed
	 * for correctness but it does ensure that all the vcpus will notice
	 * that the rendezvous is requested immediately.
	 */
	vm->rendezvous_func = func;
	wmb();
}

#define	RENDEZVOUS_CTR0(vm, vcpuid, fmt)				\
	do {								\
		if (vcpuid >= 0)					\
			VCPU_CTR0(vm, vcpuid, fmt);			\
		else							\
			VM_CTR0(vm, fmt);				\
	} while (0)

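/*
 * Run the rendezvous handler on behalf of 'vcpuid' (or -1 for a host thread
 * that is not a vcpu) and sleep until all targeted vcpus have done the same.
 */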
static void
vm_handle_rendezvous(struct vm *vm, int vcpuid)
{

	KASSERT(vcpuid == -1 || (vcpuid >= 0 && vcpuid < VM_MAXCPU),
	    ("vm_handle_rendezvous: invalid vcpuid %d", vcpuid));

	mtx_lock(&vm->rendezvous_mtx);
	while (vm->rendezvous_func != NULL) {
		/* 'rendezvous_req_cpus' must be a subset of 'active_cpus' */
		CPU_AND(&vm->rendezvous_req_cpus, &vm->active_cpus);

		if (vcpuid != -1 &&
		    CPU_ISSET(vcpuid, &vm->rendezvous_req_cpus) &&
		    !CPU_ISSET(vcpuid, &vm->rendezvous_done_cpus)) {
			VCPU_CTR0(vm, vcpuid, "Calling rendezvous func");
			(*vm->rendezvous_func)(vm, vcpuid, vm->rendezvous_arg);
			CPU_SET(vcpuid, &vm->rendezvous_done_cpus);
		}
		if (CPU_CMP(&vm->rendezvous_req_cpus,
		    &vm->rendezvous_done_cpus) == 0) {
			VCPU_CTR0(vm, vcpuid, "Rendezvous completed");
			vm_set_rendezvous_func(vm, NULL);
			wakeup(&vm->rendezvous_func);
			break;
		}
		RENDEZVOUS_CTR0(vm, vcpuid, "Wait for rendezvous completion");
		mtx_sleep(&vm->rendezvous_func, &vm->rendezvous_mtx, 0,
		    "vmrndv", 0);
	}
	mtx_unlock(&vm->rendezvous_mtx);
}

/*
 * Emulate a guest 'hlt' by sleeping until the vcpu is ready to run.
 */
static int
vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled, bool *retu)
{
	struct vcpu *vcpu;
	const char *wmesg;
	int t, vcpu_halted, vm_halted;

	KASSERT(!CPU_ISSET(vcpuid, &vm->halted_cpus), ("vcpu already halted"));

	vcpu = &vm->vcpu[vcpuid];
	vcpu_halted = 0;
	vm_halted = 0;

	vcpu_lock(vcpu);
	while (1) {
		/*
		 * Do a final check for pending NMI or interrupts before
		 * really putting this thread to sleep. Also check for
		 * software events that would cause this vcpu to wakeup.
		 *
		 * These interrupts/events could have happened after the
		 * vcpu returned from VMRUN() and before it acquired the
		 * vcpu lock above.
		 */
		if (vm->rendezvous_func != NULL || vm->suspend)
			break;
		if (vm_nmi_pending(vm, vcpuid))
			break;
		if (!intr_disabled) {
			if (vm_extint_pending(vm, vcpuid) ||
			    vlapic_pending_intr(vcpu->vlapic, NULL)) {
				break;
			}
		}

		/*
		 * Some Linux guests implement "halt" by having all vcpus
		 * execute HLT with interrupts disabled. 'halted_cpus' keeps
		 * track of the vcpus that have entered this state. When all
		 * vcpus enter the halted state the virtual machine is halted.
		 */
		if (intr_disabled) {
			wmesg = "vmhalt";
			VCPU_CTR0(vm, vcpuid, "Halted");
			if (!vcpu_halted && halt_detection_enabled) {
				vcpu_halted = 1;
				CPU_SET_ATOMIC(vcpuid, &vm->halted_cpus);
			}
			if (CPU_CMP(&vm->halted_cpus, &vm->active_cpus) == 0) {
				vm_halted = 1;
				break;
			}
		} else {
			wmesg = "vmidle";
		}

		t = ticks;
		vcpu_require_state_locked(vcpu, VCPU_SLEEPING);
		msleep_spin(vcpu, &vcpu->mtx, wmesg, 0);
		vcpu_require_state_locked(vcpu, VCPU_FROZEN);
		vmm_stat_incr(vm, vcpuid, VCPU_IDLE_TICKS, ticks - t);
	}

	if (vcpu_halted)
		CPU_CLR_ATOMIC(vcpuid, &vm->halted_cpus);

	vcpu_unlock(vcpu);

	if (vm_halted)
		vm_suspend(vm, VM_SUSPEND_HALT);

	return (0);
}

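/*
 * Handle a nested page fault: first try to emulate accessed/dirty bit
 * updates in the guest pmap and fall back to vm_fault() on the guest's
 * vmspace.
 */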
static int
vm_handle_paging(struct vm *vm, int vcpuid, bool *retu)
{
	int rv, ftype;
	struct vm_map *map;
	struct vcpu *vcpu;
	struct vm_exit *vme;

	vcpu = &vm->vcpu[vcpuid];
	vme = &vcpu->exitinfo;

	ftype = vme->u.paging.fault_type;
	KASSERT(ftype == VM_PROT_READ ||
	    ftype == VM_PROT_WRITE || ftype == VM_PROT_EXECUTE,
	    ("vm_handle_paging: invalid fault_type %d", ftype));

	if (ftype == VM_PROT_READ || ftype == VM_PROT_WRITE) {
		rv = pmap_emulate_accessed_dirty(vmspace_pmap(vm->vmspace),
		    vme->u.paging.gpa, ftype);
		if (rv == 0)
			goto done;
	}

	map = &vm->vmspace->vm_map;
	rv = vm_fault(map, vme->u.paging.gpa, ftype, VM_FAULT_NORMAL);

	VCPU_CTR3(vm, vcpuid, "vm_handle_paging rv = %d, gpa = %#lx, "
	    "ftype = %d", rv, vme->u.paging.gpa, ftype);

	if (rv != KERN_SUCCESS)
		return (EFAULT);
done:
	/* restart execution at the faulting instruction */
	vme->inst_length = 0;

	return (0);
}

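/*
 * Fetch and decode the instruction that faulted on 'gpa' and emulate it if
 * it targets an in-kernel emulated device (lapic, ioapic, hpet); otherwise
 * set 'retu' so the exit is completed in userspace.
 */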
static int
vm_handle_inst_emul(struct vm *vm, int vcpuid, bool *retu)
{
	struct vie *vie;
	struct vcpu *vcpu;
	struct vm_exit *vme;
	uint64_t gla, gpa;
	struct vm_guest_paging *paging;
	mem_region_read_t mread;
	mem_region_write_t mwrite;
	int error;

	vcpu = &vm->vcpu[vcpuid];
	vme = &vcpu->exitinfo;

	gla = vme->u.inst_emul.gla;
	gpa = vme->u.inst_emul.gpa;
	vie = &vme->u.inst_emul.vie;
	paging = &vme->u.inst_emul.paging;

	vie_init(vie);

	/* Fetch, decode and emulate the faulting instruction */
	error = vmm_fetch_instruction(vm, vcpuid, paging, vme->rip,
	    vme->inst_length, vie);
	if (error == 1)
		return (0);		/* Resume guest to handle page fault */
	else if (error == -1)
		return (EFAULT);
	else if (error != 0)
		panic("%s: vmm_fetch_instruction error %d", __func__, error);

	if (vmm_decode_instruction(vm, vcpuid, gla, paging->cpu_mode, vie) != 0)
		return (EFAULT);

	/* return to userland unless this is an in-kernel emulated device */
	if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) {
		mread = lapic_mmio_read;
		mwrite = lapic_mmio_write;
	} else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) {
		mread = vioapic_mmio_read;
		mwrite = vioapic_mmio_write;
	} else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) {
		mread = vhpet_mmio_read;
		mwrite = vhpet_mmio_write;
	} else {
		*retu = true;
		return (0);
	}

	error = vmm_emulate_instruction(vm, vcpuid, gpa, vie, mread, mwrite,
	    retu);

	return (error);
}

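/*
 * Park this vcpu until every active vcpu has seen the suspend request,
 * servicing any rendezvous that arrives in the meantime, and then return
 * to userspace.
 */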
static int
vm_handle_suspend(struct vm *vm, int vcpuid, bool *retu)
{
	int i, done;
	struct vcpu *vcpu;

	done = 0;
	vcpu = &vm->vcpu[vcpuid];

	CPU_SET_ATOMIC(vcpuid, &vm->suspended_cpus);

	/*
	 * Wait until all 'active_cpus' have suspended themselves.
	 *
	 * Since a VM may be suspended at any time including when one or
	 * more vcpus are doing a rendezvous we need to call the rendezvous
	 * handler while we are waiting to prevent a deadlock.
	 */
	vcpu_lock(vcpu);
	while (1) {
		if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) {
			VCPU_CTR0(vm, vcpuid, "All vcpus suspended");
			break;
		}

		if (vm->rendezvous_func == NULL) {
			VCPU_CTR0(vm, vcpuid, "Sleeping during suspend");
			vcpu_require_state_locked(vcpu, VCPU_SLEEPING);
			msleep_spin(vcpu, &vcpu->mtx, "vmsusp", hz);
			vcpu_require_state_locked(vcpu, VCPU_FROZEN);
		} else {
			VCPU_CTR0(vm, vcpuid, "Rendezvous during suspend");
			vcpu_unlock(vcpu);
			vm_handle_rendezvous(vm, vcpuid);
			vcpu_lock(vcpu);
		}
	}
	vcpu_unlock(vcpu);

	/*
	 * Wakeup the other sleeping vcpus and return to userspace.
	 */
	for (i = 0; i < VM_MAXCPU; i++) {
		if (CPU_ISSET(i, &vm->suspended_cpus)) {
			vcpu_notify_event(vm, i, false);
		}
	}

	*retu = true;
	return (0);
}

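/*
 * Initiate an orderly suspend of the virtual machine. Only the first caller
 * sets 'vm->suspend'; every active vcpu is then notified of the pending
 * suspend request.
 */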
int
vm_suspend(struct vm *vm, enum vm_suspend_how how)
{
	int i;

	if (how <= VM_SUSPEND_NONE || how >= VM_SUSPEND_LAST)
		return (EINVAL);

	if (atomic_cmpset_int(&vm->suspend, 0, how) == 0) {
		VM_CTR2(vm, "virtual machine already suspended %d/%d",
		    vm->suspend, how);
		return (EALREADY);
	}

	VM_CTR1(vm, "virtual machine successfully suspended %d", how);

	/*
	 * Notify all active vcpus that they are now suspended.
	 */
	for (i = 0; i < VM_MAXCPU; i++) {
		if (CPU_ISSET(i, &vm->active_cpus))
			vcpu_notify_event(vm, i, false);
	}

	return (0);
}

void
vm_exit_suspended(struct vm *vm, int vcpuid, uint64_t rip)
{
	struct vm_exit *vmexit;

	KASSERT(vm->suspend > VM_SUSPEND_NONE && vm->suspend < VM_SUSPEND_LAST,
	    ("vm_exit_suspended: invalid suspend type %d", vm->suspend));

	vmexit = vm_exitinfo(vm, vcpuid);
	vmexit->rip = rip;
	vmexit->inst_length = 0;
	vmexit->exitcode = VM_EXITCODE_SUSPENDED;
	vmexit->u.suspended.how = vm->suspend;
}

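/*
 * Run a vcpu until it generates an exit that must be completed in userspace
 * or an error occurs. Exits that can be handled in the kernel (HLT, paging,
 * instruction emulation, rendezvous, ...) restart the guest directly.
 */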
int
vm_run(struct vm *vm, struct vm_run *vmrun)
{
	int error, vcpuid;
	struct vcpu *vcpu;
	struct pcb *pcb;
	uint64_t tscval, rip;
	struct vm_exit *vme;
	bool retu, intr_disabled;
	pmap_t pmap;
	void *rptr, *sptr;

	vcpuid = vmrun->cpuid;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		return (EINVAL);

	if (!CPU_ISSET(vcpuid, &vm->active_cpus))
		return (EINVAL);

	if (CPU_ISSET(vcpuid, &vm->suspended_cpus))
		return (EINVAL);

	rptr = &vm->rendezvous_func;
	sptr = &vm->suspend;
	pmap = vmspace_pmap(vm->vmspace);
	vcpu = &vm->vcpu[vcpuid];
	vme = &vcpu->exitinfo;
	rip = vmrun->rip;
restart:
	critical_enter();

	KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active),
	    ("vm_run: absurd pm_active"));

	tscval = rdtsc();

	pcb = PCPU_GET(curpcb);
	set_pcb_flags(pcb, PCB_FULL_IRET);

	restore_guest_msrs(vm, vcpuid);
	restore_guest_fpustate(vcpu);

	vcpu_require_state(vm, vcpuid, VCPU_RUNNING);
	error = VMRUN(vm->cookie, vcpuid, rip, pmap, rptr, sptr);
	vcpu_require_state(vm, vcpuid, VCPU_FROZEN);

	save_guest_fpustate(vcpu);
	restore_host_msrs(vm, vcpuid);

	vmm_stat_incr(vm, vcpuid, VCPU_TOTAL_RUNTIME, rdtsc() - tscval);

	critical_exit();

	if (error == 0) {
		retu = false;
		switch (vme->exitcode) {
		case VM_EXITCODE_SUSPENDED:
			error = vm_handle_suspend(vm, vcpuid, &retu);
			break;
		case VM_EXITCODE_IOAPIC_EOI:
			vioapic_process_eoi(vm, vcpuid,
			    vme->u.ioapic_eoi.vector);
			break;
		case VM_EXITCODE_RENDEZVOUS:
			vm_handle_rendezvous(vm, vcpuid);
			error = 0;
			break;
		case VM_EXITCODE_HLT:
			intr_disabled = ((vme->u.hlt.rflags & PSL_I) == 0);
			error = vm_handle_hlt(vm, vcpuid, intr_disabled, &retu);
			break;
		case VM_EXITCODE_PAGING:
			error = vm_handle_paging(vm, vcpuid, &retu);
			break;
		case VM_EXITCODE_INST_EMUL:
			error = vm_handle_inst_emul(vm, vcpuid, &retu);
			break;
		case VM_EXITCODE_INOUT:
		case VM_EXITCODE_INOUT_STR:
			error = vm_handle_inout(vm, vcpuid, vme, &retu);
			break;
		default:
			retu = true;	/* handled in userland */
			break;
		}
	}

	if (error == 0 && retu == false) {
		rip = vme->rip + vme->inst_length;
		goto restart;
	}

	/* copy the exit information */
	bcopy(vme, &vmrun->vm_exit, sizeof(struct vm_exit));
	return (error);
}

int
vm_inject_exception(struct vm *vm, int vcpuid, struct vm_exception *exception)
{
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		return (EINVAL);

	if (exception->vector < 0 || exception->vector >= 32)
		return (EINVAL);

	vcpu = &vm->vcpu[vcpuid];

	if (vcpu->exception_pending) {
		VCPU_CTR2(vm, vcpuid, "Unable to inject exception %d due to "
		    "pending exception %d", exception->vector,
		    vcpu->exception.vector);
		return (EBUSY);
	}

	vcpu->exception_pending = 1;
	vcpu->exception = *exception;
	VCPU_CTR1(vm, vcpuid, "Exception %d pending", exception->vector);
	return (0);
}

int
vm_exception_pending(struct vm *vm, int vcpuid, struct vm_exception *exception)
{
	struct vcpu *vcpu;
	int pending;

	KASSERT(vcpuid >= 0 && vcpuid < VM_MAXCPU, ("invalid vcpu %d", vcpuid));

	vcpu = &vm->vcpu[vcpuid];
	pending = vcpu->exception_pending;
	if (pending) {
		vcpu->exception_pending = 0;
		*exception = vcpu->exception;
		VCPU_CTR1(vm, vcpuid, "Exception %d delivered",
		    exception->vector);
	}
	return (pending);
}

static void
vm_inject_fault(struct vm *vm, int vcpuid, struct vm_exception *exception)
{
	struct vm_exit *vmexit;
	int error;

	error = vm_inject_exception(vm, vcpuid, exception);
	KASSERT(error == 0, ("vm_inject_exception error %d", error));

	/*
	 * A fault-like exception allows the instruction to be restarted
	 * after the exception handler returns.
	 *
	 * By setting the inst_length to 0 we ensure that the instruction
	 * pointer remains at the faulting instruction.
	 */
	vmexit = vm_exitinfo(vm, vcpuid);
	vmexit->inst_length = 0;
}

void
vm_inject_pf(struct vm *vm, int vcpuid, int error_code, uint64_t cr2)
{
	struct vm_exception pf = {
		.vector = IDT_PF,
		.error_code_valid = 1,
		.error_code = error_code
	};
	int error;

	VCPU_CTR2(vm, vcpuid, "Injecting page fault: error_code %#x, cr2 %#lx",
	    error_code, cr2);

	error = vm_set_register(vm, vcpuid, VM_REG_GUEST_CR2, cr2);
	KASSERT(error == 0, ("vm_set_register(cr2) error %d", error));

	vm_inject_fault(vm, vcpuid, &pf);
}

void
vm_inject_gp(struct vm *vm, int vcpuid)
{
	struct vm_exception gpf = {
		.vector = IDT_GP,
		.error_code_valid = 1,
		.error_code = 0
	};

	vm_inject_fault(vm, vcpuid, &gpf);
}

void
vm_inject_ud(struct vm *vm, int vcpuid)
{
	struct vm_exception udf = {
		.vector = IDT_UD,
		.error_code_valid = 0
	};

	vm_inject_fault(vm, vcpuid, &udf);
}

static VMM_STAT(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu");

int
vm_inject_nmi(struct vm *vm, int vcpuid)
{
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		return (EINVAL);

	vcpu = &vm->vcpu[vcpuid];

	vcpu->nmi_pending = 1;
	vcpu_notify_event(vm, vcpuid, false);
	return (0);
}

int
vm_nmi_pending(struct vm *vm, int vcpuid)
{
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		panic("vm_nmi_pending: invalid vcpuid %d", vcpuid);

	vcpu = &vm->vcpu[vcpuid];

	return (vcpu->nmi_pending);
}

void
vm_nmi_clear(struct vm *vm, int vcpuid)
{
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		panic("vm_nmi_clear: invalid vcpuid %d", vcpuid);

	vcpu = &vm->vcpu[vcpuid];

	if (vcpu->nmi_pending == 0)
		panic("vm_nmi_clear: inconsistent nmi_pending state");

	vcpu->nmi_pending = 0;
	vmm_stat_incr(vm, vcpuid, VCPU_NMI_COUNT, 1);
}

static VMM_STAT(VCPU_EXTINT_COUNT, "number of ExtINTs delivered to vcpu");

int
vm_inject_extint(struct vm *vm, int vcpuid)
{
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		return (EINVAL);

	vcpu = &vm->vcpu[vcpuid];

	vcpu->extint_pending = 1;
	vcpu_notify_event(vm, vcpuid, false);
	return (0);
}

int
vm_extint_pending(struct vm *vm, int vcpuid)
{
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		panic("vm_extint_pending: invalid vcpuid %d", vcpuid);

	vcpu = &vm->vcpu[vcpuid];

	return (vcpu->extint_pending);
}

void
vm_extint_clear(struct vm *vm, int vcpuid)
{
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		panic("vm_extint_clear: invalid vcpuid %d", vcpuid);

	vcpu = &vm->vcpu[vcpuid];

	if (vcpu->extint_pending == 0)
		panic("vm_extint_clear: inconsistent extint_pending state");

	vcpu->extint_pending = 0;
	vmm_stat_incr(vm, vcpuid, VCPU_EXTINT_COUNT, 1);
}

int
vm_get_capability(struct vm *vm, int vcpu, int type, int *retval)
{
	if (vcpu < 0 || vcpu >= VM_MAXCPU)
		return (EINVAL);

	if (type < 0 || type >= VM_CAP_MAX)
		return (EINVAL);

	return (VMGETCAP(vm->cookie, vcpu, type, retval));
}

int
vm_set_capability(struct vm *vm, int vcpu, int type, int val)
{
	if (vcpu < 0 || vcpu >= VM_MAXCPU)
		return (EINVAL);

	if (type < 0 || type >= VM_CAP_MAX)
		return (EINVAL);

	return (VMSETCAP(vm->cookie, vcpu, type, val));
}

uint64_t *
vm_guest_msrs(struct vm *vm, int cpu)
{
	return (vm->vcpu[cpu].guest_msrs);
}

struct vlapic *
vm_lapic(struct vm *vm, int cpu)
{
	return (vm->vcpu[cpu].vlapic);
}

struct vioapic *
vm_ioapic(struct vm *vm)
{

	return (vm->vioapic);
}

struct vhpet *
vm_hpet(struct vm *vm)
{

	return (vm->vhpet);
}

boolean_t
vmm_is_pptdev(int bus, int slot, int func)
{
	int found, i, n;
	int b, s, f;
	char *val, *cp, *cp2;

	/*
	 * XXX
	 * The length of an environment variable is limited to 128 bytes which
	 * puts an upper limit on the number of passthru devices that may be
	 * specified using a single environment variable.
	 *
	 * Work around this by scanning multiple environment variable
	 * names instead of a single one - yuck!
	 */
	const char *names[] = { "pptdevs", "pptdevs2", "pptdevs3", NULL };

	/* set pptdevs="1/2/3 4/5/6 7/8/9 10/11/12" */
	found = 0;
	for (i = 0; names[i] != NULL && !found; i++) {
		cp = val = getenv(names[i]);
		while (cp != NULL && *cp != '\0') {
			if ((cp2 = strchr(cp, ' ')) != NULL)
				*cp2 = '\0';

			n = sscanf(cp, "%d/%d/%d", &b, &s, &f);
			if (n == 3 && bus == b && slot == s && func == f) {
				found = 1;
				break;
			}

			if (cp2 != NULL)
				*cp2++ = ' ';

			cp = cp2;
		}
		freeenv(val);
	}
	return (found);
}

void *
vm_iommu_domain(struct vm *vm)
{

	return (vm->iommu);
}

int
vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state newstate,
    bool from_idle)
{
	int error;
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		panic("vm_set_run_state: invalid vcpuid %d", vcpuid);

	vcpu = &vm->vcpu[vcpuid];

	vcpu_lock(vcpu);
	error = vcpu_set_state_locked(vcpu, newstate, from_idle);
	vcpu_unlock(vcpu);

	return (error);
}

enum vcpu_state
vcpu_get_state(struct vm *vm, int vcpuid, int *hostcpu)
{
	struct vcpu *vcpu;
	enum vcpu_state state;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		panic("vm_get_run_state: invalid vcpuid %d", vcpuid);

	vcpu = &vm->vcpu[vcpuid];

	vcpu_lock(vcpu);
	state = vcpu->state;
	if (hostcpu != NULL)
		*hostcpu = vcpu->hostcpu;
	vcpu_unlock(vcpu);

	return (state);
}

int
vm_activate_cpu(struct vm *vm, int vcpuid)
{

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		return (EINVAL);

	if (CPU_ISSET(vcpuid, &vm->active_cpus))
		return (EBUSY);

	VCPU_CTR0(vm, vcpuid, "activated");
	CPU_SET_ATOMIC(vcpuid, &vm->active_cpus);
	return (0);
}

cpuset_t
vm_active_cpus(struct vm *vm)
{

	return (vm->active_cpus);
}

cpuset_t
vm_suspended_cpus(struct vm *vm)
{

	return (vm->suspended_cpus);
}

void *
vcpu_stats(struct vm *vm, int vcpuid)
{

	return (vm->vcpu[vcpuid].stats);
}

int
vm_get_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state *state)
{
	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		return (EINVAL);

	*state = vm->vcpu[vcpuid].x2apic_state;

	return (0);
}

int
vm_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state)
{
	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		return (EINVAL);

	if (state >= X2APIC_STATE_LAST)
		return (EINVAL);

	vm->vcpu[vcpuid].x2apic_state = state;

	vlapic_set_x2apic_state(vm, vcpuid, state);

	return (0);
}

/*
 * This function is called to ensure that a vcpu "sees" a pending event
 * as soon as possible:
 * - If the vcpu thread is sleeping then it is woken up.
 * - If the vcpu is running on a different host_cpu then an IPI will be directed
 *   to the host_cpu to cause the vcpu to trap into the hypervisor.
 */
void
vcpu_notify_event(struct vm *vm, int vcpuid, bool lapic_intr)
{
	int hostcpu;
	struct vcpu *vcpu;

	vcpu = &vm->vcpu[vcpuid];

	vcpu_lock(vcpu);
	hostcpu = vcpu->hostcpu;
	if (vcpu->state == VCPU_RUNNING) {
		KASSERT(hostcpu != NOCPU, ("vcpu running on invalid hostcpu"));
		if (hostcpu != curcpu) {
			if (lapic_intr) {
				vlapic_post_intr(vcpu->vlapic, hostcpu,
				    vmm_ipinum);
			} else {
				ipi_cpu(hostcpu, vmm_ipinum);
			}
		} else {
			/*
			 * If the 'vcpu' is running on 'curcpu' then it must
			 * be sending a notification to itself (e.g. SELF_IPI).
			 * The pending event will be picked up when the vcpu
			 * transitions back to guest context.
			 */
		}
	} else {
		KASSERT(hostcpu == NOCPU, ("vcpu state %d not consistent "
		    "with hostcpu %d", vcpu->state, hostcpu));
		if (vcpu->state == VCPU_SLEEPING)
			wakeup_one(vcpu);
	}
	vcpu_unlock(vcpu);
}

struct vmspace *
vm_get_vmspace(struct vm *vm)
{

	return (vm->vmspace);
}

int
vm_apicid2vcpuid(struct vm *vm, int apicid)
{
	/*
	 * XXX apic id is assumed to be numerically identical to vcpu id
	 */
	return (apicid);
}

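/*
 * Initiate a rendezvous of the vcpus in 'dest' and have the calling thread
 * participate until it completes. If another rendezvous is already in
 * progress it is serviced first.
 */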
void
vm_smp_rendezvous(struct vm *vm, int vcpuid, cpuset_t dest,
    vm_rendezvous_func_t func, void *arg)
{
	int i;

	/*
	 * Enforce that this function is called without any locks
	 */
	WITNESS_WARN(WARN_PANIC, NULL, "vm_smp_rendezvous");
	KASSERT(vcpuid == -1 || (vcpuid >= 0 && vcpuid < VM_MAXCPU),
	    ("vm_smp_rendezvous: invalid vcpuid %d", vcpuid));

restart:
	mtx_lock(&vm->rendezvous_mtx);
	if (vm->rendezvous_func != NULL) {
		/*
		 * If a rendezvous is already in progress then we need to
		 * call the rendezvous handler in case this 'vcpuid' is one
		 * of the targets of the rendezvous.
		 */
		RENDEZVOUS_CTR0(vm, vcpuid, "Rendezvous already in progress");
		mtx_unlock(&vm->rendezvous_mtx);
		vm_handle_rendezvous(vm, vcpuid);
		goto restart;
	}
	KASSERT(vm->rendezvous_func == NULL, ("vm_smp_rendezvous: previous "
	    "rendezvous is still in progress"));

	RENDEZVOUS_CTR0(vm, vcpuid, "Initiating rendezvous");
	vm->rendezvous_req_cpus = dest;
	CPU_ZERO(&vm->rendezvous_done_cpus);
	vm->rendezvous_arg = arg;
	vm_set_rendezvous_func(vm, func);
	mtx_unlock(&vm->rendezvous_mtx);

	/*
	 * Wake up any sleeping vcpus and trigger a VM-exit in any running
	 * vcpus so they handle the rendezvous as soon as possible.
	 */
	for (i = 0; i < VM_MAXCPU; i++) {
		if (CPU_ISSET(i, &dest))
			vcpu_notify_event(vm, i, false);
	}

	vm_handle_rendezvous(vm, vcpuid);
}

struct vatpic *
vm_atpic(struct vm *vm)
{
	return (vm->vatpic);
}

struct vatpit *
vm_atpit(struct vm *vm)
{
	return (vm->vatpit);
}

enum vm_reg_name
vm_segment_name(int seg)
{
	static enum vm_reg_name seg_names[] = {
		VM_REG_GUEST_ES,
		VM_REG_GUEST_CS,
		VM_REG_GUEST_SS,
		VM_REG_GUEST_DS,
		VM_REG_GUEST_FS,
		VM_REG_GUEST_GS
	};

	KASSERT(seg >= 0 && seg < nitems(seg_names),
	    ("%s: invalid segment encoding %d", __func__, seg));
	return (seg_names[seg]);
}
1911