/*-
 * Copyright (c) 2011 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD: stable/10/sys/amd64/vmm/vmm.c 276349 2014-12-28 21:27:13Z neel $
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: stable/10/sys/amd64/vmm/vmm.c 276349 2014-12-28 21:27:13Z neel $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/sysctl.h>
#include <sys/malloc.h>
#include <sys/pcpu.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/systm.h>

#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_extern.h>
#include <vm/vm_param.h>

#include <machine/cpu.h>
#include <machine/vm.h>
#include <machine/pcb.h>
#include <machine/smp.h>
#include <x86/psl.h>
#include <x86/apicreg.h>
#include <machine/vmparam.h>

#include <machine/vmm.h>
#include <machine/vmm_dev.h>
#include <machine/vmm_instruction_emul.h>

#include "vmm_ioport.h"
#include "vmm_ktr.h"
#include "vmm_host.h"
#include "vmm_mem.h"
#include "vmm_util.h"
#include "vatpic.h"
#include "vatpit.h"
#include "vhpet.h"
#include "vioapic.h"
#include "vlapic.h"
#include "vmm_ipi.h"
#include "vmm_stat.h"
#include "vmm_lapic.h"

#include "io/ppt.h"
#include "io/iommu.h"

struct vlapic;

/*
 * Initialization:
 * (a) allocated when vcpu is created
 * (i) initialized when vcpu is created and when it is reinitialized
 * (o) initialized the first time the vcpu is created
 * (x) initialized before use
 */
struct vcpu {
        struct mtx      mtx;            /* (o) protects 'state' and 'hostcpu' */
        enum vcpu_state state;          /* (o) vcpu state */
        int             hostcpu;        /* (o) vcpu's host cpu */
        struct vlapic   *vlapic;        /* (i) APIC device model */
        enum x2apic_state x2apic_state; /* (i) APIC mode */
        uint64_t        exitintinfo;    /* (i) events pending at VM exit */
        int             nmi_pending;    /* (i) NMI pending */
        int             extint_pending; /* (i) INTR pending */
        struct vm_exception exception;  /* (x) exception collateral */
        int             exception_pending; /* (i) exception pending */
        struct savefpu  *guestfpu;      /* (a,i) guest fpu state */
        uint64_t        guest_xcr0;     /* (i) guest %xcr0 register */
        void            *stats;         /* (a,i) statistics */
        struct vm_exit  exitinfo;       /* (x) exit reason and collateral */
};

#define vcpu_lock_initialized(v) mtx_initialized(&((v)->mtx))
#define vcpu_lock_init(v)       mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN)
#define vcpu_lock(v)            mtx_lock_spin(&((v)->mtx))
#define vcpu_unlock(v)          mtx_unlock_spin(&((v)->mtx))
#define vcpu_assert_locked(v)   mtx_assert(&((v)->mtx), MA_OWNED)

struct mem_seg {
        vm_paddr_t      gpa;
        size_t          len;
        boolean_t       wired;
        vm_object_t     object;
};
#define VM_MAX_MEMORY_SEGMENTS  2
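
/*
 * Two segments suffice because guest physical memory is typically
 * populated as at most one segment below 4GB and one above 4GB,
 * leaving a hole for the legacy PCI/LAPIC MMIO region in between.
 */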

/*
 * Initialization:
 * (o) initialized the first time the VM is created
 * (i) initialized when VM is created and when it is reinitialized
 * (x) initialized before use
 */
struct vm {
        void            *cookie;                /* (i) cpu-specific data */
        void            *iommu;                 /* (x) iommu-specific data */
        struct vhpet    *vhpet;                 /* (i) virtual HPET */
        struct vioapic  *vioapic;               /* (i) virtual ioapic */
        struct vatpic   *vatpic;                /* (i) virtual atpic */
        struct vatpit   *vatpit;                /* (i) virtual atpit */
        volatile cpuset_t active_cpus;          /* (i) active vcpus */
        int             suspend;                /* (i) stop VM execution */
        volatile cpuset_t suspended_cpus;       /* (i) suspended vcpus */
        volatile cpuset_t halted_cpus;          /* (x) cpus in a hard halt */
        cpuset_t        rendezvous_req_cpus;    /* (x) rendezvous requested */
        cpuset_t        rendezvous_done_cpus;   /* (x) rendezvous finished */
        void            *rendezvous_arg;        /* (x) rendezvous func/arg */
        vm_rendezvous_func_t rendezvous_func;
        struct mtx      rendezvous_mtx;         /* (o) rendezvous lock */
        int             num_mem_segs;           /* (o) guest memory segments */
        struct mem_seg  mem_segs[VM_MAX_MEMORY_SEGMENTS];
        struct vmspace  *vmspace;               /* (o) guest's address space */
        char            name[VM_MAX_NAMELEN];   /* (o) virtual machine name */
        struct vcpu     vcpu[VM_MAXCPU];        /* (i) guest vcpus */
};

static int vmm_initialized;

static struct vmm_ops *ops;
#define VMM_INIT(num)   (ops != NULL ? (*ops->init)(num) : 0)
#define VMM_CLEANUP()   (ops != NULL ? (*ops->cleanup)() : 0)
#define VMM_RESUME()    (ops != NULL ? (*ops->resume)() : 0)

#define VMINIT(vm, pmap) (ops != NULL ? (*ops->vminit)(vm, pmap): NULL)
#define VMRUN(vmi, vcpu, rip, pmap, rptr, sptr) \
        (ops != NULL ? (*ops->vmrun)(vmi, vcpu, rip, pmap, rptr, sptr) : ENXIO)
#define VMCLEANUP(vmi)  (ops != NULL ? (*ops->vmcleanup)(vmi) : NULL)
#define VMSPACE_ALLOC(min, max) \
        (ops != NULL ? (*ops->vmspace_alloc)(min, max) : NULL)
#define VMSPACE_FREE(vmspace) \
        (ops != NULL ? (*ops->vmspace_free)(vmspace) : ENXIO)
#define VMGETREG(vmi, vcpu, num, retval)                \
        (ops != NULL ? (*ops->vmgetreg)(vmi, vcpu, num, retval) : ENXIO)
#define VMSETREG(vmi, vcpu, num, val)           \
        (ops != NULL ? (*ops->vmsetreg)(vmi, vcpu, num, val) : ENXIO)
#define VMGETDESC(vmi, vcpu, num, desc)         \
        (ops != NULL ? (*ops->vmgetdesc)(vmi, vcpu, num, desc) : ENXIO)
#define VMSETDESC(vmi, vcpu, num, desc)         \
        (ops != NULL ? (*ops->vmsetdesc)(vmi, vcpu, num, desc) : ENXIO)
#define VMGETCAP(vmi, vcpu, num, retval)        \
        (ops != NULL ? (*ops->vmgetcap)(vmi, vcpu, num, retval) : ENXIO)
#define VMSETCAP(vmi, vcpu, num, val)           \
        (ops != NULL ? (*ops->vmsetcap)(vmi, vcpu, num, val) : ENXIO)
#define VLAPIC_INIT(vmi, vcpu)                  \
        (ops != NULL ? (*ops->vlapic_init)(vmi, vcpu) : NULL)
#define VLAPIC_CLEANUP(vmi, vlapic)             \
        (ops != NULL ? (*ops->vlapic_cleanup)(vmi, vlapic) : NULL)

#define fpu_start_emulating()   load_cr0(rcr0() | CR0_TS)
#define fpu_stop_emulating()    clts()
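
/*
 * While guest FPU state is loaded (between restore_guest_fpustate() and
 * save_guest_fpustate() below), CR0.TS is kept set so that any FPU
 * access by the host traps with #NM instead of silently clobbering the
 * guest's state; clts() clears CR0.TS to permit FPU use again.
 */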

static MALLOC_DEFINE(M_VM, "vm", "vm");

/* statistics */
static VMM_STAT(VCPU_TOTAL_RUNTIME, "vcpu total runtime");

SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW, NULL, NULL);

/*
 * Halt the guest if all vcpus are executing a HLT instruction with
 * interrupts disabled.
 */
static int halt_detection_enabled = 1;
TUNABLE_INT("hw.vmm.halt_detection", &halt_detection_enabled);
SYSCTL_INT(_hw_vmm, OID_AUTO, halt_detection, CTLFLAG_RDTUN,
    &halt_detection_enabled, 0,
    "Halt VM if all vcpus execute HLT with interrupts disabled");

static int vmm_ipinum;
SYSCTL_INT(_hw_vmm, OID_AUTO, ipinum, CTLFLAG_RD, &vmm_ipinum, 0,
    "IPI vector used for vcpu notifications");
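
/*
 * Since 'halt_detection' is a read-only loader tunable (CTLFLAG_RDTUN)
 * it can be disabled by setting hw.vmm.halt_detection=0 in loader.conf
 * but cannot be changed via sysctl(8) after the module has loaded.
 */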

static void
vcpu_cleanup(struct vm *vm, int i, bool destroy)
{
        struct vcpu *vcpu = &vm->vcpu[i];

        VLAPIC_CLEANUP(vm->cookie, vcpu->vlapic);
        if (destroy) {
                vmm_stat_free(vcpu->stats);
                fpu_save_area_free(vcpu->guestfpu);
        }
}

static void
vcpu_init(struct vm *vm, int vcpu_id, bool create)
{
        struct vcpu *vcpu;

        KASSERT(vcpu_id >= 0 && vcpu_id < VM_MAXCPU,
            ("vcpu_init: invalid vcpu %d", vcpu_id));

        vcpu = &vm->vcpu[vcpu_id];

        if (create) {
                KASSERT(!vcpu_lock_initialized(vcpu), ("vcpu %d already "
                    "initialized", vcpu_id));
                vcpu_lock_init(vcpu);
                vcpu->state = VCPU_IDLE;
                vcpu->hostcpu = NOCPU;
                vcpu->guestfpu = fpu_save_area_alloc();
                vcpu->stats = vmm_stat_alloc();
        }

        vcpu->vlapic = VLAPIC_INIT(vm->cookie, vcpu_id);
        vm_set_x2apic_state(vm, vcpu_id, X2APIC_DISABLED);
        vcpu->exitintinfo = 0;
        vcpu->nmi_pending = 0;
        vcpu->extint_pending = 0;
        vcpu->exception_pending = 0;
        vcpu->guest_xcr0 = XFEATURE_ENABLED_X87;
        fpu_save_area_reset(vcpu->guestfpu);
        vmm_stat_init(vcpu->stats);
}

struct vm_exit *
vm_exitinfo(struct vm *vm, int cpuid)
{
        struct vcpu *vcpu;

        if (cpuid < 0 || cpuid >= VM_MAXCPU)
                panic("vm_exitinfo: invalid cpuid %d", cpuid);

        vcpu = &vm->vcpu[cpuid];

        return (&vcpu->exitinfo);
}

static void
vmm_resume(void)
{
        VMM_RESUME();
}

static int
vmm_init(void)
{
        int error;

        vmm_host_state_init();

        vmm_ipinum = vmm_ipi_alloc();
        if (vmm_ipinum == 0)
                vmm_ipinum = IPI_AST;

        error = vmm_mem_init();
        if (error)
                return (error);

        if (vmm_is_intel())
                ops = &vmm_ops_intel;
        else if (vmm_is_amd())
                ops = &vmm_ops_amd;
        else
                return (ENXIO);

        vmm_resume_p = vmm_resume;

        return (VMM_INIT(vmm_ipinum));
}

static int
vmm_handler(module_t mod, int what, void *arg)
{
        int error;

        switch (what) {
        case MOD_LOAD:
                vmmdev_init();
                if (ppt_avail_devices() > 0)
                        iommu_init();
                error = vmm_init();
                if (error == 0)
                        vmm_initialized = 1;
                break;
        case MOD_UNLOAD:
                error = vmmdev_cleanup();
                if (error == 0) {
                        vmm_resume_p = NULL;
                        iommu_cleanup();
                        if (vmm_ipinum != IPI_AST)
                                vmm_ipi_free(vmm_ipinum);
                        error = VMM_CLEANUP();
                        /*
                         * Something bad happened - prevent new
                         * VMs from being created
                         */
                        if (error)
                                vmm_initialized = 0;
                }
                break;
        default:
                error = 0;
                break;
        }
        return (error);
}

static moduledata_t vmm_kmod = {
        "vmm",
        vmm_handler,
        NULL
};

/*
 * vmm initialization has the following dependencies:
 *
 * - iommu initialization must happen after the pci passthru driver has had
 *   a chance to attach to any passthru devices (after SI_SUB_CONFIGURE).
 *
 * - VT-x initialization requires smp_rendezvous() and therefore must happen
 *   after SMP is fully functional (after SI_SUB_SMP).
 */
DECLARE_MODULE(vmm, vmm_kmod, SI_SUB_SMP + 1, SI_ORDER_ANY);
MODULE_VERSION(vmm, 1);

static void
vm_init(struct vm *vm, bool create)
{
        int i;

        vm->cookie = VMINIT(vm, vmspace_pmap(vm->vmspace));
        vm->iommu = NULL;
        vm->vioapic = vioapic_init(vm);
        vm->vhpet = vhpet_init(vm);
        vm->vatpic = vatpic_init(vm);
        vm->vatpit = vatpit_init(vm);

        CPU_ZERO(&vm->active_cpus);

        vm->suspend = 0;
        CPU_ZERO(&vm->suspended_cpus);

        for (i = 0; i < VM_MAXCPU; i++)
                vcpu_init(vm, i, create);
}

int
vm_create(const char *name, struct vm **retvm)
{
        struct vm *vm;
        struct vmspace *vmspace;

        /*
         * If vmm.ko could not be successfully initialized then don't attempt
         * to create the virtual machine.
         */
        if (!vmm_initialized)
                return (ENXIO);

        if (name == NULL || strlen(name) >= VM_MAX_NAMELEN)
                return (EINVAL);

        vmspace = VMSPACE_ALLOC(VM_MIN_ADDRESS, VM_MAXUSER_ADDRESS);
        if (vmspace == NULL)
                return (ENOMEM);

        vm = malloc(sizeof(struct vm), M_VM, M_WAITOK | M_ZERO);
        strcpy(vm->name, name);
        vm->num_mem_segs = 0;
        vm->vmspace = vmspace;
        mtx_init(&vm->rendezvous_mtx, "vm rendezvous lock", 0, MTX_DEF);

        vm_init(vm, true);

        *retvm = vm;
        return (0);
}
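
/*
 * Typical lifecycle, as driven by the vmmdev interface: vm_create()
 * once, vm_run() repeatedly from each vcpu's ioctl thread, and finally
 * vm_destroy().  vm_reinit() below sits in between, resetting a live
 * VM without destroying its memory segments.
 */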

static void
vm_free_mem_seg(struct vm *vm, struct mem_seg *seg)
{

        if (seg->object != NULL)
                vmm_mem_free(vm->vmspace, seg->gpa, seg->len);

        bzero(seg, sizeof(*seg));
}

static void
vm_cleanup(struct vm *vm, bool destroy)
{
        int i;

        ppt_unassign_all(vm);

        if (vm->iommu != NULL)
                iommu_destroy_domain(vm->iommu);

        vatpit_cleanup(vm->vatpit);
        vhpet_cleanup(vm->vhpet);
        vatpic_cleanup(vm->vatpic);
        vioapic_cleanup(vm->vioapic);

        for (i = 0; i < VM_MAXCPU; i++)
                vcpu_cleanup(vm, i, destroy);

        VMCLEANUP(vm->cookie);

        if (destroy) {
                for (i = 0; i < vm->num_mem_segs; i++)
                        vm_free_mem_seg(vm, &vm->mem_segs[i]);

                vm->num_mem_segs = 0;

                VMSPACE_FREE(vm->vmspace);
                vm->vmspace = NULL;
        }
}

void
vm_destroy(struct vm *vm)
{
        vm_cleanup(vm, true);
        free(vm, M_VM);
}

int
vm_reinit(struct vm *vm)
{
        int error;

        /*
         * A virtual machine can be reset only if all vcpus are suspended.
         */
        if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) {
                vm_cleanup(vm, false);
                vm_init(vm, false);
                error = 0;
        } else {
                error = EBUSY;
        }

        return (error);
}
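
/*
 * vm_reinit() implements a virtual machine reset: the (i)-tagged state
 * in 'struct vm' and 'struct vcpu' is torn down and rebuilt while the
 * (o)-tagged state (memory segments, vmspace, name) is preserved.  All
 * vcpus must already be suspended, e.g. after a triple fault or a
 * guest-initiated reset, before the reset is allowed to proceed.
 */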

const char *
vm_name(struct vm *vm)
{
        return (vm->name);
}

int
vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
{
        vm_object_t obj;

        if ((obj = vmm_mmio_alloc(vm->vmspace, gpa, len, hpa)) == NULL)
                return (ENOMEM);
        else
                return (0);
}

int
vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len)
{

        vmm_mmio_free(vm->vmspace, gpa, len);
        return (0);
}

boolean_t
vm_mem_allocated(struct vm *vm, vm_paddr_t gpa)
{
        int i;
        vm_paddr_t gpabase, gpalimit;

        for (i = 0; i < vm->num_mem_segs; i++) {
                gpabase = vm->mem_segs[i].gpa;
                gpalimit = gpabase + vm->mem_segs[i].len;
                if (gpa >= gpabase && gpa < gpalimit)
                        return (TRUE);          /* 'gpa' is regular memory */
        }

        if (ppt_is_mmio(vm, gpa))
                return (TRUE);                  /* 'gpa' is pci passthru mmio */

        return (FALSE);
}

int
vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len)
{
        int available, allocated;
        struct mem_seg *seg;
        vm_object_t object;
        vm_paddr_t g;

        if ((gpa & PAGE_MASK) || (len & PAGE_MASK) || len == 0)
                return (EINVAL);

        available = allocated = 0;
        g = gpa;
        while (g < gpa + len) {
                if (vm_mem_allocated(vm, g))
                        allocated++;
                else
                        available++;

                g += PAGE_SIZE;
        }

        /*
         * If there are some allocated and some available pages in the address
         * range then it is an error.
         */
        if (allocated && available)
                return (EINVAL);

        /*
         * If the entire address range being requested has already been
         * allocated then there isn't anything more to do.
         */
        if (allocated && available == 0)
                return (0);

        if (vm->num_mem_segs >= VM_MAX_MEMORY_SEGMENTS)
                return (E2BIG);

        seg = &vm->mem_segs[vm->num_mem_segs];

        if ((object = vmm_mem_alloc(vm->vmspace, gpa, len)) == NULL)
                return (ENOMEM);

        seg->gpa = gpa;
        seg->len = len;
        seg->object = object;
        seg->wired = FALSE;

        vm->num_mem_segs++;

        return (0);
}
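
/*
 * For example, with an existing segment at [0, 1GB) a vm_malloc()
 * request for [512MB, 2GB) fails with EINVAL (partial overlap), a
 * request for [0, 1GB) returns 0 without allocating anything, and a
 * request for [1GB, 2GB) creates a second segment.
 */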

static vm_paddr_t
vm_maxmem(struct vm *vm)
{
        int i;
        vm_paddr_t gpa, maxmem;

        maxmem = 0;
        for (i = 0; i < vm->num_mem_segs; i++) {
                gpa = vm->mem_segs[i].gpa + vm->mem_segs[i].len;
                if (gpa > maxmem)
                        maxmem = gpa;
        }
        return (maxmem);
}

static void
vm_gpa_unwire(struct vm *vm)
{
        int i, rv;
        struct mem_seg *seg;

        for (i = 0; i < vm->num_mem_segs; i++) {
                seg = &vm->mem_segs[i];
                if (!seg->wired)
                        continue;

                rv = vm_map_unwire(&vm->vmspace->vm_map,
                    seg->gpa, seg->gpa + seg->len,
                    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
                KASSERT(rv == KERN_SUCCESS, ("vm(%s) memory segment "
                    "%#lx/%ld could not be unwired: %d",
                    vm_name(vm), seg->gpa, seg->len, rv));

                seg->wired = FALSE;
        }
}

static int
vm_gpa_wire(struct vm *vm)
{
        int i, rv;
        struct mem_seg *seg;

        for (i = 0; i < vm->num_mem_segs; i++) {
                seg = &vm->mem_segs[i];
                if (seg->wired)
                        continue;

                /* XXX rlimits? */
                rv = vm_map_wire(&vm->vmspace->vm_map,
                    seg->gpa, seg->gpa + seg->len,
                    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
                if (rv != KERN_SUCCESS)
                        break;

                seg->wired = TRUE;
        }

        if (i < vm->num_mem_segs) {
                /*
                 * Undo the wiring before returning an error.
                 */
                vm_gpa_unwire(vm);
                return (EAGAIN);
        }

        return (0);
}

static void
vm_iommu_modify(struct vm *vm, boolean_t map)
{
        int i, sz;
        vm_paddr_t gpa, hpa;
        struct mem_seg *seg;
        void *vp, *cookie, *host_domain;

        sz = PAGE_SIZE;
        host_domain = iommu_host_domain();

        for (i = 0; i < vm->num_mem_segs; i++) {
                seg = &vm->mem_segs[i];
                KASSERT(seg->wired, ("vm(%s) memory segment %#lx/%ld not wired",
                    vm_name(vm), seg->gpa, seg->len));

                gpa = seg->gpa;
                while (gpa < seg->gpa + seg->len) {
                        vp = vm_gpa_hold(vm, gpa, PAGE_SIZE, VM_PROT_WRITE,
                            &cookie);
                        KASSERT(vp != NULL, ("vm(%s) could not map gpa %#lx",
                            vm_name(vm), gpa));

                        vm_gpa_release(cookie);

                        hpa = DMAP_TO_PHYS((uintptr_t)vp);
                        if (map) {
                                iommu_create_mapping(vm->iommu, gpa, hpa, sz);
                                iommu_remove_mapping(host_domain, hpa, sz);
                        } else {
                                iommu_remove_mapping(vm->iommu, gpa, sz);
                                iommu_create_mapping(host_domain, hpa, hpa, sz);
                        }

                        gpa += PAGE_SIZE;
                }
        }

        /*
         * Invalidate the cached translations associated with the domain
         * from which pages were removed.
         */
        if (map)
                iommu_invalidate_tlb(host_domain);
        else
                iommu_invalidate_tlb(vm->iommu);
}

#define vm_iommu_unmap(vm)      vm_iommu_modify((vm), FALSE)
#define vm_iommu_map(vm)        vm_iommu_modify((vm), TRUE)
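
/*
 * Each guest page is thus mapped in exactly one iommu domain at a
 * time: vm_iommu_map() moves it from the host domain (identity mapped,
 * hpa -> hpa) into the guest domain (gpa -> hpa), and vm_iommu_unmap()
 * moves it back.
 */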

int
vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func)
{
        int error;

        error = ppt_unassign_device(vm, bus, slot, func);
        if (error)
                return (error);

        if (ppt_assigned_devices(vm) == 0) {
                vm_iommu_unmap(vm);
                vm_gpa_unwire(vm);
        }
        return (0);
}

int
vm_assign_pptdev(struct vm *vm, int bus, int slot, int func)
{
        int error;
        vm_paddr_t maxaddr;

        /*
         * Virtual machines with pci passthru devices get special treatment:
         * - the guest physical memory is wired
         * - the iommu is programmed to do the 'gpa' to 'hpa' translation
         *
         * We need to do this before the first pci passthru device is attached.
         */
        if (ppt_assigned_devices(vm) == 0) {
                KASSERT(vm->iommu == NULL,
                    ("vm_assign_pptdev: iommu must be NULL"));
                maxaddr = vm_maxmem(vm);
                vm->iommu = iommu_create_domain(maxaddr);

                error = vm_gpa_wire(vm);
                if (error)
                        return (error);

                vm_iommu_map(vm);
        }

        error = ppt_assign_device(vm, bus, slot, func);
        return (error);
}

void *
vm_gpa_hold(struct vm *vm, vm_paddr_t gpa, size_t len, int reqprot,
    void **cookie)
{
        int count, pageoff;
        vm_page_t m;

        pageoff = gpa & PAGE_MASK;
        if (len > PAGE_SIZE - pageoff)
                panic("vm_gpa_hold: invalid gpa/len: 0x%016lx/%lu", gpa, len);

        count = vm_fault_quick_hold_pages(&vm->vmspace->vm_map,
            trunc_page(gpa), PAGE_SIZE, reqprot, &m, 1);

        if (count == 1) {
                *cookie = m;
                return ((void *)(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)) + pageoff));
        } else {
                *cookie = NULL;
                return (NULL);
        }
}

void
vm_gpa_release(void *cookie)
{
        vm_page_t m = cookie;

        vm_page_lock(m);
        vm_page_unhold(m);
        vm_page_unlock(m);
}
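
/*
 * Accesses to guest memory must be bracketed by a hold/release pair
 * and may not cross a page boundary, e.g.:
 *
 *      vp = vm_gpa_hold(vm, gpa, len, VM_PROT_READ, &cookie);
 *      if (vp != NULL) {
 *              bcopy(vp, buf, len);
 *              vm_gpa_release(cookie);
 *      }
 */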

int
vm_gpabase2memseg(struct vm *vm, vm_paddr_t gpabase,
    struct vm_memory_segment *seg)
{
        int i;

        for (i = 0; i < vm->num_mem_segs; i++) {
                if (gpabase == vm->mem_segs[i].gpa) {
                        seg->gpa = vm->mem_segs[i].gpa;
                        seg->len = vm->mem_segs[i].len;
                        seg->wired = vm->mem_segs[i].wired;
                        return (0);
                }
        }
        return (-1);
}

int
vm_get_memobj(struct vm *vm, vm_paddr_t gpa, size_t len,
    vm_offset_t *offset, struct vm_object **object)
{
        int i;
        size_t seg_len;
        vm_paddr_t seg_gpa;
        vm_object_t seg_obj;

        for (i = 0; i < vm->num_mem_segs; i++) {
                if ((seg_obj = vm->mem_segs[i].object) == NULL)
                        continue;

                seg_gpa = vm->mem_segs[i].gpa;
                seg_len = vm->mem_segs[i].len;

                if (gpa >= seg_gpa && gpa < seg_gpa + seg_len) {
                        *offset = gpa - seg_gpa;
                        *object = seg_obj;
                        vm_object_reference(seg_obj);
                        return (0);
                }
        }

        return (EINVAL);
}

int
vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval)
{

        if (vcpu < 0 || vcpu >= VM_MAXCPU)
                return (EINVAL);

        if (reg >= VM_REG_LAST)
                return (EINVAL);

        return (VMGETREG(vm->cookie, vcpu, reg, retval));
}

int
vm_set_register(struct vm *vm, int vcpu, int reg, uint64_t val)
{

        if (vcpu < 0 || vcpu >= VM_MAXCPU)
                return (EINVAL);

        if (reg >= VM_REG_LAST)
                return (EINVAL);

        return (VMSETREG(vm->cookie, vcpu, reg, val));
}

static boolean_t
is_descriptor_table(int reg)
{

        switch (reg) {
        case VM_REG_GUEST_IDTR:
        case VM_REG_GUEST_GDTR:
                return (TRUE);
        default:
                return (FALSE);
        }
}

static boolean_t
is_segment_register(int reg)
{

        switch (reg) {
        case VM_REG_GUEST_ES:
        case VM_REG_GUEST_CS:
        case VM_REG_GUEST_SS:
        case VM_REG_GUEST_DS:
        case VM_REG_GUEST_FS:
        case VM_REG_GUEST_GS:
        case VM_REG_GUEST_TR:
        case VM_REG_GUEST_LDTR:
                return (TRUE);
        default:
                return (FALSE);
        }
}

int
vm_get_seg_desc(struct vm *vm, int vcpu, int reg,
    struct seg_desc *desc)
{

        if (vcpu < 0 || vcpu >= VM_MAXCPU)
                return (EINVAL);

        if (!is_segment_register(reg) && !is_descriptor_table(reg))
                return (EINVAL);

        return (VMGETDESC(vm->cookie, vcpu, reg, desc));
}

int
vm_set_seg_desc(struct vm *vm, int vcpu, int reg,
    struct seg_desc *desc)
{
        if (vcpu < 0 || vcpu >= VM_MAXCPU)
                return (EINVAL);

        if (!is_segment_register(reg) && !is_descriptor_table(reg))
                return (EINVAL);

        return (VMSETDESC(vm->cookie, vcpu, reg, desc));
}

static void
restore_guest_fpustate(struct vcpu *vcpu)
{

        /* flush host state to the pcb */
        fpuexit(curthread);

        /* restore guest FPU state */
        fpu_stop_emulating();
        fpurestore(vcpu->guestfpu);

        /* restore guest XCR0 if XSAVE is enabled in the host */
        if (rcr4() & CR4_XSAVE)
                load_xcr(0, vcpu->guest_xcr0);

        /*
         * The FPU is now "dirty" with the guest's state so turn on emulation
         * to trap any access to the FPU by the host.
         */
        fpu_start_emulating();
}

static void
save_guest_fpustate(struct vcpu *vcpu)
{

        if ((rcr0() & CR0_TS) == 0)
                panic("fpu emulation not enabled in host!");

        /* save guest XCR0 and restore host XCR0 */
        if (rcr4() & CR4_XSAVE) {
                vcpu->guest_xcr0 = rxcr(0);
                load_xcr(0, vmm_get_host_xcr0());
        }

        /* save guest FPU state */
        fpu_stop_emulating();
        fpusave(vcpu->guestfpu);
        fpu_start_emulating();
}

static VMM_STAT(VCPU_IDLE_TICKS, "number of ticks vcpu was idle");

static int
vcpu_set_state_locked(struct vcpu *vcpu, enum vcpu_state newstate,
    bool from_idle)
{
        int error;

        vcpu_assert_locked(vcpu);

        /*
         * State transitions from the vmmdev_ioctl() must always begin from
         * the VCPU_IDLE state. This guarantees that there is only a single
         * ioctl() operating on a vcpu at any point.
         */
        if (from_idle) {
                while (vcpu->state != VCPU_IDLE)
                        msleep_spin(&vcpu->state, &vcpu->mtx, "vmstat", hz);
        } else {
                KASSERT(vcpu->state != VCPU_IDLE, ("invalid transition from "
                    "vcpu idle state"));
        }

        if (vcpu->state == VCPU_RUNNING) {
                KASSERT(vcpu->hostcpu == curcpu, ("curcpu %d and hostcpu %d "
                    "mismatch for running vcpu", curcpu, vcpu->hostcpu));
        } else {
                KASSERT(vcpu->hostcpu == NOCPU, ("Invalid hostcpu %d for a "
                    "vcpu that is not running", vcpu->hostcpu));
        }

        /*
         * The following state transitions are allowed:
         * IDLE -> FROZEN -> IDLE
         * FROZEN -> RUNNING -> FROZEN
         * FROZEN -> SLEEPING -> FROZEN
         */
        switch (vcpu->state) {
        case VCPU_IDLE:
        case VCPU_RUNNING:
        case VCPU_SLEEPING:
                error = (newstate != VCPU_FROZEN);
                break;
        case VCPU_FROZEN:
                error = (newstate == VCPU_FROZEN);
                break;
        default:
                error = 1;
                break;
        }

        if (error)
                return (EBUSY);

        vcpu->state = newstate;
        if (newstate == VCPU_RUNNING)
                vcpu->hostcpu = curcpu;
        else
                vcpu->hostcpu = NOCPU;

        if (newstate == VCPU_IDLE)
                wakeup(&vcpu->state);

        return (0);
}
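
/*
 * For example, the VM_RUN ioctl takes a vcpu through
 * IDLE -> FROZEN -> RUNNING -> FROZEN -> IDLE, with the intermediate
 * FROZEN state guaranteeing that no other ioctl can operate on the
 * vcpu until it is returned to VCPU_IDLE.
 */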

static void
vcpu_require_state(struct vm *vm, int vcpuid, enum vcpu_state newstate)
{
        int error;

        if ((error = vcpu_set_state(vm, vcpuid, newstate, false)) != 0)
                panic("Error %d setting state to %d\n", error, newstate);
}

static void
vcpu_require_state_locked(struct vcpu *vcpu, enum vcpu_state newstate)
{
        int error;

        if ((error = vcpu_set_state_locked(vcpu, newstate, false)) != 0)
                panic("Error %d setting state to %d", error, newstate);
}

static void
vm_set_rendezvous_func(struct vm *vm, vm_rendezvous_func_t func)
{

        KASSERT(mtx_owned(&vm->rendezvous_mtx), ("rendezvous_mtx not locked"));

        /*
         * Update 'rendezvous_func' and execute a write memory barrier to
         * ensure that it is visible across all host cpus. This is not needed
         * for correctness but it does ensure that all the vcpus will notice
         * that the rendezvous is requested immediately.
         */
        vm->rendezvous_func = func;
        wmb();
}

#define RENDEZVOUS_CTR0(vm, vcpuid, fmt)                                \
        do {                                                            \
                if (vcpuid >= 0)                                        \
                        VCPU_CTR0(vm, vcpuid, fmt);                     \
                else                                                    \
                        VM_CTR0(vm, fmt);                               \
        } while (0)

static void
vm_handle_rendezvous(struct vm *vm, int vcpuid)
{

        KASSERT(vcpuid == -1 || (vcpuid >= 0 && vcpuid < VM_MAXCPU),
            ("vm_handle_rendezvous: invalid vcpuid %d", vcpuid));

        mtx_lock(&vm->rendezvous_mtx);
        while (vm->rendezvous_func != NULL) {
                /* 'rendezvous_req_cpus' must be a subset of 'active_cpus' */
                CPU_AND(&vm->rendezvous_req_cpus, &vm->active_cpus);

                if (vcpuid != -1 &&
                    CPU_ISSET(vcpuid, &vm->rendezvous_req_cpus) &&
                    !CPU_ISSET(vcpuid, &vm->rendezvous_done_cpus)) {
                        VCPU_CTR0(vm, vcpuid, "Calling rendezvous func");
                        (*vm->rendezvous_func)(vm, vcpuid, vm->rendezvous_arg);
                        CPU_SET(vcpuid, &vm->rendezvous_done_cpus);
                }
                if (CPU_CMP(&vm->rendezvous_req_cpus,
                    &vm->rendezvous_done_cpus) == 0) {
                        VCPU_CTR0(vm, vcpuid, "Rendezvous completed");
                        vm_set_rendezvous_func(vm, NULL);
                        wakeup(&vm->rendezvous_func);
                        break;
                }
                RENDEZVOUS_CTR0(vm, vcpuid, "Wait for rendezvous completion");
                mtx_sleep(&vm->rendezvous_func, &vm->rendezvous_mtx, 0,
                    "vmrndv", 0);
        }
        mtx_unlock(&vm->rendezvous_mtx);
}
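
/*
 * A rendezvous thus completes once every vcpu in 'rendezvous_req_cpus'
 * has exited the guest, invoked 'rendezvous_func' on its own behalf
 * and marked itself in 'rendezvous_done_cpus'; the last vcpu to finish
 * clears 'rendezvous_func' and wakes up the waiters.
 */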

/*
 * Emulate a guest 'hlt' by sleeping until the vcpu is ready to run.
 */
static int
vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled, bool *retu)
{
        struct vcpu *vcpu;
        const char *wmesg;
        int error, t, vcpu_halted, vm_halted;

        KASSERT(!CPU_ISSET(vcpuid, &vm->halted_cpus), ("vcpu already halted"));

        vcpu = &vm->vcpu[vcpuid];
        vcpu_halted = 0;
        vm_halted = 0;

        /*
         * The typical way to halt a cpu is to execute: "sti; hlt"
         *
         * STI sets RFLAGS.IF to enable interrupts. However, the processor
         * remains in an "interrupt shadow" for an additional instruction
         * following the STI. This guarantees that the "sti; hlt" sequence is
         * atomic and a pending interrupt will be recognized after the HLT.
         *
         * After the HLT emulation is done the vcpu is no longer in an
         * interrupt shadow and a pending interrupt can be injected on
         * the next entry into the guest.
         */
        error = vm_set_register(vm, vcpuid, VM_REG_GUEST_INTR_SHADOW, 0);
        KASSERT(error == 0, ("%s: error %d clearing interrupt shadow",
            __func__, error));

        vcpu_lock(vcpu);
        while (1) {
                /*
                 * Do a final check for pending NMI or interrupts before
                 * really putting this thread to sleep. Also check for
                 * software events that would cause this vcpu to wakeup.
                 *
                 * These interrupts/events could have happened after the
                 * vcpu returned from VMRUN() and before it acquired the
                 * vcpu lock above.
                 */
                if (vm->rendezvous_func != NULL || vm->suspend)
                        break;
                if (vm_nmi_pending(vm, vcpuid))
                        break;
                if (!intr_disabled) {
                        if (vm_extint_pending(vm, vcpuid) ||
                            vlapic_pending_intr(vcpu->vlapic, NULL)) {
                                break;
                        }
                }

                /* Don't go to sleep if the vcpu thread needs to yield */
                if (vcpu_should_yield(vm, vcpuid))
                        break;

                /*
                 * Some Linux guests implement "halt" by having all vcpus
                 * execute HLT with interrupts disabled. 'halted_cpus' keeps
                 * track of the vcpus that have entered this state. When all
                 * vcpus enter the halted state the virtual machine is halted.
                 */
                if (intr_disabled) {
                        wmesg = "vmhalt";
                        VCPU_CTR0(vm, vcpuid, "Halted");
                        if (!vcpu_halted && halt_detection_enabled) {
                                vcpu_halted = 1;
                                CPU_SET_ATOMIC(vcpuid, &vm->halted_cpus);
                        }
                        if (CPU_CMP(&vm->halted_cpus, &vm->active_cpus) == 0) {
                                vm_halted = 1;
                                break;
                        }
                } else {
                        wmesg = "vmidle";
                }

                t = ticks;
                vcpu_require_state_locked(vcpu, VCPU_SLEEPING);
                /*
                 * XXX msleep_spin() cannot be interrupted by signals so
                 * wake up periodically to check pending signals.
                 */
                msleep_spin(vcpu, &vcpu->mtx, wmesg, hz);
                vcpu_require_state_locked(vcpu, VCPU_FROZEN);
                vmm_stat_incr(vm, vcpuid, VCPU_IDLE_TICKS, ticks - t);
        }

        if (vcpu_halted)
                CPU_CLR_ATOMIC(vcpuid, &vm->halted_cpus);

        vcpu_unlock(vcpu);

        if (vm_halted)
                vm_suspend(vm, VM_SUSPEND_HALT);

        return (0);
}

static int
vm_handle_paging(struct vm *vm, int vcpuid, bool *retu)
{
        int rv, ftype;
        struct vm_map *map;
        struct vcpu *vcpu;
        struct vm_exit *vme;

        vcpu = &vm->vcpu[vcpuid];
        vme = &vcpu->exitinfo;

        ftype = vme->u.paging.fault_type;
        KASSERT(ftype == VM_PROT_READ ||
            ftype == VM_PROT_WRITE || ftype == VM_PROT_EXECUTE,
            ("vm_handle_paging: invalid fault_type %d", ftype));

        if (ftype == VM_PROT_READ || ftype == VM_PROT_WRITE) {
                rv = pmap_emulate_accessed_dirty(vmspace_pmap(vm->vmspace),
                    vme->u.paging.gpa, ftype);
                if (rv == 0) {
                        VCPU_CTR2(vm, vcpuid, "%s bit emulation for gpa %#lx",
                            ftype == VM_PROT_READ ? "accessed" : "dirty",
                            vme->u.paging.gpa);
                        goto done;
                }
        }

        map = &vm->vmspace->vm_map;
        rv = vm_fault(map, vme->u.paging.gpa, ftype, VM_FAULT_NORMAL);

        VCPU_CTR3(vm, vcpuid, "vm_handle_paging rv = %d, gpa = %#lx, "
            "ftype = %d", rv, vme->u.paging.gpa, ftype);

        if (rv != KERN_SUCCESS)
                return (EFAULT);
done:
        /* restart execution at the faulting instruction */
        vme->inst_length = 0;

        return (0);
}

static int
vm_handle_inst_emul(struct vm *vm, int vcpuid, bool *retu)
{
        struct vie *vie;
        struct vcpu *vcpu;
        struct vm_exit *vme;
        uint64_t gla, gpa;
        struct vm_guest_paging *paging;
        mem_region_read_t mread;
        mem_region_write_t mwrite;
        enum vm_cpu_mode cpu_mode;
        int cs_d, error;

        vcpu = &vm->vcpu[vcpuid];
        vme = &vcpu->exitinfo;

        gla = vme->u.inst_emul.gla;
        gpa = vme->u.inst_emul.gpa;
        cs_d = vme->u.inst_emul.cs_d;
        vie = &vme->u.inst_emul.vie;
        paging = &vme->u.inst_emul.paging;
        cpu_mode = paging->cpu_mode;

        VCPU_CTR1(vm, vcpuid, "inst_emul fault accessing gpa %#lx", gpa);

        vie_init(vie);

        /* Fetch, decode and emulate the faulting instruction */
        error = vmm_fetch_instruction(vm, vcpuid, paging, vme->rip,
            vme->inst_length, vie);
        if (error == 1)
                return (0);             /* Resume guest to handle page fault */
        else if (error == -1)
                return (EFAULT);
        else if (error != 0)
                panic("%s: vmm_fetch_instruction error %d", __func__, error);

        if (vmm_decode_instruction(vm, vcpuid, gla, cpu_mode, cs_d, vie) != 0)
                return (EFAULT);

        /* return to userland unless this is an in-kernel emulated device */
        if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) {
                mread = lapic_mmio_read;
                mwrite = lapic_mmio_write;
        } else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) {
                mread = vioapic_mmio_read;
                mwrite = vioapic_mmio_write;
        } else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) {
                mread = vhpet_mmio_read;
                mwrite = vhpet_mmio_write;
        } else {
                *retu = true;
                return (0);
        }

        error = vmm_emulate_instruction(vm, vcpuid, gpa, vie, paging,
            mread, mwrite, retu);

        return (error);
}

static int
vm_handle_suspend(struct vm *vm, int vcpuid, bool *retu)
{
        int i, done;
        struct vcpu *vcpu;

        done = 0;
        vcpu = &vm->vcpu[vcpuid];

        CPU_SET_ATOMIC(vcpuid, &vm->suspended_cpus);

        /*
         * Wait until all 'active_cpus' have suspended themselves.
         *
         * Since a VM may be suspended at any time including when one or
         * more vcpus are doing a rendezvous we need to call the rendezvous
         * handler while we are waiting to prevent a deadlock.
         */
        vcpu_lock(vcpu);
        while (1) {
                if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) {
                        VCPU_CTR0(vm, vcpuid, "All vcpus suspended");
                        break;
                }

                if (vm->rendezvous_func == NULL) {
                        VCPU_CTR0(vm, vcpuid, "Sleeping during suspend");
                        vcpu_require_state_locked(vcpu, VCPU_SLEEPING);
                        msleep_spin(vcpu, &vcpu->mtx, "vmsusp", hz);
                        vcpu_require_state_locked(vcpu, VCPU_FROZEN);
                } else {
                        VCPU_CTR0(vm, vcpuid, "Rendezvous during suspend");
                        vcpu_unlock(vcpu);
                        vm_handle_rendezvous(vm, vcpuid);
                        vcpu_lock(vcpu);
                }
        }
        vcpu_unlock(vcpu);

        /*
         * Wake up the other sleeping vcpus and return to userspace.
         */
        for (i = 0; i < VM_MAXCPU; i++) {
                if (CPU_ISSET(i, &vm->suspended_cpus)) {
                        vcpu_notify_event(vm, i, false);
                }
        }

        *retu = true;
        return (0);
}

int
vm_suspend(struct vm *vm, enum vm_suspend_how how)
{
        int i;

        if (how <= VM_SUSPEND_NONE || how >= VM_SUSPEND_LAST)
                return (EINVAL);

        if (atomic_cmpset_int(&vm->suspend, 0, how) == 0) {
                VM_CTR2(vm, "virtual machine already suspended %d/%d",
                    vm->suspend, how);
                return (EALREADY);
        }

        VM_CTR1(vm, "virtual machine successfully suspended %d", how);

        /*
         * Notify all active vcpus that they are now suspended.
         */
        for (i = 0; i < VM_MAXCPU; i++) {
                if (CPU_ISSET(i, &vm->active_cpus))
                        vcpu_notify_event(vm, i, false);
        }

        return (0);
}

void
vm_exit_suspended(struct vm *vm, int vcpuid, uint64_t rip)
{
        struct vm_exit *vmexit;

        KASSERT(vm->suspend > VM_SUSPEND_NONE && vm->suspend < VM_SUSPEND_LAST,
            ("vm_exit_suspended: invalid suspend type %d", vm->suspend));

        vmexit = vm_exitinfo(vm, vcpuid);
        vmexit->rip = rip;
        vmexit->inst_length = 0;
        vmexit->exitcode = VM_EXITCODE_SUSPENDED;
        vmexit->u.suspended.how = vm->suspend;
}

void
vm_exit_rendezvous(struct vm *vm, int vcpuid, uint64_t rip)
{
        struct vm_exit *vmexit;

        KASSERT(vm->rendezvous_func != NULL, ("rendezvous not in progress"));

        vmexit = vm_exitinfo(vm, vcpuid);
        vmexit->rip = rip;
        vmexit->inst_length = 0;
        vmexit->exitcode = VM_EXITCODE_RENDEZVOUS;
        vmm_stat_incr(vm, vcpuid, VMEXIT_RENDEZVOUS, 1);
}

void
vm_exit_astpending(struct vm *vm, int vcpuid, uint64_t rip)
{
        struct vm_exit *vmexit;

        vmexit = vm_exitinfo(vm, vcpuid);
        vmexit->rip = rip;
        vmexit->inst_length = 0;
        vmexit->exitcode = VM_EXITCODE_BOGUS;
        vmm_stat_incr(vm, vcpuid, VMEXIT_ASTPENDING, 1);
}
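
/*
 * The vm_exit_*() helpers above all set 'inst_length' to 0 so that,
 * once the exit has been serviced, vm_run() restarts the guest at the
 * same %rip and the interrupted instruction is re-executed.
 */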

int
vm_run(struct vm *vm, struct vm_run *vmrun)
{
        int error, vcpuid;
        struct vcpu *vcpu;
        struct pcb *pcb;
        uint64_t tscval, rip;
        struct vm_exit *vme;
        bool retu, intr_disabled;
        pmap_t pmap;
        void *rptr, *sptr;

        vcpuid = vmrun->cpuid;

        if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
                return (EINVAL);

        if (!CPU_ISSET(vcpuid, &vm->active_cpus))
                return (EINVAL);

        if (CPU_ISSET(vcpuid, &vm->suspended_cpus))
                return (EINVAL);

        rptr = &vm->rendezvous_func;
        sptr = &vm->suspend;
        pmap = vmspace_pmap(vm->vmspace);
        vcpu = &vm->vcpu[vcpuid];
        vme = &vcpu->exitinfo;
        rip = vmrun->rip;
restart:
        critical_enter();

        KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active),
            ("vm_run: absurd pm_active"));

        tscval = rdtsc();

        pcb = PCPU_GET(curpcb);
        set_pcb_flags(pcb, PCB_FULL_IRET);

        restore_guest_fpustate(vcpu);

        vcpu_require_state(vm, vcpuid, VCPU_RUNNING);
        error = VMRUN(vm->cookie, vcpuid, rip, pmap, rptr, sptr);
        vcpu_require_state(vm, vcpuid, VCPU_FROZEN);

        save_guest_fpustate(vcpu);

        vmm_stat_incr(vm, vcpuid, VCPU_TOTAL_RUNTIME, rdtsc() - tscval);

        critical_exit();

        if (error == 0) {
                retu = false;
                switch (vme->exitcode) {
                case VM_EXITCODE_SUSPENDED:
                        error = vm_handle_suspend(vm, vcpuid, &retu);
                        break;
                case VM_EXITCODE_IOAPIC_EOI:
                        vioapic_process_eoi(vm, vcpuid,
                            vme->u.ioapic_eoi.vector);
                        break;
                case VM_EXITCODE_RENDEZVOUS:
                        vm_handle_rendezvous(vm, vcpuid);
                        error = 0;
                        break;
                case VM_EXITCODE_HLT:
                        intr_disabled = ((vme->u.hlt.rflags & PSL_I) == 0);
                        error = vm_handle_hlt(vm, vcpuid, intr_disabled, &retu);
                        break;
                case VM_EXITCODE_PAGING:
                        error = vm_handle_paging(vm, vcpuid, &retu);
                        break;
                case VM_EXITCODE_INST_EMUL:
                        error = vm_handle_inst_emul(vm, vcpuid, &retu);
                        break;
                case VM_EXITCODE_INOUT:
                case VM_EXITCODE_INOUT_STR:
                        error = vm_handle_inout(vm, vcpuid, vme, &retu);
                        break;
                case VM_EXITCODE_MONITOR:
                case VM_EXITCODE_MWAIT:
                        vm_inject_ud(vm, vcpuid);
                        break;
                default:
                        retu = true;    /* handled in userland */
                        break;
                }
        }

        if (error == 0 && retu == false) {
                rip = vme->rip + vme->inst_length;
                goto restart;
        }

        /* copy the exit information */
        bcopy(vme, &vmrun->vm_exit, sizeof(struct vm_exit));
        return (error);
}
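
/*
 * In other words, a single VM_RUN ioctl loops in the kernel for as
 * long as each exit can be handled here (HLT, paging, in-kernel MMIO,
 * I/O port emulation, etc.) and only returns to userspace when 'retu'
 * is set or an error occurs, with the exit details copied out for the
 * caller.
 */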
int
vm_exit_intinfo(struct vm *vm, int vcpuid, uint64_t info)
{
	struct vcpu *vcpu;
	int type, vector;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		return (EINVAL);

	vcpu = &vm->vcpu[vcpuid];

	if (info & VM_INTINFO_VALID) {
		type = info & VM_INTINFO_TYPE;
		vector = info & 0xff;
		if (type == VM_INTINFO_NMI && vector != IDT_NMI)
			return (EINVAL);
		if (type == VM_INTINFO_HWEXCEPTION && vector >= 32)
			return (EINVAL);
		if (info & VM_INTINFO_RSVD)
			return (EINVAL);
	} else {
		info = 0;
	}
	VCPU_CTR2(vm, vcpuid, "%s: info1(%#lx)", __func__, info);
	vcpu->exitintinfo = info;
	return (0);
}

enum exc_class {
	EXC_BENIGN,
	EXC_CONTRIBUTORY,
	EXC_PAGEFAULT
};

#define	IDT_VE	20	/* Virtualization Exception (Intel specific) */

static enum exc_class
exception_class(uint64_t info)
{
	int type, vector;

	KASSERT(info & VM_INTINFO_VALID, ("intinfo must be valid: %#lx", info));
	type = info & VM_INTINFO_TYPE;
	vector = info & 0xff;

	/* Table 6-4, "Interrupt and Exception Classes", Intel SDM, Vol 3 */
	switch (type) {
	case VM_INTINFO_HWINTR:
	case VM_INTINFO_SWINTR:
	case VM_INTINFO_NMI:
		return (EXC_BENIGN);
	default:
		/*
		 * Hardware exception.
		 *
		 * SVM and VT-x use identical type values to represent NMI,
		 * hardware interrupt and software interrupt.
		 *
		 * SVM uses type '3' for all exceptions. VT-x uses type '3'
		 * for exceptions except #BP and #OF. #BP and #OF use a type
		 * value of '5' or '6'. Therefore we don't check for explicit
		 * values of 'type' to classify 'intinfo' into a hardware
		 * exception.
		 */
		break;
	}

	switch (vector) {
	case IDT_PF:
	case IDT_VE:
		return (EXC_PAGEFAULT);
	case IDT_DE:
	case IDT_TS:
	case IDT_NP:
	case IDT_SS:
	case IDT_GP:
		return (EXC_CONTRIBUTORY);
	default:
		return (EXC_BENIGN);
	}
}
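/*
 * Worked example of the classification above (not in the original
 * source), assuming the usual x86 vector assignments:
 *
 *	#GP (vector 13, hardware exception)    -> EXC_CONTRIBUTORY
 *	#PF (vector 14, hardware exception)    -> EXC_PAGEFAULT
 *	#BP (vector 3, type '5'/'6' on VT-x)   -> EXC_BENIGN
 *	external interrupt (VM_INTINFO_HWINTR) -> EXC_BENIGN, regardless
 *	                                          of vector
 */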
static int
nested_fault(struct vm *vm, int vcpuid, uint64_t info1, uint64_t info2,
    uint64_t *retinfo)
{
	enum exc_class exc1, exc2;
	int type1, vector1;

	KASSERT(info1 & VM_INTINFO_VALID, ("info1 %#lx is not valid", info1));
	KASSERT(info2 & VM_INTINFO_VALID, ("info2 %#lx is not valid", info2));

	/*
	 * If an exception occurs while attempting to call the double-fault
	 * handler the processor enters shutdown mode (aka triple fault).
	 */
	type1 = info1 & VM_INTINFO_TYPE;
	vector1 = info1 & 0xff;
	if (type1 == VM_INTINFO_HWEXCEPTION && vector1 == IDT_DF) {
		VCPU_CTR2(vm, vcpuid, "triple fault: info1(%#lx), info2(%#lx)",
		    info1, info2);
		vm_suspend(vm, VM_SUSPEND_TRIPLEFAULT);
		*retinfo = 0;
		return (0);
	}

	/*
	 * Table 6-5 "Conditions for Generating a Double Fault", Intel SDM,
	 * Vol 3
	 */
	exc1 = exception_class(info1);
	exc2 = exception_class(info2);
	if ((exc1 == EXC_CONTRIBUTORY && exc2 == EXC_CONTRIBUTORY) ||
	    (exc1 == EXC_PAGEFAULT && exc2 != EXC_BENIGN)) {
		/* Convert nested fault into a double fault. */
		*retinfo = IDT_DF;
		*retinfo |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION;
		*retinfo |= VM_INTINFO_DEL_ERRCODE;
	} else {
		/* Handle exceptions serially */
		*retinfo = info2;
	}
	return (1);
}

static uint64_t
vcpu_exception_intinfo(struct vcpu *vcpu)
{
	uint64_t info = 0;

	if (vcpu->exception_pending) {
		info = vcpu->exception.vector & 0xff;
		info |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION;
		if (vcpu->exception.error_code_valid) {
			info |= VM_INTINFO_DEL_ERRCODE;
			info |= (uint64_t)vcpu->exception.error_code << 32;
		}
	}
	return (info);
}

int
vm_entry_intinfo(struct vm *vm, int vcpuid, uint64_t *retinfo)
{
	struct vcpu *vcpu;
	uint64_t info1, info2;
	int valid;

	KASSERT(vcpuid >= 0 && vcpuid < VM_MAXCPU, ("invalid vcpu %d", vcpuid));

	vcpu = &vm->vcpu[vcpuid];

	info1 = vcpu->exitintinfo;
	vcpu->exitintinfo = 0;

	info2 = 0;
	if (vcpu->exception_pending) {
		info2 = vcpu_exception_intinfo(vcpu);
		vcpu->exception_pending = 0;
		VCPU_CTR2(vm, vcpuid, "Exception %d delivered: %#lx",
		    vcpu->exception.vector, info2);
	}

	if ((info1 & VM_INTINFO_VALID) && (info2 & VM_INTINFO_VALID)) {
		valid = nested_fault(vm, vcpuid, info1, info2, retinfo);
	} else if (info1 & VM_INTINFO_VALID) {
		*retinfo = info1;
		valid = 1;
	} else if (info2 & VM_INTINFO_VALID) {
		*retinfo = info2;
		valid = 1;
	} else {
		valid = 0;
	}

	if (valid) {
		VCPU_CTR4(vm, vcpuid, "%s: info1(%#lx), info2(%#lx), "
		    "retinfo(%#lx)", __func__, info1, info2, *retinfo);
	}

	return (valid);
}
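/*
 * Worked example (not in the original source): suppose delivery of a
 * #PF recorded in 'exitintinfo' (info1) itself faults with a #GP
 * (pending exception, info2). Then exc1 == EXC_PAGEFAULT and
 * exc2 == EXC_CONTRIBUTORY, so nested_fault() collapses the pair into
 * a double fault:
 *
 *	*retinfo = IDT_DF | VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION |
 *	    VM_INTINFO_DEL_ERRCODE;	// #DF always pushes an error code
 *
 * Had info1 already been a #DF, the VM would instead have been
 * suspended with VM_SUSPEND_TRIPLEFAULT.
 */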
int
vm_get_intinfo(struct vm *vm, int vcpuid, uint64_t *info1, uint64_t *info2)
{
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		return (EINVAL);

	vcpu = &vm->vcpu[vcpuid];
	*info1 = vcpu->exitintinfo;
	*info2 = vcpu_exception_intinfo(vcpu);
	return (0);
}

int
vm_inject_exception(struct vm *vm, int vcpuid, struct vm_exception *exception)
{
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		return (EINVAL);

	if (exception->vector < 0 || exception->vector >= 32)
		return (EINVAL);

	/*
	 * A double fault exception should never be injected directly into
	 * the guest. It is a derived exception that results from specific
	 * combinations of nested faults.
	 */
	if (exception->vector == IDT_DF)
		return (EINVAL);

	vcpu = &vm->vcpu[vcpuid];

	if (vcpu->exception_pending) {
		VCPU_CTR2(vm, vcpuid, "Unable to inject exception %d due to "
		    "pending exception %d", exception->vector,
		    vcpu->exception.vector);
		return (EBUSY);
	}

	vcpu->exception_pending = 1;
	vcpu->exception = *exception;
	VCPU_CTR1(vm, vcpuid, "Exception %d pending", exception->vector);
	return (0);
}
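/*
 * Illustrative use of vm_inject_exception() (not in the original
 * source) -- injecting #UD, as the MONITOR/MWAIT exit handling in
 * vm_run() does via the vm_inject_ud() wrapper:
 *
 *	struct vm_exception ex;
 *
 *	bzero(&ex, sizeof(ex));
 *	ex.vector = IDT_UD;	// #UD does not push an error code
 *	error = vm_inject_exception(vm, vcpuid, &ex);
 *	// EBUSY means an exception is already pending; EINVAL covers
 *	// out-of-range vectors and direct #DF injection attempts.
 */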
void
vm_inject_fault(void *vmarg, int vcpuid, int vector, int errcode_valid,
    int errcode)
{
	struct vm_exception exception;
	struct vm_exit *vmexit;
	struct vm *vm;
	int error;

	vm = vmarg;

	exception.vector = vector;
	exception.error_code = errcode;
	exception.error_code_valid = errcode_valid;
	error = vm_inject_exception(vm, vcpuid, &exception);
	KASSERT(error == 0, ("vm_inject_exception error %d", error));

	/*
	 * A fault-like exception allows the instruction to be restarted
	 * after the exception handler returns.
	 *
	 * By setting the inst_length to 0 we ensure that the instruction
	 * pointer remains at the faulting instruction.
	 */
	vmexit = vm_exitinfo(vm, vcpuid);
	vmexit->inst_length = 0;
}

void
vm_inject_pf(void *vmarg, int vcpuid, int error_code, uint64_t cr2)
{
	struct vm *vm;
	int error;

	vm = vmarg;
	VCPU_CTR2(vm, vcpuid, "Injecting page fault: error_code %#x, cr2 %#lx",
	    error_code, cr2);

	error = vm_set_register(vm, vcpuid, VM_REG_GUEST_CR2, cr2);
	KASSERT(error == 0, ("vm_set_register(cr2) error %d", error));

	vm_inject_fault(vm, vcpuid, IDT_PF, 1, error_code);
}

static VMM_STAT(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu");

int
vm_inject_nmi(struct vm *vm, int vcpuid)
{
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		return (EINVAL);

	vcpu = &vm->vcpu[vcpuid];

	vcpu->nmi_pending = 1;
	vcpu_notify_event(vm, vcpuid, false);
	return (0);
}

int
vm_nmi_pending(struct vm *vm, int vcpuid)
{
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		panic("vm_nmi_pending: invalid vcpuid %d", vcpuid);

	vcpu = &vm->vcpu[vcpuid];

	return (vcpu->nmi_pending);
}

void
vm_nmi_clear(struct vm *vm, int vcpuid)
{
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		panic("vm_nmi_clear: invalid vcpuid %d", vcpuid);

	vcpu = &vm->vcpu[vcpuid];

	if (vcpu->nmi_pending == 0)
		panic("vm_nmi_clear: inconsistent nmi_pending state");

	vcpu->nmi_pending = 0;
	vmm_stat_incr(vm, vcpuid, VCPU_NMI_COUNT, 1);
}

static VMM_STAT(VCPU_EXTINT_COUNT, "number of ExtINTs delivered to vcpu");

int
vm_inject_extint(struct vm *vm, int vcpuid)
{
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		return (EINVAL);

	vcpu = &vm->vcpu[vcpuid];

	vcpu->extint_pending = 1;
	vcpu_notify_event(vm, vcpuid, false);
	return (0);
}

int
vm_extint_pending(struct vm *vm, int vcpuid)
{
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		panic("vm_extint_pending: invalid vcpuid %d", vcpuid);

	vcpu = &vm->vcpu[vcpuid];

	return (vcpu->extint_pending);
}
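/*
 * Sketch of the pending/clear handshake (not in the original source).
 * The hardware-specific backend is expected to poll the pending bit
 * before guest entry and acknowledge with the matching clear routine
 * once the event has actually been injected, e.g.:
 *
 *	if (vm_nmi_pending(vm, vcpuid)) {
 *		// ...program the VMCS/VMCB to deliver the NMI...
 *		vm_nmi_clear(vm, vcpuid);
 *	}
 *
 * vm_nmi_clear()/vm_extint_clear() panic if nothing was pending, which
 * catches a backend acknowledging an event it never saw.
 */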
panic("vm_extint_pending: invalid vcpuid %d", vcpuid); 1863268891Sjhb 1864268891Sjhb vcpu = &vm->vcpu[vcpuid]; 1865268891Sjhb 1866268891Sjhb if (vcpu->extint_pending == 0) 1867268891Sjhb panic("vm_extint_clear: inconsistent extint_pending state"); 1868268891Sjhb 1869268891Sjhb vcpu->extint_pending = 0; 1870268891Sjhb vmm_stat_incr(vm, vcpuid, VCPU_EXTINT_COUNT, 1); 1871268891Sjhb} 1872268891Sjhb 1873268891Sjhbint 1874221828Sgrehanvm_get_capability(struct vm *vm, int vcpu, int type, int *retval) 1875221828Sgrehan{ 1876221828Sgrehan if (vcpu < 0 || vcpu >= VM_MAXCPU) 1877221828Sgrehan return (EINVAL); 1878221828Sgrehan 1879221828Sgrehan if (type < 0 || type >= VM_CAP_MAX) 1880221828Sgrehan return (EINVAL); 1881221828Sgrehan 1882221828Sgrehan return (VMGETCAP(vm->cookie, vcpu, type, retval)); 1883221828Sgrehan} 1884221828Sgrehan 1885221828Sgrehanint 1886221828Sgrehanvm_set_capability(struct vm *vm, int vcpu, int type, int val) 1887221828Sgrehan{ 1888221828Sgrehan if (vcpu < 0 || vcpu >= VM_MAXCPU) 1889221828Sgrehan return (EINVAL); 1890221828Sgrehan 1891221828Sgrehan if (type < 0 || type >= VM_CAP_MAX) 1892221828Sgrehan return (EINVAL); 1893221828Sgrehan 1894221828Sgrehan return (VMSETCAP(vm->cookie, vcpu, type, val)); 1895221828Sgrehan} 1896221828Sgrehan 1897221828Sgrehanstruct vlapic * 1898221828Sgrehanvm_lapic(struct vm *vm, int cpu) 1899221828Sgrehan{ 1900221828Sgrehan return (vm->vcpu[cpu].vlapic); 1901221828Sgrehan} 1902221828Sgrehan 1903261088Sjhbstruct vioapic * 1904261088Sjhbvm_ioapic(struct vm *vm) 1905261088Sjhb{ 1906261088Sjhb 1907261088Sjhb return (vm->vioapic); 1908261088Sjhb} 1909261088Sjhb 1910261088Sjhbstruct vhpet * 1911261088Sjhbvm_hpet(struct vm *vm) 1912261088Sjhb{ 1913261088Sjhb 1914261088Sjhb return (vm->vhpet); 1915261088Sjhb} 1916261088Sjhb 1917221828Sgrehanboolean_t 1918221828Sgrehanvmm_is_pptdev(int bus, int slot, int func) 1919221828Sgrehan{ 1920246188Sneel int found, i, n; 1921246188Sneel int b, s, f; 1922221828Sgrehan char *val, *cp, *cp2; 1923221828Sgrehan 1924221828Sgrehan /* 1925246188Sneel * XXX 1926246188Sneel * The length of an environment variable is limited to 128 bytes which 1927246188Sneel * puts an upper limit on the number of passthru devices that may be 1928246188Sneel * specified using a single environment variable. 1929246188Sneel * 1930246188Sneel * Work around this by scanning multiple environment variable 1931246188Sneel * names instead of a single one - yuck! 
boolean_t
vmm_is_pptdev(int bus, int slot, int func)
{
	int found, i, n;
	int b, s, f;
	char *val, *cp, *cp2;

	/*
	 * XXX
	 * The length of an environment variable is limited to 128 bytes which
	 * puts an upper limit on the number of passthru devices that may be
	 * specified using a single environment variable.
	 *
	 * Work around this by scanning multiple environment variable
	 * names instead of a single one - yuck!
	 */
	const char *names[] = { "pptdevs", "pptdevs2", "pptdevs3", NULL };

	/* set pptdevs="1/2/3 4/5/6 7/8/9 10/11/12" */
	found = 0;
	for (i = 0; names[i] != NULL && !found; i++) {
		cp = val = getenv(names[i]);
		while (cp != NULL && *cp != '\0') {
			if ((cp2 = strchr(cp, ' ')) != NULL)
				*cp2 = '\0';

			n = sscanf(cp, "%d/%d/%d", &b, &s, &f);
			if (n == 3 && bus == b && slot == s && func == f) {
				found = 1;
				break;
			}

			if (cp2 != NULL)
				*cp2++ = ' ';

			cp = cp2;
		}
		freeenv(val);
	}
	return (found);
}

void *
vm_iommu_domain(struct vm *vm)
{

	return (vm->iommu);
}

int
vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state newstate,
    bool from_idle)
{
	int error;
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		panic("vcpu_set_state: invalid vcpuid %d", vcpuid);

	vcpu = &vm->vcpu[vcpuid];

	vcpu_lock(vcpu);
	error = vcpu_set_state_locked(vcpu, newstate, from_idle);
	vcpu_unlock(vcpu);

	return (error);
}

enum vcpu_state
vcpu_get_state(struct vm *vm, int vcpuid, int *hostcpu)
{
	struct vcpu *vcpu;
	enum vcpu_state state;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		panic("vcpu_get_state: invalid vcpuid %d", vcpuid);

	vcpu = &vm->vcpu[vcpuid];

	vcpu_lock(vcpu);
	state = vcpu->state;
	if (hostcpu != NULL)
		*hostcpu = vcpu->hostcpu;
	vcpu_unlock(vcpu);

	return (state);
}

int
vm_activate_cpu(struct vm *vm, int vcpuid)
{

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		return (EINVAL);

	if (CPU_ISSET(vcpuid, &vm->active_cpus))
		return (EBUSY);

	VCPU_CTR0(vm, vcpuid, "activated");
	CPU_SET_ATOMIC(vcpuid, &vm->active_cpus);
	return (0);
}

cpuset_t
vm_active_cpus(struct vm *vm)
{

	return (vm->active_cpus);
}

cpuset_t
vm_suspended_cpus(struct vm *vm)
{

	return (vm->suspended_cpus);
}
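/*
 * Not in the original source -- the vcpu state machine that
 * vcpu_set_state()/vcpu_get_state() arbitrate, pieced together from the
 * transitions visible in this file:
 *
 *	VCPU_IDLE     -> VCPU_FROZEN	ioctl/caller claims the vcpu
 *	VCPU_FROZEN   -> VCPU_RUNNING	vm_run() enters the guest
 *	VCPU_RUNNING  -> VCPU_FROZEN	VM-exit back into the hypervisor
 *	VCPU_FROZEN   -> VCPU_SLEEPING	vcpu idles (e.g. HLT handling)
 *	VCPU_SLEEPING -> VCPU_FROZEN	wakeup via vcpu_notify_event()
 *	VCPU_FROZEN   -> VCPU_IDLE	caller releases the vcpu
 *
 * 'from_idle' requests that the transition only be made from VCPU_IDLE,
 * so concurrent claimants serialize correctly.
 */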
void *
vcpu_stats(struct vm *vm, int vcpuid)
{

	return (vm->vcpu[vcpuid].stats);
}

int
vm_get_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state *state)
{
	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		return (EINVAL);

	*state = vm->vcpu[vcpuid].x2apic_state;

	return (0);
}

int
vm_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state)
{
	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		return (EINVAL);

	if (state >= X2APIC_STATE_LAST)
		return (EINVAL);

	vm->vcpu[vcpuid].x2apic_state = state;

	vlapic_set_x2apic_state(vm, vcpuid, state);

	return (0);
}
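/*
 * Illustrative only, not in the original source -- forcing a vcpu's
 * local APIC back to xAPIC mode:
 *
 *	error = vm_set_x2apic_state(vm, vcpuid, X2APIC_DISABLED);
 *
 * The state is validated against X2APIC_STATE_LAST and then pushed down
 * to the vlapic so that MSR-based (x2APIC) versus MMIO-based (xAPIC)
 * register access is emulated consistently.
 */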
/*
 * This function is called to ensure that a vcpu "sees" a pending event
 * as soon as possible:
 * - If the vcpu thread is sleeping then it is woken up.
 * - If the vcpu is running on a different host_cpu then an IPI will be
 *   directed to the host_cpu to cause the vcpu to trap into the hypervisor.
 */
void
vcpu_notify_event(struct vm *vm, int vcpuid, bool lapic_intr)
{
	int hostcpu;
	struct vcpu *vcpu;

	vcpu = &vm->vcpu[vcpuid];

	vcpu_lock(vcpu);
	hostcpu = vcpu->hostcpu;
	if (vcpu->state == VCPU_RUNNING) {
		KASSERT(hostcpu != NOCPU, ("vcpu running on invalid hostcpu"));
		if (hostcpu != curcpu) {
			if (lapic_intr) {
				vlapic_post_intr(vcpu->vlapic, hostcpu,
				    vmm_ipinum);
			} else {
				ipi_cpu(hostcpu, vmm_ipinum);
			}
		} else {
			/*
			 * If the 'vcpu' is running on 'curcpu' then it must
			 * be sending a notification to itself (e.g. SELF_IPI).
			 * The pending event will be picked up when the vcpu
			 * transitions back to guest context.
			 */
		}
	} else {
		KASSERT(hostcpu == NOCPU, ("vcpu state %d not consistent "
		    "with hostcpu %d", vcpu->state, hostcpu));
		if (vcpu->state == VCPU_SLEEPING)
			wakeup_one(vcpu);
	}
	vcpu_unlock(vcpu);
}

struct vmspace *
vm_get_vmspace(struct vm *vm)
{

	return (vm->vmspace);
}

int
vm_apicid2vcpuid(struct vm *vm, int apicid)
{
	/*
	 * XXX apic id is assumed to be numerically identical to vcpu id
	 */
	return (apicid);
}

void
vm_smp_rendezvous(struct vm *vm, int vcpuid, cpuset_t dest,
    vm_rendezvous_func_t func, void *arg)
{
	int i;

	/*
	 * Enforce that this function is called without any locks
	 */
	WITNESS_WARN(WARN_PANIC, NULL, "vm_smp_rendezvous");
	KASSERT(vcpuid == -1 || (vcpuid >= 0 && vcpuid < VM_MAXCPU),
	    ("vm_smp_rendezvous: invalid vcpuid %d", vcpuid));

restart:
	mtx_lock(&vm->rendezvous_mtx);
	if (vm->rendezvous_func != NULL) {
		/*
		 * If a rendezvous is already in progress then we need to
		 * call the rendezvous handler in case this 'vcpuid' is one
		 * of the targets of the rendezvous.
		 */
		RENDEZVOUS_CTR0(vm, vcpuid, "Rendezvous already in progress");
		mtx_unlock(&vm->rendezvous_mtx);
		vm_handle_rendezvous(vm, vcpuid);
		goto restart;
	}
	KASSERT(vm->rendezvous_func == NULL, ("vm_smp_rendezvous: previous "
	    "rendezvous is still in progress"));

	RENDEZVOUS_CTR0(vm, vcpuid, "Initiating rendezvous");
	vm->rendezvous_req_cpus = dest;
	CPU_ZERO(&vm->rendezvous_done_cpus);
	vm->rendezvous_arg = arg;
	vm_set_rendezvous_func(vm, func);
	mtx_unlock(&vm->rendezvous_mtx);

	/*
	 * Wake up any sleeping vcpus and trigger a VM-exit in any running
	 * vcpus so they handle the rendezvous as soon as possible.
	 */
	for (i = 0; i < VM_MAXCPU; i++) {
		if (CPU_ISSET(i, &dest))
			vcpu_notify_event(vm, i, false);
	}

	vm_handle_rendezvous(vm, vcpuid);
}
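/*
 * Hypothetical rendezvous usage (not in the original source);
 * 'sync_vlapic' stands in for any vm_rendezvous_func_t. The virtual
 * interrupt controllers use this mechanism when a change must be
 * observed by every vcpu before the initiator proceeds:
 *
 *	static void
 *	sync_vlapic(struct vm *vm, int vcpuid, void *arg)
 *	{
 *		// runs once on each vcpu named in 'dest'
 *	}
 *
 *	cpuset_t dest = vm_active_cpus(vm);
 *	vm_smp_rendezvous(vm, vcpuid, dest, sync_vlapic, NULL);
 */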
struct vatpic *
vm_atpic(struct vm *vm)
{
	return (vm->vatpic);
}

struct vatpit *
vm_atpit(struct vm *vm)
{
	return (vm->vatpit);
}

enum vm_reg_name
vm_segment_name(int seg)
{
	static enum vm_reg_name seg_names[] = {
		VM_REG_GUEST_ES,
		VM_REG_GUEST_CS,
		VM_REG_GUEST_SS,
		VM_REG_GUEST_DS,
		VM_REG_GUEST_FS,
		VM_REG_GUEST_GS
	};

	KASSERT(seg >= 0 && seg < nitems(seg_names),
	    ("%s: invalid segment encoding %d", __func__, seg));
	return (seg_names[seg]);
}

void
vm_copy_teardown(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo,
    int num_copyinfo)
{
	int idx;

	for (idx = 0; idx < num_copyinfo; idx++) {
		if (copyinfo[idx].cookie != NULL)
			vm_gpa_release(copyinfo[idx].cookie);
	}
	bzero(copyinfo, num_copyinfo * sizeof(struct vm_copyinfo));
}

int
vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
    uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo,
    int num_copyinfo)
{
	int error, idx, nused;
	size_t n, off, remaining;
	void *hva, *cookie;
	uint64_t gpa;

	bzero(copyinfo, sizeof(struct vm_copyinfo) * num_copyinfo);

	nused = 0;
	remaining = len;
	while (remaining > 0) {
		KASSERT(nused < num_copyinfo, ("insufficient vm_copyinfo"));
		error = vmm_gla2gpa(vm, vcpuid, paging, gla, prot, &gpa);
		if (error)
			return (error);
		off = gpa & PAGE_MASK;
		n = min(remaining, PAGE_SIZE - off);
		copyinfo[nused].gpa = gpa;
		copyinfo[nused].len = n;
		remaining -= n;
		gla += n;
		nused++;
	}

	for (idx = 0; idx < nused; idx++) {
		hva = vm_gpa_hold(vm, copyinfo[idx].gpa, copyinfo[idx].len,
		    prot, &cookie);
		if (hva == NULL)
			break;
		copyinfo[idx].hva = hva;
		copyinfo[idx].cookie = cookie;
	}

	if (idx != nused) {
		vm_copy_teardown(vm, vcpuid, copyinfo, num_copyinfo);
		return (-1);
	} else {
		return (0);
	}
}
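/*
 * Worked example (not in the original source). With 4KB pages, a
 * 100-byte copy at gla 0x1ff0 straddles a page boundary, so
 * vm_copy_setup() produces two segments:
 *
 *	copyinfo[0]: gpa of gla 0x1ff0, len = 16   (to end of page)
 *	copyinfo[1]: gpa of gla 0x2000, len = 84
 *
 * Each segment is then wired via vm_gpa_hold(); if any hold fails the
 * whole setup is torn down and -1 is returned, so callers never see a
 * partially mapped buffer.
 */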
void
vm_copyin(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, void *kaddr,
    size_t len)
{
	char *dst;
	int idx;

	dst = kaddr;
	idx = 0;
	while (len > 0) {
		bcopy(copyinfo[idx].hva, dst, copyinfo[idx].len);
		len -= copyinfo[idx].len;
		dst += copyinfo[idx].len;
		idx++;
	}
}

void
vm_copyout(struct vm *vm, int vcpuid, const void *kaddr,
    struct vm_copyinfo *copyinfo, size_t len)
{
	const char *src;
	int idx;

	src = kaddr;
	idx = 0;
	while (len > 0) {
		bcopy(src, copyinfo[idx].hva, copyinfo[idx].len);
		len -= copyinfo[idx].len;
		src += copyinfo[idx].len;
		idx++;
	}
}

/*
 * Return the amount of in-use and wired memory for the VM. Since
 * these are global stats, only return the values for vCPU 0.
 */
VMM_STAT_DECLARE(VMM_MEM_RESIDENT);
VMM_STAT_DECLARE(VMM_MEM_WIRED);

static void
vm_get_rescnt(struct vm *vm, int vcpu, struct vmm_stat_type *stat)
{

	if (vcpu == 0) {
		vmm_stat_set(vm, vcpu, VMM_MEM_RESIDENT,
		    PAGE_SIZE * vmspace_resident_count(vm->vmspace));
	}
}

static void
vm_get_wiredcnt(struct vm *vm, int vcpu, struct vmm_stat_type *stat)
{

	if (vcpu == 0) {
		vmm_stat_set(vm, vcpu, VMM_MEM_WIRED,
		    PAGE_SIZE * pmap_wired_count(vmspace_pmap(vm->vmspace)));
	}
}

VMM_STAT_FUNC(VMM_MEM_RESIDENT, "Resident memory", vm_get_rescnt);
VMM_STAT_FUNC(VMM_MEM_WIRED, "Wired memory", vm_get_wiredcnt);
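/*
 * End-to-end sketch of the guest copy API above (not in the original
 * source), as an instruction emulation path might use it; 'paging',
 * 'gla' and the prot flag are assumptions about the caller:
 *
 *	struct vm_copyinfo copyinfo[2];
 *	uint64_t buf;
 *
 *	error = vm_copy_setup(vm, vcpuid, &paging, gla, sizeof(buf),
 *	    PROT_READ, copyinfo, nitems(copyinfo));
 *	if (error == 0) {
 *		vm_copyin(vm, vcpuid, copyinfo, &buf, sizeof(buf));
 *		vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
 *	}
 */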