/* vmm.c — FreeBSD stable/10 sys/amd64/vmm/vmm.c, revision 276403 (reconstructed from svn annotate output) */
1221828Sgrehan/*- 2221828Sgrehan * Copyright (c) 2011 NetApp, Inc. 3221828Sgrehan * All rights reserved. 4221828Sgrehan * 5221828Sgrehan * Redistribution and use in source and binary forms, with or without 6221828Sgrehan * modification, are permitted provided that the following conditions 7221828Sgrehan * are met: 8221828Sgrehan * 1. Redistributions of source code must retain the above copyright 9221828Sgrehan * notice, this list of conditions and the following disclaimer. 10221828Sgrehan * 2. Redistributions in binary form must reproduce the above copyright 11221828Sgrehan * notice, this list of conditions and the following disclaimer in the 12221828Sgrehan * documentation and/or other materials provided with the distribution. 13221828Sgrehan * 14221828Sgrehan * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND 15221828Sgrehan * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16221828Sgrehan * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17221828Sgrehan * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE 18221828Sgrehan * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19221828Sgrehan * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 20221828Sgrehan * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 21221828Sgrehan * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22221828Sgrehan * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23221828Sgrehan * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 24221828Sgrehan * SUCH DAMAGE. 
25221828Sgrehan * 26221828Sgrehan * $FreeBSD: stable/10/sys/amd64/vmm/vmm.c 276403 2014-12-30 08:24:14Z neel $ 27221828Sgrehan */ 28221828Sgrehan 29221828Sgrehan#include <sys/cdefs.h> 30221828Sgrehan__FBSDID("$FreeBSD: stable/10/sys/amd64/vmm/vmm.c 276403 2014-12-30 08:24:14Z neel $"); 31221828Sgrehan 32221828Sgrehan#include <sys/param.h> 33234695Sgrehan#include <sys/systm.h> 34221828Sgrehan#include <sys/kernel.h> 35221828Sgrehan#include <sys/module.h> 36221828Sgrehan#include <sys/sysctl.h> 37221828Sgrehan#include <sys/malloc.h> 38221828Sgrehan#include <sys/pcpu.h> 39221828Sgrehan#include <sys/lock.h> 40221828Sgrehan#include <sys/mutex.h> 41221828Sgrehan#include <sys/proc.h> 42256072Sneel#include <sys/rwlock.h> 43221828Sgrehan#include <sys/sched.h> 44221828Sgrehan#include <sys/smp.h> 45221828Sgrehan#include <sys/systm.h> 46221828Sgrehan 47221828Sgrehan#include <vm/vm.h> 48256072Sneel#include <vm/vm_object.h> 49256072Sneel#include <vm/vm_page.h> 50256072Sneel#include <vm/pmap.h> 51256072Sneel#include <vm/vm_map.h> 52256072Sneel#include <vm/vm_extern.h> 53256072Sneel#include <vm/vm_param.h> 54221828Sgrehan 55261275Sjhb#include <machine/cpu.h> 56221828Sgrehan#include <machine/vm.h> 57221828Sgrehan#include <machine/pcb.h> 58241489Sneel#include <machine/smp.h> 59262350Sjhb#include <x86/psl.h> 60221914Sjhb#include <x86/apicreg.h> 61256072Sneel#include <machine/vmparam.h> 62221828Sgrehan 63221828Sgrehan#include <machine/vmm.h> 64261088Sjhb#include <machine/vmm_dev.h> 65268976Sjhb#include <machine/vmm_instruction_emul.h> 66261088Sjhb 67268976Sjhb#include "vmm_ioport.h" 68256072Sneel#include "vmm_ktr.h" 69242275Sneel#include "vmm_host.h" 70221828Sgrehan#include "vmm_mem.h" 71221828Sgrehan#include "vmm_util.h" 72268891Sjhb#include "vatpic.h" 73268891Sjhb#include "vatpit.h" 74261088Sjhb#include "vhpet.h" 75261088Sjhb#include "vioapic.h" 76221828Sgrehan#include "vlapic.h" 77221828Sgrehan#include "vmm_ipi.h" 78221828Sgrehan#include "vmm_stat.h" 79242065Sneel#include 
"vmm_lapic.h" 80221828Sgrehan 81221828Sgrehan#include "io/ppt.h" 82221828Sgrehan#include "io/iommu.h" 83221828Sgrehan 84221828Sgrehanstruct vlapic; 85221828Sgrehan 86270071Sgrehan/* 87270071Sgrehan * Initialization: 88270071Sgrehan * (a) allocated when vcpu is created 89270071Sgrehan * (i) initialized when vcpu is created and when it is reinitialized 90270071Sgrehan * (o) initialized the first time the vcpu is created 91270071Sgrehan * (x) initialized before use 92270071Sgrehan */ 93221828Sgrehanstruct vcpu { 94270071Sgrehan struct mtx mtx; /* (o) protects 'state' and 'hostcpu' */ 95270071Sgrehan enum vcpu_state state; /* (o) vcpu state */ 96270071Sgrehan int hostcpu; /* (o) vcpu's host cpu */ 97270071Sgrehan struct vlapic *vlapic; /* (i) APIC device model */ 98270071Sgrehan enum x2apic_state x2apic_state; /* (i) APIC mode */ 99270159Sgrehan uint64_t exitintinfo; /* (i) events pending at VM exit */ 100270071Sgrehan int nmi_pending; /* (i) NMI pending */ 101270071Sgrehan int extint_pending; /* (i) INTR pending */ 102270071Sgrehan struct vm_exception exception; /* (x) exception collateral */ 103270071Sgrehan int exception_pending; /* (i) exception pending */ 104270071Sgrehan struct savefpu *guestfpu; /* (a,i) guest fpu state */ 105270071Sgrehan uint64_t guest_xcr0; /* (i) guest %xcr0 register */ 106270071Sgrehan void *stats; /* (a,i) statistics */ 107270071Sgrehan struct vm_exit exitinfo; /* (x) exit reason and collateral */ 108221828Sgrehan}; 109221828Sgrehan 110270071Sgrehan#define vcpu_lock_initialized(v) mtx_initialized(&((v)->mtx)) 111242065Sneel#define vcpu_lock_init(v) mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN) 112242065Sneel#define vcpu_lock(v) mtx_lock_spin(&((v)->mtx)) 113242065Sneel#define vcpu_unlock(v) mtx_unlock_spin(&((v)->mtx)) 114256072Sneel#define vcpu_assert_locked(v) mtx_assert(&((v)->mtx), MA_OWNED) 115241489Sneel 116256072Sneelstruct mem_seg { 117256072Sneel vm_paddr_t gpa; 118256072Sneel size_t len; 119256072Sneel boolean_t wired; 
120256072Sneel vm_object_t object; 121256072Sneel}; 122221828Sgrehan#define VM_MAX_MEMORY_SEGMENTS 2 123221828Sgrehan 124270071Sgrehan/* 125270071Sgrehan * Initialization: 126270071Sgrehan * (o) initialized the first time the VM is created 127270071Sgrehan * (i) initialized when VM is created and when it is reinitialized 128270071Sgrehan * (x) initialized before use 129270071Sgrehan */ 130221828Sgrehanstruct vm { 131270071Sgrehan void *cookie; /* (i) cpu-specific data */ 132270071Sgrehan void *iommu; /* (x) iommu-specific data */ 133270071Sgrehan struct vhpet *vhpet; /* (i) virtual HPET */ 134270071Sgrehan struct vioapic *vioapic; /* (i) virtual ioapic */ 135270071Sgrehan struct vatpic *vatpic; /* (i) virtual atpic */ 136270071Sgrehan struct vatpit *vatpit; /* (i) virtual atpit */ 137270071Sgrehan volatile cpuset_t active_cpus; /* (i) active vcpus */ 138270071Sgrehan int suspend; /* (i) stop VM execution */ 139270071Sgrehan volatile cpuset_t suspended_cpus; /* (i) suspended vcpus */ 140270071Sgrehan volatile cpuset_t halted_cpus; /* (x) cpus in a hard halt */ 141270071Sgrehan cpuset_t rendezvous_req_cpus; /* (x) rendezvous requested */ 142270071Sgrehan cpuset_t rendezvous_done_cpus; /* (x) rendezvous finished */ 143270071Sgrehan void *rendezvous_arg; /* (x) rendezvous func/arg */ 144270071Sgrehan vm_rendezvous_func_t rendezvous_func; 145270071Sgrehan struct mtx rendezvous_mtx; /* (o) rendezvous lock */ 146270071Sgrehan int num_mem_segs; /* (o) guest memory segments */ 147256072Sneel struct mem_seg mem_segs[VM_MAX_MEMORY_SEGMENTS]; 148270071Sgrehan struct vmspace *vmspace; /* (o) guest's address space */ 149270071Sgrehan char name[VM_MAX_NAMELEN]; /* (o) virtual machine name */ 150270071Sgrehan struct vcpu vcpu[VM_MAXCPU]; /* (i) guest vcpus */ 151221828Sgrehan}; 152221828Sgrehan 153249396Sneelstatic int vmm_initialized; 154249396Sneel 155221828Sgrehanstatic struct vmm_ops *ops; 156266339Sjhb#define VMM_INIT(num) (ops != NULL ? 
(*ops->init)(num) : 0) 157221828Sgrehan#define VMM_CLEANUP() (ops != NULL ? (*ops->cleanup)() : 0) 158261275Sjhb#define VMM_RESUME() (ops != NULL ? (*ops->resume)() : 0) 159221828Sgrehan 160256072Sneel#define VMINIT(vm, pmap) (ops != NULL ? (*ops->vminit)(vm, pmap): NULL) 161268935Sjhb#define VMRUN(vmi, vcpu, rip, pmap, rptr, sptr) \ 162268935Sjhb (ops != NULL ? (*ops->vmrun)(vmi, vcpu, rip, pmap, rptr, sptr) : ENXIO) 163221828Sgrehan#define VMCLEANUP(vmi) (ops != NULL ? (*ops->vmcleanup)(vmi) : NULL) 164256072Sneel#define VMSPACE_ALLOC(min, max) \ 165256072Sneel (ops != NULL ? (*ops->vmspace_alloc)(min, max) : NULL) 166256072Sneel#define VMSPACE_FREE(vmspace) \ 167256072Sneel (ops != NULL ? (*ops->vmspace_free)(vmspace) : ENXIO) 168221828Sgrehan#define VMGETREG(vmi, vcpu, num, retval) \ 169221828Sgrehan (ops != NULL ? (*ops->vmgetreg)(vmi, vcpu, num, retval) : ENXIO) 170221828Sgrehan#define VMSETREG(vmi, vcpu, num, val) \ 171221828Sgrehan (ops != NULL ? (*ops->vmsetreg)(vmi, vcpu, num, val) : ENXIO) 172221828Sgrehan#define VMGETDESC(vmi, vcpu, num, desc) \ 173221828Sgrehan (ops != NULL ? (*ops->vmgetdesc)(vmi, vcpu, num, desc) : ENXIO) 174221828Sgrehan#define VMSETDESC(vmi, vcpu, num, desc) \ 175221828Sgrehan (ops != NULL ? (*ops->vmsetdesc)(vmi, vcpu, num, desc) : ENXIO) 176221828Sgrehan#define VMGETCAP(vmi, vcpu, num, retval) \ 177221828Sgrehan (ops != NULL ? (*ops->vmgetcap)(vmi, vcpu, num, retval) : ENXIO) 178221828Sgrehan#define VMSETCAP(vmi, vcpu, num, val) \ 179221828Sgrehan (ops != NULL ? (*ops->vmsetcap)(vmi, vcpu, num, val) : ENXIO) 180266339Sjhb#define VLAPIC_INIT(vmi, vcpu) \ 181266339Sjhb (ops != NULL ? (*ops->vlapic_init)(vmi, vcpu) : NULL) 182266339Sjhb#define VLAPIC_CLEANUP(vmi, vlapic) \ 183266339Sjhb (ops != NULL ? 
(*ops->vlapic_cleanup)(vmi, vlapic) : NULL) 184221828Sgrehan 185245021Sneel#define fpu_start_emulating() load_cr0(rcr0() | CR0_TS) 186245021Sneel#define fpu_stop_emulating() clts() 187221828Sgrehan 188221828Sgrehanstatic MALLOC_DEFINE(M_VM, "vm", "vm"); 189221828Sgrehan 190221828Sgrehan/* statistics */ 191248389Sneelstatic VMM_STAT(VCPU_TOTAL_RUNTIME, "vcpu total runtime"); 192221828Sgrehan 193266339SjhbSYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW, NULL, NULL); 194266339Sjhb 195268935Sjhb/* 196268935Sjhb * Halt the guest if all vcpus are executing a HLT instruction with 197268935Sjhb * interrupts disabled. 198268935Sjhb */ 199268935Sjhbstatic int halt_detection_enabled = 1; 200268935SjhbTUNABLE_INT("hw.vmm.halt_detection", &halt_detection_enabled); 201268935SjhbSYSCTL_INT(_hw_vmm, OID_AUTO, halt_detection, CTLFLAG_RDTUN, 202268935Sjhb &halt_detection_enabled, 0, 203268935Sjhb "Halt VM if all vcpus execute HLT with interrupts disabled"); 204268935Sjhb 205266339Sjhbstatic int vmm_ipinum; 206266339SjhbSYSCTL_INT(_hw_vmm, OID_AUTO, ipinum, CTLFLAG_RD, &vmm_ipinum, 0, 207266339Sjhb "IPI vector used for vcpu notifications"); 208266339Sjhb 209276403Sneelstatic int trace_guest_exceptions; 210276403SneelSYSCTL_INT(_hw_vmm, OID_AUTO, trace_guest_exceptions, CTLFLAG_RDTUN, 211276403Sneel &trace_guest_exceptions, 0, 212276403Sneel "Trap into hypervisor on all guest exceptions and reflect them back"); 213276403Sneel 214221828Sgrehanstatic void 215270071Sgrehanvcpu_cleanup(struct vm *vm, int i, bool destroy) 216221828Sgrehan{ 217266339Sjhb struct vcpu *vcpu = &vm->vcpu[i]; 218266339Sjhb 219266339Sjhb VLAPIC_CLEANUP(vm->cookie, vcpu->vlapic); 220270071Sgrehan if (destroy) { 221270071Sgrehan vmm_stat_free(vcpu->stats); 222270071Sgrehan fpu_save_area_free(vcpu->guestfpu); 223270071Sgrehan } 224221828Sgrehan} 225221828Sgrehan 226221828Sgrehanstatic void 227270071Sgrehanvcpu_init(struct vm *vm, int vcpu_id, bool create) 228221828Sgrehan{ 229221828Sgrehan struct vcpu *vcpu; 
230270071Sgrehan 231270071Sgrehan KASSERT(vcpu_id >= 0 && vcpu_id < VM_MAXCPU, 232270071Sgrehan ("vcpu_init: invalid vcpu %d", vcpu_id)); 233270071Sgrehan 234221828Sgrehan vcpu = &vm->vcpu[vcpu_id]; 235221828Sgrehan 236270071Sgrehan if (create) { 237270071Sgrehan KASSERT(!vcpu_lock_initialized(vcpu), ("vcpu %d already " 238270071Sgrehan "initialized", vcpu_id)); 239270071Sgrehan vcpu_lock_init(vcpu); 240270071Sgrehan vcpu->state = VCPU_IDLE; 241270071Sgrehan vcpu->hostcpu = NOCPU; 242270071Sgrehan vcpu->guestfpu = fpu_save_area_alloc(); 243270071Sgrehan vcpu->stats = vmm_stat_alloc(); 244270071Sgrehan } 245270071Sgrehan 246266339Sjhb vcpu->vlapic = VLAPIC_INIT(vm->cookie, vcpu_id); 247267447Sjhb vm_set_x2apic_state(vm, vcpu_id, X2APIC_DISABLED); 248270159Sgrehan vcpu->exitintinfo = 0; 249270071Sgrehan vcpu->nmi_pending = 0; 250270071Sgrehan vcpu->extint_pending = 0; 251270071Sgrehan vcpu->exception_pending = 0; 252267427Sjhb vcpu->guest_xcr0 = XFEATURE_ENABLED_X87; 253234695Sgrehan fpu_save_area_reset(vcpu->guestfpu); 254270071Sgrehan vmm_stat_init(vcpu->stats); 255221828Sgrehan} 256221828Sgrehan 257276403Sneelint 258276403Sneelvcpu_trace_exceptions(struct vm *vm, int vcpuid) 259276403Sneel{ 260276403Sneel 261276403Sneel return (trace_guest_exceptions); 262276403Sneel} 263276403Sneel 264240894Sneelstruct vm_exit * 265240894Sneelvm_exitinfo(struct vm *vm, int cpuid) 266240894Sneel{ 267240894Sneel struct vcpu *vcpu; 268240894Sneel 269240894Sneel if (cpuid < 0 || cpuid >= VM_MAXCPU) 270240894Sneel panic("vm_exitinfo: invalid cpuid %d", cpuid); 271240894Sneel 272240894Sneel vcpu = &vm->vcpu[cpuid]; 273240894Sneel 274240894Sneel return (&vcpu->exitinfo); 275240894Sneel} 276240894Sneel 277261275Sjhbstatic void 278261275Sjhbvmm_resume(void) 279261275Sjhb{ 280261275Sjhb VMM_RESUME(); 281261275Sjhb} 282261275Sjhb 283221828Sgrehanstatic int 284221828Sgrehanvmm_init(void) 285221828Sgrehan{ 286221828Sgrehan int error; 287221828Sgrehan 288242275Sneel vmm_host_state_init(); 
289221828Sgrehan 290266339Sjhb vmm_ipinum = vmm_ipi_alloc(); 291266339Sjhb if (vmm_ipinum == 0) 292266339Sjhb vmm_ipinum = IPI_AST; 293266339Sjhb 294221828Sgrehan error = vmm_mem_init(); 295221828Sgrehan if (error) 296221828Sgrehan return (error); 297221828Sgrehan 298221828Sgrehan if (vmm_is_intel()) 299221828Sgrehan ops = &vmm_ops_intel; 300221828Sgrehan else if (vmm_is_amd()) 301221828Sgrehan ops = &vmm_ops_amd; 302221828Sgrehan else 303221828Sgrehan return (ENXIO); 304221828Sgrehan 305261275Sjhb vmm_resume_p = vmm_resume; 306221828Sgrehan 307266339Sjhb return (VMM_INIT(vmm_ipinum)); 308221828Sgrehan} 309221828Sgrehan 310221828Sgrehanstatic int 311221828Sgrehanvmm_handler(module_t mod, int what, void *arg) 312221828Sgrehan{ 313221828Sgrehan int error; 314221828Sgrehan 315221828Sgrehan switch (what) { 316221828Sgrehan case MOD_LOAD: 317221828Sgrehan vmmdev_init(); 318267070Sjhb if (ppt_avail_devices() > 0) 319267070Sjhb iommu_init(); 320221828Sgrehan error = vmm_init(); 321249396Sneel if (error == 0) 322249396Sneel vmm_initialized = 1; 323221828Sgrehan break; 324221828Sgrehan case MOD_UNLOAD: 325241454Sneel error = vmmdev_cleanup(); 326241454Sneel if (error == 0) { 327261275Sjhb vmm_resume_p = NULL; 328241454Sneel iommu_cleanup(); 329266339Sjhb if (vmm_ipinum != IPI_AST) 330266339Sjhb vmm_ipi_free(vmm_ipinum); 331241454Sneel error = VMM_CLEANUP(); 332253854Sgrehan /* 333253854Sgrehan * Something bad happened - prevent new 334253854Sgrehan * VMs from being created 335253854Sgrehan */ 336253854Sgrehan if (error) 337253854Sgrehan vmm_initialized = 0; 338241454Sneel } 339221828Sgrehan break; 340221828Sgrehan default: 341221828Sgrehan error = 0; 342221828Sgrehan break; 343221828Sgrehan } 344221828Sgrehan return (error); 345221828Sgrehan} 346221828Sgrehan 347221828Sgrehanstatic moduledata_t vmm_kmod = { 348221828Sgrehan "vmm", 349221828Sgrehan vmm_handler, 350221828Sgrehan NULL 351221828Sgrehan}; 352221828Sgrehan 353221828Sgrehan/* 354245704Sneel * vmm initialization 
has the following dependencies: 355245704Sneel * 356245704Sneel * - iommu initialization must happen after the pci passthru driver has had 357245704Sneel * a chance to attach to any passthru devices (after SI_SUB_CONFIGURE). 358245704Sneel * 359245704Sneel * - VT-x initialization requires smp_rendezvous() and therefore must happen 360245704Sneel * after SMP is fully functional (after SI_SUB_SMP). 361221828Sgrehan */ 362245704SneelDECLARE_MODULE(vmm, vmm_kmod, SI_SUB_SMP + 1, SI_ORDER_ANY); 363221828SgrehanMODULE_VERSION(vmm, 1); 364221828Sgrehan 365270071Sgrehanstatic void 366270071Sgrehanvm_init(struct vm *vm, bool create) 367270071Sgrehan{ 368270071Sgrehan int i; 369270071Sgrehan 370270071Sgrehan vm->cookie = VMINIT(vm, vmspace_pmap(vm->vmspace)); 371270071Sgrehan vm->iommu = NULL; 372270071Sgrehan vm->vioapic = vioapic_init(vm); 373270071Sgrehan vm->vhpet = vhpet_init(vm); 374270071Sgrehan vm->vatpic = vatpic_init(vm); 375270071Sgrehan vm->vatpit = vatpit_init(vm); 376270071Sgrehan 377270071Sgrehan CPU_ZERO(&vm->active_cpus); 378270071Sgrehan 379270071Sgrehan vm->suspend = 0; 380270071Sgrehan CPU_ZERO(&vm->suspended_cpus); 381270071Sgrehan 382270071Sgrehan for (i = 0; i < VM_MAXCPU; i++) 383270071Sgrehan vcpu_init(vm, i, create); 384270071Sgrehan} 385270071Sgrehan 386249396Sneelint 387249396Sneelvm_create(const char *name, struct vm **retvm) 388221828Sgrehan{ 389221828Sgrehan struct vm *vm; 390256072Sneel struct vmspace *vmspace; 391221828Sgrehan 392249396Sneel /* 393249396Sneel * If vmm.ko could not be successfully initialized then don't attempt 394249396Sneel * to create the virtual machine. 
395249396Sneel */ 396249396Sneel if (!vmm_initialized) 397249396Sneel return (ENXIO); 398249396Sneel 399221828Sgrehan if (name == NULL || strlen(name) >= VM_MAX_NAMELEN) 400249396Sneel return (EINVAL); 401221828Sgrehan 402256072Sneel vmspace = VMSPACE_ALLOC(VM_MIN_ADDRESS, VM_MAXUSER_ADDRESS); 403256072Sneel if (vmspace == NULL) 404256072Sneel return (ENOMEM); 405256072Sneel 406221828Sgrehan vm = malloc(sizeof(struct vm), M_VM, M_WAITOK | M_ZERO); 407221828Sgrehan strcpy(vm->name, name); 408270071Sgrehan vm->num_mem_segs = 0; 409266339Sjhb vm->vmspace = vmspace; 410266339Sjhb mtx_init(&vm->rendezvous_mtx, "vm rendezvous lock", 0, MTX_DEF); 411221828Sgrehan 412270071Sgrehan vm_init(vm, true); 413221828Sgrehan 414249396Sneel *retvm = vm; 415249396Sneel return (0); 416221828Sgrehan} 417221828Sgrehan 418241178Sneelstatic void 419256072Sneelvm_free_mem_seg(struct vm *vm, struct mem_seg *seg) 420241178Sneel{ 421241178Sneel 422256072Sneel if (seg->object != NULL) 423256072Sneel vmm_mem_free(vm->vmspace, seg->gpa, seg->len); 424241362Sneel 425256072Sneel bzero(seg, sizeof(*seg)); 426241178Sneel} 427241178Sneel 428270071Sgrehanstatic void 429270071Sgrehanvm_cleanup(struct vm *vm, bool destroy) 430221828Sgrehan{ 431221828Sgrehan int i; 432221828Sgrehan 433221828Sgrehan ppt_unassign_all(vm); 434221828Sgrehan 435256072Sneel if (vm->iommu != NULL) 436256072Sneel iommu_destroy_domain(vm->iommu); 437256072Sneel 438268891Sjhb vatpit_cleanup(vm->vatpit); 439261088Sjhb vhpet_cleanup(vm->vhpet); 440268891Sjhb vatpic_cleanup(vm->vatpic); 441261088Sjhb vioapic_cleanup(vm->vioapic); 442261088Sjhb 443270071Sgrehan for (i = 0; i < VM_MAXCPU; i++) 444270071Sgrehan vcpu_cleanup(vm, i, destroy); 445221828Sgrehan 446270071Sgrehan VMCLEANUP(vm->cookie); 447241178Sneel 448270071Sgrehan if (destroy) { 449270071Sgrehan for (i = 0; i < vm->num_mem_segs; i++) 450270071Sgrehan vm_free_mem_seg(vm, &vm->mem_segs[i]); 451221828Sgrehan 452270071Sgrehan vm->num_mem_segs = 0; 453221828Sgrehan 
454270071Sgrehan VMSPACE_FREE(vm->vmspace); 455270071Sgrehan vm->vmspace = NULL; 456270071Sgrehan } 457270071Sgrehan} 458221828Sgrehan 459270071Sgrehanvoid 460270071Sgrehanvm_destroy(struct vm *vm) 461270071Sgrehan{ 462270071Sgrehan vm_cleanup(vm, true); 463221828Sgrehan free(vm, M_VM); 464221828Sgrehan} 465221828Sgrehan 466270071Sgrehanint 467270071Sgrehanvm_reinit(struct vm *vm) 468270071Sgrehan{ 469270071Sgrehan int error; 470270071Sgrehan 471270071Sgrehan /* 472270071Sgrehan * A virtual machine can be reset only if all vcpus are suspended. 473270071Sgrehan */ 474270071Sgrehan if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) { 475270071Sgrehan vm_cleanup(vm, false); 476270071Sgrehan vm_init(vm, false); 477270071Sgrehan error = 0; 478270071Sgrehan } else { 479270071Sgrehan error = EBUSY; 480270071Sgrehan } 481270071Sgrehan 482270071Sgrehan return (error); 483270071Sgrehan} 484270071Sgrehan 485221828Sgrehanconst char * 486221828Sgrehanvm_name(struct vm *vm) 487221828Sgrehan{ 488221828Sgrehan return (vm->name); 489221828Sgrehan} 490221828Sgrehan 491221828Sgrehanint 492221828Sgrehanvm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa) 493221828Sgrehan{ 494256072Sneel vm_object_t obj; 495221828Sgrehan 496256072Sneel if ((obj = vmm_mmio_alloc(vm->vmspace, gpa, len, hpa)) == NULL) 497256072Sneel return (ENOMEM); 498256072Sneel else 499256072Sneel return (0); 500221828Sgrehan} 501221828Sgrehan 502221828Sgrehanint 503221828Sgrehanvm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len) 504221828Sgrehan{ 505221828Sgrehan 506256072Sneel vmm_mmio_free(vm->vmspace, gpa, len); 507256072Sneel return (0); 508221828Sgrehan} 509221828Sgrehan 510256072Sneelboolean_t 511256072Sneelvm_mem_allocated(struct vm *vm, vm_paddr_t gpa) 512241041Sneel{ 513241041Sneel int i; 514241041Sneel vm_paddr_t gpabase, gpalimit; 515241041Sneel 516241041Sneel for (i = 0; i < vm->num_mem_segs; i++) { 517241041Sneel gpabase = vm->mem_segs[i].gpa; 518241041Sneel gpalimit = gpabase 
+ vm->mem_segs[i].len; 519241041Sneel if (gpa >= gpabase && gpa < gpalimit) 520256072Sneel return (TRUE); /* 'gpa' is regular memory */ 521241041Sneel } 522241041Sneel 523256072Sneel if (ppt_is_mmio(vm, gpa)) 524256072Sneel return (TRUE); /* 'gpa' is pci passthru mmio */ 525256072Sneel 526256072Sneel return (FALSE); 527241041Sneel} 528241041Sneel 529221828Sgrehanint 530241041Sneelvm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len) 531221828Sgrehan{ 532256072Sneel int available, allocated; 533256072Sneel struct mem_seg *seg; 534256072Sneel vm_object_t object; 535256072Sneel vm_paddr_t g; 536221828Sgrehan 537241041Sneel if ((gpa & PAGE_MASK) || (len & PAGE_MASK) || len == 0) 538241041Sneel return (EINVAL); 539221828Sgrehan 540241041Sneel available = allocated = 0; 541241041Sneel g = gpa; 542241041Sneel while (g < gpa + len) { 543256072Sneel if (vm_mem_allocated(vm, g)) 544256072Sneel allocated++; 545256072Sneel else 546241041Sneel available++; 547241041Sneel 548241041Sneel g += PAGE_SIZE; 549241041Sneel } 550241041Sneel 551221828Sgrehan /* 552241041Sneel * If there are some allocated and some available pages in the address 553241041Sneel * range then it is an error. 554221828Sgrehan */ 555241041Sneel if (allocated && available) 556241041Sneel return (EINVAL); 557221828Sgrehan 558241041Sneel /* 559241041Sneel * If the entire address range being requested has already been 560241041Sneel * allocated then there isn't anything more to do. 
561241041Sneel */ 562241041Sneel if (allocated && available == 0) 563241041Sneel return (0); 564241041Sneel 565221828Sgrehan if (vm->num_mem_segs >= VM_MAX_MEMORY_SEGMENTS) 566221828Sgrehan return (E2BIG); 567221828Sgrehan 568241178Sneel seg = &vm->mem_segs[vm->num_mem_segs]; 569221828Sgrehan 570256072Sneel if ((object = vmm_mem_alloc(vm->vmspace, gpa, len)) == NULL) 571256072Sneel return (ENOMEM); 572256072Sneel 573241178Sneel seg->gpa = gpa; 574256072Sneel seg->len = len; 575256072Sneel seg->object = object; 576256072Sneel seg->wired = FALSE; 577241178Sneel 578256072Sneel vm->num_mem_segs++; 579256072Sneel 580256072Sneel return (0); 581256072Sneel} 582256072Sneel 583270159Sgrehanstatic vm_paddr_t 584270159Sgrehanvm_maxmem(struct vm *vm) 585270159Sgrehan{ 586270159Sgrehan int i; 587270159Sgrehan vm_paddr_t gpa, maxmem; 588270159Sgrehan 589270159Sgrehan maxmem = 0; 590270159Sgrehan for (i = 0; i < vm->num_mem_segs; i++) { 591270159Sgrehan gpa = vm->mem_segs[i].gpa + vm->mem_segs[i].len; 592270159Sgrehan if (gpa > maxmem) 593270159Sgrehan maxmem = gpa; 594270159Sgrehan } 595270159Sgrehan return (maxmem); 596270159Sgrehan} 597270159Sgrehan 598256072Sneelstatic void 599256072Sneelvm_gpa_unwire(struct vm *vm) 600256072Sneel{ 601256072Sneel int i, rv; 602256072Sneel struct mem_seg *seg; 603256072Sneel 604256072Sneel for (i = 0; i < vm->num_mem_segs; i++) { 605256072Sneel seg = &vm->mem_segs[i]; 606256072Sneel if (!seg->wired) 607256072Sneel continue; 608256072Sneel 609256072Sneel rv = vm_map_unwire(&vm->vmspace->vm_map, 610256072Sneel seg->gpa, seg->gpa + seg->len, 611256072Sneel VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES); 612256072Sneel KASSERT(rv == KERN_SUCCESS, ("vm(%s) memory segment " 613256072Sneel "%#lx/%ld could not be unwired: %d", 614256072Sneel vm_name(vm), seg->gpa, seg->len, rv)); 615256072Sneel 616256072Sneel seg->wired = FALSE; 617256072Sneel } 618256072Sneel} 619256072Sneel 620256072Sneelstatic int 621256072Sneelvm_gpa_wire(struct vm *vm) 622256072Sneel{ 
623256072Sneel int i, rv; 624256072Sneel struct mem_seg *seg; 625256072Sneel 626256072Sneel for (i = 0; i < vm->num_mem_segs; i++) { 627256072Sneel seg = &vm->mem_segs[i]; 628256072Sneel if (seg->wired) 629256072Sneel continue; 630256072Sneel 631256072Sneel /* XXX rlimits? */ 632256072Sneel rv = vm_map_wire(&vm->vmspace->vm_map, 633256072Sneel seg->gpa, seg->gpa + seg->len, 634256072Sneel VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES); 635256072Sneel if (rv != KERN_SUCCESS) 636241178Sneel break; 637241178Sneel 638256072Sneel seg->wired = TRUE; 639256072Sneel } 640256072Sneel 641256072Sneel if (i < vm->num_mem_segs) { 642241362Sneel /* 643256072Sneel * Undo the wiring before returning an error. 644241362Sneel */ 645256072Sneel vm_gpa_unwire(vm); 646256072Sneel return (EAGAIN); 647256072Sneel } 648241178Sneel 649256072Sneel return (0); 650256072Sneel} 651256072Sneel 652256072Sneelstatic void 653256072Sneelvm_iommu_modify(struct vm *vm, boolean_t map) 654256072Sneel{ 655256072Sneel int i, sz; 656256072Sneel vm_paddr_t gpa, hpa; 657256072Sneel struct mem_seg *seg; 658256072Sneel void *vp, *cookie, *host_domain; 659256072Sneel 660256072Sneel sz = PAGE_SIZE; 661256072Sneel host_domain = iommu_host_domain(); 662256072Sneel 663256072Sneel for (i = 0; i < vm->num_mem_segs; i++) { 664256072Sneel seg = &vm->mem_segs[i]; 665256072Sneel KASSERT(seg->wired, ("vm(%s) memory segment %#lx/%ld not wired", 666256072Sneel vm_name(vm), seg->gpa, seg->len)); 667256072Sneel 668256072Sneel gpa = seg->gpa; 669256072Sneel while (gpa < seg->gpa + seg->len) { 670256072Sneel vp = vm_gpa_hold(vm, gpa, PAGE_SIZE, VM_PROT_WRITE, 671256072Sneel &cookie); 672256072Sneel KASSERT(vp != NULL, ("vm(%s) could not map gpa %#lx", 673256072Sneel vm_name(vm), gpa)); 674256072Sneel 675256072Sneel vm_gpa_release(cookie); 676256072Sneel 677256072Sneel hpa = DMAP_TO_PHYS((uintptr_t)vp); 678256072Sneel if (map) { 679256072Sneel iommu_create_mapping(vm->iommu, gpa, hpa, sz); 680256072Sneel 
iommu_remove_mapping(host_domain, hpa, sz); 681256072Sneel } else { 682256072Sneel iommu_remove_mapping(vm->iommu, gpa, sz); 683256072Sneel iommu_create_mapping(host_domain, hpa, hpa, sz); 684256072Sneel } 685256072Sneel 686256072Sneel gpa += PAGE_SIZE; 687256072Sneel } 688241178Sneel } 689241178Sneel 690256072Sneel /* 691256072Sneel * Invalidate the cached translations associated with the domain 692256072Sneel * from which pages were removed. 693256072Sneel */ 694256072Sneel if (map) 695256072Sneel iommu_invalidate_tlb(host_domain); 696256072Sneel else 697256072Sneel iommu_invalidate_tlb(vm->iommu); 698256072Sneel} 699256072Sneel 700256072Sneel#define vm_iommu_unmap(vm) vm_iommu_modify((vm), FALSE) 701256072Sneel#define vm_iommu_map(vm) vm_iommu_modify((vm), TRUE) 702256072Sneel 703256072Sneelint 704256072Sneelvm_unassign_pptdev(struct vm *vm, int bus, int slot, int func) 705256072Sneel{ 706256072Sneel int error; 707256072Sneel 708256072Sneel error = ppt_unassign_device(vm, bus, slot, func); 709256072Sneel if (error) 710221828Sgrehan return (error); 711256072Sneel 712267070Sjhb if (ppt_assigned_devices(vm) == 0) { 713256072Sneel vm_iommu_unmap(vm); 714256072Sneel vm_gpa_unwire(vm); 715221828Sgrehan } 716256072Sneel return (0); 717256072Sneel} 718221828Sgrehan 719256072Sneelint 720256072Sneelvm_assign_pptdev(struct vm *vm, int bus, int slot, int func) 721256072Sneel{ 722256072Sneel int error; 723256072Sneel vm_paddr_t maxaddr; 724256072Sneel 725241362Sneel /* 726256072Sneel * Virtual machines with pci passthru devices get special treatment: 727256072Sneel * - the guest physical memory is wired 728256072Sneel * - the iommu is programmed to do the 'gpa' to 'hpa' translation 729256072Sneel * 730256072Sneel * We need to do this before the first pci passthru device is attached. 
731241362Sneel */ 732267070Sjhb if (ppt_assigned_devices(vm) == 0) { 733256072Sneel KASSERT(vm->iommu == NULL, 734256072Sneel ("vm_assign_pptdev: iommu must be NULL")); 735270159Sgrehan maxaddr = vm_maxmem(vm); 736256072Sneel vm->iommu = iommu_create_domain(maxaddr); 737241362Sneel 738256072Sneel error = vm_gpa_wire(vm); 739256072Sneel if (error) 740256072Sneel return (error); 741241041Sneel 742256072Sneel vm_iommu_map(vm); 743256072Sneel } 744256072Sneel 745256072Sneel error = ppt_assign_device(vm, bus, slot, func); 746256072Sneel return (error); 747221828Sgrehan} 748221828Sgrehan 749256072Sneelvoid * 750256072Sneelvm_gpa_hold(struct vm *vm, vm_paddr_t gpa, size_t len, int reqprot, 751256072Sneel void **cookie) 752221828Sgrehan{ 753256072Sneel int count, pageoff; 754256072Sneel vm_page_t m; 755221828Sgrehan 756256072Sneel pageoff = gpa & PAGE_MASK; 757256072Sneel if (len > PAGE_SIZE - pageoff) 758256072Sneel panic("vm_gpa_hold: invalid gpa/len: 0x%016lx/%lu", gpa, len); 759241148Sneel 760256072Sneel count = vm_fault_quick_hold_pages(&vm->vmspace->vm_map, 761256072Sneel trunc_page(gpa), PAGE_SIZE, reqprot, &m, 1); 762256072Sneel 763256072Sneel if (count == 1) { 764256072Sneel *cookie = m; 765256072Sneel return ((void *)(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)) + pageoff)); 766256072Sneel } else { 767256072Sneel *cookie = NULL; 768256072Sneel return (NULL); 769256072Sneel } 770221828Sgrehan} 771221828Sgrehan 772256072Sneelvoid 773256072Sneelvm_gpa_release(void *cookie) 774256072Sneel{ 775256072Sneel vm_page_t m = cookie; 776256072Sneel 777256072Sneel vm_page_lock(m); 778256072Sneel vm_page_unhold(m); 779256072Sneel vm_page_unlock(m); 780256072Sneel} 781256072Sneel 782221828Sgrehanint 783221828Sgrehanvm_gpabase2memseg(struct vm *vm, vm_paddr_t gpabase, 784221828Sgrehan struct vm_memory_segment *seg) 785221828Sgrehan{ 786221828Sgrehan int i; 787221828Sgrehan 788221828Sgrehan for (i = 0; i < vm->num_mem_segs; i++) { 789221828Sgrehan if (gpabase == vm->mem_segs[i].gpa) { 
790256072Sneel seg->gpa = vm->mem_segs[i].gpa; 791256072Sneel seg->len = vm->mem_segs[i].len; 792256072Sneel seg->wired = vm->mem_segs[i].wired; 793221828Sgrehan return (0); 794221828Sgrehan } 795221828Sgrehan } 796221828Sgrehan return (-1); 797221828Sgrehan} 798221828Sgrehan 799221828Sgrehanint 800256072Sneelvm_get_memobj(struct vm *vm, vm_paddr_t gpa, size_t len, 801256072Sneel vm_offset_t *offset, struct vm_object **object) 802256072Sneel{ 803256072Sneel int i; 804256072Sneel size_t seg_len; 805256072Sneel vm_paddr_t seg_gpa; 806256072Sneel vm_object_t seg_obj; 807256072Sneel 808256072Sneel for (i = 0; i < vm->num_mem_segs; i++) { 809256072Sneel if ((seg_obj = vm->mem_segs[i].object) == NULL) 810256072Sneel continue; 811256072Sneel 812256072Sneel seg_gpa = vm->mem_segs[i].gpa; 813256072Sneel seg_len = vm->mem_segs[i].len; 814256072Sneel 815256072Sneel if (gpa >= seg_gpa && gpa < seg_gpa + seg_len) { 816256072Sneel *offset = gpa - seg_gpa; 817256072Sneel *object = seg_obj; 818256072Sneel vm_object_reference(seg_obj); 819256072Sneel return (0); 820256072Sneel } 821256072Sneel } 822256072Sneel 823256072Sneel return (EINVAL); 824256072Sneel} 825256072Sneel 826256072Sneelint 827221828Sgrehanvm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval) 828221828Sgrehan{ 829221828Sgrehan 830221828Sgrehan if (vcpu < 0 || vcpu >= VM_MAXCPU) 831221828Sgrehan return (EINVAL); 832221828Sgrehan 833221828Sgrehan if (reg >= VM_REG_LAST) 834221828Sgrehan return (EINVAL); 835221828Sgrehan 836221828Sgrehan return (VMGETREG(vm->cookie, vcpu, reg, retval)); 837221828Sgrehan} 838221828Sgrehan 839221828Sgrehanint 840221828Sgrehanvm_set_register(struct vm *vm, int vcpu, int reg, uint64_t val) 841221828Sgrehan{ 842221828Sgrehan 843221828Sgrehan if (vcpu < 0 || vcpu >= VM_MAXCPU) 844221828Sgrehan return (EINVAL); 845221828Sgrehan 846221828Sgrehan if (reg >= VM_REG_LAST) 847221828Sgrehan return (EINVAL); 848221828Sgrehan 849221828Sgrehan return (VMSETREG(vm->cookie, vcpu, reg, 
val)); 850221828Sgrehan} 851221828Sgrehan 852221828Sgrehanstatic boolean_t 853221828Sgrehanis_descriptor_table(int reg) 854221828Sgrehan{ 855221828Sgrehan 856221828Sgrehan switch (reg) { 857221828Sgrehan case VM_REG_GUEST_IDTR: 858221828Sgrehan case VM_REG_GUEST_GDTR: 859221828Sgrehan return (TRUE); 860221828Sgrehan default: 861221828Sgrehan return (FALSE); 862221828Sgrehan } 863221828Sgrehan} 864221828Sgrehan 865221828Sgrehanstatic boolean_t 866221828Sgrehanis_segment_register(int reg) 867221828Sgrehan{ 868221828Sgrehan 869221828Sgrehan switch (reg) { 870221828Sgrehan case VM_REG_GUEST_ES: 871221828Sgrehan case VM_REG_GUEST_CS: 872221828Sgrehan case VM_REG_GUEST_SS: 873221828Sgrehan case VM_REG_GUEST_DS: 874221828Sgrehan case VM_REG_GUEST_FS: 875221828Sgrehan case VM_REG_GUEST_GS: 876221828Sgrehan case VM_REG_GUEST_TR: 877221828Sgrehan case VM_REG_GUEST_LDTR: 878221828Sgrehan return (TRUE); 879221828Sgrehan default: 880221828Sgrehan return (FALSE); 881221828Sgrehan } 882221828Sgrehan} 883221828Sgrehan 884221828Sgrehanint 885221828Sgrehanvm_get_seg_desc(struct vm *vm, int vcpu, int reg, 886221828Sgrehan struct seg_desc *desc) 887221828Sgrehan{ 888221828Sgrehan 889221828Sgrehan if (vcpu < 0 || vcpu >= VM_MAXCPU) 890221828Sgrehan return (EINVAL); 891221828Sgrehan 892221828Sgrehan if (!is_segment_register(reg) && !is_descriptor_table(reg)) 893221828Sgrehan return (EINVAL); 894221828Sgrehan 895221828Sgrehan return (VMGETDESC(vm->cookie, vcpu, reg, desc)); 896221828Sgrehan} 897221828Sgrehan 898221828Sgrehanint 899221828Sgrehanvm_set_seg_desc(struct vm *vm, int vcpu, int reg, 900221828Sgrehan struct seg_desc *desc) 901221828Sgrehan{ 902221828Sgrehan if (vcpu < 0 || vcpu >= VM_MAXCPU) 903221828Sgrehan return (EINVAL); 904221828Sgrehan 905221828Sgrehan if (!is_segment_register(reg) && !is_descriptor_table(reg)) 906221828Sgrehan return (EINVAL); 907221828Sgrehan 908221828Sgrehan return (VMSETDESC(vm->cookie, vcpu, reg, desc)); 909221828Sgrehan} 910221828Sgrehan 
911221828Sgrehanstatic void 912221828Sgrehanrestore_guest_fpustate(struct vcpu *vcpu) 913221828Sgrehan{ 914221828Sgrehan 915234695Sgrehan /* flush host state to the pcb */ 916234695Sgrehan fpuexit(curthread); 917242122Sneel 918242122Sneel /* restore guest FPU state */ 919221828Sgrehan fpu_stop_emulating(); 920234695Sgrehan fpurestore(vcpu->guestfpu); 921242122Sneel 922267427Sjhb /* restore guest XCR0 if XSAVE is enabled in the host */ 923267427Sjhb if (rcr4() & CR4_XSAVE) 924267427Sjhb load_xcr(0, vcpu->guest_xcr0); 925267427Sjhb 926242122Sneel /* 927242122Sneel * The FPU is now "dirty" with the guest's state so turn on emulation 928242122Sneel * to trap any access to the FPU by the host. 929242122Sneel */ 930242122Sneel fpu_start_emulating(); 931221828Sgrehan} 932221828Sgrehan 933221828Sgrehanstatic void 934221828Sgrehansave_guest_fpustate(struct vcpu *vcpu) 935221828Sgrehan{ 936221828Sgrehan 937242122Sneel if ((rcr0() & CR0_TS) == 0) 938242122Sneel panic("fpu emulation not enabled in host!"); 939242122Sneel 940267427Sjhb /* save guest XCR0 and restore host XCR0 */ 941267427Sjhb if (rcr4() & CR4_XSAVE) { 942267427Sjhb vcpu->guest_xcr0 = rxcr(0); 943267427Sjhb load_xcr(0, vmm_get_host_xcr0()); 944267427Sjhb } 945267427Sjhb 946242122Sneel /* save guest FPU state */ 947242122Sneel fpu_stop_emulating(); 948234695Sgrehan fpusave(vcpu->guestfpu); 949221828Sgrehan fpu_start_emulating(); 950221828Sgrehan} 951221828Sgrehan 952248389Sneelstatic VMM_STAT(VCPU_IDLE_TICKS, "number of ticks vcpu was idle"); 953242065Sneel 954256072Sneelstatic int 955266393Sjhbvcpu_set_state_locked(struct vcpu *vcpu, enum vcpu_state newstate, 956266393Sjhb bool from_idle) 957256072Sneel{ 958256072Sneel int error; 959256072Sneel 960256072Sneel vcpu_assert_locked(vcpu); 961256072Sneel 962256072Sneel /* 963266393Sjhb * State transitions from the vmmdev_ioctl() must always begin from 964266393Sjhb * the VCPU_IDLE state. 
This guarantees that there is only a single 965266393Sjhb * ioctl() operating on a vcpu at any point. 966266393Sjhb */ 967266393Sjhb if (from_idle) { 968266393Sjhb while (vcpu->state != VCPU_IDLE) 969266393Sjhb msleep_spin(&vcpu->state, &vcpu->mtx, "vmstat", hz); 970266393Sjhb } else { 971266393Sjhb KASSERT(vcpu->state != VCPU_IDLE, ("invalid transition from " 972266393Sjhb "vcpu idle state")); 973266393Sjhb } 974266393Sjhb 975266393Sjhb if (vcpu->state == VCPU_RUNNING) { 976266393Sjhb KASSERT(vcpu->hostcpu == curcpu, ("curcpu %d and hostcpu %d " 977266393Sjhb "mismatch for running vcpu", curcpu, vcpu->hostcpu)); 978266393Sjhb } else { 979266393Sjhb KASSERT(vcpu->hostcpu == NOCPU, ("Invalid hostcpu %d for a " 980266393Sjhb "vcpu that is not running", vcpu->hostcpu)); 981266393Sjhb } 982266393Sjhb 983266393Sjhb /* 984256072Sneel * The following state transitions are allowed: 985256072Sneel * IDLE -> FROZEN -> IDLE 986256072Sneel * FROZEN -> RUNNING -> FROZEN 987256072Sneel * FROZEN -> SLEEPING -> FROZEN 988256072Sneel */ 989256072Sneel switch (vcpu->state) { 990256072Sneel case VCPU_IDLE: 991256072Sneel case VCPU_RUNNING: 992256072Sneel case VCPU_SLEEPING: 993256072Sneel error = (newstate != VCPU_FROZEN); 994256072Sneel break; 995256072Sneel case VCPU_FROZEN: 996256072Sneel error = (newstate == VCPU_FROZEN); 997256072Sneel break; 998256072Sneel default: 999256072Sneel error = 1; 1000256072Sneel break; 1001256072Sneel } 1002256072Sneel 1003266393Sjhb if (error) 1004266393Sjhb return (EBUSY); 1005266393Sjhb 1006266393Sjhb vcpu->state = newstate; 1007266393Sjhb if (newstate == VCPU_RUNNING) 1008266393Sjhb vcpu->hostcpu = curcpu; 1009256072Sneel else 1010266393Sjhb vcpu->hostcpu = NOCPU; 1011256072Sneel 1012266393Sjhb if (newstate == VCPU_IDLE) 1013266393Sjhb wakeup(&vcpu->state); 1014266393Sjhb 1015266393Sjhb return (0); 1016256072Sneel} 1017256072Sneel 1018256072Sneelstatic void 1019256072Sneelvcpu_require_state(struct vm *vm, int vcpuid, enum vcpu_state newstate) 
1020256072Sneel{ 1021256072Sneel int error; 1022256072Sneel 1023266393Sjhb if ((error = vcpu_set_state(vm, vcpuid, newstate, false)) != 0) 1024256072Sneel panic("Error %d setting state to %d\n", error, newstate); 1025256072Sneel} 1026256072Sneel 1027256072Sneelstatic void 1028256072Sneelvcpu_require_state_locked(struct vcpu *vcpu, enum vcpu_state newstate) 1029256072Sneel{ 1030256072Sneel int error; 1031256072Sneel 1032266393Sjhb if ((error = vcpu_set_state_locked(vcpu, newstate, false)) != 0) 1033256072Sneel panic("Error %d setting state to %d", error, newstate); 1034256072Sneel} 1035256072Sneel 1036266339Sjhbstatic void 1037266339Sjhbvm_set_rendezvous_func(struct vm *vm, vm_rendezvous_func_t func) 1038266339Sjhb{ 1039266339Sjhb 1040266339Sjhb KASSERT(mtx_owned(&vm->rendezvous_mtx), ("rendezvous_mtx not locked")); 1041266339Sjhb 1042266339Sjhb /* 1043266339Sjhb * Update 'rendezvous_func' and execute a write memory barrier to 1044266339Sjhb * ensure that it is visible across all host cpus. This is not needed 1045266339Sjhb * for correctness but it does ensure that all the vcpus will notice 1046266339Sjhb * that the rendezvous is requested immediately. 
1047266339Sjhb */ 1048266339Sjhb vm->rendezvous_func = func; 1049266339Sjhb wmb(); 1050266339Sjhb} 1051266339Sjhb 1052266339Sjhb#define RENDEZVOUS_CTR0(vm, vcpuid, fmt) \ 1053266339Sjhb do { \ 1054266339Sjhb if (vcpuid >= 0) \ 1055266339Sjhb VCPU_CTR0(vm, vcpuid, fmt); \ 1056266339Sjhb else \ 1057266339Sjhb VM_CTR0(vm, fmt); \ 1058266339Sjhb } while (0) 1059266339Sjhb 1060266339Sjhbstatic void 1061266339Sjhbvm_handle_rendezvous(struct vm *vm, int vcpuid) 1062266339Sjhb{ 1063266339Sjhb 1064266339Sjhb KASSERT(vcpuid == -1 || (vcpuid >= 0 && vcpuid < VM_MAXCPU), 1065266339Sjhb ("vm_handle_rendezvous: invalid vcpuid %d", vcpuid)); 1066266339Sjhb 1067266339Sjhb mtx_lock(&vm->rendezvous_mtx); 1068266339Sjhb while (vm->rendezvous_func != NULL) { 1069266339Sjhb /* 'rendezvous_req_cpus' must be a subset of 'active_cpus' */ 1070266339Sjhb CPU_AND(&vm->rendezvous_req_cpus, &vm->active_cpus); 1071266339Sjhb 1072266339Sjhb if (vcpuid != -1 && 1073266339Sjhb CPU_ISSET(vcpuid, &vm->rendezvous_req_cpus) && 1074266339Sjhb !CPU_ISSET(vcpuid, &vm->rendezvous_done_cpus)) { 1075266339Sjhb VCPU_CTR0(vm, vcpuid, "Calling rendezvous func"); 1076266339Sjhb (*vm->rendezvous_func)(vm, vcpuid, vm->rendezvous_arg); 1077266339Sjhb CPU_SET(vcpuid, &vm->rendezvous_done_cpus); 1078266339Sjhb } 1079266339Sjhb if (CPU_CMP(&vm->rendezvous_req_cpus, 1080266339Sjhb &vm->rendezvous_done_cpus) == 0) { 1081266339Sjhb VCPU_CTR0(vm, vcpuid, "Rendezvous completed"); 1082266339Sjhb vm_set_rendezvous_func(vm, NULL); 1083266339Sjhb wakeup(&vm->rendezvous_func); 1084266339Sjhb break; 1085266339Sjhb } 1086266339Sjhb RENDEZVOUS_CTR0(vm, vcpuid, "Wait for rendezvous completion"); 1087266339Sjhb mtx_sleep(&vm->rendezvous_func, &vm->rendezvous_mtx, 0, 1088266339Sjhb "vmrndv", 0); 1089266339Sjhb } 1090266339Sjhb mtx_unlock(&vm->rendezvous_mtx); 1091266339Sjhb} 1092266339Sjhb 1093256072Sneel/* 1094256072Sneel * Emulate a guest 'hlt' by sleeping until the vcpu is ready to run. 
1095256072Sneel */ 1096256072Sneelstatic int 1097262350Sjhbvm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled, bool *retu) 1098256072Sneel{ 1099256072Sneel struct vcpu *vcpu; 1100268935Sjhb const char *wmesg; 1101276349Sneel int error, t, vcpu_halted, vm_halted; 1102256072Sneel 1103268935Sjhb KASSERT(!CPU_ISSET(vcpuid, &vm->halted_cpus), ("vcpu already halted")); 1104268935Sjhb 1105256072Sneel vcpu = &vm->vcpu[vcpuid]; 1106268935Sjhb vcpu_halted = 0; 1107268935Sjhb vm_halted = 0; 1108256072Sneel 1109276349Sneel /* 1110276349Sneel * The typical way to halt a cpu is to execute: "sti; hlt" 1111276349Sneel * 1112276349Sneel * STI sets RFLAGS.IF to enable interrupts. However, the processor 1113276349Sneel * remains in an "interrupt shadow" for an additional instruction 1114276349Sneel * following the STI. This guarantees that "sti; hlt" sequence is 1115276349Sneel * atomic and a pending interrupt will be recognized after the HLT. 1116276349Sneel * 1117276349Sneel * After the HLT emulation is done the vcpu is no longer in an 1118276349Sneel * interrupt shadow and a pending interrupt can be injected on 1119276349Sneel * the next entry into the guest. 1120276349Sneel */ 1121276349Sneel error = vm_set_register(vm, vcpuid, VM_REG_GUEST_INTR_SHADOW, 0); 1122276349Sneel KASSERT(error == 0, ("%s: error %d clearing interrupt shadow", 1123276349Sneel __func__, error)); 1124276349Sneel 1125256072Sneel vcpu_lock(vcpu); 1126268935Sjhb while (1) { 1127268935Sjhb /* 1128268935Sjhb * Do a final check for pending NMI or interrupts before 1129268935Sjhb * really putting this thread to sleep. Also check for 1130268935Sjhb * software events that would cause this vcpu to wakeup. 1131268935Sjhb * 1132268935Sjhb * These interrupts/events could have happened after the 1133268935Sjhb * vcpu returned from VMRUN() and before it acquired the 1134268935Sjhb * vcpu lock above. 
1135268935Sjhb */ 1136268935Sjhb if (vm->rendezvous_func != NULL || vm->suspend) 1137268935Sjhb break; 1138268935Sjhb if (vm_nmi_pending(vm, vcpuid)) 1139268935Sjhb break; 1140268935Sjhb if (!intr_disabled) { 1141268935Sjhb if (vm_extint_pending(vm, vcpuid) || 1142268935Sjhb vlapic_pending_intr(vcpu->vlapic, NULL)) { 1143268935Sjhb break; 1144268935Sjhb } 1145268935Sjhb } 1146256072Sneel 1147270159Sgrehan /* Don't go to sleep if the vcpu thread needs to yield */ 1148270159Sgrehan if (vcpu_should_yield(vm, vcpuid)) 1149270159Sgrehan break; 1150270159Sgrehan 1151268935Sjhb /* 1152268935Sjhb * Some Linux guests implement "halt" by having all vcpus 1153268935Sjhb * execute HLT with interrupts disabled. 'halted_cpus' keeps 1154268935Sjhb * track of the vcpus that have entered this state. When all 1155268935Sjhb * vcpus enter the halted state the virtual machine is halted. 1156268935Sjhb */ 1157268935Sjhb if (intr_disabled) { 1158268935Sjhb wmesg = "vmhalt"; 1159268935Sjhb VCPU_CTR0(vm, vcpuid, "Halted"); 1160268935Sjhb if (!vcpu_halted && halt_detection_enabled) { 1161268935Sjhb vcpu_halted = 1; 1162268935Sjhb CPU_SET_ATOMIC(vcpuid, &vm->halted_cpus); 1163268935Sjhb } 1164268935Sjhb if (CPU_CMP(&vm->halted_cpus, &vm->active_cpus) == 0) { 1165268935Sjhb vm_halted = 1; 1166268935Sjhb break; 1167268935Sjhb } 1168268935Sjhb } else { 1169268935Sjhb wmesg = "vmidle"; 1170268935Sjhb } 1171268935Sjhb 1172256072Sneel t = ticks; 1173256072Sneel vcpu_require_state_locked(vcpu, VCPU_SLEEPING); 1174270159Sgrehan /* 1175270159Sgrehan * XXX msleep_spin() cannot be interrupted by signals so 1176270159Sgrehan * wake up periodically to check pending signals. 
1177270159Sgrehan */ 1178270159Sgrehan msleep_spin(vcpu, &vcpu->mtx, wmesg, hz); 1179256072Sneel vcpu_require_state_locked(vcpu, VCPU_FROZEN); 1180256072Sneel vmm_stat_incr(vm, vcpuid, VCPU_IDLE_TICKS, ticks - t); 1181256072Sneel } 1182268935Sjhb 1183268935Sjhb if (vcpu_halted) 1184268935Sjhb CPU_CLR_ATOMIC(vcpuid, &vm->halted_cpus); 1185268935Sjhb 1186256072Sneel vcpu_unlock(vcpu); 1187256072Sneel 1188268935Sjhb if (vm_halted) 1189268935Sjhb vm_suspend(vm, VM_SUSPEND_HALT); 1190266339Sjhb 1191256072Sneel return (0); 1192256072Sneel} 1193256072Sneel 1194256072Sneelstatic int 1195262350Sjhbvm_handle_paging(struct vm *vm, int vcpuid, bool *retu) 1196256072Sneel{ 1197256072Sneel int rv, ftype; 1198256072Sneel struct vm_map *map; 1199256072Sneel struct vcpu *vcpu; 1200256072Sneel struct vm_exit *vme; 1201256072Sneel 1202256072Sneel vcpu = &vm->vcpu[vcpuid]; 1203256072Sneel vme = &vcpu->exitinfo; 1204256072Sneel 1205256072Sneel ftype = vme->u.paging.fault_type; 1206256072Sneel KASSERT(ftype == VM_PROT_READ || 1207256072Sneel ftype == VM_PROT_WRITE || ftype == VM_PROT_EXECUTE, 1208256072Sneel ("vm_handle_paging: invalid fault_type %d", ftype)); 1209256072Sneel 1210256072Sneel if (ftype == VM_PROT_READ || ftype == VM_PROT_WRITE) { 1211256072Sneel rv = pmap_emulate_accessed_dirty(vmspace_pmap(vm->vmspace), 1212256072Sneel vme->u.paging.gpa, ftype); 1213276349Sneel if (rv == 0) { 1214276349Sneel VCPU_CTR2(vm, vcpuid, "%s bit emulation for gpa %#lx", 1215276349Sneel ftype == VM_PROT_READ ? 
"accessed" : "dirty", 1216276349Sneel vme->u.paging.gpa); 1217256072Sneel goto done; 1218276349Sneel } 1219256072Sneel } 1220256072Sneel 1221256072Sneel map = &vm->vmspace->vm_map; 1222256072Sneel rv = vm_fault(map, vme->u.paging.gpa, ftype, VM_FAULT_NORMAL); 1223256072Sneel 1224261088Sjhb VCPU_CTR3(vm, vcpuid, "vm_handle_paging rv = %d, gpa = %#lx, " 1225261088Sjhb "ftype = %d", rv, vme->u.paging.gpa, ftype); 1226256072Sneel 1227256072Sneel if (rv != KERN_SUCCESS) 1228256072Sneel return (EFAULT); 1229256072Sneeldone: 1230256072Sneel /* restart execution at the faulting instruction */ 1231256072Sneel vme->inst_length = 0; 1232256072Sneel 1233256072Sneel return (0); 1234256072Sneel} 1235256072Sneel 1236256072Sneelstatic int 1237262350Sjhbvm_handle_inst_emul(struct vm *vm, int vcpuid, bool *retu) 1238256072Sneel{ 1239256072Sneel struct vie *vie; 1240256072Sneel struct vcpu *vcpu; 1241256072Sneel struct vm_exit *vme; 1242268976Sjhb uint64_t gla, gpa; 1243268976Sjhb struct vm_guest_paging *paging; 1244261088Sjhb mem_region_read_t mread; 1245261088Sjhb mem_region_write_t mwrite; 1246270159Sgrehan enum vm_cpu_mode cpu_mode; 1247276403Sneel int cs_d, error, length; 1248256072Sneel 1249256072Sneel vcpu = &vm->vcpu[vcpuid]; 1250256072Sneel vme = &vcpu->exitinfo; 1251256072Sneel 1252256072Sneel gla = vme->u.inst_emul.gla; 1253256072Sneel gpa = vme->u.inst_emul.gpa; 1254270159Sgrehan cs_d = vme->u.inst_emul.cs_d; 1255256072Sneel vie = &vme->u.inst_emul.vie; 1256268976Sjhb paging = &vme->u.inst_emul.paging; 1257270159Sgrehan cpu_mode = paging->cpu_mode; 1258256072Sneel 1259276349Sneel VCPU_CTR1(vm, vcpuid, "inst_emul fault accessing gpa %#lx", gpa); 1260276349Sneel 1261256072Sneel /* Fetch, decode and emulate the faulting instruction */ 1262276403Sneel if (vie->num_valid == 0) { 1263276403Sneel /* 1264276403Sneel * If the instruction length is not known then assume a 1265276403Sneel * maximum size instruction. 1266276403Sneel */ 1267276403Sneel length = vme->inst_length ? 
vme->inst_length : VIE_INST_SIZE; 1268276403Sneel error = vmm_fetch_instruction(vm, vcpuid, paging, vme->rip, 1269276403Sneel length, vie); 1270276403Sneel } else { 1271276403Sneel /* 1272276403Sneel * The instruction bytes have already been copied into 'vie' 1273276403Sneel */ 1274276403Sneel error = 0; 1275276403Sneel } 1276268976Sjhb if (error == 1) 1277268976Sjhb return (0); /* Resume guest to handle page fault */ 1278268976Sjhb else if (error == -1) 1279256072Sneel return (EFAULT); 1280268976Sjhb else if (error != 0) 1281268976Sjhb panic("%s: vmm_fetch_instruction error %d", __func__, error); 1282256072Sneel 1283270159Sgrehan if (vmm_decode_instruction(vm, vcpuid, gla, cpu_mode, cs_d, vie) != 0) 1284256072Sneel return (EFAULT); 1285256072Sneel 1286276403Sneel /* 1287276403Sneel * If the instruction length is not specified the update it now. 1288276403Sneel */ 1289276403Sneel if (vme->inst_length == 0) 1290276403Sneel vme->inst_length = vie->num_processed; 1291276403Sneel 1292261088Sjhb /* return to userland unless this is an in-kernel emulated device */ 1293261088Sjhb if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) { 1294261088Sjhb mread = lapic_mmio_read; 1295261088Sjhb mwrite = lapic_mmio_write; 1296261088Sjhb } else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) { 1297261088Sjhb mread = vioapic_mmio_read; 1298261088Sjhb mwrite = vioapic_mmio_write; 1299261088Sjhb } else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) { 1300261088Sjhb mread = vhpet_mmio_read; 1301261088Sjhb mwrite = vhpet_mmio_write; 1302261088Sjhb } else { 1303262350Sjhb *retu = true; 1304256072Sneel return (0); 1305256072Sneel } 1306256072Sneel 1307270159Sgrehan error = vmm_emulate_instruction(vm, vcpuid, gpa, vie, paging, 1308270159Sgrehan mread, mwrite, retu); 1309256072Sneel 1310256072Sneel return (error); 1311256072Sneel} 1312256072Sneel 1313268935Sjhbstatic int 1314268935Sjhbvm_handle_suspend(struct vm *vm, int vcpuid, bool *retu) 
1315268935Sjhb{ 1316268935Sjhb int i, done; 1317268935Sjhb struct vcpu *vcpu; 1318268935Sjhb 1319268935Sjhb done = 0; 1320268935Sjhb vcpu = &vm->vcpu[vcpuid]; 1321268935Sjhb 1322268935Sjhb CPU_SET_ATOMIC(vcpuid, &vm->suspended_cpus); 1323268935Sjhb 1324268935Sjhb /* 1325268935Sjhb * Wait until all 'active_cpus' have suspended themselves. 1326268935Sjhb * 1327268935Sjhb * Since a VM may be suspended at any time including when one or 1328268935Sjhb * more vcpus are doing a rendezvous we need to call the rendezvous 1329268935Sjhb * handler while we are waiting to prevent a deadlock. 1330268935Sjhb */ 1331268935Sjhb vcpu_lock(vcpu); 1332268935Sjhb while (1) { 1333268935Sjhb if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) { 1334268935Sjhb VCPU_CTR0(vm, vcpuid, "All vcpus suspended"); 1335268935Sjhb break; 1336268935Sjhb } 1337268935Sjhb 1338268935Sjhb if (vm->rendezvous_func == NULL) { 1339268935Sjhb VCPU_CTR0(vm, vcpuid, "Sleeping during suspend"); 1340268935Sjhb vcpu_require_state_locked(vcpu, VCPU_SLEEPING); 1341268935Sjhb msleep_spin(vcpu, &vcpu->mtx, "vmsusp", hz); 1342268935Sjhb vcpu_require_state_locked(vcpu, VCPU_FROZEN); 1343268935Sjhb } else { 1344268935Sjhb VCPU_CTR0(vm, vcpuid, "Rendezvous during suspend"); 1345268935Sjhb vcpu_unlock(vcpu); 1346268935Sjhb vm_handle_rendezvous(vm, vcpuid); 1347268935Sjhb vcpu_lock(vcpu); 1348268935Sjhb } 1349268935Sjhb } 1350268935Sjhb vcpu_unlock(vcpu); 1351268935Sjhb 1352268935Sjhb /* 1353268935Sjhb * Wakeup the other sleeping vcpus and return to userspace. 
1354268935Sjhb */ 1355268935Sjhb for (i = 0; i < VM_MAXCPU; i++) { 1356268935Sjhb if (CPU_ISSET(i, &vm->suspended_cpus)) { 1357268935Sjhb vcpu_notify_event(vm, i, false); 1358268935Sjhb } 1359268935Sjhb } 1360268935Sjhb 1361268935Sjhb *retu = true; 1362268935Sjhb return (0); 1363268935Sjhb} 1364268935Sjhb 1365221828Sgrehanint 1366268935Sjhbvm_suspend(struct vm *vm, enum vm_suspend_how how) 1367268935Sjhb{ 1368268935Sjhb int i; 1369268935Sjhb 1370268935Sjhb if (how <= VM_SUSPEND_NONE || how >= VM_SUSPEND_LAST) 1371268935Sjhb return (EINVAL); 1372268935Sjhb 1373268935Sjhb if (atomic_cmpset_int(&vm->suspend, 0, how) == 0) { 1374268935Sjhb VM_CTR2(vm, "virtual machine already suspended %d/%d", 1375268935Sjhb vm->suspend, how); 1376268935Sjhb return (EALREADY); 1377268935Sjhb } 1378268935Sjhb 1379268935Sjhb VM_CTR1(vm, "virtual machine successfully suspended %d", how); 1380268935Sjhb 1381268935Sjhb /* 1382268935Sjhb * Notify all active vcpus that they are now suspended. 1383268935Sjhb */ 1384268935Sjhb for (i = 0; i < VM_MAXCPU; i++) { 1385268935Sjhb if (CPU_ISSET(i, &vm->active_cpus)) 1386268935Sjhb vcpu_notify_event(vm, i, false); 1387268935Sjhb } 1388268935Sjhb 1389268935Sjhb return (0); 1390268935Sjhb} 1391268935Sjhb 1392268935Sjhbvoid 1393268935Sjhbvm_exit_suspended(struct vm *vm, int vcpuid, uint64_t rip) 1394268935Sjhb{ 1395268935Sjhb struct vm_exit *vmexit; 1396268935Sjhb 1397268935Sjhb KASSERT(vm->suspend > VM_SUSPEND_NONE && vm->suspend < VM_SUSPEND_LAST, 1398268935Sjhb ("vm_exit_suspended: invalid suspend type %d", vm->suspend)); 1399268935Sjhb 1400268935Sjhb vmexit = vm_exitinfo(vm, vcpuid); 1401268935Sjhb vmexit->rip = rip; 1402268935Sjhb vmexit->inst_length = 0; 1403268935Sjhb vmexit->exitcode = VM_EXITCODE_SUSPENDED; 1404268935Sjhb vmexit->u.suspended.how = vm->suspend; 1405268935Sjhb} 1406268935Sjhb 1407270074Sgrehanvoid 1408270074Sgrehanvm_exit_rendezvous(struct vm *vm, int vcpuid, uint64_t rip) 1409270074Sgrehan{ 1410270074Sgrehan struct vm_exit 
*vmexit; 1411270074Sgrehan 1412270074Sgrehan KASSERT(vm->rendezvous_func != NULL, ("rendezvous not in progress")); 1413270074Sgrehan 1414270074Sgrehan vmexit = vm_exitinfo(vm, vcpuid); 1415270074Sgrehan vmexit->rip = rip; 1416270074Sgrehan vmexit->inst_length = 0; 1417270074Sgrehan vmexit->exitcode = VM_EXITCODE_RENDEZVOUS; 1418270074Sgrehan vmm_stat_incr(vm, vcpuid, VMEXIT_RENDEZVOUS, 1); 1419270074Sgrehan} 1420270074Sgrehan 1421270074Sgrehanvoid 1422270074Sgrehanvm_exit_astpending(struct vm *vm, int vcpuid, uint64_t rip) 1423270074Sgrehan{ 1424270074Sgrehan struct vm_exit *vmexit; 1425270074Sgrehan 1426270074Sgrehan vmexit = vm_exitinfo(vm, vcpuid); 1427270074Sgrehan vmexit->rip = rip; 1428270074Sgrehan vmexit->inst_length = 0; 1429270074Sgrehan vmexit->exitcode = VM_EXITCODE_BOGUS; 1430270074Sgrehan vmm_stat_incr(vm, vcpuid, VMEXIT_ASTPENDING, 1); 1431270074Sgrehan} 1432270074Sgrehan 1433268935Sjhbint 1434221828Sgrehanvm_run(struct vm *vm, struct vm_run *vmrun) 1435221828Sgrehan{ 1436256072Sneel int error, vcpuid; 1437221828Sgrehan struct vcpu *vcpu; 1438221828Sgrehan struct pcb *pcb; 1439242065Sneel uint64_t tscval, rip; 1440242065Sneel struct vm_exit *vme; 1441262350Sjhb bool retu, intr_disabled; 1442256072Sneel pmap_t pmap; 1443268935Sjhb void *rptr, *sptr; 1444221828Sgrehan 1445221828Sgrehan vcpuid = vmrun->cpuid; 1446221828Sgrehan 1447221828Sgrehan if (vcpuid < 0 || vcpuid >= VM_MAXCPU) 1448221828Sgrehan return (EINVAL); 1449221828Sgrehan 1450270070Sgrehan if (!CPU_ISSET(vcpuid, &vm->active_cpus)) 1451270070Sgrehan return (EINVAL); 1452270070Sgrehan 1453270070Sgrehan if (CPU_ISSET(vcpuid, &vm->suspended_cpus)) 1454270070Sgrehan return (EINVAL); 1455270070Sgrehan 1456268935Sjhb rptr = &vm->rendezvous_func; 1457268935Sjhb sptr = &vm->suspend; 1458256072Sneel pmap = vmspace_pmap(vm->vmspace); 1459221828Sgrehan vcpu = &vm->vcpu[vcpuid]; 1460256072Sneel vme = &vcpu->exitinfo; 1461242065Sneel rip = vmrun->rip; 1462242065Sneelrestart: 1463221828Sgrehan 
critical_enter(); 1464221828Sgrehan 1465256072Sneel KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active), 1466256072Sneel ("vm_run: absurd pm_active")); 1467256072Sneel 1468221828Sgrehan tscval = rdtsc(); 1469221828Sgrehan 1470221828Sgrehan pcb = PCPU_GET(curpcb); 1471221914Sjhb set_pcb_flags(pcb, PCB_FULL_IRET); 1472221828Sgrehan 1473221828Sgrehan restore_guest_fpustate(vcpu); 1474241489Sneel 1475256072Sneel vcpu_require_state(vm, vcpuid, VCPU_RUNNING); 1476268935Sjhb error = VMRUN(vm->cookie, vcpuid, rip, pmap, rptr, sptr); 1477256072Sneel vcpu_require_state(vm, vcpuid, VCPU_FROZEN); 1478241489Sneel 1479221828Sgrehan save_guest_fpustate(vcpu); 1480221828Sgrehan 1481221828Sgrehan vmm_stat_incr(vm, vcpuid, VCPU_TOTAL_RUNTIME, rdtsc() - tscval); 1482221828Sgrehan 1483221828Sgrehan critical_exit(); 1484221828Sgrehan 1485256072Sneel if (error == 0) { 1486262350Sjhb retu = false; 1487256072Sneel switch (vme->exitcode) { 1488268935Sjhb case VM_EXITCODE_SUSPENDED: 1489268935Sjhb error = vm_handle_suspend(vm, vcpuid, &retu); 1490268935Sjhb break; 1491266339Sjhb case VM_EXITCODE_IOAPIC_EOI: 1492266339Sjhb vioapic_process_eoi(vm, vcpuid, 1493266339Sjhb vme->u.ioapic_eoi.vector); 1494266339Sjhb break; 1495266339Sjhb case VM_EXITCODE_RENDEZVOUS: 1496266339Sjhb vm_handle_rendezvous(vm, vcpuid); 1497266339Sjhb error = 0; 1498266339Sjhb break; 1499256072Sneel case VM_EXITCODE_HLT: 1500262350Sjhb intr_disabled = ((vme->u.hlt.rflags & PSL_I) == 0); 1501262350Sjhb error = vm_handle_hlt(vm, vcpuid, intr_disabled, &retu); 1502256072Sneel break; 1503256072Sneel case VM_EXITCODE_PAGING: 1504256072Sneel error = vm_handle_paging(vm, vcpuid, &retu); 1505256072Sneel break; 1506256072Sneel case VM_EXITCODE_INST_EMUL: 1507256072Sneel error = vm_handle_inst_emul(vm, vcpuid, &retu); 1508256072Sneel break; 1509268976Sjhb case VM_EXITCODE_INOUT: 1510268976Sjhb case VM_EXITCODE_INOUT_STR: 1511268976Sjhb error = vm_handle_inout(vm, vcpuid, vme, &retu); 1512268976Sjhb break; 1513276349Sneel case 
VM_EXITCODE_MONITOR: 1514276349Sneel case VM_EXITCODE_MWAIT: 1515276349Sneel vm_inject_ud(vm, vcpuid); 1516276349Sneel break; 1517256072Sneel default: 1518262350Sjhb retu = true; /* handled in userland */ 1519256072Sneel break; 1520242065Sneel } 1521256072Sneel } 1522242065Sneel 1523262350Sjhb if (error == 0 && retu == false) { 1524242065Sneel rip = vme->rip + vme->inst_length; 1525242065Sneel goto restart; 1526242065Sneel } 1527242065Sneel 1528256072Sneel /* copy the exit information */ 1529256072Sneel bcopy(vme, &vmrun->vm_exit, sizeof(struct vm_exit)); 1530221828Sgrehan return (error); 1531221828Sgrehan} 1532221828Sgrehan 1533221828Sgrehanint 1534270159Sgrehanvm_exit_intinfo(struct vm *vm, int vcpuid, uint64_t info) 1535270159Sgrehan{ 1536270159Sgrehan struct vcpu *vcpu; 1537270159Sgrehan int type, vector; 1538270159Sgrehan 1539270159Sgrehan if (vcpuid < 0 || vcpuid >= VM_MAXCPU) 1540270159Sgrehan return (EINVAL); 1541270159Sgrehan 1542270159Sgrehan vcpu = &vm->vcpu[vcpuid]; 1543270159Sgrehan 1544270159Sgrehan if (info & VM_INTINFO_VALID) { 1545270159Sgrehan type = info & VM_INTINFO_TYPE; 1546270159Sgrehan vector = info & 0xff; 1547270159Sgrehan if (type == VM_INTINFO_NMI && vector != IDT_NMI) 1548270159Sgrehan return (EINVAL); 1549270159Sgrehan if (type == VM_INTINFO_HWEXCEPTION && vector >= 32) 1550270159Sgrehan return (EINVAL); 1551270159Sgrehan if (info & VM_INTINFO_RSVD) 1552270159Sgrehan return (EINVAL); 1553270159Sgrehan } else { 1554270159Sgrehan info = 0; 1555270159Sgrehan } 1556270159Sgrehan VCPU_CTR2(vm, vcpuid, "%s: info1(%#lx)", __func__, info); 1557270159Sgrehan vcpu->exitintinfo = info; 1558270159Sgrehan return (0); 1559270159Sgrehan} 1560270159Sgrehan 1561270159Sgrehanenum exc_class { 1562270159Sgrehan EXC_BENIGN, 1563270159Sgrehan EXC_CONTRIBUTORY, 1564270159Sgrehan EXC_PAGEFAULT 1565270159Sgrehan}; 1566270159Sgrehan 1567270159Sgrehan#define IDT_VE 20 /* Virtualization Exception (Intel specific) */ 1568270159Sgrehan 1569270159Sgrehanstatic enum 
exc_class 1570270159Sgrehanexception_class(uint64_t info) 1571270159Sgrehan{ 1572270159Sgrehan int type, vector; 1573270159Sgrehan 1574270159Sgrehan KASSERT(info & VM_INTINFO_VALID, ("intinfo must be valid: %#lx", info)); 1575270159Sgrehan type = info & VM_INTINFO_TYPE; 1576270159Sgrehan vector = info & 0xff; 1577270159Sgrehan 1578270159Sgrehan /* Table 6-4, "Interrupt and Exception Classes", Intel SDM, Vol 3 */ 1579270159Sgrehan switch (type) { 1580270159Sgrehan case VM_INTINFO_HWINTR: 1581270159Sgrehan case VM_INTINFO_SWINTR: 1582270159Sgrehan case VM_INTINFO_NMI: 1583270159Sgrehan return (EXC_BENIGN); 1584270159Sgrehan default: 1585270159Sgrehan /* 1586270159Sgrehan * Hardware exception. 1587270159Sgrehan * 1588270159Sgrehan * SVM and VT-x use identical type values to represent NMI, 1589270159Sgrehan * hardware interrupt and software interrupt. 1590270159Sgrehan * 1591270159Sgrehan * SVM uses type '3' for all exceptions. VT-x uses type '3' 1592270159Sgrehan * for exceptions except #BP and #OF. #BP and #OF use a type 1593270159Sgrehan * value of '5' or '6'. Therefore we don't check for explicit 1594270159Sgrehan * values of 'type' to classify 'intinfo' into a hardware 1595270159Sgrehan * exception. 
1596270159Sgrehan */ 1597270159Sgrehan break; 1598270159Sgrehan } 1599270159Sgrehan 1600270159Sgrehan switch (vector) { 1601270159Sgrehan case IDT_PF: 1602270159Sgrehan case IDT_VE: 1603270159Sgrehan return (EXC_PAGEFAULT); 1604270159Sgrehan case IDT_DE: 1605270159Sgrehan case IDT_TS: 1606270159Sgrehan case IDT_NP: 1607270159Sgrehan case IDT_SS: 1608270159Sgrehan case IDT_GP: 1609270159Sgrehan return (EXC_CONTRIBUTORY); 1610270159Sgrehan default: 1611270159Sgrehan return (EXC_BENIGN); 1612270159Sgrehan } 1613270159Sgrehan} 1614270159Sgrehan 1615270159Sgrehanstatic int 1616270159Sgrehannested_fault(struct vm *vm, int vcpuid, uint64_t info1, uint64_t info2, 1617270159Sgrehan uint64_t *retinfo) 1618270159Sgrehan{ 1619270159Sgrehan enum exc_class exc1, exc2; 1620270159Sgrehan int type1, vector1; 1621270159Sgrehan 1622270159Sgrehan KASSERT(info1 & VM_INTINFO_VALID, ("info1 %#lx is not valid", info1)); 1623270159Sgrehan KASSERT(info2 & VM_INTINFO_VALID, ("info2 %#lx is not valid", info2)); 1624270159Sgrehan 1625270159Sgrehan /* 1626270159Sgrehan * If an exception occurs while attempting to call the double-fault 1627270159Sgrehan * handler the processor enters shutdown mode (aka triple fault). 
1628270159Sgrehan */ 1629270159Sgrehan type1 = info1 & VM_INTINFO_TYPE; 1630270159Sgrehan vector1 = info1 & 0xff; 1631270159Sgrehan if (type1 == VM_INTINFO_HWEXCEPTION && vector1 == IDT_DF) { 1632270159Sgrehan VCPU_CTR2(vm, vcpuid, "triple fault: info1(%#lx), info2(%#lx)", 1633270159Sgrehan info1, info2); 1634270159Sgrehan vm_suspend(vm, VM_SUSPEND_TRIPLEFAULT); 1635270159Sgrehan *retinfo = 0; 1636270159Sgrehan return (0); 1637270159Sgrehan } 1638270159Sgrehan 1639270159Sgrehan /* 1640270159Sgrehan * Table 6-5 "Conditions for Generating a Double Fault", Intel SDM, Vol3 1641270159Sgrehan */ 1642270159Sgrehan exc1 = exception_class(info1); 1643270159Sgrehan exc2 = exception_class(info2); 1644270159Sgrehan if ((exc1 == EXC_CONTRIBUTORY && exc2 == EXC_CONTRIBUTORY) || 1645270159Sgrehan (exc1 == EXC_PAGEFAULT && exc2 != EXC_BENIGN)) { 1646270159Sgrehan /* Convert nested fault into a double fault. */ 1647270159Sgrehan *retinfo = IDT_DF; 1648270159Sgrehan *retinfo |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION; 1649270159Sgrehan *retinfo |= VM_INTINFO_DEL_ERRCODE; 1650270159Sgrehan } else { 1651270159Sgrehan /* Handle exceptions serially */ 1652270159Sgrehan *retinfo = info2; 1653270159Sgrehan } 1654270159Sgrehan return (1); 1655270159Sgrehan} 1656270159Sgrehan 1657270159Sgrehanstatic uint64_t 1658270159Sgrehanvcpu_exception_intinfo(struct vcpu *vcpu) 1659270159Sgrehan{ 1660270159Sgrehan uint64_t info = 0; 1661270159Sgrehan 1662270159Sgrehan if (vcpu->exception_pending) { 1663270159Sgrehan info = vcpu->exception.vector & 0xff; 1664270159Sgrehan info |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION; 1665270159Sgrehan if (vcpu->exception.error_code_valid) { 1666270159Sgrehan info |= VM_INTINFO_DEL_ERRCODE; 1667270159Sgrehan info |= (uint64_t)vcpu->exception.error_code << 32; 1668270159Sgrehan } 1669270159Sgrehan } 1670270159Sgrehan return (info); 1671270159Sgrehan} 1672270159Sgrehan 1673270159Sgrehanint 1674270159Sgrehanvm_entry_intinfo(struct vm *vm, int vcpuid, uint64_t 
*retinfo) 1675270159Sgrehan{ 1676270159Sgrehan struct vcpu *vcpu; 1677270159Sgrehan uint64_t info1, info2; 1678270159Sgrehan int valid; 1679270159Sgrehan 1680270159Sgrehan KASSERT(vcpuid >= 0 && vcpuid < VM_MAXCPU, ("invalid vcpu %d", vcpuid)); 1681270159Sgrehan 1682270159Sgrehan vcpu = &vm->vcpu[vcpuid]; 1683270159Sgrehan 1684270159Sgrehan info1 = vcpu->exitintinfo; 1685270159Sgrehan vcpu->exitintinfo = 0; 1686270159Sgrehan 1687270159Sgrehan info2 = 0; 1688270159Sgrehan if (vcpu->exception_pending) { 1689270159Sgrehan info2 = vcpu_exception_intinfo(vcpu); 1690270159Sgrehan vcpu->exception_pending = 0; 1691270159Sgrehan VCPU_CTR2(vm, vcpuid, "Exception %d delivered: %#lx", 1692270159Sgrehan vcpu->exception.vector, info2); 1693270159Sgrehan } 1694270159Sgrehan 1695270159Sgrehan if ((info1 & VM_INTINFO_VALID) && (info2 & VM_INTINFO_VALID)) { 1696270159Sgrehan valid = nested_fault(vm, vcpuid, info1, info2, retinfo); 1697270159Sgrehan } else if (info1 & VM_INTINFO_VALID) { 1698270159Sgrehan *retinfo = info1; 1699270159Sgrehan valid = 1; 1700270159Sgrehan } else if (info2 & VM_INTINFO_VALID) { 1701270159Sgrehan *retinfo = info2; 1702270159Sgrehan valid = 1; 1703270159Sgrehan } else { 1704270159Sgrehan valid = 0; 1705270159Sgrehan } 1706270159Sgrehan 1707270159Sgrehan if (valid) { 1708270159Sgrehan VCPU_CTR4(vm, vcpuid, "%s: info1(%#lx), info2(%#lx), " 1709270159Sgrehan "retinfo(%#lx)", __func__, info1, info2, *retinfo); 1710270159Sgrehan } 1711270159Sgrehan 1712270159Sgrehan return (valid); 1713270159Sgrehan} 1714270159Sgrehan 1715270159Sgrehanint 1716270159Sgrehanvm_get_intinfo(struct vm *vm, int vcpuid, uint64_t *info1, uint64_t *info2) 1717270159Sgrehan{ 1718270159Sgrehan struct vcpu *vcpu; 1719270159Sgrehan 1720270159Sgrehan if (vcpuid < 0 || vcpuid >= VM_MAXCPU) 1721270159Sgrehan return (EINVAL); 1722270159Sgrehan 1723270159Sgrehan vcpu = &vm->vcpu[vcpuid]; 1724270159Sgrehan *info1 = vcpu->exitintinfo; 1725270159Sgrehan *info2 = vcpu_exception_intinfo(vcpu); 
1726270159Sgrehan return (0); 1727270159Sgrehan} 1728270159Sgrehan 1729270159Sgrehanint 1730267427Sjhbvm_inject_exception(struct vm *vm, int vcpuid, struct vm_exception *exception) 1731221828Sgrehan{ 1732267427Sjhb struct vcpu *vcpu; 1733267427Sjhb 1734221828Sgrehan if (vcpuid < 0 || vcpuid >= VM_MAXCPU) 1735221828Sgrehan return (EINVAL); 1736221828Sgrehan 1737267427Sjhb if (exception->vector < 0 || exception->vector >= 32) 1738221828Sgrehan return (EINVAL); 1739221828Sgrehan 1740270159Sgrehan /* 1741270159Sgrehan * A double fault exception should never be injected directly into 1742270159Sgrehan * the guest. It is a derived exception that results from specific 1743270159Sgrehan * combinations of nested faults. 1744270159Sgrehan */ 1745270159Sgrehan if (exception->vector == IDT_DF) 1746270159Sgrehan return (EINVAL); 1747270159Sgrehan 1748267427Sjhb vcpu = &vm->vcpu[vcpuid]; 1749221828Sgrehan 1750267427Sjhb if (vcpu->exception_pending) { 1751267427Sjhb VCPU_CTR2(vm, vcpuid, "Unable to inject exception %d due to " 1752267427Sjhb "pending exception %d", exception->vector, 1753267427Sjhb vcpu->exception.vector); 1754267427Sjhb return (EBUSY); 1755267427Sjhb } 1756267427Sjhb 1757267427Sjhb vcpu->exception_pending = 1; 1758267427Sjhb vcpu->exception = *exception; 1759267427Sjhb VCPU_CTR1(vm, vcpuid, "Exception %d pending", exception->vector); 1760267427Sjhb return (0); 1761221828Sgrehan} 1762221828Sgrehan 1763270159Sgrehanvoid 1764270159Sgrehanvm_inject_fault(void *vmarg, int vcpuid, int vector, int errcode_valid, 1765270159Sgrehan int errcode) 1766267427Sjhb{ 1767270159Sgrehan struct vm_exception exception; 1768267427Sjhb struct vm_exit *vmexit; 1769270159Sgrehan struct vm *vm; 1770267427Sjhb int error; 1771267427Sjhb 1772270159Sgrehan vm = vmarg; 1773270159Sgrehan 1774270159Sgrehan exception.vector = vector; 1775270159Sgrehan exception.error_code = errcode; 1776270159Sgrehan exception.error_code_valid = errcode_valid; 1777270159Sgrehan error = vm_inject_exception(vm, 
vcpuid, &exception); 1778267427Sjhb KASSERT(error == 0, ("vm_inject_exception error %d", error)); 1779267427Sjhb 1780267427Sjhb /* 1781267427Sjhb * A fault-like exception allows the instruction to be restarted 1782267427Sjhb * after the exception handler returns. 1783267427Sjhb * 1784267427Sjhb * By setting the inst_length to 0 we ensure that the instruction 1785267427Sjhb * pointer remains at the faulting instruction. 1786267427Sjhb */ 1787267427Sjhb vmexit = vm_exitinfo(vm, vcpuid); 1788267427Sjhb vmexit->inst_length = 0; 1789267427Sjhb} 1790267427Sjhb 1791267427Sjhbvoid 1792270159Sgrehanvm_inject_pf(void *vmarg, int vcpuid, int error_code, uint64_t cr2) 1793268976Sjhb{ 1794270159Sgrehan struct vm *vm; 1795268976Sjhb int error; 1796268976Sjhb 1797270159Sgrehan vm = vmarg; 1798268976Sjhb VCPU_CTR2(vm, vcpuid, "Injecting page fault: error_code %#x, cr2 %#lx", 1799268976Sjhb error_code, cr2); 1800268976Sjhb 1801268976Sjhb error = vm_set_register(vm, vcpuid, VM_REG_GUEST_CR2, cr2); 1802268976Sjhb KASSERT(error == 0, ("vm_set_register(cr2) error %d", error)); 1803268976Sjhb 1804270159Sgrehan vm_inject_fault(vm, vcpuid, IDT_PF, 1, error_code); 1805268976Sjhb} 1806268976Sjhb 1807248389Sneelstatic VMM_STAT(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu"); 1808241982Sneel 1809221828Sgrehanint 1810241982Sneelvm_inject_nmi(struct vm *vm, int vcpuid) 1811221828Sgrehan{ 1812241982Sneel struct vcpu *vcpu; 1813221828Sgrehan 1814241982Sneel if (vcpuid < 0 || vcpuid >= VM_MAXCPU) 1815221828Sgrehan return (EINVAL); 1816221828Sgrehan 1817241982Sneel vcpu = &vm->vcpu[vcpuid]; 1818241982Sneel 1819241982Sneel vcpu->nmi_pending = 1; 1820266339Sjhb vcpu_notify_event(vm, vcpuid, false); 1821241982Sneel return (0); 1822221828Sgrehan} 1823221828Sgrehan 1824221828Sgrehanint 1825241982Sneelvm_nmi_pending(struct vm *vm, int vcpuid) 1826241982Sneel{ 1827241982Sneel struct vcpu *vcpu; 1828241982Sneel 1829241982Sneel if (vcpuid < 0 || vcpuid >= VM_MAXCPU) 1830241982Sneel 
panic("vm_nmi_pending: invalid vcpuid %d", vcpuid); 1831241982Sneel 1832241982Sneel vcpu = &vm->vcpu[vcpuid]; 1833241982Sneel 1834241982Sneel return (vcpu->nmi_pending); 1835241982Sneel} 1836241982Sneel 1837241982Sneelvoid 1838241982Sneelvm_nmi_clear(struct vm *vm, int vcpuid) 1839241982Sneel{ 1840241982Sneel struct vcpu *vcpu; 1841241982Sneel 1842241982Sneel if (vcpuid < 0 || vcpuid >= VM_MAXCPU) 1843241982Sneel panic("vm_nmi_pending: invalid vcpuid %d", vcpuid); 1844241982Sneel 1845241982Sneel vcpu = &vm->vcpu[vcpuid]; 1846241982Sneel 1847241982Sneel if (vcpu->nmi_pending == 0) 1848241982Sneel panic("vm_nmi_clear: inconsistent nmi_pending state"); 1849241982Sneel 1850241982Sneel vcpu->nmi_pending = 0; 1851241982Sneel vmm_stat_incr(vm, vcpuid, VCPU_NMI_COUNT, 1); 1852241982Sneel} 1853241982Sneel 1854268891Sjhbstatic VMM_STAT(VCPU_EXTINT_COUNT, "number of ExtINTs delivered to vcpu"); 1855268891Sjhb 1856241982Sneelint 1857268891Sjhbvm_inject_extint(struct vm *vm, int vcpuid) 1858268891Sjhb{ 1859268891Sjhb struct vcpu *vcpu; 1860268891Sjhb 1861268891Sjhb if (vcpuid < 0 || vcpuid >= VM_MAXCPU) 1862268891Sjhb return (EINVAL); 1863268891Sjhb 1864268891Sjhb vcpu = &vm->vcpu[vcpuid]; 1865268891Sjhb 1866268891Sjhb vcpu->extint_pending = 1; 1867268891Sjhb vcpu_notify_event(vm, vcpuid, false); 1868268891Sjhb return (0); 1869268891Sjhb} 1870268891Sjhb 1871268891Sjhbint 1872268891Sjhbvm_extint_pending(struct vm *vm, int vcpuid) 1873268891Sjhb{ 1874268891Sjhb struct vcpu *vcpu; 1875268891Sjhb 1876268891Sjhb if (vcpuid < 0 || vcpuid >= VM_MAXCPU) 1877268891Sjhb panic("vm_extint_pending: invalid vcpuid %d", vcpuid); 1878268891Sjhb 1879268891Sjhb vcpu = &vm->vcpu[vcpuid]; 1880268891Sjhb 1881268891Sjhb return (vcpu->extint_pending); 1882268891Sjhb} 1883268891Sjhb 1884268891Sjhbvoid 1885268891Sjhbvm_extint_clear(struct vm *vm, int vcpuid) 1886268891Sjhb{ 1887268891Sjhb struct vcpu *vcpu; 1888268891Sjhb 1889268891Sjhb if (vcpuid < 0 || vcpuid >= VM_MAXCPU) 1890268891Sjhb 
panic("vm_extint_pending: invalid vcpuid %d", vcpuid); 1891268891Sjhb 1892268891Sjhb vcpu = &vm->vcpu[vcpuid]; 1893268891Sjhb 1894268891Sjhb if (vcpu->extint_pending == 0) 1895268891Sjhb panic("vm_extint_clear: inconsistent extint_pending state"); 1896268891Sjhb 1897268891Sjhb vcpu->extint_pending = 0; 1898268891Sjhb vmm_stat_incr(vm, vcpuid, VCPU_EXTINT_COUNT, 1); 1899268891Sjhb} 1900268891Sjhb 1901268891Sjhbint 1902221828Sgrehanvm_get_capability(struct vm *vm, int vcpu, int type, int *retval) 1903221828Sgrehan{ 1904221828Sgrehan if (vcpu < 0 || vcpu >= VM_MAXCPU) 1905221828Sgrehan return (EINVAL); 1906221828Sgrehan 1907221828Sgrehan if (type < 0 || type >= VM_CAP_MAX) 1908221828Sgrehan return (EINVAL); 1909221828Sgrehan 1910221828Sgrehan return (VMGETCAP(vm->cookie, vcpu, type, retval)); 1911221828Sgrehan} 1912221828Sgrehan 1913221828Sgrehanint 1914221828Sgrehanvm_set_capability(struct vm *vm, int vcpu, int type, int val) 1915221828Sgrehan{ 1916221828Sgrehan if (vcpu < 0 || vcpu >= VM_MAXCPU) 1917221828Sgrehan return (EINVAL); 1918221828Sgrehan 1919221828Sgrehan if (type < 0 || type >= VM_CAP_MAX) 1920221828Sgrehan return (EINVAL); 1921221828Sgrehan 1922221828Sgrehan return (VMSETCAP(vm->cookie, vcpu, type, val)); 1923221828Sgrehan} 1924221828Sgrehan 1925221828Sgrehanstruct vlapic * 1926221828Sgrehanvm_lapic(struct vm *vm, int cpu) 1927221828Sgrehan{ 1928221828Sgrehan return (vm->vcpu[cpu].vlapic); 1929221828Sgrehan} 1930221828Sgrehan 1931261088Sjhbstruct vioapic * 1932261088Sjhbvm_ioapic(struct vm *vm) 1933261088Sjhb{ 1934261088Sjhb 1935261088Sjhb return (vm->vioapic); 1936261088Sjhb} 1937261088Sjhb 1938261088Sjhbstruct vhpet * 1939261088Sjhbvm_hpet(struct vm *vm) 1940261088Sjhb{ 1941261088Sjhb 1942261088Sjhb return (vm->vhpet); 1943261088Sjhb} 1944261088Sjhb 1945221828Sgrehanboolean_t 1946221828Sgrehanvmm_is_pptdev(int bus, int slot, int func) 1947221828Sgrehan{ 1948246188Sneel int found, i, n; 1949246188Sneel int b, s, f; 1950221828Sgrehan char *val, *cp, 
*cp2; 1951221828Sgrehan 1952221828Sgrehan /* 1953246188Sneel * XXX 1954246188Sneel * The length of an environment variable is limited to 128 bytes which 1955246188Sneel * puts an upper limit on the number of passthru devices that may be 1956246188Sneel * specified using a single environment variable. 1957246188Sneel * 1958246188Sneel * Work around this by scanning multiple environment variable 1959246188Sneel * names instead of a single one - yuck! 1960221828Sgrehan */ 1961246188Sneel const char *names[] = { "pptdevs", "pptdevs2", "pptdevs3", NULL }; 1962246188Sneel 1963246188Sneel /* set pptdevs="1/2/3 4/5/6 7/8/9 10/11/12" */ 1964221828Sgrehan found = 0; 1965246188Sneel for (i = 0; names[i] != NULL && !found; i++) { 1966246188Sneel cp = val = getenv(names[i]); 1967246188Sneel while (cp != NULL && *cp != '\0') { 1968246188Sneel if ((cp2 = strchr(cp, ' ')) != NULL) 1969246188Sneel *cp2 = '\0'; 1970221828Sgrehan 1971246188Sneel n = sscanf(cp, "%d/%d/%d", &b, &s, &f); 1972246188Sneel if (n == 3 && bus == b && slot == s && func == f) { 1973246188Sneel found = 1; 1974246188Sneel break; 1975246188Sneel } 1976221828Sgrehan 1977246188Sneel if (cp2 != NULL) 1978246188Sneel *cp2++ = ' '; 1979221828Sgrehan 1980246188Sneel cp = cp2; 1981246188Sneel } 1982246188Sneel freeenv(val); 1983221828Sgrehan } 1984221828Sgrehan return (found); 1985221828Sgrehan} 1986221828Sgrehan 1987221828Sgrehanvoid * 1988221828Sgrehanvm_iommu_domain(struct vm *vm) 1989221828Sgrehan{ 1990221828Sgrehan 1991221828Sgrehan return (vm->iommu); 1992221828Sgrehan} 1993221828Sgrehan 1994241489Sneelint 1995266393Sjhbvcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state newstate, 1996266393Sjhb bool from_idle) 1997221828Sgrehan{ 1998241489Sneel int error; 1999221828Sgrehan struct vcpu *vcpu; 2000221828Sgrehan 2001221828Sgrehan if (vcpuid < 0 || vcpuid >= VM_MAXCPU) 2002221828Sgrehan panic("vm_set_run_state: invalid vcpuid %d", vcpuid); 2003221828Sgrehan 2004221828Sgrehan vcpu = &vm->vcpu[vcpuid]; 
2005221828Sgrehan 2006241489Sneel vcpu_lock(vcpu); 2007266393Sjhb error = vcpu_set_state_locked(vcpu, newstate, from_idle); 2008241489Sneel vcpu_unlock(vcpu); 2009241489Sneel 2010241489Sneel return (error); 2011221828Sgrehan} 2012221828Sgrehan 2013241489Sneelenum vcpu_state 2014249879Sgrehanvcpu_get_state(struct vm *vm, int vcpuid, int *hostcpu) 2015221828Sgrehan{ 2016221828Sgrehan struct vcpu *vcpu; 2017241489Sneel enum vcpu_state state; 2018221828Sgrehan 2019221828Sgrehan if (vcpuid < 0 || vcpuid >= VM_MAXCPU) 2020221828Sgrehan panic("vm_get_run_state: invalid vcpuid %d", vcpuid); 2021221828Sgrehan 2022221828Sgrehan vcpu = &vm->vcpu[vcpuid]; 2023221828Sgrehan 2024241489Sneel vcpu_lock(vcpu); 2025241489Sneel state = vcpu->state; 2026249879Sgrehan if (hostcpu != NULL) 2027249879Sgrehan *hostcpu = vcpu->hostcpu; 2028241489Sneel vcpu_unlock(vcpu); 2029221828Sgrehan 2030241489Sneel return (state); 2031221828Sgrehan} 2032221828Sgrehan 2033270070Sgrehanint 2034221828Sgrehanvm_activate_cpu(struct vm *vm, int vcpuid) 2035221828Sgrehan{ 2036221828Sgrehan 2037270070Sgrehan if (vcpuid < 0 || vcpuid >= VM_MAXCPU) 2038270070Sgrehan return (EINVAL); 2039266339Sjhb 2040270070Sgrehan if (CPU_ISSET(vcpuid, &vm->active_cpus)) 2041270070Sgrehan return (EBUSY); 2042270070Sgrehan 2043266339Sjhb VCPU_CTR0(vm, vcpuid, "activated"); 2044266339Sjhb CPU_SET_ATOMIC(vcpuid, &vm->active_cpus); 2045270070Sgrehan return (0); 2046221828Sgrehan} 2047221828Sgrehan 2048223621Sgrehancpuset_t 2049221828Sgrehanvm_active_cpus(struct vm *vm) 2050221828Sgrehan{ 2051221828Sgrehan 2052221828Sgrehan return (vm->active_cpus); 2053221828Sgrehan} 2054221828Sgrehan 2055270070Sgrehancpuset_t 2056270070Sgrehanvm_suspended_cpus(struct vm *vm) 2057270070Sgrehan{ 2058270070Sgrehan 2059270070Sgrehan return (vm->suspended_cpus); 2060270070Sgrehan} 2061270070Sgrehan 2062221828Sgrehanvoid * 2063221828Sgrehanvcpu_stats(struct vm *vm, int vcpuid) 2064221828Sgrehan{ 2065221828Sgrehan 2066221828Sgrehan return 
(vm->vcpu[vcpuid].stats); 2067221828Sgrehan} 2068240922Sneel 2069240922Sneelint 2070240922Sneelvm_get_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state *state) 2071240922Sneel{ 2072240922Sneel if (vcpuid < 0 || vcpuid >= VM_MAXCPU) 2073240922Sneel return (EINVAL); 2074240922Sneel 2075240922Sneel *state = vm->vcpu[vcpuid].x2apic_state; 2076240922Sneel 2077240922Sneel return (0); 2078240922Sneel} 2079240922Sneel 2080240922Sneelint 2081240922Sneelvm_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state) 2082240922Sneel{ 2083240922Sneel if (vcpuid < 0 || vcpuid >= VM_MAXCPU) 2084240922Sneel return (EINVAL); 2085240922Sneel 2086248392Sneel if (state >= X2APIC_STATE_LAST) 2087240922Sneel return (EINVAL); 2088240922Sneel 2089240922Sneel vm->vcpu[vcpuid].x2apic_state = state; 2090240922Sneel 2091240943Sneel vlapic_set_x2apic_state(vm, vcpuid, state); 2092240943Sneel 2093240922Sneel return (0); 2094240922Sneel} 2095241489Sneel 2096262350Sjhb/* 2097262350Sjhb * This function is called to ensure that a vcpu "sees" a pending event 2098262350Sjhb * as soon as possible: 2099262350Sjhb * - If the vcpu thread is sleeping then it is woken up. 2100262350Sjhb * - If the vcpu is running on a different host_cpu then an IPI will be directed 2101262350Sjhb * to the host_cpu to cause the vcpu to trap into the hypervisor. 
/*
 * This function is called to ensure that a vcpu "sees" a pending event
 * as soon as possible:
 * - If the vcpu thread is sleeping then it is woken up.
 * - If the vcpu is running on a different host_cpu then an IPI will be directed
 *   to the host_cpu to cause the vcpu to trap into the hypervisor.
 */
void
vcpu_notify_event(struct vm *vm, int vcpuid, bool lapic_intr)
{
	int hostcpu;
	struct vcpu *vcpu;

	vcpu = &vm->vcpu[vcpuid];

	/* 'state' and 'hostcpu' must be read consistently under the lock. */
	vcpu_lock(vcpu);
	hostcpu = vcpu->hostcpu;
	if (vcpu->state == VCPU_RUNNING) {
		KASSERT(hostcpu != NOCPU, ("vcpu running on invalid hostcpu"));
		if (hostcpu != curcpu) {
			if (lapic_intr) {
				/* Post via the vlapic so the interrupt is
				 * delivered through the APIC machinery. */
				vlapic_post_intr(vcpu->vlapic, hostcpu,
				    vmm_ipinum);
			} else {
				ipi_cpu(hostcpu, vmm_ipinum);
			}
		} else {
			/*
			 * If the 'vcpu' is running on 'curcpu' then it must
			 * be sending a notification to itself (e.g. SELF_IPI).
			 * The pending event will be picked up when the vcpu
			 * transitions back to guest context.
			 */
		}
	} else {
		KASSERT(hostcpu == NOCPU, ("vcpu state %d not consistent "
		    "with hostcpu %d", vcpu->state, hostcpu));
		if (vcpu->state == VCPU_SLEEPING)
			wakeup_one(vcpu);
	}
	vcpu_unlock(vcpu);
}

/* Return the vmspace backing the guest's physical address space. */
struct vmspace *
vm_get_vmspace(struct vm *vm)
{

	return (vm->vmspace);
}

int
vm_apicid2vcpuid(struct vm *vm, int apicid)
{
	/*
	 * XXX apic id is assumed to be numerically identical to vcpu id
	 */
	return (apicid);
}

/*
 * Initiate a rendezvous of the vcpus in 'dest': each target vcpu runs
 * 'func(arg)' in hypervisor context.  'vcpuid' is the initiating vcpu
 * (or -1 when initiated from a non-vcpu thread).  Blocks until the
 * rendezvous completes.
 */
void
vm_smp_rendezvous(struct vm *vm, int vcpuid, cpuset_t dest,
    vm_rendezvous_func_t func, void *arg)
{
	int i;

	/*
	 * Enforce that this function is called without any locks
	 */
	WITNESS_WARN(WARN_PANIC, NULL, "vm_smp_rendezvous");
	KASSERT(vcpuid == -1 || (vcpuid >= 0 && vcpuid < VM_MAXCPU),
	    ("vm_smp_rendezvous: invalid vcpuid %d", vcpuid));

restart:
	mtx_lock(&vm->rendezvous_mtx);
	if (vm->rendezvous_func != NULL) {
		/*
		 * If a rendezvous is already in progress then we need to
		 * call the rendezvous handler in case this 'vcpuid' is one
		 * of the targets of the rendezvous.
		 */
		RENDEZVOUS_CTR0(vm, vcpuid, "Rendezvous already in progress");
		mtx_unlock(&vm->rendezvous_mtx);
		vm_handle_rendezvous(vm, vcpuid);
		goto restart;
	}
	KASSERT(vm->rendezvous_func == NULL, ("vm_smp_rendezvous: previous "
	    "rendezvous is still in progress"));

	RENDEZVOUS_CTR0(vm, vcpuid, "Initiating rendezvous");
	vm->rendezvous_req_cpus = dest;
	CPU_ZERO(&vm->rendezvous_done_cpus);
	vm->rendezvous_arg = arg;
	vm_set_rendezvous_func(vm, func);
	mtx_unlock(&vm->rendezvous_mtx);

	/*
	 * Wake up any sleeping vcpus and trigger a VM-exit in any running
	 * vcpus so they handle the rendezvous as soon as possible.
	 */
	for (i = 0; i < VM_MAXCPU; i++) {
		if (CPU_ISSET(i, &dest))
			vcpu_notify_event(vm, i, false);
	}

	/* Participate in (and wait for) the rendezvous ourselves. */
	vm_handle_rendezvous(vm, vcpuid);
}

struct vatpic *
vm_atpic(struct vm *vm)
{
	return (vm->vatpic);
}

struct vatpit *
vm_atpit(struct vm *vm)
{
	return (vm->vatpit);
}

/*
 * Map an instruction-emulation segment encoding (0..5) to the
 * corresponding guest segment register name (ES/CS/SS/DS/FS/GS).
 */
enum vm_reg_name
vm_segment_name(int seg)
{
	static enum vm_reg_name seg_names[] = {
		VM_REG_GUEST_ES,
		VM_REG_GUEST_CS,
		VM_REG_GUEST_SS,
		VM_REG_GUEST_DS,
		VM_REG_GUEST_FS,
		VM_REG_GUEST_GS
	};

	KASSERT(seg >= 0 && seg < nitems(seg_names),
	    ("%s: invalid segment encoding %d", __func__, seg));
	return (seg_names[seg]);
}
/*
 * Release the guest pages held by a prior vm_copy_setup() and reset the
 * copyinfo array so it can be reused.
 */
void
vm_copy_teardown(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo,
    int num_copyinfo)
{
	int idx;

	for (idx = 0; idx < num_copyinfo; idx++) {
		if (copyinfo[idx].cookie != NULL)
			vm_gpa_release(copyinfo[idx].cookie);
	}
	bzero(copyinfo, num_copyinfo * sizeof(struct vm_copyinfo));
}

/*
 * Prepare to copy 'len' bytes at guest linear address 'gla': translate the
 * range to guest physical addresses (one entry per page crossing) and wire
 * the backing pages into 'copyinfo'.
 *
 * Returns 0 on success, a non-zero translation error from vmm_gla2gpa(),
 * or -1 if a page could not be held (in which case everything set up so
 * far is torn down).  On success the caller must eventually call
 * vm_copy_teardown() to release the held pages.
 */
int
vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
    uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo,
    int num_copyinfo)
{
	int error, idx, nused;
	size_t n, off, remaining;
	void *hva, *cookie;
	uint64_t gpa;

	bzero(copyinfo, sizeof(struct vm_copyinfo) * num_copyinfo);

	/* First pass: translate the linear range one page at a time. */
	nused = 0;
	remaining = len;
	while (remaining > 0) {
		KASSERT(nused < num_copyinfo, ("insufficient vm_copyinfo"));
		error = vmm_gla2gpa(vm, vcpuid, paging, gla, prot, &gpa);
		if (error)
			return (error);
		off = gpa & PAGE_MASK;
		n = min(remaining, PAGE_SIZE - off);
		copyinfo[nused].gpa = gpa;
		copyinfo[nused].len = n;
		remaining -= n;
		gla += n;
		nused++;
	}

	/* Second pass: hold the backing page of each translated chunk. */
	for (idx = 0; idx < nused; idx++) {
		hva = vm_gpa_hold(vm, copyinfo[idx].gpa, copyinfo[idx].len,
		    prot, &cookie);
		if (hva == NULL)
			break;
		copyinfo[idx].hva = hva;
		copyinfo[idx].cookie = cookie;
	}

	if (idx != nused) {
		/* Could not hold every page: release what we have. */
		vm_copy_teardown(vm, vcpuid, copyinfo, num_copyinfo);
		return (-1);
	} else {
		return (0);
	}
}

/*
 * Copy guest memory described by 'copyinfo' into the kernel buffer
 * 'kaddr'.  'copyinfo' must have been populated by vm_copy_setup() and
 * 'len' must not exceed the length it was set up for.
 */
void
vm_copyin(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, void *kaddr,
    size_t len)
{
	char *dst;
	int idx;

	dst = kaddr;
	idx = 0;
	while (len > 0) {
		bcopy(copyinfo[idx].hva, dst, copyinfo[idx].len);
		len -= copyinfo[idx].len;
		dst += copyinfo[idx].len;
		idx++;
	}
}

/*
 * Copy the kernel buffer 'kaddr' into guest memory described by
 * 'copyinfo' (the mirror image of vm_copyin()).
 */
void
vm_copyout(struct vm *vm, int vcpuid, const void *kaddr,
    struct vm_copyinfo *copyinfo, size_t len)
{
	const char *src;
	int idx;

	src = kaddr;
	idx = 0;
	while (len > 0) {
		bcopy(src, copyinfo[idx].hva, copyinfo[idx].len);
		len -= copyinfo[idx].len;
		src += copyinfo[idx].len;
		idx++;
	}
}

/*
 * Return the amount of in-use and wired memory for the VM.  Since
 * these are global stats, only return the values for vCPU 0.
 */
VMM_STAT_DECLARE(VMM_MEM_RESIDENT);
VMM_STAT_DECLARE(VMM_MEM_WIRED);

static void
vm_get_rescnt(struct vm *vm, int vcpu, struct vmm_stat_type *stat)
{

	if (vcpu == 0) {
		vmm_stat_set(vm, vcpu, VMM_MEM_RESIDENT,
		    PAGE_SIZE * vmspace_resident_count(vm->vmspace));
	}
}

static void
vm_get_wiredcnt(struct vm *vm, int vcpu, struct vmm_stat_type *stat)
{

	if (vcpu == 0) {
		vmm_stat_set(vm, vcpu, VMM_MEM_WIRED,
		    PAGE_SIZE * pmap_wired_count(vmspace_pmap(vm->vmspace)));
	}
}

VMM_STAT_FUNC(VMM_MEM_RESIDENT, "Resident memory", vm_get_rescnt);
VMM_STAT_FUNC(VMM_MEM_WIRED, "Wired memory", vm_get_wiredcnt);