/* vmx.c -- bhyve Intel VT-x (VMX) support, revision 245652 */
1145519Sdarrenr/*- 2145510Sdarrenr * Copyright (c) 2011 NetApp, Inc. 3145510Sdarrenr * All rights reserved. 4170268Sdarrenr * 5145510Sdarrenr * Redistribution and use in source and binary forms, with or without 6145510Sdarrenr * modification, are permitted provided that the following conditions 7145510Sdarrenr * are met: 8145510Sdarrenr * 1. Redistributions of source code must retain the above copyright 9145510Sdarrenr * notice, this list of conditions and the following disclaimer. 10145510Sdarrenr * 2. Redistributions in binary form must reproduce the above copyright 11145510Sdarrenr * notice, this list of conditions and the following disclaimer in the 12145510Sdarrenr * documentation and/or other materials provided with the distribution. 13145510Sdarrenr * 14145510Sdarrenr * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND 15145510Sdarrenr * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16145510Sdarrenr * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17145510Sdarrenr * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE 18145510Sdarrenr * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19145510Sdarrenr * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 20145510Sdarrenr * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 21145510Sdarrenr * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22145510Sdarrenr * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23145510Sdarrenr * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 24145510Sdarrenr * SUCH DAMAGE. 
25145510Sdarrenr * 26145510Sdarrenr * $FreeBSD$ 27145510Sdarrenr */ 28145510Sdarrenr 29145510Sdarrenr#include <sys/cdefs.h> 30145510Sdarrenr__FBSDID("$FreeBSD$"); 31145510Sdarrenr 32145510Sdarrenr#include <sys/param.h> 33145510Sdarrenr#include <sys/systm.h> 34145510Sdarrenr#include <sys/smp.h> 35145510Sdarrenr#include <sys/kernel.h> 36145510Sdarrenr#include <sys/malloc.h> 37145510Sdarrenr#include <sys/pcpu.h> 38145510Sdarrenr#include <sys/proc.h> 39145510Sdarrenr 40145510Sdarrenr#include <vm/vm.h> 41145510Sdarrenr#include <vm/pmap.h> 42145510Sdarrenr 43145510Sdarrenr#include <machine/psl.h> 44145510Sdarrenr#include <machine/cpufunc.h> 45145510Sdarrenr#include <machine/md_var.h> 46145510Sdarrenr#include <machine/pmap.h> 47145510Sdarrenr#include <machine/segments.h> 48145510Sdarrenr#include <machine/specialreg.h> 49145510Sdarrenr#include <machine/vmparam.h> 50145510Sdarrenr 51145510Sdarrenr#include <x86/apicreg.h> 52145510Sdarrenr 53145510Sdarrenr#include <machine/vmm.h> 54145510Sdarrenr#include "vmm_host.h" 55145510Sdarrenr#include "vmm_lapic.h" 56145510Sdarrenr#include "vmm_msr.h" 57145510Sdarrenr#include "vmm_ktr.h" 58145510Sdarrenr#include "vmm_stat.h" 59145510Sdarrenr 60145510Sdarrenr#include "vmx_msr.h" 61145510Sdarrenr#include "ept.h" 62145510Sdarrenr#include "vmx_cpufunc.h" 63145510Sdarrenr#include "vmx.h" 64145510Sdarrenr#include "x86.h" 65145510Sdarrenr#include "vmx_controls.h" 66145510Sdarrenr 67145510Sdarrenr#define PINBASED_CTLS_ONE_SETTING \ 68145510Sdarrenr (PINBASED_EXTINT_EXITING | \ 69145510Sdarrenr PINBASED_NMI_EXITING | \ 70145510Sdarrenr PINBASED_VIRTUAL_NMI) 71145510Sdarrenr#define PINBASED_CTLS_ZERO_SETTING 0 72145510Sdarrenr 73145510Sdarrenr#define PROCBASED_CTLS_WINDOW_SETTING \ 74145510Sdarrenr (PROCBASED_INT_WINDOW_EXITING | \ 75145510Sdarrenr PROCBASED_NMI_WINDOW_EXITING) 76145510Sdarrenr 77145510Sdarrenr#define PROCBASED_CTLS_ONE_SETTING \ 78145510Sdarrenr (PROCBASED_SECONDARY_CONTROLS | \ 79145510Sdarrenr PROCBASED_IO_EXITING | \ 
80145510Sdarrenr PROCBASED_MSR_BITMAPS | \ 81145510Sdarrenr PROCBASED_CTLS_WINDOW_SETTING) 82145510Sdarrenr#define PROCBASED_CTLS_ZERO_SETTING \ 83145510Sdarrenr (PROCBASED_CR3_LOAD_EXITING | \ 84145510Sdarrenr PROCBASED_CR3_STORE_EXITING | \ 85145510Sdarrenr PROCBASED_IO_BITMAPS) 86145510Sdarrenr 87145510Sdarrenr#define PROCBASED_CTLS2_ONE_SETTING PROCBASED2_ENABLE_EPT 88145510Sdarrenr#define PROCBASED_CTLS2_ZERO_SETTING 0 89145510Sdarrenr 90145510Sdarrenr#define VM_EXIT_CTLS_ONE_SETTING_NO_PAT \ 91145510Sdarrenr (VM_EXIT_HOST_LMA | \ 92145510Sdarrenr VM_EXIT_SAVE_EFER | \ 93145510Sdarrenr VM_EXIT_LOAD_EFER) 94145510Sdarrenr 95145510Sdarrenr#define VM_EXIT_CTLS_ONE_SETTING \ 96145510Sdarrenr (VM_EXIT_CTLS_ONE_SETTING_NO_PAT | \ 97145510Sdarrenr VM_EXIT_SAVE_PAT | \ 98145510Sdarrenr VM_EXIT_LOAD_PAT) 99145510Sdarrenr#define VM_EXIT_CTLS_ZERO_SETTING VM_EXIT_SAVE_DEBUG_CONTROLS 100145510Sdarrenr 101145510Sdarrenr#define VM_ENTRY_CTLS_ONE_SETTING_NO_PAT VM_ENTRY_LOAD_EFER 102145510Sdarrenr 103145510Sdarrenr#define VM_ENTRY_CTLS_ONE_SETTING \ 104145510Sdarrenr (VM_ENTRY_CTLS_ONE_SETTING_NO_PAT | \ 105145510Sdarrenr VM_ENTRY_LOAD_PAT) 106145510Sdarrenr#define VM_ENTRY_CTLS_ZERO_SETTING \ 107145510Sdarrenr (VM_ENTRY_LOAD_DEBUG_CONTROLS | \ 108145510Sdarrenr VM_ENTRY_INTO_SMM | \ 109145510Sdarrenr VM_ENTRY_DEACTIVATE_DUAL_MONITOR) 110145510Sdarrenr 111145510Sdarrenr#define guest_msr_rw(vmx, msr) \ 112145510Sdarrenr msr_bitmap_change_access((vmx)->msr_bitmap, (msr), MSR_BITMAP_ACCESS_RW) 113145510Sdarrenr 114145510Sdarrenr#define HANDLED 1 115145510Sdarrenr#define UNHANDLED 0 116145510Sdarrenr 117145510SdarrenrMALLOC_DEFINE(M_VMX, "vmx", "vmx"); 118145510Sdarrenr 119145510Sdarrenrint vmxon_enabled[MAXCPU]; 120145510Sdarrenrstatic char vmxon_region[MAXCPU][PAGE_SIZE] __aligned(PAGE_SIZE); 121145510Sdarrenr 122145510Sdarrenrstatic uint32_t pinbased_ctls, procbased_ctls, procbased_ctls2; 123145510Sdarrenrstatic uint32_t exit_ctls, entry_ctls; 124145510Sdarrenr 
125145510Sdarrenrstatic uint64_t cr0_ones_mask, cr0_zeros_mask; 126145510Sdarrenrstatic uint64_t cr4_ones_mask, cr4_zeros_mask; 127145510Sdarrenr 128145510Sdarrenrstatic volatile u_int nextvpid; 129145510Sdarrenr 130145510Sdarrenrstatic int vmx_no_patmsr; 131145510Sdarrenr 132145510Sdarrenr/* 133145510Sdarrenr * Virtual NMI blocking conditions. 134145510Sdarrenr * 135145510Sdarrenr * Some processor implementations also require NMI to be blocked if 136145510Sdarrenr * the STI_BLOCKING bit is set. It is possible to detect this at runtime 137145510Sdarrenr * based on the (exit_reason,exit_qual) tuple being set to 138145510Sdarrenr * (EXIT_REASON_INVAL_VMCS, EXIT_QUAL_NMI_WHILE_STI_BLOCKING). 139145510Sdarrenr * 140145510Sdarrenr * We take the easy way out and also include STI_BLOCKING as one of the 141145510Sdarrenr * gating items for vNMI injection. 142145510Sdarrenr */ 143145510Sdarrenrstatic uint64_t nmi_blocking_bits = VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING | 144145510Sdarrenr VMCS_INTERRUPTIBILITY_NMI_BLOCKING | 145145510Sdarrenr VMCS_INTERRUPTIBILITY_STI_BLOCKING; 146145510Sdarrenr 147145510Sdarrenr/* 148145510Sdarrenr * Optional capabilities 149145510Sdarrenr */ 150145510Sdarrenrstatic int cap_halt_exit; 151145510Sdarrenrstatic int cap_pause_exit; 152145510Sdarrenrstatic int cap_unrestricted_guest; 153145510Sdarrenrstatic int cap_monitor_trap; 154145510Sdarrenr 155145510Sdarrenr/* statistics */ 156145510Sdarrenrstatic VMM_STAT_DEFINE(VCPU_MIGRATIONS, "vcpu migration across host cpus"); 157145510Sdarrenrstatic VMM_STAT_DEFINE(VMEXIT_EXTINT, "vm exits due to external interrupt"); 158145510Sdarrenrstatic VMM_STAT_DEFINE(VMEXIT_HLT_IGNORED, "number of times hlt was ignored"); 159145510Sdarrenrstatic VMM_STAT_DEFINE(VMEXIT_HLT, "number of times hlt was intercepted"); 160145510Sdarrenr 161145510Sdarrenr#ifdef KTR 162145510Sdarrenrstatic const char * 163145510Sdarrenrexit_reason_to_str(int reason) 164145510Sdarrenr{ 165145510Sdarrenr static char reasonbuf[32]; 
166145510Sdarrenr 167145510Sdarrenr switch (reason) { 168145510Sdarrenr case EXIT_REASON_EXCEPTION: 169145510Sdarrenr return "exception"; 170145510Sdarrenr case EXIT_REASON_EXT_INTR: 171145510Sdarrenr return "extint"; 172145510Sdarrenr case EXIT_REASON_TRIPLE_FAULT: 173145510Sdarrenr return "triplefault"; 174145510Sdarrenr case EXIT_REASON_INIT: 175161357Sguido return "init"; 176161357Sguido case EXIT_REASON_SIPI: 177145510Sdarrenr return "sipi"; 178145510Sdarrenr case EXIT_REASON_IO_SMI: 179145510Sdarrenr return "iosmi"; 180145510Sdarrenr case EXIT_REASON_SMI: 181145510Sdarrenr return "smi"; 182145510Sdarrenr case EXIT_REASON_INTR_WINDOW: 183145510Sdarrenr return "intrwindow"; 184145510Sdarrenr case EXIT_REASON_NMI_WINDOW: 185145510Sdarrenr return "nmiwindow"; 186145510Sdarrenr case EXIT_REASON_TASK_SWITCH: 187145510Sdarrenr return "taskswitch"; 188145510Sdarrenr case EXIT_REASON_CPUID: 189145510Sdarrenr return "cpuid"; 190145510Sdarrenr case EXIT_REASON_GETSEC: 191145510Sdarrenr return "getsec"; 192145510Sdarrenr case EXIT_REASON_HLT: 193145510Sdarrenr return "hlt"; 194145510Sdarrenr case EXIT_REASON_INVD: 195145510Sdarrenr return "invd"; 196145510Sdarrenr case EXIT_REASON_INVLPG: 197145510Sdarrenr return "invlpg"; 198145510Sdarrenr case EXIT_REASON_RDPMC: 199145510Sdarrenr return "rdpmc"; 200145510Sdarrenr case EXIT_REASON_RDTSC: 201145510Sdarrenr return "rdtsc"; 202145510Sdarrenr case EXIT_REASON_RSM: 203145510Sdarrenr return "rsm"; 204145510Sdarrenr case EXIT_REASON_VMCALL: 205145510Sdarrenr return "vmcall"; 206145510Sdarrenr case EXIT_REASON_VMCLEAR: 207145510Sdarrenr return "vmclear"; 208145510Sdarrenr case EXIT_REASON_VMLAUNCH: 209145510Sdarrenr return "vmlaunch"; 210145510Sdarrenr case EXIT_REASON_VMPTRLD: 211145510Sdarrenr return "vmptrld"; 212145510Sdarrenr case EXIT_REASON_VMPTRST: 213145510Sdarrenr return "vmptrst"; 214145510Sdarrenr case EXIT_REASON_VMREAD: 215145510Sdarrenr return "vmread"; 216145510Sdarrenr case EXIT_REASON_VMRESUME: 
217145510Sdarrenr return "vmresume"; 218145510Sdarrenr case EXIT_REASON_VMWRITE: 219145510Sdarrenr return "vmwrite"; 220145510Sdarrenr case EXIT_REASON_VMXOFF: 221145510Sdarrenr return "vmxoff"; 222145510Sdarrenr case EXIT_REASON_VMXON: 223145510Sdarrenr return "vmxon"; 224145510Sdarrenr case EXIT_REASON_CR_ACCESS: 225145510Sdarrenr return "craccess"; 226145510Sdarrenr case EXIT_REASON_DR_ACCESS: 227145510Sdarrenr return "draccess"; 228145510Sdarrenr case EXIT_REASON_INOUT: 229145510Sdarrenr return "inout"; 230145510Sdarrenr case EXIT_REASON_RDMSR: 231145510Sdarrenr return "rdmsr"; 232145510Sdarrenr case EXIT_REASON_WRMSR: 233145510Sdarrenr return "wrmsr"; 234145510Sdarrenr case EXIT_REASON_INVAL_VMCS: 235145510Sdarrenr return "invalvmcs"; 236145510Sdarrenr case EXIT_REASON_INVAL_MSR: 237145510Sdarrenr return "invalmsr"; 238145510Sdarrenr case EXIT_REASON_MWAIT: 239145510Sdarrenr return "mwait"; 240145510Sdarrenr case EXIT_REASON_MTF: 241145510Sdarrenr return "mtf"; 242145510Sdarrenr case EXIT_REASON_MONITOR: 243145510Sdarrenr return "monitor"; 244145510Sdarrenr case EXIT_REASON_PAUSE: 245145510Sdarrenr return "pause"; 246145510Sdarrenr case EXIT_REASON_MCE: 247145510Sdarrenr return "mce"; 248145510Sdarrenr case EXIT_REASON_TPR: 249145510Sdarrenr return "tpr"; 250145510Sdarrenr case EXIT_REASON_APIC: 251145510Sdarrenr return "apic"; 252145510Sdarrenr case EXIT_REASON_GDTR_IDTR: 253145510Sdarrenr return "gdtridtr"; 254145510Sdarrenr case EXIT_REASON_LDTR_TR: 255145510Sdarrenr return "ldtrtr"; 256145510Sdarrenr case EXIT_REASON_EPT_FAULT: 257145510Sdarrenr return "eptfault"; 258145510Sdarrenr case EXIT_REASON_EPT_MISCONFIG: 259145510Sdarrenr return "eptmisconfig"; 260145510Sdarrenr case EXIT_REASON_INVEPT: 261145510Sdarrenr return "invept"; 262145510Sdarrenr case EXIT_REASON_RDTSCP: 263145510Sdarrenr return "rdtscp"; 264145510Sdarrenr case EXIT_REASON_VMX_PREEMPT: 265145510Sdarrenr return "vmxpreempt"; 266145510Sdarrenr case EXIT_REASON_INVVPID: 267145510Sdarrenr 
return "invvpid"; 268145510Sdarrenr case EXIT_REASON_WBINVD: 269145510Sdarrenr return "wbinvd"; 270145510Sdarrenr case EXIT_REASON_XSETBV: 271145510Sdarrenr return "xsetbv"; 272145510Sdarrenr default: 273145510Sdarrenr snprintf(reasonbuf, sizeof(reasonbuf), "%d", reason); 274145510Sdarrenr return (reasonbuf); 275145510Sdarrenr } 276145510Sdarrenr} 277145510Sdarrenr 278145510Sdarrenr#ifdef SETJMP_TRACE 279145510Sdarrenrstatic const char * 280145510Sdarrenrvmx_setjmp_rc2str(int rc) 281145510Sdarrenr{ 282145510Sdarrenr switch (rc) { 283145510Sdarrenr case VMX_RETURN_DIRECT: 284145510Sdarrenr return "direct"; 285145510Sdarrenr case VMX_RETURN_LONGJMP: 286145510Sdarrenr return "longjmp"; 287145510Sdarrenr case VMX_RETURN_VMRESUME: 288170268Sdarrenr return "vmresume"; 289170268Sdarrenr case VMX_RETURN_VMLAUNCH: 290170268Sdarrenr return "vmlaunch"; 291145510Sdarrenr case VMX_RETURN_AST: 292145510Sdarrenr return "ast"; 293145510Sdarrenr default: 294145510Sdarrenr return "unknown"; 295145510Sdarrenr } 296145510Sdarrenr} 297145510Sdarrenr 298145510Sdarrenr#define SETJMP_TRACE(vmx, vcpu, vmxctx, regname) \ 299145510Sdarrenr VMM_CTR1((vmx)->vm, (vcpu), "setjmp trace " #regname " 0x%016lx", \ 300145510Sdarrenr (vmxctx)->regname) 301145510Sdarrenr 302145510Sdarrenrstatic void 303145510Sdarrenrvmx_setjmp_trace(struct vmx *vmx, int vcpu, struct vmxctx *vmxctx, int rc) 304145510Sdarrenr{ 305145510Sdarrenr uint64_t host_rip, host_rsp; 306145510Sdarrenr 307145510Sdarrenr if (vmxctx != &vmx->ctx[vcpu]) 308145510Sdarrenr panic("vmx_setjmp_trace: invalid vmxctx %p; should be %p", 309145510Sdarrenr vmxctx, &vmx->ctx[vcpu]); 310145510Sdarrenr 311145510Sdarrenr VMM_CTR1((vmx)->vm, (vcpu), "vmxctx = %p", vmxctx); 312145510Sdarrenr VMM_CTR2((vmx)->vm, (vcpu), "setjmp return code %s(%d)", 313145510Sdarrenr vmx_setjmp_rc2str(rc), rc); 314145510Sdarrenr 315145510Sdarrenr host_rsp = host_rip = ~0; 316145510Sdarrenr vmread(VMCS_HOST_RIP, &host_rip); 317145510Sdarrenr vmread(VMCS_HOST_RSP, 
&host_rsp); 318145510Sdarrenr VMM_CTR2((vmx)->vm, (vcpu), "vmcs host_rip 0x%016lx, host_rsp 0x%016lx", 319145510Sdarrenr host_rip, host_rsp); 320145510Sdarrenr 321145510Sdarrenr SETJMP_TRACE(vmx, vcpu, vmxctx, host_r15); 322145510Sdarrenr SETJMP_TRACE(vmx, vcpu, vmxctx, host_r14); 323145510Sdarrenr SETJMP_TRACE(vmx, vcpu, vmxctx, host_r13); 324145510Sdarrenr SETJMP_TRACE(vmx, vcpu, vmxctx, host_r12); 325145510Sdarrenr SETJMP_TRACE(vmx, vcpu, vmxctx, host_rbp); 326145510Sdarrenr SETJMP_TRACE(vmx, vcpu, vmxctx, host_rsp); 327145510Sdarrenr SETJMP_TRACE(vmx, vcpu, vmxctx, host_rbx); 328145510Sdarrenr SETJMP_TRACE(vmx, vcpu, vmxctx, host_rip); 329145510Sdarrenr 330145510Sdarrenr SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rdi); 331145510Sdarrenr SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rsi); 332145510Sdarrenr SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rdx); 333145510Sdarrenr SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rcx); 334145510Sdarrenr SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r8); 335145510Sdarrenr SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r9); 336145510Sdarrenr SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rax); 337145510Sdarrenr SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rbx); 338145510Sdarrenr SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rbp); 339145510Sdarrenr SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r10); 340145510Sdarrenr SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r11); 341145510Sdarrenr SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r12); 342145510Sdarrenr SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r13); 343145510Sdarrenr SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r14); 344145510Sdarrenr SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r15); 345145510Sdarrenr SETJMP_TRACE(vmx, vcpu, vmxctx, guest_cr2); 346145510Sdarrenr} 347145510Sdarrenr#endif 348145510Sdarrenr#else 349145510Sdarrenrstatic void __inline 350145510Sdarrenrvmx_setjmp_trace(struct vmx *vmx, int vcpu, struct vmxctx *vmxctx, int rc) 351145510Sdarrenr{ 352145510Sdarrenr return; 353145510Sdarrenr} 354145510Sdarrenr#endif /* KTR */ 355145510Sdarrenr 356145510Sdarrenru_long 
357145510Sdarrenrvmx_fix_cr0(u_long cr0) 358145510Sdarrenr{ 359145510Sdarrenr 360145510Sdarrenr return ((cr0 | cr0_ones_mask) & ~cr0_zeros_mask); 361145510Sdarrenr} 362145510Sdarrenr 363145510Sdarrenru_long 364145510Sdarrenrvmx_fix_cr4(u_long cr4) 365145510Sdarrenr{ 366145510Sdarrenr 367145510Sdarrenr return ((cr4 | cr4_ones_mask) & ~cr4_zeros_mask); 368145510Sdarrenr} 369145510Sdarrenr 370145510Sdarrenrstatic void 371145510Sdarrenrmsr_save_area_init(struct msr_entry *g_area, int *g_count) 372145510Sdarrenr{ 373145510Sdarrenr int cnt; 374145510Sdarrenr 375145510Sdarrenr static struct msr_entry guest_msrs[] = { 376145510Sdarrenr { MSR_KGSBASE, 0, 0 }, 377145510Sdarrenr }; 378145510Sdarrenr 379145510Sdarrenr cnt = sizeof(guest_msrs) / sizeof(guest_msrs[0]); 380145510Sdarrenr if (cnt > GUEST_MSR_MAX_ENTRIES) 381145510Sdarrenr panic("guest msr save area overrun"); 382145510Sdarrenr bcopy(guest_msrs, g_area, sizeof(guest_msrs)); 383145510Sdarrenr *g_count = cnt; 384145510Sdarrenr} 385145510Sdarrenr 386145510Sdarrenrstatic void 387145510Sdarrenrvmx_disable(void *arg __unused) 388145510Sdarrenr{ 389145510Sdarrenr struct invvpid_desc invvpid_desc = { 0 }; 390145510Sdarrenr struct invept_desc invept_desc = { 0 }; 391145510Sdarrenr 392145510Sdarrenr if (vmxon_enabled[curcpu]) { 393145510Sdarrenr /* 394145510Sdarrenr * See sections 25.3.3.3 and 25.3.3.4 in Intel Vol 3b. 395145510Sdarrenr * 396145510Sdarrenr * VMXON or VMXOFF are not required to invalidate any TLB 397145510Sdarrenr * caching structures. This prevents potential retention of 398145510Sdarrenr * cached information in the TLB between distinct VMX episodes. 
399145510Sdarrenr */ 400145510Sdarrenr invvpid(INVVPID_TYPE_ALL_CONTEXTS, invvpid_desc); 401145510Sdarrenr invept(INVEPT_TYPE_ALL_CONTEXTS, invept_desc); 402145510Sdarrenr vmxoff(); 403145510Sdarrenr } 404145510Sdarrenr load_cr4(rcr4() & ~CR4_VMXE); 405145510Sdarrenr} 406145510Sdarrenr 407145510Sdarrenrstatic int 408145510Sdarrenrvmx_cleanup(void) 409145510Sdarrenr{ 410145510Sdarrenr 411145510Sdarrenr smp_rendezvous(NULL, vmx_disable, NULL, NULL); 412145510Sdarrenr 413145510Sdarrenr return (0); 414145510Sdarrenr} 415145510Sdarrenr 416145510Sdarrenrstatic void 417145510Sdarrenrvmx_enable(void *arg __unused) 418145510Sdarrenr{ 419145510Sdarrenr int error; 420145510Sdarrenr 421145510Sdarrenr load_cr4(rcr4() | CR4_VMXE); 422145510Sdarrenr 423145510Sdarrenr *(uint32_t *)vmxon_region[curcpu] = vmx_revision(); 424145510Sdarrenr error = vmxon(vmxon_region[curcpu]); 425145510Sdarrenr if (error == 0) 426145510Sdarrenr vmxon_enabled[curcpu] = 1; 427145510Sdarrenr} 428145510Sdarrenr 429145510Sdarrenrstatic int 430145510Sdarrenrvmx_init(void) 431145510Sdarrenr{ 432145510Sdarrenr int error; 433145510Sdarrenr uint64_t fixed0, fixed1, feature_control; 434145510Sdarrenr uint32_t tmp; 435145510Sdarrenr 436145510Sdarrenr /* CPUID.1:ECX[bit 5] must be 1 for processor to support VMX */ 437145510Sdarrenr if (!(cpu_feature2 & CPUID2_VMX)) { 438145510Sdarrenr printf("vmx_init: processor does not support VMX operation\n"); 439145510Sdarrenr return (ENXIO); 440145510Sdarrenr } 441145510Sdarrenr 442145510Sdarrenr /* 443145510Sdarrenr * Verify that MSR_IA32_FEATURE_CONTROL lock and VMXON enable bits 444145510Sdarrenr * are set (bits 0 and 2 respectively). 
445145510Sdarrenr */ 446145510Sdarrenr feature_control = rdmsr(MSR_IA32_FEATURE_CONTROL); 447145510Sdarrenr if ((feature_control & 0x5) != 0x5) { 448145510Sdarrenr printf("vmx_init: VMX operation disabled by BIOS\n"); 449145510Sdarrenr return (ENXIO); 450145510Sdarrenr } 451145510Sdarrenr 452145510Sdarrenr /* Check support for primary processor-based VM-execution controls */ 453145510Sdarrenr error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, 454145510Sdarrenr MSR_VMX_TRUE_PROCBASED_CTLS, 455145510Sdarrenr PROCBASED_CTLS_ONE_SETTING, 456145510Sdarrenr PROCBASED_CTLS_ZERO_SETTING, &procbased_ctls); 457145510Sdarrenr if (error) { 458145510Sdarrenr printf("vmx_init: processor does not support desired primary " 459145510Sdarrenr "processor-based controls\n"); 460145510Sdarrenr return (error); 461145510Sdarrenr } 462145510Sdarrenr 463145510Sdarrenr /* Clear the processor-based ctl bits that are set on demand */ 464145510Sdarrenr procbased_ctls &= ~PROCBASED_CTLS_WINDOW_SETTING; 465145510Sdarrenr 466145510Sdarrenr /* Check support for secondary processor-based VM-execution controls */ 467145510Sdarrenr error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, 468145510Sdarrenr MSR_VMX_PROCBASED_CTLS2, 469145510Sdarrenr PROCBASED_CTLS2_ONE_SETTING, 470145510Sdarrenr PROCBASED_CTLS2_ZERO_SETTING, &procbased_ctls2); 471170268Sdarrenr if (error) { 472170268Sdarrenr printf("vmx_init: processor does not support desired secondary " 473170268Sdarrenr "processor-based controls\n"); 474145510Sdarrenr return (error); 475145510Sdarrenr } 476145510Sdarrenr 477145510Sdarrenr /* Check support for VPID */ 478145510Sdarrenr error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2, 479145510Sdarrenr PROCBASED2_ENABLE_VPID, 0, &tmp); 480145510Sdarrenr if (error == 0) 481145510Sdarrenr procbased_ctls2 |= PROCBASED2_ENABLE_VPID; 482145510Sdarrenr 483145510Sdarrenr /* Check support for pin-based VM-execution controls */ 484145510Sdarrenr error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS, 
485145510Sdarrenr MSR_VMX_TRUE_PINBASED_CTLS, 486145510Sdarrenr PINBASED_CTLS_ONE_SETTING, 487145510Sdarrenr PINBASED_CTLS_ZERO_SETTING, &pinbased_ctls); 488145510Sdarrenr if (error) { 489145510Sdarrenr printf("vmx_init: processor does not support desired " 490145510Sdarrenr "pin-based controls\n"); 491145510Sdarrenr return (error); 492145510Sdarrenr } 493145510Sdarrenr 494145510Sdarrenr /* Check support for VM-exit controls */ 495145510Sdarrenr error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS, MSR_VMX_TRUE_EXIT_CTLS, 496145510Sdarrenr VM_EXIT_CTLS_ONE_SETTING, 497145510Sdarrenr VM_EXIT_CTLS_ZERO_SETTING, 498145510Sdarrenr &exit_ctls); 499145510Sdarrenr if (error) { 500170268Sdarrenr /* Try again without the PAT MSR bits */ 501170268Sdarrenr error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS, 502145510Sdarrenr MSR_VMX_TRUE_EXIT_CTLS, 503145510Sdarrenr VM_EXIT_CTLS_ONE_SETTING_NO_PAT, 504145510Sdarrenr VM_EXIT_CTLS_ZERO_SETTING, 505145510Sdarrenr &exit_ctls); 506145510Sdarrenr if (error) { 507145510Sdarrenr printf("vmx_init: processor does not support desired " 508145510Sdarrenr "exit controls\n"); 509145510Sdarrenr return (error); 510145510Sdarrenr } else { 511145510Sdarrenr if (bootverbose) 512145510Sdarrenr printf("vmm: PAT MSR access not supported\n"); 513145510Sdarrenr guest_msr_valid(MSR_PAT); 514145510Sdarrenr vmx_no_patmsr = 1; 515145510Sdarrenr } 516145510Sdarrenr } 517145510Sdarrenr 518145510Sdarrenr /* Check support for VM-entry controls */ 519145510Sdarrenr if (!vmx_no_patmsr) { 520145510Sdarrenr error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS, 521145510Sdarrenr MSR_VMX_TRUE_ENTRY_CTLS, 522145510Sdarrenr VM_ENTRY_CTLS_ONE_SETTING, 523145510Sdarrenr VM_ENTRY_CTLS_ZERO_SETTING, 524145510Sdarrenr &entry_ctls); 525145510Sdarrenr } else { 526145510Sdarrenr error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS, 527145510Sdarrenr MSR_VMX_TRUE_ENTRY_CTLS, 528145510Sdarrenr VM_ENTRY_CTLS_ONE_SETTING_NO_PAT, 529145510Sdarrenr VM_ENTRY_CTLS_ZERO_SETTING, 530145510Sdarrenr &entry_ctls); 
531145510Sdarrenr } 532145510Sdarrenr 533145510Sdarrenr if (error) { 534145510Sdarrenr printf("vmx_init: processor does not support desired " 535145510Sdarrenr "entry controls\n"); 536145510Sdarrenr return (error); 537145510Sdarrenr } 538145510Sdarrenr 539145510Sdarrenr /* 540145510Sdarrenr * Check support for optional features by testing them 541145510Sdarrenr * as individual bits 542145510Sdarrenr */ 543145510Sdarrenr cap_halt_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, 544145510Sdarrenr MSR_VMX_TRUE_PROCBASED_CTLS, 545145510Sdarrenr PROCBASED_HLT_EXITING, 0, 546145510Sdarrenr &tmp) == 0); 547145510Sdarrenr 548145510Sdarrenr cap_monitor_trap = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, 549145510Sdarrenr MSR_VMX_PROCBASED_CTLS, 550145510Sdarrenr PROCBASED_MTF, 0, 551145510Sdarrenr &tmp) == 0); 552145510Sdarrenr 553145510Sdarrenr cap_pause_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, 554145510Sdarrenr MSR_VMX_TRUE_PROCBASED_CTLS, 555145510Sdarrenr PROCBASED_PAUSE_EXITING, 0, 556145510Sdarrenr &tmp) == 0); 557145510Sdarrenr 558145510Sdarrenr cap_unrestricted_guest = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, 559145510Sdarrenr MSR_VMX_PROCBASED_CTLS2, 560145510Sdarrenr PROCBASED2_UNRESTRICTED_GUEST, 0, 561145510Sdarrenr &tmp) == 0); 562145510Sdarrenr 563145510Sdarrenr /* Initialize EPT */ 564145510Sdarrenr error = ept_init(); 565145510Sdarrenr if (error) { 566145510Sdarrenr printf("vmx_init: ept initialization failed (%d)\n", error); 567145510Sdarrenr return (error); 568145510Sdarrenr } 569145510Sdarrenr 570145510Sdarrenr /* 571145510Sdarrenr * Stash the cr0 and cr4 bits that must be fixed to 0 or 1 572145510Sdarrenr */ 573145510Sdarrenr fixed0 = rdmsr(MSR_VMX_CR0_FIXED0); 574145510Sdarrenr fixed1 = rdmsr(MSR_VMX_CR0_FIXED1); 575145510Sdarrenr cr0_ones_mask = fixed0 & fixed1; 576145510Sdarrenr cr0_zeros_mask = ~fixed0 & ~fixed1; 577145510Sdarrenr 578145510Sdarrenr /* 579145510Sdarrenr * CR0_PE and CR0_PG can be set to zero in VMX non-root operation 580145510Sdarrenr 
* if unrestricted guest execution is allowed. 581145510Sdarrenr */ 582145510Sdarrenr if (cap_unrestricted_guest) 583145510Sdarrenr cr0_ones_mask &= ~(CR0_PG | CR0_PE); 584145510Sdarrenr 585145510Sdarrenr /* 586145510Sdarrenr * Do not allow the guest to set CR0_NW or CR0_CD. 587145510Sdarrenr */ 588145510Sdarrenr cr0_zeros_mask |= (CR0_NW | CR0_CD); 589145510Sdarrenr 590145510Sdarrenr fixed0 = rdmsr(MSR_VMX_CR4_FIXED0); 591145510Sdarrenr fixed1 = rdmsr(MSR_VMX_CR4_FIXED1); 592145510Sdarrenr cr4_ones_mask = fixed0 & fixed1; 593145510Sdarrenr cr4_zeros_mask = ~fixed0 & ~fixed1; 594145510Sdarrenr 595145510Sdarrenr /* enable VMX operation */ 596145510Sdarrenr smp_rendezvous(NULL, vmx_enable, NULL, NULL); 597145510Sdarrenr 598145510Sdarrenr return (0); 599145510Sdarrenr} 600145510Sdarrenr 601145510Sdarrenr/* 602145510Sdarrenr * If this processor does not support VPIDs then simply return 0. 603145510Sdarrenr * 604145510Sdarrenr * Otherwise generate the next value of VPID to use. Any value is alright 605145510Sdarrenr * as long as it is non-zero. 606145510Sdarrenr * 607145510Sdarrenr * We always execute in VMX non-root context with EPT enabled. Thus all 608145510Sdarrenr * combined mappings are tagged with the (EP4TA, VPID, PCID) tuple. This 609145510Sdarrenr * in turn means that multiple VMs can share the same VPID as long as 610145510Sdarrenr * they have distinct EPT page tables. 611145510Sdarrenr * 612145510Sdarrenr * XXX 613145510Sdarrenr * We should optimize this so that it returns VPIDs that are not in 614145510Sdarrenr * use. Then we will not unnecessarily invalidate mappings in 615145510Sdarrenr * vmx_set_pcpu_defaults() just because two or more vcpus happen to 616170268Sdarrenr * use the same 'vpid'. 
617170268Sdarrenr */ 618145510Sdarrenrstatic uint16_t 619145510Sdarrenrvmx_vpid(void) 620145510Sdarrenr{ 621145510Sdarrenr uint16_t vpid = 0; 622145510Sdarrenr 623145510Sdarrenr if ((procbased_ctls2 & PROCBASED2_ENABLE_VPID) != 0) { 624145510Sdarrenr do { 625145510Sdarrenr vpid = atomic_fetchadd_int(&nextvpid, 1); 626145510Sdarrenr } while (vpid == 0); 627145510Sdarrenr } 628145510Sdarrenr 629145510Sdarrenr return (vpid); 630145510Sdarrenr} 631145510Sdarrenr 632145510Sdarrenrstatic int 633145510Sdarrenrvmx_setup_cr_shadow(int which, struct vmcs *vmcs) 634145510Sdarrenr{ 635145510Sdarrenr int error, mask_ident, shadow_ident; 636145510Sdarrenr uint64_t mask_value, shadow_value; 637145510Sdarrenr 638145510Sdarrenr if (which != 0 && which != 4) 639145510Sdarrenr panic("vmx_setup_cr_shadow: unknown cr%d", which); 640145510Sdarrenr 641 if (which == 0) { 642 mask_ident = VMCS_CR0_MASK; 643 mask_value = cr0_ones_mask | cr0_zeros_mask; 644 shadow_ident = VMCS_CR0_SHADOW; 645 shadow_value = cr0_ones_mask; 646 } else { 647 mask_ident = VMCS_CR4_MASK; 648 mask_value = cr4_ones_mask | cr4_zeros_mask; 649 shadow_ident = VMCS_CR4_SHADOW; 650 shadow_value = cr4_ones_mask; 651 } 652 653 error = vmcs_setreg(vmcs, VMCS_IDENT(mask_ident), mask_value); 654 if (error) 655 return (error); 656 657 error = vmcs_setreg(vmcs, VMCS_IDENT(shadow_ident), shadow_value); 658 if (error) 659 return (error); 660 661 return (0); 662} 663#define vmx_setup_cr0_shadow(vmcs) vmx_setup_cr_shadow(0, (vmcs)) 664#define vmx_setup_cr4_shadow(vmcs) vmx_setup_cr_shadow(4, (vmcs)) 665 666static void * 667vmx_vminit(struct vm *vm) 668{ 669 uint16_t vpid; 670 int i, error, guest_msr_count; 671 struct vmx *vmx; 672 673 vmx = malloc(sizeof(struct vmx), M_VMX, M_WAITOK | M_ZERO); 674 if ((uintptr_t)vmx & PAGE_MASK) { 675 panic("malloc of struct vmx not aligned on %d byte boundary", 676 PAGE_SIZE); 677 } 678 vmx->vm = vm; 679 680 /* 681 * Clean up EPTP-tagged guest physical and combined mappings 682 * 683 * VMX 
transitions are not required to invalidate any guest physical 684 * mappings. So, it may be possible for stale guest physical mappings 685 * to be present in the processor TLBs. 686 * 687 * Combined mappings for this EP4TA are also invalidated for all VPIDs. 688 */ 689 ept_invalidate_mappings(vtophys(vmx->pml4ept)); 690 691 msr_bitmap_initialize(vmx->msr_bitmap); 692 693 /* 694 * It is safe to allow direct access to MSR_GSBASE and MSR_FSBASE. 695 * The guest FSBASE and GSBASE are saved and restored during 696 * vm-exit and vm-entry respectively. The host FSBASE and GSBASE are 697 * always restored from the vmcs host state area on vm-exit. 698 * 699 * Guest KGSBASE is saved and restored in the guest MSR save area. 700 * Host KGSBASE is restored before returning to userland from the pcb. 701 * There will be a window of time when we are executing in the host 702 * kernel context with a value of KGSBASE from the guest. This is ok 703 * because the value of KGSBASE is inconsequential in kernel context. 704 * 705 * MSR_EFER is saved and restored in the guest VMCS area on a 706 * VM exit and entry respectively. It is also restored from the 707 * host VMCS area on a VM exit. 708 */ 709 if (guest_msr_rw(vmx, MSR_GSBASE) || 710 guest_msr_rw(vmx, MSR_FSBASE) || 711 guest_msr_rw(vmx, MSR_KGSBASE) || 712 guest_msr_rw(vmx, MSR_EFER)) 713 panic("vmx_vminit: error setting guest msr access"); 714 715 /* 716 * MSR_PAT is saved and restored in the guest VMCS are on a VM exit 717 * and entry respectively. It is also restored from the host VMCS 718 * area on a VM exit. However, if running on a system with no 719 * MSR_PAT save/restore support, leave access disabled so accesses 720 * will be trapped. 
721 */ 722 if (!vmx_no_patmsr && guest_msr_rw(vmx, MSR_PAT)) 723 panic("vmx_vminit: error setting guest pat msr access"); 724 725 for (i = 0; i < VM_MAXCPU; i++) { 726 vmx->vmcs[i].identifier = vmx_revision(); 727 error = vmclear(&vmx->vmcs[i]); 728 if (error != 0) { 729 panic("vmx_vminit: vmclear error %d on vcpu %d\n", 730 error, i); 731 } 732 733 vpid = vmx_vpid(); 734 735 error = vmcs_set_defaults(&vmx->vmcs[i], 736 (u_long)vmx_longjmp, 737 (u_long)&vmx->ctx[i], 738 vtophys(vmx->pml4ept), 739 pinbased_ctls, 740 procbased_ctls, 741 procbased_ctls2, 742 exit_ctls, entry_ctls, 743 vtophys(vmx->msr_bitmap), 744 vpid); 745 746 if (error != 0) 747 panic("vmx_vminit: vmcs_set_defaults error %d", error); 748 749 vmx->cap[i].set = 0; 750 vmx->cap[i].proc_ctls = procbased_ctls; 751 752 vmx->state[i].lastcpu = -1; 753 vmx->state[i].vpid = vpid; 754 755 msr_save_area_init(vmx->guest_msrs[i], &guest_msr_count); 756 757 error = vmcs_set_msr_save(&vmx->vmcs[i], 758 vtophys(vmx->guest_msrs[i]), 759 guest_msr_count); 760 if (error != 0) 761 panic("vmcs_set_msr_save error %d", error); 762 763 error = vmx_setup_cr0_shadow(&vmx->vmcs[i]); 764 if (error != 0) 765 panic("vmx_setup_cr0_shadow %d", error); 766 767 error = vmx_setup_cr4_shadow(&vmx->vmcs[i]); 768 if (error != 0) 769 panic("vmx_setup_cr4_shadow %d", error); 770 } 771 772 return (vmx); 773} 774 775static int 776vmx_handle_cpuid(struct vm *vm, int vcpu, struct vmxctx *vmxctx) 777{ 778 int handled, func; 779 780 func = vmxctx->guest_rax; 781 782 handled = x86_emulate_cpuid(vm, vcpu, 783 (uint32_t*)(&vmxctx->guest_rax), 784 (uint32_t*)(&vmxctx->guest_rbx), 785 (uint32_t*)(&vmxctx->guest_rcx), 786 (uint32_t*)(&vmxctx->guest_rdx)); 787 return (handled); 788} 789 790static __inline void 791vmx_run_trace(struct vmx *vmx, int vcpu) 792{ 793#ifdef KTR 794 VMM_CTR1(vmx->vm, vcpu, "Resume execution at 0x%0lx", vmcs_guest_rip()); 795#endif 796} 797 798static __inline void 799vmx_exit_trace(struct vmx *vmx, int vcpu, uint64_t rip, 
uint32_t exit_reason, 800 int handled) 801{ 802#ifdef KTR 803 VMM_CTR3(vmx->vm, vcpu, "%s %s vmexit at 0x%0lx", 804 handled ? "handled" : "unhandled", 805 exit_reason_to_str(exit_reason), rip); 806#endif 807} 808 809static __inline void 810vmx_astpending_trace(struct vmx *vmx, int vcpu, uint64_t rip) 811{ 812#ifdef KTR 813 VMM_CTR1(vmx->vm, vcpu, "astpending vmexit at 0x%0lx", rip); 814#endif 815} 816 817static int 818vmx_set_pcpu_defaults(struct vmx *vmx, int vcpu) 819{ 820 int error, lastcpu; 821 struct vmxstate *vmxstate; 822 struct invvpid_desc invvpid_desc = { 0 }; 823 824 vmxstate = &vmx->state[vcpu]; 825 lastcpu = vmxstate->lastcpu; 826 vmxstate->lastcpu = curcpu; 827 828 if (lastcpu == curcpu) { 829 error = 0; 830 goto done; 831 } 832 833 vmm_stat_incr(vmx->vm, vcpu, VCPU_MIGRATIONS, 1); 834 835 error = vmwrite(VMCS_HOST_TR_BASE, vmm_get_host_trbase()); 836 if (error != 0) 837 goto done; 838 839 error = vmwrite(VMCS_HOST_GDTR_BASE, vmm_get_host_gdtrbase()); 840 if (error != 0) 841 goto done; 842 843 error = vmwrite(VMCS_HOST_GS_BASE, vmm_get_host_gsbase()); 844 if (error != 0) 845 goto done; 846 847 /* 848 * If we are using VPIDs then invalidate all mappings tagged with 'vpid' 849 * 850 * We do this because this vcpu was executing on a different host 851 * cpu when it last ran. We do not track whether it invalidated 852 * mappings associated with its 'vpid' during that run. So we must 853 * assume that the mappings associated with 'vpid' on 'curcpu' are 854 * stale and invalidate them. 855 * 856 * Note that we incur this penalty only when the scheduler chooses to 857 * move the thread associated with this vcpu between host cpus. 858 * 859 * Note also that this will invalidate mappings tagged with 'vpid' 860 * for "all" EP4TAs. 
861 */ 862 if (vmxstate->vpid != 0) { 863 invvpid_desc.vpid = vmxstate->vpid; 864 invvpid(INVVPID_TYPE_SINGLE_CONTEXT, invvpid_desc); 865 } 866done: 867 return (error); 868} 869 870static void 871vm_exit_update_rip(struct vm_exit *vmexit) 872{ 873 int error; 874 875 error = vmwrite(VMCS_GUEST_RIP, vmexit->rip + vmexit->inst_length); 876 if (error) 877 panic("vmx_run: error %d writing to VMCS_GUEST_RIP", error); 878} 879 880/* 881 * We depend on 'procbased_ctls' to have the Interrupt Window Exiting bit set. 882 */ 883CTASSERT((PROCBASED_CTLS_ONE_SETTING & PROCBASED_INT_WINDOW_EXITING) != 0); 884 885static void __inline 886vmx_set_int_window_exiting(struct vmx *vmx, int vcpu) 887{ 888 int error; 889 890 vmx->cap[vcpu].proc_ctls |= PROCBASED_INT_WINDOW_EXITING; 891 892 error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); 893 if (error) 894 panic("vmx_set_int_window_exiting: vmwrite error %d", error); 895} 896 897static void __inline 898vmx_clear_int_window_exiting(struct vmx *vmx, int vcpu) 899{ 900 int error; 901 902 vmx->cap[vcpu].proc_ctls &= ~PROCBASED_INT_WINDOW_EXITING; 903 904 error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); 905 if (error) 906 panic("vmx_clear_int_window_exiting: vmwrite error %d", error); 907} 908 909static void __inline 910vmx_set_nmi_window_exiting(struct vmx *vmx, int vcpu) 911{ 912 int error; 913 914 vmx->cap[vcpu].proc_ctls |= PROCBASED_NMI_WINDOW_EXITING; 915 916 error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); 917 if (error) 918 panic("vmx_set_nmi_window_exiting: vmwrite error %d", error); 919} 920 921static void __inline 922vmx_clear_nmi_window_exiting(struct vmx *vmx, int vcpu) 923{ 924 int error; 925 926 vmx->cap[vcpu].proc_ctls &= ~PROCBASED_NMI_WINDOW_EXITING; 927 928 error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); 929 if (error) 930 panic("vmx_clear_nmi_window_exiting: vmwrite error %d", error); 931} 932 933static int 934vmx_inject_nmi(struct vmx *vmx, 
int vcpu) 935{ 936 int error; 937 uint64_t info, interruptibility; 938 939 /* Bail out if no NMI requested */ 940 if (!vm_nmi_pending(vmx->vm, vcpu)) 941 return (0); 942 943 error = vmread(VMCS_GUEST_INTERRUPTIBILITY, &interruptibility); 944 if (error) { 945 panic("vmx_inject_nmi: vmread(interruptibility) %d", 946 error); 947 } 948 if (interruptibility & nmi_blocking_bits) 949 goto nmiblocked; 950 951 /* 952 * Inject the virtual NMI. The vector must be the NMI IDT entry 953 * or the VMCS entry check will fail. 954 */ 955 info = VMCS_INTERRUPTION_INFO_NMI | VMCS_INTERRUPTION_INFO_VALID; 956 info |= IDT_NMI; 957 958 error = vmwrite(VMCS_ENTRY_INTR_INFO, info); 959 if (error) 960 panic("vmx_inject_nmi: vmwrite(intrinfo) %d", error); 961 962 VMM_CTR0(vmx->vm, vcpu, "Injecting vNMI"); 963 964 /* Clear the request */ 965 vm_nmi_clear(vmx->vm, vcpu); 966 return (1); 967 968nmiblocked: 969 /* 970 * Set the NMI Window Exiting execution control so we can inject 971 * the virtual NMI as soon as blocking condition goes away. 972 */ 973 vmx_set_nmi_window_exiting(vmx, vcpu); 974 975 VMM_CTR0(vmx->vm, vcpu, "Enabling NMI window exiting"); 976 return (1); 977} 978 979static void 980vmx_inject_interrupts(struct vmx *vmx, int vcpu) 981{ 982 int error, vector; 983 uint64_t info, rflags, interruptibility; 984 985 const int HWINTR_BLOCKED = VMCS_INTERRUPTIBILITY_STI_BLOCKING | 986 VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING; 987 988 /* 989 * If there is already an interrupt pending then just return. 990 * 991 * This could happen if an interrupt was injected on a prior 992 * VM entry but the actual entry into guest mode was aborted 993 * because of a pending AST. 
994 */ 995 error = vmread(VMCS_ENTRY_INTR_INFO, &info); 996 if (error) 997 panic("vmx_inject_interrupts: vmread(intrinfo) %d", error); 998 if (info & VMCS_INTERRUPTION_INFO_VALID) 999 return; 1000 1001 /* 1002 * NMI injection has priority so deal with those first 1003 */ 1004 if (vmx_inject_nmi(vmx, vcpu)) 1005 return; 1006 1007 /* Ask the local apic for a vector to inject */ 1008 vector = lapic_pending_intr(vmx->vm, vcpu); 1009 if (vector < 0) 1010 return; 1011 1012 if (vector < 32 || vector > 255) 1013 panic("vmx_inject_interrupts: invalid vector %d\n", vector); 1014 1015 /* Check RFLAGS.IF and the interruptibility state of the guest */ 1016 error = vmread(VMCS_GUEST_RFLAGS, &rflags); 1017 if (error) 1018 panic("vmx_inject_interrupts: vmread(rflags) %d", error); 1019 1020 if ((rflags & PSL_I) == 0) 1021 goto cantinject; 1022 1023 error = vmread(VMCS_GUEST_INTERRUPTIBILITY, &interruptibility); 1024 if (error) { 1025 panic("vmx_inject_interrupts: vmread(interruptibility) %d", 1026 error); 1027 } 1028 if (interruptibility & HWINTR_BLOCKED) 1029 goto cantinject; 1030 1031 /* Inject the interrupt */ 1032 info = VMCS_INTERRUPTION_INFO_HW_INTR | VMCS_INTERRUPTION_INFO_VALID; 1033 info |= vector; 1034 error = vmwrite(VMCS_ENTRY_INTR_INFO, info); 1035 if (error) 1036 panic("vmx_inject_interrupts: vmwrite(intrinfo) %d", error); 1037 1038 /* Update the Local APIC ISR */ 1039 lapic_intr_accepted(vmx->vm, vcpu, vector); 1040 1041 VMM_CTR1(vmx->vm, vcpu, "Injecting hwintr at vector %d", vector); 1042 1043 return; 1044 1045cantinject: 1046 /* 1047 * Set the Interrupt Window Exiting execution control so we can inject 1048 * the interrupt as soon as blocking condition goes away. 
1049 */ 1050 vmx_set_int_window_exiting(vmx, vcpu); 1051 1052 VMM_CTR0(vmx->vm, vcpu, "Enabling interrupt window exiting"); 1053} 1054 1055static int 1056vmx_emulate_cr_access(struct vmx *vmx, int vcpu, uint64_t exitqual) 1057{ 1058 int error, cr, vmcs_guest_cr; 1059 uint64_t regval, ones_mask, zeros_mask; 1060 const struct vmxctx *vmxctx; 1061 1062 /* We only handle mov to %cr0 or %cr4 at this time */ 1063 if ((exitqual & 0xf0) != 0x00) 1064 return (UNHANDLED); 1065 1066 cr = exitqual & 0xf; 1067 if (cr != 0 && cr != 4) 1068 return (UNHANDLED); 1069 1070 vmxctx = &vmx->ctx[vcpu]; 1071 1072 /* 1073 * We must use vmwrite() directly here because vmcs_setreg() will 1074 * call vmclear(vmcs) as a side-effect which we certainly don't want. 1075 */ 1076 switch ((exitqual >> 8) & 0xf) { 1077 case 0: 1078 regval = vmxctx->guest_rax; 1079 break; 1080 case 1: 1081 regval = vmxctx->guest_rcx; 1082 break; 1083 case 2: 1084 regval = vmxctx->guest_rdx; 1085 break; 1086 case 3: 1087 regval = vmxctx->guest_rbx; 1088 break; 1089 case 4: 1090 error = vmread(VMCS_GUEST_RSP, ®val); 1091 if (error) { 1092 panic("vmx_emulate_cr_access: " 1093 "error %d reading guest rsp", error); 1094 } 1095 break; 1096 case 5: 1097 regval = vmxctx->guest_rbp; 1098 break; 1099 case 6: 1100 regval = vmxctx->guest_rsi; 1101 break; 1102 case 7: 1103 regval = vmxctx->guest_rdi; 1104 break; 1105 case 8: 1106 regval = vmxctx->guest_r8; 1107 break; 1108 case 9: 1109 regval = vmxctx->guest_r9; 1110 break; 1111 case 10: 1112 regval = vmxctx->guest_r10; 1113 break; 1114 case 11: 1115 regval = vmxctx->guest_r11; 1116 break; 1117 case 12: 1118 regval = vmxctx->guest_r12; 1119 break; 1120 case 13: 1121 regval = vmxctx->guest_r13; 1122 break; 1123 case 14: 1124 regval = vmxctx->guest_r14; 1125 break; 1126 case 15: 1127 regval = vmxctx->guest_r15; 1128 break; 1129 } 1130 1131 if (cr == 0) { 1132 ones_mask = cr0_ones_mask; 1133 zeros_mask = cr0_zeros_mask; 1134 vmcs_guest_cr = VMCS_GUEST_CR0; 1135 } else { 1136 
ones_mask = cr4_ones_mask; 1137 zeros_mask = cr4_zeros_mask; 1138 vmcs_guest_cr = VMCS_GUEST_CR4; 1139 } 1140 regval |= ones_mask; 1141 regval &= ~zeros_mask; 1142 error = vmwrite(vmcs_guest_cr, regval); 1143 if (error) { 1144 panic("vmx_emulate_cr_access: error %d writing cr%d", 1145 error, cr); 1146 } 1147 1148 return (HANDLED); 1149} 1150 1151static int 1152vmx_ept_fault(struct vm *vm, int cpu, 1153 uint64_t gla, uint64_t gpa, uint64_t rip, int inst_length, 1154 uint64_t cr3, uint64_t ept_qual, struct vie *vie) 1155{ 1156 int read, write, error; 1157 1158 /* EPT violation on an instruction fetch doesn't make sense here */ 1159 if (ept_qual & EPT_VIOLATION_INST_FETCH) 1160 return (UNHANDLED); 1161 1162 /* EPT violation must be a read fault or a write fault */ 1163 read = ept_qual & EPT_VIOLATION_DATA_READ ? 1 : 0; 1164 write = ept_qual & EPT_VIOLATION_DATA_WRITE ? 1 : 0; 1165 if ((read | write) == 0) 1166 return (UNHANDLED); 1167 1168 /* 1169 * The EPT violation must have been caused by accessing a 1170 * guest-physical address that is a translation of a guest-linear 1171 * address. 1172 */ 1173 if ((ept_qual & EPT_VIOLATION_GLA_VALID) == 0 || 1174 (ept_qual & EPT_VIOLATION_XLAT_VALID) == 0) { 1175 return (UNHANDLED); 1176 } 1177 1178 /* Fetch, decode and emulate the faulting instruction */ 1179 if (vmm_fetch_instruction(vm, cpu, rip, inst_length, cr3, vie) != 0) 1180 return (UNHANDLED); 1181 1182 if (vmm_decode_instruction(vm, cpu, gla, vie) != 0) 1183 return (UNHANDLED); 1184 1185 /* 1186 * Check if this is a local apic access 1187 */ 1188 if (gpa < DEFAULT_APIC_BASE || gpa >= DEFAULT_APIC_BASE + PAGE_SIZE) 1189 return (UNHANDLED); 1190 1191 error = vmm_emulate_instruction(vm, cpu, gpa, vie, 1192 lapic_mmio_read, lapic_mmio_write, 0); 1193 1194 return (error ? 
UNHANDLED : HANDLED); 1195} 1196 1197static int 1198vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) 1199{ 1200 int error, handled; 1201 struct vmcs *vmcs; 1202 struct vmxctx *vmxctx; 1203 uint32_t eax, ecx, edx; 1204 uint64_t qual, gla, gpa, cr3, intr_info; 1205 1206 handled = 0; 1207 vmcs = &vmx->vmcs[vcpu]; 1208 vmxctx = &vmx->ctx[vcpu]; 1209 qual = vmexit->u.vmx.exit_qualification; 1210 vmexit->exitcode = VM_EXITCODE_BOGUS; 1211 1212 switch (vmexit->u.vmx.exit_reason) { 1213 case EXIT_REASON_CR_ACCESS: 1214 handled = vmx_emulate_cr_access(vmx, vcpu, qual); 1215 break; 1216 case EXIT_REASON_RDMSR: 1217 ecx = vmxctx->guest_rcx; 1218 error = emulate_rdmsr(vmx->vm, vcpu, ecx); 1219 if (error) { 1220 vmexit->exitcode = VM_EXITCODE_RDMSR; 1221 vmexit->u.msr.code = ecx; 1222 } else 1223 handled = 1; 1224 break; 1225 case EXIT_REASON_WRMSR: 1226 eax = vmxctx->guest_rax; 1227 ecx = vmxctx->guest_rcx; 1228 edx = vmxctx->guest_rdx; 1229 error = emulate_wrmsr(vmx->vm, vcpu, ecx, 1230 (uint64_t)edx << 32 | eax); 1231 if (error) { 1232 vmexit->exitcode = VM_EXITCODE_WRMSR; 1233 vmexit->u.msr.code = ecx; 1234 vmexit->u.msr.wval = (uint64_t)edx << 32 | eax; 1235 } else 1236 handled = 1; 1237 break; 1238 case EXIT_REASON_HLT: 1239 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_HLT, 1); 1240 /* 1241 * If there is an event waiting to be injected then there is 1242 * no need to 'hlt'. 
1243 */ 1244 error = vmread(VMCS_ENTRY_INTR_INFO, &intr_info); 1245 if (error) 1246 panic("vmx_exit_process: vmread(intrinfo) %d", error); 1247 1248 if (intr_info & VMCS_INTERRUPTION_INFO_VALID) { 1249 handled = 1; 1250 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_HLT_IGNORED, 1); 1251 } else 1252 vmexit->exitcode = VM_EXITCODE_HLT; 1253 break; 1254 case EXIT_REASON_MTF: 1255 vmexit->exitcode = VM_EXITCODE_MTRAP; 1256 break; 1257 case EXIT_REASON_PAUSE: 1258 vmexit->exitcode = VM_EXITCODE_PAUSE; 1259 break; 1260 case EXIT_REASON_INTR_WINDOW: 1261 vmx_clear_int_window_exiting(vmx, vcpu); 1262 VMM_CTR0(vmx->vm, vcpu, "Disabling interrupt window exiting"); 1263 /* FALLTHRU */ 1264 case EXIT_REASON_EXT_INTR: 1265 /* 1266 * External interrupts serve only to cause VM exits and allow 1267 * the host interrupt handler to run. 1268 * 1269 * If this external interrupt triggers a virtual interrupt 1270 * to a VM, then that state will be recorded by the 1271 * host interrupt handler in the VM's softc. We will inject 1272 * this virtual interrupt during the subsequent VM enter. 1273 */ 1274 1275 /* 1276 * This is special. We want to treat this as an 'handled' 1277 * VM-exit but not increment the instruction pointer. 1278 */ 1279 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EXTINT, 1); 1280 return (1); 1281 case EXIT_REASON_NMI_WINDOW: 1282 /* Exit to allow the pending virtual NMI to be injected */ 1283 vmx_clear_nmi_window_exiting(vmx, vcpu); 1284 VMM_CTR0(vmx->vm, vcpu, "Disabling NMI window exiting"); 1285 return (1); 1286 case EXIT_REASON_INOUT: 1287 vmexit->exitcode = VM_EXITCODE_INOUT; 1288 vmexit->u.inout.bytes = (qual & 0x7) + 1; 1289 vmexit->u.inout.in = (qual & 0x8) ? 1 : 0; 1290 vmexit->u.inout.string = (qual & 0x10) ? 1 : 0; 1291 vmexit->u.inout.rep = (qual & 0x20) ? 
1 : 0; 1292 vmexit->u.inout.port = (uint16_t)(qual >> 16); 1293 vmexit->u.inout.eax = (uint32_t)(vmxctx->guest_rax); 1294 break; 1295 case EXIT_REASON_CPUID: 1296 handled = vmx_handle_cpuid(vmx->vm, vcpu, vmxctx); 1297 break; 1298 case EXIT_REASON_EPT_FAULT: 1299 gla = vmcs_gla(); 1300 gpa = vmcs_gpa(); 1301 cr3 = vmcs_guest_cr3(); 1302 handled = vmx_ept_fault(vmx->vm, vcpu, gla, gpa, 1303 vmexit->rip, vmexit->inst_length, 1304 cr3, qual, &vmexit->u.paging.vie); 1305 if (!handled) { 1306 vmexit->exitcode = VM_EXITCODE_PAGING; 1307 vmexit->u.paging.gpa = gpa; 1308 } 1309 break; 1310 default: 1311 break; 1312 } 1313 1314 if (handled) { 1315 /* 1316 * It is possible that control is returned to userland 1317 * even though we were able to handle the VM exit in the 1318 * kernel. 1319 * 1320 * In such a case we want to make sure that the userland 1321 * restarts guest execution at the instruction *after* 1322 * the one we just processed. Therefore we update the 1323 * guest rip in the VMCS and in 'vmexit'. 1324 */ 1325 vm_exit_update_rip(vmexit); 1326 vmexit->rip += vmexit->inst_length; 1327 vmexit->inst_length = 0; 1328 1329 /* 1330 * Special case for spinning up an AP - exit to userspace to 1331 * give the controlling process a chance to intercept and 1332 * spin up a thread for the AP. 1333 */ 1334 if (vmexit->exitcode == VM_EXITCODE_SPINUP_AP) 1335 handled = 0; 1336 } else { 1337 if (vmexit->exitcode == VM_EXITCODE_BOGUS) { 1338 /* 1339 * If this VM exit was not claimed by anybody then 1340 * treat it as a generic VMX exit. 1341 */ 1342 vmexit->exitcode = VM_EXITCODE_VMX; 1343 vmexit->u.vmx.error = 0; 1344 } else { 1345 /* 1346 * The exitcode and collateral have been populated. 1347 * The VM exit will be processed further in userland. 
1348 */ 1349 } 1350 } 1351 return (handled); 1352} 1353 1354static int 1355vmx_run(void *arg, int vcpu, register_t rip) 1356{ 1357 int error, vie, rc, handled, astpending; 1358 uint32_t exit_reason; 1359 struct vmx *vmx; 1360 struct vmxctx *vmxctx; 1361 struct vmcs *vmcs; 1362 struct vm_exit *vmexit; 1363 1364 vmx = arg; 1365 vmcs = &vmx->vmcs[vcpu]; 1366 vmxctx = &vmx->ctx[vcpu]; 1367 vmxctx->launched = 0; 1368 1369 astpending = 0; 1370 vmexit = vm_exitinfo(vmx->vm, vcpu); 1371 1372 /* 1373 * XXX Can we avoid doing this every time we do a vm run? 1374 */ 1375 VMPTRLD(vmcs); 1376 1377 /* 1378 * XXX 1379 * We do this every time because we may setup the virtual machine 1380 * from a different process than the one that actually runs it. 1381 * 1382 * If the life of a virtual machine was spent entirely in the context 1383 * of a single process we could do this once in vmcs_set_defaults(). 1384 */ 1385 if ((error = vmwrite(VMCS_HOST_CR3, rcr3())) != 0) 1386 panic("vmx_run: error %d writing to VMCS_HOST_CR3", error); 1387 1388 if ((error = vmwrite(VMCS_GUEST_RIP, rip)) != 0) 1389 panic("vmx_run: error %d writing to VMCS_GUEST_RIP", error); 1390 1391 if ((error = vmx_set_pcpu_defaults(vmx, vcpu)) != 0) 1392 panic("vmx_run: error %d setting up pcpu defaults", error); 1393 1394 do { 1395 lapic_timer_tick(vmx->vm, vcpu); 1396 vmx_inject_interrupts(vmx, vcpu); 1397 vmx_run_trace(vmx, vcpu); 1398 rc = vmx_setjmp(vmxctx); 1399#ifdef SETJMP_TRACE 1400 vmx_setjmp_trace(vmx, vcpu, vmxctx, rc); 1401#endif 1402 switch (rc) { 1403 case VMX_RETURN_DIRECT: 1404 if (vmxctx->launched == 0) { 1405 vmxctx->launched = 1; 1406 vmx_launch(vmxctx); 1407 } else 1408 vmx_resume(vmxctx); 1409 panic("vmx_launch/resume should not return"); 1410 break; 1411 case VMX_RETURN_LONGJMP: 1412 break; /* vm exit */ 1413 case VMX_RETURN_AST: 1414 astpending = 1; 1415 break; 1416 case VMX_RETURN_VMRESUME: 1417 vie = vmcs_instruction_error(); 1418 if (vmxctx->launch_error == VM_FAIL_INVALID || 1419 vie != 
VMRESUME_WITH_NON_LAUNCHED_VMCS) { 1420 printf("vmresume error %d vmcs inst error %d\n", 1421 vmxctx->launch_error, vie); 1422 goto err_exit; 1423 } 1424 vmx_launch(vmxctx); /* try to launch the guest */ 1425 panic("vmx_launch should not return"); 1426 break; 1427 case VMX_RETURN_VMLAUNCH: 1428 vie = vmcs_instruction_error(); 1429#if 1 1430 printf("vmlaunch error %d vmcs inst error %d\n", 1431 vmxctx->launch_error, vie); 1432#endif 1433 goto err_exit; 1434 default: 1435 panic("vmx_setjmp returned %d", rc); 1436 } 1437 1438 /* enable interrupts */ 1439 enable_intr(); 1440 1441 /* collect some basic information for VM exit processing */ 1442 vmexit->rip = rip = vmcs_guest_rip(); 1443 vmexit->inst_length = vmexit_instruction_length(); 1444 vmexit->u.vmx.exit_reason = exit_reason = vmcs_exit_reason(); 1445 vmexit->u.vmx.exit_qualification = vmcs_exit_qualification(); 1446 1447 if (astpending) { 1448 handled = 1; 1449 vmexit->inst_length = 0; 1450 vmexit->exitcode = VM_EXITCODE_BOGUS; 1451 vmx_astpending_trace(vmx, vcpu, rip); 1452 break; 1453 } 1454 1455 handled = vmx_exit_process(vmx, vcpu, vmexit); 1456 vmx_exit_trace(vmx, vcpu, rip, exit_reason, handled); 1457 1458 } while (handled); 1459 1460 /* 1461 * If a VM exit has been handled then the exitcode must be BOGUS 1462 * If a VM exit is not handled then the exitcode must not be BOGUS 1463 */ 1464 if ((handled && vmexit->exitcode != VM_EXITCODE_BOGUS) || 1465 (!handled && vmexit->exitcode == VM_EXITCODE_BOGUS)) { 1466 panic("Mismatch between handled (%d) and exitcode (%d)", 1467 handled, vmexit->exitcode); 1468 } 1469 1470 VMM_CTR1(vmx->vm, vcpu, "goto userland: exitcode %d",vmexit->exitcode); 1471 1472 /* 1473 * XXX 1474 * We need to do this to ensure that any VMCS state cached by the 1475 * processor is flushed to memory. We need to do this in case the 1476 * VM moves to a different cpu the next time it runs. 1477 * 1478 * Can we avoid doing this? 
1479 */ 1480 VMCLEAR(vmcs); 1481 return (0); 1482 1483err_exit: 1484 vmexit->exitcode = VM_EXITCODE_VMX; 1485 vmexit->u.vmx.exit_reason = (uint32_t)-1; 1486 vmexit->u.vmx.exit_qualification = (uint32_t)-1; 1487 vmexit->u.vmx.error = vie; 1488 VMCLEAR(vmcs); 1489 return (ENOEXEC); 1490} 1491 1492static void 1493vmx_vmcleanup(void *arg) 1494{ 1495 int error; 1496 struct vmx *vmx = arg; 1497 1498 /* 1499 * XXXSMP we also need to clear the VMCS active on the other vcpus. 1500 */ 1501 error = vmclear(&vmx->vmcs[0]); 1502 if (error != 0) 1503 panic("vmx_vmcleanup: vmclear error %d on vcpu 0", error); 1504 1505 ept_vmcleanup(vmx); 1506 free(vmx, M_VMX); 1507 1508 return; 1509} 1510 1511static register_t * 1512vmxctx_regptr(struct vmxctx *vmxctx, int reg) 1513{ 1514 1515 switch (reg) { 1516 case VM_REG_GUEST_RAX: 1517 return (&vmxctx->guest_rax); 1518 case VM_REG_GUEST_RBX: 1519 return (&vmxctx->guest_rbx); 1520 case VM_REG_GUEST_RCX: 1521 return (&vmxctx->guest_rcx); 1522 case VM_REG_GUEST_RDX: 1523 return (&vmxctx->guest_rdx); 1524 case VM_REG_GUEST_RSI: 1525 return (&vmxctx->guest_rsi); 1526 case VM_REG_GUEST_RDI: 1527 return (&vmxctx->guest_rdi); 1528 case VM_REG_GUEST_RBP: 1529 return (&vmxctx->guest_rbp); 1530 case VM_REG_GUEST_R8: 1531 return (&vmxctx->guest_r8); 1532 case VM_REG_GUEST_R9: 1533 return (&vmxctx->guest_r9); 1534 case VM_REG_GUEST_R10: 1535 return (&vmxctx->guest_r10); 1536 case VM_REG_GUEST_R11: 1537 return (&vmxctx->guest_r11); 1538 case VM_REG_GUEST_R12: 1539 return (&vmxctx->guest_r12); 1540 case VM_REG_GUEST_R13: 1541 return (&vmxctx->guest_r13); 1542 case VM_REG_GUEST_R14: 1543 return (&vmxctx->guest_r14); 1544 case VM_REG_GUEST_R15: 1545 return (&vmxctx->guest_r15); 1546 default: 1547 break; 1548 } 1549 return (NULL); 1550} 1551 1552static int 1553vmxctx_getreg(struct vmxctx *vmxctx, int reg, uint64_t *retval) 1554{ 1555 register_t *regp; 1556 1557 if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) { 1558 *retval = *regp; 1559 return (0); 1560 } 
else 1561 return (EINVAL); 1562} 1563 1564static int 1565vmxctx_setreg(struct vmxctx *vmxctx, int reg, uint64_t val) 1566{ 1567 register_t *regp; 1568 1569 if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) { 1570 *regp = val; 1571 return (0); 1572 } else 1573 return (EINVAL); 1574} 1575 1576static int 1577vmx_getreg(void *arg, int vcpu, int reg, uint64_t *retval) 1578{ 1579 struct vmx *vmx = arg; 1580 1581 if (vmxctx_getreg(&vmx->ctx[vcpu], reg, retval) == 0) 1582 return (0); 1583 1584 /* 1585 * If the vcpu is running then don't mess with the VMCS. 1586 * 1587 * vmcs_getreg will VMCLEAR the vmcs when it is done which will cause 1588 * the subsequent vmlaunch/vmresume to fail. 1589 */ 1590 if (vcpu_is_running(vmx->vm, vcpu)) 1591 panic("vmx_getreg: %s%d is running", vm_name(vmx->vm), vcpu); 1592 1593 return (vmcs_getreg(&vmx->vmcs[vcpu], reg, retval)); 1594} 1595 1596static int 1597vmx_setreg(void *arg, int vcpu, int reg, uint64_t val) 1598{ 1599 int error; 1600 uint64_t ctls; 1601 struct vmx *vmx = arg; 1602 1603 /* 1604 * XXX Allow caller to set contents of the guest registers saved in 1605 * the 'vmxctx' even though the vcpu might be running. We need this 1606 * specifically to support the rdmsr emulation that will set the 1607 * %eax and %edx registers during vm exit processing. 1608 */ 1609 if (vmxctx_setreg(&vmx->ctx[vcpu], reg, val) == 0) 1610 return (0); 1611 1612 /* 1613 * If the vcpu is running then don't mess with the VMCS. 1614 * 1615 * vmcs_setreg will VMCLEAR the vmcs when it is done which will cause 1616 * the subsequent vmlaunch/vmresume to fail. 1617 */ 1618 if (vcpu_is_running(vmx->vm, vcpu)) 1619 panic("vmx_setreg: %s%d is running", vm_name(vmx->vm), vcpu); 1620 1621 error = vmcs_setreg(&vmx->vmcs[vcpu], reg, val); 1622 1623 if (error == 0) { 1624 /* 1625 * If the "load EFER" VM-entry control is 1 then the 1626 * value of EFER.LMA must be identical to "IA-32e mode guest" 1627 * bit in the VM-entry control. 
1628 */ 1629 if ((entry_ctls & VM_ENTRY_LOAD_EFER) != 0 && 1630 (reg == VM_REG_GUEST_EFER)) { 1631 vmcs_getreg(&vmx->vmcs[vcpu], 1632 VMCS_IDENT(VMCS_ENTRY_CTLS), &ctls); 1633 if (val & EFER_LMA) 1634 ctls |= VM_ENTRY_GUEST_LMA; 1635 else 1636 ctls &= ~VM_ENTRY_GUEST_LMA; 1637 vmcs_setreg(&vmx->vmcs[vcpu], 1638 VMCS_IDENT(VMCS_ENTRY_CTLS), ctls); 1639 } 1640 } 1641 1642 return (error); 1643} 1644 1645static int 1646vmx_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc) 1647{ 1648 struct vmx *vmx = arg; 1649 1650 return (vmcs_getdesc(&vmx->vmcs[vcpu], reg, desc)); 1651} 1652 1653static int 1654vmx_setdesc(void *arg, int vcpu, int reg, struct seg_desc *desc) 1655{ 1656 struct vmx *vmx = arg; 1657 1658 return (vmcs_setdesc(&vmx->vmcs[vcpu], reg, desc)); 1659} 1660 1661static int 1662vmx_inject(void *arg, int vcpu, int type, int vector, uint32_t code, 1663 int code_valid) 1664{ 1665 int error; 1666 uint64_t info; 1667 struct vmx *vmx = arg; 1668 struct vmcs *vmcs = &vmx->vmcs[vcpu]; 1669 1670 static uint32_t type_map[VM_EVENT_MAX] = { 1671 0x1, /* VM_EVENT_NONE */ 1672 0x0, /* VM_HW_INTR */ 1673 0x2, /* VM_NMI */ 1674 0x3, /* VM_HW_EXCEPTION */ 1675 0x4, /* VM_SW_INTR */ 1676 0x5, /* VM_PRIV_SW_EXCEPTION */ 1677 0x6, /* VM_SW_EXCEPTION */ 1678 }; 1679 1680 /* 1681 * If there is already an exception pending to be delivered to the 1682 * vcpu then just return. 1683 */ 1684 error = vmcs_getreg(vmcs, VMCS_IDENT(VMCS_ENTRY_INTR_INFO), &info); 1685 if (error) 1686 return (error); 1687 1688 if (info & VMCS_INTERRUPTION_INFO_VALID) 1689 return (EAGAIN); 1690 1691 info = vector | (type_map[type] << 8) | (code_valid ? 
1 << 11 : 0); 1692 info |= VMCS_INTERRUPTION_INFO_VALID; 1693 error = vmcs_setreg(vmcs, VMCS_IDENT(VMCS_ENTRY_INTR_INFO), info); 1694 if (error != 0) 1695 return (error); 1696 1697 if (code_valid) { 1698 error = vmcs_setreg(vmcs, 1699 VMCS_IDENT(VMCS_ENTRY_EXCEPTION_ERROR), 1700 code); 1701 } 1702 return (error); 1703} 1704 1705static int 1706vmx_getcap(void *arg, int vcpu, int type, int *retval) 1707{ 1708 struct vmx *vmx = arg; 1709 int vcap; 1710 int ret; 1711 1712 ret = ENOENT; 1713 1714 vcap = vmx->cap[vcpu].set; 1715 1716 switch (type) { 1717 case VM_CAP_HALT_EXIT: 1718 if (cap_halt_exit) 1719 ret = 0; 1720 break; 1721 case VM_CAP_PAUSE_EXIT: 1722 if (cap_pause_exit) 1723 ret = 0; 1724 break; 1725 case VM_CAP_MTRAP_EXIT: 1726 if (cap_monitor_trap) 1727 ret = 0; 1728 break; 1729 case VM_CAP_UNRESTRICTED_GUEST: 1730 if (cap_unrestricted_guest) 1731 ret = 0; 1732 break; 1733 default: 1734 break; 1735 } 1736 1737 if (ret == 0) 1738 *retval = (vcap & (1 << type)) ? 1 : 0; 1739 1740 return (ret); 1741} 1742 1743static int 1744vmx_setcap(void *arg, int vcpu, int type, int val) 1745{ 1746 struct vmx *vmx = arg; 1747 struct vmcs *vmcs = &vmx->vmcs[vcpu]; 1748 uint32_t baseval; 1749 uint32_t *pptr; 1750 int error; 1751 int flag; 1752 int reg; 1753 int retval; 1754 1755 retval = ENOENT; 1756 pptr = NULL; 1757 1758 switch (type) { 1759 case VM_CAP_HALT_EXIT: 1760 if (cap_halt_exit) { 1761 retval = 0; 1762 pptr = &vmx->cap[vcpu].proc_ctls; 1763 baseval = *pptr; 1764 flag = PROCBASED_HLT_EXITING; 1765 reg = VMCS_PRI_PROC_BASED_CTLS; 1766 } 1767 break; 1768 case VM_CAP_MTRAP_EXIT: 1769 if (cap_monitor_trap) { 1770 retval = 0; 1771 pptr = &vmx->cap[vcpu].proc_ctls; 1772 baseval = *pptr; 1773 flag = PROCBASED_MTF; 1774 reg = VMCS_PRI_PROC_BASED_CTLS; 1775 } 1776 break; 1777 case VM_CAP_PAUSE_EXIT: 1778 if (cap_pause_exit) { 1779 retval = 0; 1780 pptr = &vmx->cap[vcpu].proc_ctls; 1781 baseval = *pptr; 1782 flag = PROCBASED_PAUSE_EXITING; 1783 reg = VMCS_PRI_PROC_BASED_CTLS; 
1784 } 1785 break; 1786 case VM_CAP_UNRESTRICTED_GUEST: 1787 if (cap_unrestricted_guest) { 1788 retval = 0; 1789 baseval = procbased_ctls2; 1790 flag = PROCBASED2_UNRESTRICTED_GUEST; 1791 reg = VMCS_SEC_PROC_BASED_CTLS; 1792 } 1793 break; 1794 default: 1795 break; 1796 } 1797 1798 if (retval == 0) { 1799 if (val) { 1800 baseval |= flag; 1801 } else { 1802 baseval &= ~flag; 1803 } 1804 VMPTRLD(vmcs); 1805 error = vmwrite(reg, baseval); 1806 VMCLEAR(vmcs); 1807 1808 if (error) { 1809 retval = error; 1810 } else { 1811 /* 1812 * Update optional stored flags, and record 1813 * setting 1814 */ 1815 if (pptr != NULL) { 1816 *pptr = baseval; 1817 } 1818 1819 if (val) { 1820 vmx->cap[vcpu].set |= (1 << type); 1821 } else { 1822 vmx->cap[vcpu].set &= ~(1 << type); 1823 } 1824 } 1825 } 1826 1827 return (retval); 1828} 1829 1830struct vmm_ops vmm_ops_intel = { 1831 vmx_init, 1832 vmx_cleanup, 1833 vmx_vminit, 1834 vmx_run, 1835 vmx_vmcleanup, 1836 ept_vmmmap_set, 1837 ept_vmmmap_get, 1838 vmx_getreg, 1839 vmx_setreg, 1840 vmx_getdesc, 1841 vmx_setdesc, 1842 vmx_inject, 1843 vmx_getcap, 1844 vmx_setcap 1845}; 1846