/* vmx.c, revision 222610 */

/*-
 * Copyright (c) 2011 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/smp.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/pcpu.h>
#include <sys/proc.h>

#include <vm/vm.h>
#include <vm/pmap.h>

#include <machine/psl.h>
#include <machine/cpufunc.h>
#include <machine/md_var.h>
#include <machine/pmap.h>
#include <machine/segments.h>
#include <machine/vmparam.h>

#include <machine/vmm.h>
#include "vmm_lapic.h"
#include "vmm_msr.h"
#include "vmm_ktr.h"
#include "vmm_stat.h"

#include "vmx_msr.h"
#include "ept.h"
#include "vmx_cpufunc.h"
#include "vmx.h"
#include "x86.h"
#include "vmx_controls.h"

#define	CR4_VMXE	(1UL << 13)

#define	PINBASED_CTLS_ONE_SETTING		\
	(PINBASED_EXTINT_EXITING	|	\
	 PINBASED_NMI_EXITING		|	\
	 PINBASED_VIRTUAL_NMI)
#define	PINBASED_CTLS_ZERO_SETTING	0

#define	PROCBASED_CTLS_WINDOW_SETTING		\
	(PROCBASED_INT_WINDOW_EXITING	|	\
	 PROCBASED_NMI_WINDOW_EXITING)

#define	PROCBASED_CTLS_ONE_SETTING		\
	(PROCBASED_SECONDARY_CONTROLS	|	\
	 PROCBASED_IO_EXITING		|	\
	 PROCBASED_MSR_BITMAPS		|	\
	 PROCBASED_CTLS_WINDOW_SETTING)
#define	PROCBASED_CTLS_ZERO_SETTING		\
	(PROCBASED_CR3_LOAD_EXITING	|	\
	 PROCBASED_CR3_STORE_EXITING	|	\
	 PROCBASED_IO_BITMAPS)

#define	PROCBASED_CTLS2_ONE_SETTING	PROCBASED2_ENABLE_EPT
#define	PROCBASED_CTLS2_ZERO_SETTING	0

#define	VM_EXIT_CTLS_ONE_SETTING		\
	(VM_EXIT_HOST_LMA		|	\
	 VM_EXIT_SAVE_EFER		|	\
	 VM_EXIT_SAVE_PAT		|	\
	 VM_EXIT_LOAD_PAT		|	\
	 VM_EXIT_LOAD_EFER)
#define	VM_EXIT_CTLS_ZERO_SETTING	VM_EXIT_SAVE_DEBUG_CONTROLS

#define	VM_ENTRY_CTLS_ONE_SETTING		\
	(VM_ENTRY_LOAD_PAT	|		\
	 VM_ENTRY_LOAD_EFER)
#define	VM_ENTRY_CTLS_ZERO_SETTING		\
	(VM_ENTRY_LOAD_DEBUG_CONTROLS	|	\
	 VM_ENTRY_INTO_SMM		|	\
	 VM_ENTRY_DEACTIVATE_DUAL_MONITOR)

#define	guest_msr_rw(vmx, msr) \
	msr_bitmap_change_access((vmx)->msr_bitmap, (msr), MSR_BITMAP_ACCESS_RW)

#define	HANDLED		1
#define	UNHANDLED	0

MALLOC_DEFINE(M_VMX, "vmx", "vmx");

extern struct pcpu __pcpu[];

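/*
 * Per-cpu VMXON state: whether VMX operation has been enabled on each
 * host cpu, and the page-aligned region handed to the VMXON instruction.
 */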
int vmxon_enabled[MAXCPU];
static char vmxon_region[MAXCPU][PAGE_SIZE] __aligned(PAGE_SIZE);

static uint32_t pinbased_ctls, procbased_ctls, procbased_ctls2;
static uint32_t exit_ctls, entry_ctls;

static uint64_t cr0_ones_mask, cr0_zeros_mask;
static uint64_t cr4_ones_mask, cr4_zeros_mask;

static volatile u_int nextvpid;

/*
 * Virtual NMI blocking conditions.
 *
 * Some processor implementations also require NMI to be blocked if
 * the STI_BLOCKING bit is set. It is possible to detect this at runtime
 * based on the (exit_reason,exit_qual) tuple being set to
 * (EXIT_REASON_INVAL_VMCS, EXIT_QUAL_NMI_WHILE_STI_BLOCKING).
 *
 * We take the easy way out and also include STI_BLOCKING as one of the
 * gating items for vNMI injection.
 */
static uint64_t nmi_blocking_bits = VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING |
				    VMCS_INTERRUPTIBILITY_NMI_BLOCKING |
				    VMCS_INTERRUPTIBILITY_STI_BLOCKING;

/*
 * Optional capabilities
 */
static int cap_halt_exit;
static int cap_pause_exit;
static int cap_unrestricted_guest;
static int cap_monitor_trap;

/* statistics */
static VMM_STAT_DEFINE(VCPU_MIGRATIONS, "vcpu migration across host cpus");
static VMM_STAT_DEFINE(VMEXIT_EXTINT, "vm exits due to external interrupt");

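/*
 * Tracing helpers, compiled in only when KTR support is configured.
 */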
#ifdef KTR
static const char *
exit_reason_to_str(int reason)
{
	static char reasonbuf[32];

	switch (reason) {
	case EXIT_REASON_EXCEPTION:
		return "exception";
	case EXIT_REASON_EXT_INTR:
		return "extint";
	case EXIT_REASON_TRIPLE_FAULT:
		return "triplefault";
	case EXIT_REASON_INIT:
		return "init";
	case EXIT_REASON_SIPI:
		return "sipi";
	case EXIT_REASON_IO_SMI:
		return "iosmi";
	case EXIT_REASON_SMI:
		return "smi";
	case EXIT_REASON_INTR_WINDOW:
		return "intrwindow";
	case EXIT_REASON_NMI_WINDOW:
		return "nmiwindow";
	case EXIT_REASON_TASK_SWITCH:
		return "taskswitch";
	case EXIT_REASON_CPUID:
		return "cpuid";
	case EXIT_REASON_GETSEC:
		return "getsec";
	case EXIT_REASON_HLT:
		return "hlt";
	case EXIT_REASON_INVD:
		return "invd";
	case EXIT_REASON_INVLPG:
		return "invlpg";
	case EXIT_REASON_RDPMC:
		return "rdpmc";
	case EXIT_REASON_RDTSC:
		return "rdtsc";
	case EXIT_REASON_RSM:
		return "rsm";
	case EXIT_REASON_VMCALL:
		return "vmcall";
	case EXIT_REASON_VMCLEAR:
		return "vmclear";
	case EXIT_REASON_VMLAUNCH:
		return "vmlaunch";
	case EXIT_REASON_VMPTRLD:
		return "vmptrld";
	case EXIT_REASON_VMPTRST:
		return "vmptrst";
	case EXIT_REASON_VMREAD:
		return "vmread";
	case EXIT_REASON_VMRESUME:
		return "vmresume";
	case EXIT_REASON_VMWRITE:
		return "vmwrite";
	case EXIT_REASON_VMXOFF:
		return "vmxoff";
	case EXIT_REASON_VMXON:
		return "vmxon";
	case EXIT_REASON_CR_ACCESS:
		return "craccess";
	case EXIT_REASON_DR_ACCESS:
		return "draccess";
	case EXIT_REASON_INOUT:
		return "inout";
	case EXIT_REASON_RDMSR:
		return "rdmsr";
	case EXIT_REASON_WRMSR:
		return "wrmsr";
	case EXIT_REASON_INVAL_VMCS:
		return "invalvmcs";
	case EXIT_REASON_INVAL_MSR:
		return "invalmsr";
	case EXIT_REASON_MWAIT:
		return "mwait";
	case EXIT_REASON_MTF:
		return "mtf";
	case EXIT_REASON_MONITOR:
		return "monitor";
	case EXIT_REASON_PAUSE:
		return "pause";
	case EXIT_REASON_MCE:
		return "mce";
	case EXIT_REASON_TPR:
		return "tpr";
	case EXIT_REASON_APIC:
		return "apic";
	case EXIT_REASON_GDTR_IDTR:
		return "gdtridtr";
	case EXIT_REASON_LDTR_TR:
		return "ldtrtr";
	case EXIT_REASON_EPT_FAULT:
		return "eptfault";
	case EXIT_REASON_EPT_MISCONFIG:
		return "eptmisconfig";
	case EXIT_REASON_INVEPT:
		return "invept";
	case EXIT_REASON_RDTSCP:
		return "rdtscp";
	case EXIT_REASON_VMX_PREEMPT:
		return "vmxpreempt";
	case EXIT_REASON_INVVPID:
		return "invvpid";
	case EXIT_REASON_WBINVD:
		return "wbinvd";
	case EXIT_REASON_XSETBV:
		return "xsetbv";
	default:
		snprintf(reasonbuf, sizeof(reasonbuf), "%d", reason);
		return (reasonbuf);
	}
}

#ifdef SETJMP_TRACE
static const char *
vmx_setjmp_rc2str(int rc)
{
	switch (rc) {
	case VMX_RETURN_DIRECT:
		return "direct";
	case VMX_RETURN_LONGJMP:
		return "longjmp";
	case VMX_RETURN_VMRESUME:
		return "vmresume";
	case VMX_RETURN_VMLAUNCH:
		return "vmlaunch";
	default:
		return "unknown";
	}
}

#define	SETJMP_TRACE(vmx, vcpu, vmxctx, regname)			  \
	VMM_CTR1((vmx)->vm, (vcpu), "setjmp trace " #regname " 0x%016lx", \
		 (vmxctx)->regname)

static void
vmx_setjmp_trace(struct vmx *vmx, int vcpu, struct vmxctx *vmxctx, int rc)
{
	uint64_t host_rip, host_rsp;

	if (vmxctx != &vmx->ctx[vcpu])
		panic("vmx_setjmp_trace: invalid vmxctx %p; should be %p",
			vmxctx, &vmx->ctx[vcpu]);

	VMM_CTR1((vmx)->vm, (vcpu), "vmxctx = %p", vmxctx);
	VMM_CTR2((vmx)->vm, (vcpu), "setjmp return code %s(%d)",
		 vmx_setjmp_rc2str(rc), rc);

	host_rsp = host_rip = ~0;
	vmread(VMCS_HOST_RIP, &host_rip);
	vmread(VMCS_HOST_RSP, &host_rsp);
	VMM_CTR2((vmx)->vm, (vcpu), "vmcs host_rip 0x%016lx, host_rsp 0x%016lx",
		 host_rip, host_rsp);

	SETJMP_TRACE(vmx, vcpu, vmxctx, host_r15);
	SETJMP_TRACE(vmx, vcpu, vmxctx, host_r14);
	SETJMP_TRACE(vmx, vcpu, vmxctx, host_r13);
	SETJMP_TRACE(vmx, vcpu, vmxctx, host_r12);
	SETJMP_TRACE(vmx, vcpu, vmxctx, host_rbp);
	SETJMP_TRACE(vmx, vcpu, vmxctx, host_rsp);
	SETJMP_TRACE(vmx, vcpu, vmxctx, host_rbx);
	SETJMP_TRACE(vmx, vcpu, vmxctx, host_rip);

	SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rdi);
	SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rsi);
	SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rdx);
	SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rcx);
	SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r8);
	SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r9);
	SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rax);
	SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rbx);
	SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rbp);
	SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r10);
	SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r11);
	SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r12);
	SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r13);
	SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r14);
	SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r15);
	SETJMP_TRACE(vmx, vcpu, vmxctx, guest_cr2);
}
#endif
#else
static void __inline
vmx_setjmp_trace(struct vmx *vmx, int vcpu, struct vmxctx *vmxctx, int rc)
{
	return;
}
#endif	/* KTR */

u_long
vmx_fix_cr0(u_long cr0)
{

	return ((cr0 | cr0_ones_mask) & ~cr0_zeros_mask);
}

u_long
vmx_fix_cr4(u_long cr4)
{

	return ((cr4 | cr4_ones_mask) & ~cr4_zeros_mask);
}

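/*
 * Build the list of guest MSRs (currently just MSR_KGSBASE) that are
 * saved and restored via the VMCS MSR save area rather than by explicit
 * rdmsr/wrmsr on every VM transition.
 */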
static void
msr_save_area_init(struct msr_entry *g_area, int *g_count)
{
	int cnt;

	static struct msr_entry guest_msrs[] = {
		{ MSR_KGSBASE, 0, 0 },
	};

	cnt = sizeof(guest_msrs) / sizeof(guest_msrs[0]);
	if (cnt > GUEST_MSR_MAX_ENTRIES)
		panic("guest msr save area overrun");
	bcopy(guest_msrs, g_area, sizeof(guest_msrs));
	*g_count = cnt;
}

static void
vmx_disable(void *arg __unused)
{
	struct invvpid_desc invvpid_desc = { 0 };
	struct invept_desc invept_desc = { 0 };

	if (vmxon_enabled[curcpu]) {
		/*
		 * See sections 25.3.3.3 and 25.3.3.4 in Intel Vol 3b.
		 *
		 * VMXON or VMXOFF are not required to invalidate any TLB
		 * caching structures, so invalidate all mappings explicitly
		 * to avoid retaining stale cached information in the TLB
		 * across distinct VMX episodes.
		 */
		invvpid(INVVPID_TYPE_ALL_CONTEXTS, invvpid_desc);
		invept(INVEPT_TYPE_ALL_CONTEXTS, invept_desc);
		vmxoff();
	}
	load_cr4(rcr4() & ~CR4_VMXE);
}

static int
vmx_cleanup(void)
{

	smp_rendezvous(NULL, vmx_disable, NULL, NULL);

	return (0);
}

static void
vmx_enable(void *arg __unused)
{
	int error;

	load_cr4(rcr4() | CR4_VMXE);

	*(uint32_t *)vmxon_region[curcpu] = vmx_revision();
	error = vmxon(vmxon_region[curcpu]);
	if (error == 0)
		vmxon_enabled[curcpu] = 1;
}

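/*
 * Module-wide initialization: verify VMX support, negotiate the pin-based,
 * processor-based, VM-exit and VM-entry control settings with the hardware,
 * probe optional capabilities, initialize EPT, compute the CR0/CR4 fixed-bit
 * masks and finally enable VMX operation on every host cpu.
 */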
static int
vmx_init(void)
{
	int error;
	uint64_t fixed0, fixed1;
	uint32_t tmp;

	/* CPUID.1:ECX[bit 5] must be 1 for processor to support VMX */
	if (!(cpu_feature2 & CPUID2_VMX)) {
		printf("vmx_init: processor does not support VMX operation\n");
		return (ENXIO);
	}

	/* Check support for primary processor-based VM-execution controls */
	error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
			       MSR_VMX_TRUE_PROCBASED_CTLS,
			       PROCBASED_CTLS_ONE_SETTING,
			       PROCBASED_CTLS_ZERO_SETTING, &procbased_ctls);
	if (error) {
		printf("vmx_init: processor does not support desired primary "
		       "processor-based controls\n");
		return (error);
	}

	/* Clear the processor-based ctl bits that are set on demand */
	procbased_ctls &= ~PROCBASED_CTLS_WINDOW_SETTING;

	/* Check support for secondary processor-based VM-execution controls */
	error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
			       MSR_VMX_PROCBASED_CTLS2,
			       PROCBASED_CTLS2_ONE_SETTING,
			       PROCBASED_CTLS2_ZERO_SETTING, &procbased_ctls2);
	if (error) {
		printf("vmx_init: processor does not support desired secondary "
		       "processor-based controls\n");
		return (error);
	}

	/* Check support for VPID */
	error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2,
			       PROCBASED2_ENABLE_VPID, 0, &tmp);
	if (error == 0)
		procbased_ctls2 |= PROCBASED2_ENABLE_VPID;

	/* Check support for pin-based VM-execution controls */
	error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS,
			       MSR_VMX_TRUE_PINBASED_CTLS,
			       PINBASED_CTLS_ONE_SETTING,
			       PINBASED_CTLS_ZERO_SETTING, &pinbased_ctls);
	if (error) {
		printf("vmx_init: processor does not support desired "
		       "pin-based controls\n");
		return (error);
	}

	/* Check support for VM-exit controls */
	error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS, MSR_VMX_TRUE_EXIT_CTLS,
			       VM_EXIT_CTLS_ONE_SETTING,
			       VM_EXIT_CTLS_ZERO_SETTING,
			       &exit_ctls);
	if (error) {
		printf("vmx_init: processor does not support desired "
		       "exit controls\n");
		return (error);
	}

	/* Check support for VM-entry controls */
	error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS, MSR_VMX_TRUE_ENTRY_CTLS,
			       VM_ENTRY_CTLS_ONE_SETTING,
			       VM_ENTRY_CTLS_ZERO_SETTING,
			       &entry_ctls);
	if (error) {
		printf("vmx_init: processor does not support desired "
		       "entry controls\n");
		return (error);
	}

	/*
	 * Check support for optional features by testing them
	 * as individual bits
	 */
	cap_halt_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
					MSR_VMX_TRUE_PROCBASED_CTLS,
					PROCBASED_HLT_EXITING, 0,
					&tmp) == 0);

	cap_monitor_trap = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
					MSR_VMX_PROCBASED_CTLS,
					PROCBASED_MTF, 0,
					&tmp) == 0);

	cap_pause_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
					MSR_VMX_TRUE_PROCBASED_CTLS,
					PROCBASED_PAUSE_EXITING, 0,
					&tmp) == 0);

	cap_unrestricted_guest = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
					MSR_VMX_PROCBASED_CTLS2,
					PROCBASED2_UNRESTRICTED_GUEST, 0,
					&tmp) == 0);

	/* Initialize EPT */
	error = ept_init();
	if (error) {
		printf("vmx_init: ept initialization failed (%d)\n", error);
		return (error);
	}

	/*
	 * Stash the cr0 and cr4 bits that must be fixed to 0 or 1
	 */
	fixed0 = rdmsr(MSR_VMX_CR0_FIXED0);
	fixed1 = rdmsr(MSR_VMX_CR0_FIXED1);
	cr0_ones_mask = fixed0 & fixed1;
	cr0_zeros_mask = ~fixed0 & ~fixed1;

	/*
	 * CR0_PE and CR0_PG can be set to zero in VMX non-root operation
	 * if unrestricted guest execution is allowed.
	 */
	if (cap_unrestricted_guest)
		cr0_ones_mask &= ~(CR0_PG | CR0_PE);

	/*
	 * Do not allow the guest to set CR0_NW or CR0_CD.
	 */
	cr0_zeros_mask |= (CR0_NW | CR0_CD);

	fixed0 = rdmsr(MSR_VMX_CR4_FIXED0);
	fixed1 = rdmsr(MSR_VMX_CR4_FIXED1);
	cr4_ones_mask = fixed0 & fixed1;
	cr4_zeros_mask = ~fixed0 & ~fixed1;

	/* enable VMX operation */
	smp_rendezvous(NULL, vmx_enable, NULL, NULL);

	return (0);
}

/*
 * If this processor does not support VPIDs then simply return 0.
 *
 * Otherwise generate the next value of VPID to use. Any value is fine
 * as long as it is non-zero.
 *
 * We always execute in VMX non-root context with EPT enabled. Thus all
 * combined mappings are tagged with the (EP4TA, VPID, PCID) tuple. This
 * in turn means that multiple VMs can share the same VPID as long as
 * they have distinct EPT page tables.
 *
 * XXX
 * We should optimize this so that it returns VPIDs that are not in
 * use. Then we will not unnecessarily invalidate mappings in
 * vmx_set_pcpu_defaults() just because two or more vcpus happen to
 * use the same 'vpid'.
 */
static uint16_t
vmx_vpid(void)
{
	uint16_t vpid = 0;

	if ((procbased_ctls2 & PROCBASED2_ENABLE_VPID) != 0) {
		do {
			vpid = atomic_fetchadd_int(&nextvpid, 1);
		} while (vpid == 0);
	}

	return (vpid);
}

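/*
 * Program the CR0 guest/host mask and read shadow. Bits set in the mask
 * are owned by the host: guest reads of those bits return the shadow
 * value, and guest writes that would change them cause a VM-exit (see
 * vmx_emulate_cr_access()).
 */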
static int
vmx_setup_cr0_shadow(struct vmcs *vmcs)
{
	int error;
	uint64_t mask, shadow;

	mask = cr0_ones_mask | cr0_zeros_mask;
	error = vmcs_setreg(vmcs, VMCS_IDENT(VMCS_CR0_MASK), mask);
	if (error)
		return (error);

	shadow = cr0_ones_mask;
	error = vmcs_setreg(vmcs, VMCS_IDENT(VMCS_CR0_SHADOW), shadow);
	if (error)
		return (error);

	return (0);
}

static void *
vmx_vminit(struct vm *vm)
{
	uint16_t vpid;
	int i, error, guest_msr_count;
	struct vmx *vmx;

	vmx = malloc(sizeof(struct vmx), M_VMX, M_WAITOK | M_ZERO);
	if ((uintptr_t)vmx & PAGE_MASK) {
		panic("malloc of struct vmx not aligned on %d byte boundary",
		      PAGE_SIZE);
	}
	vmx->vm = vm;

	/*
	 * Clean up EPTP-tagged guest physical and combined mappings
	 *
	 * VMX transitions are not required to invalidate any guest physical
	 * mappings. So, it may be possible for stale guest physical mappings
	 * to be present in the processor TLBs.
	 *
	 * Combined mappings for this EP4TA are also invalidated for all VPIDs.
	 */
	ept_invalidate_mappings(vtophys(vmx->pml4ept));

	msr_bitmap_initialize(vmx->msr_bitmap);

	/*
	 * It is safe to allow direct access to MSR_GSBASE and MSR_FSBASE.
	 * The guest FSBASE and GSBASE are saved and restored during
	 * vm-exit and vm-entry respectively. The host FSBASE and GSBASE are
	 * always restored from the vmcs host state area on vm-exit.
	 *
	 * Guest KGSBASE is saved and restored in the guest MSR save area.
	 * Host KGSBASE is restored before returning to userland from the pcb.
	 * There will be a window of time when we are executing in the host
	 * kernel context with a value of KGSBASE from the guest. This is ok
	 * because the value of KGSBASE is inconsequential in kernel context.
	 *
	 * MSR_EFER is saved and restored in the guest VMCS area on a
	 * VM exit and entry respectively. It is also restored from the
	 * host VMCS area on a VM exit.
	 *
	 * MSR_PAT is saved and restored in the guest VMCS area on a VM exit
	 * and entry respectively. It is also restored from the host VMCS
	 * area on a VM exit.
	 */
	if (guest_msr_rw(vmx, MSR_GSBASE) ||
	    guest_msr_rw(vmx, MSR_FSBASE) ||
	    guest_msr_rw(vmx, MSR_KGSBASE) ||
	    guest_msr_rw(vmx, MSR_EFER) ||
	    guest_msr_rw(vmx, MSR_PAT))
		panic("vmx_vminit: error setting guest msr access");

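	/*
	 * Initialize per-vcpu state: a VMCS with the negotiated controls,
	 * a VPID, the guest MSR save area and the CR0 shadow.
	 */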
	for (i = 0; i < VM_MAXCPU; i++) {
		vmx->vmcs[i].identifier = vmx_revision();
		error = vmclear(&vmx->vmcs[i]);
		if (error != 0) {
			panic("vmx_vminit: vmclear error %d on vcpu %d\n",
			      error, i);
		}

		vpid = vmx_vpid();

		error = vmcs_set_defaults(&vmx->vmcs[i],
					  (u_long)vmx_longjmp,
					  (u_long)&vmx->ctx[i],
					  vtophys(vmx->pml4ept),
					  pinbased_ctls,
					  procbased_ctls,
					  procbased_ctls2,
					  exit_ctls, entry_ctls,
					  vtophys(vmx->msr_bitmap),
					  vpid);

		if (error != 0)
			panic("vmx_vminit: vmcs_set_defaults error %d", error);

		vmx->cap[i].set = 0;
		vmx->cap[i].proc_ctls = procbased_ctls;

		vmx->state[i].request_nmi = 0;
		vmx->state[i].lastcpu = -1;
		vmx->state[i].vpid = vpid;

		msr_save_area_init(vmx->guest_msrs[i], &guest_msr_count);

		error = vmcs_set_msr_save(&vmx->vmcs[i],
					  vtophys(vmx->guest_msrs[i]),
					  guest_msr_count);
		if (error != 0)
			panic("vmcs_set_msr_save error %d", error);

		error = vmx_setup_cr0_shadow(&vmx->vmcs[i]);
	}

	return (vmx);
}

static int
vmx_handle_cpuid(int vcpu, struct vmxctx *vmxctx)
{
	int handled, func;

	func = vmxctx->guest_rax;

	handled = x86_emulate_cpuid((uint32_t*)(&vmxctx->guest_rax),
	    (uint32_t*)(&vmxctx->guest_rbx), (uint32_t*)(&vmxctx->guest_rcx),
	    (uint32_t*)(&vmxctx->guest_rdx), vcpu);
#if 0
	printf("%s: func %x rax %lx rbx %lx rcx %lx rdx %lx handled %d\n",
	       __func__, func, vmxctx->guest_rax, vmxctx->guest_rbx,
	       vmxctx->guest_rcx, vmxctx->guest_rdx, handled);
#endif

	return (handled);
}

static __inline void
vmx_run_trace(struct vmx *vmx, int vcpu)
{
#ifdef KTR
	VMM_CTR1(vmx->vm, vcpu, "Resume execution at 0x%0lx", vmcs_guest_rip());
#endif
}

static __inline void
vmx_exit_trace(struct vmx *vmx, int vcpu, uint64_t rip, uint32_t exit_reason,
	       int handled, int astpending)
{
#ifdef KTR
	VMM_CTR3(vmx->vm, vcpu, "%s %s vmexit at 0x%0lx",
		 handled ? "handled" : "unhandled",
		 exit_reason_to_str(exit_reason), rip);

	if (astpending)
		VMM_CTR0(vmx->vm, vcpu, "astpending");
#endif
}

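/*
 * Refresh the host-specific VMCS fields (TR, GDTR and GS bases) when a
 * vcpu migrates to a different host cpu, and flush any stale mappings
 * tagged with its VPID that may linger in this cpu's TLB.
 */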
static int
vmx_set_pcpu_defaults(struct vmx *vmx, int vcpu)
{
	int error, lastcpu;
	struct vmxstate *vmxstate;
	struct invvpid_desc invvpid_desc = { 0 };

	vmxstate = &vmx->state[vcpu];
	lastcpu = vmxstate->lastcpu;
	vmxstate->lastcpu = curcpu;

	if (lastcpu == curcpu) {
		error = 0;
		goto done;
	}

	vmm_stat_incr(vmx->vm, vcpu, VCPU_MIGRATIONS, 1);

	error = vmwrite(VMCS_HOST_TR_BASE, (u_long)PCPU_GET(tssp));
	if (error != 0)
		goto done;

	error = vmwrite(VMCS_HOST_GDTR_BASE, (u_long)&gdt[NGDT * curcpu]);
	if (error != 0)
		goto done;

	error = vmwrite(VMCS_HOST_GS_BASE, (u_long)&__pcpu[curcpu]);
	if (error != 0)
		goto done;

	/*
	 * If we are using VPIDs then invalidate all mappings tagged with 'vpid'
	 *
	 * We do this because this vcpu was executing on a different host
	 * cpu when it last ran. We do not track whether it invalidated
	 * mappings associated with its 'vpid' during that run. So we must
	 * assume that the mappings associated with 'vpid' on 'curcpu' are
	 * stale and invalidate them.
	 *
	 * Note that we incur this penalty only when the scheduler chooses to
	 * move the thread associated with this vcpu between host cpus.
	 *
	 * Note also that this will invalidate mappings tagged with 'vpid'
	 * for "all" EP4TAs.
	 */
	if (vmxstate->vpid != 0) {
		invvpid_desc.vpid = vmxstate->vpid;
		invvpid(INVVPID_TYPE_SINGLE_CONTEXT, invvpid_desc);
	}
done:
	return (error);
}

static void
vm_exit_update_rip(struct vm_exit *vmexit)
{
	int error;

	error = vmwrite(VMCS_GUEST_RIP, vmexit->rip + vmexit->inst_length);
	if (error)
		panic("vmx_run: error %d writing to VMCS_GUEST_RIP", error);
}

/*
 * We depend on 'procbased_ctls' to have the Interrupt Window Exiting bit set.
 */
CTASSERT((PROCBASED_CTLS_ONE_SETTING & PROCBASED_INT_WINDOW_EXITING) != 0);

static void __inline
vmx_set_int_window_exiting(struct vmx *vmx, int vcpu)
{
	int error;

	vmx->cap[vcpu].proc_ctls |= PROCBASED_INT_WINDOW_EXITING;

	error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
	if (error)
		panic("vmx_set_int_window_exiting: vmwrite error %d", error);
}

static void __inline
vmx_clear_int_window_exiting(struct vmx *vmx, int vcpu)
{
	int error;

	vmx->cap[vcpu].proc_ctls &= ~PROCBASED_INT_WINDOW_EXITING;

	error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
	if (error)
		panic("vmx_clear_int_window_exiting: vmwrite error %d", error);
}

static void __inline
vmx_set_nmi_window_exiting(struct vmx *vmx, int vcpu)
{
	int error;

	vmx->cap[vcpu].proc_ctls |= PROCBASED_NMI_WINDOW_EXITING;

	error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
	if (error)
		panic("vmx_set_nmi_window_exiting: vmwrite error %d", error);
}

static void __inline
vmx_clear_nmi_window_exiting(struct vmx *vmx, int vcpu)
{
	int error;

	vmx->cap[vcpu].proc_ctls &= ~PROCBASED_NMI_WINDOW_EXITING;

	error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
	if (error)
		panic("vmx_clear_nmi_window_exiting: vmwrite error %d", error);
}

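/*
 * Try to inject a pending NMI into the guest. Returns 0 if no NMI was
 * requested, and 1 if the NMI was either injected or remains pending
 * with NMI-window exiting armed so the injection can be retried.
 */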
static int
vmx_inject_nmi(struct vmx *vmx, int vcpu)
{
	int error;
	uint64_t info, interruptibility;

	/* Bail out if no NMI requested */
	if (vmx->state[vcpu].request_nmi == 0)
		return (0);

	error = vmread(VMCS_GUEST_INTERRUPTIBILITY, &interruptibility);
	if (error) {
		panic("vmx_inject_nmi: vmread(interruptibility) %d",
		      error);
	}
	if (interruptibility & nmi_blocking_bits)
		goto nmiblocked;

	/*
	 * Inject the virtual NMI. The vector must be the NMI IDT entry
	 * or the VMCS entry check will fail.
	 */
	info = VMCS_INTERRUPTION_INFO_NMI | VMCS_INTERRUPTION_INFO_VALID;
	info |= IDT_NMI;

	error = vmwrite(VMCS_ENTRY_INTR_INFO, info);
	if (error)
		panic("vmx_inject_nmi: vmwrite(intrinfo) %d", error);

	VMM_CTR0(vmx->vm, vcpu, "Injecting vNMI");

	/* Clear the request */
	vmx->state[vcpu].request_nmi = 0;
	return (1);

nmiblocked:
	/*
	 * Set the NMI Window Exiting execution control so we can inject
	 * the virtual NMI as soon as the blocking condition goes away.
	 */
	vmx_set_nmi_window_exiting(vmx, vcpu);

	VMM_CTR0(vmx->vm, vcpu, "Enabling NMI window exiting");
	return (1);
}

static void
vmx_inject_interrupts(struct vmx *vmx, int vcpu)
{
	int error, vector;
	uint64_t info, rflags, interruptibility;

	const int HWINTR_BLOCKED = VMCS_INTERRUPTIBILITY_STI_BLOCKING |
				   VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING;

#if 1
	/*
	 * XXX
	 * If an event is being injected from userland then just return.
	 * For example, we may inject a breakpoint exception to cause the
	 * guest to enter the debugger so we can inspect its state.
	 */
	error = vmread(VMCS_ENTRY_INTR_INFO, &info);
	if (error)
		panic("vmx_inject_interrupts: vmread(intrinfo) %d", error);
	if (info & VMCS_INTERRUPTION_INFO_VALID)
		return;
#endif
	/*
	 * NMI injection has priority so deal with those first
	 */
	if (vmx_inject_nmi(vmx, vcpu))
		return;

	/* Ask the local apic for a vector to inject */
	vector = lapic_pending_intr(vmx->vm, vcpu);
	if (vector < 0)
		return;

	if (vector < 32 || vector > 255)
		panic("vmx_inject_interrupts: invalid vector %d\n", vector);

	/* Check RFLAGS.IF and the interruptibility state of the guest */
	error = vmread(VMCS_GUEST_RFLAGS, &rflags);
	if (error)
		panic("vmx_inject_interrupts: vmread(rflags) %d", error);

	if ((rflags & PSL_I) == 0)
		goto cantinject;

	error = vmread(VMCS_GUEST_INTERRUPTIBILITY, &interruptibility);
	if (error) {
		panic("vmx_inject_interrupts: vmread(interruptibility) %d",
		      error);
	}
	if (interruptibility & HWINTR_BLOCKED)
		goto cantinject;

	/* Inject the interrupt */
	info = VMCS_INTERRUPTION_INFO_HW_INTR | VMCS_INTERRUPTION_INFO_VALID;
	info |= vector;
	error = vmwrite(VMCS_ENTRY_INTR_INFO, info);
	if (error)
		panic("vmx_inject_interrupts: vmwrite(intrinfo) %d", error);

	/* Update the Local APIC ISR */
	lapic_intr_accepted(vmx->vm, vcpu, vector);

	VMM_CTR1(vmx->vm, vcpu, "Injecting hwintr at vector %d", vector);

	return;

cantinject:
	/*
	 * Set the Interrupt Window Exiting execution control so we can inject
	 * the interrupt as soon as the blocking condition goes away.
	 */
	vmx_set_int_window_exiting(vmx, vcpu);

	VMM_CTR0(vmx->vm, vcpu, "Enabling interrupt window exiting");
}

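/*
 * Emulate a guest "mov to %cr0": fetch the source register from the
 * saved guest state, apply the VMX-mandated fixed bits and write the
 * result into the VMCS directly.
 */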
static int
vmx_emulate_cr_access(struct vmx *vmx, int vcpu, uint64_t exitqual)
{
	int error;
	uint64_t regval;
	const struct vmxctx *vmxctx;

	/* We only handle mov to %cr0 at this time */
	if ((exitqual & 0xff) != 0x00)
		return (UNHANDLED);

	vmxctx = &vmx->ctx[vcpu];

	/*
	 * We must use vmwrite() directly here because vmcs_setreg() will
	 * call vmclear(vmcs) as a side-effect which we certainly don't want.
	 */
	switch ((exitqual >> 8) & 0xf) {
	case 0:
		regval = vmxctx->guest_rax;
		break;
	case 1:
		regval = vmxctx->guest_rcx;
		break;
	case 2:
		regval = vmxctx->guest_rdx;
		break;
	case 3:
		regval = vmxctx->guest_rbx;
		break;
	case 4:
		error = vmread(VMCS_GUEST_RSP, &regval);
		if (error) {
			panic("vmx_emulate_cr_access: "
			      "error %d reading guest rsp", error);
		}
		break;
	case 5:
		regval = vmxctx->guest_rbp;
		break;
	case 6:
		regval = vmxctx->guest_rsi;
		break;
	case 7:
		regval = vmxctx->guest_rdi;
		break;
	case 8:
		regval = vmxctx->guest_r8;
		break;
	case 9:
		regval = vmxctx->guest_r9;
		break;
	case 10:
		regval = vmxctx->guest_r10;
		break;
	case 11:
		regval = vmxctx->guest_r11;
		break;
	case 12:
		regval = vmxctx->guest_r12;
		break;
	case 13:
		regval = vmxctx->guest_r13;
		break;
	case 14:
		regval = vmxctx->guest_r14;
		break;
	case 15:
		regval = vmxctx->guest_r15;
		break;
	}

	regval |= cr0_ones_mask;
	regval &= ~cr0_zeros_mask;
	error = vmwrite(VMCS_GUEST_CR0, regval);
	if (error)
		panic("vmx_emulate_cr_access: error %d writing cr0", error);

	return (HANDLED);
}

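/*
 * First-pass VM-exit handling in the kernel. Returns HANDLED when the
 * exit was serviced entirely here; otherwise fills in 'vmexit' so that
 * processing can be completed in userland.
 */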
static int
vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
{
	int handled;
	struct vmcs *vmcs;
	struct vmxctx *vmxctx;
	uint32_t eax, ecx, edx;
	uint64_t qual;

	handled = 0;
	vmcs = &vmx->vmcs[vcpu];
	vmxctx = &vmx->ctx[vcpu];
	qual = vmexit->u.vmx.exit_qualification;
	vmexit->exitcode = VM_EXITCODE_BOGUS;

	switch (vmexit->u.vmx.exit_reason) {
	case EXIT_REASON_CR_ACCESS:
		handled = vmx_emulate_cr_access(vmx, vcpu, qual);
		break;
	case EXIT_REASON_RDMSR:
		ecx = vmxctx->guest_rcx;
		handled = emulate_rdmsr(vmx->vm, vcpu, ecx);
		if (!handled) {
			vmexit->exitcode = VM_EXITCODE_RDMSR;
			vmexit->u.msr.code = ecx;
		}
		break;
	case EXIT_REASON_WRMSR:
		eax = vmxctx->guest_rax;
		ecx = vmxctx->guest_rcx;
		edx = vmxctx->guest_rdx;
		handled = emulate_wrmsr(vmx->vm, vcpu, ecx,
					(uint64_t)edx << 32 | eax);
		if (!handled) {
			vmexit->exitcode = VM_EXITCODE_WRMSR;
			vmexit->u.msr.code = ecx;
			vmexit->u.msr.wval = (uint64_t)edx << 32 | eax;
		}
		break;
	case EXIT_REASON_HLT:
		vmexit->exitcode = VM_EXITCODE_HLT;
		break;
	case EXIT_REASON_MTF:
		vmexit->exitcode = VM_EXITCODE_MTRAP;
		break;
	case EXIT_REASON_PAUSE:
		vmexit->exitcode = VM_EXITCODE_PAUSE;
		break;
	case EXIT_REASON_INTR_WINDOW:
		vmx_clear_int_window_exiting(vmx, vcpu);
		VMM_CTR0(vmx->vm, vcpu, "Disabling interrupt window exiting");
		/* FALLTHRU */
	case EXIT_REASON_EXT_INTR:
		/*
		 * External interrupts serve only to cause VM exits and allow
		 * the host interrupt handler to run.
		 *
		 * If this external interrupt triggers a virtual interrupt
		 * to a VM, then that state will be recorded by the
		 * host interrupt handler in the VM's softc. We will inject
		 * this virtual interrupt during the subsequent VM enter.
		 */

		/*
		 * This is special. We want to treat this as a 'handled'
		 * VM-exit but not increment the instruction pointer.
		 */
		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EXTINT, 1);
		return (1);
	case EXIT_REASON_NMI_WINDOW:
		/* Exit to allow the pending virtual NMI to be injected */
		vmx_clear_nmi_window_exiting(vmx, vcpu);
		VMM_CTR0(vmx->vm, vcpu, "Disabling NMI window exiting");
		return (1);
	case EXIT_REASON_INOUT:
		vmexit->exitcode = VM_EXITCODE_INOUT;
		vmexit->u.inout.bytes = (qual & 0x7) + 1;
		vmexit->u.inout.in = (qual & 0x8) ? 1 : 0;
		vmexit->u.inout.string = (qual & 0x10) ? 1 : 0;
		vmexit->u.inout.rep = (qual & 0x20) ? 1 : 0;
		vmexit->u.inout.port = (uint16_t)(qual >> 16);
		vmexit->u.inout.eax = (uint32_t)(vmxctx->guest_rax);
		break;
	case EXIT_REASON_CPUID:
		handled = vmx_handle_cpuid(vcpu, vmxctx);
		break;
	default:
		break;
	}

	if (handled) {
		/*
		 * It is possible that control is returned to userland
		 * even though we were able to handle the VM exit in the
		 * kernel (e.g. 'astpending' is set in the run loop).
		 *
		 * In such a case we want to make sure that the userland
		 * restarts guest execution at the instruction *after*
		 * the one we just processed. Therefore we update the
		 * guest rip in the VMCS and in 'vmexit'.
		 */
		vm_exit_update_rip(vmexit);
		vmexit->rip += vmexit->inst_length;
		vmexit->inst_length = 0;
	} else {
		if (vmexit->exitcode == VM_EXITCODE_BOGUS) {
			/*
			 * If this VM exit was not claimed by anybody then
			 * treat it as a generic VMX exit.
			 */
			vmexit->exitcode = VM_EXITCODE_VMX;
			vmexit->u.vmx.error = 0;
		} else {
			/*
			 * The exitcode and collateral have been populated.
			 * The VM exit will be processed further in userland.
			 */
		}
	}
	return (handled);
}

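/*
 * Main execution loop for a vcpu: inject pending events, enter the guest
 * via vmlaunch/vmresume and process VM-exits, looping until an exit must
 * be handed to userland or an AST is pending on the host thread.
 */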
static int
vmx_run(void *arg, int vcpu, register_t rip, struct vm_exit *vmexit)
{
	int error, vie, rc, handled, astpending;
	uint32_t exit_reason;
	struct vmx *vmx;
	struct vmxctx *vmxctx;
	struct vmcs *vmcs;

	vmx = arg;
	vmcs = &vmx->vmcs[vcpu];
	vmxctx = &vmx->ctx[vcpu];
	vmxctx->launched = 0;

	/*
	 * XXX Can we avoid doing this every time we do a vm run?
	 */
	VMPTRLD(vmcs);

	/*
	 * XXX
	 * We do this every time because we may setup the virtual machine
	 * from a different process than the one that actually runs it.
	 *
	 * If the life of a virtual machine was spent entirely in the context
	 * of a single process we could do this once in vmcs_set_defaults().
	 */
	if ((error = vmwrite(VMCS_HOST_CR3, rcr3())) != 0)
		panic("vmx_run: error %d writing to VMCS_HOST_CR3", error);

	if ((error = vmwrite(VMCS_GUEST_RIP, rip)) != 0)
		panic("vmx_run: error %d writing to VMCS_GUEST_RIP", error);

	if ((error = vmx_set_pcpu_defaults(vmx, vcpu)) != 0)
		panic("vmx_run: error %d setting up pcpu defaults", error);

	do {
		lapic_timer_tick(vmx->vm, vcpu);
		vmx_inject_interrupts(vmx, vcpu);
		vmx_run_trace(vmx, vcpu);
		rc = vmx_setjmp(vmxctx);
#ifdef SETJMP_TRACE
		vmx_setjmp_trace(vmx, vcpu, vmxctx, rc);
#endif
		switch (rc) {
		case VMX_RETURN_DIRECT:
			if (vmxctx->launched == 0) {
				vmxctx->launched = 1;
				vmx_launch(vmxctx);
			} else
				vmx_resume(vmxctx);
			panic("vmx_launch/resume should not return");
			break;
		case VMX_RETURN_LONGJMP:
			break;		/* vm exit */
		case VMX_RETURN_VMRESUME:
			vie = vmcs_instruction_error();
			if (vmxctx->launch_error == VM_FAIL_INVALID ||
			    vie != VMRESUME_WITH_NON_LAUNCHED_VMCS) {
				printf("vmresume error %d vmcs inst error %d\n",
					vmxctx->launch_error, vie);
				goto err_exit;
			}
			vmx_launch(vmxctx);	/* try to launch the guest */
			panic("vmx_launch should not return");
			break;
		case VMX_RETURN_VMLAUNCH:
			vie = vmcs_instruction_error();
#if 1
			printf("vmlaunch error %d vmcs inst error %d\n",
				vmxctx->launch_error, vie);
#endif
			goto err_exit;
		default:
			panic("vmx_setjmp returned %d", rc);
		}

		/*
		 * XXX locking?
		 * See comments in exception.S about checking for ASTs
		 * atomically while interrupts are disabled. But it is
		 * not clear that they apply in our case.
		 */
		astpending = curthread->td_flags & TDF_ASTPENDING;

		/* enable interrupts */
		enable_intr();

		/* collect some basic information for VM exit processing */
		vmexit->rip = rip = vmcs_guest_rip();
		vmexit->inst_length = vmexit_instruction_length();
		vmexit->u.vmx.exit_reason = exit_reason = vmcs_exit_reason();
		vmexit->u.vmx.exit_qualification = vmcs_exit_qualification();

		handled = vmx_exit_process(vmx, vcpu, vmexit);

		vmx_exit_trace(vmx, vcpu, rip, exit_reason, handled,
			       astpending);
	} while (handled && !astpending);

	/*
	 * If a VM exit has been handled then the exitcode must be BOGUS
	 * If a VM exit is not handled then the exitcode must not be BOGUS
	 */
	if ((handled && vmexit->exitcode != VM_EXITCODE_BOGUS) ||
	    (!handled && vmexit->exitcode == VM_EXITCODE_BOGUS)) {
		panic("Mismatch between handled (%d) and exitcode (%d)",
		      handled, vmexit->exitcode);
	}

	VMM_CTR1(vmx->vm, vcpu, "goto userland: exitcode %d", vmexit->exitcode);

	/*
	 * XXX
	 * We need to do this to ensure that any VMCS state cached by the
	 * processor is flushed to memory. We need to do this in case the
	 * VM moves to a different cpu the next time it runs.
	 *
	 * Can we avoid doing this?
	 */
	VMCLEAR(vmcs);
	return (0);

err_exit:
	vmexit->exitcode = VM_EXITCODE_VMX;
	vmexit->u.vmx.exit_reason = (uint32_t)-1;
	vmexit->u.vmx.exit_qualification = (uint32_t)-1;
	vmexit->u.vmx.error = vie;
	VMCLEAR(vmcs);
	return (ENOEXEC);
}

static void
vmx_vmcleanup(void *arg)
{
	int error;
	struct vmx *vmx = arg;

	/*
	 * XXXSMP we also need to clear the VMCS active on the other vcpus.
	 */
	error = vmclear(&vmx->vmcs[0]);
	if (error != 0)
		panic("vmx_vmcleanup: vmclear error %d on vcpu 0", error);

	ept_vmcleanup(vmx);
	free(vmx, M_VMX);

	return;
}

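/*
 * Map a VM_REG_GUEST_* identifier to its slot in the register save area.
 * Registers that are not kept in 'vmxctx' live in the VMCS and map to
 * NULL here; vmx_getreg()/vmx_setreg() then fall back to the VMCS.
 */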
static register_t *
vmxctx_regptr(struct vmxctx *vmxctx, int reg)
{

	switch (reg) {
	case VM_REG_GUEST_RAX:
		return (&vmxctx->guest_rax);
	case VM_REG_GUEST_RBX:
		return (&vmxctx->guest_rbx);
	case VM_REG_GUEST_RCX:
		return (&vmxctx->guest_rcx);
	case VM_REG_GUEST_RDX:
		return (&vmxctx->guest_rdx);
	case VM_REG_GUEST_RSI:
		return (&vmxctx->guest_rsi);
	case VM_REG_GUEST_RDI:
		return (&vmxctx->guest_rdi);
	case VM_REG_GUEST_RBP:
		return (&vmxctx->guest_rbp);
	case VM_REG_GUEST_R8:
		return (&vmxctx->guest_r8);
	case VM_REG_GUEST_R9:
		return (&vmxctx->guest_r9);
	case VM_REG_GUEST_R10:
		return (&vmxctx->guest_r10);
	case VM_REG_GUEST_R11:
		return (&vmxctx->guest_r11);
	case VM_REG_GUEST_R12:
		return (&vmxctx->guest_r12);
	case VM_REG_GUEST_R13:
		return (&vmxctx->guest_r13);
	case VM_REG_GUEST_R14:
		return (&vmxctx->guest_r14);
	case VM_REG_GUEST_R15:
		return (&vmxctx->guest_r15);
	default:
		break;
	}
	return (NULL);
}

static int
vmxctx_getreg(struct vmxctx *vmxctx, int reg, uint64_t *retval)
{
	register_t *regp;

	if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) {
		*retval = *regp;
		return (0);
	} else
		return (EINVAL);
}

static int
vmxctx_setreg(struct vmxctx *vmxctx, int reg, uint64_t val)
{
	register_t *regp;

	if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) {
		*regp = val;
		return (0);
	} else
		return (EINVAL);
}

static int
vmx_getreg(void *arg, int vcpu, int reg, uint64_t *retval)
{
	struct vmx *vmx = arg;

	if (vmxctx_getreg(&vmx->ctx[vcpu], reg, retval) == 0)
		return (0);

	/*
	 * If the vcpu is running then don't mess with the VMCS.
	 *
	 * vmcs_getreg will VMCLEAR the vmcs when it is done which will cause
	 * the subsequent vmlaunch/vmresume to fail.
	 */
	if (vcpu_is_running(vmx->vm, vcpu, NULL))
		panic("vmx_getreg: %s%d is running", vm_name(vmx->vm), vcpu);

	return (vmcs_getreg(&vmx->vmcs[vcpu], reg, retval));
}

static int
vmx_setreg(void *arg, int vcpu, int reg, uint64_t val)
{
	int error;
	uint64_t ctls;
	struct vmx *vmx = arg;

	/*
	 * XXX Allow caller to set contents of the guest registers saved in
	 * the 'vmxctx' even though the vcpu might be running. We need this
	 * specifically to support the rdmsr emulation that will set the
	 * %eax and %edx registers during vm exit processing.
	 */
	if (vmxctx_setreg(&vmx->ctx[vcpu], reg, val) == 0)
		return (0);

	/*
	 * If the vcpu is running then don't mess with the VMCS.
	 *
	 * vmcs_setreg will VMCLEAR the vmcs when it is done which will cause
	 * the subsequent vmlaunch/vmresume to fail.
	 */
	if (vcpu_is_running(vmx->vm, vcpu, NULL))
		panic("vmx_setreg: %s%d is running", vm_name(vmx->vm), vcpu);

	error = vmcs_setreg(&vmx->vmcs[vcpu], reg, val);

	if (error == 0) {
		/*
		 * If the "load EFER" VM-entry control is 1 then the
		 * value of EFER.LMA must be identical to the "IA-32e mode
		 * guest" bit in the VM-entry controls.
		 */
		if ((entry_ctls & VM_ENTRY_LOAD_EFER) != 0 &&
		    (reg == VM_REG_GUEST_EFER)) {
			vmcs_getreg(&vmx->vmcs[vcpu],
				    VMCS_IDENT(VMCS_ENTRY_CTLS), &ctls);
			if (val & EFER_LMA)
				ctls |= VM_ENTRY_GUEST_LMA;
			else
				ctls &= ~VM_ENTRY_GUEST_LMA;
			vmcs_setreg(&vmx->vmcs[vcpu],
				    VMCS_IDENT(VMCS_ENTRY_CTLS), ctls);
		}
	}

	return (error);
}

static int
vmx_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc)
{
	struct vmx *vmx = arg;

	return (vmcs_getdesc(&vmx->vmcs[vcpu], reg, desc));
}

static int
vmx_setdesc(void *arg, int vcpu, int reg, struct seg_desc *desc)
{
	struct vmx *vmx = arg;

	return (vmcs_setdesc(&vmx->vmcs[vcpu], reg, desc));
}

static int
vmx_inject(void *arg, int vcpu, int type, int vector, uint32_t code,
	   int code_valid)
{
	int error;
	uint32_t info;
	struct vmx *vmx = arg;
	struct vmcs *vmcs = &vmx->vmcs[vcpu];

	static uint32_t type_map[VM_EVENT_MAX] = {
		0x1,		/* VM_EVENT_NONE */
		0x0,		/* VM_HW_INTR */
		0x2,		/* VM_NMI */
		0x3,		/* VM_HW_EXCEPTION */
		0x4,		/* VM_SW_INTR */
		0x5,		/* VM_PRIV_SW_EXCEPTION */
		0x6,		/* VM_SW_EXCEPTION */
	};

	info = vector | (type_map[type] << 8) | (code_valid ? 1 << 11 : 0);
	info |= VMCS_INTERRUPTION_INFO_VALID;
	error = vmcs_setreg(vmcs, VMCS_IDENT(VMCS_ENTRY_INTR_INFO), info);
	if (error != 0)
		return (error);

	if (code_valid) {
		error = vmcs_setreg(vmcs,
				    VMCS_IDENT(VMCS_ENTRY_EXCEPTION_ERROR),
				    code);
	}
	return (error);
}

static int
vmx_nmi(void *arg, int vcpu)
{
	struct vmx *vmx = arg;

	atomic_set_int(&vmx->state[vcpu].request_nmi, 1);

	return (0);
}

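/*
 * Report whether an optional capability is supported at all (return
 * value) and whether it is currently enabled for this vcpu ('retval').
 */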
static int
vmx_getcap(void *arg, int vcpu, int type, int *retval)
{
	struct vmx *vmx = arg;
	int vcap;
	int ret;

	ret = ENOENT;

	vcap = vmx->cap[vcpu].set;

	switch (type) {
	case VM_CAP_HALT_EXIT:
		if (cap_halt_exit)
			ret = 0;
		break;
	case VM_CAP_PAUSE_EXIT:
		if (cap_pause_exit)
			ret = 0;
		break;
	case VM_CAP_MTRAP_EXIT:
		if (cap_monitor_trap)
			ret = 0;
		break;
	case VM_CAP_UNRESTRICTED_GUEST:
		if (cap_unrestricted_guest)
			ret = 0;
		break;
	default:
		break;
	}

	if (ret == 0)
		*retval = (vcap & (1 << type)) ? 1 : 0;

	return (ret);
}

static int
vmx_setcap(void *arg, int vcpu, int type, int val)
{
	struct vmx *vmx = arg;
	struct vmcs *vmcs = &vmx->vmcs[vcpu];
	uint32_t baseval;
	uint32_t *pptr;
	int error;
	int flag;
	int reg;
	int retval;

	retval = ENOENT;
	pptr = NULL;

	switch (type) {
	case VM_CAP_HALT_EXIT:
		if (cap_halt_exit) {
			retval = 0;
			pptr = &vmx->cap[vcpu].proc_ctls;
			baseval = *pptr;
			flag = PROCBASED_HLT_EXITING;
			reg = VMCS_PRI_PROC_BASED_CTLS;
		}
		break;
	case VM_CAP_MTRAP_EXIT:
		if (cap_monitor_trap) {
			retval = 0;
			pptr = &vmx->cap[vcpu].proc_ctls;
			baseval = *pptr;
			flag = PROCBASED_MTF;
			reg = VMCS_PRI_PROC_BASED_CTLS;
		}
		break;
	case VM_CAP_PAUSE_EXIT:
		if (cap_pause_exit) {
			retval = 0;
			pptr = &vmx->cap[vcpu].proc_ctls;
			baseval = *pptr;
			flag = PROCBASED_PAUSE_EXITING;
			reg = VMCS_PRI_PROC_BASED_CTLS;
		}
		break;
	case VM_CAP_UNRESTRICTED_GUEST:
		if (cap_unrestricted_guest) {
			retval = 0;
			baseval = procbased_ctls2;
			flag = PROCBASED2_UNRESTRICTED_GUEST;
			reg = VMCS_SEC_PROC_BASED_CTLS;
		}
		break;
	default:
		break;
	}

	if (retval == 0) {
		if (val) {
			baseval |= flag;
		} else {
			baseval &= ~flag;
		}
		VMPTRLD(vmcs);
		error = vmwrite(reg, baseval);
		VMCLEAR(vmcs);

		if (error) {
			retval = error;
		} else {
			/*
			 * Update optional stored flags, and record
			 * setting
			 */
			if (pptr != NULL) {
				*pptr = baseval;
			}

			if (val) {
				vmx->cap[vcpu].set |= (1 << type);
			} else {
				vmx->cap[vcpu].set &= ~(1 << type);
			}
		}
	}

	return (retval);
}

struct vmm_ops vmm_ops_intel = {
	vmx_init,
	vmx_cleanup,
	vmx_vminit,
	vmx_run,
	vmx_vmcleanup,
	ept_vmmmap,
	vmx_getreg,
	vmx_setreg,
	vmx_getdesc,
	vmx_setdesc,
	vmx_inject,
	vmx_nmi,
	vmx_getcap,
	vmx_setcap
};