vmx.c revision 242275
/*-
 * Copyright (c) 2011 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/smp.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/pcpu.h>
#include <sys/proc.h>

#include <vm/vm.h>
#include <vm/pmap.h>

#include <machine/psl.h>
#include <machine/cpufunc.h>
#include <machine/md_var.h>
#include <machine/pmap.h>
#include <machine/segments.h>
#include <machine/specialreg.h>
#include <machine/vmparam.h>

#include <x86/apicreg.h>

#include <machine/vmm.h>
#include "vmm_host.h"
#include "vmm_lapic.h"
#include "vmm_msr.h"
#include "vmm_ktr.h"
#include "vmm_stat.h"

#include "vmx_msr.h"
#include "ept.h"
#include "vmx_cpufunc.h"
#include "vmx.h"
#include "x86.h"
#include "vmx_controls.h"
#include "vmm_instruction_emul.h"

#define	PINBASED_CTLS_ONE_SETTING		\
	(PINBASED_EXTINT_EXITING	|	\
	 PINBASED_NMI_EXITING		|	\
	 PINBASED_VIRTUAL_NMI)
#define	PINBASED_CTLS_ZERO_SETTING	0

#define	PROCBASED_CTLS_WINDOW_SETTING		\
	(PROCBASED_INT_WINDOW_EXITING	|	\
	 PROCBASED_NMI_WINDOW_EXITING)

#define	PROCBASED_CTLS_ONE_SETTING		\
	(PROCBASED_SECONDARY_CONTROLS	|	\
	 PROCBASED_IO_EXITING		|	\
	 PROCBASED_MSR_BITMAPS		|	\
	 PROCBASED_CTLS_WINDOW_SETTING)
#define	PROCBASED_CTLS_ZERO_SETTING		\
	(PROCBASED_CR3_LOAD_EXITING	|	\
	 PROCBASED_CR3_STORE_EXITING	|	\
	 PROCBASED_IO_BITMAPS)

#define	PROCBASED_CTLS2_ONE_SETTING	PROCBASED2_ENABLE_EPT
#define	PROCBASED_CTLS2_ZERO_SETTING	0

#define	VM_EXIT_CTLS_ONE_SETTING_NO_PAT		\
	(VM_EXIT_HOST_LMA		|	\
	 VM_EXIT_SAVE_EFER		|	\
	 VM_EXIT_LOAD_EFER)

#define	VM_EXIT_CTLS_ONE_SETTING		\
	(VM_EXIT_CTLS_ONE_SETTING_NO_PAT |	\
	 VM_EXIT_SAVE_PAT		|	\
	 VM_EXIT_LOAD_PAT)
#define	VM_EXIT_CTLS_ZERO_SETTING	VM_EXIT_SAVE_DEBUG_CONTROLS

#define	VM_ENTRY_CTLS_ONE_SETTING_NO_PAT	VM_ENTRY_LOAD_EFER

#define	VM_ENTRY_CTLS_ONE_SETTING		\
	(VM_ENTRY_CTLS_ONE_SETTING_NO_PAT |	\
	 VM_ENTRY_LOAD_PAT)
#define	VM_ENTRY_CTLS_ZERO_SETTING		\
	(VM_ENTRY_LOAD_DEBUG_CONTROLS	|	\
	 VM_ENTRY_INTO_SMM		|	\
	 VM_ENTRY_DEACTIVATE_DUAL_MONITOR)

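/*
 * Note: each *_ONE_SETTING macro above names control bits that must be set
 * to one for this implementation to operate, and each *_ZERO_SETTING macro
 * names bits that must be zero.  vmx_init() validates both against the VMX
 * capability MSRs via vmx_set_ctlreg().
 */
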
#define	guest_msr_rw(vmx, msr) \
	msr_bitmap_change_access((vmx)->msr_bitmap, (msr), MSR_BITMAP_ACCESS_RW)

#define	HANDLED		1
#define	UNHANDLED	0

MALLOC_DEFINE(M_VMX, "vmx", "vmx");

int vmxon_enabled[MAXCPU];
static char vmxon_region[MAXCPU][PAGE_SIZE] __aligned(PAGE_SIZE);

static uint32_t pinbased_ctls, procbased_ctls, procbased_ctls2;
static uint32_t exit_ctls, entry_ctls;

static uint64_t cr0_ones_mask, cr0_zeros_mask;
static uint64_t cr4_ones_mask, cr4_zeros_mask;

static volatile u_int nextvpid;

static int vmx_no_patmsr;

/*
 * Virtual NMI blocking conditions.
 *
 * Some processor implementations also require NMI to be blocked if
 * the STI_BLOCKING bit is set. It is possible to detect this at runtime
 * based on the (exit_reason,exit_qual) tuple being set to
 * (EXIT_REASON_INVAL_VMCS, EXIT_QUAL_NMI_WHILE_STI_BLOCKING).
 *
 * We take the easy way out and also include STI_BLOCKING as one of the
 * gating items for vNMI injection.
 */
static uint64_t nmi_blocking_bits = VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING |
				    VMCS_INTERRUPTIBILITY_NMI_BLOCKING |
				    VMCS_INTERRUPTIBILITY_STI_BLOCKING;

/*
 * Optional capabilities
 */
static int cap_halt_exit;
static int cap_pause_exit;
static int cap_unrestricted_guest;
static int cap_monitor_trap;

/* statistics */
static VMM_STAT_DEFINE(VCPU_MIGRATIONS, "vcpu migration across host cpus");
static VMM_STAT_DEFINE(VMEXIT_EXTINT, "vm exits due to external interrupt");
static VMM_STAT_DEFINE(VMEXIT_HLT_IGNORED, "number of times hlt was ignored");
static VMM_STAT_DEFINE(VMEXIT_HLT, "number of times hlt was intercepted");

#ifdef KTR
static const char *
exit_reason_to_str(int reason)
{
	static char reasonbuf[32];

	switch (reason) {
	case EXIT_REASON_EXCEPTION:
		return "exception";
	case EXIT_REASON_EXT_INTR:
		return "extint";
	case EXIT_REASON_TRIPLE_FAULT:
		return "triplefault";
	case EXIT_REASON_INIT:
		return "init";
	case EXIT_REASON_SIPI:
		return "sipi";
	case EXIT_REASON_IO_SMI:
		return "iosmi";
	case EXIT_REASON_SMI:
		return "smi";
	case EXIT_REASON_INTR_WINDOW:
		return "intrwindow";
	case EXIT_REASON_NMI_WINDOW:
		return "nmiwindow";
	case EXIT_REASON_TASK_SWITCH:
		return "taskswitch";
	case EXIT_REASON_CPUID:
		return "cpuid";
	case EXIT_REASON_GETSEC:
		return "getsec";
	case EXIT_REASON_HLT:
		return "hlt";
	case EXIT_REASON_INVD:
		return "invd";
	case EXIT_REASON_INVLPG:
		return "invlpg";
	case EXIT_REASON_RDPMC:
		return "rdpmc";
	case EXIT_REASON_RDTSC:
		return "rdtsc";
	case EXIT_REASON_RSM:
		return "rsm";
	case EXIT_REASON_VMCALL:
		return "vmcall";
	case EXIT_REASON_VMCLEAR:
		return "vmclear";
	case EXIT_REASON_VMLAUNCH:
		return "vmlaunch";
	case EXIT_REASON_VMPTRLD:
		return "vmptrld";
	case EXIT_REASON_VMPTRST:
		return "vmptrst";
	case EXIT_REASON_VMREAD:
		return "vmread";
	case EXIT_REASON_VMRESUME:
		return "vmresume";
	case EXIT_REASON_VMWRITE:
		return "vmwrite";
	case EXIT_REASON_VMXOFF:
		return "vmxoff";
	case EXIT_REASON_VMXON:
		return "vmxon";
	case EXIT_REASON_CR_ACCESS:
		return "craccess";
	case EXIT_REASON_DR_ACCESS:
		return "draccess";
	case EXIT_REASON_INOUT:
		return "inout";
	case EXIT_REASON_RDMSR:
		return "rdmsr";
	case EXIT_REASON_WRMSR:
		return "wrmsr";
	case EXIT_REASON_INVAL_VMCS:
		return "invalvmcs";
	case EXIT_REASON_INVAL_MSR:
		return "invalmsr";
	case EXIT_REASON_MWAIT:
		return "mwait";
	case EXIT_REASON_MTF:
		return "mtf";
	case EXIT_REASON_MONITOR:
		return "monitor";
	case EXIT_REASON_PAUSE:
		return "pause";
	case EXIT_REASON_MCE:
		return "mce";
	case EXIT_REASON_TPR:
		return "tpr";
	case EXIT_REASON_APIC:
		return "apic";
	case EXIT_REASON_GDTR_IDTR:
		return "gdtridtr";
	case EXIT_REASON_LDTR_TR:
		return "ldtrtr";
	case EXIT_REASON_EPT_FAULT:
		return "eptfault";
	case EXIT_REASON_EPT_MISCONFIG:
		return "eptmisconfig";
	case EXIT_REASON_INVEPT:
		return "invept";
	case EXIT_REASON_RDTSCP:
		return "rdtscp";
	case EXIT_REASON_VMX_PREEMPT:
		return "vmxpreempt";
	case EXIT_REASON_INVVPID:
		return "invvpid";
	case EXIT_REASON_WBINVD:
		return "wbinvd";
	case EXIT_REASON_XSETBV:
		return "xsetbv";
	default:
		snprintf(reasonbuf, sizeof(reasonbuf), "%d", reason);
		return (reasonbuf);
	}
}

#ifdef SETJMP_TRACE
static const char *
vmx_setjmp_rc2str(int rc)
{
	switch (rc) {
	case VMX_RETURN_DIRECT:
		return "direct";
	case VMX_RETURN_LONGJMP:
		return "longjmp";
	case VMX_RETURN_VMRESUME:
		return "vmresume";
	case VMX_RETURN_VMLAUNCH:
		return "vmlaunch";
	case VMX_RETURN_AST:
		return "ast";
	default:
		return "unknown";
	}
}

#define	SETJMP_TRACE(vmx, vcpu, vmxctx, regname)			   \
	VMM_CTR1((vmx)->vm, (vcpu), "setjmp trace " #regname " 0x%016lx", \
		 (vmxctx)->regname)

static void
vmx_setjmp_trace(struct vmx *vmx, int vcpu, struct vmxctx *vmxctx, int rc)
{
	uint64_t host_rip, host_rsp;

	if (vmxctx != &vmx->ctx[vcpu])
		panic("vmx_setjmp_trace: invalid vmxctx %p; should be %p",
			vmxctx, &vmx->ctx[vcpu]);

	VMM_CTR1((vmx)->vm, (vcpu), "vmxctx = %p", vmxctx);
	VMM_CTR2((vmx)->vm, (vcpu), "setjmp return code %s(%d)",
		 vmx_setjmp_rc2str(rc), rc);

	host_rsp = host_rip = ~0;
	vmread(VMCS_HOST_RIP, &host_rip);
	vmread(VMCS_HOST_RSP, &host_rsp);
	VMM_CTR2((vmx)->vm, (vcpu), "vmcs host_rip 0x%016lx, host_rsp 0x%016lx",
		 host_rip, host_rsp);

	SETJMP_TRACE(vmx, vcpu, vmxctx, host_r15);
	SETJMP_TRACE(vmx, vcpu, vmxctx, host_r14);
	SETJMP_TRACE(vmx, vcpu, vmxctx, host_r13);
	SETJMP_TRACE(vmx, vcpu, vmxctx, host_r12);
	SETJMP_TRACE(vmx, vcpu, vmxctx, host_rbp);
	SETJMP_TRACE(vmx, vcpu, vmxctx, host_rsp);
	SETJMP_TRACE(vmx, vcpu, vmxctx, host_rbx);
	SETJMP_TRACE(vmx, vcpu, vmxctx, host_rip);

	SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rdi);
	SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rsi);
	SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rdx);
	SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rcx);
	SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r8);
	SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r9);
	SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rax);
	SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rbx);
	SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rbp);
	SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r10);
	SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r11);
	SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r12);
	SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r13);
	SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r14);
	SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r15);
	SETJMP_TRACE(vmx, vcpu, vmxctx, guest_cr2);
}
#endif
#else
static void __inline
vmx_setjmp_trace(struct vmx *vmx, int vcpu, struct vmxctx *vmxctx, int rc)
{
	return;
}
#endif	/* KTR */

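/*
 * Adjust a cr0/cr4 value so that it satisfies the FIXED0/FIXED1 constraints
 * computed in vmx_init(): the bits in the "ones" mask are forced on and the
 * bits in the "zeros" mask are forced off.
 */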
u_long
vmx_fix_cr0(u_long cr0)
{

	return ((cr0 | cr0_ones_mask) & ~cr0_zeros_mask);
}

u_long
vmx_fix_cr4(u_long cr4)
{

	return ((cr4 | cr4_ones_mask) & ~cr4_zeros_mask);
}

static void
msr_save_area_init(struct msr_entry *g_area, int *g_count)
{
	int cnt;

	static struct msr_entry guest_msrs[] = {
		{ MSR_KGSBASE, 0, 0 },
	};

	cnt = sizeof(guest_msrs) / sizeof(guest_msrs[0]);
	if (cnt > GUEST_MSR_MAX_ENTRIES)
		panic("guest msr save area overrun");
	bcopy(guest_msrs, g_area, sizeof(guest_msrs));
	*g_count = cnt;
}

static void
vmx_disable(void *arg __unused)
{
	struct invvpid_desc invvpid_desc = { 0 };
	struct invept_desc invept_desc = { 0 };

	if (vmxon_enabled[curcpu]) {
		/*
		 * See sections 25.3.3.3 and 25.3.3.4 in Intel Vol 3b.
		 *
		 * VMXON or VMXOFF are not required to invalidate any TLB
		 * caching structures, so flush them explicitly here. This
		 * prevents potential retention of cached information in the
		 * TLB between distinct VMX episodes.
		 */
		invvpid(INVVPID_TYPE_ALL_CONTEXTS, invvpid_desc);
		invept(INVEPT_TYPE_ALL_CONTEXTS, invept_desc);
		vmxoff();
	}
	load_cr4(rcr4() & ~CR4_VMXE);
}

static int
vmx_cleanup(void)
{

	smp_rendezvous(NULL, vmx_disable, NULL, NULL);

	return (0);
}

static void
vmx_enable(void *arg __unused)
{
	int error;

	load_cr4(rcr4() | CR4_VMXE);

	*(uint32_t *)vmxon_region[curcpu] = vmx_revision();
	error = vmxon(vmxon_region[curcpu]);
	if (error == 0)
		vmxon_enabled[curcpu] = 1;
}

static int
vmx_init(void)
{
	int error;
	uint64_t fixed0, fixed1, feature_control;
	uint32_t tmp;

	/* CPUID.1:ECX[bit 5] must be 1 for processor to support VMX */
	if (!(cpu_feature2 & CPUID2_VMX)) {
		printf("vmx_init: processor does not support VMX operation\n");
		return (ENXIO);
	}

	/*
	 * Verify that MSR_IA32_FEATURE_CONTROL lock and VMXON enable bits
	 * are set (bits 0 and 2 respectively).
	 */
	feature_control = rdmsr(MSR_IA32_FEATURE_CONTROL);
	if ((feature_control & 0x5) != 0x5) {
		printf("vmx_init: VMX operation disabled by BIOS\n");
		return (ENXIO);
	}

	/* Check support for primary processor-based VM-execution controls */
	error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
			       MSR_VMX_TRUE_PROCBASED_CTLS,
			       PROCBASED_CTLS_ONE_SETTING,
			       PROCBASED_CTLS_ZERO_SETTING, &procbased_ctls);
	if (error) {
		printf("vmx_init: processor does not support desired primary "
		       "processor-based controls\n");
		return (error);
	}

	/* Clear the processor-based ctl bits that are set on demand */
	procbased_ctls &= ~PROCBASED_CTLS_WINDOW_SETTING;

	/* Check support for secondary processor-based VM-execution controls */
	error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
			       MSR_VMX_PROCBASED_CTLS2,
			       PROCBASED_CTLS2_ONE_SETTING,
			       PROCBASED_CTLS2_ZERO_SETTING, &procbased_ctls2);
	if (error) {
		printf("vmx_init: processor does not support desired secondary "
		       "processor-based controls\n");
		return (error);
	}

	/* Check support for VPID */
	error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2,
			       PROCBASED2_ENABLE_VPID, 0, &tmp);
	if (error == 0)
		procbased_ctls2 |= PROCBASED2_ENABLE_VPID;

	/* Check support for pin-based VM-execution controls */
	error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS,
			       MSR_VMX_TRUE_PINBASED_CTLS,
			       PINBASED_CTLS_ONE_SETTING,
			       PINBASED_CTLS_ZERO_SETTING, &pinbased_ctls);
	if (error) {
		printf("vmx_init: processor does not support desired "
		       "pin-based controls\n");
		return (error);
	}

	/* Check support for VM-exit controls */
	error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS, MSR_VMX_TRUE_EXIT_CTLS,
			       VM_EXIT_CTLS_ONE_SETTING,
			       VM_EXIT_CTLS_ZERO_SETTING,
			       &exit_ctls);
	if (error) {
		/* Try again without the PAT MSR bits */
		error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS,
				       MSR_VMX_TRUE_EXIT_CTLS,
				       VM_EXIT_CTLS_ONE_SETTING_NO_PAT,
				       VM_EXIT_CTLS_ZERO_SETTING,
				       &exit_ctls);
		if (error) {
			printf("vmx_init: processor does not support desired "
			       "exit controls\n");
			return (error);
		} else {
			if (bootverbose)
				printf("vmm: PAT MSR access not supported\n");
			guest_msr_valid(MSR_PAT);
			vmx_no_patmsr = 1;
		}
	}

	/* Check support for VM-entry controls */
	if (!vmx_no_patmsr) {
		error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS,
				       MSR_VMX_TRUE_ENTRY_CTLS,
				       VM_ENTRY_CTLS_ONE_SETTING,
				       VM_ENTRY_CTLS_ZERO_SETTING,
				       &entry_ctls);
	} else {
		error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS,
				       MSR_VMX_TRUE_ENTRY_CTLS,
				       VM_ENTRY_CTLS_ONE_SETTING_NO_PAT,
				       VM_ENTRY_CTLS_ZERO_SETTING,
				       &entry_ctls);
	}

	if (error) {
		printf("vmx_init: processor does not support desired "
		       "entry controls\n");
		return (error);
	}

	/*
	 * Check support for optional features by testing them
	 * as individual bits
	 */
	cap_halt_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
					MSR_VMX_TRUE_PROCBASED_CTLS,
					PROCBASED_HLT_EXITING, 0,
					&tmp) == 0);

	cap_monitor_trap = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
					MSR_VMX_PROCBASED_CTLS,
					PROCBASED_MTF, 0,
					&tmp) == 0);

	cap_pause_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
					MSR_VMX_TRUE_PROCBASED_CTLS,
					PROCBASED_PAUSE_EXITING, 0,
					&tmp) == 0);

	cap_unrestricted_guest = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
					MSR_VMX_PROCBASED_CTLS2,
					PROCBASED2_UNRESTRICTED_GUEST, 0,
					&tmp) == 0);

	/* Initialize EPT */
	error = ept_init();
	if (error) {
		printf("vmx_init: ept initialization failed (%d)\n", error);
		return (error);
	}

	/*
	 * Stash the cr0 and cr4 bits that must be fixed to 0 or 1
	 */
	fixed0 = rdmsr(MSR_VMX_CR0_FIXED0);
	fixed1 = rdmsr(MSR_VMX_CR0_FIXED1);
	cr0_ones_mask = fixed0 & fixed1;
	cr0_zeros_mask = ~fixed0 & ~fixed1;

	/*
	 * CR0_PE and CR0_PG can be set to zero in VMX non-root operation
	 * if unrestricted guest execution is allowed.
	 */
	if (cap_unrestricted_guest)
		cr0_ones_mask &= ~(CR0_PG | CR0_PE);

	/*
	 * Do not allow the guest to set CR0_NW or CR0_CD.
	 */
	cr0_zeros_mask |= (CR0_NW | CR0_CD);

	fixed0 = rdmsr(MSR_VMX_CR4_FIXED0);
	fixed1 = rdmsr(MSR_VMX_CR4_FIXED1);
	cr4_ones_mask = fixed0 & fixed1;
	cr4_zeros_mask = ~fixed0 & ~fixed1;

	/* enable VMX operation */
	smp_rendezvous(NULL, vmx_enable, NULL, NULL);

	return (0);
}

/*
 * If this processor does not support VPIDs then simply return 0.
 *
 * Otherwise generate the next value of VPID to use. Any value is alright
 * as long as it is non-zero.
 *
 * We always execute in VMX non-root context with EPT enabled. Thus all
 * combined mappings are tagged with the (EP4TA, VPID, PCID) tuple. This
 * in turn means that multiple VMs can share the same VPID as long as
 * they have distinct EPT page tables.
 *
 * XXX
 * We should optimize this so that it returns VPIDs that are not in
 * use. Then we will not unnecessarily invalidate mappings in
 * vmx_set_pcpu_defaults() just because two or more vcpus happen to
 * use the same 'vpid'.
 */
static uint16_t
vmx_vpid(void)
{
	uint16_t vpid = 0;

	if ((procbased_ctls2 & PROCBASED2_ENABLE_VPID) != 0) {
		do {
			vpid = atomic_fetchadd_int(&nextvpid, 1);
		} while (vpid == 0);
	}

	return (vpid);
}

static int
vmx_setup_cr_shadow(int which, struct vmcs *vmcs)
{
	int error, mask_ident, shadow_ident;
	uint64_t mask_value, shadow_value;

	if (which != 0 && which != 4)
		panic("vmx_setup_cr_shadow: unknown cr%d", which);

	if (which == 0) {
		mask_ident = VMCS_CR0_MASK;
		mask_value = cr0_ones_mask | cr0_zeros_mask;
		shadow_ident = VMCS_CR0_SHADOW;
		shadow_value = cr0_ones_mask;
	} else {
		mask_ident = VMCS_CR4_MASK;
		mask_value = cr4_ones_mask | cr4_zeros_mask;
		shadow_ident = VMCS_CR4_SHADOW;
		shadow_value = cr4_ones_mask;
	}

	error = vmcs_setreg(vmcs, VMCS_IDENT(mask_ident), mask_value);
	if (error)
		return (error);

	error = vmcs_setreg(vmcs, VMCS_IDENT(shadow_ident), shadow_value);
	if (error)
		return (error);

	return (0);
}
#define	vmx_setup_cr0_shadow(vmcs)	vmx_setup_cr_shadow(0, (vmcs))
#define	vmx_setup_cr4_shadow(vmcs)	vmx_setup_cr_shadow(4, (vmcs))

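/*
 * Per-VM initialization entry point of 'vmm_ops_intel'. The returned cookie
 * is the page-aligned 'struct vmx' that embeds the per-vcpu VMCSs, guest
 * register contexts and MSR save areas.
 */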
static void *
vmx_vminit(struct vm *vm)
{
	uint16_t vpid;
	int i, error, guest_msr_count;
	struct vmx *vmx;

	vmx = malloc(sizeof(struct vmx), M_VMX, M_WAITOK | M_ZERO);
	if ((uintptr_t)vmx & PAGE_MASK) {
		panic("malloc of struct vmx not aligned on %d byte boundary",
		      PAGE_SIZE);
	}
	vmx->vm = vm;

	/*
	 * Clean up EPTP-tagged guest physical and combined mappings
	 *
	 * VMX transitions are not required to invalidate any guest physical
	 * mappings. So, it may be possible for stale guest physical mappings
	 * to be present in the processor TLBs.
	 *
	 * Combined mappings for this EP4TA are also invalidated for all VPIDs.
	 */
	ept_invalidate_mappings(vtophys(vmx->pml4ept));

	msr_bitmap_initialize(vmx->msr_bitmap);

	/*
	 * It is safe to allow direct access to MSR_GSBASE and MSR_FSBASE.
	 * The guest FSBASE and GSBASE are saved and restored during
	 * vm-exit and vm-entry respectively. The host FSBASE and GSBASE are
	 * always restored from the vmcs host state area on vm-exit.
	 *
	 * Guest KGSBASE is saved and restored in the guest MSR save area.
	 * Host KGSBASE is restored before returning to userland from the pcb.
	 * There will be a window of time when we are executing in the host
	 * kernel context with a value of KGSBASE from the guest. This is ok
	 * because the value of KGSBASE is inconsequential in kernel context.
	 *
	 * MSR_EFER is saved and restored in the guest VMCS area on a
	 * VM exit and entry respectively. It is also restored from the
	 * host VMCS area on a VM exit.
	 */
	if (guest_msr_rw(vmx, MSR_GSBASE) ||
	    guest_msr_rw(vmx, MSR_FSBASE) ||
	    guest_msr_rw(vmx, MSR_KGSBASE) ||
	    guest_msr_rw(vmx, MSR_EFER))
		panic("vmx_vminit: error setting guest msr access");

	/*
	 * MSR_PAT is saved and restored in the guest VMCS area on a VM exit
	 * and entry respectively. It is also restored from the host VMCS
	 * area on a VM exit. However, if running on a system with no
	 * MSR_PAT save/restore support, leave access disabled so accesses
	 * will be trapped.
	 */
	if (!vmx_no_patmsr && guest_msr_rw(vmx, MSR_PAT))
		panic("vmx_vminit: error setting guest pat msr access");

	for (i = 0; i < VM_MAXCPU; i++) {
		vmx->vmcs[i].identifier = vmx_revision();
		error = vmclear(&vmx->vmcs[i]);
		if (error != 0) {
			panic("vmx_vminit: vmclear error %d on vcpu %d\n",
			      error, i);
		}

		vpid = vmx_vpid();

		error = vmcs_set_defaults(&vmx->vmcs[i],
					  (u_long)vmx_longjmp,
					  (u_long)&vmx->ctx[i],
					  vtophys(vmx->pml4ept),
					  pinbased_ctls,
					  procbased_ctls,
					  procbased_ctls2,
					  exit_ctls, entry_ctls,
					  vtophys(vmx->msr_bitmap),
					  vpid);

		if (error != 0)
			panic("vmx_vminit: vmcs_set_defaults error %d", error);

		vmx->cap[i].set = 0;
		vmx->cap[i].proc_ctls = procbased_ctls;

		vmx->state[i].lastcpu = -1;
		vmx->state[i].vpid = vpid;

		msr_save_area_init(vmx->guest_msrs[i], &guest_msr_count);

		error = vmcs_set_msr_save(&vmx->vmcs[i],
					  vtophys(vmx->guest_msrs[i]),
					  guest_msr_count);
		if (error != 0)
			panic("vmcs_set_msr_save error %d", error);

		error = vmx_setup_cr0_shadow(&vmx->vmcs[i]);
		if (error != 0)
			panic("vmx_setup_cr0_shadow %d", error);

		error = vmx_setup_cr4_shadow(&vmx->vmcs[i]);
		if (error != 0)
			panic("vmx_setup_cr4_shadow %d", error);
	}

	return (vmx);
}

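/*
 * CPUID exits are handled entirely in the kernel: x86_emulate_cpuid()
 * rewrites the guest's %rax, %rbx, %rcx and %rdx in place and reports
 * whether the leaf was emulated.
 */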
static int
vmx_handle_cpuid(struct vm *vm, int vcpu, struct vmxctx *vmxctx)
{
	int handled, func;

	func = vmxctx->guest_rax;

	handled = x86_emulate_cpuid(vm, vcpu,
				    (uint32_t*)(&vmxctx->guest_rax),
				    (uint32_t*)(&vmxctx->guest_rbx),
				    (uint32_t*)(&vmxctx->guest_rcx),
				    (uint32_t*)(&vmxctx->guest_rdx));
	return (handled);
}

static __inline void
vmx_run_trace(struct vmx *vmx, int vcpu)
{
#ifdef KTR
	VMM_CTR1(vmx->vm, vcpu, "Resume execution at 0x%0lx", vmcs_guest_rip());
#endif
}

static __inline void
vmx_exit_trace(struct vmx *vmx, int vcpu, uint64_t rip, uint32_t exit_reason,
	       int handled)
{
#ifdef KTR
	VMM_CTR3(vmx->vm, vcpu, "%s %s vmexit at 0x%0lx",
		 handled ? "handled" : "unhandled",
		 exit_reason_to_str(exit_reason), rip);
#endif
}

static __inline void
vmx_astpending_trace(struct vmx *vmx, int vcpu, uint64_t rip)
{
#ifdef KTR
	VMM_CTR1(vmx->vm, vcpu, "astpending vmexit at 0x%0lx", rip);
#endif
}

static int
vmx_set_pcpu_defaults(struct vmx *vmx, int vcpu)
{
	int error, lastcpu;
	struct vmxstate *vmxstate;
	struct invvpid_desc invvpid_desc = { 0 };

	vmxstate = &vmx->state[vcpu];
	lastcpu = vmxstate->lastcpu;
	vmxstate->lastcpu = curcpu;

	if (lastcpu == curcpu) {
		error = 0;
		goto done;
	}

	vmm_stat_incr(vmx->vm, vcpu, VCPU_MIGRATIONS, 1);

	error = vmwrite(VMCS_HOST_TR_BASE, vmm_get_host_trbase());
	if (error != 0)
		goto done;

	error = vmwrite(VMCS_HOST_GDTR_BASE, vmm_get_host_gdtrbase());
	if (error != 0)
		goto done;

	error = vmwrite(VMCS_HOST_GS_BASE, vmm_get_host_gsbase());
	if (error != 0)
		goto done;

	/*
	 * If we are using VPIDs then invalidate all mappings tagged with 'vpid'
	 *
	 * We do this because this vcpu was executing on a different host
	 * cpu when it last ran. We do not track whether it invalidated
	 * mappings associated with its 'vpid' during that run. So we must
	 * assume that the mappings associated with 'vpid' on 'curcpu' are
	 * stale and invalidate them.
	 *
	 * Note that we incur this penalty only when the scheduler chooses to
	 * move the thread associated with this vcpu between host cpus.
	 *
	 * Note also that this will invalidate mappings tagged with 'vpid'
	 * for "all" EP4TAs.
	 */
	if (vmxstate->vpid != 0) {
		invvpid_desc.vpid = vmxstate->vpid;
		invvpid(INVVPID_TYPE_SINGLE_CONTEXT, invvpid_desc);
	}
done:
	return (error);
}

static void
vm_exit_update_rip(struct vm_exit *vmexit)
{
	int error;

	error = vmwrite(VMCS_GUEST_RIP, vmexit->rip + vmexit->inst_length);
	if (error)
		panic("vmx_run: error %d writing to VMCS_GUEST_RIP", error);
}

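/*
 * The interrupt and NMI window exiting controls below are toggled in the
 * per-vcpu cached copy of the primary processor-based controls and then
 * written out to the VMCS immediately.
 */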
/*
 * We depend on 'procbased_ctls' to have the Interrupt Window Exiting bit set.
 */
CTASSERT((PROCBASED_CTLS_ONE_SETTING & PROCBASED_INT_WINDOW_EXITING) != 0);

static void __inline
vmx_set_int_window_exiting(struct vmx *vmx, int vcpu)
{
	int error;

	vmx->cap[vcpu].proc_ctls |= PROCBASED_INT_WINDOW_EXITING;

	error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
	if (error)
		panic("vmx_set_int_window_exiting: vmwrite error %d", error);
}

static void __inline
vmx_clear_int_window_exiting(struct vmx *vmx, int vcpu)
{
	int error;

	vmx->cap[vcpu].proc_ctls &= ~PROCBASED_INT_WINDOW_EXITING;

	error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
	if (error)
		panic("vmx_clear_int_window_exiting: vmwrite error %d", error);
}

static void __inline
vmx_set_nmi_window_exiting(struct vmx *vmx, int vcpu)
{
	int error;

	vmx->cap[vcpu].proc_ctls |= PROCBASED_NMI_WINDOW_EXITING;

	error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
	if (error)
		panic("vmx_set_nmi_window_exiting: vmwrite error %d", error);
}

static void __inline
vmx_clear_nmi_window_exiting(struct vmx *vmx, int vcpu)
{
	int error;

	vmx->cap[vcpu].proc_ctls &= ~PROCBASED_NMI_WINDOW_EXITING;

	error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
	if (error)
		panic("vmx_clear_nmi_window_exiting: vmwrite error %d", error);
}

static int
vmx_inject_nmi(struct vmx *vmx, int vcpu)
{
	int error;
	uint64_t info, interruptibility;

	/* Bail out if no NMI requested */
	if (!vm_nmi_pending(vmx->vm, vcpu))
		return (0);

	error = vmread(VMCS_GUEST_INTERRUPTIBILITY, &interruptibility);
	if (error) {
		panic("vmx_inject_nmi: vmread(interruptibility) %d",
			error);
	}
	if (interruptibility & nmi_blocking_bits)
		goto nmiblocked;

	/*
	 * Inject the virtual NMI. The vector must be the NMI IDT entry
	 * or the VMCS entry check will fail.
	 */
	info = VMCS_INTERRUPTION_INFO_NMI | VMCS_INTERRUPTION_INFO_VALID;
	info |= IDT_NMI;

	error = vmwrite(VMCS_ENTRY_INTR_INFO, info);
	if (error)
		panic("vmx_inject_nmi: vmwrite(intrinfo) %d", error);

	VMM_CTR0(vmx->vm, vcpu, "Injecting vNMI");

	/* Clear the request */
	vm_nmi_clear(vmx->vm, vcpu);
	return (1);

nmiblocked:
	/*
	 * Set the NMI Window Exiting execution control so we can inject
	 * the virtual NMI as soon as the blocking condition goes away.
	 */
	vmx_set_nmi_window_exiting(vmx, vcpu);

	VMM_CTR0(vmx->vm, vcpu, "Enabling NMI window exiting");
	return (1);
}

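/*
 * Event injection done just before every VM entry: virtual NMIs take
 * priority, followed by the highest priority vector pending in the local
 * APIC, which is injected only if RFLAGS.IF is set and the guest's
 * interruptibility state allows it.
 */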
static void
vmx_inject_interrupts(struct vmx *vmx, int vcpu)
{
	int error, vector;
	uint64_t info, rflags, interruptibility;

	const int HWINTR_BLOCKED = VMCS_INTERRUPTIBILITY_STI_BLOCKING |
				   VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING;

	/*
	 * If there is already an interrupt pending then just return.
	 *
	 * This could happen if an interrupt was injected on a prior
	 * VM entry but the actual entry into guest mode was aborted
	 * because of a pending AST.
	 */
	error = vmread(VMCS_ENTRY_INTR_INFO, &info);
	if (error)
		panic("vmx_inject_interrupts: vmread(intrinfo) %d", error);
	if (info & VMCS_INTERRUPTION_INFO_VALID)
		return;

	/*
	 * NMI injection has priority so deal with those first
	 */
	if (vmx_inject_nmi(vmx, vcpu))
		return;

	/* Ask the local apic for a vector to inject */
	vector = lapic_pending_intr(vmx->vm, vcpu);
	if (vector < 0)
		return;

	if (vector < 32 || vector > 255)
		panic("vmx_inject_interrupts: invalid vector %d\n", vector);

	/* Check RFLAGS.IF and the interruptibility state of the guest */
	error = vmread(VMCS_GUEST_RFLAGS, &rflags);
	if (error)
		panic("vmx_inject_interrupts: vmread(rflags) %d", error);

	if ((rflags & PSL_I) == 0)
		goto cantinject;

	error = vmread(VMCS_GUEST_INTERRUPTIBILITY, &interruptibility);
	if (error) {
		panic("vmx_inject_interrupts: vmread(interruptibility) %d",
		      error);
	}
	if (interruptibility & HWINTR_BLOCKED)
		goto cantinject;

	/* Inject the interrupt */
	info = VMCS_INTERRUPTION_INFO_HW_INTR | VMCS_INTERRUPTION_INFO_VALID;
	info |= vector;
	error = vmwrite(VMCS_ENTRY_INTR_INFO, info);
	if (error)
		panic("vmx_inject_interrupts: vmwrite(intrinfo) %d", error);

	/* Update the Local APIC ISR */
	lapic_intr_accepted(vmx->vm, vcpu, vector);

	VMM_CTR1(vmx->vm, vcpu, "Injecting hwintr at vector %d", vector);

	return;

cantinject:
	/*
	 * Set the Interrupt Window Exiting execution control so we can inject
	 * the interrupt as soon as the blocking condition goes away.
	 */
	vmx_set_int_window_exiting(vmx, vcpu);

	VMM_CTR0(vmx->vm, vcpu, "Enabling interrupt window exiting");
}

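/*
 * 'exitqual' is the exit qualification for a control-register access:
 * bits 3:0 hold the control register number and bits 11:8 the source gpr.
 * A zero access-type field (bits 7:4) indicates "mov to cr", the only
 * flavor handled below.
 */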
static int
vmx_emulate_cr_access(struct vmx *vmx, int vcpu, uint64_t exitqual)
{
	int error, cr, vmcs_guest_cr;
	uint64_t regval, ones_mask, zeros_mask;
	const struct vmxctx *vmxctx;

	/* We only handle mov to %cr0 or %cr4 at this time */
	if ((exitqual & 0xf0) != 0x00)
		return (UNHANDLED);

	cr = exitqual & 0xf;
	if (cr != 0 && cr != 4)
		return (UNHANDLED);

	vmxctx = &vmx->ctx[vcpu];

	/*
	 * We must use vmwrite() directly here because vmcs_setreg() will
	 * call vmclear(vmcs) as a side-effect which we certainly don't want.
	 */
	switch ((exitqual >> 8) & 0xf) {
	case 0:
		regval = vmxctx->guest_rax;
		break;
	case 1:
		regval = vmxctx->guest_rcx;
		break;
	case 2:
		regval = vmxctx->guest_rdx;
		break;
	case 3:
		regval = vmxctx->guest_rbx;
		break;
	case 4:
		error = vmread(VMCS_GUEST_RSP, &regval);
		if (error) {
			panic("vmx_emulate_cr_access: "
			      "error %d reading guest rsp", error);
		}
		break;
	case 5:
		regval = vmxctx->guest_rbp;
		break;
	case 6:
		regval = vmxctx->guest_rsi;
		break;
	case 7:
		regval = vmxctx->guest_rdi;
		break;
	case 8:
		regval = vmxctx->guest_r8;
		break;
	case 9:
		regval = vmxctx->guest_r9;
		break;
	case 10:
		regval = vmxctx->guest_r10;
		break;
	case 11:
		regval = vmxctx->guest_r11;
		break;
	case 12:
		regval = vmxctx->guest_r12;
		break;
	case 13:
		regval = vmxctx->guest_r13;
		break;
	case 14:
		regval = vmxctx->guest_r14;
		break;
	case 15:
		regval = vmxctx->guest_r15;
		break;
	}

	if (cr == 0) {
		ones_mask = cr0_ones_mask;
		zeros_mask = cr0_zeros_mask;
		vmcs_guest_cr = VMCS_GUEST_CR0;
	} else {
		ones_mask = cr4_ones_mask;
		zeros_mask = cr4_zeros_mask;
		vmcs_guest_cr = VMCS_GUEST_CR4;
	}
	regval |= ones_mask;
	regval &= ~zeros_mask;
	error = vmwrite(vmcs_guest_cr, regval);
	if (error) {
		panic("vmx_emulate_cr_access: error %d writing cr%d",
		      error, cr);
	}

	return (HANDLED);
}

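/*
 * Attempt to handle an EPT fault as an access to the virtual local APIC
 * page.  Returns HANDLED if the faulting instruction was successfully
 * emulated, UNHANDLED otherwise.
 */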
static int
vmx_lapic_fault(struct vm *vm, int cpu,
		uint64_t gpa, uint64_t rip, int inst_length,
		uint64_t cr3, uint64_t ept_qual)
{
	int read, write, handled;
	struct vie vie;

	/*
	 * For this to be a legitimate access to the local apic:
	 * - the GPA must be in the local apic page
	 * - the GPA must be aligned on a 16 byte boundary
	 */
	if (gpa < DEFAULT_APIC_BASE || gpa >= DEFAULT_APIC_BASE + PAGE_SIZE)
		return (UNHANDLED);

	if ((gpa & 0xF) != 0)
		return (UNHANDLED);

	/* EPT violation on an instruction fetch doesn't make sense here */
	if (ept_qual & EPT_VIOLATION_INST_FETCH)
		return (UNHANDLED);

	/* EPT violation must be a read fault or a write fault but not both */
	read = ept_qual & EPT_VIOLATION_DATA_READ ? 1 : 0;
	write = ept_qual & EPT_VIOLATION_DATA_WRITE ? 1 : 0;
	if ((read ^ write) == 0)
		return (UNHANDLED);

	/*
	 * The EPT violation must have been caused by accessing a guest-physical
	 * address that is a translation of a guest-linear address.
	 */
	if ((ept_qual & EPT_VIOLATION_GLA_VALID) == 0 ||
	    (ept_qual & EPT_VIOLATION_XLAT_VALID) == 0) {
		return (UNHANDLED);
	}

	/* Fetch, decode and emulate the faulting instruction */
	if (vmm_fetch_instruction(vm, rip, inst_length, cr3, &vie) != 0)
		return (UNHANDLED);

	if (vmm_decode_instruction(&vie) != 0)
		return (UNHANDLED);

	handled = lapic_mmio(vm, cpu, gpa - DEFAULT_APIC_BASE, read, &vie);

	return (handled);
}

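/*
 * Returns HANDLED if the VM exit was serviced in the kernel and guest
 * execution can simply resume.  Otherwise 'vmexit' is filled in and the
 * exit is reflected to userland for processing.
 */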
static int
vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
{
	int error, handled;
	struct vmcs *vmcs;
	struct vmxctx *vmxctx;
	uint32_t eax, ecx, edx;
	uint64_t qual, gpa, cr3, intr_info;

	handled = 0;
	vmcs = &vmx->vmcs[vcpu];
	vmxctx = &vmx->ctx[vcpu];
	qual = vmexit->u.vmx.exit_qualification;
	vmexit->exitcode = VM_EXITCODE_BOGUS;

	switch (vmexit->u.vmx.exit_reason) {
	case EXIT_REASON_CR_ACCESS:
		handled = vmx_emulate_cr_access(vmx, vcpu, qual);
		break;
	case EXIT_REASON_RDMSR:
		ecx = vmxctx->guest_rcx;
		handled = emulate_rdmsr(vmx->vm, vcpu, ecx);
		if (!handled) {
			vmexit->exitcode = VM_EXITCODE_RDMSR;
			vmexit->u.msr.code = ecx;
		}
		break;
	case EXIT_REASON_WRMSR:
		eax = vmxctx->guest_rax;
		ecx = vmxctx->guest_rcx;
		edx = vmxctx->guest_rdx;
		handled = emulate_wrmsr(vmx->vm, vcpu, ecx,
					(uint64_t)edx << 32 | eax);
		if (!handled) {
			vmexit->exitcode = VM_EXITCODE_WRMSR;
			vmexit->u.msr.code = ecx;
			vmexit->u.msr.wval = (uint64_t)edx << 32 | eax;
		}
		break;
	case EXIT_REASON_HLT:
		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_HLT, 1);
		/*
		 * If there is an event waiting to be injected then there is
		 * no need to 'hlt'.
		 */
		error = vmread(VMCS_ENTRY_INTR_INFO, &intr_info);
		if (error)
			panic("vmx_exit_process: vmread(intrinfo) %d", error);

		if (intr_info & VMCS_INTERRUPTION_INFO_VALID) {
			handled = 1;
			vmm_stat_incr(vmx->vm, vcpu, VMEXIT_HLT_IGNORED, 1);
		} else
			vmexit->exitcode = VM_EXITCODE_HLT;
		break;
	case EXIT_REASON_MTF:
		vmexit->exitcode = VM_EXITCODE_MTRAP;
		break;
	case EXIT_REASON_PAUSE:
		vmexit->exitcode = VM_EXITCODE_PAUSE;
		break;
	case EXIT_REASON_INTR_WINDOW:
		vmx_clear_int_window_exiting(vmx, vcpu);
		VMM_CTR0(vmx->vm, vcpu, "Disabling interrupt window exiting");
		/* FALLTHRU */
	case EXIT_REASON_EXT_INTR:
		/*
		 * External interrupts serve only to cause VM exits and allow
		 * the host interrupt handler to run.
		 *
		 * If this external interrupt triggers a virtual interrupt
		 * to a VM, then that state will be recorded by the
		 * host interrupt handler in the VM's softc. We will inject
		 * this virtual interrupt during the subsequent VM enter.
		 */

		/*
		 * This is special. We want to treat this as a 'handled'
		 * VM-exit but not increment the instruction pointer.
		 */
		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EXTINT, 1);
		return (1);
	case EXIT_REASON_NMI_WINDOW:
		/* Exit to allow the pending virtual NMI to be injected */
		vmx_clear_nmi_window_exiting(vmx, vcpu);
		VMM_CTR0(vmx->vm, vcpu, "Disabling NMI window exiting");
		return (1);
	case EXIT_REASON_INOUT:
		vmexit->exitcode = VM_EXITCODE_INOUT;
		vmexit->u.inout.bytes = (qual & 0x7) + 1;
		vmexit->u.inout.in = (qual & 0x8) ? 1 : 0;
		vmexit->u.inout.string = (qual & 0x10) ? 1 : 0;
		vmexit->u.inout.rep = (qual & 0x20) ? 1 : 0;
		vmexit->u.inout.port = (uint16_t)(qual >> 16);
		vmexit->u.inout.eax = (uint32_t)(vmxctx->guest_rax);
		break;
	case EXIT_REASON_CPUID:
		handled = vmx_handle_cpuid(vmx->vm, vcpu, vmxctx);
		break;
	case EXIT_REASON_EPT_FAULT:
		gpa = vmcs_gpa();
		cr3 = vmcs_guest_cr3();
		handled = vmx_lapic_fault(vmx->vm, vcpu,
					  gpa, vmexit->rip, vmexit->inst_length,
					  cr3, qual);
		if (!handled) {
			vmexit->exitcode = VM_EXITCODE_PAGING;
			vmexit->u.paging.cr3 = cr3;
			vmexit->u.paging.gpa = gpa;
			vmexit->u.paging.rwx = qual & 0x7;
		}
		break;
	default:
		break;
	}

	if (handled) {
		/*
		 * It is possible that control is returned to userland
		 * even though we were able to handle the VM exit in the
		 * kernel.
		 *
		 * In such a case we want to make sure that the userland
		 * restarts guest execution at the instruction *after*
		 * the one we just processed. Therefore we update the
		 * guest rip in the VMCS and in 'vmexit'.
		 */
		vm_exit_update_rip(vmexit);
		vmexit->rip += vmexit->inst_length;
		vmexit->inst_length = 0;

		/*
		 * Special case for spinning up an AP - exit to userspace to
		 * give the controlling process a chance to intercept and
		 * spin up a thread for the AP.
		 */
		if (vmexit->exitcode == VM_EXITCODE_SPINUP_AP)
			handled = 0;
	} else {
		if (vmexit->exitcode == VM_EXITCODE_BOGUS) {
			/*
			 * If this VM exit was not claimed by anybody then
			 * treat it as a generic VMX exit.
			 */
			vmexit->exitcode = VM_EXITCODE_VMX;
			vmexit->u.vmx.error = 0;
		} else {
			/*
			 * The exitcode and collateral have been populated.
			 * The VM exit will be processed further in userland.
			 */
		}
	}
	return (handled);
}

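/*
 * vmx_setjmp() stashes the host register state and returns VMX_RETURN_DIRECT
 * on the direct path; a subsequent VM exit longjmps back here with one of
 * the other VMX_RETURN_* codes.  That is how the single loop below covers
 * launch, resume, VM exits and pending ASTs.
 */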
static int
vmx_run(void *arg, int vcpu, register_t rip)
{
	int error, vie, rc, handled, astpending;
	uint32_t exit_reason;
	struct vmx *vmx;
	struct vmxctx *vmxctx;
	struct vmcs *vmcs;
	struct vm_exit *vmexit;

	vmx = arg;
	vmcs = &vmx->vmcs[vcpu];
	vmxctx = &vmx->ctx[vcpu];
	vmxctx->launched = 0;

	astpending = 0;
	vmexit = vm_exitinfo(vmx->vm, vcpu);

	/*
	 * XXX Can we avoid doing this every time we do a vm run?
	 */
	VMPTRLD(vmcs);

	/*
	 * XXX
	 * We do this every time because we may set up the virtual machine
	 * from a different process than the one that actually runs it.
	 *
	 * If the life of a virtual machine was spent entirely in the context
	 * of a single process we could do this once in vmcs_set_defaults().
	 */
	if ((error = vmwrite(VMCS_HOST_CR3, rcr3())) != 0)
		panic("vmx_run: error %d writing to VMCS_HOST_CR3", error);

	if ((error = vmwrite(VMCS_GUEST_RIP, rip)) != 0)
		panic("vmx_run: error %d writing to VMCS_GUEST_RIP", error);

	if ((error = vmx_set_pcpu_defaults(vmx, vcpu)) != 0)
		panic("vmx_run: error %d setting up pcpu defaults", error);

	do {
		lapic_timer_tick(vmx->vm, vcpu);
		vmx_inject_interrupts(vmx, vcpu);
		vmx_run_trace(vmx, vcpu);
		rc = vmx_setjmp(vmxctx);
#ifdef SETJMP_TRACE
		vmx_setjmp_trace(vmx, vcpu, vmxctx, rc);
#endif
		switch (rc) {
		case VMX_RETURN_DIRECT:
			if (vmxctx->launched == 0) {
				vmxctx->launched = 1;
				vmx_launch(vmxctx);
			} else
				vmx_resume(vmxctx);
			panic("vmx_launch/resume should not return");
			break;
		case VMX_RETURN_LONGJMP:
			break;			/* vm exit */
		case VMX_RETURN_AST:
			astpending = 1;
			break;
		case VMX_RETURN_VMRESUME:
			vie = vmcs_instruction_error();
			if (vmxctx->launch_error == VM_FAIL_INVALID ||
			    vie != VMRESUME_WITH_NON_LAUNCHED_VMCS) {
				printf("vmresume error %d vmcs inst error %d\n",
					vmxctx->launch_error, vie);
				goto err_exit;
			}
			vmx_launch(vmxctx);	/* try to launch the guest */
			panic("vmx_launch should not return");
			break;
		case VMX_RETURN_VMLAUNCH:
			vie = vmcs_instruction_error();
#if 1
			printf("vmlaunch error %d vmcs inst error %d\n",
				vmxctx->launch_error, vie);
#endif
			goto err_exit;
		default:
			panic("vmx_setjmp returned %d", rc);
		}

		/* enable interrupts */
		enable_intr();

		/* collect some basic information for VM exit processing */
		vmexit->rip = rip = vmcs_guest_rip();
		vmexit->inst_length = vmexit_instruction_length();
		vmexit->u.vmx.exit_reason = exit_reason = vmcs_exit_reason();
		vmexit->u.vmx.exit_qualification = vmcs_exit_qualification();

		if (astpending) {
			handled = 1;
			vmexit->inst_length = 0;
			vmexit->exitcode = VM_EXITCODE_BOGUS;
			vmx_astpending_trace(vmx, vcpu, rip);
			break;
		}

		handled = vmx_exit_process(vmx, vcpu, vmexit);
		vmx_exit_trace(vmx, vcpu, rip, exit_reason, handled);

	} while (handled);

	/*
	 * If a VM exit has been handled then the exitcode must be BOGUS.
	 * If a VM exit is not handled then the exitcode must not be BOGUS.
	 */
	if ((handled && vmexit->exitcode != VM_EXITCODE_BOGUS) ||
	    (!handled && vmexit->exitcode == VM_EXITCODE_BOGUS)) {
		panic("Mismatch between handled (%d) and exitcode (%d)",
		      handled, vmexit->exitcode);
	}

	VMM_CTR1(vmx->vm, vcpu, "goto userland: exitcode %d", vmexit->exitcode);

	/*
	 * XXX
	 * We need to do this to ensure that any VMCS state cached by the
	 * processor is flushed to memory. We need to do this in case the
	 * VM moves to a different cpu the next time it runs.
	 *
	 * Can we avoid doing this?
	 */
	VMCLEAR(vmcs);
	return (0);

err_exit:
	vmexit->exitcode = VM_EXITCODE_VMX;
	vmexit->u.vmx.exit_reason = (uint32_t)-1;
	vmexit->u.vmx.exit_qualification = (uint32_t)-1;
	vmexit->u.vmx.error = vie;
	VMCLEAR(vmcs);
	return (ENOEXEC);
}

static void
vmx_vmcleanup(void *arg)
{
	int error;
	struct vmx *vmx = arg;

	/*
	 * XXXSMP we also need to clear the VMCS active on the other vcpus.
	 */
	error = vmclear(&vmx->vmcs[0]);
	if (error != 0)
		panic("vmx_vmcleanup: vmclear error %d on vcpu 0", error);

	ept_vmcleanup(vmx);
	free(vmx, M_VMX);

	return;
}

static register_t *
vmxctx_regptr(struct vmxctx *vmxctx, int reg)
{

	switch (reg) {
	case VM_REG_GUEST_RAX:
		return (&vmxctx->guest_rax);
	case VM_REG_GUEST_RBX:
		return (&vmxctx->guest_rbx);
	case VM_REG_GUEST_RCX:
		return (&vmxctx->guest_rcx);
	case VM_REG_GUEST_RDX:
		return (&vmxctx->guest_rdx);
	case VM_REG_GUEST_RSI:
		return (&vmxctx->guest_rsi);
	case VM_REG_GUEST_RDI:
		return (&vmxctx->guest_rdi);
	case VM_REG_GUEST_RBP:
		return (&vmxctx->guest_rbp);
	case VM_REG_GUEST_R8:
		return (&vmxctx->guest_r8);
	case VM_REG_GUEST_R9:
		return (&vmxctx->guest_r9);
	case VM_REG_GUEST_R10:
		return (&vmxctx->guest_r10);
	case VM_REG_GUEST_R11:
		return (&vmxctx->guest_r11);
	case VM_REG_GUEST_R12:
		return (&vmxctx->guest_r12);
	case VM_REG_GUEST_R13:
		return (&vmxctx->guest_r13);
	case VM_REG_GUEST_R14:
		return (&vmxctx->guest_r14);
	case VM_REG_GUEST_R15:
		return (&vmxctx->guest_r15);
	default:
		break;
	}
	return (NULL);
}

static int
vmxctx_getreg(struct vmxctx *vmxctx, int reg, uint64_t *retval)
{
	register_t *regp;

	if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) {
		*retval = *regp;
		return (0);
	} else
		return (EINVAL);
}

static int
vmxctx_setreg(struct vmxctx *vmxctx, int reg, uint64_t val)
{
	register_t *regp;

	if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) {
		*regp = val;
		return (0);
	} else
		return (EINVAL);
}

static int
vmx_getreg(void *arg, int vcpu, int reg, uint64_t *retval)
{
	struct vmx *vmx = arg;

	if (vmxctx_getreg(&vmx->ctx[vcpu], reg, retval) == 0)
		return (0);

	/*
	 * If the vcpu is running then don't mess with the VMCS.
	 *
	 * vmcs_getreg will VMCLEAR the vmcs when it is done which will cause
	 * the subsequent vmlaunch/vmresume to fail.
	 */
	if (vcpu_is_running(vmx->vm, vcpu))
		panic("vmx_getreg: %s%d is running", vm_name(vmx->vm), vcpu);

	return (vmcs_getreg(&vmx->vmcs[vcpu], reg, retval));
}

static int
vmx_setreg(void *arg, int vcpu, int reg, uint64_t val)
{
	int error;
	uint64_t ctls;
	struct vmx *vmx = arg;

	/*
	 * XXX Allow caller to set contents of the guest registers saved in
	 * the 'vmxctx' even though the vcpu might be running. We need this
	 * specifically to support the rdmsr emulation that will set the
	 * %eax and %edx registers during vm exit processing.
	 */
	if (vmxctx_setreg(&vmx->ctx[vcpu], reg, val) == 0)
		return (0);

	/*
	 * If the vcpu is running then don't mess with the VMCS.
	 *
	 * vmcs_setreg will VMCLEAR the vmcs when it is done which will cause
	 * the subsequent vmlaunch/vmresume to fail.
	 */
	if (vcpu_is_running(vmx->vm, vcpu))
		panic("vmx_setreg: %s%d is running", vm_name(vmx->vm), vcpu);

	error = vmcs_setreg(&vmx->vmcs[vcpu], reg, val);

	if (error == 0) {
		/*
		 * If the "load EFER" VM-entry control is 1 then the
		 * value of EFER.LMA must be identical to the "IA-32e mode
		 * guest" bit in the VM-entry controls.
		 */
		if ((entry_ctls & VM_ENTRY_LOAD_EFER) != 0 &&
		    (reg == VM_REG_GUEST_EFER)) {
			vmcs_getreg(&vmx->vmcs[vcpu],
				    VMCS_IDENT(VMCS_ENTRY_CTLS), &ctls);
			if (val & EFER_LMA)
				ctls |= VM_ENTRY_GUEST_LMA;
			else
				ctls &= ~VM_ENTRY_GUEST_LMA;
			vmcs_setreg(&vmx->vmcs[vcpu],
				    VMCS_IDENT(VMCS_ENTRY_CTLS), ctls);
		}
	}

	return (error);
}

static int
vmx_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc)
{
	struct vmx *vmx = arg;

	return (vmcs_getdesc(&vmx->vmcs[vcpu], reg, desc));
}

static int
vmx_setdesc(void *arg, int vcpu, int reg, struct seg_desc *desc)
{
	struct vmx *vmx = arg;

	return (vmcs_setdesc(&vmx->vmcs[vcpu], reg, desc));
}

static int
vmx_inject(void *arg, int vcpu, int type, int vector, uint32_t code,
	   int code_valid)
{
	int error;
	uint64_t info;
	struct vmx *vmx = arg;
	struct vmcs *vmcs = &vmx->vmcs[vcpu];

	static uint32_t type_map[VM_EVENT_MAX] = {
		0x1,		/* VM_EVENT_NONE */
		0x0,		/* VM_HW_INTR */
		0x2,		/* VM_NMI */
		0x3,		/* VM_HW_EXCEPTION */
		0x4,		/* VM_SW_INTR */
		0x5,		/* VM_PRIV_SW_EXCEPTION */
		0x6,		/* VM_SW_EXCEPTION */
	};

	/*
	 * If there is already an exception pending to be delivered to the
	 * vcpu then just return.
	 */
	error = vmcs_getreg(vmcs, VMCS_ENTRY_INTR_INFO, &info);
	if (error)
		return (error);

	if (info & VMCS_INTERRUPTION_INFO_VALID)
		return (EAGAIN);

	info = vector | (type_map[type] << 8) | (code_valid ? 1 << 11 : 0);
	info |= VMCS_INTERRUPTION_INFO_VALID;
	error = vmcs_setreg(vmcs, VMCS_IDENT(VMCS_ENTRY_INTR_INFO), info);
	if (error != 0)
		return (error);

	if (code_valid) {
		error = vmcs_setreg(vmcs,
				    VMCS_IDENT(VMCS_ENTRY_EXCEPTION_ERROR),
				    code);
	}
	return (error);
}

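/*
 * vmx_getcap() reports whether an optional capability is supported by the
 * processor and whether it is currently enabled for the vcpu; vmx_setcap()
 * toggles the corresponding execution control in the VMCS.
 */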
static int
vmx_getcap(void *arg, int vcpu, int type, int *retval)
{
	struct vmx *vmx = arg;
	int vcap;
	int ret;

	ret = ENOENT;

	vcap = vmx->cap[vcpu].set;

	switch (type) {
	case VM_CAP_HALT_EXIT:
		if (cap_halt_exit)
			ret = 0;
		break;
	case VM_CAP_PAUSE_EXIT:
		if (cap_pause_exit)
			ret = 0;
		break;
	case VM_CAP_MTRAP_EXIT:
		if (cap_monitor_trap)
			ret = 0;
		break;
	case VM_CAP_UNRESTRICTED_GUEST:
		if (cap_unrestricted_guest)
			ret = 0;
		break;
	default:
		break;
	}

	if (ret == 0)
		*retval = (vcap & (1 << type)) ? 1 : 0;

	return (ret);
}

static int
vmx_setcap(void *arg, int vcpu, int type, int val)
{
	struct vmx *vmx = arg;
	struct vmcs *vmcs = &vmx->vmcs[vcpu];
	uint32_t baseval;
	uint32_t *pptr;
	int error;
	int flag;
	int reg;
	int retval;

	retval = ENOENT;
	pptr = NULL;

	switch (type) {
	case VM_CAP_HALT_EXIT:
		if (cap_halt_exit) {
			retval = 0;
			pptr = &vmx->cap[vcpu].proc_ctls;
			baseval = *pptr;
			flag = PROCBASED_HLT_EXITING;
			reg = VMCS_PRI_PROC_BASED_CTLS;
		}
		break;
	case VM_CAP_MTRAP_EXIT:
		if (cap_monitor_trap) {
			retval = 0;
			pptr = &vmx->cap[vcpu].proc_ctls;
			baseval = *pptr;
			flag = PROCBASED_MTF;
			reg = VMCS_PRI_PROC_BASED_CTLS;
		}
		break;
	case VM_CAP_PAUSE_EXIT:
		if (cap_pause_exit) {
			retval = 0;
			pptr = &vmx->cap[vcpu].proc_ctls;
			baseval = *pptr;
			flag = PROCBASED_PAUSE_EXITING;
			reg = VMCS_PRI_PROC_BASED_CTLS;
		}
		break;
	case VM_CAP_UNRESTRICTED_GUEST:
		if (cap_unrestricted_guest) {
			retval = 0;
			baseval = procbased_ctls2;
			flag = PROCBASED2_UNRESTRICTED_GUEST;
			reg = VMCS_SEC_PROC_BASED_CTLS;
		}
		break;
	default:
		break;
	}

	if (retval == 0) {
		if (val) {
			baseval |= flag;
		} else {
			baseval &= ~flag;
		}
		VMPTRLD(vmcs);
		error = vmwrite(reg, baseval);
		VMCLEAR(vmcs);

		if (error) {
			retval = error;
		} else {
			/*
			 * Update optional stored flags, and record
			 * setting
			 */
			if (pptr != NULL) {
				*pptr = baseval;
			}

			if (val) {
				vmx->cap[vcpu].set |= (1 << type);
			} else {
				vmx->cap[vcpu].set &= ~(1 << type);
			}
		}
	}

	return (retval);
}

struct vmm_ops vmm_ops_intel = {
	vmx_init,
	vmx_cleanup,
	vmx_vminit,
	vmx_run,
	vmx_vmcleanup,
	ept_vmmmap_set,
	ept_vmmmap_get,
	vmx_getreg,
	vmx_setreg,
	vmx_getdesc,
	vmx_setdesc,
	vmx_inject,
	vmx_getcap,
	vmx_setcap
};