vmx.c revision 243667
1/*- 2 * Copyright (c) 2011 NetApp, Inc. 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND 15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17 * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE 18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 24 * SUCH DAMAGE. 25 * 26 * $FreeBSD$ 27 */ 28 29#include <sys/cdefs.h> 30__FBSDID("$FreeBSD$"); 31 32#include <sys/param.h> 33#include <sys/systm.h> 34#include <sys/smp.h> 35#include <sys/kernel.h> 36#include <sys/malloc.h> 37#include <sys/pcpu.h> 38#include <sys/proc.h> 39 40#include <vm/vm.h> 41#include <vm/pmap.h> 42 43#include <machine/psl.h> 44#include <machine/cpufunc.h> 45#include <machine/md_var.h> 46#include <machine/pmap.h> 47#include <machine/segments.h> 48#include <machine/specialreg.h> 49#include <machine/vmparam.h> 50 51#include <x86/apicreg.h> 52 53#include <machine/vmm.h> 54#include "vmm_host.h" 55#include "vmm_lapic.h" 56#include "vmm_msr.h" 57#include "vmm_ktr.h" 58#include "vmm_stat.h" 59 60#include "vmx_msr.h" 61#include "ept.h" 62#include "vmx_cpufunc.h" 63#include "vmx.h" 64#include "x86.h" 65#include "vmx_controls.h" 66 67#define PINBASED_CTLS_ONE_SETTING \ 68 (PINBASED_EXTINT_EXITING | \ 69 PINBASED_NMI_EXITING | \ 70 PINBASED_VIRTUAL_NMI) 71#define PINBASED_CTLS_ZERO_SETTING 0 72 73#define PROCBASED_CTLS_WINDOW_SETTING \ 74 (PROCBASED_INT_WINDOW_EXITING | \ 75 PROCBASED_NMI_WINDOW_EXITING) 76 77#define PROCBASED_CTLS_ONE_SETTING \ 78 (PROCBASED_SECONDARY_CONTROLS | \ 79 PROCBASED_IO_EXITING | \ 80 PROCBASED_MSR_BITMAPS | \ 81 PROCBASED_CTLS_WINDOW_SETTING) 82#define PROCBASED_CTLS_ZERO_SETTING \ 83 (PROCBASED_CR3_LOAD_EXITING | \ 84 PROCBASED_CR3_STORE_EXITING | \ 85 PROCBASED_IO_BITMAPS) 86 87#define PROCBASED_CTLS2_ONE_SETTING PROCBASED2_ENABLE_EPT 88#define PROCBASED_CTLS2_ZERO_SETTING 0 89 90#define VM_EXIT_CTLS_ONE_SETTING_NO_PAT \ 91 (VM_EXIT_HOST_LMA | \ 92 VM_EXIT_SAVE_EFER | \ 93 VM_EXIT_LOAD_EFER) 94 95#define VM_EXIT_CTLS_ONE_SETTING \ 96 (VM_EXIT_CTLS_ONE_SETTING_NO_PAT | \ 97 VM_EXIT_SAVE_PAT | \ 98 VM_EXIT_LOAD_PAT) 99#define VM_EXIT_CTLS_ZERO_SETTING VM_EXIT_SAVE_DEBUG_CONTROLS 100 101#define VM_ENTRY_CTLS_ONE_SETTING_NO_PAT VM_ENTRY_LOAD_EFER 102 103#define VM_ENTRY_CTLS_ONE_SETTING \ 104 (VM_ENTRY_CTLS_ONE_SETTING_NO_PAT | \ 105 VM_ENTRY_LOAD_PAT) 106#define VM_ENTRY_CTLS_ZERO_SETTING \ 107 (VM_ENTRY_LOAD_DEBUG_CONTROLS | \ 108 VM_ENTRY_INTO_SMM | \ 109 VM_ENTRY_DEACTIVATE_DUAL_MONITOR) 110 111#define guest_msr_rw(vmx, msr) \ 112 msr_bitmap_change_access((vmx)->msr_bitmap, (msr), MSR_BITMAP_ACCESS_RW) 113 114#define HANDLED 1 115#define UNHANDLED 0 116 117MALLOC_DEFINE(M_VMX, "vmx", "vmx"); 118 119int vmxon_enabled[MAXCPU]; 120static char vmxon_region[MAXCPU][PAGE_SIZE] __aligned(PAGE_SIZE); 121 122static uint32_t pinbased_ctls, procbased_ctls, procbased_ctls2; 123static uint32_t exit_ctls, entry_ctls; 124 125static uint64_t cr0_ones_mask, cr0_zeros_mask; 126static uint64_t cr4_ones_mask, cr4_zeros_mask; 127 128static volatile u_int nextvpid; 129 130static int vmx_no_patmsr; 131 132/* 133 * Virtual NMI blocking conditions. 134 * 135 * Some processor implementations also require NMI to be blocked if 136 * the STI_BLOCKING bit is set. It is possible to detect this at runtime 137 * based on the (exit_reason,exit_qual) tuple being set to 138 * (EXIT_REASON_INVAL_VMCS, EXIT_QUAL_NMI_WHILE_STI_BLOCKING). 139 * 140 * We take the easy way out and also include STI_BLOCKING as one of the 141 * gating items for vNMI injection. 142 */ 143static uint64_t nmi_blocking_bits = VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING | 144 VMCS_INTERRUPTIBILITY_NMI_BLOCKING | 145 VMCS_INTERRUPTIBILITY_STI_BLOCKING; 146 147/* 148 * Optional capabilities 149 */ 150static int cap_halt_exit; 151static int cap_pause_exit; 152static int cap_unrestricted_guest; 153static int cap_monitor_trap; 154 155/* statistics */ 156static VMM_STAT_DEFINE(VCPU_MIGRATIONS, "vcpu migration across host cpus"); 157static VMM_STAT_DEFINE(VMEXIT_EXTINT, "vm exits due to external interrupt"); 158static VMM_STAT_DEFINE(VMEXIT_HLT_IGNORED, "number of times hlt was ignored"); 159static VMM_STAT_DEFINE(VMEXIT_HLT, "number of times hlt was intercepted"); 160 161#ifdef KTR 162static const char * 163exit_reason_to_str(int reason) 164{ 165 static char reasonbuf[32]; 166 167 switch (reason) { 168 case EXIT_REASON_EXCEPTION: 169 return "exception"; 170 case EXIT_REASON_EXT_INTR: 171 return "extint"; 172 case EXIT_REASON_TRIPLE_FAULT: 173 return "triplefault"; 174 case EXIT_REASON_INIT: 175 return "init"; 176 case EXIT_REASON_SIPI: 177 return "sipi"; 178 case EXIT_REASON_IO_SMI: 179 return "iosmi"; 180 case EXIT_REASON_SMI: 181 return "smi"; 182 case EXIT_REASON_INTR_WINDOW: 183 return "intrwindow"; 184 case EXIT_REASON_NMI_WINDOW: 185 return "nmiwindow"; 186 case EXIT_REASON_TASK_SWITCH: 187 return "taskswitch"; 188 case EXIT_REASON_CPUID: 189 return "cpuid"; 190 case EXIT_REASON_GETSEC: 191 return "getsec"; 192 case EXIT_REASON_HLT: 193 return "hlt"; 194 case EXIT_REASON_INVD: 195 return "invd"; 196 case EXIT_REASON_INVLPG: 197 return "invlpg"; 198 case EXIT_REASON_RDPMC: 199 return "rdpmc"; 200 case EXIT_REASON_RDTSC: 201 return "rdtsc"; 202 case EXIT_REASON_RSM: 203 return "rsm"; 204 case EXIT_REASON_VMCALL: 205 return "vmcall"; 206 case EXIT_REASON_VMCLEAR: 207 return "vmclear"; 208 case EXIT_REASON_VMLAUNCH: 209 return "vmlaunch"; 210 case EXIT_REASON_VMPTRLD: 211 return "vmptrld"; 212 case EXIT_REASON_VMPTRST: 213 return "vmptrst"; 214 case EXIT_REASON_VMREAD: 215 return "vmread"; 216 case EXIT_REASON_VMRESUME: 217 return "vmresume"; 218 case EXIT_REASON_VMWRITE: 219 return "vmwrite"; 220 case EXIT_REASON_VMXOFF: 221 return "vmxoff"; 222 case EXIT_REASON_VMXON: 223 return "vmxon"; 224 case EXIT_REASON_CR_ACCESS: 225 return "craccess"; 226 case EXIT_REASON_DR_ACCESS: 227 return "draccess"; 228 case EXIT_REASON_INOUT: 229 return "inout"; 230 case EXIT_REASON_RDMSR: 231 return "rdmsr"; 232 case EXIT_REASON_WRMSR: 233 return "wrmsr"; 234 case EXIT_REASON_INVAL_VMCS: 235 return "invalvmcs"; 236 case EXIT_REASON_INVAL_MSR: 237 return "invalmsr"; 238 case EXIT_REASON_MWAIT: 239 return "mwait"; 240 case EXIT_REASON_MTF: 241 return "mtf"; 242 case EXIT_REASON_MONITOR: 243 return "monitor"; 244 case EXIT_REASON_PAUSE: 245 return "pause"; 246 case EXIT_REASON_MCE: 247 return "mce"; 248 case EXIT_REASON_TPR: 249 return "tpr"; 250 case EXIT_REASON_APIC: 251 return "apic"; 252 case EXIT_REASON_GDTR_IDTR: 253 return "gdtridtr"; 254 case EXIT_REASON_LDTR_TR: 255 return "ldtrtr"; 256 case EXIT_REASON_EPT_FAULT: 257 return "eptfault"; 258 case EXIT_REASON_EPT_MISCONFIG: 259 return "eptmisconfig"; 260 case EXIT_REASON_INVEPT: 261 return "invept"; 262 case EXIT_REASON_RDTSCP: 263 return "rdtscp"; 264 case EXIT_REASON_VMX_PREEMPT: 265 return "vmxpreempt"; 266 case EXIT_REASON_INVVPID: 267 return "invvpid"; 268 case EXIT_REASON_WBINVD: 269 return "wbinvd"; 270 case EXIT_REASON_XSETBV: 271 return "xsetbv"; 272 default: 273 snprintf(reasonbuf, sizeof(reasonbuf), "%d", reason); 274 return (reasonbuf); 275 } 276} 277 278#ifdef SETJMP_TRACE 279static const char * 280vmx_setjmp_rc2str(int rc) 281{ 282 switch (rc) { 283 case VMX_RETURN_DIRECT: 284 return "direct"; 285 case VMX_RETURN_LONGJMP: 286 return "longjmp"; 287 case VMX_RETURN_VMRESUME: 288 return "vmresume"; 289 case VMX_RETURN_VMLAUNCH: 290 return "vmlaunch"; 291 case VMX_RETURN_AST: 292 return "ast"; 293 default: 294 return "unknown"; 295 } 296} 297 298#define SETJMP_TRACE(vmx, vcpu, vmxctx, regname) \ 299 VMM_CTR1((vmx)->vm, (vcpu), "setjmp trace " #regname " 0x%016lx", \ 300 (vmxctx)->regname) 301 302static void 303vmx_setjmp_trace(struct vmx *vmx, int vcpu, struct vmxctx *vmxctx, int rc) 304{ 305 uint64_t host_rip, host_rsp; 306 307 if (vmxctx != &vmx->ctx[vcpu]) 308 panic("vmx_setjmp_trace: invalid vmxctx %p; should be %p", 309 vmxctx, &vmx->ctx[vcpu]); 310 311 VMM_CTR1((vmx)->vm, (vcpu), "vmxctx = %p", vmxctx); 312 VMM_CTR2((vmx)->vm, (vcpu), "setjmp return code %s(%d)", 313 vmx_setjmp_rc2str(rc), rc); 314 315 host_rsp = host_rip = ~0; 316 vmread(VMCS_HOST_RIP, &host_rip); 317 vmread(VMCS_HOST_RSP, &host_rsp); 318 VMM_CTR2((vmx)->vm, (vcpu), "vmcs host_rip 0x%016lx, host_rsp 0x%016lx", 319 host_rip, host_rsp); 320 321 SETJMP_TRACE(vmx, vcpu, vmxctx, host_r15); 322 SETJMP_TRACE(vmx, vcpu, vmxctx, host_r14); 323 SETJMP_TRACE(vmx, vcpu, vmxctx, host_r13); 324 SETJMP_TRACE(vmx, vcpu, vmxctx, host_r12); 325 SETJMP_TRACE(vmx, vcpu, vmxctx, host_rbp); 326 SETJMP_TRACE(vmx, vcpu, vmxctx, host_rsp); 327 SETJMP_TRACE(vmx, vcpu, vmxctx, host_rbx); 328 SETJMP_TRACE(vmx, vcpu, vmxctx, host_rip); 329 330 SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rdi); 331 SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rsi); 332 SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rdx); 333 SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rcx); 334 SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r8); 335 SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r9); 336 SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rax); 337 SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rbx); 338 SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rbp); 339 SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r10); 340 SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r11); 341 SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r12); 342 SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r13); 343 SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r14); 344 SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r15); 345 SETJMP_TRACE(vmx, vcpu, vmxctx, guest_cr2); 346} 347#endif 348#else 349static void __inline 350vmx_setjmp_trace(struct vmx *vmx, int vcpu, struct vmxctx *vmxctx, int rc) 351{ 352 return; 353} 354#endif /* KTR */ 355 356u_long 357vmx_fix_cr0(u_long cr0) 358{ 359 360 return ((cr0 | cr0_ones_mask) & ~cr0_zeros_mask); 361} 362 363u_long 364vmx_fix_cr4(u_long cr4) 365{ 366 367 return ((cr4 | cr4_ones_mask) & ~cr4_zeros_mask); 368} 369 370static void 371msr_save_area_init(struct msr_entry *g_area, int *g_count) 372{ 373 int cnt; 374 375 static struct msr_entry guest_msrs[] = { 376 { MSR_KGSBASE, 0, 0 }, 377 }; 378 379 cnt = sizeof(guest_msrs) / sizeof(guest_msrs[0]); 380 if (cnt > GUEST_MSR_MAX_ENTRIES) 381 panic("guest msr save area overrun"); 382 bcopy(guest_msrs, g_area, sizeof(guest_msrs)); 383 *g_count = cnt; 384} 385 386static void 387vmx_disable(void *arg __unused) 388{ 389 struct invvpid_desc invvpid_desc = { 0 }; 390 struct invept_desc invept_desc = { 0 }; 391 392 if (vmxon_enabled[curcpu]) { 393 /* 394 * See sections 25.3.3.3 and 25.3.3.4 in Intel Vol 3b. 395 * 396 * VMXON or VMXOFF are not required to invalidate any TLB 397 * caching structures. This prevents potential retention of 398 * cached information in the TLB between distinct VMX episodes. 399 */ 400 invvpid(INVVPID_TYPE_ALL_CONTEXTS, invvpid_desc); 401 invept(INVEPT_TYPE_ALL_CONTEXTS, invept_desc); 402 vmxoff(); 403 } 404 load_cr4(rcr4() & ~CR4_VMXE); 405} 406 407static int 408vmx_cleanup(void) 409{ 410 411 smp_rendezvous(NULL, vmx_disable, NULL, NULL); 412 413 return (0); 414} 415 416static void 417vmx_enable(void *arg __unused) 418{ 419 int error; 420 421 load_cr4(rcr4() | CR4_VMXE); 422 423 *(uint32_t *)vmxon_region[curcpu] = vmx_revision(); 424 error = vmxon(vmxon_region[curcpu]); 425 if (error == 0) 426 vmxon_enabled[curcpu] = 1; 427} 428 429static int 430vmx_init(void) 431{ 432 int error; 433 uint64_t fixed0, fixed1, feature_control; 434 uint32_t tmp; 435 436 /* CPUID.1:ECX[bit 5] must be 1 for processor to support VMX */ 437 if (!(cpu_feature2 & CPUID2_VMX)) { 438 printf("vmx_init: processor does not support VMX operation\n"); 439 return (ENXIO); 440 } 441 442 /* 443 * Verify that MSR_IA32_FEATURE_CONTROL lock and VMXON enable bits 444 * are set (bits 0 and 2 respectively). 445 */ 446 feature_control = rdmsr(MSR_IA32_FEATURE_CONTROL); 447 if ((feature_control & 0x5) != 0x5) { 448 printf("vmx_init: VMX operation disabled by BIOS\n"); 449 return (ENXIO); 450 } 451 452 /* Check support for primary processor-based VM-execution controls */ 453 error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, 454 MSR_VMX_TRUE_PROCBASED_CTLS, 455 PROCBASED_CTLS_ONE_SETTING, 456 PROCBASED_CTLS_ZERO_SETTING, &procbased_ctls); 457 if (error) { 458 printf("vmx_init: processor does not support desired primary " 459 "processor-based controls\n"); 460 return (error); 461 } 462 463 /* Clear the processor-based ctl bits that are set on demand */ 464 procbased_ctls &= ~PROCBASED_CTLS_WINDOW_SETTING; 465 466 /* Check support for secondary processor-based VM-execution controls */ 467 error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, 468 MSR_VMX_PROCBASED_CTLS2, 469 PROCBASED_CTLS2_ONE_SETTING, 470 PROCBASED_CTLS2_ZERO_SETTING, &procbased_ctls2); 471 if (error) { 472 printf("vmx_init: processor does not support desired secondary " 473 "processor-based controls\n"); 474 return (error); 475 } 476 477 /* Check support for VPID */ 478 error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2, 479 PROCBASED2_ENABLE_VPID, 0, &tmp); 480 if (error == 0) 481 procbased_ctls2 |= PROCBASED2_ENABLE_VPID; 482 483 /* Check support for pin-based VM-execution controls */ 484 error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS, 485 MSR_VMX_TRUE_PINBASED_CTLS, 486 PINBASED_CTLS_ONE_SETTING, 487 PINBASED_CTLS_ZERO_SETTING, &pinbased_ctls); 488 if (error) { 489 printf("vmx_init: processor does not support desired " 490 "pin-based controls\n"); 491 return (error); 492 } 493 494 /* Check support for VM-exit controls */ 495 error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS, MSR_VMX_TRUE_EXIT_CTLS, 496 VM_EXIT_CTLS_ONE_SETTING, 497 VM_EXIT_CTLS_ZERO_SETTING, 498 &exit_ctls); 499 if (error) { 500 /* Try again without the PAT MSR bits */ 501 error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS, 502 MSR_VMX_TRUE_EXIT_CTLS, 503 VM_EXIT_CTLS_ONE_SETTING_NO_PAT, 504 VM_EXIT_CTLS_ZERO_SETTING, 505 &exit_ctls); 506 if (error) { 507 printf("vmx_init: processor does not support desired " 508 "exit controls\n"); 509 return (error); 510 } else { 511 if (bootverbose) 512 printf("vmm: PAT MSR access not supported\n"); 513 guest_msr_valid(MSR_PAT); 514 vmx_no_patmsr = 1; 515 } 516 } 517 518 /* Check support for VM-entry controls */ 519 if (!vmx_no_patmsr) { 520 error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS, 521 MSR_VMX_TRUE_ENTRY_CTLS, 522 VM_ENTRY_CTLS_ONE_SETTING, 523 VM_ENTRY_CTLS_ZERO_SETTING, 524 &entry_ctls); 525 } else { 526 error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS, 527 MSR_VMX_TRUE_ENTRY_CTLS, 528 VM_ENTRY_CTLS_ONE_SETTING_NO_PAT, 529 VM_ENTRY_CTLS_ZERO_SETTING, 530 &entry_ctls); 531 } 532 533 if (error) { 534 printf("vmx_init: processor does not support desired " 535 "entry controls\n"); 536 return (error); 537 } 538 539 /* 540 * Check support for optional features by testing them 541 * as individual bits 542 */ 543 cap_halt_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, 544 MSR_VMX_TRUE_PROCBASED_CTLS, 545 PROCBASED_HLT_EXITING, 0, 546 &tmp) == 0); 547 548 cap_monitor_trap = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, 549 MSR_VMX_PROCBASED_CTLS, 550 PROCBASED_MTF, 0, 551 &tmp) == 0); 552 553 cap_pause_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, 554 MSR_VMX_TRUE_PROCBASED_CTLS, 555 PROCBASED_PAUSE_EXITING, 0, 556 &tmp) == 0); 557 558 cap_unrestricted_guest = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, 559 MSR_VMX_PROCBASED_CTLS2, 560 PROCBASED2_UNRESTRICTED_GUEST, 0, 561 &tmp) == 0); 562 563 /* Initialize EPT */ 564 error = ept_init(); 565 if (error) { 566 printf("vmx_init: ept initialization failed (%d)\n", error); 567 return (error); 568 } 569 570 /* 571 * Stash the cr0 and cr4 bits that must be fixed to 0 or 1 572 */ 573 fixed0 = rdmsr(MSR_VMX_CR0_FIXED0); 574 fixed1 = rdmsr(MSR_VMX_CR0_FIXED1); 575 cr0_ones_mask = fixed0 & fixed1; 576 cr0_zeros_mask = ~fixed0 & ~fixed1; 577 578 /* 579 * CR0_PE and CR0_PG can be set to zero in VMX non-root operation 580 * if unrestricted guest execution is allowed. 581 */ 582 if (cap_unrestricted_guest) 583 cr0_ones_mask &= ~(CR0_PG | CR0_PE); 584 585 /* 586 * Do not allow the guest to set CR0_NW or CR0_CD. 587 */ 588 cr0_zeros_mask |= (CR0_NW | CR0_CD); 589 590 fixed0 = rdmsr(MSR_VMX_CR4_FIXED0); 591 fixed1 = rdmsr(MSR_VMX_CR4_FIXED1); 592 cr4_ones_mask = fixed0 & fixed1; 593 cr4_zeros_mask = ~fixed0 & ~fixed1; 594 595 /* enable VMX operation */ 596 smp_rendezvous(NULL, vmx_enable, NULL, NULL); 597 598 return (0); 599} 600 601/* 602 * If this processor does not support VPIDs then simply return 0. 603 * 604 * Otherwise generate the next value of VPID to use. Any value is alright 605 * as long as it is non-zero. 606 * 607 * We always execute in VMX non-root context with EPT enabled. Thus all 608 * combined mappings are tagged with the (EP4TA, VPID, PCID) tuple. This 609 * in turn means that multiple VMs can share the same VPID as long as 610 * they have distinct EPT page tables. 611 * 612 * XXX 613 * We should optimize this so that it returns VPIDs that are not in 614 * use. Then we will not unnecessarily invalidate mappings in 615 * vmx_set_pcpu_defaults() just because two or more vcpus happen to 616 * use the same 'vpid'. 617 */ 618static uint16_t 619vmx_vpid(void) 620{ 621 uint16_t vpid = 0; 622 623 if ((procbased_ctls2 & PROCBASED2_ENABLE_VPID) != 0) { 624 do { 625 vpid = atomic_fetchadd_int(&nextvpid, 1); 626 } while (vpid == 0); 627 } 628 629 return (vpid); 630} 631 632static int 633vmx_setup_cr_shadow(int which, struct vmcs *vmcs) 634{ 635 int error, mask_ident, shadow_ident; 636 uint64_t mask_value, shadow_value; 637 638 if (which != 0 && which != 4) 639 panic("vmx_setup_cr_shadow: unknown cr%d", which); 640 641 if (which == 0) { 642 mask_ident = VMCS_CR0_MASK; 643 mask_value = cr0_ones_mask | cr0_zeros_mask; 644 shadow_ident = VMCS_CR0_SHADOW; 645 shadow_value = cr0_ones_mask; 646 } else { 647 mask_ident = VMCS_CR4_MASK; 648 mask_value = cr4_ones_mask | cr4_zeros_mask; 649 shadow_ident = VMCS_CR4_SHADOW; 650 shadow_value = cr4_ones_mask; 651 } 652 653 error = vmcs_setreg(vmcs, VMCS_IDENT(mask_ident), mask_value); 654 if (error) 655 return (error); 656 657 error = vmcs_setreg(vmcs, VMCS_IDENT(shadow_ident), shadow_value); 658 if (error) 659 return (error); 660 661 return (0); 662} 663#define vmx_setup_cr0_shadow(vmcs) vmx_setup_cr_shadow(0, (vmcs)) 664#define vmx_setup_cr4_shadow(vmcs) vmx_setup_cr_shadow(4, (vmcs)) 665 666static void * 667vmx_vminit(struct vm *vm) 668{ 669 uint16_t vpid; 670 int i, error, guest_msr_count; 671 struct vmx *vmx; 672 673 vmx = malloc(sizeof(struct vmx), M_VMX, M_WAITOK | M_ZERO); 674 if ((uintptr_t)vmx & PAGE_MASK) { 675 panic("malloc of struct vmx not aligned on %d byte boundary", 676 PAGE_SIZE); 677 } 678 vmx->vm = vm; 679 680 /* 681 * Clean up EPTP-tagged guest physical and combined mappings 682 * 683 * VMX transitions are not required to invalidate any guest physical 684 * mappings. So, it may be possible for stale guest physical mappings 685 * to be present in the processor TLBs. 686 * 687 * Combined mappings for this EP4TA are also invalidated for all VPIDs. 688 */ 689 ept_invalidate_mappings(vtophys(vmx->pml4ept)); 690 691 msr_bitmap_initialize(vmx->msr_bitmap); 692 693 /* 694 * It is safe to allow direct access to MSR_GSBASE and MSR_FSBASE. 695 * The guest FSBASE and GSBASE are saved and restored during 696 * vm-exit and vm-entry respectively. The host FSBASE and GSBASE are 697 * always restored from the vmcs host state area on vm-exit. 698 * 699 * Guest KGSBASE is saved and restored in the guest MSR save area. 700 * Host KGSBASE is restored before returning to userland from the pcb. 701 * There will be a window of time when we are executing in the host 702 * kernel context with a value of KGSBASE from the guest. This is ok 703 * because the value of KGSBASE is inconsequential in kernel context. 704 * 705 * MSR_EFER is saved and restored in the guest VMCS area on a 706 * VM exit and entry respectively. It is also restored from the 707 * host VMCS area on a VM exit. 708 */ 709 if (guest_msr_rw(vmx, MSR_GSBASE) || 710 guest_msr_rw(vmx, MSR_FSBASE) || 711 guest_msr_rw(vmx, MSR_KGSBASE) || 712 guest_msr_rw(vmx, MSR_EFER)) 713 panic("vmx_vminit: error setting guest msr access"); 714 715 /* 716 * MSR_PAT is saved and restored in the guest VMCS are on a VM exit 717 * and entry respectively. It is also restored from the host VMCS 718 * area on a VM exit. However, if running on a system with no 719 * MSR_PAT save/restore support, leave access disabled so accesses 720 * will be trapped. 721 */ 722 if (!vmx_no_patmsr && guest_msr_rw(vmx, MSR_PAT)) 723 panic("vmx_vminit: error setting guest pat msr access"); 724 725 for (i = 0; i < VM_MAXCPU; i++) { 726 vmx->vmcs[i].identifier = vmx_revision(); 727 error = vmclear(&vmx->vmcs[i]); 728 if (error != 0) { 729 panic("vmx_vminit: vmclear error %d on vcpu %d\n", 730 error, i); 731 } 732 733 vpid = vmx_vpid(); 734 735 error = vmcs_set_defaults(&vmx->vmcs[i], 736 (u_long)vmx_longjmp, 737 (u_long)&vmx->ctx[i], 738 vtophys(vmx->pml4ept), 739 pinbased_ctls, 740 procbased_ctls, 741 procbased_ctls2, 742 exit_ctls, entry_ctls, 743 vtophys(vmx->msr_bitmap), 744 vpid); 745 746 if (error != 0) 747 panic("vmx_vminit: vmcs_set_defaults error %d", error); 748 749 vmx->cap[i].set = 0; 750 vmx->cap[i].proc_ctls = procbased_ctls; 751 752 vmx->state[i].lastcpu = -1; 753 vmx->state[i].vpid = vpid; 754 755 msr_save_area_init(vmx->guest_msrs[i], &guest_msr_count); 756 757 error = vmcs_set_msr_save(&vmx->vmcs[i], 758 vtophys(vmx->guest_msrs[i]), 759 guest_msr_count); 760 if (error != 0) 761 panic("vmcs_set_msr_save error %d", error); 762 763 error = vmx_setup_cr0_shadow(&vmx->vmcs[i]); 764 if (error != 0) 765 panic("vmx_setup_cr0_shadow %d", error); 766 767 error = vmx_setup_cr4_shadow(&vmx->vmcs[i]); 768 if (error != 0) 769 panic("vmx_setup_cr4_shadow %d", error); 770 } 771 772 return (vmx); 773} 774 775static int 776vmx_handle_cpuid(struct vm *vm, int vcpu, struct vmxctx *vmxctx) 777{ 778 int handled, func; 779 780 func = vmxctx->guest_rax; 781 782 handled = x86_emulate_cpuid(vm, vcpu, 783 (uint32_t*)(&vmxctx->guest_rax), 784 (uint32_t*)(&vmxctx->guest_rbx), 785 (uint32_t*)(&vmxctx->guest_rcx), 786 (uint32_t*)(&vmxctx->guest_rdx)); 787 return (handled); 788} 789 790static __inline void 791vmx_run_trace(struct vmx *vmx, int vcpu) 792{ 793#ifdef KTR 794 VMM_CTR1(vmx->vm, vcpu, "Resume execution at 0x%0lx", vmcs_guest_rip()); 795#endif 796} 797 798static __inline void 799vmx_exit_trace(struct vmx *vmx, int vcpu, uint64_t rip, uint32_t exit_reason, 800 int handled) 801{ 802#ifdef KTR 803 VMM_CTR3(vmx->vm, vcpu, "%s %s vmexit at 0x%0lx", 804 handled ? "handled" : "unhandled", 805 exit_reason_to_str(exit_reason), rip); 806#endif 807} 808 809static __inline void 810vmx_astpending_trace(struct vmx *vmx, int vcpu, uint64_t rip) 811{ 812#ifdef KTR 813 VMM_CTR1(vmx->vm, vcpu, "astpending vmexit at 0x%0lx", rip); 814#endif 815} 816 817static int 818vmx_set_pcpu_defaults(struct vmx *vmx, int vcpu) 819{ 820 int error, lastcpu; 821 struct vmxstate *vmxstate; 822 struct invvpid_desc invvpid_desc = { 0 }; 823 824 vmxstate = &vmx->state[vcpu]; 825 lastcpu = vmxstate->lastcpu; 826 vmxstate->lastcpu = curcpu; 827 828 if (lastcpu == curcpu) { 829 error = 0; 830 goto done; 831 } 832 833 vmm_stat_incr(vmx->vm, vcpu, VCPU_MIGRATIONS, 1); 834 835 error = vmwrite(VMCS_HOST_TR_BASE, vmm_get_host_trbase()); 836 if (error != 0) 837 goto done; 838 839 error = vmwrite(VMCS_HOST_GDTR_BASE, vmm_get_host_gdtrbase()); 840 if (error != 0) 841 goto done; 842 843 error = vmwrite(VMCS_HOST_GS_BASE, vmm_get_host_gsbase()); 844 if (error != 0) 845 goto done; 846 847 /* 848 * If we are using VPIDs then invalidate all mappings tagged with 'vpid' 849 * 850 * We do this because this vcpu was executing on a different host 851 * cpu when it last ran. We do not track whether it invalidated 852 * mappings associated with its 'vpid' during that run. So we must 853 * assume that the mappings associated with 'vpid' on 'curcpu' are 854 * stale and invalidate them. 855 * 856 * Note that we incur this penalty only when the scheduler chooses to 857 * move the thread associated with this vcpu between host cpus. 858 * 859 * Note also that this will invalidate mappings tagged with 'vpid' 860 * for "all" EP4TAs. 861 */ 862 if (vmxstate->vpid != 0) { 863 invvpid_desc.vpid = vmxstate->vpid; 864 invvpid(INVVPID_TYPE_SINGLE_CONTEXT, invvpid_desc); 865 } 866done: 867 return (error); 868} 869 870static void 871vm_exit_update_rip(struct vm_exit *vmexit) 872{ 873 int error; 874 875 error = vmwrite(VMCS_GUEST_RIP, vmexit->rip + vmexit->inst_length); 876 if (error) 877 panic("vmx_run: error %d writing to VMCS_GUEST_RIP", error); 878} 879 880/* 881 * We depend on 'procbased_ctls' to have the Interrupt Window Exiting bit set. 882 */ 883CTASSERT((PROCBASED_CTLS_ONE_SETTING & PROCBASED_INT_WINDOW_EXITING) != 0); 884 885static void __inline 886vmx_set_int_window_exiting(struct vmx *vmx, int vcpu) 887{ 888 int error; 889 890 vmx->cap[vcpu].proc_ctls |= PROCBASED_INT_WINDOW_EXITING; 891 892 error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); 893 if (error) 894 panic("vmx_set_int_window_exiting: vmwrite error %d", error); 895} 896 897static void __inline 898vmx_clear_int_window_exiting(struct vmx *vmx, int vcpu) 899{ 900 int error; 901 902 vmx->cap[vcpu].proc_ctls &= ~PROCBASED_INT_WINDOW_EXITING; 903 904 error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); 905 if (error) 906 panic("vmx_clear_int_window_exiting: vmwrite error %d", error); 907} 908 909static void __inline 910vmx_set_nmi_window_exiting(struct vmx *vmx, int vcpu) 911{ 912 int error; 913 914 vmx->cap[vcpu].proc_ctls |= PROCBASED_NMI_WINDOW_EXITING; 915 916 error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); 917 if (error) 918 panic("vmx_set_nmi_window_exiting: vmwrite error %d", error); 919} 920 921static void __inline 922vmx_clear_nmi_window_exiting(struct vmx *vmx, int vcpu) 923{ 924 int error; 925 926 vmx->cap[vcpu].proc_ctls &= ~PROCBASED_NMI_WINDOW_EXITING; 927 928 error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); 929 if (error) 930 panic("vmx_clear_nmi_window_exiting: vmwrite error %d", error); 931} 932 933static int 934vmx_inject_nmi(struct vmx *vmx, int vcpu) 935{ 936 int error; 937 uint64_t info, interruptibility; 938 939 /* Bail out if no NMI requested */ 940 if (!vm_nmi_pending(vmx->vm, vcpu)) 941 return (0); 942 943 error = vmread(VMCS_GUEST_INTERRUPTIBILITY, &interruptibility); 944 if (error) { 945 panic("vmx_inject_nmi: vmread(interruptibility) %d", 946 error); 947 } 948 if (interruptibility & nmi_blocking_bits) 949 goto nmiblocked; 950 951 /* 952 * Inject the virtual NMI. The vector must be the NMI IDT entry 953 * or the VMCS entry check will fail. 954 */ 955 info = VMCS_INTERRUPTION_INFO_NMI | VMCS_INTERRUPTION_INFO_VALID; 956 info |= IDT_NMI; 957 958 error = vmwrite(VMCS_ENTRY_INTR_INFO, info); 959 if (error) 960 panic("vmx_inject_nmi: vmwrite(intrinfo) %d", error); 961 962 VMM_CTR0(vmx->vm, vcpu, "Injecting vNMI"); 963 964 /* Clear the request */ 965 vm_nmi_clear(vmx->vm, vcpu); 966 return (1); 967 968nmiblocked: 969 /* 970 * Set the NMI Window Exiting execution control so we can inject 971 * the virtual NMI as soon as blocking condition goes away. 972 */ 973 vmx_set_nmi_window_exiting(vmx, vcpu); 974 975 VMM_CTR0(vmx->vm, vcpu, "Enabling NMI window exiting"); 976 return (1); 977} 978 979static void 980vmx_inject_interrupts(struct vmx *vmx, int vcpu) 981{ 982 int error, vector; 983 uint64_t info, rflags, interruptibility; 984 985 const int HWINTR_BLOCKED = VMCS_INTERRUPTIBILITY_STI_BLOCKING | 986 VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING; 987 988 /* 989 * If there is already an interrupt pending then just return. 990 * 991 * This could happen if an interrupt was injected on a prior 992 * VM entry but the actual entry into guest mode was aborted 993 * because of a pending AST. 994 */ 995 error = vmread(VMCS_ENTRY_INTR_INFO, &info); 996 if (error) 997 panic("vmx_inject_interrupts: vmread(intrinfo) %d", error); 998 if (info & VMCS_INTERRUPTION_INFO_VALID) 999 return; 1000 1001 /* 1002 * NMI injection has priority so deal with those first 1003 */ 1004 if (vmx_inject_nmi(vmx, vcpu)) 1005 return; 1006 1007 /* Ask the local apic for a vector to inject */ 1008 vector = lapic_pending_intr(vmx->vm, vcpu); 1009 if (vector < 0) 1010 return; 1011 1012 if (vector < 32 || vector > 255) 1013 panic("vmx_inject_interrupts: invalid vector %d\n", vector); 1014 1015 /* Check RFLAGS.IF and the interruptibility state of the guest */ 1016 error = vmread(VMCS_GUEST_RFLAGS, &rflags); 1017 if (error) 1018 panic("vmx_inject_interrupts: vmread(rflags) %d", error); 1019 1020 if ((rflags & PSL_I) == 0) 1021 goto cantinject; 1022 1023 error = vmread(VMCS_GUEST_INTERRUPTIBILITY, &interruptibility); 1024 if (error) { 1025 panic("vmx_inject_interrupts: vmread(interruptibility) %d", 1026 error); 1027 } 1028 if (interruptibility & HWINTR_BLOCKED) 1029 goto cantinject; 1030 1031 /* Inject the interrupt */ 1032 info = VMCS_INTERRUPTION_INFO_HW_INTR | VMCS_INTERRUPTION_INFO_VALID; 1033 info |= vector; 1034 error = vmwrite(VMCS_ENTRY_INTR_INFO, info); 1035 if (error) 1036 panic("vmx_inject_interrupts: vmwrite(intrinfo) %d", error); 1037 1038 /* Update the Local APIC ISR */ 1039 lapic_intr_accepted(vmx->vm, vcpu, vector); 1040 1041 VMM_CTR1(vmx->vm, vcpu, "Injecting hwintr at vector %d", vector); 1042 1043 return; 1044 1045cantinject: 1046 /* 1047 * Set the Interrupt Window Exiting execution control so we can inject 1048 * the interrupt as soon as blocking condition goes away. 1049 */ 1050 vmx_set_int_window_exiting(vmx, vcpu); 1051 1052 VMM_CTR0(vmx->vm, vcpu, "Enabling interrupt window exiting"); 1053} 1054 1055static int 1056vmx_emulate_cr_access(struct vmx *vmx, int vcpu, uint64_t exitqual) 1057{ 1058 int error, cr, vmcs_guest_cr; 1059 uint64_t regval, ones_mask, zeros_mask; 1060 const struct vmxctx *vmxctx; 1061 1062 /* We only handle mov to %cr0 or %cr4 at this time */ 1063 if ((exitqual & 0xf0) != 0x00) 1064 return (UNHANDLED); 1065 1066 cr = exitqual & 0xf; 1067 if (cr != 0 && cr != 4) 1068 return (UNHANDLED); 1069 1070 vmxctx = &vmx->ctx[vcpu]; 1071 1072 /* 1073 * We must use vmwrite() directly here because vmcs_setreg() will 1074 * call vmclear(vmcs) as a side-effect which we certainly don't want. 1075 */ 1076 switch ((exitqual >> 8) & 0xf) { 1077 case 0: 1078 regval = vmxctx->guest_rax; 1079 break; 1080 case 1: 1081 regval = vmxctx->guest_rcx; 1082 break; 1083 case 2: 1084 regval = vmxctx->guest_rdx; 1085 break; 1086 case 3: 1087 regval = vmxctx->guest_rbx; 1088 break; 1089 case 4: 1090 error = vmread(VMCS_GUEST_RSP, ®val); 1091 if (error) { 1092 panic("vmx_emulate_cr_access: " 1093 "error %d reading guest rsp", error); 1094 } 1095 break; 1096 case 5: 1097 regval = vmxctx->guest_rbp; 1098 break; 1099 case 6: 1100 regval = vmxctx->guest_rsi; 1101 break; 1102 case 7: 1103 regval = vmxctx->guest_rdi; 1104 break; 1105 case 8: 1106 regval = vmxctx->guest_r8; 1107 break; 1108 case 9: 1109 regval = vmxctx->guest_r9; 1110 break; 1111 case 10: 1112 regval = vmxctx->guest_r10; 1113 break; 1114 case 11: 1115 regval = vmxctx->guest_r11; 1116 break; 1117 case 12: 1118 regval = vmxctx->guest_r12; 1119 break; 1120 case 13: 1121 regval = vmxctx->guest_r13; 1122 break; 1123 case 14: 1124 regval = vmxctx->guest_r14; 1125 break; 1126 case 15: 1127 regval = vmxctx->guest_r15; 1128 break; 1129 } 1130 1131 if (cr == 0) { 1132 ones_mask = cr0_ones_mask; 1133 zeros_mask = cr0_zeros_mask; 1134 vmcs_guest_cr = VMCS_GUEST_CR0; 1135 } else { 1136 ones_mask = cr4_ones_mask; 1137 zeros_mask = cr4_zeros_mask; 1138 vmcs_guest_cr = VMCS_GUEST_CR4; 1139 } 1140 regval |= ones_mask; 1141 regval &= ~zeros_mask; 1142 error = vmwrite(vmcs_guest_cr, regval); 1143 if (error) { 1144 panic("vmx_emulate_cr_access: error %d writing cr%d", 1145 error, cr); 1146 } 1147 1148 return (HANDLED); 1149} 1150 1151static int 1152vmx_ept_fault(struct vm *vm, int cpu, 1153 uint64_t gla, uint64_t gpa, uint64_t rip, int inst_length, 1154 uint64_t cr3, uint64_t ept_qual, struct vie *vie) 1155{ 1156 int read, write, error; 1157 1158 /* EPT violation on an instruction fetch doesn't make sense here */ 1159 if (ept_qual & EPT_VIOLATION_INST_FETCH) 1160 return (UNHANDLED); 1161 1162 /* EPT violation must be a read fault or a write fault */ 1163 read = ept_qual & EPT_VIOLATION_DATA_READ ? 1 : 0; 1164 write = ept_qual & EPT_VIOLATION_DATA_WRITE ? 1 : 0; 1165 if ((read | write) == 0) 1166 return (UNHANDLED); 1167 1168 /* 1169 * The EPT violation must have been caused by accessing a 1170 * guest-physical address that is a translation of a guest-linear 1171 * address. 1172 */ 1173 if ((ept_qual & EPT_VIOLATION_GLA_VALID) == 0 || 1174 (ept_qual & EPT_VIOLATION_XLAT_VALID) == 0) { 1175 return (UNHANDLED); 1176 } 1177 1178 /* Fetch, decode and emulate the faulting instruction */ 1179 if (vmm_fetch_instruction(vm, cpu, rip, inst_length, cr3, vie) != 0) 1180 return (UNHANDLED); 1181 1182 if (vmm_decode_instruction(vm, cpu, gla, vie) != 0) 1183 return (UNHANDLED); 1184 1185 /* 1186 * Check if this is a local apic access 1187 */ 1188 if (gpa < DEFAULT_APIC_BASE || gpa >= DEFAULT_APIC_BASE + PAGE_SIZE) 1189 return (UNHANDLED); 1190 1191 error = vmm_emulate_instruction(vm, cpu, gpa, vie, 1192 lapic_mmio_read, lapic_mmio_write, 0); 1193 1194 return (error ? UNHANDLED : HANDLED); 1195} 1196 1197static int 1198vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) 1199{ 1200 int error, handled; 1201 struct vmcs *vmcs; 1202 struct vmxctx *vmxctx; 1203 uint32_t eax, ecx, edx; 1204 uint64_t qual, gla, gpa, cr3, intr_info; 1205 1206 handled = 0; 1207 vmcs = &vmx->vmcs[vcpu]; 1208 vmxctx = &vmx->ctx[vcpu]; 1209 qual = vmexit->u.vmx.exit_qualification; 1210 vmexit->exitcode = VM_EXITCODE_BOGUS; 1211 1212 switch (vmexit->u.vmx.exit_reason) { 1213 case EXIT_REASON_CR_ACCESS: 1214 handled = vmx_emulate_cr_access(vmx, vcpu, qual); 1215 break; 1216 case EXIT_REASON_RDMSR: 1217 ecx = vmxctx->guest_rcx; 1218 error = emulate_rdmsr(vmx->vm, vcpu, ecx); 1219 if (error) { 1220 vmexit->exitcode = VM_EXITCODE_RDMSR; 1221 vmexit->u.msr.code = ecx; 1222 } else 1223 handled = 1; 1224 break; 1225 case EXIT_REASON_WRMSR: 1226 eax = vmxctx->guest_rax; 1227 ecx = vmxctx->guest_rcx; 1228 edx = vmxctx->guest_rdx; 1229 error = emulate_wrmsr(vmx->vm, vcpu, ecx, 1230 (uint64_t)edx << 32 | eax); 1231 if (error) { 1232 vmexit->exitcode = VM_EXITCODE_WRMSR; 1233 vmexit->u.msr.code = ecx; 1234 vmexit->u.msr.wval = (uint64_t)edx << 32 | eax; 1235 } else 1236 handled = 1; 1237 break; 1238 case EXIT_REASON_HLT: 1239 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_HLT, 1); 1240 /* 1241 * If there is an event waiting to be injected then there is 1242 * no need to 'hlt'. 1243 */ 1244 error = vmread(VMCS_ENTRY_INTR_INFO, &intr_info); 1245 if (error) 1246 panic("vmx_exit_process: vmread(intrinfo) %d", error); 1247 1248 if (intr_info & VMCS_INTERRUPTION_INFO_VALID) { 1249 handled = 1; 1250 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_HLT_IGNORED, 1); 1251 } else 1252 vmexit->exitcode = VM_EXITCODE_HLT; 1253 break; 1254 case EXIT_REASON_MTF: 1255 vmexit->exitcode = VM_EXITCODE_MTRAP; 1256 break; 1257 case EXIT_REASON_PAUSE: 1258 vmexit->exitcode = VM_EXITCODE_PAUSE; 1259 break; 1260 case EXIT_REASON_INTR_WINDOW: 1261 vmx_clear_int_window_exiting(vmx, vcpu); 1262 VMM_CTR0(vmx->vm, vcpu, "Disabling interrupt window exiting"); 1263 /* FALLTHRU */ 1264 case EXIT_REASON_EXT_INTR: 1265 /* 1266 * External interrupts serve only to cause VM exits and allow 1267 * the host interrupt handler to run. 1268 * 1269 * If this external interrupt triggers a virtual interrupt 1270 * to a VM, then that state will be recorded by the 1271 * host interrupt handler in the VM's softc. We will inject 1272 * this virtual interrupt during the subsequent VM enter. 1273 */ 1274 1275 /* 1276 * This is special. We want to treat this as an 'handled' 1277 * VM-exit but not increment the instruction pointer. 1278 */ 1279 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EXTINT, 1); 1280 return (1); 1281 case EXIT_REASON_NMI_WINDOW: 1282 /* Exit to allow the pending virtual NMI to be injected */ 1283 vmx_clear_nmi_window_exiting(vmx, vcpu); 1284 VMM_CTR0(vmx->vm, vcpu, "Disabling NMI window exiting"); 1285 return (1); 1286 case EXIT_REASON_INOUT: 1287 vmexit->exitcode = VM_EXITCODE_INOUT; 1288 vmexit->u.inout.bytes = (qual & 0x7) + 1; 1289 vmexit->u.inout.in = (qual & 0x8) ? 1 : 0; 1290 vmexit->u.inout.string = (qual & 0x10) ? 1 : 0; 1291 vmexit->u.inout.rep = (qual & 0x20) ? 1 : 0; 1292 vmexit->u.inout.port = (uint16_t)(qual >> 16); 1293 vmexit->u.inout.eax = (uint32_t)(vmxctx->guest_rax); 1294 break; 1295 case EXIT_REASON_CPUID: 1296 handled = vmx_handle_cpuid(vmx->vm, vcpu, vmxctx); 1297 break; 1298 case EXIT_REASON_EPT_FAULT: 1299 gla = vmcs_gla(); 1300 gpa = vmcs_gpa(); 1301 cr3 = vmcs_guest_cr3(); 1302 handled = vmx_ept_fault(vmx->vm, vcpu, gla, gpa, 1303 vmexit->rip, vmexit->inst_length, 1304 cr3, qual, &vmexit->u.paging.vie); 1305 if (!handled) { 1306 vmexit->exitcode = VM_EXITCODE_PAGING; 1307 vmexit->u.paging.gpa = gpa; 1308 } 1309 break; 1310 default: 1311 break; 1312 } 1313 1314 if (handled) { 1315 /* 1316 * It is possible that control is returned to userland 1317 * even though we were able to handle the VM exit in the 1318 * kernel. 1319 * 1320 * In such a case we want to make sure that the userland 1321 * restarts guest execution at the instruction *after* 1322 * the one we just processed. Therefore we update the 1323 * guest rip in the VMCS and in 'vmexit'. 1324 */ 1325 vm_exit_update_rip(vmexit); 1326 vmexit->rip += vmexit->inst_length; 1327 vmexit->inst_length = 0; 1328 1329 /* 1330 * Special case for spinning up an AP - exit to userspace to 1331 * give the controlling process a chance to intercept and 1332 * spin up a thread for the AP. 1333 */ 1334 if (vmexit->exitcode == VM_EXITCODE_SPINUP_AP) 1335 handled = 0; 1336 } else { 1337 if (vmexit->exitcode == VM_EXITCODE_BOGUS) { 1338 /* 1339 * If this VM exit was not claimed by anybody then 1340 * treat it as a generic VMX exit. 1341 */ 1342 vmexit->exitcode = VM_EXITCODE_VMX; 1343 vmexit->u.vmx.error = 0; 1344 } else { 1345 /* 1346 * The exitcode and collateral have been populated. 1347 * The VM exit will be processed further in userland. 1348 */ 1349 } 1350 } 1351 return (handled); 1352} 1353 1354static int 1355vmx_run(void *arg, int vcpu, register_t rip) 1356{ 1357 int error, vie, rc, handled, astpending; 1358 uint32_t exit_reason; 1359 struct vmx *vmx; 1360 struct vmxctx *vmxctx; 1361 struct vmcs *vmcs; 1362 struct vm_exit *vmexit; 1363 1364 vmx = arg; 1365 vmcs = &vmx->vmcs[vcpu]; 1366 vmxctx = &vmx->ctx[vcpu]; 1367 vmxctx->launched = 0; 1368 1369 astpending = 0; 1370 vmexit = vm_exitinfo(vmx->vm, vcpu); 1371 1372 /* 1373 * XXX Can we avoid doing this every time we do a vm run? 1374 */ 1375 VMPTRLD(vmcs); 1376 1377 /* 1378 * XXX 1379 * We do this every time because we may setup the virtual machine 1380 * from a different process than the one that actually runs it. 1381 * 1382 * If the life of a virtual machine was spent entirely in the context 1383 * of a single process we could do this once in vmcs_set_defaults(). 1384 */ 1385 if ((error = vmwrite(VMCS_HOST_CR3, rcr3())) != 0) 1386 panic("vmx_run: error %d writing to VMCS_HOST_CR3", error); 1387 1388 if ((error = vmwrite(VMCS_GUEST_RIP, rip)) != 0) 1389 panic("vmx_run: error %d writing to VMCS_GUEST_RIP", error); 1390 1391 if ((error = vmx_set_pcpu_defaults(vmx, vcpu)) != 0) 1392 panic("vmx_run: error %d setting up pcpu defaults", error); 1393 1394 do { 1395 lapic_timer_tick(vmx->vm, vcpu); 1396 vmx_inject_interrupts(vmx, vcpu); 1397 vmx_run_trace(vmx, vcpu); 1398 rc = vmx_setjmp(vmxctx); 1399#ifdef SETJMP_TRACE 1400 vmx_setjmp_trace(vmx, vcpu, vmxctx, rc); 1401#endif 1402 switch (rc) { 1403 case VMX_RETURN_DIRECT: 1404 if (vmxctx->launched == 0) { 1405 vmxctx->launched = 1; 1406 vmx_launch(vmxctx); 1407 } else 1408 vmx_resume(vmxctx); 1409 panic("vmx_launch/resume should not return"); 1410 break; 1411 case VMX_RETURN_LONGJMP: 1412 break; /* vm exit */ 1413 case VMX_RETURN_AST: 1414 astpending = 1; 1415 break; 1416 case VMX_RETURN_VMRESUME: 1417 vie = vmcs_instruction_error(); 1418 if (vmxctx->launch_error == VM_FAIL_INVALID || 1419 vie != VMRESUME_WITH_NON_LAUNCHED_VMCS) { 1420 printf("vmresume error %d vmcs inst error %d\n", 1421 vmxctx->launch_error, vie); 1422 goto err_exit; 1423 } 1424 vmx_launch(vmxctx); /* try to launch the guest */ 1425 panic("vmx_launch should not return"); 1426 break; 1427 case VMX_RETURN_VMLAUNCH: 1428 vie = vmcs_instruction_error(); 1429#if 1 1430 printf("vmlaunch error %d vmcs inst error %d\n", 1431 vmxctx->launch_error, vie); 1432#endif 1433 goto err_exit; 1434 default: 1435 panic("vmx_setjmp returned %d", rc); 1436 } 1437 1438 /* enable interrupts */ 1439 enable_intr(); 1440 1441 /* collect some basic information for VM exit processing */ 1442 vmexit->rip = rip = vmcs_guest_rip(); 1443 vmexit->inst_length = vmexit_instruction_length(); 1444 vmexit->u.vmx.exit_reason = exit_reason = vmcs_exit_reason(); 1445 vmexit->u.vmx.exit_qualification = vmcs_exit_qualification(); 1446 1447 if (astpending) { 1448 handled = 1; 1449 vmexit->inst_length = 0; 1450 vmexit->exitcode = VM_EXITCODE_BOGUS; 1451 vmx_astpending_trace(vmx, vcpu, rip); 1452 break; 1453 } 1454 1455 handled = vmx_exit_process(vmx, vcpu, vmexit); 1456 vmx_exit_trace(vmx, vcpu, rip, exit_reason, handled); 1457 1458 } while (handled); 1459 1460 /* 1461 * If a VM exit has been handled then the exitcode must be BOGUS 1462 * If a VM exit is not handled then the exitcode must not be BOGUS 1463 */ 1464 if ((handled && vmexit->exitcode != VM_EXITCODE_BOGUS) || 1465 (!handled && vmexit->exitcode == VM_EXITCODE_BOGUS)) { 1466 panic("Mismatch between handled (%d) and exitcode (%d)", 1467 handled, vmexit->exitcode); 1468 } 1469 1470 VMM_CTR1(vmx->vm, vcpu, "goto userland: exitcode %d",vmexit->exitcode); 1471 1472 /* 1473 * XXX 1474 * We need to do this to ensure that any VMCS state cached by the 1475 * processor is flushed to memory. We need to do this in case the 1476 * VM moves to a different cpu the next time it runs. 1477 * 1478 * Can we avoid doing this? 1479 */ 1480 VMCLEAR(vmcs); 1481 return (0); 1482 1483err_exit: 1484 vmexit->exitcode = VM_EXITCODE_VMX; 1485 vmexit->u.vmx.exit_reason = (uint32_t)-1; 1486 vmexit->u.vmx.exit_qualification = (uint32_t)-1; 1487 vmexit->u.vmx.error = vie; 1488 VMCLEAR(vmcs); 1489 return (ENOEXEC); 1490} 1491 1492static void 1493vmx_vmcleanup(void *arg) 1494{ 1495 int error; 1496 struct vmx *vmx = arg; 1497 1498 /* 1499 * XXXSMP we also need to clear the VMCS active on the other vcpus. 1500 */ 1501 error = vmclear(&vmx->vmcs[0]); 1502 if (error != 0) 1503 panic("vmx_vmcleanup: vmclear error %d on vcpu 0", error); 1504 1505 ept_vmcleanup(vmx); 1506 free(vmx, M_VMX); 1507 1508 return; 1509} 1510 1511static register_t * 1512vmxctx_regptr(struct vmxctx *vmxctx, int reg) 1513{ 1514 1515 switch (reg) { 1516 case VM_REG_GUEST_RAX: 1517 return (&vmxctx->guest_rax); 1518 case VM_REG_GUEST_RBX: 1519 return (&vmxctx->guest_rbx); 1520 case VM_REG_GUEST_RCX: 1521 return (&vmxctx->guest_rcx); 1522 case VM_REG_GUEST_RDX: 1523 return (&vmxctx->guest_rdx); 1524 case VM_REG_GUEST_RSI: 1525 return (&vmxctx->guest_rsi); 1526 case VM_REG_GUEST_RDI: 1527 return (&vmxctx->guest_rdi); 1528 case VM_REG_GUEST_RBP: 1529 return (&vmxctx->guest_rbp); 1530 case VM_REG_GUEST_R8: 1531 return (&vmxctx->guest_r8); 1532 case VM_REG_GUEST_R9: 1533 return (&vmxctx->guest_r9); 1534 case VM_REG_GUEST_R10: 1535 return (&vmxctx->guest_r10); 1536 case VM_REG_GUEST_R11: 1537 return (&vmxctx->guest_r11); 1538 case VM_REG_GUEST_R12: 1539 return (&vmxctx->guest_r12); 1540 case VM_REG_GUEST_R13: 1541 return (&vmxctx->guest_r13); 1542 case VM_REG_GUEST_R14: 1543 return (&vmxctx->guest_r14); 1544 case VM_REG_GUEST_R15: 1545 return (&vmxctx->guest_r15); 1546 default: 1547 break; 1548 } 1549 return (NULL); 1550} 1551 1552static int 1553vmxctx_getreg(struct vmxctx *vmxctx, int reg, uint64_t *retval) 1554{ 1555 register_t *regp; 1556 1557 if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) { 1558 *retval = *regp; 1559 return (0); 1560 } else 1561 return (EINVAL); 1562} 1563 1564static int 1565vmxctx_setreg(struct vmxctx *vmxctx, int reg, uint64_t val) 1566{ 1567 register_t *regp; 1568 1569 if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) { 1570 *regp = val; 1571 return (0); 1572 } else 1573 return (EINVAL); 1574} 1575 1576static int 1577vmx_getreg(void *arg, int vcpu, int reg, uint64_t *retval) 1578{ 1579 struct vmx *vmx = arg; 1580 1581 if (vmxctx_getreg(&vmx->ctx[vcpu], reg, retval) == 0) 1582 return (0); 1583 1584 /* 1585 * If the vcpu is running then don't mess with the VMCS. 1586 * 1587 * vmcs_getreg will VMCLEAR the vmcs when it is done which will cause 1588 * the subsequent vmlaunch/vmresume to fail. 1589 */ 1590 if (vcpu_is_running(vmx->vm, vcpu)) 1591 panic("vmx_getreg: %s%d is running", vm_name(vmx->vm), vcpu); 1592 1593 return (vmcs_getreg(&vmx->vmcs[vcpu], reg, retval)); 1594} 1595 1596static int 1597vmx_setreg(void *arg, int vcpu, int reg, uint64_t val) 1598{ 1599 int error; 1600 uint64_t ctls; 1601 struct vmx *vmx = arg; 1602 1603 /* 1604 * XXX Allow caller to set contents of the guest registers saved in 1605 * the 'vmxctx' even though the vcpu might be running. We need this 1606 * specifically to support the rdmsr emulation that will set the 1607 * %eax and %edx registers during vm exit processing. 1608 */ 1609 if (vmxctx_setreg(&vmx->ctx[vcpu], reg, val) == 0) 1610 return (0); 1611 1612 /* 1613 * If the vcpu is running then don't mess with the VMCS. 1614 * 1615 * vmcs_setreg will VMCLEAR the vmcs when it is done which will cause 1616 * the subsequent vmlaunch/vmresume to fail. 1617 */ 1618 if (vcpu_is_running(vmx->vm, vcpu)) 1619 panic("vmx_setreg: %s%d is running", vm_name(vmx->vm), vcpu); 1620 1621 error = vmcs_setreg(&vmx->vmcs[vcpu], reg, val); 1622 1623 if (error == 0) { 1624 /* 1625 * If the "load EFER" VM-entry control is 1 then the 1626 * value of EFER.LMA must be identical to "IA-32e mode guest" 1627 * bit in the VM-entry control. 1628 */ 1629 if ((entry_ctls & VM_ENTRY_LOAD_EFER) != 0 && 1630 (reg == VM_REG_GUEST_EFER)) { 1631 vmcs_getreg(&vmx->vmcs[vcpu], 1632 VMCS_IDENT(VMCS_ENTRY_CTLS), &ctls); 1633 if (val & EFER_LMA) 1634 ctls |= VM_ENTRY_GUEST_LMA; 1635 else 1636 ctls &= ~VM_ENTRY_GUEST_LMA; 1637 vmcs_setreg(&vmx->vmcs[vcpu], 1638 VMCS_IDENT(VMCS_ENTRY_CTLS), ctls); 1639 } 1640 } 1641 1642 return (error); 1643} 1644 1645static int 1646vmx_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc) 1647{ 1648 struct vmx *vmx = arg; 1649 1650 return (vmcs_getdesc(&vmx->vmcs[vcpu], reg, desc)); 1651} 1652 1653static int 1654vmx_setdesc(void *arg, int vcpu, int reg, struct seg_desc *desc) 1655{ 1656 struct vmx *vmx = arg; 1657 1658 return (vmcs_setdesc(&vmx->vmcs[vcpu], reg, desc)); 1659} 1660 1661static int 1662vmx_inject(void *arg, int vcpu, int type, int vector, uint32_t code, 1663 int code_valid) 1664{ 1665 int error; 1666 uint64_t info; 1667 struct vmx *vmx = arg; 1668 struct vmcs *vmcs = &vmx->vmcs[vcpu]; 1669 1670 static uint32_t type_map[VM_EVENT_MAX] = { 1671 0x1, /* VM_EVENT_NONE */ 1672 0x0, /* VM_HW_INTR */ 1673 0x2, /* VM_NMI */ 1674 0x3, /* VM_HW_EXCEPTION */ 1675 0x4, /* VM_SW_INTR */ 1676 0x5, /* VM_PRIV_SW_EXCEPTION */ 1677 0x6, /* VM_SW_EXCEPTION */ 1678 }; 1679 1680 /* 1681 * If there is already an exception pending to be delivered to the 1682 * vcpu then just return. 1683 */ 1684 error = vmcs_getreg(vmcs, VMCS_IDENT(VMCS_ENTRY_INTR_INFO), &info); 1685 if (error) 1686 return (error); 1687 1688 if (info & VMCS_INTERRUPTION_INFO_VALID) 1689 return (EAGAIN); 1690 1691 info = vector | (type_map[type] << 8) | (code_valid ? 1 << 11 : 0); 1692 info |= VMCS_INTERRUPTION_INFO_VALID; 1693 error = vmcs_setreg(vmcs, VMCS_IDENT(VMCS_ENTRY_INTR_INFO), info); 1694 if (error != 0) 1695 return (error); 1696 1697 if (code_valid) { 1698 error = vmcs_setreg(vmcs, 1699 VMCS_IDENT(VMCS_ENTRY_EXCEPTION_ERROR), 1700 code); 1701 } 1702 return (error); 1703} 1704 1705static int 1706vmx_getcap(void *arg, int vcpu, int type, int *retval) 1707{ 1708 struct vmx *vmx = arg; 1709 int vcap; 1710 int ret; 1711 1712 ret = ENOENT; 1713 1714 vcap = vmx->cap[vcpu].set; 1715 1716 switch (type) { 1717 case VM_CAP_HALT_EXIT: 1718 if (cap_halt_exit) 1719 ret = 0; 1720 break; 1721 case VM_CAP_PAUSE_EXIT: 1722 if (cap_pause_exit) 1723 ret = 0; 1724 break; 1725 case VM_CAP_MTRAP_EXIT: 1726 if (cap_monitor_trap) 1727 ret = 0; 1728 break; 1729 case VM_CAP_UNRESTRICTED_GUEST: 1730 if (cap_unrestricted_guest) 1731 ret = 0; 1732 break; 1733 default: 1734 break; 1735 } 1736 1737 if (ret == 0) 1738 *retval = (vcap & (1 << type)) ? 1 : 0; 1739 1740 return (ret); 1741} 1742 1743static int 1744vmx_setcap(void *arg, int vcpu, int type, int val) 1745{ 1746 struct vmx *vmx = arg; 1747 struct vmcs *vmcs = &vmx->vmcs[vcpu]; 1748 uint32_t baseval; 1749 uint32_t *pptr; 1750 int error; 1751 int flag; 1752 int reg; 1753 int retval; 1754 1755 retval = ENOENT; 1756 pptr = NULL; 1757 1758 switch (type) { 1759 case VM_CAP_HALT_EXIT: 1760 if (cap_halt_exit) { 1761 retval = 0; 1762 pptr = &vmx->cap[vcpu].proc_ctls; 1763 baseval = *pptr; 1764 flag = PROCBASED_HLT_EXITING; 1765 reg = VMCS_PRI_PROC_BASED_CTLS; 1766 } 1767 break; 1768 case VM_CAP_MTRAP_EXIT: 1769 if (cap_monitor_trap) { 1770 retval = 0; 1771 pptr = &vmx->cap[vcpu].proc_ctls; 1772 baseval = *pptr; 1773 flag = PROCBASED_MTF; 1774 reg = VMCS_PRI_PROC_BASED_CTLS; 1775 } 1776 break; 1777 case VM_CAP_PAUSE_EXIT: 1778 if (cap_pause_exit) { 1779 retval = 0; 1780 pptr = &vmx->cap[vcpu].proc_ctls; 1781 baseval = *pptr; 1782 flag = PROCBASED_PAUSE_EXITING; 1783 reg = VMCS_PRI_PROC_BASED_CTLS; 1784 } 1785 break; 1786 case VM_CAP_UNRESTRICTED_GUEST: 1787 if (cap_unrestricted_guest) { 1788 retval = 0; 1789 baseval = procbased_ctls2; 1790 flag = PROCBASED2_UNRESTRICTED_GUEST; 1791 reg = VMCS_SEC_PROC_BASED_CTLS; 1792 } 1793 break; 1794 default: 1795 break; 1796 } 1797 1798 if (retval == 0) { 1799 if (val) { 1800 baseval |= flag; 1801 } else { 1802 baseval &= ~flag; 1803 } 1804 VMPTRLD(vmcs); 1805 error = vmwrite(reg, baseval); 1806 VMCLEAR(vmcs); 1807 1808 if (error) { 1809 retval = error; 1810 } else { 1811 /* 1812 * Update optional stored flags, and record 1813 * setting 1814 */ 1815 if (pptr != NULL) { 1816 *pptr = baseval; 1817 } 1818 1819 if (val) { 1820 vmx->cap[vcpu].set |= (1 << type); 1821 } else { 1822 vmx->cap[vcpu].set &= ~(1 << type); 1823 } 1824 } 1825 } 1826 1827 return (retval); 1828} 1829 1830struct vmm_ops vmm_ops_intel = { 1831 vmx_init, 1832 vmx_cleanup, 1833 vmx_vminit, 1834 vmx_run, 1835 vmx_vmcleanup, 1836 ept_vmmmap_set, 1837 ept_vmmmap_get, 1838 vmx_getreg, 1839 vmx_setreg, 1840 vmx_getdesc, 1841 vmx_setdesc, 1842 vmx_inject, 1843 vmx_getcap, 1844 vmx_setcap 1845}; 1846