vmx.c revision 240941
1/*- 2 * Copyright (c) 2011 NetApp, Inc. 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND 15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17 * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE 18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 24 * SUCH DAMAGE. 25 * 26 * $FreeBSD$ 27 */ 28 29#include <sys/cdefs.h> 30__FBSDID("$FreeBSD$"); 31 32#include <sys/param.h> 33#include <sys/systm.h> 34#include <sys/smp.h> 35#include <sys/kernel.h> 36#include <sys/malloc.h> 37#include <sys/pcpu.h> 38#include <sys/proc.h> 39 40#include <vm/vm.h> 41#include <vm/pmap.h> 42 43#include <machine/psl.h> 44#include <machine/cpufunc.h> 45#include <machine/md_var.h> 46#include <machine/pmap.h> 47#include <machine/segments.h> 48#include <machine/specialreg.h> 49#include <machine/vmparam.h> 50 51#include <x86/apicreg.h> 52 53#include <machine/vmm.h> 54#include "vmm_lapic.h" 55#include "vmm_msr.h" 56#include "vmm_ktr.h" 57#include "vmm_stat.h" 58 59#include "vmx_msr.h" 60#include "ept.h" 61#include "vmx_cpufunc.h" 62#include "vmx.h" 63#include "x86.h" 64#include "vmx_controls.h" 65#include "vmm_instruction_emul.h" 66 67#define CR4_VMXE (1UL << 13) 68 69#define PINBASED_CTLS_ONE_SETTING \ 70 (PINBASED_EXTINT_EXITING | \ 71 PINBASED_NMI_EXITING | \ 72 PINBASED_VIRTUAL_NMI) 73#define PINBASED_CTLS_ZERO_SETTING 0 74 75#define PROCBASED_CTLS_WINDOW_SETTING \ 76 (PROCBASED_INT_WINDOW_EXITING | \ 77 PROCBASED_NMI_WINDOW_EXITING) 78 79#define PROCBASED_CTLS_ONE_SETTING \ 80 (PROCBASED_SECONDARY_CONTROLS | \ 81 PROCBASED_IO_EXITING | \ 82 PROCBASED_MSR_BITMAPS | \ 83 PROCBASED_CTLS_WINDOW_SETTING) 84#define PROCBASED_CTLS_ZERO_SETTING \ 85 (PROCBASED_CR3_LOAD_EXITING | \ 86 PROCBASED_CR3_STORE_EXITING | \ 87 PROCBASED_IO_BITMAPS) 88 89#define PROCBASED_CTLS2_ONE_SETTING PROCBASED2_ENABLE_EPT 90#define PROCBASED_CTLS2_ZERO_SETTING 0 91 92#define VM_EXIT_CTLS_ONE_SETTING_NO_PAT \ 93 (VM_EXIT_HOST_LMA | \ 94 VM_EXIT_SAVE_EFER | \ 95 VM_EXIT_LOAD_EFER) 96 97#define VM_EXIT_CTLS_ONE_SETTING \ 98 (VM_EXIT_CTLS_ONE_SETTING_NO_PAT | \ 99 VM_EXIT_SAVE_PAT | \ 100 VM_EXIT_LOAD_PAT) 101#define VM_EXIT_CTLS_ZERO_SETTING VM_EXIT_SAVE_DEBUG_CONTROLS 102 103#define VM_ENTRY_CTLS_ONE_SETTING_NO_PAT VM_ENTRY_LOAD_EFER 104 105#define VM_ENTRY_CTLS_ONE_SETTING \ 106 (VM_ENTRY_CTLS_ONE_SETTING_NO_PAT | \ 107 VM_ENTRY_LOAD_PAT) 108#define VM_ENTRY_CTLS_ZERO_SETTING \ 109 (VM_ENTRY_LOAD_DEBUG_CONTROLS | \ 110 VM_ENTRY_INTO_SMM | \ 111 
VM_ENTRY_DEACTIVATE_DUAL_MONITOR) 112 113#define guest_msr_rw(vmx, msr) \ 114 msr_bitmap_change_access((vmx)->msr_bitmap, (msr), MSR_BITMAP_ACCESS_RW) 115 116#define HANDLED 1 117#define UNHANDLED 0 118 119MALLOC_DEFINE(M_VMX, "vmx", "vmx"); 120 121extern struct pcpu __pcpu[]; 122 123int vmxon_enabled[MAXCPU]; 124static char vmxon_region[MAXCPU][PAGE_SIZE] __aligned(PAGE_SIZE); 125 126static uint32_t pinbased_ctls, procbased_ctls, procbased_ctls2; 127static uint32_t exit_ctls, entry_ctls; 128 129static uint64_t cr0_ones_mask, cr0_zeros_mask; 130static uint64_t cr4_ones_mask, cr4_zeros_mask; 131 132static volatile u_int nextvpid; 133 134static int vmx_no_patmsr; 135 136/* 137 * Virtual NMI blocking conditions. 138 * 139 * Some processor implementations also require NMI to be blocked if 140 * the STI_BLOCKING bit is set. It is possible to detect this at runtime 141 * based on the (exit_reason,exit_qual) tuple being set to 142 * (EXIT_REASON_INVAL_VMCS, EXIT_QUAL_NMI_WHILE_STI_BLOCKING). 143 * 144 * We take the easy way out and also include STI_BLOCKING as one of the 145 * gating items for vNMI injection. 146 */ 147static uint64_t nmi_blocking_bits = VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING | 148 VMCS_INTERRUPTIBILITY_NMI_BLOCKING | 149 VMCS_INTERRUPTIBILITY_STI_BLOCKING; 150 151/* 152 * Optional capabilities 153 */ 154static int cap_halt_exit; 155static int cap_pause_exit; 156static int cap_unrestricted_guest; 157static int cap_monitor_trap; 158 159/* statistics */ 160static VMM_STAT_DEFINE(VCPU_MIGRATIONS, "vcpu migration across host cpus"); 161static VMM_STAT_DEFINE(VMEXIT_EXTINT, "vm exits due to external interrupt"); 162 163#ifdef KTR 164static const char * 165exit_reason_to_str(int reason) 166{ 167 static char reasonbuf[32]; 168 169 switch (reason) { 170 case EXIT_REASON_EXCEPTION: 171 return "exception"; 172 case EXIT_REASON_EXT_INTR: 173 return "extint"; 174 case EXIT_REASON_TRIPLE_FAULT: 175 return "triplefault"; 176 case EXIT_REASON_INIT: 177 return "init"; 178 case EXIT_REASON_SIPI: 179 return "sipi"; 180 case EXIT_REASON_IO_SMI: 181 return "iosmi"; 182 case EXIT_REASON_SMI: 183 return "smi"; 184 case EXIT_REASON_INTR_WINDOW: 185 return "intrwindow"; 186 case EXIT_REASON_NMI_WINDOW: 187 return "nmiwindow"; 188 case EXIT_REASON_TASK_SWITCH: 189 return "taskswitch"; 190 case EXIT_REASON_CPUID: 191 return "cpuid"; 192 case EXIT_REASON_GETSEC: 193 return "getsec"; 194 case EXIT_REASON_HLT: 195 return "hlt"; 196 case EXIT_REASON_INVD: 197 return "invd"; 198 case EXIT_REASON_INVLPG: 199 return "invlpg"; 200 case EXIT_REASON_RDPMC: 201 return "rdpmc"; 202 case EXIT_REASON_RDTSC: 203 return "rdtsc"; 204 case EXIT_REASON_RSM: 205 return "rsm"; 206 case EXIT_REASON_VMCALL: 207 return "vmcall"; 208 case EXIT_REASON_VMCLEAR: 209 return "vmclear"; 210 case EXIT_REASON_VMLAUNCH: 211 return "vmlaunch"; 212 case EXIT_REASON_VMPTRLD: 213 return "vmptrld"; 214 case EXIT_REASON_VMPTRST: 215 return "vmptrst"; 216 case EXIT_REASON_VMREAD: 217 return "vmread"; 218 case EXIT_REASON_VMRESUME: 219 return "vmresume"; 220 case EXIT_REASON_VMWRITE: 221 return "vmwrite"; 222 case EXIT_REASON_VMXOFF: 223 return "vmxoff"; 224 case EXIT_REASON_VMXON: 225 return "vmxon"; 226 case EXIT_REASON_CR_ACCESS: 227 return "craccess"; 228 case EXIT_REASON_DR_ACCESS: 229 return "draccess"; 230 case EXIT_REASON_INOUT: 231 return "inout"; 232 case EXIT_REASON_RDMSR: 233 return "rdmsr"; 234 case EXIT_REASON_WRMSR: 235 return "wrmsr"; 236 case EXIT_REASON_INVAL_VMCS: 237 return "invalvmcs"; 238 case EXIT_REASON_INVAL_MSR: 239 return 
"invalmsr"; 240 case EXIT_REASON_MWAIT: 241 return "mwait"; 242 case EXIT_REASON_MTF: 243 return "mtf"; 244 case EXIT_REASON_MONITOR: 245 return "monitor"; 246 case EXIT_REASON_PAUSE: 247 return "pause"; 248 case EXIT_REASON_MCE: 249 return "mce"; 250 case EXIT_REASON_TPR: 251 return "tpr"; 252 case EXIT_REASON_APIC: 253 return "apic"; 254 case EXIT_REASON_GDTR_IDTR: 255 return "gdtridtr"; 256 case EXIT_REASON_LDTR_TR: 257 return "ldtrtr"; 258 case EXIT_REASON_EPT_FAULT: 259 return "eptfault"; 260 case EXIT_REASON_EPT_MISCONFIG: 261 return "eptmisconfig"; 262 case EXIT_REASON_INVEPT: 263 return "invept"; 264 case EXIT_REASON_RDTSCP: 265 return "rdtscp"; 266 case EXIT_REASON_VMX_PREEMPT: 267 return "vmxpreempt"; 268 case EXIT_REASON_INVVPID: 269 return "invvpid"; 270 case EXIT_REASON_WBINVD: 271 return "wbinvd"; 272 case EXIT_REASON_XSETBV: 273 return "xsetbv"; 274 default: 275 snprintf(reasonbuf, sizeof(reasonbuf), "%d", reason); 276 return (reasonbuf); 277 } 278} 279 280#ifdef SETJMP_TRACE 281static const char * 282vmx_setjmp_rc2str(int rc) 283{ 284 switch (rc) { 285 case VMX_RETURN_DIRECT: 286 return "direct"; 287 case VMX_RETURN_LONGJMP: 288 return "longjmp"; 289 case VMX_RETURN_VMRESUME: 290 return "vmresume"; 291 case VMX_RETURN_VMLAUNCH: 292 return "vmlaunch"; 293 default: 294 return "unknown"; 295 } 296} 297 298#define SETJMP_TRACE(vmx, vcpu, vmxctx, regname) \ 299 VMM_CTR1((vmx)->vm, (vcpu), "setjmp trace " #regname " 0x%016lx", \ 300 (vmxctx)->regname) 301 302static void 303vmx_setjmp_trace(struct vmx *vmx, int vcpu, struct vmxctx *vmxctx, int rc) 304{ 305 uint64_t host_rip, host_rsp; 306 307 if (vmxctx != &vmx->ctx[vcpu]) 308 panic("vmx_setjmp_trace: invalid vmxctx %p; should be %p", 309 vmxctx, &vmx->ctx[vcpu]); 310 311 VMM_CTR1((vmx)->vm, (vcpu), "vmxctx = %p", vmxctx); 312 VMM_CTR2((vmx)->vm, (vcpu), "setjmp return code %s(%d)", 313 vmx_setjmp_rc2str(rc), rc); 314 315 host_rsp = host_rip = ~0; 316 vmread(VMCS_HOST_RIP, &host_rip); 317 vmread(VMCS_HOST_RSP, &host_rsp); 318 VMM_CTR2((vmx)->vm, (vcpu), "vmcs host_rip 0x%016lx, host_rsp 0x%016lx", 319 host_rip, host_rsp); 320 321 SETJMP_TRACE(vmx, vcpu, vmxctx, host_r15); 322 SETJMP_TRACE(vmx, vcpu, vmxctx, host_r14); 323 SETJMP_TRACE(vmx, vcpu, vmxctx, host_r13); 324 SETJMP_TRACE(vmx, vcpu, vmxctx, host_r12); 325 SETJMP_TRACE(vmx, vcpu, vmxctx, host_rbp); 326 SETJMP_TRACE(vmx, vcpu, vmxctx, host_rsp); 327 SETJMP_TRACE(vmx, vcpu, vmxctx, host_rbx); 328 SETJMP_TRACE(vmx, vcpu, vmxctx, host_rip); 329 330 SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rdi); 331 SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rsi); 332 SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rdx); 333 SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rcx); 334 SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r8); 335 SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r9); 336 SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rax); 337 SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rbx); 338 SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rbp); 339 SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r10); 340 SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r11); 341 SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r12); 342 SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r13); 343 SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r14); 344 SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r15); 345 SETJMP_TRACE(vmx, vcpu, vmxctx, guest_cr2); 346} 347#endif 348#else 349static void __inline 350vmx_setjmp_trace(struct vmx *vmx, int vcpu, struct vmxctx *vmxctx, int rc) 351{ 352 return; 353} 354#endif /* KTR */ 355 356u_long 357vmx_fix_cr0(u_long cr0) 358{ 359 360 return ((cr0 | cr0_ones_mask) & ~cr0_zeros_mask); 361} 362 
363u_long 364vmx_fix_cr4(u_long cr4) 365{ 366 367 return ((cr4 | cr4_ones_mask) & ~cr4_zeros_mask); 368} 369 370static void 371msr_save_area_init(struct msr_entry *g_area, int *g_count) 372{ 373 int cnt; 374 375 static struct msr_entry guest_msrs[] = { 376 { MSR_KGSBASE, 0, 0 }, 377 }; 378 379 cnt = sizeof(guest_msrs) / sizeof(guest_msrs[0]); 380 if (cnt > GUEST_MSR_MAX_ENTRIES) 381 panic("guest msr save area overrun"); 382 bcopy(guest_msrs, g_area, sizeof(guest_msrs)); 383 *g_count = cnt; 384} 385 386static void 387vmx_disable(void *arg __unused) 388{ 389 struct invvpid_desc invvpid_desc = { 0 }; 390 struct invept_desc invept_desc = { 0 }; 391 392 if (vmxon_enabled[curcpu]) { 393 /* 394 * See sections 25.3.3.3 and 25.3.3.4 in Intel Vol 3b. 395 * 396 * VMXON or VMXOFF are not required to invalidate any TLB 397 * caching structures. This prevents potential retention of 398 * cached information in the TLB between distinct VMX episodes. 399 */ 400 invvpid(INVVPID_TYPE_ALL_CONTEXTS, invvpid_desc); 401 invept(INVEPT_TYPE_ALL_CONTEXTS, invept_desc); 402 vmxoff(); 403 } 404 load_cr4(rcr4() & ~CR4_VMXE); 405} 406 407static int 408vmx_cleanup(void) 409{ 410 411 smp_rendezvous(NULL, vmx_disable, NULL, NULL); 412 413 return (0); 414} 415 416static void 417vmx_enable(void *arg __unused) 418{ 419 int error; 420 421 load_cr4(rcr4() | CR4_VMXE); 422 423 *(uint32_t *)vmxon_region[curcpu] = vmx_revision(); 424 error = vmxon(vmxon_region[curcpu]); 425 if (error == 0) 426 vmxon_enabled[curcpu] = 1; 427} 428 429static int 430vmx_init(void) 431{ 432 int error; 433 uint64_t fixed0, fixed1, feature_control; 434 uint32_t tmp; 435 436 /* CPUID.1:ECX[bit 5] must be 1 for processor to support VMX */ 437 if (!(cpu_feature2 & CPUID2_VMX)) { 438 printf("vmx_init: processor does not support VMX operation\n"); 439 return (ENXIO); 440 } 441 442 /* 443 * Verify that MSR_IA32_FEATURE_CONTROL lock and VMXON enable bits 444 * are set (bits 0 and 2 respectively). 
445 */ 446 feature_control = rdmsr(MSR_IA32_FEATURE_CONTROL); 447 if ((feature_control & 0x5) != 0x5) { 448 printf("vmx_init: VMX operation disabled by BIOS\n"); 449 return (ENXIO); 450 } 451 452 /* Check support for primary processor-based VM-execution controls */ 453 error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, 454 MSR_VMX_TRUE_PROCBASED_CTLS, 455 PROCBASED_CTLS_ONE_SETTING, 456 PROCBASED_CTLS_ZERO_SETTING, &procbased_ctls); 457 if (error) { 458 printf("vmx_init: processor does not support desired primary " 459 "processor-based controls\n"); 460 return (error); 461 } 462 463 /* Clear the processor-based ctl bits that are set on demand */ 464 procbased_ctls &= ~PROCBASED_CTLS_WINDOW_SETTING; 465 466 /* Check support for secondary processor-based VM-execution controls */ 467 error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, 468 MSR_VMX_PROCBASED_CTLS2, 469 PROCBASED_CTLS2_ONE_SETTING, 470 PROCBASED_CTLS2_ZERO_SETTING, &procbased_ctls2); 471 if (error) { 472 printf("vmx_init: processor does not support desired secondary " 473 "processor-based controls\n"); 474 return (error); 475 } 476 477 /* Check support for VPID */ 478 error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2, 479 PROCBASED2_ENABLE_VPID, 0, &tmp); 480 if (error == 0) 481 procbased_ctls2 |= PROCBASED2_ENABLE_VPID; 482 483 /* Check support for pin-based VM-execution controls */ 484 error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS, 485 MSR_VMX_TRUE_PINBASED_CTLS, 486 PINBASED_CTLS_ONE_SETTING, 487 PINBASED_CTLS_ZERO_SETTING, &pinbased_ctls); 488 if (error) { 489 printf("vmx_init: processor does not support desired " 490 "pin-based controls\n"); 491 return (error); 492 } 493 494 /* Check support for VM-exit controls */ 495 error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS, MSR_VMX_TRUE_EXIT_CTLS, 496 VM_EXIT_CTLS_ONE_SETTING, 497 VM_EXIT_CTLS_ZERO_SETTING, 498 &exit_ctls); 499 if (error) { 500 /* Try again without the PAT MSR bits */ 501 error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS, 502 MSR_VMX_TRUE_EXIT_CTLS, 503 VM_EXIT_CTLS_ONE_SETTING_NO_PAT, 504 VM_EXIT_CTLS_ZERO_SETTING, 505 &exit_ctls); 506 if (error) { 507 printf("vmx_init: processor does not support desired " 508 "exit controls\n"); 509 return (error); 510 } else { 511 if (bootverbose) 512 printf("vmm: PAT MSR access not supported\n"); 513 guest_msr_valid(MSR_PAT); 514 vmx_no_patmsr = 1; 515 } 516 } 517 518 /* Check support for VM-entry controls */ 519 if (!vmx_no_patmsr) { 520 error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS, 521 MSR_VMX_TRUE_ENTRY_CTLS, 522 VM_ENTRY_CTLS_ONE_SETTING, 523 VM_ENTRY_CTLS_ZERO_SETTING, 524 &entry_ctls); 525 } else { 526 error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS, 527 MSR_VMX_TRUE_ENTRY_CTLS, 528 VM_ENTRY_CTLS_ONE_SETTING_NO_PAT, 529 VM_ENTRY_CTLS_ZERO_SETTING, 530 &entry_ctls); 531 } 532 533 if (error) { 534 printf("vmx_init: processor does not support desired " 535 "entry controls\n"); 536 return (error); 537 } 538 539 /* 540 * Check support for optional features by testing them 541 * as individual bits 542 */ 543 cap_halt_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, 544 MSR_VMX_TRUE_PROCBASED_CTLS, 545 PROCBASED_HLT_EXITING, 0, 546 &tmp) == 0); 547 548 cap_monitor_trap = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, 549 MSR_VMX_PROCBASED_CTLS, 550 PROCBASED_MTF, 0, 551 &tmp) == 0); 552 553 cap_pause_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, 554 MSR_VMX_TRUE_PROCBASED_CTLS, 555 PROCBASED_PAUSE_EXITING, 0, 556 &tmp) == 0); 557 558 cap_unrestricted_guest = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, 559 MSR_VMX_PROCBASED_CTLS2, 560 
PROCBASED2_UNRESTRICTED_GUEST, 0, 561 &tmp) == 0); 562 563 /* Initialize EPT */ 564 error = ept_init(); 565 if (error) { 566 printf("vmx_init: ept initialization failed (%d)\n", error); 567 return (error); 568 } 569 570 /* 571 * Stash the cr0 and cr4 bits that must be fixed to 0 or 1 572 */ 573 fixed0 = rdmsr(MSR_VMX_CR0_FIXED0); 574 fixed1 = rdmsr(MSR_VMX_CR0_FIXED1); 575 cr0_ones_mask = fixed0 & fixed1; 576 cr0_zeros_mask = ~fixed0 & ~fixed1; 577 578 /* 579 * CR0_PE and CR0_PG can be set to zero in VMX non-root operation 580 * if unrestricted guest execution is allowed. 581 */ 582 if (cap_unrestricted_guest) 583 cr0_ones_mask &= ~(CR0_PG | CR0_PE); 584 585 /* 586 * Do not allow the guest to set CR0_NW or CR0_CD. 587 */ 588 cr0_zeros_mask |= (CR0_NW | CR0_CD); 589 590 fixed0 = rdmsr(MSR_VMX_CR4_FIXED0); 591 fixed1 = rdmsr(MSR_VMX_CR4_FIXED1); 592 cr4_ones_mask = fixed0 & fixed1; 593 cr4_zeros_mask = ~fixed0 & ~fixed1; 594 595 /* enable VMX operation */ 596 smp_rendezvous(NULL, vmx_enable, NULL, NULL); 597 598 return (0); 599} 600 601/* 602 * If this processor does not support VPIDs then simply return 0. 603 * 604 * Otherwise generate the next value of VPID to use. Any value is alright 605 * as long as it is non-zero. 606 * 607 * We always execute in VMX non-root context with EPT enabled. Thus all 608 * combined mappings are tagged with the (EP4TA, VPID, PCID) tuple. This 609 * in turn means that multiple VMs can share the same VPID as long as 610 * they have distinct EPT page tables. 611 * 612 * XXX 613 * We should optimize this so that it returns VPIDs that are not in 614 * use. Then we will not unnecessarily invalidate mappings in 615 * vmx_set_pcpu_defaults() just because two or more vcpus happen to 616 * use the same 'vpid'. 617 */ 618static uint16_t 619vmx_vpid(void) 620{ 621 uint16_t vpid = 0; 622 623 if ((procbased_ctls2 & PROCBASED2_ENABLE_VPID) != 0) { 624 do { 625 vpid = atomic_fetchadd_int(&nextvpid, 1); 626 } while (vpid == 0); 627 } 628 629 return (vpid); 630} 631 632static int 633vmx_setup_cr_shadow(int which, struct vmcs *vmcs) 634{ 635 int error, mask_ident, shadow_ident; 636 uint64_t mask_value, shadow_value; 637 638 if (which != 0 && which != 4) 639 panic("vmx_setup_cr_shadow: unknown cr%d", which); 640 641 if (which == 0) { 642 mask_ident = VMCS_CR0_MASK; 643 mask_value = cr0_ones_mask | cr0_zeros_mask; 644 shadow_ident = VMCS_CR0_SHADOW; 645 shadow_value = cr0_ones_mask; 646 } else { 647 mask_ident = VMCS_CR4_MASK; 648 mask_value = cr4_ones_mask | cr4_zeros_mask; 649 shadow_ident = VMCS_CR4_SHADOW; 650 shadow_value = cr4_ones_mask; 651 } 652 653 error = vmcs_setreg(vmcs, VMCS_IDENT(mask_ident), mask_value); 654 if (error) 655 return (error); 656 657 error = vmcs_setreg(vmcs, VMCS_IDENT(shadow_ident), shadow_value); 658 if (error) 659 return (error); 660 661 return (0); 662} 663#define vmx_setup_cr0_shadow(vmcs) vmx_setup_cr_shadow(0, (vmcs)) 664#define vmx_setup_cr4_shadow(vmcs) vmx_setup_cr_shadow(4, (vmcs)) 665 666static void * 667vmx_vminit(struct vm *vm) 668{ 669 uint16_t vpid; 670 int i, error, guest_msr_count; 671 struct vmx *vmx; 672 673 vmx = malloc(sizeof(struct vmx), M_VMX, M_WAITOK | M_ZERO); 674 if ((uintptr_t)vmx & PAGE_MASK) { 675 panic("malloc of struct vmx not aligned on %d byte boundary", 676 PAGE_SIZE); 677 } 678 vmx->vm = vm; 679 680 /* 681 * Clean up EPTP-tagged guest physical and combined mappings 682 * 683 * VMX transitions are not required to invalidate any guest physical 684 * mappings. 
So, it may be possible for stale guest physical mappings 685 * to be present in the processor TLBs. 686 * 687 * Combined mappings for this EP4TA are also invalidated for all VPIDs. 688 */ 689 ept_invalidate_mappings(vtophys(vmx->pml4ept)); 690 691 msr_bitmap_initialize(vmx->msr_bitmap); 692 693 /* 694 * It is safe to allow direct access to MSR_GSBASE and MSR_FSBASE. 695 * The guest FSBASE and GSBASE are saved and restored during 696 * vm-exit and vm-entry respectively. The host FSBASE and GSBASE are 697 * always restored from the vmcs host state area on vm-exit. 698 * 699 * Guest KGSBASE is saved and restored in the guest MSR save area. 700 * Host KGSBASE is restored before returning to userland from the pcb. 701 * There will be a window of time when we are executing in the host 702 * kernel context with a value of KGSBASE from the guest. This is ok 703 * because the value of KGSBASE is inconsequential in kernel context. 704 * 705 * MSR_EFER is saved and restored in the guest VMCS area on a 706 * VM exit and entry respectively. It is also restored from the 707 * host VMCS area on a VM exit. 708 */ 709 if (guest_msr_rw(vmx, MSR_GSBASE) || 710 guest_msr_rw(vmx, MSR_FSBASE) || 711 guest_msr_rw(vmx, MSR_KGSBASE) || 712 guest_msr_rw(vmx, MSR_EFER)) 713 panic("vmx_vminit: error setting guest msr access"); 714 715 /* 716 * MSR_PAT is saved and restored in the guest VMCS are on a VM exit 717 * and entry respectively. It is also restored from the host VMCS 718 * area on a VM exit. However, if running on a system with no 719 * MSR_PAT save/restore support, leave access disabled so accesses 720 * will be trapped. 721 */ 722 if (!vmx_no_patmsr && guest_msr_rw(vmx, MSR_PAT)) 723 panic("vmx_vminit: error setting guest pat msr access"); 724 725 for (i = 0; i < VM_MAXCPU; i++) { 726 vmx->vmcs[i].identifier = vmx_revision(); 727 error = vmclear(&vmx->vmcs[i]); 728 if (error != 0) { 729 panic("vmx_vminit: vmclear error %d on vcpu %d\n", 730 error, i); 731 } 732 733 vpid = vmx_vpid(); 734 735 error = vmcs_set_defaults(&vmx->vmcs[i], 736 (u_long)vmx_longjmp, 737 (u_long)&vmx->ctx[i], 738 vtophys(vmx->pml4ept), 739 pinbased_ctls, 740 procbased_ctls, 741 procbased_ctls2, 742 exit_ctls, entry_ctls, 743 vtophys(vmx->msr_bitmap), 744 vpid); 745 746 if (error != 0) 747 panic("vmx_vminit: vmcs_set_defaults error %d", error); 748 749 vmx->cap[i].set = 0; 750 vmx->cap[i].proc_ctls = procbased_ctls; 751 752 vmx->state[i].request_nmi = 0; 753 vmx->state[i].lastcpu = -1; 754 vmx->state[i].vpid = vpid; 755 756 msr_save_area_init(vmx->guest_msrs[i], &guest_msr_count); 757 758 error = vmcs_set_msr_save(&vmx->vmcs[i], 759 vtophys(vmx->guest_msrs[i]), 760 guest_msr_count); 761 if (error != 0) 762 panic("vmcs_set_msr_save error %d", error); 763 764 error = vmx_setup_cr0_shadow(&vmx->vmcs[i]); 765 if (error != 0) 766 panic("vmx_setup_cr0_shadow %d", error); 767 768 error = vmx_setup_cr4_shadow(&vmx->vmcs[i]); 769 if (error != 0) 770 panic("vmx_setup_cr4_shadow %d", error); 771 } 772 773 return (vmx); 774} 775 776static int 777vmx_handle_cpuid(struct vm *vm, int vcpu, struct vmxctx *vmxctx) 778{ 779 int handled, func; 780 781 func = vmxctx->guest_rax; 782 783 handled = x86_emulate_cpuid(vm, vcpu, 784 (uint32_t*)(&vmxctx->guest_rax), 785 (uint32_t*)(&vmxctx->guest_rbx), 786 (uint32_t*)(&vmxctx->guest_rcx), 787 (uint32_t*)(&vmxctx->guest_rdx)); 788 return (handled); 789} 790 791static __inline void 792vmx_run_trace(struct vmx *vmx, int vcpu) 793{ 794#ifdef KTR 795 VMM_CTR1(vmx->vm, vcpu, "Resume execution at 0x%0lx", 
vmcs_guest_rip()); 796#endif 797} 798 799static __inline void 800vmx_exit_trace(struct vmx *vmx, int vcpu, uint64_t rip, uint32_t exit_reason, 801 int handled, int astpending) 802{ 803#ifdef KTR 804 VMM_CTR3(vmx->vm, vcpu, "%s %s vmexit at 0x%0lx", 805 handled ? "handled" : "unhandled", 806 exit_reason_to_str(exit_reason), rip); 807 808 if (astpending) 809 VMM_CTR0(vmx->vm, vcpu, "astpending"); 810#endif 811} 812 813static int 814vmx_set_pcpu_defaults(struct vmx *vmx, int vcpu) 815{ 816 int error, lastcpu; 817 struct vmxstate *vmxstate; 818 struct invvpid_desc invvpid_desc = { 0 }; 819 820 vmxstate = &vmx->state[vcpu]; 821 lastcpu = vmxstate->lastcpu; 822 vmxstate->lastcpu = curcpu; 823 824 if (lastcpu == curcpu) { 825 error = 0; 826 goto done; 827 } 828 829 vmm_stat_incr(vmx->vm, vcpu, VCPU_MIGRATIONS, 1); 830 831 error = vmwrite(VMCS_HOST_TR_BASE, (u_long)PCPU_GET(tssp)); 832 if (error != 0) 833 goto done; 834 835 error = vmwrite(VMCS_HOST_GDTR_BASE, (u_long)&gdt[NGDT * curcpu]); 836 if (error != 0) 837 goto done; 838 839 error = vmwrite(VMCS_HOST_GS_BASE, (u_long)&__pcpu[curcpu]); 840 if (error != 0) 841 goto done; 842 843 /* 844 * If we are using VPIDs then invalidate all mappings tagged with 'vpid' 845 * 846 * We do this because this vcpu was executing on a different host 847 * cpu when it last ran. We do not track whether it invalidated 848 * mappings associated with its 'vpid' during that run. So we must 849 * assume that the mappings associated with 'vpid' on 'curcpu' are 850 * stale and invalidate them. 851 * 852 * Note that we incur this penalty only when the scheduler chooses to 853 * move the thread associated with this vcpu between host cpus. 854 * 855 * Note also that this will invalidate mappings tagged with 'vpid' 856 * for "all" EP4TAs. 857 */ 858 if (vmxstate->vpid != 0) { 859 invvpid_desc.vpid = vmxstate->vpid; 860 invvpid(INVVPID_TYPE_SINGLE_CONTEXT, invvpid_desc); 861 } 862done: 863 return (error); 864} 865 866static void 867vm_exit_update_rip(struct vm_exit *vmexit) 868{ 869 int error; 870 871 error = vmwrite(VMCS_GUEST_RIP, vmexit->rip + vmexit->inst_length); 872 if (error) 873 panic("vmx_run: error %d writing to VMCS_GUEST_RIP", error); 874} 875 876/* 877 * We depend on 'procbased_ctls' to have the Interrupt Window Exiting bit set. 
878 */ 879CTASSERT((PROCBASED_CTLS_ONE_SETTING & PROCBASED_INT_WINDOW_EXITING) != 0); 880 881static void __inline 882vmx_set_int_window_exiting(struct vmx *vmx, int vcpu) 883{ 884 int error; 885 886 vmx->cap[vcpu].proc_ctls |= PROCBASED_INT_WINDOW_EXITING; 887 888 error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); 889 if (error) 890 panic("vmx_set_int_window_exiting: vmwrite error %d", error); 891} 892 893static void __inline 894vmx_clear_int_window_exiting(struct vmx *vmx, int vcpu) 895{ 896 int error; 897 898 vmx->cap[vcpu].proc_ctls &= ~PROCBASED_INT_WINDOW_EXITING; 899 900 error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); 901 if (error) 902 panic("vmx_clear_int_window_exiting: vmwrite error %d", error); 903} 904 905static void __inline 906vmx_set_nmi_window_exiting(struct vmx *vmx, int vcpu) 907{ 908 int error; 909 910 vmx->cap[vcpu].proc_ctls |= PROCBASED_NMI_WINDOW_EXITING; 911 912 error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); 913 if (error) 914 panic("vmx_set_nmi_window_exiting: vmwrite error %d", error); 915} 916 917static void __inline 918vmx_clear_nmi_window_exiting(struct vmx *vmx, int vcpu) 919{ 920 int error; 921 922 vmx->cap[vcpu].proc_ctls &= ~PROCBASED_NMI_WINDOW_EXITING; 923 924 error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); 925 if (error) 926 panic("vmx_clear_nmi_window_exiting: vmwrite error %d", error); 927} 928 929static int 930vmx_inject_nmi(struct vmx *vmx, int vcpu) 931{ 932 int error; 933 uint64_t info, interruptibility; 934 935 /* Bail out if no NMI requested */ 936 if (vmx->state[vcpu].request_nmi == 0) 937 return (0); 938 939 error = vmread(VMCS_GUEST_INTERRUPTIBILITY, &interruptibility); 940 if (error) { 941 panic("vmx_inject_nmi: vmread(interruptibility) %d", 942 error); 943 } 944 if (interruptibility & nmi_blocking_bits) 945 goto nmiblocked; 946 947 /* 948 * Inject the virtual NMI. The vector must be the NMI IDT entry 949 * or the VMCS entry check will fail. 950 */ 951 info = VMCS_INTERRUPTION_INFO_NMI | VMCS_INTERRUPTION_INFO_VALID; 952 info |= IDT_NMI; 953 954 error = vmwrite(VMCS_ENTRY_INTR_INFO, info); 955 if (error) 956 panic("vmx_inject_nmi: vmwrite(intrinfo) %d", error); 957 958 VMM_CTR0(vmx->vm, vcpu, "Injecting vNMI"); 959 960 /* Clear the request */ 961 vmx->state[vcpu].request_nmi = 0; 962 return (1); 963 964nmiblocked: 965 /* 966 * Set the NMI Window Exiting execution control so we can inject 967 * the virtual NMI as soon as blocking condition goes away. 968 */ 969 vmx_set_nmi_window_exiting(vmx, vcpu); 970 971 VMM_CTR0(vmx->vm, vcpu, "Enabling NMI window exiting"); 972 return (1); 973} 974 975static void 976vmx_inject_interrupts(struct vmx *vmx, int vcpu) 977{ 978 int error, vector; 979 uint64_t info, rflags, interruptibility; 980 981 const int HWINTR_BLOCKED = VMCS_INTERRUPTIBILITY_STI_BLOCKING | 982 VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING; 983 984#if 1 985 /* 986 * XXX 987 * If an event is being injected from userland then just return. 988 * For e.g. we may inject a breakpoint exception to cause the 989 * guest to enter the debugger so we can inspect its state. 
990 */ 991 error = vmread(VMCS_ENTRY_INTR_INFO, &info); 992 if (error) 993 panic("vmx_inject_interrupts: vmread(intrinfo) %d", error); 994 if (info & VMCS_INTERRUPTION_INFO_VALID) 995 return; 996#endif 997 /* 998 * NMI injection has priority so deal with those first 999 */ 1000 if (vmx_inject_nmi(vmx, vcpu)) 1001 return; 1002 1003 /* Ask the local apic for a vector to inject */ 1004 vector = lapic_pending_intr(vmx->vm, vcpu); 1005 if (vector < 0) 1006 return; 1007 1008 if (vector < 32 || vector > 255) 1009 panic("vmx_inject_interrupts: invalid vector %d\n", vector); 1010 1011 /* Check RFLAGS.IF and the interruptibility state of the guest */ 1012 error = vmread(VMCS_GUEST_RFLAGS, &rflags); 1013 if (error) 1014 panic("vmx_inject_interrupts: vmread(rflags) %d", error); 1015 1016 if ((rflags & PSL_I) == 0) 1017 goto cantinject; 1018 1019 error = vmread(VMCS_GUEST_INTERRUPTIBILITY, &interruptibility); 1020 if (error) { 1021 panic("vmx_inject_interrupts: vmread(interruptibility) %d", 1022 error); 1023 } 1024 if (interruptibility & HWINTR_BLOCKED) 1025 goto cantinject; 1026 1027 /* Inject the interrupt */ 1028 info = VMCS_INTERRUPTION_INFO_HW_INTR | VMCS_INTERRUPTION_INFO_VALID; 1029 info |= vector; 1030 error = vmwrite(VMCS_ENTRY_INTR_INFO, info); 1031 if (error) 1032 panic("vmx_inject_interrupts: vmwrite(intrinfo) %d", error); 1033 1034 /* Update the Local APIC ISR */ 1035 lapic_intr_accepted(vmx->vm, vcpu, vector); 1036 1037 VMM_CTR1(vmx->vm, vcpu, "Injecting hwintr at vector %d", vector); 1038 1039 return; 1040 1041cantinject: 1042 /* 1043 * Set the Interrupt Window Exiting execution control so we can inject 1044 * the interrupt as soon as blocking condition goes away. 1045 */ 1046 vmx_set_int_window_exiting(vmx, vcpu); 1047 1048 VMM_CTR0(vmx->vm, vcpu, "Enabling interrupt window exiting"); 1049} 1050 1051static int 1052vmx_emulate_cr_access(struct vmx *vmx, int vcpu, uint64_t exitqual) 1053{ 1054 int error, cr, vmcs_guest_cr; 1055 uint64_t regval, ones_mask, zeros_mask; 1056 const struct vmxctx *vmxctx; 1057 1058 /* We only handle mov to %cr0 or %cr4 at this time */ 1059 if ((exitqual & 0xf0) != 0x00) 1060 return (UNHANDLED); 1061 1062 cr = exitqual & 0xf; 1063 if (cr != 0 && cr != 4) 1064 return (UNHANDLED); 1065 1066 vmxctx = &vmx->ctx[vcpu]; 1067 1068 /* 1069 * We must use vmwrite() directly here because vmcs_setreg() will 1070 * call vmclear(vmcs) as a side-effect which we certainly don't want. 
1071 */ 1072 switch ((exitqual >> 8) & 0xf) { 1073 case 0: 1074 regval = vmxctx->guest_rax; 1075 break; 1076 case 1: 1077 regval = vmxctx->guest_rcx; 1078 break; 1079 case 2: 1080 regval = vmxctx->guest_rdx; 1081 break; 1082 case 3: 1083 regval = vmxctx->guest_rbx; 1084 break; 1085 case 4: 1086 error = vmread(VMCS_GUEST_RSP, ®val); 1087 if (error) { 1088 panic("vmx_emulate_cr_access: " 1089 "error %d reading guest rsp", error); 1090 } 1091 break; 1092 case 5: 1093 regval = vmxctx->guest_rbp; 1094 break; 1095 case 6: 1096 regval = vmxctx->guest_rsi; 1097 break; 1098 case 7: 1099 regval = vmxctx->guest_rdi; 1100 break; 1101 case 8: 1102 regval = vmxctx->guest_r8; 1103 break; 1104 case 9: 1105 regval = vmxctx->guest_r9; 1106 break; 1107 case 10: 1108 regval = vmxctx->guest_r10; 1109 break; 1110 case 11: 1111 regval = vmxctx->guest_r11; 1112 break; 1113 case 12: 1114 regval = vmxctx->guest_r12; 1115 break; 1116 case 13: 1117 regval = vmxctx->guest_r13; 1118 break; 1119 case 14: 1120 regval = vmxctx->guest_r14; 1121 break; 1122 case 15: 1123 regval = vmxctx->guest_r15; 1124 break; 1125 } 1126 1127 if (cr == 0) { 1128 ones_mask = cr0_ones_mask; 1129 zeros_mask = cr0_zeros_mask; 1130 vmcs_guest_cr = VMCS_GUEST_CR0; 1131 } else { 1132 ones_mask = cr4_ones_mask; 1133 zeros_mask = cr4_zeros_mask; 1134 vmcs_guest_cr = VMCS_GUEST_CR4; 1135 } 1136 regval |= ones_mask; 1137 regval &= ~zeros_mask; 1138 error = vmwrite(vmcs_guest_cr, regval); 1139 if (error) { 1140 panic("vmx_emulate_cr_access: error %d writing cr%d", 1141 error, cr); 1142 } 1143 1144 return (HANDLED); 1145} 1146 1147static int 1148vmx_lapic_fault(struct vm *vm, int cpu, 1149 uint64_t gpa, uint64_t rip, uint64_t cr3, uint64_t ept_qual) 1150{ 1151 int read, write, handled; 1152 1153 /* 1154 * For this to be a legitimate access to the local apic: 1155 * - the GPA in the local apic page 1156 * - the GPA must be aligned on a 16 byte boundary 1157 */ 1158 if (gpa < DEFAULT_APIC_BASE || gpa >= DEFAULT_APIC_BASE + PAGE_SIZE) 1159 return (UNHANDLED); 1160 1161 if ((gpa & 0xF) != 0) 1162 return (UNHANDLED); 1163 1164 /* EPT violation on an instruction fetch doesn't make sense here */ 1165 if (ept_qual & EPT_VIOLATION_INST_FETCH) 1166 return (UNHANDLED); 1167 1168 /* EPT violation must be a read fault or a write fault but not both */ 1169 read = ept_qual & EPT_VIOLATION_DATA_READ ? 1 : 0; 1170 write = ept_qual & EPT_VIOLATION_DATA_WRITE ? 1 : 0; 1171 if ((read ^ write) == 0) 1172 return (UNHANDLED); 1173 1174 /* 1175 * The EPT violation must have been caused by accessing a guest-physical 1176 * address that is a translation of a guest-linear address. 
1177 */ 1178 if ((ept_qual & EPT_VIOLATION_GLA_VALID) == 0 || 1179 (ept_qual & EPT_VIOLATION_XLAT_VALID) == 0) { 1180 return (UNHANDLED); 1181 } 1182 1183 handled = lapic_mmio(vm, cpu, gpa - DEFAULT_APIC_BASE, read, rip, cr3); 1184 1185 return (handled); 1186} 1187 1188static int 1189vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) 1190{ 1191 int handled; 1192 struct vmcs *vmcs; 1193 struct vmxctx *vmxctx; 1194 uint32_t eax, ecx, edx; 1195 uint64_t qual, gpa, cr3; 1196 1197 handled = 0; 1198 vmcs = &vmx->vmcs[vcpu]; 1199 vmxctx = &vmx->ctx[vcpu]; 1200 qual = vmexit->u.vmx.exit_qualification; 1201 vmexit->exitcode = VM_EXITCODE_BOGUS; 1202 1203 switch (vmexit->u.vmx.exit_reason) { 1204 case EXIT_REASON_CR_ACCESS: 1205 handled = vmx_emulate_cr_access(vmx, vcpu, qual); 1206 break; 1207 case EXIT_REASON_RDMSR: 1208 ecx = vmxctx->guest_rcx; 1209 handled = emulate_rdmsr(vmx->vm, vcpu, ecx); 1210 if (!handled) { 1211 vmexit->exitcode = VM_EXITCODE_RDMSR; 1212 vmexit->u.msr.code = ecx; 1213 } 1214 break; 1215 case EXIT_REASON_WRMSR: 1216 eax = vmxctx->guest_rax; 1217 ecx = vmxctx->guest_rcx; 1218 edx = vmxctx->guest_rdx; 1219 handled = emulate_wrmsr(vmx->vm, vcpu, ecx, 1220 (uint64_t)edx << 32 | eax); 1221 if (!handled) { 1222 vmexit->exitcode = VM_EXITCODE_WRMSR; 1223 vmexit->u.msr.code = ecx; 1224 vmexit->u.msr.wval = (uint64_t)edx << 32 | eax; 1225 } 1226 break; 1227 case EXIT_REASON_HLT: 1228 vmexit->exitcode = VM_EXITCODE_HLT; 1229 break; 1230 case EXIT_REASON_MTF: 1231 vmexit->exitcode = VM_EXITCODE_MTRAP; 1232 break; 1233 case EXIT_REASON_PAUSE: 1234 vmexit->exitcode = VM_EXITCODE_PAUSE; 1235 break; 1236 case EXIT_REASON_INTR_WINDOW: 1237 vmx_clear_int_window_exiting(vmx, vcpu); 1238 VMM_CTR0(vmx->vm, vcpu, "Disabling interrupt window exiting"); 1239 /* FALLTHRU */ 1240 case EXIT_REASON_EXT_INTR: 1241 /* 1242 * External interrupts serve only to cause VM exits and allow 1243 * the host interrupt handler to run. 1244 * 1245 * If this external interrupt triggers a virtual interrupt 1246 * to a VM, then that state will be recorded by the 1247 * host interrupt handler in the VM's softc. We will inject 1248 * this virtual interrupt during the subsequent VM enter. 1249 */ 1250 1251 /* 1252 * This is special. We want to treat this as an 'handled' 1253 * VM-exit but not increment the instruction pointer. 1254 */ 1255 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EXTINT, 1); 1256 return (1); 1257 case EXIT_REASON_NMI_WINDOW: 1258 /* Exit to allow the pending virtual NMI to be injected */ 1259 vmx_clear_nmi_window_exiting(vmx, vcpu); 1260 VMM_CTR0(vmx->vm, vcpu, "Disabling NMI window exiting"); 1261 return (1); 1262 case EXIT_REASON_INOUT: 1263 vmexit->exitcode = VM_EXITCODE_INOUT; 1264 vmexit->u.inout.bytes = (qual & 0x7) + 1; 1265 vmexit->u.inout.in = (qual & 0x8) ? 1 : 0; 1266 vmexit->u.inout.string = (qual & 0x10) ? 1 : 0; 1267 vmexit->u.inout.rep = (qual & 0x20) ? 
1 : 0; 1268 vmexit->u.inout.port = (uint16_t)(qual >> 16); 1269 vmexit->u.inout.eax = (uint32_t)(vmxctx->guest_rax); 1270 break; 1271 case EXIT_REASON_CPUID: 1272 handled = vmx_handle_cpuid(vmx->vm, vcpu, vmxctx); 1273 break; 1274 case EXIT_REASON_EPT_FAULT: 1275 gpa = vmcs_gpa(); 1276 cr3 = vmcs_guest_cr3(); 1277 handled = vmx_lapic_fault(vmx->vm, vcpu, 1278 gpa, vmexit->rip, cr3, qual); 1279 if (!handled) { 1280 vmexit->exitcode = VM_EXITCODE_PAGING; 1281 vmexit->u.paging.cr3 = cr3; 1282 } 1283 break; 1284 default: 1285 break; 1286 } 1287 1288 if (handled) { 1289 /* 1290 * It is possible that control is returned to userland 1291 * even though we were able to handle the VM exit in the 1292 * kernel (for e.g. 'astpending' is set in the run loop). 1293 * 1294 * In such a case we want to make sure that the userland 1295 * restarts guest execution at the instruction *after* 1296 * the one we just processed. Therefore we update the 1297 * guest rip in the VMCS and in 'vmexit'. 1298 */ 1299 vm_exit_update_rip(vmexit); 1300 vmexit->rip += vmexit->inst_length; 1301 vmexit->inst_length = 0; 1302 1303 /* 1304 * Special case for spinning up an AP - exit to userspace to 1305 * give the controlling process a chance to intercept and 1306 * spin up a thread for the AP. 1307 */ 1308 if (vmexit->exitcode == VM_EXITCODE_SPINUP_AP) 1309 handled = 0; 1310 } else { 1311 if (vmexit->exitcode == VM_EXITCODE_BOGUS) { 1312 /* 1313 * If this VM exit was not claimed by anybody then 1314 * treat it as a generic VMX exit. 1315 */ 1316 vmexit->exitcode = VM_EXITCODE_VMX; 1317 vmexit->u.vmx.error = 0; 1318 } else { 1319 /* 1320 * The exitcode and collateral have been populated. 1321 * The VM exit will be processed further in userland. 1322 */ 1323 } 1324 } 1325 return (handled); 1326} 1327 1328static int 1329vmx_run(void *arg, int vcpu, register_t rip) 1330{ 1331 int error, vie, rc, handled, astpending; 1332 uint32_t exit_reason; 1333 struct vmx *vmx; 1334 struct vmxctx *vmxctx; 1335 struct vmcs *vmcs; 1336 struct vm_exit *vmexit; 1337 1338 vmx = arg; 1339 vmcs = &vmx->vmcs[vcpu]; 1340 vmxctx = &vmx->ctx[vcpu]; 1341 vmxctx->launched = 0; 1342 1343 vmexit = vm_exitinfo(vmx->vm, vcpu); 1344 1345 /* 1346 * XXX Can we avoid doing this every time we do a vm run? 1347 */ 1348 VMPTRLD(vmcs); 1349 1350 /* 1351 * XXX 1352 * We do this every time because we may setup the virtual machine 1353 * from a different process than the one that actually runs it. 1354 * 1355 * If the life of a virtual machine was spent entirely in the context 1356 * of a single process we could do this once in vmcs_set_defaults(). 
1357 */ 1358 if ((error = vmwrite(VMCS_HOST_CR3, rcr3())) != 0) 1359 panic("vmx_run: error %d writing to VMCS_HOST_CR3", error); 1360 1361 if ((error = vmwrite(VMCS_GUEST_RIP, rip)) != 0) 1362 panic("vmx_run: error %d writing to VMCS_GUEST_RIP", error); 1363 1364 if ((error = vmx_set_pcpu_defaults(vmx, vcpu)) != 0) 1365 panic("vmx_run: error %d setting up pcpu defaults", error); 1366 1367 do { 1368 lapic_timer_tick(vmx->vm, vcpu); 1369 vmx_inject_interrupts(vmx, vcpu); 1370 vmx_run_trace(vmx, vcpu); 1371 rc = vmx_setjmp(vmxctx); 1372#ifdef SETJMP_TRACE 1373 vmx_setjmp_trace(vmx, vcpu, vmxctx, rc); 1374#endif 1375 switch (rc) { 1376 case VMX_RETURN_DIRECT: 1377 if (vmxctx->launched == 0) { 1378 vmxctx->launched = 1; 1379 vmx_launch(vmxctx); 1380 } else 1381 vmx_resume(vmxctx); 1382 panic("vmx_launch/resume should not return"); 1383 break; 1384 case VMX_RETURN_LONGJMP: 1385 break; /* vm exit */ 1386 case VMX_RETURN_VMRESUME: 1387 vie = vmcs_instruction_error(); 1388 if (vmxctx->launch_error == VM_FAIL_INVALID || 1389 vie != VMRESUME_WITH_NON_LAUNCHED_VMCS) { 1390 printf("vmresume error %d vmcs inst error %d\n", 1391 vmxctx->launch_error, vie); 1392 goto err_exit; 1393 } 1394 vmx_launch(vmxctx); /* try to launch the guest */ 1395 panic("vmx_launch should not return"); 1396 break; 1397 case VMX_RETURN_VMLAUNCH: 1398 vie = vmcs_instruction_error(); 1399#if 1 1400 printf("vmlaunch error %d vmcs inst error %d\n", 1401 vmxctx->launch_error, vie); 1402#endif 1403 goto err_exit; 1404 default: 1405 panic("vmx_setjmp returned %d", rc); 1406 } 1407 1408 /* 1409 * XXX locking? 1410 * See comments in exception.S about checking for ASTs 1411 * atomically while interrupts are disabled. But it is 1412 * not clear that they apply in our case. 1413 */ 1414 astpending = curthread->td_flags & TDF_ASTPENDING; 1415 1416 /* enable interrupts */ 1417 enable_intr(); 1418 1419 /* collect some basic information for VM exit processing */ 1420 vmexit->rip = rip = vmcs_guest_rip(); 1421 vmexit->inst_length = vmexit_instruction_length(); 1422 vmexit->u.vmx.exit_reason = exit_reason = vmcs_exit_reason(); 1423 vmexit->u.vmx.exit_qualification = vmcs_exit_qualification(); 1424 1425 handled = vmx_exit_process(vmx, vcpu, vmexit); 1426 1427 vmx_exit_trace(vmx, vcpu, rip, exit_reason, handled, 1428 astpending); 1429 } while (handled && !astpending); 1430 1431 /* 1432 * If a VM exit has been handled then the exitcode must be BOGUS 1433 * If a VM exit is not handled then the exitcode must not be BOGUS 1434 */ 1435 if ((handled && vmexit->exitcode != VM_EXITCODE_BOGUS) || 1436 (!handled && vmexit->exitcode == VM_EXITCODE_BOGUS)) { 1437 panic("Mismatch between handled (%d) and exitcode (%d)", 1438 handled, vmexit->exitcode); 1439 } 1440 1441 VMM_CTR1(vmx->vm, vcpu, "goto userland: exitcode %d",vmexit->exitcode); 1442 1443 /* 1444 * XXX 1445 * We need to do this to ensure that any VMCS state cached by the 1446 * processor is flushed to memory. We need to do this in case the 1447 * VM moves to a different cpu the next time it runs. 1448 * 1449 * Can we avoid doing this? 1450 */ 1451 VMCLEAR(vmcs); 1452 return (0); 1453 1454err_exit: 1455 vmexit->exitcode = VM_EXITCODE_VMX; 1456 vmexit->u.vmx.exit_reason = (uint32_t)-1; 1457 vmexit->u.vmx.exit_qualification = (uint32_t)-1; 1458 vmexit->u.vmx.error = vie; 1459 VMCLEAR(vmcs); 1460 return (ENOEXEC); 1461} 1462 1463static void 1464vmx_vmcleanup(void *arg) 1465{ 1466 int error; 1467 struct vmx *vmx = arg; 1468 1469 /* 1470 * XXXSMP we also need to clear the VMCS active on the other vcpus. 
1471 */ 1472 error = vmclear(&vmx->vmcs[0]); 1473 if (error != 0) 1474 panic("vmx_vmcleanup: vmclear error %d on vcpu 0", error); 1475 1476 ept_vmcleanup(vmx); 1477 free(vmx, M_VMX); 1478 1479 return; 1480} 1481 1482static register_t * 1483vmxctx_regptr(struct vmxctx *vmxctx, int reg) 1484{ 1485 1486 switch (reg) { 1487 case VM_REG_GUEST_RAX: 1488 return (&vmxctx->guest_rax); 1489 case VM_REG_GUEST_RBX: 1490 return (&vmxctx->guest_rbx); 1491 case VM_REG_GUEST_RCX: 1492 return (&vmxctx->guest_rcx); 1493 case VM_REG_GUEST_RDX: 1494 return (&vmxctx->guest_rdx); 1495 case VM_REG_GUEST_RSI: 1496 return (&vmxctx->guest_rsi); 1497 case VM_REG_GUEST_RDI: 1498 return (&vmxctx->guest_rdi); 1499 case VM_REG_GUEST_RBP: 1500 return (&vmxctx->guest_rbp); 1501 case VM_REG_GUEST_R8: 1502 return (&vmxctx->guest_r8); 1503 case VM_REG_GUEST_R9: 1504 return (&vmxctx->guest_r9); 1505 case VM_REG_GUEST_R10: 1506 return (&vmxctx->guest_r10); 1507 case VM_REG_GUEST_R11: 1508 return (&vmxctx->guest_r11); 1509 case VM_REG_GUEST_R12: 1510 return (&vmxctx->guest_r12); 1511 case VM_REG_GUEST_R13: 1512 return (&vmxctx->guest_r13); 1513 case VM_REG_GUEST_R14: 1514 return (&vmxctx->guest_r14); 1515 case VM_REG_GUEST_R15: 1516 return (&vmxctx->guest_r15); 1517 default: 1518 break; 1519 } 1520 return (NULL); 1521} 1522 1523static int 1524vmxctx_getreg(struct vmxctx *vmxctx, int reg, uint64_t *retval) 1525{ 1526 register_t *regp; 1527 1528 if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) { 1529 *retval = *regp; 1530 return (0); 1531 } else 1532 return (EINVAL); 1533} 1534 1535static int 1536vmxctx_setreg(struct vmxctx *vmxctx, int reg, uint64_t val) 1537{ 1538 register_t *regp; 1539 1540 if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) { 1541 *regp = val; 1542 return (0); 1543 } else 1544 return (EINVAL); 1545} 1546 1547static int 1548vmx_getreg(void *arg, int vcpu, int reg, uint64_t *retval) 1549{ 1550 struct vmx *vmx = arg; 1551 1552 if (vmxctx_getreg(&vmx->ctx[vcpu], reg, retval) == 0) 1553 return (0); 1554 1555 /* 1556 * If the vcpu is running then don't mess with the VMCS. 1557 * 1558 * vmcs_getreg will VMCLEAR the vmcs when it is done which will cause 1559 * the subsequent vmlaunch/vmresume to fail. 1560 */ 1561 if (vcpu_is_running(vmx->vm, vcpu, NULL)) 1562 panic("vmx_getreg: %s%d is running", vm_name(vmx->vm), vcpu); 1563 1564 return (vmcs_getreg(&vmx->vmcs[vcpu], reg, retval)); 1565} 1566 1567static int 1568vmx_setreg(void *arg, int vcpu, int reg, uint64_t val) 1569{ 1570 int error; 1571 uint64_t ctls; 1572 struct vmx *vmx = arg; 1573 1574 /* 1575 * XXX Allow caller to set contents of the guest registers saved in 1576 * the 'vmxctx' even though the vcpu might be running. We need this 1577 * specifically to support the rdmsr emulation that will set the 1578 * %eax and %edx registers during vm exit processing. 1579 */ 1580 if (vmxctx_setreg(&vmx->ctx[vcpu], reg, val) == 0) 1581 return (0); 1582 1583 /* 1584 * If the vcpu is running then don't mess with the VMCS. 1585 * 1586 * vmcs_setreg will VMCLEAR the vmcs when it is done which will cause 1587 * the subsequent vmlaunch/vmresume to fail. 1588 */ 1589 if (vcpu_is_running(vmx->vm, vcpu, NULL)) 1590 panic("vmx_setreg: %s%d is running", vm_name(vmx->vm), vcpu); 1591 1592 error = vmcs_setreg(&vmx->vmcs[vcpu], reg, val); 1593 1594 if (error == 0) { 1595 /* 1596 * If the "load EFER" VM-entry control is 1 then the 1597 * value of EFER.LMA must be identical to "IA-32e mode guest" 1598 * bit in the VM-entry control. 
1599 */ 1600 if ((entry_ctls & VM_ENTRY_LOAD_EFER) != 0 && 1601 (reg == VM_REG_GUEST_EFER)) { 1602 vmcs_getreg(&vmx->vmcs[vcpu], 1603 VMCS_IDENT(VMCS_ENTRY_CTLS), &ctls); 1604 if (val & EFER_LMA) 1605 ctls |= VM_ENTRY_GUEST_LMA; 1606 else 1607 ctls &= ~VM_ENTRY_GUEST_LMA; 1608 vmcs_setreg(&vmx->vmcs[vcpu], 1609 VMCS_IDENT(VMCS_ENTRY_CTLS), ctls); 1610 } 1611 } 1612 1613 return (error); 1614} 1615 1616static int 1617vmx_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc) 1618{ 1619 struct vmx *vmx = arg; 1620 1621 return (vmcs_getdesc(&vmx->vmcs[vcpu], reg, desc)); 1622} 1623 1624static int 1625vmx_setdesc(void *arg, int vcpu, int reg, struct seg_desc *desc) 1626{ 1627 struct vmx *vmx = arg; 1628 1629 return (vmcs_setdesc(&vmx->vmcs[vcpu], reg, desc)); 1630} 1631 1632static int 1633vmx_inject(void *arg, int vcpu, int type, int vector, uint32_t code, 1634 int code_valid) 1635{ 1636 int error; 1637 uint32_t info; 1638 struct vmx *vmx = arg; 1639 struct vmcs *vmcs = &vmx->vmcs[vcpu]; 1640 1641 static uint32_t type_map[VM_EVENT_MAX] = { 1642 0x1, /* VM_EVENT_NONE */ 1643 0x0, /* VM_HW_INTR */ 1644 0x2, /* VM_NMI */ 1645 0x3, /* VM_HW_EXCEPTION */ 1646 0x4, /* VM_SW_INTR */ 1647 0x5, /* VM_PRIV_SW_EXCEPTION */ 1648 0x6, /* VM_SW_EXCEPTION */ 1649 }; 1650 1651 info = vector | (type_map[type] << 8) | (code_valid ? 1 << 11 : 0); 1652 info |= VMCS_INTERRUPTION_INFO_VALID; 1653 error = vmcs_setreg(vmcs, VMCS_IDENT(VMCS_ENTRY_INTR_INFO), info); 1654 if (error != 0) 1655 return (error); 1656 1657 if (code_valid) { 1658 error = vmcs_setreg(vmcs, 1659 VMCS_IDENT(VMCS_ENTRY_EXCEPTION_ERROR), 1660 code); 1661 } 1662 return (error); 1663} 1664 1665static int 1666vmx_nmi(void *arg, int vcpu) 1667{ 1668 struct vmx *vmx = arg; 1669 1670 atomic_set_int(&vmx->state[vcpu].request_nmi, 1); 1671 1672 return (0); 1673} 1674 1675static int 1676vmx_getcap(void *arg, int vcpu, int type, int *retval) 1677{ 1678 struct vmx *vmx = arg; 1679 int vcap; 1680 int ret; 1681 1682 ret = ENOENT; 1683 1684 vcap = vmx->cap[vcpu].set; 1685 1686 switch (type) { 1687 case VM_CAP_HALT_EXIT: 1688 if (cap_halt_exit) 1689 ret = 0; 1690 break; 1691 case VM_CAP_PAUSE_EXIT: 1692 if (cap_pause_exit) 1693 ret = 0; 1694 break; 1695 case VM_CAP_MTRAP_EXIT: 1696 if (cap_monitor_trap) 1697 ret = 0; 1698 break; 1699 case VM_CAP_UNRESTRICTED_GUEST: 1700 if (cap_unrestricted_guest) 1701 ret = 0; 1702 break; 1703 default: 1704 break; 1705 } 1706 1707 if (ret == 0) 1708 *retval = (vcap & (1 << type)) ? 
1 : 0; 1709 1710 return (ret); 1711} 1712 1713static int 1714vmx_setcap(void *arg, int vcpu, int type, int val) 1715{ 1716 struct vmx *vmx = arg; 1717 struct vmcs *vmcs = &vmx->vmcs[vcpu]; 1718 uint32_t baseval; 1719 uint32_t *pptr; 1720 int error; 1721 int flag; 1722 int reg; 1723 int retval; 1724 1725 retval = ENOENT; 1726 pptr = NULL; 1727 1728 switch (type) { 1729 case VM_CAP_HALT_EXIT: 1730 if (cap_halt_exit) { 1731 retval = 0; 1732 pptr = &vmx->cap[vcpu].proc_ctls; 1733 baseval = *pptr; 1734 flag = PROCBASED_HLT_EXITING; 1735 reg = VMCS_PRI_PROC_BASED_CTLS; 1736 } 1737 break; 1738 case VM_CAP_MTRAP_EXIT: 1739 if (cap_monitor_trap) { 1740 retval = 0; 1741 pptr = &vmx->cap[vcpu].proc_ctls; 1742 baseval = *pptr; 1743 flag = PROCBASED_MTF; 1744 reg = VMCS_PRI_PROC_BASED_CTLS; 1745 } 1746 break; 1747 case VM_CAP_PAUSE_EXIT: 1748 if (cap_pause_exit) { 1749 retval = 0; 1750 pptr = &vmx->cap[vcpu].proc_ctls; 1751 baseval = *pptr; 1752 flag = PROCBASED_PAUSE_EXITING; 1753 reg = VMCS_PRI_PROC_BASED_CTLS; 1754 } 1755 break; 1756 case VM_CAP_UNRESTRICTED_GUEST: 1757 if (cap_unrestricted_guest) { 1758 retval = 0; 1759 baseval = procbased_ctls2; 1760 flag = PROCBASED2_UNRESTRICTED_GUEST; 1761 reg = VMCS_SEC_PROC_BASED_CTLS; 1762 } 1763 break; 1764 default: 1765 break; 1766 } 1767 1768 if (retval == 0) { 1769 if (val) { 1770 baseval |= flag; 1771 } else { 1772 baseval &= ~flag; 1773 } 1774 VMPTRLD(vmcs); 1775 error = vmwrite(reg, baseval); 1776 VMCLEAR(vmcs); 1777 1778 if (error) { 1779 retval = error; 1780 } else { 1781 /* 1782 * Update optional stored flags, and record 1783 * setting 1784 */ 1785 if (pptr != NULL) { 1786 *pptr = baseval; 1787 } 1788 1789 if (val) { 1790 vmx->cap[vcpu].set |= (1 << type); 1791 } else { 1792 vmx->cap[vcpu].set &= ~(1 << type); 1793 } 1794 } 1795 } 1796 1797 return (retval); 1798} 1799 1800struct vmm_ops vmm_ops_intel = { 1801 vmx_init, 1802 vmx_cleanup, 1803 vmx_vminit, 1804 vmx_run, 1805 vmx_vmcleanup, 1806 ept_vmmmap, 1807 vmx_getreg, 1808 vmx_setreg, 1809 vmx_getdesc, 1810 vmx_setdesc, 1811 vmx_inject, 1812 vmx_nmi, 1813 vmx_getcap, 1814 vmx_setcap 1815}; 1816
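/*
 * Editor's note: another illustrative, self-contained sketch, not part of
 * the original file.  It models the invariant described in vmx_setreg()
 * above: when the "load EFER" VM-entry control is in use, the "IA-32e mode
 * guest" bit in the VM-entry controls must track EFER.LMA.  The constants
 * below are stand-ins chosen for the demo, not the definitions from the
 * real vmx/vmcs headers.
 */
#include <stdio.h>
#include <stdint.h>

#define DEMO_EFER_LMA		(1u << 10)	/* stand-in for EFER_LMA */
#define DEMO_ENTRY_GUEST_LMA	(1u << 9)	/* stand-in for VM_ENTRY_GUEST_LMA */

static uint32_t
demo_sync_entry_ctls(uint32_t entry_ctls, uint64_t efer)
{
	/* Keep the "IA-32e mode guest" control consistent with EFER.LMA. */
	if (efer & DEMO_EFER_LMA)
		entry_ctls |= DEMO_ENTRY_GUEST_LMA;
	else
		entry_ctls &= ~DEMO_ENTRY_GUEST_LMA;
	return (entry_ctls);
}

int
main(void)
{
	uint32_t ctls = 0;

	ctls = demo_sync_entry_ctls(ctls, DEMO_EFER_LMA);	/* 64-bit guest */
	printf("long mode:   entry ctls 0x%08x\n", (unsigned)ctls);

	ctls = demo_sync_entry_ctls(ctls, 0);			/* 32-bit guest */
	printf("legacy mode: entry ctls 0x%08x\n", (unsigned)ctls);
	return (0);
}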