vmx.c revision 241921
/*-
 * Copyright (c) 2011 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/smp.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/pcpu.h>
#include <sys/proc.h>

#include <vm/vm.h>
#include <vm/pmap.h>

#include <machine/psl.h>
#include <machine/cpufunc.h>
#include <machine/md_var.h>
#include <machine/pmap.h>
#include <machine/segments.h>
#include <machine/specialreg.h>
#include <machine/vmparam.h>

#include <x86/apicreg.h>

#include <machine/vmm.h>
#include "vmm_lapic.h"
#include "vmm_msr.h"
#include "vmm_ktr.h"
#include "vmm_stat.h"

#include "vmx_msr.h"
#include "ept.h"
#include "vmx_cpufunc.h"
#include "vmx.h"
#include "x86.h"
#include "vmx_controls.h"
#include "vmm_instruction_emul.h"

#define	CR4_VMXE	(1UL << 13)
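
/*
 * The ONE_SETTING and ZERO_SETTING macro pairs below name the control bits
 * that this implementation requires to be 1 and 0 respectively.
 * vmx_set_ctlreg() validates them against the corresponding VMX capability
 * MSR.  A rough sketch of that check, assuming the Intel SDM Appendix A
 * encoding in which the low 32 bits of a capability MSR give the controls
 * that must be 1 and the high 32 bits give the controls that may be 1:
 *
 *	cap = rdmsr(ctl_msr);
 *	must_be_one = (uint32_t)cap;
 *	may_be_one  = (uint32_t)(cap >> 32);
 *	ok = (ones_setting & ~may_be_one) == 0 &&
 *	     (zeros_setting & must_be_one) == 0;
 */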

#define	PINBASED_CTLS_ONE_SETTING					\
	(PINBASED_EXTINT_EXITING	|				\
	 PINBASED_NMI_EXITING		|				\
	 PINBASED_VIRTUAL_NMI)
#define	PINBASED_CTLS_ZERO_SETTING	0

#define	PROCBASED_CTLS_WINDOW_SETTING					\
	(PROCBASED_INT_WINDOW_EXITING	|				\
	 PROCBASED_NMI_WINDOW_EXITING)

#define	PROCBASED_CTLS_ONE_SETTING					\
	(PROCBASED_SECONDARY_CONTROLS	|				\
	 PROCBASED_IO_EXITING		|				\
	 PROCBASED_MSR_BITMAPS		|				\
	 PROCBASED_CTLS_WINDOW_SETTING)
#define	PROCBASED_CTLS_ZERO_SETTING					\
	(PROCBASED_CR3_LOAD_EXITING	|				\
	 PROCBASED_CR3_STORE_EXITING	|				\
	 PROCBASED_IO_BITMAPS)

#define	PROCBASED_CTLS2_ONE_SETTING	PROCBASED2_ENABLE_EPT
#define	PROCBASED_CTLS2_ZERO_SETTING	0

#define	VM_EXIT_CTLS_ONE_SETTING_NO_PAT					\
	(VM_EXIT_HOST_LMA		|				\
	 VM_EXIT_SAVE_EFER		|				\
	 VM_EXIT_LOAD_EFER)

#define	VM_EXIT_CTLS_ONE_SETTING					\
	(VM_EXIT_CTLS_ONE_SETTING_NO_PAT |				\
	 VM_EXIT_SAVE_PAT		|				\
	 VM_EXIT_LOAD_PAT)
#define	VM_EXIT_CTLS_ZERO_SETTING	VM_EXIT_SAVE_DEBUG_CONTROLS

#define	VM_ENTRY_CTLS_ONE_SETTING_NO_PAT	VM_ENTRY_LOAD_EFER

#define	VM_ENTRY_CTLS_ONE_SETTING					\
	(VM_ENTRY_CTLS_ONE_SETTING_NO_PAT |				\
	 VM_ENTRY_LOAD_PAT)
#define	VM_ENTRY_CTLS_ZERO_SETTING					\
	(VM_ENTRY_LOAD_DEBUG_CONTROLS	|				\
	 VM_ENTRY_INTO_SMM		|				\
	 VM_ENTRY_DEACTIVATE_DUAL_MONITOR)

#define	guest_msr_rw(vmx, msr) \
	msr_bitmap_change_access((vmx)->msr_bitmap, (msr), MSR_BITMAP_ACCESS_RW)

#define	HANDLED		1
#define	UNHANDLED	0

MALLOC_DEFINE(M_VMX, "vmx", "vmx");

extern struct pcpu __pcpu[];

int vmxon_enabled[MAXCPU];
static char vmxon_region[MAXCPU][PAGE_SIZE] __aligned(PAGE_SIZE);

static uint32_t pinbased_ctls, procbased_ctls, procbased_ctls2;
static uint32_t exit_ctls, entry_ctls;

static uint64_t cr0_ones_mask, cr0_zeros_mask;
static uint64_t cr4_ones_mask, cr4_zeros_mask;

static volatile u_int nextvpid;

static int vmx_no_patmsr;

/*
 * Virtual NMI blocking conditions.
 *
 * Some processor implementations also require NMI to be blocked if
 * the STI_BLOCKING bit is set. It is possible to detect this at runtime
 * based on the (exit_reason,exit_qual) tuple being set to
 * (EXIT_REASON_INVAL_VMCS, EXIT_QUAL_NMI_WHILE_STI_BLOCKING).
 *
 * We take the easy way out and also include STI_BLOCKING as one of the
 * gating items for vNMI injection.
 */
static uint64_t nmi_blocking_bits = VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING |
				    VMCS_INTERRUPTIBILITY_NMI_BLOCKING |
				    VMCS_INTERRUPTIBILITY_STI_BLOCKING;

/*
 * Optional capabilities
 */
static int cap_halt_exit;
static int cap_pause_exit;
static int cap_unrestricted_guest;
static int cap_monitor_trap;

/* statistics */
static VMM_STAT_DEFINE(VCPU_MIGRATIONS, "vcpu migration across host cpus");
static VMM_STAT_DEFINE(VMEXIT_EXTINT, "vm exits due to external interrupt");

#ifdef KTR
static const char *
exit_reason_to_str(int reason)
{
	static char reasonbuf[32];

	switch (reason) {
	case EXIT_REASON_EXCEPTION:
		return "exception";
	case EXIT_REASON_EXT_INTR:
		return "extint";
	case EXIT_REASON_TRIPLE_FAULT:
		return "triplefault";
	case EXIT_REASON_INIT:
		return "init";
	case EXIT_REASON_SIPI:
		return "sipi";
	case EXIT_REASON_IO_SMI:
		return "iosmi";
	case EXIT_REASON_SMI:
		return "smi";
	case EXIT_REASON_INTR_WINDOW:
		return "intrwindow";
	case EXIT_REASON_NMI_WINDOW:
		return "nmiwindow";
	case EXIT_REASON_TASK_SWITCH:
		return "taskswitch";
	case EXIT_REASON_CPUID:
		return "cpuid";
	case EXIT_REASON_GETSEC:
		return "getsec";
	case EXIT_REASON_HLT:
		return "hlt";
	case EXIT_REASON_INVD:
		return "invd";
	case EXIT_REASON_INVLPG:
		return "invlpg";
	case EXIT_REASON_RDPMC:
		return "rdpmc";
	case EXIT_REASON_RDTSC:
		return "rdtsc";
	case EXIT_REASON_RSM:
		return "rsm";
	case EXIT_REASON_VMCALL:
		return "vmcall";
	case EXIT_REASON_VMCLEAR:
		return "vmclear";
	case EXIT_REASON_VMLAUNCH:
		return "vmlaunch";
	case EXIT_REASON_VMPTRLD:
		return "vmptrld";
	case EXIT_REASON_VMPTRST:
		return "vmptrst";
	case EXIT_REASON_VMREAD:
		return "vmread";
	case EXIT_REASON_VMRESUME:
		return "vmresume";
	case EXIT_REASON_VMWRITE:
		return "vmwrite";
	case EXIT_REASON_VMXOFF:
		return "vmxoff";
	case EXIT_REASON_VMXON:
		return "vmxon";
	case EXIT_REASON_CR_ACCESS:
		return "craccess";
	case EXIT_REASON_DR_ACCESS:
		return "draccess";
	case EXIT_REASON_INOUT:
		return "inout";
	case EXIT_REASON_RDMSR:
		return "rdmsr";
	case EXIT_REASON_WRMSR:
		return "wrmsr";
	case EXIT_REASON_INVAL_VMCS:
		return "invalvmcs";
	case EXIT_REASON_INVAL_MSR:
"invalmsr"; 240 case EXIT_REASON_MWAIT: 241 return "mwait"; 242 case EXIT_REASON_MTF: 243 return "mtf"; 244 case EXIT_REASON_MONITOR: 245 return "monitor"; 246 case EXIT_REASON_PAUSE: 247 return "pause"; 248 case EXIT_REASON_MCE: 249 return "mce"; 250 case EXIT_REASON_TPR: 251 return "tpr"; 252 case EXIT_REASON_APIC: 253 return "apic"; 254 case EXIT_REASON_GDTR_IDTR: 255 return "gdtridtr"; 256 case EXIT_REASON_LDTR_TR: 257 return "ldtrtr"; 258 case EXIT_REASON_EPT_FAULT: 259 return "eptfault"; 260 case EXIT_REASON_EPT_MISCONFIG: 261 return "eptmisconfig"; 262 case EXIT_REASON_INVEPT: 263 return "invept"; 264 case EXIT_REASON_RDTSCP: 265 return "rdtscp"; 266 case EXIT_REASON_VMX_PREEMPT: 267 return "vmxpreempt"; 268 case EXIT_REASON_INVVPID: 269 return "invvpid"; 270 case EXIT_REASON_WBINVD: 271 return "wbinvd"; 272 case EXIT_REASON_XSETBV: 273 return "xsetbv"; 274 default: 275 snprintf(reasonbuf, sizeof(reasonbuf), "%d", reason); 276 return (reasonbuf); 277 } 278} 279 280#ifdef SETJMP_TRACE 281static const char * 282vmx_setjmp_rc2str(int rc) 283{ 284 switch (rc) { 285 case VMX_RETURN_DIRECT: 286 return "direct"; 287 case VMX_RETURN_LONGJMP: 288 return "longjmp"; 289 case VMX_RETURN_VMRESUME: 290 return "vmresume"; 291 case VMX_RETURN_VMLAUNCH: 292 return "vmlaunch"; 293 case VMX_RETURN_AST: 294 return "ast"; 295 default: 296 return "unknown"; 297 } 298} 299 300#define SETJMP_TRACE(vmx, vcpu, vmxctx, regname) \ 301 VMM_CTR1((vmx)->vm, (vcpu), "setjmp trace " #regname " 0x%016lx", \ 302 (vmxctx)->regname) 303 304static void 305vmx_setjmp_trace(struct vmx *vmx, int vcpu, struct vmxctx *vmxctx, int rc) 306{ 307 uint64_t host_rip, host_rsp; 308 309 if (vmxctx != &vmx->ctx[vcpu]) 310 panic("vmx_setjmp_trace: invalid vmxctx %p; should be %p", 311 vmxctx, &vmx->ctx[vcpu]); 312 313 VMM_CTR1((vmx)->vm, (vcpu), "vmxctx = %p", vmxctx); 314 VMM_CTR2((vmx)->vm, (vcpu), "setjmp return code %s(%d)", 315 vmx_setjmp_rc2str(rc), rc); 316 317 host_rsp = host_rip = ~0; 318 vmread(VMCS_HOST_RIP, &host_rip); 319 vmread(VMCS_HOST_RSP, &host_rsp); 320 VMM_CTR2((vmx)->vm, (vcpu), "vmcs host_rip 0x%016lx, host_rsp 0x%016lx", 321 host_rip, host_rsp); 322 323 SETJMP_TRACE(vmx, vcpu, vmxctx, host_r15); 324 SETJMP_TRACE(vmx, vcpu, vmxctx, host_r14); 325 SETJMP_TRACE(vmx, vcpu, vmxctx, host_r13); 326 SETJMP_TRACE(vmx, vcpu, vmxctx, host_r12); 327 SETJMP_TRACE(vmx, vcpu, vmxctx, host_rbp); 328 SETJMP_TRACE(vmx, vcpu, vmxctx, host_rsp); 329 SETJMP_TRACE(vmx, vcpu, vmxctx, host_rbx); 330 SETJMP_TRACE(vmx, vcpu, vmxctx, host_rip); 331 332 SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rdi); 333 SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rsi); 334 SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rdx); 335 SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rcx); 336 SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r8); 337 SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r9); 338 SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rax); 339 SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rbx); 340 SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rbp); 341 SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r10); 342 SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r11); 343 SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r12); 344 SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r13); 345 SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r14); 346 SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r15); 347 SETJMP_TRACE(vmx, vcpu, vmxctx, guest_cr2); 348} 349#endif 350#else 351static void __inline 352vmx_setjmp_trace(struct vmx *vmx, int vcpu, struct vmxctx *vmxctx, int rc) 353{ 354 return; 355} 356#endif /* KTR */ 357 358u_long 359vmx_fix_cr0(u_long cr0) 360{ 361 362 return ((cr0 | 

u_long
vmx_fix_cr0(u_long cr0)
{

	return ((cr0 | cr0_ones_mask) & ~cr0_zeros_mask);
}

u_long
vmx_fix_cr4(u_long cr4)
{

	return ((cr4 | cr4_ones_mask) & ~cr4_zeros_mask);
}

static void
msr_save_area_init(struct msr_entry *g_area, int *g_count)
{
	int cnt;

	static struct msr_entry guest_msrs[] = {
		{ MSR_KGSBASE, 0, 0 },
	};

	cnt = sizeof(guest_msrs) / sizeof(guest_msrs[0]);
	if (cnt > GUEST_MSR_MAX_ENTRIES)
		panic("guest msr save area overrun");
	bcopy(guest_msrs, g_area, sizeof(guest_msrs));
	*g_count = cnt;
}

static void
vmx_disable(void *arg __unused)
{
	struct invvpid_desc invvpid_desc = { 0 };
	struct invept_desc invept_desc = { 0 };

	if (vmxon_enabled[curcpu]) {
		/*
		 * See sections 25.3.3.3 and 25.3.3.4 in Intel Vol 3b.
		 *
		 * VMXON or VMXOFF are not required to invalidate any TLB
		 * caching structures, so flush them explicitly here to
		 * prevent potential retention of cached information in the
		 * TLB between distinct VMX episodes.
		 */
		invvpid(INVVPID_TYPE_ALL_CONTEXTS, invvpid_desc);
		invept(INVEPT_TYPE_ALL_CONTEXTS, invept_desc);
		vmxoff();
	}
	load_cr4(rcr4() & ~CR4_VMXE);
}

static int
vmx_cleanup(void)
{

	smp_rendezvous(NULL, vmx_disable, NULL, NULL);

	return (0);
}

static void
vmx_enable(void *arg __unused)
{
	int error;

	load_cr4(rcr4() | CR4_VMXE);

	*(uint32_t *)vmxon_region[curcpu] = vmx_revision();
	error = vmxon(vmxon_region[curcpu]);
	if (error == 0)
		vmxon_enabled[curcpu] = 1;
}

static int
vmx_init(void)
{
	int error;
	uint64_t fixed0, fixed1, feature_control;
	uint32_t tmp;

	/* CPUID.1:ECX[bit 5] must be 1 for processor to support VMX */
	if (!(cpu_feature2 & CPUID2_VMX)) {
		printf("vmx_init: processor does not support VMX operation\n");
		return (ENXIO);
	}

	/*
	 * Verify that MSR_IA32_FEATURE_CONTROL lock and VMXON enable bits
	 * are set (bits 0 and 2 respectively).
	 */
	feature_control = rdmsr(MSR_IA32_FEATURE_CONTROL);
	if ((feature_control & 0x5) != 0x5) {
		printf("vmx_init: VMX operation disabled by BIOS\n");
		return (ENXIO);
	}

	/* Check support for primary processor-based VM-execution controls */
	error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
			       MSR_VMX_TRUE_PROCBASED_CTLS,
			       PROCBASED_CTLS_ONE_SETTING,
			       PROCBASED_CTLS_ZERO_SETTING, &procbased_ctls);
	if (error) {
		printf("vmx_init: processor does not support desired primary "
		       "processor-based controls\n");
		return (error);
	}

	/* Clear the processor-based ctl bits that are set on demand */
	procbased_ctls &= ~PROCBASED_CTLS_WINDOW_SETTING;

	/* Check support for secondary processor-based VM-execution controls */
	error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
			       MSR_VMX_PROCBASED_CTLS2,
			       PROCBASED_CTLS2_ONE_SETTING,
			       PROCBASED_CTLS2_ZERO_SETTING, &procbased_ctls2);
	if (error) {
		printf("vmx_init: processor does not support desired secondary "
		       "processor-based controls\n");
		return (error);
	}

	/* Check support for VPID */
	error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2,
			       PROCBASED2_ENABLE_VPID, 0, &tmp);
	if (error == 0)
		procbased_ctls2 |= PROCBASED2_ENABLE_VPID;

	/* Check support for pin-based VM-execution controls */
	error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS,
			       MSR_VMX_TRUE_PINBASED_CTLS,
			       PINBASED_CTLS_ONE_SETTING,
			       PINBASED_CTLS_ZERO_SETTING, &pinbased_ctls);
	if (error) {
		printf("vmx_init: processor does not support desired "
		       "pin-based controls\n");
		return (error);
	}

	/* Check support for VM-exit controls */
	error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS, MSR_VMX_TRUE_EXIT_CTLS,
			       VM_EXIT_CTLS_ONE_SETTING,
			       VM_EXIT_CTLS_ZERO_SETTING,
			       &exit_ctls);
	if (error) {
		/* Try again without the PAT MSR bits */
		error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS,
				       MSR_VMX_TRUE_EXIT_CTLS,
				       VM_EXIT_CTLS_ONE_SETTING_NO_PAT,
				       VM_EXIT_CTLS_ZERO_SETTING,
				       &exit_ctls);
		if (error) {
			printf("vmx_init: processor does not support desired "
			       "exit controls\n");
			return (error);
		} else {
			if (bootverbose)
				printf("vmm: PAT MSR access not supported\n");
			guest_msr_valid(MSR_PAT);
			vmx_no_patmsr = 1;
		}
	}

	/* Check support for VM-entry controls */
	if (!vmx_no_patmsr) {
		error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS,
				       MSR_VMX_TRUE_ENTRY_CTLS,
				       VM_ENTRY_CTLS_ONE_SETTING,
				       VM_ENTRY_CTLS_ZERO_SETTING,
				       &entry_ctls);
	} else {
		error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS,
				       MSR_VMX_TRUE_ENTRY_CTLS,
				       VM_ENTRY_CTLS_ONE_SETTING_NO_PAT,
				       VM_ENTRY_CTLS_ZERO_SETTING,
				       &entry_ctls);
	}

	if (error) {
		printf("vmx_init: processor does not support desired "
		       "entry controls\n");
		return (error);
	}

	/*
	 * Check support for optional features by testing them
	 * as individual bits
	 */
	cap_halt_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
					MSR_VMX_TRUE_PROCBASED_CTLS,
					PROCBASED_HLT_EXITING, 0,
					&tmp) == 0);

	cap_monitor_trap = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
					   MSR_VMX_PROCBASED_CTLS,
					   PROCBASED_MTF, 0,
					   &tmp) == 0);

	cap_pause_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
					 MSR_VMX_TRUE_PROCBASED_CTLS,
					 PROCBASED_PAUSE_EXITING, 0,
					 &tmp) == 0);

	cap_unrestricted_guest = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
						 MSR_VMX_PROCBASED_CTLS2,
						 PROCBASED2_UNRESTRICTED_GUEST, 0,
						 &tmp) == 0);

	/* Initialize EPT */
	error = ept_init();
	if (error) {
		printf("vmx_init: ept initialization failed (%d)\n", error);
		return (error);
	}

	/*
	 * Stash the cr0 and cr4 bits that must be fixed to 0 or 1
	 */
	fixed0 = rdmsr(MSR_VMX_CR0_FIXED0);
	fixed1 = rdmsr(MSR_VMX_CR0_FIXED1);
	cr0_ones_mask = fixed0 & fixed1;
	cr0_zeros_mask = ~fixed0 & ~fixed1;

	/*
	 * CR0_PE and CR0_PG can be set to zero in VMX non-root operation
	 * if unrestricted guest execution is allowed.
	 */
	if (cap_unrestricted_guest)
		cr0_ones_mask &= ~(CR0_PG | CR0_PE);

	/*
	 * Do not allow the guest to set CR0_NW or CR0_CD.
	 */
	cr0_zeros_mask |= (CR0_NW | CR0_CD);

	fixed0 = rdmsr(MSR_VMX_CR4_FIXED0);
	fixed1 = rdmsr(MSR_VMX_CR4_FIXED1);
	cr4_ones_mask = fixed0 & fixed1;
	cr4_zeros_mask = ~fixed0 & ~fixed1;

	/* enable VMX operation */
	smp_rendezvous(NULL, vmx_enable, NULL, NULL);

	return (0);
}

/*
 * If this processor does not support VPIDs then simply return 0.
 *
 * Otherwise generate the next value of VPID to use. Any value is alright
 * as long as it is non-zero.
 *
 * We always execute in VMX non-root context with EPT enabled. Thus all
 * combined mappings are tagged with the (EP4TA, VPID, PCID) tuple. This
 * in turn means that multiple VMs can share the same VPID as long as
 * they have distinct EPT page tables.
 *
 * XXX
 * We should optimize this so that it returns VPIDs that are not in
 * use. Then we will not unnecessarily invalidate mappings in
 * vmx_set_pcpu_defaults() just because two or more vcpus happen to
 * use the same 'vpid'.
 */
static uint16_t
vmx_vpid(void)
{
	uint16_t vpid = 0;

	if ((procbased_ctls2 & PROCBASED2_ENABLE_VPID) != 0) {
		do {
			vpid = atomic_fetchadd_int(&nextvpid, 1);
		} while (vpid == 0);
	}

	return (vpid);
}

static int
vmx_setup_cr_shadow(int which, struct vmcs *vmcs)
{
	int error, mask_ident, shadow_ident;
	uint64_t mask_value, shadow_value;

	if (which != 0 && which != 4)
		panic("vmx_setup_cr_shadow: unknown cr%d", which);

	if (which == 0) {
		mask_ident = VMCS_CR0_MASK;
		mask_value = cr0_ones_mask | cr0_zeros_mask;
		shadow_ident = VMCS_CR0_SHADOW;
		shadow_value = cr0_ones_mask;
	} else {
		mask_ident = VMCS_CR4_MASK;
		mask_value = cr4_ones_mask | cr4_zeros_mask;
		shadow_ident = VMCS_CR4_SHADOW;
		shadow_value = cr4_ones_mask;
	}

	error = vmcs_setreg(vmcs, VMCS_IDENT(mask_ident), mask_value);
	if (error)
		return (error);

	error = vmcs_setreg(vmcs, VMCS_IDENT(shadow_ident), shadow_value);
	if (error)
		return (error);

	return (0);
}
#define	vmx_setup_cr0_shadow(vmcs)	vmx_setup_cr_shadow(0, (vmcs))
#define	vmx_setup_cr4_shadow(vmcs)	vmx_setup_cr_shadow(4, (vmcs))
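
/*
 * A note on the shadow setup above, sketched from the Intel SDM semantics
 * of the CR0/CR4 guest/host masks and read shadows: for every bit set in
 * the mask, a guest read of the control register returns the shadow bit,
 * and a guest write that would change such a bit from its shadow value
 * causes a VM exit; bits clear in the mask are owned by the guest.
 * Roughly:
 *
 *	read_value = (shadow & mask) | (real_cr & ~mask);
 *
 * Using mask = ones_mask | zeros_mask and shadow = ones_mask therefore
 * lets the guest see its "must be 1" bits as 1 and its "must be 0" bits
 * as 0, while trapping attempts to change either set.
 */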

static void *
vmx_vminit(struct vm *vm)
{
	uint16_t vpid;
	int i, error, guest_msr_count;
	struct vmx *vmx;

	vmx = malloc(sizeof(struct vmx), M_VMX, M_WAITOK | M_ZERO);
	if ((uintptr_t)vmx & PAGE_MASK) {
		panic("malloc of struct vmx not aligned on %d byte boundary",
		      PAGE_SIZE);
	}
	vmx->vm = vm;

	/*
	 * Clean up EPTP-tagged guest physical and combined mappings
	 *
	 * VMX transitions are not required to invalidate any guest physical
	 * mappings. So, it may be possible for stale guest physical mappings
	 * to be present in the processor TLBs.
	 *
	 * Combined mappings for this EP4TA are also invalidated for all VPIDs.
	 */
	ept_invalidate_mappings(vtophys(vmx->pml4ept));

	msr_bitmap_initialize(vmx->msr_bitmap);

	/*
	 * It is safe to allow direct access to MSR_GSBASE and MSR_FSBASE.
	 * The guest FSBASE and GSBASE are saved and restored during
	 * vm-exit and vm-entry respectively. The host FSBASE and GSBASE are
	 * always restored from the vmcs host state area on vm-exit.
	 *
	 * Guest KGSBASE is saved and restored in the guest MSR save area.
	 * Host KGSBASE is restored before returning to userland from the pcb.
	 * There will be a window of time when we are executing in the host
	 * kernel context with a value of KGSBASE from the guest. This is ok
	 * because the value of KGSBASE is inconsequential in kernel context.
	 *
	 * MSR_EFER is saved and restored in the guest VMCS area on a
	 * VM exit and entry respectively. It is also restored from the
	 * host VMCS area on a VM exit.
	 */
	if (guest_msr_rw(vmx, MSR_GSBASE) ||
	    guest_msr_rw(vmx, MSR_FSBASE) ||
	    guest_msr_rw(vmx, MSR_KGSBASE) ||
	    guest_msr_rw(vmx, MSR_EFER))
		panic("vmx_vminit: error setting guest msr access");

	/*
	 * MSR_PAT is saved and restored in the guest VMCS area on a VM exit
	 * and entry respectively. It is also restored from the host VMCS
	 * area on a VM exit. However, if running on a system with no
	 * MSR_PAT save/restore support, leave access disabled so accesses
	 * will be trapped.
	 */
	if (!vmx_no_patmsr && guest_msr_rw(vmx, MSR_PAT))
		panic("vmx_vminit: error setting guest pat msr access");

	for (i = 0; i < VM_MAXCPU; i++) {
		vmx->vmcs[i].identifier = vmx_revision();
		error = vmclear(&vmx->vmcs[i]);
		if (error != 0) {
			panic("vmx_vminit: vmclear error %d on vcpu %d\n",
			      error, i);
		}

		vpid = vmx_vpid();

		error = vmcs_set_defaults(&vmx->vmcs[i],
					  (u_long)vmx_longjmp,
					  (u_long)&vmx->ctx[i],
					  vtophys(vmx->pml4ept),
					  pinbased_ctls,
					  procbased_ctls,
					  procbased_ctls2,
					  exit_ctls, entry_ctls,
					  vtophys(vmx->msr_bitmap),
					  vpid);

		if (error != 0)
			panic("vmx_vminit: vmcs_set_defaults error %d", error);

		vmx->cap[i].set = 0;
		vmx->cap[i].proc_ctls = procbased_ctls;

		vmx->state[i].request_nmi = 0;
		vmx->state[i].lastcpu = -1;
		vmx->state[i].vpid = vpid;

		msr_save_area_init(vmx->guest_msrs[i], &guest_msr_count);

		error = vmcs_set_msr_save(&vmx->vmcs[i],
					  vtophys(vmx->guest_msrs[i]),
					  guest_msr_count);
		if (error != 0)
			panic("vmcs_set_msr_save error %d", error);

		error = vmx_setup_cr0_shadow(&vmx->vmcs[i]);
		if (error != 0)
			panic("vmx_setup_cr0_shadow %d", error);

		error = vmx_setup_cr4_shadow(&vmx->vmcs[i]);
		if (error != 0)
			panic("vmx_setup_cr4_shadow %d", error);
	}

	return (vmx);
}

static int
vmx_handle_cpuid(struct vm *vm, int vcpu, struct vmxctx *vmxctx)
{
	int handled, func;

	func = vmxctx->guest_rax;

	handled = x86_emulate_cpuid(vm, vcpu,
				    (uint32_t*)(&vmxctx->guest_rax),
				    (uint32_t*)(&vmxctx->guest_rbx),
				    (uint32_t*)(&vmxctx->guest_rcx),
				    (uint32_t*)(&vmxctx->guest_rdx));
	return (handled);
}

static __inline void
vmx_run_trace(struct vmx *vmx, int vcpu)
{
#ifdef KTR
	VMM_CTR1(vmx->vm, vcpu, "Resume execution at 0x%0lx", vmcs_guest_rip());
#endif
}

static __inline void
vmx_exit_trace(struct vmx *vmx, int vcpu, uint64_t rip, uint32_t exit_reason,
	       int handled)
{
#ifdef KTR
	VMM_CTR3(vmx->vm, vcpu, "%s %s vmexit at 0x%0lx",
		 handled ? "handled" : "unhandled",
		 exit_reason_to_str(exit_reason), rip);
#endif
}

static __inline void
vmx_astpending_trace(struct vmx *vmx, int vcpu, uint64_t rip)
{
#ifdef KTR
	VMM_CTR1(vmx->vm, vcpu, "astpending vmexit at 0x%0lx", rip);
#endif
}

static int
vmx_set_pcpu_defaults(struct vmx *vmx, int vcpu)
{
	int error, lastcpu;
	struct vmxstate *vmxstate;
	struct invvpid_desc invvpid_desc = { 0 };

	vmxstate = &vmx->state[vcpu];
	lastcpu = vmxstate->lastcpu;
	vmxstate->lastcpu = curcpu;

	if (lastcpu == curcpu) {
		error = 0;
		goto done;
	}

	vmm_stat_incr(vmx->vm, vcpu, VCPU_MIGRATIONS, 1);

	error = vmwrite(VMCS_HOST_TR_BASE, (u_long)PCPU_GET(tssp));
	if (error != 0)
		goto done;

	error = vmwrite(VMCS_HOST_GDTR_BASE, (u_long)&gdt[NGDT * curcpu]);
	if (error != 0)
		goto done;

	error = vmwrite(VMCS_HOST_GS_BASE, (u_long)&__pcpu[curcpu]);
	if (error != 0)
		goto done;

	/*
	 * If we are using VPIDs then invalidate all mappings tagged with 'vpid'
	 *
	 * We do this because this vcpu was executing on a different host
	 * cpu when it last ran. We do not track whether it invalidated
	 * mappings associated with its 'vpid' during that run. So we must
	 * assume that the mappings associated with 'vpid' on 'curcpu' are
	 * stale and invalidate them.
	 *
	 * Note that we incur this penalty only when the scheduler chooses to
	 * move the thread associated with this vcpu between host cpus.
	 *
	 * Note also that this will invalidate mappings tagged with 'vpid'
	 * for "all" EP4TAs.
	 */
	if (vmxstate->vpid != 0) {
		invvpid_desc.vpid = vmxstate->vpid;
		invvpid(INVVPID_TYPE_SINGLE_CONTEXT, invvpid_desc);
	}
done:
	return (error);
}

static void
vm_exit_update_rip(struct vm_exit *vmexit)
{
	int error;

	error = vmwrite(VMCS_GUEST_RIP, vmexit->rip + vmexit->inst_length);
	if (error)
		panic("vmx_run: error %d writing to VMCS_GUEST_RIP", error);
}

/*
 * We depend on 'procbased_ctls' to have the Interrupt Window Exiting bit set.
 */
CTASSERT((PROCBASED_CTLS_ONE_SETTING & PROCBASED_INT_WINDOW_EXITING) != 0);

static void __inline
vmx_set_int_window_exiting(struct vmx *vmx, int vcpu)
{
	int error;

	vmx->cap[vcpu].proc_ctls |= PROCBASED_INT_WINDOW_EXITING;

	error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
	if (error)
		panic("vmx_set_int_window_exiting: vmwrite error %d", error);
}

static void __inline
vmx_clear_int_window_exiting(struct vmx *vmx, int vcpu)
{
	int error;

	vmx->cap[vcpu].proc_ctls &= ~PROCBASED_INT_WINDOW_EXITING;

	error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
	if (error)
		panic("vmx_clear_int_window_exiting: vmwrite error %d", error);
}

static void __inline
vmx_set_nmi_window_exiting(struct vmx *vmx, int vcpu)
{
	int error;

	vmx->cap[vcpu].proc_ctls |= PROCBASED_NMI_WINDOW_EXITING;

	error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
	if (error)
		panic("vmx_set_nmi_window_exiting: vmwrite error %d", error);
}

static void __inline
vmx_clear_nmi_window_exiting(struct vmx *vmx, int vcpu)
{
	int error;

	vmx->cap[vcpu].proc_ctls &= ~PROCBASED_NMI_WINDOW_EXITING;

	error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
	if (error)
		panic("vmx_clear_nmi_window_exiting: vmwrite error %d", error);
}

static int
vmx_inject_nmi(struct vmx *vmx, int vcpu)
{
	int error;
	uint64_t info, interruptibility;

	/* Bail out if no NMI requested */
	if (vmx->state[vcpu].request_nmi == 0)
		return (0);

	error = vmread(VMCS_GUEST_INTERRUPTIBILITY, &interruptibility);
	if (error) {
		panic("vmx_inject_nmi: vmread(interruptibility) %d",
		      error);
	}
	if (interruptibility & nmi_blocking_bits)
		goto nmiblocked;

	/*
	 * Inject the virtual NMI. The vector must be the NMI IDT entry
	 * or the VMCS entry check will fail.
	 */
	info = VMCS_INTERRUPTION_INFO_NMI | VMCS_INTERRUPTION_INFO_VALID;
	info |= IDT_NMI;

	error = vmwrite(VMCS_ENTRY_INTR_INFO, info);
	if (error)
		panic("vmx_inject_nmi: vmwrite(intrinfo) %d", error);

	VMM_CTR0(vmx->vm, vcpu, "Injecting vNMI");

	/* Clear the request */
	vmx->state[vcpu].request_nmi = 0;
	return (1);

nmiblocked:
	/*
	 * Set the NMI Window Exiting execution control so we can inject
	 * the virtual NMI as soon as the blocking condition goes away.
	 */
	vmx_set_nmi_window_exiting(vmx, vcpu);

	VMM_CTR0(vmx->vm, vcpu, "Enabling NMI window exiting");
	return (1);
}
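
/*
 * For reference, a sketch of the VM-entry interruption-information field
 * layout per the Intel SDM (the VMCS_INTERRUPTION_INFO_* constants used
 * above and in vmx_inject() encode these fields):
 *
 *	bits  7:0	vector (IDT_NMI for an NMI)
 *	bits 10:8	interruption type (0 ext intr, 2 NMI, 3 hw exception)
 *	bit  11		deliver error code
 *	bit  31		valid
 */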

static void
vmx_inject_interrupts(struct vmx *vmx, int vcpu)
{
	int error, vector;
	uint64_t info, rflags, interruptibility;

	const int HWINTR_BLOCKED = VMCS_INTERRUPTIBILITY_STI_BLOCKING |
				   VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING;

	/*
	 * If there is already an interrupt pending then just return.
	 *
	 * This could happen if an interrupt was injected on a prior
	 * VM entry but the actual entry into guest mode was aborted
	 * because of a pending AST.
	 */
	error = vmread(VMCS_ENTRY_INTR_INFO, &info);
	if (error)
		panic("vmx_inject_interrupts: vmread(intrinfo) %d", error);
	if (info & VMCS_INTERRUPTION_INFO_VALID)
		return;

	/*
	 * NMI injection has priority so deal with those first
	 */
	if (vmx_inject_nmi(vmx, vcpu))
		return;

	/* Ask the local apic for a vector to inject */
	vector = lapic_pending_intr(vmx->vm, vcpu);
	if (vector < 0)
		return;

	if (vector < 32 || vector > 255)
		panic("vmx_inject_interrupts: invalid vector %d\n", vector);

	/* Check RFLAGS.IF and the interruptibility state of the guest */
	error = vmread(VMCS_GUEST_RFLAGS, &rflags);
	if (error)
		panic("vmx_inject_interrupts: vmread(rflags) %d", error);

	if ((rflags & PSL_I) == 0)
		goto cantinject;

	error = vmread(VMCS_GUEST_INTERRUPTIBILITY, &interruptibility);
	if (error) {
		panic("vmx_inject_interrupts: vmread(interruptibility) %d",
		      error);
	}
	if (interruptibility & HWINTR_BLOCKED)
		goto cantinject;

	/* Inject the interrupt */
	info = VMCS_INTERRUPTION_INFO_HW_INTR | VMCS_INTERRUPTION_INFO_VALID;
	info |= vector;
	error = vmwrite(VMCS_ENTRY_INTR_INFO, info);
	if (error)
		panic("vmx_inject_interrupts: vmwrite(intrinfo) %d", error);

	/* Update the Local APIC ISR */
	lapic_intr_accepted(vmx->vm, vcpu, vector);

	VMM_CTR1(vmx->vm, vcpu, "Injecting hwintr at vector %d", vector);

	return;

cantinject:
	/*
	 * Set the Interrupt Window Exiting execution control so we can inject
	 * the interrupt as soon as the blocking condition goes away.
	 */
	vmx_set_int_window_exiting(vmx, vcpu);

	VMM_CTR0(vmx->vm, vcpu, "Enabling interrupt window exiting");
}

static int
vmx_emulate_cr_access(struct vmx *vmx, int vcpu, uint64_t exitqual)
{
	int error, cr, vmcs_guest_cr;
	uint64_t regval, ones_mask, zeros_mask;
	const struct vmxctx *vmxctx;

	/* We only handle mov to %cr0 or %cr4 at this time */
	if ((exitqual & 0xf0) != 0x00)
		return (UNHANDLED);

	cr = exitqual & 0xf;
	if (cr != 0 && cr != 4)
		return (UNHANDLED);

	vmxctx = &vmx->ctx[vcpu];

	/*
	 * We must use vmwrite() directly here because vmcs_setreg() will
	 * call vmclear(vmcs) as a side-effect which we certainly don't want.
	 */
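	/*
	 * Decode the source register from the exit qualification.  A rough
	 * sketch of the layout, per the Intel SDM description of the exit
	 * qualification for control-register accesses (consistent with the
	 * masks used above):
	 *
	 *	bits  3:0	number of the control register
	 *	bits  5:4	access type (0 = MOV to CR)
	 *	bits 11:8	source general-purpose register
	 */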
	switch ((exitqual >> 8) & 0xf) {
	case 0:
		regval = vmxctx->guest_rax;
		break;
	case 1:
		regval = vmxctx->guest_rcx;
		break;
	case 2:
		regval = vmxctx->guest_rdx;
		break;
	case 3:
		regval = vmxctx->guest_rbx;
		break;
	case 4:
		error = vmread(VMCS_GUEST_RSP, &regval);
		if (error) {
			panic("vmx_emulate_cr_access: "
			      "error %d reading guest rsp", error);
		}
		break;
	case 5:
		regval = vmxctx->guest_rbp;
		break;
	case 6:
		regval = vmxctx->guest_rsi;
		break;
	case 7:
		regval = vmxctx->guest_rdi;
		break;
	case 8:
		regval = vmxctx->guest_r8;
		break;
	case 9:
		regval = vmxctx->guest_r9;
		break;
	case 10:
		regval = vmxctx->guest_r10;
		break;
	case 11:
		regval = vmxctx->guest_r11;
		break;
	case 12:
		regval = vmxctx->guest_r12;
		break;
	case 13:
		regval = vmxctx->guest_r13;
		break;
	case 14:
		regval = vmxctx->guest_r14;
		break;
	case 15:
		regval = vmxctx->guest_r15;
		break;
	}

	if (cr == 0) {
		ones_mask = cr0_ones_mask;
		zeros_mask = cr0_zeros_mask;
		vmcs_guest_cr = VMCS_GUEST_CR0;
	} else {
		ones_mask = cr4_ones_mask;
		zeros_mask = cr4_zeros_mask;
		vmcs_guest_cr = VMCS_GUEST_CR4;
	}
	regval |= ones_mask;
	regval &= ~zeros_mask;
	error = vmwrite(vmcs_guest_cr, regval);
	if (error) {
		panic("vmx_emulate_cr_access: error %d writing cr%d",
		      error, cr);
	}

	return (HANDLED);
}

static int
vmx_lapic_fault(struct vm *vm, int cpu,
		uint64_t gpa, uint64_t rip, int inst_length,
		uint64_t cr3, uint64_t ept_qual)
{
	int read, write, handled;
	struct vie vie;

	/*
	 * For this to be a legitimate access to the local apic:
	 * - the GPA must be in the local apic page
	 * - the GPA must be aligned on a 16 byte boundary
	 */
	if (gpa < DEFAULT_APIC_BASE || gpa >= DEFAULT_APIC_BASE + PAGE_SIZE)
		return (UNHANDLED);

	if ((gpa & 0xF) != 0)
		return (UNHANDLED);

	/* EPT violation on an instruction fetch doesn't make sense here */
	if (ept_qual & EPT_VIOLATION_INST_FETCH)
		return (UNHANDLED);

	/* EPT violation must be a read fault or a write fault but not both */
	read = ept_qual & EPT_VIOLATION_DATA_READ ? 1 : 0;
	write = ept_qual & EPT_VIOLATION_DATA_WRITE ? 1 : 0;
	if ((read ^ write) == 0)
		return (UNHANDLED);

	/*
	 * The EPT violation must have been caused by accessing a guest-physical
	 * address that is a translation of a guest-linear address.
	 */
	if ((ept_qual & EPT_VIOLATION_GLA_VALID) == 0 ||
	    (ept_qual & EPT_VIOLATION_XLAT_VALID) == 0) {
		return (UNHANDLED);
	}

	/* Fetch, decode and emulate the faulting instruction */
	if (vmm_fetch_instruction(vm, rip, inst_length, cr3, &vie) != 0)
		return (UNHANDLED);

	if (vmm_decode_instruction(&vie) != 0)
		return (UNHANDLED);

	handled = lapic_mmio(vm, cpu, gpa - DEFAULT_APIC_BASE, read, &vie);

	return (handled);
}

static int
vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
{
	int handled;
	struct vmcs *vmcs;
	struct vmxctx *vmxctx;
	uint32_t eax, ecx, edx;
	uint64_t qual, gpa, cr3;

	handled = 0;
	vmcs = &vmx->vmcs[vcpu];
	vmxctx = &vmx->ctx[vcpu];
	qual = vmexit->u.vmx.exit_qualification;
	vmexit->exitcode = VM_EXITCODE_BOGUS;

	switch (vmexit->u.vmx.exit_reason) {
	case EXIT_REASON_CR_ACCESS:
		handled = vmx_emulate_cr_access(vmx, vcpu, qual);
		break;
	case EXIT_REASON_RDMSR:
		ecx = vmxctx->guest_rcx;
		handled = emulate_rdmsr(vmx->vm, vcpu, ecx);
		if (!handled) {
			vmexit->exitcode = VM_EXITCODE_RDMSR;
			vmexit->u.msr.code = ecx;
		}
		break;
	case EXIT_REASON_WRMSR:
		eax = vmxctx->guest_rax;
		ecx = vmxctx->guest_rcx;
		edx = vmxctx->guest_rdx;
		handled = emulate_wrmsr(vmx->vm, vcpu, ecx,
					(uint64_t)edx << 32 | eax);
		if (!handled) {
			vmexit->exitcode = VM_EXITCODE_WRMSR;
			vmexit->u.msr.code = ecx;
			vmexit->u.msr.wval = (uint64_t)edx << 32 | eax;
		}
		break;
	case EXIT_REASON_HLT:
		vmexit->exitcode = VM_EXITCODE_HLT;
		break;
	case EXIT_REASON_MTF:
		vmexit->exitcode = VM_EXITCODE_MTRAP;
		break;
	case EXIT_REASON_PAUSE:
		vmexit->exitcode = VM_EXITCODE_PAUSE;
		break;
	case EXIT_REASON_INTR_WINDOW:
		vmx_clear_int_window_exiting(vmx, vcpu);
		VMM_CTR0(vmx->vm, vcpu, "Disabling interrupt window exiting");
		/* FALLTHRU */
	case EXIT_REASON_EXT_INTR:
		/*
		 * External interrupts serve only to cause VM exits and allow
		 * the host interrupt handler to run.
		 *
		 * If this external interrupt triggers a virtual interrupt
		 * to a VM, then that state will be recorded by the
		 * host interrupt handler in the VM's softc. We will inject
		 * this virtual interrupt during the subsequent VM enter.
		 */

		/*
		 * This is special. We want to treat this as a 'handled'
		 * VM-exit but not increment the instruction pointer.
		 */
		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EXTINT, 1);
		return (1);
	case EXIT_REASON_NMI_WINDOW:
		/* Exit to allow the pending virtual NMI to be injected */
		vmx_clear_nmi_window_exiting(vmx, vcpu);
		VMM_CTR0(vmx->vm, vcpu, "Disabling NMI window exiting");
		return (1);
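	/*
	 * The IN/OUT decode below follows the Intel SDM layout of the exit
	 * qualification for I/O instructions, roughly:
	 *
	 *	bits  2:0	access size - 1 (0 = 1 byte, 1 = 2, 3 = 4)
	 *	bit   3		direction (1 = in, 0 = out)
	 *	bit   4		string instruction (ins/outs)
	 *	bit   5		rep prefix
	 *	bits 31:16	port number
	 */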
	case EXIT_REASON_INOUT:
		vmexit->exitcode = VM_EXITCODE_INOUT;
		vmexit->u.inout.bytes = (qual & 0x7) + 1;
		vmexit->u.inout.in = (qual & 0x8) ? 1 : 0;
		vmexit->u.inout.string = (qual & 0x10) ? 1 : 0;
		vmexit->u.inout.rep = (qual & 0x20) ? 1 : 0;
		vmexit->u.inout.port = (uint16_t)(qual >> 16);
		vmexit->u.inout.eax = (uint32_t)(vmxctx->guest_rax);
		break;
	case EXIT_REASON_CPUID:
		handled = vmx_handle_cpuid(vmx->vm, vcpu, vmxctx);
		break;
	case EXIT_REASON_EPT_FAULT:
		gpa = vmcs_gpa();
		cr3 = vmcs_guest_cr3();
		handled = vmx_lapic_fault(vmx->vm, vcpu,
					  gpa, vmexit->rip, vmexit->inst_length,
					  cr3, qual);
		if (!handled) {
			vmexit->exitcode = VM_EXITCODE_PAGING;
			vmexit->u.paging.cr3 = cr3;
			vmexit->u.paging.gpa = gpa;
			vmexit->u.paging.rwx = qual & 0x7;
		}
		break;
	default:
		break;
	}

	if (handled) {
		/*
		 * It is possible that control is returned to userland
		 * even though we were able to handle the VM exit in the
		 * kernel.
		 *
		 * In such a case we want to make sure that the userland
		 * restarts guest execution at the instruction *after*
		 * the one we just processed. Therefore we update the
		 * guest rip in the VMCS and in 'vmexit'.
		 */
		vm_exit_update_rip(vmexit);
		vmexit->rip += vmexit->inst_length;
		vmexit->inst_length = 0;

		/*
		 * Special case for spinning up an AP - exit to userspace to
		 * give the controlling process a chance to intercept and
		 * spin up a thread for the AP.
		 */
		if (vmexit->exitcode == VM_EXITCODE_SPINUP_AP)
			handled = 0;
	} else {
		if (vmexit->exitcode == VM_EXITCODE_BOGUS) {
			/*
			 * If this VM exit was not claimed by anybody then
			 * treat it as a generic VMX exit.
			 */
			vmexit->exitcode = VM_EXITCODE_VMX;
			vmexit->u.vmx.error = 0;
		} else {
			/*
			 * The exitcode and collateral have been populated.
			 * The VM exit will be processed further in userland.
			 */
		}
	}
	return (handled);
}

static int
vmx_run(void *arg, int vcpu, register_t rip)
{
	int error, vie, rc, handled, astpending;
	uint32_t exit_reason;
	struct vmx *vmx;
	struct vmxctx *vmxctx;
	struct vmcs *vmcs;
	struct vm_exit *vmexit;

	vmx = arg;
	vmcs = &vmx->vmcs[vcpu];
	vmxctx = &vmx->ctx[vcpu];
	vmxctx->launched = 0;

	astpending = 0;
	vmexit = vm_exitinfo(vmx->vm, vcpu);

	/*
	 * XXX Can we avoid doing this every time we do a vm run?
	 */
	VMPTRLD(vmcs);

	/*
	 * XXX
	 * We do this every time because we may set up the virtual machine
	 * from a different process than the one that actually runs it.
	 *
	 * If the life of a virtual machine was spent entirely in the context
	 * of a single process we could do this once in vmcs_set_defaults().
	 */
	if ((error = vmwrite(VMCS_HOST_CR3, rcr3())) != 0)
		panic("vmx_run: error %d writing to VMCS_HOST_CR3", error);

	if ((error = vmwrite(VMCS_GUEST_RIP, rip)) != 0)
		panic("vmx_run: error %d writing to VMCS_GUEST_RIP", error);

	if ((error = vmx_set_pcpu_defaults(vmx, vcpu)) != 0)
		panic("vmx_run: error %d setting up pcpu defaults", error);

	do {
		lapic_timer_tick(vmx->vm, vcpu);
		vmx_inject_interrupts(vmx, vcpu);
		vmx_run_trace(vmx, vcpu);
		rc = vmx_setjmp(vmxctx);
#ifdef SETJMP_TRACE
		vmx_setjmp_trace(vmx, vcpu, vmxctx, rc);
#endif
		switch (rc) {
		case VMX_RETURN_DIRECT:
			if (vmxctx->launched == 0) {
				vmxctx->launched = 1;
				vmx_launch(vmxctx);
			} else
				vmx_resume(vmxctx);
			panic("vmx_launch/resume should not return");
			break;
		case VMX_RETURN_LONGJMP:
			break;		/* vm exit */
		case VMX_RETURN_AST:
			astpending = 1;
			break;
		case VMX_RETURN_VMRESUME:
			vie = vmcs_instruction_error();
			if (vmxctx->launch_error == VM_FAIL_INVALID ||
			    vie != VMRESUME_WITH_NON_LAUNCHED_VMCS) {
				printf("vmresume error %d vmcs inst error %d\n",
					vmxctx->launch_error, vie);
				goto err_exit;
			}
			vmx_launch(vmxctx);	/* try to launch the guest */
			panic("vmx_launch should not return");
			break;
		case VMX_RETURN_VMLAUNCH:
			vie = vmcs_instruction_error();
#if 1
			printf("vmlaunch error %d vmcs inst error %d\n",
				vmxctx->launch_error, vie);
#endif
			goto err_exit;
		default:
			panic("vmx_setjmp returned %d", rc);
		}

		/* enable interrupts */
		enable_intr();

		/* collect some basic information for VM exit processing */
		vmexit->rip = rip = vmcs_guest_rip();
		vmexit->inst_length = vmexit_instruction_length();
		vmexit->u.vmx.exit_reason = exit_reason = vmcs_exit_reason();
		vmexit->u.vmx.exit_qualification = vmcs_exit_qualification();

		if (astpending) {
			handled = 1;
			vmexit->inst_length = 0;
			vmexit->exitcode = VM_EXITCODE_BOGUS;
			vmx_astpending_trace(vmx, vcpu, rip);
			break;
		}

		handled = vmx_exit_process(vmx, vcpu, vmexit);
		vmx_exit_trace(vmx, vcpu, rip, exit_reason, handled);

	} while (handled);

	/*
	 * If a VM exit has been handled then the exitcode must be BOGUS.
	 * If a VM exit is not handled then the exitcode must not be BOGUS.
	 */
	if ((handled && vmexit->exitcode != VM_EXITCODE_BOGUS) ||
	    (!handled && vmexit->exitcode == VM_EXITCODE_BOGUS)) {
		panic("Mismatch between handled (%d) and exitcode (%d)",
		      handled, vmexit->exitcode);
	}

	VMM_CTR1(vmx->vm, vcpu, "goto userland: exitcode %d", vmexit->exitcode);

	/*
	 * XXX
	 * We need to do this to ensure that any VMCS state cached by the
	 * processor is flushed to memory. We need to do this in case the
	 * VM moves to a different cpu the next time it runs.
	 *
	 * Can we avoid doing this?
	 */
	VMCLEAR(vmcs);
	return (0);

err_exit:
	vmexit->exitcode = VM_EXITCODE_VMX;
	vmexit->u.vmx.exit_reason = (uint32_t)-1;
	vmexit->u.vmx.exit_qualification = (uint32_t)-1;
	vmexit->u.vmx.error = vie;
	VMCLEAR(vmcs);
	return (ENOEXEC);
}

static void
vmx_vmcleanup(void *arg)
{
	int error;
	struct vmx *vmx = arg;

	/*
	 * XXXSMP we also need to clear the VMCS active on the other vcpus.
	 */
	error = vmclear(&vmx->vmcs[0]);
	if (error != 0)
		panic("vmx_vmcleanup: vmclear error %d on vcpu 0", error);

	ept_vmcleanup(vmx);
	free(vmx, M_VMX);

	return;
}

static register_t *
vmxctx_regptr(struct vmxctx *vmxctx, int reg)
{

	switch (reg) {
	case VM_REG_GUEST_RAX:
		return (&vmxctx->guest_rax);
	case VM_REG_GUEST_RBX:
		return (&vmxctx->guest_rbx);
	case VM_REG_GUEST_RCX:
		return (&vmxctx->guest_rcx);
	case VM_REG_GUEST_RDX:
		return (&vmxctx->guest_rdx);
	case VM_REG_GUEST_RSI:
		return (&vmxctx->guest_rsi);
	case VM_REG_GUEST_RDI:
		return (&vmxctx->guest_rdi);
	case VM_REG_GUEST_RBP:
		return (&vmxctx->guest_rbp);
	case VM_REG_GUEST_R8:
		return (&vmxctx->guest_r8);
	case VM_REG_GUEST_R9:
		return (&vmxctx->guest_r9);
	case VM_REG_GUEST_R10:
		return (&vmxctx->guest_r10);
	case VM_REG_GUEST_R11:
		return (&vmxctx->guest_r11);
	case VM_REG_GUEST_R12:
		return (&vmxctx->guest_r12);
	case VM_REG_GUEST_R13:
		return (&vmxctx->guest_r13);
	case VM_REG_GUEST_R14:
		return (&vmxctx->guest_r14);
	case VM_REG_GUEST_R15:
		return (&vmxctx->guest_r15);
	default:
		break;
	}
	return (NULL);
}

static int
vmxctx_getreg(struct vmxctx *vmxctx, int reg, uint64_t *retval)
{
	register_t *regp;

	if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) {
		*retval = *regp;
		return (0);
	} else
		return (EINVAL);
}

static int
vmxctx_setreg(struct vmxctx *vmxctx, int reg, uint64_t val)
{
	register_t *regp;

	if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) {
		*regp = val;
		return (0);
	} else
		return (EINVAL);
}

static int
vmx_getreg(void *arg, int vcpu, int reg, uint64_t *retval)
{
	struct vmx *vmx = arg;

	if (vmxctx_getreg(&vmx->ctx[vcpu], reg, retval) == 0)
		return (0);

	/*
	 * If the vcpu is running then don't mess with the VMCS.
	 *
	 * vmcs_getreg will VMCLEAR the vmcs when it is done which will cause
	 * the subsequent vmlaunch/vmresume to fail.
	 */
	if (vcpu_is_running(vmx->vm, vcpu))
		panic("vmx_getreg: %s%d is running", vm_name(vmx->vm), vcpu);

	return (vmcs_getreg(&vmx->vmcs[vcpu], reg, retval));
}

static int
vmx_setreg(void *arg, int vcpu, int reg, uint64_t val)
{
	int error;
	uint64_t ctls;
	struct vmx *vmx = arg;

	/*
	 * XXX Allow caller to set contents of the guest registers saved in
	 * the 'vmxctx' even though the vcpu might be running. We need this
	 * specifically to support the rdmsr emulation that will set the
	 * %eax and %edx registers during vm exit processing.
	 */
	if (vmxctx_setreg(&vmx->ctx[vcpu], reg, val) == 0)
		return (0);

	/*
	 * If the vcpu is running then don't mess with the VMCS.
	 *
	 * vmcs_setreg will VMCLEAR the vmcs when it is done which will cause
	 * the subsequent vmlaunch/vmresume to fail.
	 */
	if (vcpu_is_running(vmx->vm, vcpu))
		panic("vmx_setreg: %s%d is running", vm_name(vmx->vm), vcpu);

	error = vmcs_setreg(&vmx->vmcs[vcpu], reg, val);

	if (error == 0) {
		/*
		 * If the "load EFER" VM-entry control is 1 then the
		 * value of EFER.LMA must be identical to the "IA-32e mode
		 * guest" bit in the VM-entry control.
		 */
		if ((entry_ctls & VM_ENTRY_LOAD_EFER) != 0 &&
		    (reg == VM_REG_GUEST_EFER)) {
			vmcs_getreg(&vmx->vmcs[vcpu],
				    VMCS_IDENT(VMCS_ENTRY_CTLS), &ctls);
			if (val & EFER_LMA)
				ctls |= VM_ENTRY_GUEST_LMA;
			else
				ctls &= ~VM_ENTRY_GUEST_LMA;
			vmcs_setreg(&vmx->vmcs[vcpu],
				    VMCS_IDENT(VMCS_ENTRY_CTLS), ctls);
		}
	}

	return (error);
}

static int
vmx_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc)
{
	struct vmx *vmx = arg;

	return (vmcs_getdesc(&vmx->vmcs[vcpu], reg, desc));
}

static int
vmx_setdesc(void *arg, int vcpu, int reg, struct seg_desc *desc)
{
	struct vmx *vmx = arg;

	return (vmcs_setdesc(&vmx->vmcs[vcpu], reg, desc));
}

static int
vmx_inject(void *arg, int vcpu, int type, int vector, uint32_t code,
	   int code_valid)
{
	int error;
	uint64_t info;
	struct vmx *vmx = arg;
	struct vmcs *vmcs = &vmx->vmcs[vcpu];

	static uint32_t type_map[VM_EVENT_MAX] = {
		0x1,		/* VM_EVENT_NONE */
		0x0,		/* VM_HW_INTR */
		0x2,		/* VM_NMI */
		0x3,		/* VM_HW_EXCEPTION */
		0x4,		/* VM_SW_INTR */
		0x5,		/* VM_PRIV_SW_EXCEPTION */
		0x6,		/* VM_SW_EXCEPTION */
	};

	/*
	 * If there is already an exception pending to be delivered to the
	 * vcpu then just return.
	 */
	error = vmcs_getreg(vmcs, VMCS_ENTRY_INTR_INFO, &info);
	if (error)
		return (error);

	if (info & VMCS_INTERRUPTION_INFO_VALID)
		return (EAGAIN);

	info = vector | (type_map[type] << 8) | (code_valid ? 1 << 11 : 0);
	info |= VMCS_INTERRUPTION_INFO_VALID;
	error = vmcs_setreg(vmcs, VMCS_IDENT(VMCS_ENTRY_INTR_INFO), info);
	if (error != 0)
		return (error);

	if (code_valid) {
		error = vmcs_setreg(vmcs,
				    VMCS_IDENT(VMCS_ENTRY_EXCEPTION_ERROR),
				    code);
	}
	return (error);
}

static int
vmx_nmi(void *arg, int vcpu)
{
	struct vmx *vmx = arg;

	atomic_set_int(&vmx->state[vcpu].request_nmi, 1);

	return (0);
}

static int
vmx_getcap(void *arg, int vcpu, int type, int *retval)
{
	struct vmx *vmx = arg;
	int vcap;
	int ret;

	ret = ENOENT;

	vcap = vmx->cap[vcpu].set;

	switch (type) {
	case VM_CAP_HALT_EXIT:
		if (cap_halt_exit)
			ret = 0;
		break;
	case VM_CAP_PAUSE_EXIT:
		if (cap_pause_exit)
			ret = 0;
		break;
	case VM_CAP_MTRAP_EXIT:
		if (cap_monitor_trap)
			ret = 0;
		break;
	case VM_CAP_UNRESTRICTED_GUEST:
		if (cap_unrestricted_guest)
			ret = 0;
		break;
	default:
		break;
	}

	if (ret == 0)
		*retval = (vcap & (1 << type)) ? 1 : 0;

	return (ret);
}
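
/*
 * A sketch of expected usage for the capability accessors (hypothetical
 * caller; the actual entry points are the vmm_ops_intel methods at the
 * bottom of this file): query whether HLT exiting is supported and, if
 * so, enable it for a vcpu:
 *
 *	int cap;
 *	if (vmx_getcap(vmx, vcpu, VM_CAP_HALT_EXIT, &cap) == 0)
 *		(void) vmx_setcap(vmx, vcpu, VM_CAP_HALT_EXIT, 1);
 */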

static int
vmx_setcap(void *arg, int vcpu, int type, int val)
{
	struct vmx *vmx = arg;
	struct vmcs *vmcs = &vmx->vmcs[vcpu];
	uint32_t baseval;
	uint32_t *pptr;
	int error;
	int flag;
	int reg;
	int retval;

	retval = ENOENT;
	pptr = NULL;

	switch (type) {
	case VM_CAP_HALT_EXIT:
		if (cap_halt_exit) {
			retval = 0;
			pptr = &vmx->cap[vcpu].proc_ctls;
			baseval = *pptr;
			flag = PROCBASED_HLT_EXITING;
			reg = VMCS_PRI_PROC_BASED_CTLS;
		}
		break;
	case VM_CAP_MTRAP_EXIT:
		if (cap_monitor_trap) {
			retval = 0;
			pptr = &vmx->cap[vcpu].proc_ctls;
			baseval = *pptr;
			flag = PROCBASED_MTF;
			reg = VMCS_PRI_PROC_BASED_CTLS;
		}
		break;
	case VM_CAP_PAUSE_EXIT:
		if (cap_pause_exit) {
			retval = 0;
			pptr = &vmx->cap[vcpu].proc_ctls;
			baseval = *pptr;
			flag = PROCBASED_PAUSE_EXITING;
			reg = VMCS_PRI_PROC_BASED_CTLS;
		}
		break;
	case VM_CAP_UNRESTRICTED_GUEST:
		if (cap_unrestricted_guest) {
			retval = 0;
			baseval = procbased_ctls2;
			flag = PROCBASED2_UNRESTRICTED_GUEST;
			reg = VMCS_SEC_PROC_BASED_CTLS;
		}
		break;
	default:
		break;
	}

	if (retval == 0) {
		if (val) {
			baseval |= flag;
		} else {
			baseval &= ~flag;
		}
		VMPTRLD(vmcs);
		error = vmwrite(reg, baseval);
		VMCLEAR(vmcs);

		if (error) {
			retval = error;
		} else {
			/*
			 * Update optional stored flags, and record
			 * setting
			 */
			if (pptr != NULL) {
				*pptr = baseval;
			}

			if (val) {
				vmx->cap[vcpu].set |= (1 << type);
			} else {
				vmx->cap[vcpu].set &= ~(1 << type);
			}
		}
	}

	return (retval);
}

struct vmm_ops vmm_ops_intel = {
	vmx_init,
	vmx_cleanup,
	vmx_vminit,
	vmx_run,
	vmx_vmcleanup,
	ept_vmmmap_set,
	ept_vmmmap_get,
	vmx_getreg,
	vmx_setreg,
	vmx_getdesc,
	vmx_setdesc,
	vmx_inject,
	vmx_nmi,
	vmx_getcap,
	vmx_setcap
};