vmx.c revision 259782
1/*- 2 * Copyright (c) 2011 NetApp, Inc. 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND 15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17 * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE 18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 24 * SUCH DAMAGE. 25 * 26 * $FreeBSD: head/sys/amd64/vmm/intel/vmx.c 259782 2013-12-23 19:48:22Z jhb $ 27 */ 28 29#include <sys/cdefs.h> 30__FBSDID("$FreeBSD: head/sys/amd64/vmm/intel/vmx.c 259782 2013-12-23 19:48:22Z jhb $"); 31 32#include <sys/param.h> 33#include <sys/systm.h> 34#include <sys/smp.h> 35#include <sys/kernel.h> 36#include <sys/malloc.h> 37#include <sys/pcpu.h> 38#include <sys/proc.h> 39#include <sys/sysctl.h> 40 41#include <vm/vm.h> 42#include <vm/pmap.h> 43 44#include <machine/psl.h> 45#include <machine/cpufunc.h> 46#include <machine/md_var.h> 47#include <machine/segments.h> 48#include <machine/specialreg.h> 49#include <machine/vmparam.h> 50 51#include <machine/vmm.h> 52#include "vmm_host.h" 53#include "vmm_lapic.h" 54#include "vmm_msr.h" 55#include "vmm_ktr.h" 56#include "vmm_stat.h" 57 58#include "vmx_msr.h" 59#include "ept.h" 60#include "vmx_cpufunc.h" 61#include "vmx.h" 62#include "x86.h" 63#include "vmx_controls.h" 64 65#define PINBASED_CTLS_ONE_SETTING \ 66 (PINBASED_EXTINT_EXITING | \ 67 PINBASED_NMI_EXITING | \ 68 PINBASED_VIRTUAL_NMI) 69#define PINBASED_CTLS_ZERO_SETTING 0 70 71#define PROCBASED_CTLS_WINDOW_SETTING \ 72 (PROCBASED_INT_WINDOW_EXITING | \ 73 PROCBASED_NMI_WINDOW_EXITING) 74 75#define PROCBASED_CTLS_ONE_SETTING \ 76 (PROCBASED_SECONDARY_CONTROLS | \ 77 PROCBASED_IO_EXITING | \ 78 PROCBASED_MSR_BITMAPS | \ 79 PROCBASED_CTLS_WINDOW_SETTING) 80#define PROCBASED_CTLS_ZERO_SETTING \ 81 (PROCBASED_CR3_LOAD_EXITING | \ 82 PROCBASED_CR3_STORE_EXITING | \ 83 PROCBASED_IO_BITMAPS) 84 85#define PROCBASED_CTLS2_ONE_SETTING PROCBASED2_ENABLE_EPT 86#define PROCBASED_CTLS2_ZERO_SETTING 0 87 88#define VM_EXIT_CTLS_ONE_SETTING_NO_PAT \ 89 (VM_EXIT_HOST_LMA | \ 90 VM_EXIT_SAVE_EFER | \ 91 VM_EXIT_LOAD_EFER) 92 93#define VM_EXIT_CTLS_ONE_SETTING \ 94 (VM_EXIT_CTLS_ONE_SETTING_NO_PAT | \ 95 VM_EXIT_SAVE_PAT | \ 96 VM_EXIT_LOAD_PAT) 97#define VM_EXIT_CTLS_ZERO_SETTING VM_EXIT_SAVE_DEBUG_CONTROLS 98 99#define VM_ENTRY_CTLS_ONE_SETTING_NO_PAT VM_ENTRY_LOAD_EFER 100 101#define VM_ENTRY_CTLS_ONE_SETTING \ 102 (VM_ENTRY_CTLS_ONE_SETTING_NO_PAT | \ 103 VM_ENTRY_LOAD_PAT) 104#define VM_ENTRY_CTLS_ZERO_SETTING \ 105 (VM_ENTRY_LOAD_DEBUG_CONTROLS | \ 106 VM_ENTRY_INTO_SMM | \ 107 VM_ENTRY_DEACTIVATE_DUAL_MONITOR) 108 109#define guest_msr_rw(vmx, msr) \ 110 msr_bitmap_change_access((vmx)->msr_bitmap, (msr), MSR_BITMAP_ACCESS_RW) 111 112#define HANDLED 1 113#define UNHANDLED 0 114 115MALLOC_DEFINE(M_VMX, "vmx", "vmx"); 116 117SYSCTL_DECL(_hw_vmm); 118SYSCTL_NODE(_hw_vmm, OID_AUTO, vmx, CTLFLAG_RW, NULL, NULL); 119 120int vmxon_enabled[MAXCPU]; 121static char vmxon_region[MAXCPU][PAGE_SIZE] __aligned(PAGE_SIZE); 122 123static uint32_t pinbased_ctls, procbased_ctls, procbased_ctls2; 124static uint32_t exit_ctls, entry_ctls; 125 126static uint64_t cr0_ones_mask, cr0_zeros_mask; 127SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr0_ones_mask, CTLFLAG_RD, 128 &cr0_ones_mask, 0, NULL); 129SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr0_zeros_mask, CTLFLAG_RD, 130 &cr0_zeros_mask, 0, NULL); 131 132static uint64_t cr4_ones_mask, cr4_zeros_mask; 133SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr4_ones_mask, CTLFLAG_RD, 134 &cr4_ones_mask, 0, NULL); 135SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr4_zeros_mask, CTLFLAG_RD, 136 &cr4_zeros_mask, 0, NULL); 137 138static int vmx_no_patmsr; 139 140static int vmx_initialized; 141SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, initialized, CTLFLAG_RD, 142 &vmx_initialized, 0, "Intel VMX initialized"); 143 144/* 145 * Virtual NMI blocking conditions. 146 * 147 * Some processor implementations also require NMI to be blocked if 148 * the STI_BLOCKING bit is set. It is possible to detect this at runtime 149 * based on the (exit_reason,exit_qual) tuple being set to 150 * (EXIT_REASON_INVAL_VMCS, EXIT_QUAL_NMI_WHILE_STI_BLOCKING). 151 * 152 * We take the easy way out and also include STI_BLOCKING as one of the 153 * gating items for vNMI injection. 154 */ 155static uint64_t nmi_blocking_bits = VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING | 156 VMCS_INTERRUPTIBILITY_NMI_BLOCKING | 157 VMCS_INTERRUPTIBILITY_STI_BLOCKING; 158 159/* 160 * Optional capabilities 161 */ 162static int cap_halt_exit; 163static int cap_pause_exit; 164static int cap_unrestricted_guest; 165static int cap_monitor_trap; 166static int cap_invpcid; 167 168static struct unrhdr *vpid_unr; 169static u_int vpid_alloc_failed; 170SYSCTL_UINT(_hw_vmm_vmx, OID_AUTO, vpid_alloc_failed, CTLFLAG_RD, 171 &vpid_alloc_failed, 0, NULL); 172 173#ifdef KTR 174static const char * 175exit_reason_to_str(int reason) 176{ 177 static char reasonbuf[32]; 178 179 switch (reason) { 180 case EXIT_REASON_EXCEPTION: 181 return "exception"; 182 case EXIT_REASON_EXT_INTR: 183 return "extint"; 184 case EXIT_REASON_TRIPLE_FAULT: 185 return "triplefault"; 186 case EXIT_REASON_INIT: 187 return "init"; 188 case EXIT_REASON_SIPI: 189 return "sipi"; 190 case EXIT_REASON_IO_SMI: 191 return "iosmi"; 192 case EXIT_REASON_SMI: 193 return "smi"; 194 case EXIT_REASON_INTR_WINDOW: 195 return "intrwindow"; 196 case EXIT_REASON_NMI_WINDOW: 197 return "nmiwindow"; 198 case EXIT_REASON_TASK_SWITCH: 199 return "taskswitch"; 200 case EXIT_REASON_CPUID: 201 return "cpuid"; 202 case EXIT_REASON_GETSEC: 203 return "getsec"; 204 case EXIT_REASON_HLT: 205 return "hlt"; 206 case EXIT_REASON_INVD: 207 return "invd"; 208 case EXIT_REASON_INVLPG: 209 return "invlpg"; 210 case EXIT_REASON_RDPMC: 211 return "rdpmc"; 212 case EXIT_REASON_RDTSC: 213 return "rdtsc"; 214 case EXIT_REASON_RSM: 215 return "rsm"; 216 case EXIT_REASON_VMCALL: 217 return "vmcall"; 218 case EXIT_REASON_VMCLEAR: 219 return "vmclear"; 220 case EXIT_REASON_VMLAUNCH: 221 return "vmlaunch"; 222 case EXIT_REASON_VMPTRLD: 223 return "vmptrld"; 224 case EXIT_REASON_VMPTRST: 225 return "vmptrst"; 226 case EXIT_REASON_VMREAD: 227 return "vmread"; 228 case EXIT_REASON_VMRESUME: 229 return "vmresume"; 230 case EXIT_REASON_VMWRITE: 231 return "vmwrite"; 232 case EXIT_REASON_VMXOFF: 233 return "vmxoff"; 234 case EXIT_REASON_VMXON: 235 return "vmxon"; 236 case EXIT_REASON_CR_ACCESS: 237 return "craccess"; 238 case EXIT_REASON_DR_ACCESS: 239 return "draccess"; 240 case EXIT_REASON_INOUT: 241 return "inout"; 242 case EXIT_REASON_RDMSR: 243 return "rdmsr"; 244 case EXIT_REASON_WRMSR: 245 return "wrmsr"; 246 case EXIT_REASON_INVAL_VMCS: 247 return "invalvmcs"; 248 case EXIT_REASON_INVAL_MSR: 249 return "invalmsr"; 250 case EXIT_REASON_MWAIT: 251 return "mwait"; 252 case EXIT_REASON_MTF: 253 return "mtf"; 254 case EXIT_REASON_MONITOR: 255 return "monitor"; 256 case EXIT_REASON_PAUSE: 257 return "pause"; 258 case EXIT_REASON_MCE: 259 return "mce"; 260 case EXIT_REASON_TPR: 261 return "tpr"; 262 case EXIT_REASON_APIC: 263 return "apic"; 264 case EXIT_REASON_GDTR_IDTR: 265 return "gdtridtr"; 266 case EXIT_REASON_LDTR_TR: 267 return "ldtrtr"; 268 case EXIT_REASON_EPT_FAULT: 269 return "eptfault"; 270 case EXIT_REASON_EPT_MISCONFIG: 271 return "eptmisconfig"; 272 case EXIT_REASON_INVEPT: 273 return "invept"; 274 case EXIT_REASON_RDTSCP: 275 return "rdtscp"; 276 case EXIT_REASON_VMX_PREEMPT: 277 return "vmxpreempt"; 278 case EXIT_REASON_INVVPID: 279 return "invvpid"; 280 case EXIT_REASON_WBINVD: 281 return "wbinvd"; 282 case EXIT_REASON_XSETBV: 283 return "xsetbv"; 284 default: 285 snprintf(reasonbuf, sizeof(reasonbuf), "%d", reason); 286 return (reasonbuf); 287 } 288} 289 290#ifdef SETJMP_TRACE 291static const char * 292vmx_setjmp_rc2str(int rc) 293{ 294 switch (rc) { 295 case VMX_RETURN_DIRECT: 296 return "direct"; 297 case VMX_RETURN_LONGJMP: 298 return "longjmp"; 299 case VMX_RETURN_VMRESUME: 300 return "vmresume"; 301 case VMX_RETURN_VMLAUNCH: 302 return "vmlaunch"; 303 case VMX_RETURN_AST: 304 return "ast"; 305 default: 306 return "unknown"; 307 } 308} 309 310#define SETJMP_TRACE(vmx, vcpu, vmxctx, regname) \ 311 VCPU_CTR1((vmx)->vm, (vcpu), "setjmp trace " #regname " 0x%016lx", \ 312 (vmxctx)->regname) 313 314static void 315vmx_setjmp_trace(struct vmx *vmx, int vcpu, struct vmxctx *vmxctx, int rc) 316{ 317 uint64_t host_rip, host_rsp; 318 319 if (vmxctx != &vmx->ctx[vcpu]) 320 panic("vmx_setjmp_trace: invalid vmxctx %p; should be %p", 321 vmxctx, &vmx->ctx[vcpu]); 322 323 VCPU_CTR1((vmx)->vm, (vcpu), "vmxctx = %p", vmxctx); 324 VCPU_CTR2((vmx)->vm, (vcpu), "setjmp return code %s(%d)", 325 vmx_setjmp_rc2str(rc), rc); 326 327 host_rip = vmcs_read(VMCS_HOST_RIP); 328 host_rsp = vmcs_read(VMCS_HOST_RSP); 329 VCPU_CTR2((vmx)->vm, (vcpu), "vmcs host_rip 0x%016lx, host_rsp %#lx", 330 host_rip, host_rsp); 331 332 SETJMP_TRACE(vmx, vcpu, vmxctx, host_r15); 333 SETJMP_TRACE(vmx, vcpu, vmxctx, host_r14); 334 SETJMP_TRACE(vmx, vcpu, vmxctx, host_r13); 335 SETJMP_TRACE(vmx, vcpu, vmxctx, host_r12); 336 SETJMP_TRACE(vmx, vcpu, vmxctx, host_rbp); 337 SETJMP_TRACE(vmx, vcpu, vmxctx, host_rsp); 338 SETJMP_TRACE(vmx, vcpu, vmxctx, host_rbx); 339 SETJMP_TRACE(vmx, vcpu, vmxctx, host_rip); 340 341 SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rdi); 342 SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rsi); 343 SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rdx); 344 SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rcx); 345 SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r8); 346 SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r9); 347 SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rax); 348 SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rbx); 349 SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rbp); 350 SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r10); 351 SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r11); 352 SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r12); 353 SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r13); 354 SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r14); 355 SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r15); 356 SETJMP_TRACE(vmx, vcpu, vmxctx, guest_cr2); 357} 358#endif 359#else 360static void __inline 361vmx_setjmp_trace(struct vmx *vmx, int vcpu, struct vmxctx *vmxctx, int rc) 362{ 363 return; 364} 365#endif /* KTR */ 366 367u_long 368vmx_fix_cr0(u_long cr0) 369{ 370 371 return ((cr0 | cr0_ones_mask) & ~cr0_zeros_mask); 372} 373 374u_long 375vmx_fix_cr4(u_long cr4) 376{ 377 378 return ((cr4 | cr4_ones_mask) & ~cr4_zeros_mask); 379} 380 381static void 382vpid_free(int vpid) 383{ 384 if (vpid < 0 || vpid > 0xffff) 385 panic("vpid_free: invalid vpid %d", vpid); 386 387 /* 388 * VPIDs [0,VM_MAXCPU] are special and are not allocated from 389 * the unit number allocator. 390 */ 391 392 if (vpid > VM_MAXCPU) 393 free_unr(vpid_unr, vpid); 394} 395 396static void 397vpid_alloc(uint16_t *vpid, int num) 398{ 399 int i, x; 400 401 if (num <= 0 || num > VM_MAXCPU) 402 panic("invalid number of vpids requested: %d", num); 403 404 /* 405 * If the "enable vpid" execution control is not enabled then the 406 * VPID is required to be 0 for all vcpus. 407 */ 408 if ((procbased_ctls2 & PROCBASED2_ENABLE_VPID) == 0) { 409 for (i = 0; i < num; i++) 410 vpid[i] = 0; 411 return; 412 } 413 414 /* 415 * Allocate a unique VPID for each vcpu from the unit number allocator. 416 */ 417 for (i = 0; i < num; i++) { 418 x = alloc_unr(vpid_unr); 419 if (x == -1) 420 break; 421 else 422 vpid[i] = x; 423 } 424 425 if (i < num) { 426 atomic_add_int(&vpid_alloc_failed, 1); 427 428 /* 429 * If the unit number allocator does not have enough unique 430 * VPIDs then we need to allocate from the [1,VM_MAXCPU] range. 431 * 432 * These VPIDs are not be unique across VMs but this does not 433 * affect correctness because the combined mappings are also 434 * tagged with the EP4TA which is unique for each VM. 435 * 436 * It is still sub-optimal because the invvpid will invalidate 437 * combined mappings for a particular VPID across all EP4TAs. 438 */ 439 while (i-- > 0) 440 vpid_free(vpid[i]); 441 442 for (i = 0; i < num; i++) 443 vpid[i] = i + 1; 444 } 445} 446 447static void 448vpid_init(void) 449{ 450 /* 451 * VPID 0 is required when the "enable VPID" execution control is 452 * disabled. 453 * 454 * VPIDs [1,VM_MAXCPU] are used as the "overflow namespace" when the 455 * unit number allocator does not have sufficient unique VPIDs to 456 * satisfy the allocation. 457 * 458 * The remaining VPIDs are managed by the unit number allocator. 459 */ 460 vpid_unr = new_unrhdr(VM_MAXCPU + 1, 0xffff, NULL); 461} 462 463static void 464msr_save_area_init(struct msr_entry *g_area, int *g_count) 465{ 466 int cnt; 467 468 static struct msr_entry guest_msrs[] = { 469 { MSR_KGSBASE, 0, 0 }, 470 }; 471 472 cnt = sizeof(guest_msrs) / sizeof(guest_msrs[0]); 473 if (cnt > GUEST_MSR_MAX_ENTRIES) 474 panic("guest msr save area overrun"); 475 bcopy(guest_msrs, g_area, sizeof(guest_msrs)); 476 *g_count = cnt; 477} 478 479static void 480vmx_disable(void *arg __unused) 481{ 482 struct invvpid_desc invvpid_desc = { 0 }; 483 struct invept_desc invept_desc = { 0 }; 484 485 if (vmxon_enabled[curcpu]) { 486 /* 487 * See sections 25.3.3.3 and 25.3.3.4 in Intel Vol 3b. 488 * 489 * VMXON or VMXOFF are not required to invalidate any TLB 490 * caching structures. This prevents potential retention of 491 * cached information in the TLB between distinct VMX episodes. 492 */ 493 invvpid(INVVPID_TYPE_ALL_CONTEXTS, invvpid_desc); 494 invept(INVEPT_TYPE_ALL_CONTEXTS, invept_desc); 495 vmxoff(); 496 } 497 load_cr4(rcr4() & ~CR4_VMXE); 498} 499 500static int 501vmx_cleanup(void) 502{ 503 504 if (vpid_unr != NULL) { 505 delete_unrhdr(vpid_unr); 506 vpid_unr = NULL; 507 } 508 509 smp_rendezvous(NULL, vmx_disable, NULL, NULL); 510 511 return (0); 512} 513 514static void 515vmx_enable(void *arg __unused) 516{ 517 int error; 518 519 load_cr4(rcr4() | CR4_VMXE); 520 521 *(uint32_t *)vmxon_region[curcpu] = vmx_revision(); 522 error = vmxon(vmxon_region[curcpu]); 523 if (error == 0) 524 vmxon_enabled[curcpu] = 1; 525} 526 527static void 528vmx_restore(void) 529{ 530 531 if (vmxon_enabled[curcpu]) 532 vmxon(vmxon_region[curcpu]); 533} 534 535static int 536vmx_init(void) 537{ 538 int error; 539 uint64_t fixed0, fixed1, feature_control; 540 uint32_t tmp; 541 542 /* CPUID.1:ECX[bit 5] must be 1 for processor to support VMX */ 543 if (!(cpu_feature2 & CPUID2_VMX)) { 544 printf("vmx_init: processor does not support VMX operation\n"); 545 return (ENXIO); 546 } 547 548 /* 549 * Verify that MSR_IA32_FEATURE_CONTROL lock and VMXON enable bits 550 * are set (bits 0 and 2 respectively). 551 */ 552 feature_control = rdmsr(MSR_IA32_FEATURE_CONTROL); 553 if ((feature_control & IA32_FEATURE_CONTROL_LOCK) == 0 || 554 (feature_control & IA32_FEATURE_CONTROL_VMX_EN) == 0) { 555 printf("vmx_init: VMX operation disabled by BIOS\n"); 556 return (ENXIO); 557 } 558 559 /* Check support for primary processor-based VM-execution controls */ 560 error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, 561 MSR_VMX_TRUE_PROCBASED_CTLS, 562 PROCBASED_CTLS_ONE_SETTING, 563 PROCBASED_CTLS_ZERO_SETTING, &procbased_ctls); 564 if (error) { 565 printf("vmx_init: processor does not support desired primary " 566 "processor-based controls\n"); 567 return (error); 568 } 569 570 /* Clear the processor-based ctl bits that are set on demand */ 571 procbased_ctls &= ~PROCBASED_CTLS_WINDOW_SETTING; 572 573 /* Check support for secondary processor-based VM-execution controls */ 574 error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, 575 MSR_VMX_PROCBASED_CTLS2, 576 PROCBASED_CTLS2_ONE_SETTING, 577 PROCBASED_CTLS2_ZERO_SETTING, &procbased_ctls2); 578 if (error) { 579 printf("vmx_init: processor does not support desired secondary " 580 "processor-based controls\n"); 581 return (error); 582 } 583 584 /* Check support for VPID */ 585 error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2, 586 PROCBASED2_ENABLE_VPID, 0, &tmp); 587 if (error == 0) 588 procbased_ctls2 |= PROCBASED2_ENABLE_VPID; 589 590 /* Check support for pin-based VM-execution controls */ 591 error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS, 592 MSR_VMX_TRUE_PINBASED_CTLS, 593 PINBASED_CTLS_ONE_SETTING, 594 PINBASED_CTLS_ZERO_SETTING, &pinbased_ctls); 595 if (error) { 596 printf("vmx_init: processor does not support desired " 597 "pin-based controls\n"); 598 return (error); 599 } 600 601 /* Check support for VM-exit controls */ 602 error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS, MSR_VMX_TRUE_EXIT_CTLS, 603 VM_EXIT_CTLS_ONE_SETTING, 604 VM_EXIT_CTLS_ZERO_SETTING, 605 &exit_ctls); 606 if (error) { 607 /* Try again without the PAT MSR bits */ 608 error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS, 609 MSR_VMX_TRUE_EXIT_CTLS, 610 VM_EXIT_CTLS_ONE_SETTING_NO_PAT, 611 VM_EXIT_CTLS_ZERO_SETTING, 612 &exit_ctls); 613 if (error) { 614 printf("vmx_init: processor does not support desired " 615 "exit controls\n"); 616 return (error); 617 } else { 618 if (bootverbose) 619 printf("vmm: PAT MSR access not supported\n"); 620 guest_msr_valid(MSR_PAT); 621 vmx_no_patmsr = 1; 622 } 623 } 624 625 /* Check support for VM-entry controls */ 626 if (!vmx_no_patmsr) { 627 error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS, 628 MSR_VMX_TRUE_ENTRY_CTLS, 629 VM_ENTRY_CTLS_ONE_SETTING, 630 VM_ENTRY_CTLS_ZERO_SETTING, 631 &entry_ctls); 632 } else { 633 error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS, 634 MSR_VMX_TRUE_ENTRY_CTLS, 635 VM_ENTRY_CTLS_ONE_SETTING_NO_PAT, 636 VM_ENTRY_CTLS_ZERO_SETTING, 637 &entry_ctls); 638 } 639 640 if (error) { 641 printf("vmx_init: processor does not support desired " 642 "entry controls\n"); 643 return (error); 644 } 645 646 /* 647 * Check support for optional features by testing them 648 * as individual bits 649 */ 650 cap_halt_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, 651 MSR_VMX_TRUE_PROCBASED_CTLS, 652 PROCBASED_HLT_EXITING, 0, 653 &tmp) == 0); 654 655 cap_monitor_trap = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, 656 MSR_VMX_PROCBASED_CTLS, 657 PROCBASED_MTF, 0, 658 &tmp) == 0); 659 660 cap_pause_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, 661 MSR_VMX_TRUE_PROCBASED_CTLS, 662 PROCBASED_PAUSE_EXITING, 0, 663 &tmp) == 0); 664 665 cap_unrestricted_guest = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, 666 MSR_VMX_PROCBASED_CTLS2, 667 PROCBASED2_UNRESTRICTED_GUEST, 0, 668 &tmp) == 0); 669 670 cap_invpcid = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, 671 MSR_VMX_PROCBASED_CTLS2, PROCBASED2_ENABLE_INVPCID, 0, 672 &tmp) == 0); 673 674 675 /* Initialize EPT */ 676 error = ept_init(); 677 if (error) { 678 printf("vmx_init: ept initialization failed (%d)\n", error); 679 return (error); 680 } 681 682 /* 683 * Stash the cr0 and cr4 bits that must be fixed to 0 or 1 684 */ 685 fixed0 = rdmsr(MSR_VMX_CR0_FIXED0); 686 fixed1 = rdmsr(MSR_VMX_CR0_FIXED1); 687 cr0_ones_mask = fixed0 & fixed1; 688 cr0_zeros_mask = ~fixed0 & ~fixed1; 689 690 /* 691 * CR0_PE and CR0_PG can be set to zero in VMX non-root operation 692 * if unrestricted guest execution is allowed. 693 */ 694 if (cap_unrestricted_guest) 695 cr0_ones_mask &= ~(CR0_PG | CR0_PE); 696 697 /* 698 * Do not allow the guest to set CR0_NW or CR0_CD. 699 */ 700 cr0_zeros_mask |= (CR0_NW | CR0_CD); 701 702 fixed0 = rdmsr(MSR_VMX_CR4_FIXED0); 703 fixed1 = rdmsr(MSR_VMX_CR4_FIXED1); 704 cr4_ones_mask = fixed0 & fixed1; 705 cr4_zeros_mask = ~fixed0 & ~fixed1; 706 707 vpid_init(); 708 709 /* enable VMX operation */ 710 smp_rendezvous(NULL, vmx_enable, NULL, NULL); 711 712 vmx_initialized = 1; 713 714 return (0); 715} 716 717static int 718vmx_setup_cr_shadow(int which, struct vmcs *vmcs, uint32_t initial) 719{ 720 int error, mask_ident, shadow_ident; 721 uint64_t mask_value; 722 723 if (which != 0 && which != 4) 724 panic("vmx_setup_cr_shadow: unknown cr%d", which); 725 726 if (which == 0) { 727 mask_ident = VMCS_CR0_MASK; 728 mask_value = cr0_ones_mask | cr0_zeros_mask; 729 shadow_ident = VMCS_CR0_SHADOW; 730 } else { 731 mask_ident = VMCS_CR4_MASK; 732 mask_value = cr4_ones_mask | cr4_zeros_mask; 733 shadow_ident = VMCS_CR4_SHADOW; 734 } 735 736 error = vmcs_setreg(vmcs, 0, VMCS_IDENT(mask_ident), mask_value); 737 if (error) 738 return (error); 739 740 error = vmcs_setreg(vmcs, 0, VMCS_IDENT(shadow_ident), initial); 741 if (error) 742 return (error); 743 744 return (0); 745} 746#define vmx_setup_cr0_shadow(vmcs,init) vmx_setup_cr_shadow(0, (vmcs), (init)) 747#define vmx_setup_cr4_shadow(vmcs,init) vmx_setup_cr_shadow(4, (vmcs), (init)) 748 749static void * 750vmx_vminit(struct vm *vm, pmap_t pmap) 751{ 752 uint16_t vpid[VM_MAXCPU]; 753 int i, error, guest_msr_count; 754 struct vmx *vmx; 755 756 vmx = malloc(sizeof(struct vmx), M_VMX, M_WAITOK | M_ZERO); 757 if ((uintptr_t)vmx & PAGE_MASK) { 758 panic("malloc of struct vmx not aligned on %d byte boundary", 759 PAGE_SIZE); 760 } 761 vmx->vm = vm; 762 763 vmx->eptp = eptp(vtophys((vm_offset_t)pmap->pm_pml4)); 764 765 /* 766 * Clean up EPTP-tagged guest physical and combined mappings 767 * 768 * VMX transitions are not required to invalidate any guest physical 769 * mappings. So, it may be possible for stale guest physical mappings 770 * to be present in the processor TLBs. 771 * 772 * Combined mappings for this EP4TA are also invalidated for all VPIDs. 773 */ 774 ept_invalidate_mappings(vmx->eptp); 775 776 msr_bitmap_initialize(vmx->msr_bitmap); 777 778 /* 779 * It is safe to allow direct access to MSR_GSBASE and MSR_FSBASE. 780 * The guest FSBASE and GSBASE are saved and restored during 781 * vm-exit and vm-entry respectively. The host FSBASE and GSBASE are 782 * always restored from the vmcs host state area on vm-exit. 783 * 784 * The SYSENTER_CS/ESP/EIP MSRs are identical to FS/GSBASE in 785 * how they are saved/restored so can be directly accessed by the 786 * guest. 787 * 788 * Guest KGSBASE is saved and restored in the guest MSR save area. 789 * Host KGSBASE is restored before returning to userland from the pcb. 790 * There will be a window of time when we are executing in the host 791 * kernel context with a value of KGSBASE from the guest. This is ok 792 * because the value of KGSBASE is inconsequential in kernel context. 793 * 794 * MSR_EFER is saved and restored in the guest VMCS area on a 795 * VM exit and entry respectively. It is also restored from the 796 * host VMCS area on a VM exit. 797 */ 798 if (guest_msr_rw(vmx, MSR_GSBASE) || 799 guest_msr_rw(vmx, MSR_FSBASE) || 800 guest_msr_rw(vmx, MSR_SYSENTER_CS_MSR) || 801 guest_msr_rw(vmx, MSR_SYSENTER_ESP_MSR) || 802 guest_msr_rw(vmx, MSR_SYSENTER_EIP_MSR) || 803 guest_msr_rw(vmx, MSR_KGSBASE) || 804 guest_msr_rw(vmx, MSR_EFER)) 805 panic("vmx_vminit: error setting guest msr access"); 806 807 /* 808 * MSR_PAT is saved and restored in the guest VMCS are on a VM exit 809 * and entry respectively. It is also restored from the host VMCS 810 * area on a VM exit. However, if running on a system with no 811 * MSR_PAT save/restore support, leave access disabled so accesses 812 * will be trapped. 813 */ 814 if (!vmx_no_patmsr && guest_msr_rw(vmx, MSR_PAT)) 815 panic("vmx_vminit: error setting guest pat msr access"); 816 817 vpid_alloc(vpid, VM_MAXCPU); 818 819 for (i = 0; i < VM_MAXCPU; i++) { 820 vmx->vmcs[i].identifier = vmx_revision(); 821 error = vmclear(&vmx->vmcs[i]); 822 if (error != 0) { 823 panic("vmx_vminit: vmclear error %d on vcpu %d\n", 824 error, i); 825 } 826 827 error = vmcs_set_defaults(&vmx->vmcs[i], 828 (u_long)vmx_longjmp, 829 (u_long)&vmx->ctx[i], 830 vmx->eptp, 831 pinbased_ctls, 832 procbased_ctls, 833 procbased_ctls2, 834 exit_ctls, entry_ctls, 835 vtophys(vmx->msr_bitmap), 836 vpid[i]); 837 838 if (error != 0) 839 panic("vmx_vminit: vmcs_set_defaults error %d", error); 840 841 vmx->cap[i].set = 0; 842 vmx->cap[i].proc_ctls = procbased_ctls; 843 vmx->cap[i].proc_ctls2 = procbased_ctls2; 844 845 vmx->state[i].lastcpu = -1; 846 vmx->state[i].vpid = vpid[i]; 847 848 msr_save_area_init(vmx->guest_msrs[i], &guest_msr_count); 849 850 error = vmcs_set_msr_save(&vmx->vmcs[i], 851 vtophys(vmx->guest_msrs[i]), 852 guest_msr_count); 853 if (error != 0) 854 panic("vmcs_set_msr_save error %d", error); 855 856 /* 857 * Set up the CR0/4 shadows, and init the read shadow 858 * to the power-on register value from the Intel Sys Arch. 859 * CR0 - 0x60000010 860 * CR4 - 0 861 */ 862 error = vmx_setup_cr0_shadow(&vmx->vmcs[i], 0x60000010); 863 if (error != 0) 864 panic("vmx_setup_cr0_shadow %d", error); 865 866 error = vmx_setup_cr4_shadow(&vmx->vmcs[i], 0); 867 if (error != 0) 868 panic("vmx_setup_cr4_shadow %d", error); 869 870 vmx->ctx[i].pmap = pmap; 871 vmx->ctx[i].eptp = vmx->eptp; 872 } 873 874 return (vmx); 875} 876 877static int 878vmx_handle_cpuid(struct vm *vm, int vcpu, struct vmxctx *vmxctx) 879{ 880 int handled, func; 881 882 func = vmxctx->guest_rax; 883 884 handled = x86_emulate_cpuid(vm, vcpu, 885 (uint32_t*)(&vmxctx->guest_rax), 886 (uint32_t*)(&vmxctx->guest_rbx), 887 (uint32_t*)(&vmxctx->guest_rcx), 888 (uint32_t*)(&vmxctx->guest_rdx)); 889 return (handled); 890} 891 892static __inline void 893vmx_run_trace(struct vmx *vmx, int vcpu) 894{ 895#ifdef KTR 896 VCPU_CTR1(vmx->vm, vcpu, "Resume execution at %#lx", vmcs_guest_rip()); 897#endif 898} 899 900static __inline void 901vmx_exit_trace(struct vmx *vmx, int vcpu, uint64_t rip, uint32_t exit_reason, 902 int handled) 903{ 904#ifdef KTR 905 VCPU_CTR3(vmx->vm, vcpu, "%s %s vmexit at 0x%0lx", 906 handled ? "handled" : "unhandled", 907 exit_reason_to_str(exit_reason), rip); 908#endif 909} 910 911static __inline void 912vmx_astpending_trace(struct vmx *vmx, int vcpu, uint64_t rip) 913{ 914#ifdef KTR 915 VCPU_CTR1(vmx->vm, vcpu, "astpending vmexit at 0x%0lx", rip); 916#endif 917} 918 919static void 920vmx_set_pcpu_defaults(struct vmx *vmx, int vcpu) 921{ 922 int lastcpu; 923 struct vmxstate *vmxstate; 924 struct invvpid_desc invvpid_desc = { 0 }; 925 926 vmxstate = &vmx->state[vcpu]; 927 lastcpu = vmxstate->lastcpu; 928 vmxstate->lastcpu = curcpu; 929 930 if (lastcpu == curcpu) 931 return; 932 933 vmm_stat_incr(vmx->vm, vcpu, VCPU_MIGRATIONS, 1); 934 935 vmcs_write(VMCS_HOST_TR_BASE, vmm_get_host_trbase()); 936 vmcs_write(VMCS_HOST_GDTR_BASE, vmm_get_host_gdtrbase()); 937 vmcs_write(VMCS_HOST_GS_BASE, vmm_get_host_gsbase()); 938 939 /* 940 * If we are using VPIDs then invalidate all mappings tagged with 'vpid' 941 * 942 * We do this because this vcpu was executing on a different host 943 * cpu when it last ran. We do not track whether it invalidated 944 * mappings associated with its 'vpid' during that run. So we must 945 * assume that the mappings associated with 'vpid' on 'curcpu' are 946 * stale and invalidate them. 947 * 948 * Note that we incur this penalty only when the scheduler chooses to 949 * move the thread associated with this vcpu between host cpus. 950 * 951 * Note also that this will invalidate mappings tagged with 'vpid' 952 * for "all" EP4TAs. 953 */ 954 if (vmxstate->vpid != 0) { 955 invvpid_desc.vpid = vmxstate->vpid; 956 invvpid(INVVPID_TYPE_SINGLE_CONTEXT, invvpid_desc); 957 } 958} 959 960/* 961 * We depend on 'procbased_ctls' to have the Interrupt Window Exiting bit set. 962 */ 963CTASSERT((PROCBASED_CTLS_ONE_SETTING & PROCBASED_INT_WINDOW_EXITING) != 0); 964 965static void __inline 966vmx_set_int_window_exiting(struct vmx *vmx, int vcpu) 967{ 968 969 vmx->cap[vcpu].proc_ctls |= PROCBASED_INT_WINDOW_EXITING; 970 vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); 971} 972 973static void __inline 974vmx_clear_int_window_exiting(struct vmx *vmx, int vcpu) 975{ 976 977 vmx->cap[vcpu].proc_ctls &= ~PROCBASED_INT_WINDOW_EXITING; 978 vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); 979} 980 981static void __inline 982vmx_set_nmi_window_exiting(struct vmx *vmx, int vcpu) 983{ 984 985 vmx->cap[vcpu].proc_ctls |= PROCBASED_NMI_WINDOW_EXITING; 986 vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); 987} 988 989static void __inline 990vmx_clear_nmi_window_exiting(struct vmx *vmx, int vcpu) 991{ 992 993 vmx->cap[vcpu].proc_ctls &= ~PROCBASED_NMI_WINDOW_EXITING; 994 vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); 995} 996 997static int 998vmx_inject_nmi(struct vmx *vmx, int vcpu) 999{ 1000 uint64_t info, interruptibility; 1001 1002 /* Bail out if no NMI requested */ 1003 if (!vm_nmi_pending(vmx->vm, vcpu)) 1004 return (0); 1005 1006 interruptibility = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); 1007 if (interruptibility & nmi_blocking_bits) 1008 goto nmiblocked; 1009 1010 /* 1011 * Inject the virtual NMI. The vector must be the NMI IDT entry 1012 * or the VMCS entry check will fail. 1013 */ 1014 info = VMCS_INTERRUPTION_INFO_NMI | VMCS_INTERRUPTION_INFO_VALID; 1015 info |= IDT_NMI; 1016 vmcs_write(VMCS_ENTRY_INTR_INFO, info); 1017 1018 VCPU_CTR0(vmx->vm, vcpu, "Injecting vNMI"); 1019 1020 /* Clear the request */ 1021 vm_nmi_clear(vmx->vm, vcpu); 1022 return (1); 1023 1024nmiblocked: 1025 /* 1026 * Set the NMI Window Exiting execution control so we can inject 1027 * the virtual NMI as soon as blocking condition goes away. 1028 */ 1029 vmx_set_nmi_window_exiting(vmx, vcpu); 1030 1031 VCPU_CTR0(vmx->vm, vcpu, "Enabling NMI window exiting"); 1032 return (1); 1033} 1034 1035static void 1036vmx_inject_interrupts(struct vmx *vmx, int vcpu) 1037{ 1038 int vector; 1039 uint64_t info, rflags, interruptibility; 1040 1041 const int HWINTR_BLOCKED = VMCS_INTERRUPTIBILITY_STI_BLOCKING | 1042 VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING; 1043 1044 /* 1045 * If there is already an interrupt pending then just return. 1046 * 1047 * This could happen if an interrupt was injected on a prior 1048 * VM entry but the actual entry into guest mode was aborted 1049 * because of a pending AST. 1050 */ 1051 info = vmcs_read(VMCS_ENTRY_INTR_INFO); 1052 if (info & VMCS_INTERRUPTION_INFO_VALID) 1053 return; 1054 1055 /* 1056 * NMI injection has priority so deal with those first 1057 */ 1058 if (vmx_inject_nmi(vmx, vcpu)) 1059 return; 1060 1061 /* Ask the local apic for a vector to inject */ 1062 vector = lapic_pending_intr(vmx->vm, vcpu); 1063 if (vector < 0) 1064 return; 1065 1066 if (vector < 32 || vector > 255) 1067 panic("vmx_inject_interrupts: invalid vector %d\n", vector); 1068 1069 /* Check RFLAGS.IF and the interruptibility state of the guest */ 1070 rflags = vmcs_read(VMCS_GUEST_RFLAGS); 1071 if ((rflags & PSL_I) == 0) 1072 goto cantinject; 1073 1074 interruptibility = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); 1075 if (interruptibility & HWINTR_BLOCKED) 1076 goto cantinject; 1077 1078 /* Inject the interrupt */ 1079 info = VMCS_INTERRUPTION_INFO_HW_INTR | VMCS_INTERRUPTION_INFO_VALID; 1080 info |= vector; 1081 vmcs_write(VMCS_ENTRY_INTR_INFO, info); 1082 1083 /* Update the Local APIC ISR */ 1084 lapic_intr_accepted(vmx->vm, vcpu, vector); 1085 1086 VCPU_CTR1(vmx->vm, vcpu, "Injecting hwintr at vector %d", vector); 1087 1088 return; 1089 1090cantinject: 1091 /* 1092 * Set the Interrupt Window Exiting execution control so we can inject 1093 * the interrupt as soon as blocking condition goes away. 1094 */ 1095 vmx_set_int_window_exiting(vmx, vcpu); 1096 1097 VCPU_CTR0(vmx->vm, vcpu, "Enabling interrupt window exiting"); 1098} 1099 1100static int 1101vmx_emulate_cr_access(struct vmx *vmx, int vcpu, uint64_t exitqual) 1102{ 1103 int cr, vmcs_guest_cr, vmcs_shadow_cr; 1104 uint64_t crval, regval, ones_mask, zeros_mask; 1105 const struct vmxctx *vmxctx; 1106 1107 /* We only handle mov to %cr0 or %cr4 at this time */ 1108 if ((exitqual & 0xf0) != 0x00) 1109 return (UNHANDLED); 1110 1111 cr = exitqual & 0xf; 1112 if (cr != 0 && cr != 4) 1113 return (UNHANDLED); 1114 1115 vmxctx = &vmx->ctx[vcpu]; 1116 1117 /* 1118 * We must use vmcs_write() directly here because vmcs_setreg() will 1119 * call vmclear(vmcs) as a side-effect which we certainly don't want. 1120 */ 1121 switch ((exitqual >> 8) & 0xf) { 1122 case 0: 1123 regval = vmxctx->guest_rax; 1124 break; 1125 case 1: 1126 regval = vmxctx->guest_rcx; 1127 break; 1128 case 2: 1129 regval = vmxctx->guest_rdx; 1130 break; 1131 case 3: 1132 regval = vmxctx->guest_rbx; 1133 break; 1134 case 4: 1135 regval = vmcs_read(VMCS_GUEST_RSP); 1136 break; 1137 case 5: 1138 regval = vmxctx->guest_rbp; 1139 break; 1140 case 6: 1141 regval = vmxctx->guest_rsi; 1142 break; 1143 case 7: 1144 regval = vmxctx->guest_rdi; 1145 break; 1146 case 8: 1147 regval = vmxctx->guest_r8; 1148 break; 1149 case 9: 1150 regval = vmxctx->guest_r9; 1151 break; 1152 case 10: 1153 regval = vmxctx->guest_r10; 1154 break; 1155 case 11: 1156 regval = vmxctx->guest_r11; 1157 break; 1158 case 12: 1159 regval = vmxctx->guest_r12; 1160 break; 1161 case 13: 1162 regval = vmxctx->guest_r13; 1163 break; 1164 case 14: 1165 regval = vmxctx->guest_r14; 1166 break; 1167 case 15: 1168 regval = vmxctx->guest_r15; 1169 break; 1170 } 1171 1172 if (cr == 0) { 1173 ones_mask = cr0_ones_mask; 1174 zeros_mask = cr0_zeros_mask; 1175 vmcs_guest_cr = VMCS_GUEST_CR0; 1176 vmcs_shadow_cr = VMCS_CR0_SHADOW; 1177 } else { 1178 ones_mask = cr4_ones_mask; 1179 zeros_mask = cr4_zeros_mask; 1180 vmcs_guest_cr = VMCS_GUEST_CR4; 1181 vmcs_shadow_cr = VMCS_CR4_SHADOW; 1182 } 1183 vmcs_write(vmcs_shadow_cr, regval); 1184 1185 crval = regval | ones_mask; 1186 crval &= ~zeros_mask; 1187 vmcs_write(vmcs_guest_cr, crval); 1188 1189 if (cr == 0 && regval & CR0_PG) { 1190 uint64_t efer, entry_ctls; 1191 1192 /* 1193 * If CR0.PG is 1 and EFER.LME is 1 then EFER.LMA and 1194 * the "IA-32e mode guest" bit in VM-entry control must be 1195 * equal. 1196 */ 1197 efer = vmcs_read(VMCS_GUEST_IA32_EFER); 1198 if (efer & EFER_LME) { 1199 efer |= EFER_LMA; 1200 vmcs_write(VMCS_GUEST_IA32_EFER, efer); 1201 entry_ctls = vmcs_read(VMCS_ENTRY_CTLS); 1202 entry_ctls |= VM_ENTRY_GUEST_LMA; 1203 vmcs_write(VMCS_ENTRY_CTLS, entry_ctls); 1204 } 1205 } 1206 1207 return (HANDLED); 1208} 1209 1210static int 1211ept_fault_type(uint64_t ept_qual) 1212{ 1213 int fault_type; 1214 1215 if (ept_qual & EPT_VIOLATION_DATA_WRITE) 1216 fault_type = VM_PROT_WRITE; 1217 else if (ept_qual & EPT_VIOLATION_INST_FETCH) 1218 fault_type = VM_PROT_EXECUTE; 1219 else 1220 fault_type= VM_PROT_READ; 1221 1222 return (fault_type); 1223} 1224 1225static boolean_t 1226ept_emulation_fault(uint64_t ept_qual) 1227{ 1228 int read, write; 1229 1230 /* EPT fault on an instruction fetch doesn't make sense here */ 1231 if (ept_qual & EPT_VIOLATION_INST_FETCH) 1232 return (FALSE); 1233 1234 /* EPT fault must be a read fault or a write fault */ 1235 read = ept_qual & EPT_VIOLATION_DATA_READ ? 1 : 0; 1236 write = ept_qual & EPT_VIOLATION_DATA_WRITE ? 1 : 0; 1237 if ((read | write) == 0) 1238 return (FALSE); 1239 1240 /* 1241 * The EPT violation must have been caused by accessing a 1242 * guest-physical address that is a translation of a guest-linear 1243 * address. 1244 */ 1245 if ((ept_qual & EPT_VIOLATION_GLA_VALID) == 0 || 1246 (ept_qual & EPT_VIOLATION_XLAT_VALID) == 0) { 1247 return (FALSE); 1248 } 1249 1250 return (TRUE); 1251} 1252 1253static int 1254vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) 1255{ 1256 int error, handled; 1257 struct vmcs *vmcs; 1258 struct vmxctx *vmxctx; 1259 uint32_t eax, ecx, edx, idtvec_info, idtvec_err, reason; 1260 uint64_t qual, gpa; 1261 bool retu; 1262 1263 handled = 0; 1264 vmcs = &vmx->vmcs[vcpu]; 1265 vmxctx = &vmx->ctx[vcpu]; 1266 qual = vmexit->u.vmx.exit_qualification; 1267 reason = vmexit->u.vmx.exit_reason; 1268 vmexit->exitcode = VM_EXITCODE_BOGUS; 1269 1270 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_COUNT, 1); 1271 1272 /* 1273 * VM exits that could be triggered during event injection on the 1274 * previous VM entry need to be handled specially by re-injecting 1275 * the event. 1276 * 1277 * See "Information for VM Exits During Event Delivery" in Intel SDM 1278 * for details. 1279 */ 1280 switch (reason) { 1281 case EXIT_REASON_EPT_FAULT: 1282 case EXIT_REASON_EPT_MISCONFIG: 1283 case EXIT_REASON_APIC: 1284 case EXIT_REASON_TASK_SWITCH: 1285 case EXIT_REASON_EXCEPTION: 1286 idtvec_info = vmcs_idt_vectoring_info(); 1287 if (idtvec_info & VMCS_IDT_VEC_VALID) { 1288 idtvec_info &= ~(1 << 12); /* clear undefined bit */ 1289 vmcs_write(VMCS_ENTRY_INTR_INFO, idtvec_info); 1290 if (idtvec_info & VMCS_IDT_VEC_ERRCODE_VALID) { 1291 idtvec_err = vmcs_idt_vectoring_err(); 1292 vmcs_write(VMCS_ENTRY_EXCEPTION_ERROR, 1293 idtvec_err); 1294 } 1295 vmcs_write(VMCS_ENTRY_INST_LENGTH, vmexit->inst_length); 1296 } 1297 default: 1298 break; 1299 } 1300 1301 switch (reason) { 1302 case EXIT_REASON_CR_ACCESS: 1303 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CR_ACCESS, 1); 1304 handled = vmx_emulate_cr_access(vmx, vcpu, qual); 1305 break; 1306 case EXIT_REASON_RDMSR: 1307 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_RDMSR, 1); 1308 retu = false; 1309 ecx = vmxctx->guest_rcx; 1310 error = emulate_rdmsr(vmx->vm, vcpu, ecx, &retu); 1311 if (error) { 1312 vmexit->exitcode = VM_EXITCODE_RDMSR; 1313 vmexit->u.msr.code = ecx; 1314 } else if (!retu) { 1315 handled = 1; 1316 } else { 1317 /* Return to userspace with a valid exitcode */ 1318 KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS, 1319 ("emulate_wrmsr retu with bogus exitcode")); 1320 } 1321 break; 1322 case EXIT_REASON_WRMSR: 1323 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_WRMSR, 1); 1324 retu = false; 1325 eax = vmxctx->guest_rax; 1326 ecx = vmxctx->guest_rcx; 1327 edx = vmxctx->guest_rdx; 1328 error = emulate_wrmsr(vmx->vm, vcpu, ecx, 1329 (uint64_t)edx << 32 | eax, &retu); 1330 if (error) { 1331 vmexit->exitcode = VM_EXITCODE_WRMSR; 1332 vmexit->u.msr.code = ecx; 1333 vmexit->u.msr.wval = (uint64_t)edx << 32 | eax; 1334 } else if (!retu) { 1335 handled = 1; 1336 } else { 1337 /* Return to userspace with a valid exitcode */ 1338 KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS, 1339 ("emulate_wrmsr retu with bogus exitcode")); 1340 } 1341 break; 1342 case EXIT_REASON_HLT: 1343 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_HLT, 1); 1344 vmexit->exitcode = VM_EXITCODE_HLT; 1345 vmexit->u.hlt.rflags = vmcs_read(VMCS_GUEST_RFLAGS); 1346 break; 1347 case EXIT_REASON_MTF: 1348 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_MTRAP, 1); 1349 vmexit->exitcode = VM_EXITCODE_MTRAP; 1350 break; 1351 case EXIT_REASON_PAUSE: 1352 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_PAUSE, 1); 1353 vmexit->exitcode = VM_EXITCODE_PAUSE; 1354 break; 1355 case EXIT_REASON_INTR_WINDOW: 1356 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INTR_WINDOW, 1); 1357 vmx_clear_int_window_exiting(vmx, vcpu); 1358 VCPU_CTR0(vmx->vm, vcpu, "Disabling interrupt window exiting"); 1359 return (1); 1360 case EXIT_REASON_EXT_INTR: 1361 /* 1362 * External interrupts serve only to cause VM exits and allow 1363 * the host interrupt handler to run. 1364 * 1365 * If this external interrupt triggers a virtual interrupt 1366 * to a VM, then that state will be recorded by the 1367 * host interrupt handler in the VM's softc. We will inject 1368 * this virtual interrupt during the subsequent VM enter. 1369 */ 1370 1371 /* 1372 * This is special. We want to treat this as an 'handled' 1373 * VM-exit but not increment the instruction pointer. 1374 */ 1375 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EXTINT, 1); 1376 return (1); 1377 case EXIT_REASON_NMI_WINDOW: 1378 /* Exit to allow the pending virtual NMI to be injected */ 1379 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_NMI_WINDOW, 1); 1380 vmx_clear_nmi_window_exiting(vmx, vcpu); 1381 VCPU_CTR0(vmx->vm, vcpu, "Disabling NMI window exiting"); 1382 return (1); 1383 case EXIT_REASON_INOUT: 1384 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INOUT, 1); 1385 vmexit->exitcode = VM_EXITCODE_INOUT; 1386 vmexit->u.inout.bytes = (qual & 0x7) + 1; 1387 vmexit->u.inout.in = (qual & 0x8) ? 1 : 0; 1388 vmexit->u.inout.string = (qual & 0x10) ? 1 : 0; 1389 vmexit->u.inout.rep = (qual & 0x20) ? 1 : 0; 1390 vmexit->u.inout.port = (uint16_t)(qual >> 16); 1391 vmexit->u.inout.eax = (uint32_t)(vmxctx->guest_rax); 1392 break; 1393 case EXIT_REASON_CPUID: 1394 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CPUID, 1); 1395 handled = vmx_handle_cpuid(vmx->vm, vcpu, vmxctx); 1396 break; 1397 case EXIT_REASON_EPT_FAULT: 1398 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EPT_FAULT, 1); 1399 /* 1400 * If 'gpa' lies within the address space allocated to 1401 * memory then this must be a nested page fault otherwise 1402 * this must be an instruction that accesses MMIO space. 1403 */ 1404 gpa = vmcs_gpa(); 1405 if (vm_mem_allocated(vmx->vm, gpa)) { 1406 vmexit->exitcode = VM_EXITCODE_PAGING; 1407 vmexit->u.paging.gpa = gpa; 1408 vmexit->u.paging.fault_type = ept_fault_type(qual); 1409 } else if (ept_emulation_fault(qual)) { 1410 vmexit->exitcode = VM_EXITCODE_INST_EMUL; 1411 vmexit->u.inst_emul.gpa = gpa; 1412 vmexit->u.inst_emul.gla = vmcs_gla(); 1413 vmexit->u.inst_emul.cr3 = vmcs_guest_cr3(); 1414 } 1415 break; 1416 default: 1417 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_UNKNOWN, 1); 1418 break; 1419 } 1420 1421 if (handled) { 1422 /* 1423 * It is possible that control is returned to userland 1424 * even though we were able to handle the VM exit in the 1425 * kernel. 1426 * 1427 * In such a case we want to make sure that the userland 1428 * restarts guest execution at the instruction *after* 1429 * the one we just processed. Therefore we update the 1430 * guest rip in the VMCS and in 'vmexit'. 1431 */ 1432 vmexit->rip += vmexit->inst_length; 1433 vmexit->inst_length = 0; 1434 vmcs_write(VMCS_GUEST_RIP, vmexit->rip); 1435 } else { 1436 if (vmexit->exitcode == VM_EXITCODE_BOGUS) { 1437 /* 1438 * If this VM exit was not claimed by anybody then 1439 * treat it as a generic VMX exit. 1440 */ 1441 vmexit->exitcode = VM_EXITCODE_VMX; 1442 vmexit->u.vmx.error = 0; 1443 } else { 1444 /* 1445 * The exitcode and collateral have been populated. 1446 * The VM exit will be processed further in userland. 1447 */ 1448 } 1449 } 1450 return (handled); 1451} 1452 1453static int 1454vmx_run(void *arg, int vcpu, register_t rip, pmap_t pmap) 1455{ 1456 int vie, rc, handled, astpending; 1457 uint32_t exit_reason; 1458 struct vmx *vmx; 1459 struct vmxctx *vmxctx; 1460 struct vmcs *vmcs; 1461 struct vm_exit *vmexit; 1462 1463 vmx = arg; 1464 vmcs = &vmx->vmcs[vcpu]; 1465 vmxctx = &vmx->ctx[vcpu]; 1466 vmxctx->launched = 0; 1467 1468 astpending = 0; 1469 vmexit = vm_exitinfo(vmx->vm, vcpu); 1470 1471 KASSERT(vmxctx->pmap == pmap, 1472 ("pmap %p different than ctx pmap %p", pmap, vmxctx->pmap)); 1473 KASSERT(vmxctx->eptp == vmx->eptp, 1474 ("eptp %p different than ctx eptp %#lx", eptp, vmxctx->eptp)); 1475 1476 /* 1477 * XXX Can we avoid doing this every time we do a vm run? 1478 */ 1479 VMPTRLD(vmcs); 1480 1481 /* 1482 * XXX 1483 * We do this every time because we may setup the virtual machine 1484 * from a different process than the one that actually runs it. 1485 * 1486 * If the life of a virtual machine was spent entirely in the context 1487 * of a single process we could do this once in vmcs_set_defaults(). 1488 */ 1489 vmcs_write(VMCS_HOST_CR3, rcr3()); 1490 vmcs_write(VMCS_GUEST_RIP, rip); 1491 vmx_set_pcpu_defaults(vmx, vcpu); 1492 1493 do { 1494 vmx_inject_interrupts(vmx, vcpu); 1495 vmx_run_trace(vmx, vcpu); 1496 rc = vmx_setjmp(vmxctx); 1497#ifdef SETJMP_TRACE 1498 vmx_setjmp_trace(vmx, vcpu, vmxctx, rc); 1499#endif 1500 switch (rc) { 1501 case VMX_RETURN_DIRECT: 1502 if (vmxctx->launched == 0) { 1503 vmxctx->launched = 1; 1504 vmx_launch(vmxctx); 1505 } else 1506 vmx_resume(vmxctx); 1507 panic("vmx_launch/resume should not return"); 1508 break; 1509 case VMX_RETURN_LONGJMP: 1510 break; /* vm exit */ 1511 case VMX_RETURN_AST: 1512 astpending = 1; 1513 break; 1514 case VMX_RETURN_VMRESUME: 1515 vie = vmcs_instruction_error(); 1516 if (vmxctx->launch_error == VM_FAIL_INVALID || 1517 vie != VMRESUME_WITH_NON_LAUNCHED_VMCS) { 1518 printf("vmresume error %d vmcs inst error %d\n", 1519 vmxctx->launch_error, vie); 1520 goto err_exit; 1521 } 1522 vmx_launch(vmxctx); /* try to launch the guest */ 1523 panic("vmx_launch should not return"); 1524 break; 1525 case VMX_RETURN_VMLAUNCH: 1526 vie = vmcs_instruction_error(); 1527#if 1 1528 printf("vmlaunch error %d vmcs inst error %d\n", 1529 vmxctx->launch_error, vie); 1530#endif 1531 goto err_exit; 1532 case VMX_RETURN_INVEPT: 1533 panic("vm %s:%d invept error %d", 1534 vm_name(vmx->vm), vcpu, vmxctx->launch_error); 1535 default: 1536 panic("vmx_setjmp returned %d", rc); 1537 } 1538 1539 /* enable interrupts */ 1540 enable_intr(); 1541 1542 /* collect some basic information for VM exit processing */ 1543 vmexit->rip = rip = vmcs_guest_rip(); 1544 vmexit->inst_length = vmexit_instruction_length(); 1545 vmexit->u.vmx.exit_reason = exit_reason = vmcs_exit_reason(); 1546 vmexit->u.vmx.exit_qualification = vmcs_exit_qualification(); 1547 1548 if (astpending) { 1549 handled = 1; 1550 vmexit->inst_length = 0; 1551 vmexit->exitcode = VM_EXITCODE_BOGUS; 1552 vmx_astpending_trace(vmx, vcpu, rip); 1553 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_ASTPENDING, 1); 1554 break; 1555 } 1556 1557 handled = vmx_exit_process(vmx, vcpu, vmexit); 1558 vmx_exit_trace(vmx, vcpu, rip, exit_reason, handled); 1559 1560 } while (handled); 1561 1562 /* 1563 * If a VM exit has been handled then the exitcode must be BOGUS 1564 * If a VM exit is not handled then the exitcode must not be BOGUS 1565 */ 1566 if ((handled && vmexit->exitcode != VM_EXITCODE_BOGUS) || 1567 (!handled && vmexit->exitcode == VM_EXITCODE_BOGUS)) { 1568 panic("Mismatch between handled (%d) and exitcode (%d)", 1569 handled, vmexit->exitcode); 1570 } 1571 1572 if (!handled) 1573 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_USERSPACE, 1); 1574 1575 VCPU_CTR1(vmx->vm, vcpu, "goto userland: exitcode %d",vmexit->exitcode); 1576 1577 /* 1578 * XXX 1579 * We need to do this to ensure that any VMCS state cached by the 1580 * processor is flushed to memory. We need to do this in case the 1581 * VM moves to a different cpu the next time it runs. 1582 * 1583 * Can we avoid doing this? 1584 */ 1585 VMCLEAR(vmcs); 1586 return (0); 1587 1588err_exit: 1589 vmexit->exitcode = VM_EXITCODE_VMX; 1590 vmexit->u.vmx.exit_reason = (uint32_t)-1; 1591 vmexit->u.vmx.exit_qualification = (uint32_t)-1; 1592 vmexit->u.vmx.error = vie; 1593 VMCLEAR(vmcs); 1594 return (ENOEXEC); 1595} 1596 1597static void 1598vmx_vmcleanup(void *arg) 1599{ 1600 int i, error; 1601 struct vmx *vmx = arg; 1602 1603 for (i = 0; i < VM_MAXCPU; i++) 1604 vpid_free(vmx->state[i].vpid); 1605 1606 /* 1607 * XXXSMP we also need to clear the VMCS active on the other vcpus. 1608 */ 1609 error = vmclear(&vmx->vmcs[0]); 1610 if (error != 0) 1611 panic("vmx_vmcleanup: vmclear error %d on vcpu 0", error); 1612 1613 free(vmx, M_VMX); 1614 1615 return; 1616} 1617 1618static register_t * 1619vmxctx_regptr(struct vmxctx *vmxctx, int reg) 1620{ 1621 1622 switch (reg) { 1623 case VM_REG_GUEST_RAX: 1624 return (&vmxctx->guest_rax); 1625 case VM_REG_GUEST_RBX: 1626 return (&vmxctx->guest_rbx); 1627 case VM_REG_GUEST_RCX: 1628 return (&vmxctx->guest_rcx); 1629 case VM_REG_GUEST_RDX: 1630 return (&vmxctx->guest_rdx); 1631 case VM_REG_GUEST_RSI: 1632 return (&vmxctx->guest_rsi); 1633 case VM_REG_GUEST_RDI: 1634 return (&vmxctx->guest_rdi); 1635 case VM_REG_GUEST_RBP: 1636 return (&vmxctx->guest_rbp); 1637 case VM_REG_GUEST_R8: 1638 return (&vmxctx->guest_r8); 1639 case VM_REG_GUEST_R9: 1640 return (&vmxctx->guest_r9); 1641 case VM_REG_GUEST_R10: 1642 return (&vmxctx->guest_r10); 1643 case VM_REG_GUEST_R11: 1644 return (&vmxctx->guest_r11); 1645 case VM_REG_GUEST_R12: 1646 return (&vmxctx->guest_r12); 1647 case VM_REG_GUEST_R13: 1648 return (&vmxctx->guest_r13); 1649 case VM_REG_GUEST_R14: 1650 return (&vmxctx->guest_r14); 1651 case VM_REG_GUEST_R15: 1652 return (&vmxctx->guest_r15); 1653 default: 1654 break; 1655 } 1656 return (NULL); 1657} 1658 1659static int 1660vmxctx_getreg(struct vmxctx *vmxctx, int reg, uint64_t *retval) 1661{ 1662 register_t *regp; 1663 1664 if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) { 1665 *retval = *regp; 1666 return (0); 1667 } else 1668 return (EINVAL); 1669} 1670 1671static int 1672vmxctx_setreg(struct vmxctx *vmxctx, int reg, uint64_t val) 1673{ 1674 register_t *regp; 1675 1676 if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) { 1677 *regp = val; 1678 return (0); 1679 } else 1680 return (EINVAL); 1681} 1682 1683static int 1684vmx_shadow_reg(int reg) 1685{ 1686 int shreg; 1687 1688 shreg = -1; 1689 1690 switch (reg) { 1691 case VM_REG_GUEST_CR0: 1692 shreg = VMCS_CR0_SHADOW; 1693 break; 1694 case VM_REG_GUEST_CR4: 1695 shreg = VMCS_CR4_SHADOW; 1696 break; 1697 default: 1698 break; 1699 } 1700 1701 return (shreg); 1702} 1703 1704static int 1705vmx_getreg(void *arg, int vcpu, int reg, uint64_t *retval) 1706{ 1707 int running, hostcpu; 1708 struct vmx *vmx = arg; 1709 1710 running = vcpu_is_running(vmx->vm, vcpu, &hostcpu); 1711 if (running && hostcpu != curcpu) 1712 panic("vmx_getreg: %s%d is running", vm_name(vmx->vm), vcpu); 1713 1714 if (vmxctx_getreg(&vmx->ctx[vcpu], reg, retval) == 0) 1715 return (0); 1716 1717 return (vmcs_getreg(&vmx->vmcs[vcpu], running, reg, retval)); 1718} 1719 1720static int 1721vmx_setreg(void *arg, int vcpu, int reg, uint64_t val) 1722{ 1723 int error, hostcpu, running, shadow; 1724 uint64_t ctls; 1725 struct vmx *vmx = arg; 1726 1727 running = vcpu_is_running(vmx->vm, vcpu, &hostcpu); 1728 if (running && hostcpu != curcpu) 1729 panic("vmx_setreg: %s%d is running", vm_name(vmx->vm), vcpu); 1730 1731 if (vmxctx_setreg(&vmx->ctx[vcpu], reg, val) == 0) 1732 return (0); 1733 1734 error = vmcs_setreg(&vmx->vmcs[vcpu], running, reg, val); 1735 1736 if (error == 0) { 1737 /* 1738 * If the "load EFER" VM-entry control is 1 then the 1739 * value of EFER.LMA must be identical to "IA-32e mode guest" 1740 * bit in the VM-entry control. 1741 */ 1742 if ((entry_ctls & VM_ENTRY_LOAD_EFER) != 0 && 1743 (reg == VM_REG_GUEST_EFER)) { 1744 vmcs_getreg(&vmx->vmcs[vcpu], running, 1745 VMCS_IDENT(VMCS_ENTRY_CTLS), &ctls); 1746 if (val & EFER_LMA) 1747 ctls |= VM_ENTRY_GUEST_LMA; 1748 else 1749 ctls &= ~VM_ENTRY_GUEST_LMA; 1750 vmcs_setreg(&vmx->vmcs[vcpu], running, 1751 VMCS_IDENT(VMCS_ENTRY_CTLS), ctls); 1752 } 1753 1754 shadow = vmx_shadow_reg(reg); 1755 if (shadow > 0) { 1756 /* 1757 * Store the unmodified value in the shadow 1758 */ 1759 error = vmcs_setreg(&vmx->vmcs[vcpu], running, 1760 VMCS_IDENT(shadow), val); 1761 } 1762 } 1763 1764 return (error); 1765} 1766 1767static int 1768vmx_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc) 1769{ 1770 struct vmx *vmx = arg; 1771 1772 return (vmcs_getdesc(&vmx->vmcs[vcpu], reg, desc)); 1773} 1774 1775static int 1776vmx_setdesc(void *arg, int vcpu, int reg, struct seg_desc *desc) 1777{ 1778 struct vmx *vmx = arg; 1779 1780 return (vmcs_setdesc(&vmx->vmcs[vcpu], reg, desc)); 1781} 1782 1783static int 1784vmx_inject(void *arg, int vcpu, int type, int vector, uint32_t code, 1785 int code_valid) 1786{ 1787 int error; 1788 uint64_t info; 1789 struct vmx *vmx = arg; 1790 struct vmcs *vmcs = &vmx->vmcs[vcpu]; 1791 1792 static uint32_t type_map[VM_EVENT_MAX] = { 1793 0x1, /* VM_EVENT_NONE */ 1794 0x0, /* VM_HW_INTR */ 1795 0x2, /* VM_NMI */ 1796 0x3, /* VM_HW_EXCEPTION */ 1797 0x4, /* VM_SW_INTR */ 1798 0x5, /* VM_PRIV_SW_EXCEPTION */ 1799 0x6, /* VM_SW_EXCEPTION */ 1800 }; 1801 1802 /* 1803 * If there is already an exception pending to be delivered to the 1804 * vcpu then just return. 1805 */ 1806 error = vmcs_getreg(vmcs, 0, VMCS_IDENT(VMCS_ENTRY_INTR_INFO), &info); 1807 if (error) 1808 return (error); 1809 1810 if (info & VMCS_INTERRUPTION_INFO_VALID) 1811 return (EAGAIN); 1812 1813 info = vector | (type_map[type] << 8) | (code_valid ? 1 << 11 : 0); 1814 info |= VMCS_INTERRUPTION_INFO_VALID; 1815 error = vmcs_setreg(vmcs, 0, VMCS_IDENT(VMCS_ENTRY_INTR_INFO), info); 1816 if (error != 0) 1817 return (error); 1818 1819 if (code_valid) { 1820 error = vmcs_setreg(vmcs, 0, 1821 VMCS_IDENT(VMCS_ENTRY_EXCEPTION_ERROR), 1822 code); 1823 } 1824 return (error); 1825} 1826 1827static int 1828vmx_getcap(void *arg, int vcpu, int type, int *retval) 1829{ 1830 struct vmx *vmx = arg; 1831 int vcap; 1832 int ret; 1833 1834 ret = ENOENT; 1835 1836 vcap = vmx->cap[vcpu].set; 1837 1838 switch (type) { 1839 case VM_CAP_HALT_EXIT: 1840 if (cap_halt_exit) 1841 ret = 0; 1842 break; 1843 case VM_CAP_PAUSE_EXIT: 1844 if (cap_pause_exit) 1845 ret = 0; 1846 break; 1847 case VM_CAP_MTRAP_EXIT: 1848 if (cap_monitor_trap) 1849 ret = 0; 1850 break; 1851 case VM_CAP_UNRESTRICTED_GUEST: 1852 if (cap_unrestricted_guest) 1853 ret = 0; 1854 break; 1855 case VM_CAP_ENABLE_INVPCID: 1856 if (cap_invpcid) 1857 ret = 0; 1858 break; 1859 default: 1860 break; 1861 } 1862 1863 if (ret == 0) 1864 *retval = (vcap & (1 << type)) ? 1 : 0; 1865 1866 return (ret); 1867} 1868 1869static int 1870vmx_setcap(void *arg, int vcpu, int type, int val) 1871{ 1872 struct vmx *vmx = arg; 1873 struct vmcs *vmcs = &vmx->vmcs[vcpu]; 1874 uint32_t baseval; 1875 uint32_t *pptr; 1876 int error; 1877 int flag; 1878 int reg; 1879 int retval; 1880 1881 retval = ENOENT; 1882 pptr = NULL; 1883 1884 switch (type) { 1885 case VM_CAP_HALT_EXIT: 1886 if (cap_halt_exit) { 1887 retval = 0; 1888 pptr = &vmx->cap[vcpu].proc_ctls; 1889 baseval = *pptr; 1890 flag = PROCBASED_HLT_EXITING; 1891 reg = VMCS_PRI_PROC_BASED_CTLS; 1892 } 1893 break; 1894 case VM_CAP_MTRAP_EXIT: 1895 if (cap_monitor_trap) { 1896 retval = 0; 1897 pptr = &vmx->cap[vcpu].proc_ctls; 1898 baseval = *pptr; 1899 flag = PROCBASED_MTF; 1900 reg = VMCS_PRI_PROC_BASED_CTLS; 1901 } 1902 break; 1903 case VM_CAP_PAUSE_EXIT: 1904 if (cap_pause_exit) { 1905 retval = 0; 1906 pptr = &vmx->cap[vcpu].proc_ctls; 1907 baseval = *pptr; 1908 flag = PROCBASED_PAUSE_EXITING; 1909 reg = VMCS_PRI_PROC_BASED_CTLS; 1910 } 1911 break; 1912 case VM_CAP_UNRESTRICTED_GUEST: 1913 if (cap_unrestricted_guest) { 1914 retval = 0; 1915 pptr = &vmx->cap[vcpu].proc_ctls2; 1916 baseval = *pptr; 1917 flag = PROCBASED2_UNRESTRICTED_GUEST; 1918 reg = VMCS_SEC_PROC_BASED_CTLS; 1919 } 1920 break; 1921 case VM_CAP_ENABLE_INVPCID: 1922 if (cap_invpcid) { 1923 retval = 0; 1924 pptr = &vmx->cap[vcpu].proc_ctls2; 1925 baseval = *pptr; 1926 flag = PROCBASED2_ENABLE_INVPCID; 1927 reg = VMCS_SEC_PROC_BASED_CTLS; 1928 } 1929 break; 1930 default: 1931 break; 1932 } 1933 1934 if (retval == 0) { 1935 if (val) { 1936 baseval |= flag; 1937 } else { 1938 baseval &= ~flag; 1939 } 1940 VMPTRLD(vmcs); 1941 error = vmwrite(reg, baseval); 1942 VMCLEAR(vmcs); 1943 1944 if (error) { 1945 retval = error; 1946 } else { 1947 /* 1948 * Update optional stored flags, and record 1949 * setting 1950 */ 1951 if (pptr != NULL) { 1952 *pptr = baseval; 1953 } 1954 1955 if (val) { 1956 vmx->cap[vcpu].set |= (1 << type); 1957 } else { 1958 vmx->cap[vcpu].set &= ~(1 << type); 1959 } 1960 } 1961 } 1962 1963 return (retval); 1964} 1965 1966struct vmm_ops vmm_ops_intel = { 1967 vmx_init, 1968 vmx_cleanup, 1969 vmx_restore, 1970 vmx_vminit, 1971 vmx_run, 1972 vmx_vmcleanup, 1973 vmx_getreg, 1974 vmx_setreg, 1975 vmx_getdesc, 1976 vmx_setdesc, 1977 vmx_inject, 1978 vmx_getcap, 1979 vmx_setcap, 1980 ept_vmspace_alloc, 1981 ept_vmspace_free, 1982}; 1983