vmx.c revision 261088
/*-
 * Copyright (c) 2011 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD: stable/10/sys/amd64/vmm/intel/vmx.c 261088 2014-01-23 20:21:39Z jhb $
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: stable/10/sys/amd64/vmm/intel/vmx.c 261088 2014-01-23 20:21:39Z jhb $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/smp.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/pcpu.h>
#include <sys/proc.h>
#include <sys/sysctl.h>

#include <vm/vm.h>
#include <vm/pmap.h>

#include <machine/psl.h>
#include <machine/cpufunc.h>
#include <machine/md_var.h>
#include <machine/pmap.h>
#include <machine/segments.h>
#include <machine/specialreg.h>
#include <machine/vmparam.h>

#include <machine/vmm.h>
#include "vmm_host.h"
#include "vmm_lapic.h"
#include "vmm_msr.h"
#include "vmm_ktr.h"
#include "vmm_stat.h"

#include "vmx_msr.h"
#include "ept.h"
#include "vmx_cpufunc.h"
#include "vmx.h"
#include "x86.h"
#include "vmx_controls.h"

#define PINBASED_CTLS_ONE_SETTING               \
        (PINBASED_EXTINT_EXITING |              \
         PINBASED_NMI_EXITING |                 \
         PINBASED_VIRTUAL_NMI)
#define PINBASED_CTLS_ZERO_SETTING      0

#define PROCBASED_CTLS_WINDOW_SETTING           \
        (PROCBASED_INT_WINDOW_EXITING |         \
         PROCBASED_NMI_WINDOW_EXITING)

#define PROCBASED_CTLS_ONE_SETTING              \
        (PROCBASED_SECONDARY_CONTROLS |         \
         PROCBASED_IO_EXITING |                 \
         PROCBASED_MSR_BITMAPS |                \
         PROCBASED_CTLS_WINDOW_SETTING)
#define PROCBASED_CTLS_ZERO_SETTING             \
        (PROCBASED_CR3_LOAD_EXITING |           \
         PROCBASED_CR3_STORE_EXITING |          \
         PROCBASED_IO_BITMAPS)

#define PROCBASED_CTLS2_ONE_SETTING     PROCBASED2_ENABLE_EPT
#define PROCBASED_CTLS2_ZERO_SETTING    0

#define VM_EXIT_CTLS_ONE_SETTING_NO_PAT         \
        (VM_EXIT_HOST_LMA |                     \
         VM_EXIT_SAVE_EFER |                    \
         VM_EXIT_LOAD_EFER)

#define VM_EXIT_CTLS_ONE_SETTING                \
        (VM_EXIT_CTLS_ONE_SETTING_NO_PAT |      \
         VM_EXIT_SAVE_PAT |                     \
         VM_EXIT_LOAD_PAT)
#define VM_EXIT_CTLS_ZERO_SETTING       VM_EXIT_SAVE_DEBUG_CONTROLS

#define VM_ENTRY_CTLS_ONE_SETTING_NO_PAT        VM_ENTRY_LOAD_EFER

#define VM_ENTRY_CTLS_ONE_SETTING               \
        (VM_ENTRY_CTLS_ONE_SETTING_NO_PAT |     \
         VM_ENTRY_LOAD_PAT)
#define VM_ENTRY_CTLS_ZERO_SETTING              \
        (VM_ENTRY_LOAD_DEBUG_CONTROLS |         \
         VM_ENTRY_INTO_SMM |                    \
         VM_ENTRY_DEACTIVATE_DUAL_MONITOR)

#define guest_msr_rw(vmx, msr) \
        msr_bitmap_change_access((vmx)->msr_bitmap, (msr), MSR_BITMAP_ACCESS_RW)

#define HANDLED         1
#define UNHANDLED       0

MALLOC_DEFINE(M_VMX, "vmx", "vmx");

SYSCTL_DECL(_hw_vmm);
SYSCTL_NODE(_hw_vmm, OID_AUTO, vmx, CTLFLAG_RW, NULL, NULL);

int vmxon_enabled[MAXCPU];
static char vmxon_region[MAXCPU][PAGE_SIZE] __aligned(PAGE_SIZE);

static uint32_t pinbased_ctls, procbased_ctls, procbased_ctls2;
static uint32_t exit_ctls, entry_ctls;

static uint64_t cr0_ones_mask, cr0_zeros_mask;
SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr0_ones_mask, CTLFLAG_RD,
    &cr0_ones_mask, 0, NULL);
SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr0_zeros_mask, CTLFLAG_RD,
    &cr0_zeros_mask, 0, NULL);

static uint64_t cr4_ones_mask, cr4_zeros_mask;
SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr4_ones_mask, CTLFLAG_RD,
    &cr4_ones_mask, 0, NULL);
SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr4_zeros_mask, CTLFLAG_RD,
    &cr4_zeros_mask, 0, NULL);

static int vmx_no_patmsr;

static int vmx_initialized;
SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, initialized, CTLFLAG_RD,
    &vmx_initialized, 0, "Intel VMX initialized");

/*
 * Virtual NMI blocking conditions.
 *
 * Some processor implementations also require NMI to be blocked if
 * the STI_BLOCKING bit is set. It is possible to detect this at runtime
 * based on the (exit_reason, exit_qual) tuple being set to
 * (EXIT_REASON_INVAL_VMCS, EXIT_QUAL_NMI_WHILE_STI_BLOCKING).
 *
 * We take the easy way out and also include STI_BLOCKING as one of the
 * gating items for vNMI injection.
 */
static uint64_t nmi_blocking_bits = VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING |
                                    VMCS_INTERRUPTIBILITY_NMI_BLOCKING |
                                    VMCS_INTERRUPTIBILITY_STI_BLOCKING;

/*
 * Optional capabilities
 */
static int cap_halt_exit;
static int cap_pause_exit;
static int cap_unrestricted_guest;
static int cap_monitor_trap;
static int cap_invpcid;

static struct unrhdr *vpid_unr;
static u_int vpid_alloc_failed;
SYSCTL_UINT(_hw_vmm_vmx, OID_AUTO, vpid_alloc_failed, CTLFLAG_RD,
    &vpid_alloc_failed, 0, NULL);

#ifdef KTR
static const char *
exit_reason_to_str(int reason)
{
        static char reasonbuf[32];

        switch (reason) {
        case EXIT_REASON_EXCEPTION:
                return "exception";
        case EXIT_REASON_EXT_INTR:
                return "extint";
        case EXIT_REASON_TRIPLE_FAULT:
                return "triplefault";
        case EXIT_REASON_INIT:
                return "init";
        case EXIT_REASON_SIPI:
                return "sipi";
        case EXIT_REASON_IO_SMI:
                return "iosmi";
        case EXIT_REASON_SMI:
                return "smi";
        case EXIT_REASON_INTR_WINDOW:
                return "intrwindow";
        case EXIT_REASON_NMI_WINDOW:
                return "nmiwindow";
        case EXIT_REASON_TASK_SWITCH:
                return "taskswitch";
        case EXIT_REASON_CPUID:
                return "cpuid";
        case EXIT_REASON_GETSEC:
                return "getsec";
        case EXIT_REASON_HLT:
                return "hlt";
        case EXIT_REASON_INVD:
                return "invd";
        case EXIT_REASON_INVLPG:
                return "invlpg";
        case EXIT_REASON_RDPMC:
                return "rdpmc";
        case EXIT_REASON_RDTSC:
                return "rdtsc";
        case EXIT_REASON_RSM:
                return "rsm";
        case EXIT_REASON_VMCALL:
                return "vmcall";
        case EXIT_REASON_VMCLEAR:
                return "vmclear";
        case EXIT_REASON_VMLAUNCH:
                return "vmlaunch";
        case EXIT_REASON_VMPTRLD:
                return "vmptrld";
        case EXIT_REASON_VMPTRST:
                return "vmptrst";
        case EXIT_REASON_VMREAD:
                return "vmread";
        case EXIT_REASON_VMRESUME:
                return "vmresume";
        case EXIT_REASON_VMWRITE:
                return "vmwrite";
        case EXIT_REASON_VMXOFF:
                return "vmxoff";
        case EXIT_REASON_VMXON:
                return "vmxon";
        case EXIT_REASON_CR_ACCESS:
                return "craccess";
        case EXIT_REASON_DR_ACCESS:
                return "draccess";
        case EXIT_REASON_INOUT:
                return "inout";
        case EXIT_REASON_RDMSR:
                return "rdmsr";
        case EXIT_REASON_WRMSR:
                return "wrmsr";
        case EXIT_REASON_INVAL_VMCS:
                return "invalvmcs";
        case EXIT_REASON_INVAL_MSR:
                return "invalmsr";
        case EXIT_REASON_MWAIT:
                return "mwait";
        case EXIT_REASON_MTF:
                return "mtf";
        case EXIT_REASON_MONITOR:
                return "monitor";
        case EXIT_REASON_PAUSE:
                return "pause";
        case EXIT_REASON_MCE:
                return "mce";
        case EXIT_REASON_TPR:
                return "tpr";
        case EXIT_REASON_APIC:
                return "apic";
        case EXIT_REASON_GDTR_IDTR:
                return "gdtridtr";
        case EXIT_REASON_LDTR_TR:
                return "ldtrtr";
        case EXIT_REASON_EPT_FAULT:
                return "eptfault";
        case EXIT_REASON_EPT_MISCONFIG:
                return "eptmisconfig";
        case EXIT_REASON_INVEPT:
                return "invept";
        case EXIT_REASON_RDTSCP:
                return "rdtscp";
        case EXIT_REASON_VMX_PREEMPT:
                return "vmxpreempt";
        case EXIT_REASON_INVVPID:
                return "invvpid";
        case EXIT_REASON_WBINVD:
                return "wbinvd";
        case EXIT_REASON_XSETBV:
                return "xsetbv";
        default:
                snprintf(reasonbuf, sizeof(reasonbuf), "%d", reason);
                return (reasonbuf);
        }
}

#ifdef SETJMP_TRACE
static const char *
vmx_setjmp_rc2str(int rc)
{
        switch (rc) {
        case VMX_RETURN_DIRECT:
                return "direct";
        case VMX_RETURN_LONGJMP:
                return "longjmp";
        case VMX_RETURN_VMRESUME:
                return "vmresume";
        case VMX_RETURN_VMLAUNCH:
                return "vmlaunch";
        case VMX_RETURN_AST:
                return "ast";
        default:
                return "unknown";
        }
}

#define SETJMP_TRACE(vmx, vcpu, vmxctx, regname)                          \
        VCPU_CTR1((vmx)->vm, (vcpu), "setjmp trace " #regname " 0x%016lx", \
            (vmxctx)->regname)

static void
vmx_setjmp_trace(struct vmx *vmx, int vcpu, struct vmxctx *vmxctx, int rc)
{
        uint64_t host_rip, host_rsp;

        if (vmxctx != &vmx->ctx[vcpu])
                panic("vmx_setjmp_trace: invalid vmxctx %p; should be %p",
                    vmxctx, &vmx->ctx[vcpu]);

        VCPU_CTR1((vmx)->vm, (vcpu), "vmxctx = %p", vmxctx);
        VCPU_CTR2((vmx)->vm, (vcpu), "setjmp return code %s(%d)",
            vmx_setjmp_rc2str(rc), rc);

        host_rsp = host_rip = ~0;
        vmread(VMCS_HOST_RIP, &host_rip);
        vmread(VMCS_HOST_RSP, &host_rsp);
        VCPU_CTR2((vmx)->vm, (vcpu), "vmcs host_rip 0x%016lx, host_rsp %#lx",
            host_rip, host_rsp);

        SETJMP_TRACE(vmx, vcpu, vmxctx, host_r15);
        SETJMP_TRACE(vmx, vcpu, vmxctx, host_r14);
        SETJMP_TRACE(vmx, vcpu, vmxctx, host_r13);
        SETJMP_TRACE(vmx, vcpu, vmxctx, host_r12);
        SETJMP_TRACE(vmx, vcpu, vmxctx, host_rbp);
        SETJMP_TRACE(vmx, vcpu, vmxctx, host_rsp);
        SETJMP_TRACE(vmx, vcpu, vmxctx, host_rbx);
        SETJMP_TRACE(vmx, vcpu, vmxctx, host_rip);

        SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rdi);
        SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rsi);
        SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rdx);
        SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rcx);
        SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r8);
        SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r9);
        SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rax);
        SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rbx);
        SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rbp);
        SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r10);
        SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r11);
        SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r12);
        SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r13);
        SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r14);
        SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r15);
        SETJMP_TRACE(vmx, vcpu, vmxctx, guest_cr2);
}
#endif
#else
static void __inline
vmx_setjmp_trace(struct vmx *vmx, int vcpu, struct vmxctx *vmxctx, int rc)
{
        return;
}
#endif  /* KTR */

u_long
vmx_fix_cr0(u_long cr0)
{

        return ((cr0 | cr0_ones_mask) & ~cr0_zeros_mask);
}

u_long
vmx_fix_cr4(u_long cr4)
{

        return ((cr4 | cr4_ones_mask) & ~cr4_zeros_mask);
}

static void
vpid_free(int vpid)
{
        if (vpid < 0 || vpid > 0xffff)
                panic("vpid_free: invalid vpid %d", vpid);

        /*
         * VPIDs [0,VM_MAXCPU] are special and are not allocated from
         * the unit number allocator.
         */

        if (vpid > VM_MAXCPU)
                free_unr(vpid_unr, vpid);
}

static void
vpid_alloc(uint16_t *vpid, int num)
{
        int i, x;

        if (num <= 0 || num > VM_MAXCPU)
                panic("invalid number of vpids requested: %d", num);

        /*
         * If the "enable vpid" execution control is not enabled then the
         * VPID is required to be 0 for all vcpus.
         */
        if ((procbased_ctls2 & PROCBASED2_ENABLE_VPID) == 0) {
                for (i = 0; i < num; i++)
                        vpid[i] = 0;
                return;
        }

        /*
         * Allocate a unique VPID for each vcpu from the unit number allocator.
         */
        for (i = 0; i < num; i++) {
                x = alloc_unr(vpid_unr);
                if (x == -1)
                        break;
                else
                        vpid[i] = x;
        }

        if (i < num) {
                atomic_add_int(&vpid_alloc_failed, 1);

                /*
                 * If the unit number allocator does not have enough unique
                 * VPIDs then we need to allocate from the [1,VM_MAXCPU] range.
                 *
                 * These VPIDs are not unique across VMs but this does not
                 * affect correctness because the combined mappings are also
                 * tagged with the EP4TA which is unique for each VM.
                 *
                 * It is still sub-optimal because the invvpid will invalidate
                 * combined mappings for a particular VPID across all EP4TAs.
                 */
                while (i-- > 0)
                        vpid_free(vpid[i]);

                for (i = 0; i < num; i++)
                        vpid[i] = i + 1;
        }
}

static void
vpid_init(void)
{
        /*
         * VPID 0 is required when the "enable VPID" execution control is
         * disabled.
         *
         * VPIDs [1,VM_MAXCPU] are used as the "overflow namespace" when the
         * unit number allocator does not have sufficient unique VPIDs to
         * satisfy the allocation.
         *
         * The remaining VPIDs are managed by the unit number allocator.
         */
        vpid_unr = new_unrhdr(VM_MAXCPU + 1, 0xffff, NULL);
}

static void
msr_save_area_init(struct msr_entry *g_area, int *g_count)
{
        int cnt;

        static struct msr_entry guest_msrs[] = {
                { MSR_KGSBASE, 0, 0 },
        };

        cnt = sizeof(guest_msrs) / sizeof(guest_msrs[0]);
        if (cnt > GUEST_MSR_MAX_ENTRIES)
                panic("guest msr save area overrun");
        bcopy(guest_msrs, g_area, sizeof(guest_msrs));
        *g_count = cnt;
}

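/*
 * Per-cpu teardown, invoked via smp_rendezvous() from vmx_cleanup():
 * invalidate cached EPT and VPID translations, leave VMX operation with
 * VMXOFF and clear CR4.VMXE.
 */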
static void
vmx_disable(void *arg __unused)
{
        struct invvpid_desc invvpid_desc = { 0 };
        struct invept_desc invept_desc = { 0 };

        if (vmxon_enabled[curcpu]) {
                /*
                 * See sections 25.3.3.3 and 25.3.3.4 in Intel Vol 3b.
                 *
                 * VMXON or VMXOFF are not required to invalidate any TLB
                 * caching structures, so invalidate all contexts explicitly
                 * to avoid retaining cached information in the TLB between
                 * distinct VMX episodes.
                 */
                invvpid(INVVPID_TYPE_ALL_CONTEXTS, invvpid_desc);
                invept(INVEPT_TYPE_ALL_CONTEXTS, invept_desc);
                vmxoff();
        }
        load_cr4(rcr4() & ~CR4_VMXE);
}

static int
vmx_cleanup(void)
{

        if (vpid_unr != NULL) {
                delete_unrhdr(vpid_unr);
                vpid_unr = NULL;
        }

        smp_rendezvous(NULL, vmx_disable, NULL, NULL);

        return (0);
}

static void
vmx_enable(void *arg __unused)
{
        int error;

        load_cr4(rcr4() | CR4_VMXE);

        *(uint32_t *)vmxon_region[curcpu] = vmx_revision();
        error = vmxon(vmxon_region[curcpu]);
        if (error == 0)
                vmxon_enabled[curcpu] = 1;
}

static int
vmx_init(void)
{
        int error;
        uint64_t fixed0, fixed1, feature_control;
        uint32_t tmp;

        /* CPUID.1:ECX[bit 5] must be 1 for processor to support VMX */
        if (!(cpu_feature2 & CPUID2_VMX)) {
                printf("vmx_init: processor does not support VMX operation\n");
                return (ENXIO);
        }

        /*
         * Verify that MSR_IA32_FEATURE_CONTROL lock and VMXON enable bits
         * are set (bits 0 and 2 respectively).
         */
        feature_control = rdmsr(MSR_IA32_FEATURE_CONTROL);
        if ((feature_control & IA32_FEATURE_CONTROL_LOCK) == 0 ||
            (feature_control & IA32_FEATURE_CONTROL_VMX_EN) == 0) {
                printf("vmx_init: VMX operation disabled by BIOS\n");
                return (ENXIO);
        }

        /* Check support for primary processor-based VM-execution controls */
        error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
            MSR_VMX_TRUE_PROCBASED_CTLS,
            PROCBASED_CTLS_ONE_SETTING,
            PROCBASED_CTLS_ZERO_SETTING, &procbased_ctls);
        if (error) {
                printf("vmx_init: processor does not support desired primary "
                    "processor-based controls\n");
                return (error);
        }

        /* Clear the processor-based ctl bits that are set on demand */
        procbased_ctls &= ~PROCBASED_CTLS_WINDOW_SETTING;

        /* Check support for secondary processor-based VM-execution controls */
        error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
            MSR_VMX_PROCBASED_CTLS2,
            PROCBASED_CTLS2_ONE_SETTING,
            PROCBASED_CTLS2_ZERO_SETTING, &procbased_ctls2);
        if (error) {
                printf("vmx_init: processor does not support desired secondary "
                    "processor-based controls\n");
                return (error);
        }

        /* Check support for VPID */
        error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2,
            PROCBASED2_ENABLE_VPID, 0, &tmp);
        if (error == 0)
                procbased_ctls2 |= PROCBASED2_ENABLE_VPID;

        /* Check support for pin-based VM-execution controls */
        error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS,
            MSR_VMX_TRUE_PINBASED_CTLS,
            PINBASED_CTLS_ONE_SETTING,
            PINBASED_CTLS_ZERO_SETTING, &pinbased_ctls);
        if (error) {
                printf("vmx_init: processor does not support desired "
                    "pin-based controls\n");
                return (error);
        }

        /* Check support for VM-exit controls */
        error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS, MSR_VMX_TRUE_EXIT_CTLS,
            VM_EXIT_CTLS_ONE_SETTING,
            VM_EXIT_CTLS_ZERO_SETTING,
            &exit_ctls);
        if (error) {
                /* Try again without the PAT MSR bits */
                error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS,
                    MSR_VMX_TRUE_EXIT_CTLS,
                    VM_EXIT_CTLS_ONE_SETTING_NO_PAT,
                    VM_EXIT_CTLS_ZERO_SETTING,
                    &exit_ctls);
                if (error) {
                        printf("vmx_init: processor does not support desired "
                            "exit controls\n");
                        return (error);
                } else {
                        if (bootverbose)
                                printf("vmm: PAT MSR access not supported\n");
                        guest_msr_valid(MSR_PAT);
                        vmx_no_patmsr = 1;
                }
        }

        /* Check support for VM-entry controls */
        if (!vmx_no_patmsr) {
                error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS,
                    MSR_VMX_TRUE_ENTRY_CTLS,
                    VM_ENTRY_CTLS_ONE_SETTING,
                    VM_ENTRY_CTLS_ZERO_SETTING,
                    &entry_ctls);
        } else {
                error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS,
                    MSR_VMX_TRUE_ENTRY_CTLS,
                    VM_ENTRY_CTLS_ONE_SETTING_NO_PAT,
                    VM_ENTRY_CTLS_ZERO_SETTING,
                    &entry_ctls);
        }

        if (error) {
                printf("vmx_init: processor does not support desired "
                    "entry controls\n");
                return (error);
        }

        /*
         * Check support for optional features by testing them
         * as individual bits
         */
        cap_halt_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
            MSR_VMX_TRUE_PROCBASED_CTLS,
            PROCBASED_HLT_EXITING, 0,
            &tmp) == 0);

        cap_monitor_trap = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
            MSR_VMX_PROCBASED_CTLS,
            PROCBASED_MTF, 0,
            &tmp) == 0);

        cap_pause_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
            MSR_VMX_TRUE_PROCBASED_CTLS,
            PROCBASED_PAUSE_EXITING, 0,
            &tmp) == 0);

        cap_unrestricted_guest = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
            MSR_VMX_PROCBASED_CTLS2,
            PROCBASED2_UNRESTRICTED_GUEST, 0,
            &tmp) == 0);

        cap_invpcid = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
            MSR_VMX_PROCBASED_CTLS2, PROCBASED2_ENABLE_INVPCID, 0,
            &tmp) == 0);

        /* Initialize EPT */
        error = ept_init();
        if (error) {
                printf("vmx_init: ept initialization failed (%d)\n", error);
                return (error);
        }

        /*
         * Stash the cr0 and cr4 bits that must be fixed to 0 or 1
         */
        fixed0 = rdmsr(MSR_VMX_CR0_FIXED0);
        fixed1 = rdmsr(MSR_VMX_CR0_FIXED1);
        cr0_ones_mask = fixed0 & fixed1;
        cr0_zeros_mask = ~fixed0 & ~fixed1;

        /*
         * CR0_PE and CR0_PG can be set to zero in VMX non-root operation
         * if unrestricted guest execution is allowed.
         */
        if (cap_unrestricted_guest)
                cr0_ones_mask &= ~(CR0_PG | CR0_PE);

        /*
         * Do not allow the guest to set CR0_NW or CR0_CD.
         */
        cr0_zeros_mask |= (CR0_NW | CR0_CD);

        fixed0 = rdmsr(MSR_VMX_CR4_FIXED0);
        fixed1 = rdmsr(MSR_VMX_CR4_FIXED1);
        cr4_ones_mask = fixed0 & fixed1;
        cr4_zeros_mask = ~fixed0 & ~fixed1;

        vpid_init();

        /* enable VMX operation */
        smp_rendezvous(NULL, vmx_enable, NULL, NULL);

        vmx_initialized = 1;

        return (0);
}

static int
vmx_setup_cr_shadow(int which, struct vmcs *vmcs, uint32_t initial)
{
        int error, mask_ident, shadow_ident;
        uint64_t mask_value;

        if (which != 0 && which != 4)
                panic("vmx_setup_cr_shadow: unknown cr%d", which);

        if (which == 0) {
                mask_ident = VMCS_CR0_MASK;
                mask_value = cr0_ones_mask | cr0_zeros_mask;
                shadow_ident = VMCS_CR0_SHADOW;
        } else {
                mask_ident = VMCS_CR4_MASK;
                mask_value = cr4_ones_mask | cr4_zeros_mask;
                shadow_ident = VMCS_CR4_SHADOW;
        }

        error = vmcs_setreg(vmcs, 0, VMCS_IDENT(mask_ident), mask_value);
        if (error)
                return (error);

        error = vmcs_setreg(vmcs, 0, VMCS_IDENT(shadow_ident), initial);
        if (error)
                return (error);

        return (0);
}
#define vmx_setup_cr0_shadow(vmcs,init) vmx_setup_cr_shadow(0, (vmcs), (init))
#define vmx_setup_cr4_shadow(vmcs,init) vmx_setup_cr_shadow(4, (vmcs), (init))

static void *
vmx_vminit(struct vm *vm, pmap_t pmap)
{
        uint16_t vpid[VM_MAXCPU];
        int i, error, guest_msr_count;
        struct vmx *vmx;

        vmx = malloc(sizeof(struct vmx), M_VMX, M_WAITOK | M_ZERO);
        if ((uintptr_t)vmx & PAGE_MASK) {
                panic("malloc of struct vmx not aligned on %d byte boundary",
                    PAGE_SIZE);
        }
        vmx->vm = vm;

        vmx->eptp = eptp(vtophys((vm_offset_t)pmap->pm_pml4));

        /*
         * Clean up EPTP-tagged guest physical and combined mappings
         *
         * VMX transitions are not required to invalidate any guest physical
         * mappings. So, it may be possible for stale guest physical mappings
         * to be present in the processor TLBs.
         *
         * Combined mappings for this EP4TA are also invalidated for all VPIDs.
         */
        ept_invalidate_mappings(vmx->eptp);

        msr_bitmap_initialize(vmx->msr_bitmap);

        /*
         * It is safe to allow direct access to MSR_GSBASE and MSR_FSBASE.
         * The guest FSBASE and GSBASE are saved and restored during
         * vm-exit and vm-entry respectively. The host FSBASE and GSBASE are
         * always restored from the vmcs host state area on vm-exit.
         *
         * The SYSENTER_CS/ESP/EIP MSRs are identical to FS/GSBASE in
         * how they are saved/restored so can be directly accessed by the
         * guest.
         *
         * Guest KGSBASE is saved and restored in the guest MSR save area.
         * Host KGSBASE is restored before returning to userland from the pcb.
         * There will be a window of time when we are executing in the host
         * kernel context with a value of KGSBASE from the guest. This is ok
         * because the value of KGSBASE is inconsequential in kernel context.
         *
         * MSR_EFER is saved and restored in the guest VMCS area on a
         * VM exit and entry respectively. It is also restored from the
         * host VMCS area on a VM exit.
         */
        if (guest_msr_rw(vmx, MSR_GSBASE) ||
            guest_msr_rw(vmx, MSR_FSBASE) ||
            guest_msr_rw(vmx, MSR_SYSENTER_CS_MSR) ||
            guest_msr_rw(vmx, MSR_SYSENTER_ESP_MSR) ||
            guest_msr_rw(vmx, MSR_SYSENTER_EIP_MSR) ||
            guest_msr_rw(vmx, MSR_KGSBASE) ||
            guest_msr_rw(vmx, MSR_EFER))
                panic("vmx_vminit: error setting guest msr access");

        /*
         * MSR_PAT is saved and restored in the guest VMCS area on a VM exit
         * and entry respectively. It is also restored from the host VMCS
         * area on a VM exit. However, if running on a system with no
         * MSR_PAT save/restore support, leave access disabled so accesses
         * will be trapped.
         */
        if (!vmx_no_patmsr && guest_msr_rw(vmx, MSR_PAT))
                panic("vmx_vminit: error setting guest pat msr access");

        vpid_alloc(vpid, VM_MAXCPU);

        for (i = 0; i < VM_MAXCPU; i++) {
                vmx->vmcs[i].identifier = vmx_revision();
                error = vmclear(&vmx->vmcs[i]);
                if (error != 0) {
                        panic("vmx_vminit: vmclear error %d on vcpu %d\n",
                            error, i);
                }

                error = vmcs_set_defaults(&vmx->vmcs[i],
                    (u_long)vmx_longjmp,
                    (u_long)&vmx->ctx[i],
                    vmx->eptp,
                    pinbased_ctls,
                    procbased_ctls,
                    procbased_ctls2,
                    exit_ctls, entry_ctls,
                    vtophys(vmx->msr_bitmap),
                    vpid[i]);

                if (error != 0)
                        panic("vmx_vminit: vmcs_set_defaults error %d", error);

                vmx->cap[i].set = 0;
                vmx->cap[i].proc_ctls = procbased_ctls;
                vmx->cap[i].proc_ctls2 = procbased_ctls2;

                vmx->state[i].lastcpu = -1;
                vmx->state[i].vpid = vpid[i];

                msr_save_area_init(vmx->guest_msrs[i], &guest_msr_count);

                error = vmcs_set_msr_save(&vmx->vmcs[i],
                    vtophys(vmx->guest_msrs[i]),
                    guest_msr_count);
                if (error != 0)
                        panic("vmcs_set_msr_save error %d", error);

                /*
                 * Set up the CR0/4 shadows, and init the read shadow
                 * to the power-on register value from the Intel Sys Arch.
                 *  CR0 - 0x60000010
                 *  CR4 - 0
                 */
                error = vmx_setup_cr0_shadow(&vmx->vmcs[i], 0x60000010);
                if (error != 0)
                        panic("vmx_setup_cr0_shadow %d", error);

                error = vmx_setup_cr4_shadow(&vmx->vmcs[i], 0);
                if (error != 0)
                        panic("vmx_setup_cr4_shadow %d", error);

                vmx->ctx[i].pmap = pmap;
                vmx->ctx[i].eptp = vmx->eptp;
        }

        return (vmx);
}

static int
vmx_handle_cpuid(struct vm *vm, int vcpu, struct vmxctx *vmxctx)
{
        int handled, func;

        func = vmxctx->guest_rax;

        handled = x86_emulate_cpuid(vm, vcpu,
            (uint32_t*)(&vmxctx->guest_rax),
            (uint32_t*)(&vmxctx->guest_rbx),
            (uint32_t*)(&vmxctx->guest_rcx),
            (uint32_t*)(&vmxctx->guest_rdx));
        return (handled);
}

static __inline void
vmx_run_trace(struct vmx *vmx, int vcpu)
{
#ifdef KTR
        VCPU_CTR1(vmx->vm, vcpu, "Resume execution at %#lx", vmcs_guest_rip());
#endif
}

static __inline void
vmx_exit_trace(struct vmx *vmx, int vcpu, uint64_t rip, uint32_t exit_reason,
    int handled)
{
#ifdef KTR
        VCPU_CTR3(vmx->vm, vcpu, "%s %s vmexit at 0x%0lx",
            handled ? "handled" : "unhandled",
            exit_reason_to_str(exit_reason), rip);
#endif
}

static __inline void
vmx_astpending_trace(struct vmx *vmx, int vcpu, uint64_t rip)
{
#ifdef KTR
        VCPU_CTR1(vmx->vm, vcpu, "astpending vmexit at 0x%0lx", rip);
#endif
}

static int
vmx_set_pcpu_defaults(struct vmx *vmx, int vcpu)
{
        int error, lastcpu;
        struct vmxstate *vmxstate;
        struct invvpid_desc invvpid_desc = { 0 };

        vmxstate = &vmx->state[vcpu];
        lastcpu = vmxstate->lastcpu;
        vmxstate->lastcpu = curcpu;

        if (lastcpu == curcpu) {
                error = 0;
                goto done;
        }

        vmm_stat_incr(vmx->vm, vcpu, VCPU_MIGRATIONS, 1);

        error = vmwrite(VMCS_HOST_TR_BASE, vmm_get_host_trbase());
        if (error != 0)
                goto done;

        error = vmwrite(VMCS_HOST_GDTR_BASE, vmm_get_host_gdtrbase());
        if (error != 0)
                goto done;

        error = vmwrite(VMCS_HOST_GS_BASE, vmm_get_host_gsbase());
        if (error != 0)
                goto done;

        /*
         * If we are using VPIDs then invalidate all mappings tagged with
         * 'vpid'.
         *
         * We do this because this vcpu was executing on a different host
         * cpu when it last ran. We do not track whether it invalidated
         * mappings associated with its 'vpid' during that run. So we must
         * assume that the mappings associated with 'vpid' on 'curcpu' are
         * stale and invalidate them.
         *
         * Note that we incur this penalty only when the scheduler chooses to
         * move the thread associated with this vcpu between host cpus.
         *
         * Note also that this will invalidate mappings tagged with 'vpid'
         * for "all" EP4TAs.
         */
        if (vmxstate->vpid != 0) {
                invvpid_desc.vpid = vmxstate->vpid;
                invvpid(INVVPID_TYPE_SINGLE_CONTEXT, invvpid_desc);
        }
done:
        return (error);
}

static void
vm_exit_update_rip(struct vm_exit *vmexit)
{
        int error;

        error = vmwrite(VMCS_GUEST_RIP, vmexit->rip + vmexit->inst_length);
        if (error)
                panic("vmx_run: error %d writing to VMCS_GUEST_RIP", error);
}

/*
 * We depend on 'procbased_ctls' to have the Interrupt Window Exiting bit set.
 */
CTASSERT((PROCBASED_CTLS_ONE_SETTING & PROCBASED_INT_WINDOW_EXITING) != 0);

static void __inline
vmx_set_int_window_exiting(struct vmx *vmx, int vcpu)
{
        int error;

        vmx->cap[vcpu].proc_ctls |= PROCBASED_INT_WINDOW_EXITING;

        error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
        if (error)
                panic("vmx_set_int_window_exiting: vmwrite error %d", error);
}

static void __inline
vmx_clear_int_window_exiting(struct vmx *vmx, int vcpu)
{
        int error;

        vmx->cap[vcpu].proc_ctls &= ~PROCBASED_INT_WINDOW_EXITING;

        error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
        if (error)
                panic("vmx_clear_int_window_exiting: vmwrite error %d", error);
}

static void __inline
vmx_set_nmi_window_exiting(struct vmx *vmx, int vcpu)
{
        int error;

        vmx->cap[vcpu].proc_ctls |= PROCBASED_NMI_WINDOW_EXITING;

        error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
        if (error)
                panic("vmx_set_nmi_window_exiting: vmwrite error %d", error);
}

static void __inline
vmx_clear_nmi_window_exiting(struct vmx *vmx, int vcpu)
{
        int error;

        vmx->cap[vcpu].proc_ctls &= ~PROCBASED_NMI_WINDOW_EXITING;

        error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
        if (error)
                panic("vmx_clear_nmi_window_exiting: vmwrite error %d", error);
}

static int
vmx_inject_nmi(struct vmx *vmx, int vcpu)
{
        int error;
        uint64_t info, interruptibility;

        /* Bail out if no NMI requested */
        if (!vm_nmi_pending(vmx->vm, vcpu))
                return (0);

        error = vmread(VMCS_GUEST_INTERRUPTIBILITY, &interruptibility);
        if (error) {
                panic("vmx_inject_nmi: vmread(interruptibility) %d",
                    error);
        }
        if (interruptibility & nmi_blocking_bits)
                goto nmiblocked;

        /*
         * Inject the virtual NMI. The vector must be the NMI IDT entry
         * or the VMCS entry check will fail.
         */
        info = VMCS_INTERRUPTION_INFO_NMI | VMCS_INTERRUPTION_INFO_VALID;
        info |= IDT_NMI;

        error = vmwrite(VMCS_ENTRY_INTR_INFO, info);
        if (error)
                panic("vmx_inject_nmi: vmwrite(intrinfo) %d", error);

        VCPU_CTR0(vmx->vm, vcpu, "Injecting vNMI");

        /* Clear the request */
        vm_nmi_clear(vmx->vm, vcpu);
        return (1);

nmiblocked:
        /*
         * Set the NMI Window Exiting execution control so we can inject
         * the virtual NMI as soon as the blocking condition goes away.
         */
        vmx_set_nmi_window_exiting(vmx, vcpu);

        VCPU_CTR0(vmx->vm, vcpu, "Enabling NMI window exiting");
        return (1);
}

static void
vmx_inject_interrupts(struct vmx *vmx, int vcpu)
{
        int error, vector;
        uint64_t info, rflags, interruptibility;

        const int HWINTR_BLOCKED = VMCS_INTERRUPTIBILITY_STI_BLOCKING |
                                   VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING;

        /*
         * If there is already an interrupt pending then just return.
         *
         * This could happen if an interrupt was injected on a prior
         * VM entry but the actual entry into guest mode was aborted
         * because of a pending AST.
         */
        error = vmread(VMCS_ENTRY_INTR_INFO, &info);
        if (error)
                panic("vmx_inject_interrupts: vmread(intrinfo) %d", error);
        if (info & VMCS_INTERRUPTION_INFO_VALID)
                return;

        /*
         * NMI injection has priority so deal with those first
         */
        if (vmx_inject_nmi(vmx, vcpu))
                return;

        /* Ask the local apic for a vector to inject */
        vector = lapic_pending_intr(vmx->vm, vcpu);
        if (vector < 0)
                return;

        if (vector < 32 || vector > 255)
                panic("vmx_inject_interrupts: invalid vector %d\n", vector);

        /* Check RFLAGS.IF and the interruptibility state of the guest */
        error = vmread(VMCS_GUEST_RFLAGS, &rflags);
        if (error)
                panic("vmx_inject_interrupts: vmread(rflags) %d", error);

        if ((rflags & PSL_I) == 0)
                goto cantinject;

        error = vmread(VMCS_GUEST_INTERRUPTIBILITY, &interruptibility);
        if (error) {
                panic("vmx_inject_interrupts: vmread(interruptibility) %d",
                    error);
        }
        if (interruptibility & HWINTR_BLOCKED)
                goto cantinject;

        /* Inject the interrupt */
        info = VMCS_INTERRUPTION_INFO_HW_INTR | VMCS_INTERRUPTION_INFO_VALID;
        info |= vector;
        error = vmwrite(VMCS_ENTRY_INTR_INFO, info);
        if (error)
                panic("vmx_inject_interrupts: vmwrite(intrinfo) %d", error);

        /* Update the Local APIC ISR */
        lapic_intr_accepted(vmx->vm, vcpu, vector);

        VCPU_CTR1(vmx->vm, vcpu, "Injecting hwintr at vector %d", vector);

        return;

cantinject:
        /*
         * Set the Interrupt Window Exiting execution control so we can inject
         * the interrupt as soon as the blocking condition goes away.
         */
        vmx_set_int_window_exiting(vmx, vcpu);

        VCPU_CTR0(vmx->vm, vcpu, "Enabling interrupt window exiting");
}

static int
vmx_emulate_cr_access(struct vmx *vmx, int vcpu, uint64_t exitqual)
{
        int error, cr, vmcs_guest_cr, vmcs_shadow_cr;
        uint64_t crval, regval, ones_mask, zeros_mask;
        const struct vmxctx *vmxctx;

        /* We only handle mov to %cr0 or %cr4 at this time */
        if ((exitqual & 0xf0) != 0x00)
                return (UNHANDLED);

        cr = exitqual & 0xf;
        if (cr != 0 && cr != 4)
                return (UNHANDLED);

        vmxctx = &vmx->ctx[vcpu];

        /*
         * We must use vmwrite() directly here because vmcs_setreg() will
         * call vmclear(vmcs) as a side-effect which we certainly don't want.
         */
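        /*
         * Bits 11:8 of the exit qualification encode the general-purpose
         * register that is the source operand of the "mov to cr" (see the
         * exit qualification format for control-register accesses in the
         * Intel SDM).
         */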
        switch ((exitqual >> 8) & 0xf) {
        case 0:
                regval = vmxctx->guest_rax;
                break;
        case 1:
                regval = vmxctx->guest_rcx;
                break;
        case 2:
                regval = vmxctx->guest_rdx;
                break;
        case 3:
                regval = vmxctx->guest_rbx;
                break;
        case 4:
                error = vmread(VMCS_GUEST_RSP, &regval);
                if (error) {
                        panic("vmx_emulate_cr_access: "
                            "error %d reading guest rsp", error);
                }
                break;
        case 5:
                regval = vmxctx->guest_rbp;
                break;
        case 6:
                regval = vmxctx->guest_rsi;
                break;
        case 7:
                regval = vmxctx->guest_rdi;
                break;
        case 8:
                regval = vmxctx->guest_r8;
                break;
        case 9:
                regval = vmxctx->guest_r9;
                break;
        case 10:
                regval = vmxctx->guest_r10;
                break;
        case 11:
                regval = vmxctx->guest_r11;
                break;
        case 12:
                regval = vmxctx->guest_r12;
                break;
        case 13:
                regval = vmxctx->guest_r13;
                break;
        case 14:
                regval = vmxctx->guest_r14;
                break;
        case 15:
                regval = vmxctx->guest_r15;
                break;
        }

        if (cr == 0) {
                ones_mask = cr0_ones_mask;
                zeros_mask = cr0_zeros_mask;
                vmcs_guest_cr = VMCS_GUEST_CR0;
                vmcs_shadow_cr = VMCS_CR0_SHADOW;
        } else {
                ones_mask = cr4_ones_mask;
                zeros_mask = cr4_zeros_mask;
                vmcs_guest_cr = VMCS_GUEST_CR4;
                vmcs_shadow_cr = VMCS_CR4_SHADOW;
        }

        error = vmwrite(vmcs_shadow_cr, regval);
        if (error) {
                panic("vmx_emulate_cr_access: error %d writing cr%d shadow",
                    error, cr);
        }

        crval = regval | ones_mask;
        crval &= ~zeros_mask;
        error = vmwrite(vmcs_guest_cr, crval);
        if (error) {
                panic("vmx_emulate_cr_access: error %d writing cr%d",
                    error, cr);
        }

        if (cr == 0 && regval & CR0_PG) {
                uint64_t efer, entry_ctls;

                /*
                 * If CR0.PG is 1 and EFER.LME is 1 then EFER.LMA and
                 * the "IA-32e mode guest" bit in VM-entry control must be
                 * equal.
                 */
                error = vmread(VMCS_GUEST_IA32_EFER, &efer);
                if (error) {
                        panic("vmx_emulate_cr_access: error %d efer read",
                            error);
                }
                if (efer & EFER_LME) {
                        efer |= EFER_LMA;
                        error = vmwrite(VMCS_GUEST_IA32_EFER, efer);
                        if (error) {
                                panic("vmx_emulate_cr_access: error %d"
                                    " efer write", error);
                        }
                        error = vmread(VMCS_ENTRY_CTLS, &entry_ctls);
                        if (error) {
                                panic("vmx_emulate_cr_access: error %d"
                                    " entry ctls read", error);
                        }
                        entry_ctls |= VM_ENTRY_GUEST_LMA;
                        error = vmwrite(VMCS_ENTRY_CTLS, entry_ctls);
                        if (error) {
                                panic("vmx_emulate_cr_access: error %d"
                                    " entry ctls write", error);
                        }
                }
        }

        return (HANDLED);
}

static int
ept_fault_type(uint64_t ept_qual)
{
        int fault_type;

        if (ept_qual & EPT_VIOLATION_DATA_WRITE)
                fault_type = VM_PROT_WRITE;
        else if (ept_qual & EPT_VIOLATION_INST_FETCH)
                fault_type = VM_PROT_EXECUTE;
        else
                fault_type = VM_PROT_READ;

        return (fault_type);
}

static int
ept_protection(uint64_t ept_qual)
{
        int prot = 0;

        if (ept_qual & EPT_VIOLATION_GPA_READABLE)
                prot |= VM_PROT_READ;
        if (ept_qual & EPT_VIOLATION_GPA_WRITEABLE)
                prot |= VM_PROT_WRITE;
        if (ept_qual & EPT_VIOLATION_GPA_EXECUTABLE)
                prot |= VM_PROT_EXECUTE;

        return (prot);
}

static boolean_t
ept_emulation_fault(uint64_t ept_qual)
{
        int read, write;

        /* EPT fault on an instruction fetch doesn't make sense here */
        if (ept_qual & EPT_VIOLATION_INST_FETCH)
                return (FALSE);

        /* EPT fault must be a read fault or a write fault */
        read = ept_qual & EPT_VIOLATION_DATA_READ ? 1 : 0;
        write = ept_qual & EPT_VIOLATION_DATA_WRITE ? 1 : 0;
        if ((read | write) == 0)
                return (FALSE);

        /*
         * The EPT violation must have been caused by accessing a
         * guest-physical address that is a translation of a guest-linear
         * address.
         */
        if ((ept_qual & EPT_VIOLATION_GLA_VALID) == 0 ||
            (ept_qual & EPT_VIOLATION_XLAT_VALID) == 0) {
                return (FALSE);
        }

        return (TRUE);
}

static int
vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
{
        int error, handled;
        struct vmcs *vmcs;
        struct vmxctx *vmxctx;
        uint32_t eax, ecx, edx, idtvec_info, idtvec_err, reason;
        uint64_t qual, gpa;

        handled = 0;
        vmcs = &vmx->vmcs[vcpu];
        vmxctx = &vmx->ctx[vcpu];
        qual = vmexit->u.vmx.exit_qualification;
        reason = vmexit->u.vmx.exit_reason;
        vmexit->exitcode = VM_EXITCODE_BOGUS;

        vmm_stat_incr(vmx->vm, vcpu, VMEXIT_COUNT, 1);

        /*
         * VM exits that could be triggered during event injection on the
         * previous VM entry need to be handled specially by re-injecting
         * the event.
         *
         * See "Information for VM Exits During Event Delivery" in Intel SDM
         * for details.
         */
        switch (reason) {
        case EXIT_REASON_EPT_FAULT:
        case EXIT_REASON_EPT_MISCONFIG:
        case EXIT_REASON_APIC:
        case EXIT_REASON_TASK_SWITCH:
        case EXIT_REASON_EXCEPTION:
                idtvec_info = vmcs_idt_vectoring_info();
                if (idtvec_info & VMCS_IDT_VEC_VALID) {
                        idtvec_info &= ~(1 << 12); /* clear undefined bit */
                        vmwrite(VMCS_ENTRY_INTR_INFO, idtvec_info);
                        if (idtvec_info & VMCS_IDT_VEC_ERRCODE_VALID) {
                                idtvec_err = vmcs_idt_vectoring_err();
                                vmwrite(VMCS_ENTRY_EXCEPTION_ERROR,
                                    idtvec_err);
                        }
                        vmwrite(VMCS_ENTRY_INST_LENGTH, vmexit->inst_length);
                }
        default:
                break;
        }

        switch (reason) {
        case EXIT_REASON_CR_ACCESS:
                vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CR_ACCESS, 1);
                handled = vmx_emulate_cr_access(vmx, vcpu, qual);
                break;
        case EXIT_REASON_RDMSR:
                vmm_stat_incr(vmx->vm, vcpu, VMEXIT_RDMSR, 1);
                ecx = vmxctx->guest_rcx;
                error = emulate_rdmsr(vmx->vm, vcpu, ecx);
                if (error) {
                        vmexit->exitcode = VM_EXITCODE_RDMSR;
                        vmexit->u.msr.code = ecx;
                } else
                        handled = 1;
                break;
        case EXIT_REASON_WRMSR:
                vmm_stat_incr(vmx->vm, vcpu, VMEXIT_WRMSR, 1);
                eax = vmxctx->guest_rax;
                ecx = vmxctx->guest_rcx;
                edx = vmxctx->guest_rdx;
                error = emulate_wrmsr(vmx->vm, vcpu, ecx,
                    (uint64_t)edx << 32 | eax);
                if (error) {
                        vmexit->exitcode = VM_EXITCODE_WRMSR;
                        vmexit->u.msr.code = ecx;
                        vmexit->u.msr.wval = (uint64_t)edx << 32 | eax;
                } else
                        handled = 1;
                break;
        case EXIT_REASON_HLT:
                vmm_stat_incr(vmx->vm, vcpu, VMEXIT_HLT, 1);
                vmexit->exitcode = VM_EXITCODE_HLT;
                break;
        case EXIT_REASON_MTF:
                vmm_stat_incr(vmx->vm, vcpu, VMEXIT_MTRAP, 1);
                vmexit->exitcode = VM_EXITCODE_MTRAP;
                break;
        case EXIT_REASON_PAUSE:
                vmm_stat_incr(vmx->vm, vcpu, VMEXIT_PAUSE, 1);
                vmexit->exitcode = VM_EXITCODE_PAUSE;
                break;
        case EXIT_REASON_INTR_WINDOW:
                vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INTR_WINDOW, 1);
                vmx_clear_int_window_exiting(vmx, vcpu);
                VCPU_CTR0(vmx->vm, vcpu, "Disabling interrupt window exiting");
                return (1);
        case EXIT_REASON_EXT_INTR:
                /*
                 * External interrupts serve only to cause VM exits and allow
                 * the host interrupt handler to run.
                 *
                 * If this external interrupt triggers a virtual interrupt
                 * to a VM, then that state will be recorded by the
                 * host interrupt handler in the VM's softc. We will inject
                 * this virtual interrupt during the subsequent VM enter.
                 */

                /*
                 * This is special. We want to treat this as a 'handled'
                 * VM-exit but not increment the instruction pointer.
                 */
                vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EXTINT, 1);
                return (1);
        case EXIT_REASON_NMI_WINDOW:
                /* Exit to allow the pending virtual NMI to be injected */
                vmm_stat_incr(vmx->vm, vcpu, VMEXIT_NMI_WINDOW, 1);
                vmx_clear_nmi_window_exiting(vmx, vcpu);
                VCPU_CTR0(vmx->vm, vcpu, "Disabling NMI window exiting");
                return (1);
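        /*
         * Decode the I/O instruction from the exit qualification: bits 2:0
         * encode the access size minus one, bit 3 the direction, bits 4 and
         * 5 the string and REP attributes, and bits 31:16 the port number.
         */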
        case EXIT_REASON_INOUT:
                vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INOUT, 1);
                vmexit->exitcode = VM_EXITCODE_INOUT;
                vmexit->u.inout.bytes = (qual & 0x7) + 1;
                vmexit->u.inout.in = (qual & 0x8) ? 1 : 0;
                vmexit->u.inout.string = (qual & 0x10) ? 1 : 0;
                vmexit->u.inout.rep = (qual & 0x20) ? 1 : 0;
                vmexit->u.inout.port = (uint16_t)(qual >> 16);
                vmexit->u.inout.eax = (uint32_t)(vmxctx->guest_rax);
                break;
        case EXIT_REASON_CPUID:
                vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CPUID, 1);
                handled = vmx_handle_cpuid(vmx->vm, vcpu, vmxctx);
                break;
        case EXIT_REASON_EPT_FAULT:
                vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EPT_FAULT, 1);
                /*
                 * If 'gpa' lies within the address space allocated to
                 * memory then this must be a nested page fault otherwise
                 * this must be an instruction that accesses MMIO space.
                 */
                gpa = vmcs_gpa();
                if (vm_mem_allocated(vmx->vm, gpa)) {
                        vmexit->exitcode = VM_EXITCODE_PAGING;
                        vmexit->u.paging.gpa = gpa;
                        vmexit->u.paging.fault_type = ept_fault_type(qual);
                        vmexit->u.paging.protection = ept_protection(qual);
                } else if (ept_emulation_fault(qual)) {
                        vmexit->exitcode = VM_EXITCODE_INST_EMUL;
                        vmexit->u.inst_emul.gpa = gpa;
                        vmexit->u.inst_emul.gla = vmcs_gla();
                        vmexit->u.inst_emul.cr3 = vmcs_guest_cr3();
                }
                break;
        default:
                vmm_stat_incr(vmx->vm, vcpu, VMEXIT_UNKNOWN, 1);
                break;
        }

        if (handled) {
                /*
                 * It is possible that control is returned to userland
                 * even though we were able to handle the VM exit in the
                 * kernel.
                 *
                 * In such a case we want to make sure that the userland
                 * restarts guest execution at the instruction *after*
                 * the one we just processed. Therefore we update the
                 * guest rip in the VMCS and in 'vmexit'.
                 */
                vm_exit_update_rip(vmexit);
                vmexit->rip += vmexit->inst_length;
                vmexit->inst_length = 0;
        } else {
                if (vmexit->exitcode == VM_EXITCODE_BOGUS) {
                        /*
                         * If this VM exit was not claimed by anybody then
                         * treat it as a generic VMX exit.
                         */
                        vmexit->exitcode = VM_EXITCODE_VMX;
                        vmexit->u.vmx.error = 0;
                } else {
                        /*
                         * The exitcode and collateral have been populated.
                         * The VM exit will be processed further in userland.
                         */
                }
        }
        return (handled);
}

static int
vmx_run(void *arg, int vcpu, register_t rip, pmap_t pmap)
{
        int error, vie, rc, handled, astpending;
        uint32_t exit_reason;
        struct vmx *vmx;
        struct vmxctx *vmxctx;
        struct vmcs *vmcs;
        struct vm_exit *vmexit;

        vmx = arg;
        vmcs = &vmx->vmcs[vcpu];
        vmxctx = &vmx->ctx[vcpu];
        vmxctx->launched = 0;

        astpending = 0;
        vmexit = vm_exitinfo(vmx->vm, vcpu);

        KASSERT(vmxctx->pmap == pmap,
            ("pmap %p different than ctx pmap %p", pmap, vmxctx->pmap));
        KASSERT(vmxctx->eptp == vmx->eptp,
            ("eptp %#lx different than ctx eptp %#lx", vmx->eptp,
            vmxctx->eptp));

        /*
         * XXX Can we avoid doing this every time we do a vm run?
         */
        VMPTRLD(vmcs);

        /*
         * XXX
         * We do this every time because we may setup the virtual machine
         * from a different process than the one that actually runs it.
         *
         * If the life of a virtual machine was spent entirely in the context
         * of a single process we could do this once in vmcs_set_defaults().
         */
        if ((error = vmwrite(VMCS_HOST_CR3, rcr3())) != 0)
                panic("vmx_run: error %d writing to VMCS_HOST_CR3", error);

        if ((error = vmwrite(VMCS_GUEST_RIP, rip)) != 0)
                panic("vmx_run: error %d writing to VMCS_GUEST_RIP", error);

        if ((error = vmx_set_pcpu_defaults(vmx, vcpu)) != 0)
                panic("vmx_run: error %d setting up pcpu defaults", error);

        do {
                lapic_timer_tick(vmx->vm, vcpu);
                vmx_inject_interrupts(vmx, vcpu);
                vmx_run_trace(vmx, vcpu);
                rc = vmx_setjmp(vmxctx);
#ifdef SETJMP_TRACE
                vmx_setjmp_trace(vmx, vcpu, vmxctx, rc);
#endif
                switch (rc) {
                case VMX_RETURN_DIRECT:
                        if (vmxctx->launched == 0) {
                                vmxctx->launched = 1;
                                vmx_launch(vmxctx);
                        } else
                                vmx_resume(vmxctx);
                        panic("vmx_launch/resume should not return");
                        break;
                case VMX_RETURN_LONGJMP:
                        break;          /* vm exit */
                case VMX_RETURN_AST:
                        astpending = 1;
                        break;
                case VMX_RETURN_VMRESUME:
                        vie = vmcs_instruction_error();
                        if (vmxctx->launch_error == VM_FAIL_INVALID ||
                            vie != VMRESUME_WITH_NON_LAUNCHED_VMCS) {
                                printf("vmresume error %d vmcs inst error %d\n",
                                    vmxctx->launch_error, vie);
                                goto err_exit;
                        }
                        vmx_launch(vmxctx);     /* try to launch the guest */
                        panic("vmx_launch should not return");
                        break;
                case VMX_RETURN_VMLAUNCH:
                        vie = vmcs_instruction_error();
#if 1
                        printf("vmlaunch error %d vmcs inst error %d\n",
                            vmxctx->launch_error, vie);
#endif
                        goto err_exit;
                case VMX_RETURN_INVEPT:
                        panic("vm %s:%d invept error %d",
                            vm_name(vmx->vm), vcpu, vmxctx->launch_error);
                default:
                        panic("vmx_setjmp returned %d", rc);
                }

                /* enable interrupts */
                enable_intr();

                /* collect some basic information for VM exit processing */
                vmexit->rip = rip = vmcs_guest_rip();
                vmexit->inst_length = vmexit_instruction_length();
                vmexit->u.vmx.exit_reason = exit_reason = vmcs_exit_reason();
                vmexit->u.vmx.exit_qualification = vmcs_exit_qualification();

                if (astpending) {
                        handled = 1;
                        vmexit->inst_length = 0;
                        vmexit->exitcode = VM_EXITCODE_BOGUS;
                        vmx_astpending_trace(vmx, vcpu, rip);
                        vmm_stat_incr(vmx->vm, vcpu, VMEXIT_ASTPENDING, 1);
                        break;
                }

                handled = vmx_exit_process(vmx, vcpu, vmexit);
                vmx_exit_trace(vmx, vcpu, rip, exit_reason, handled);

        } while (handled);

        /*
         * If a VM exit has been handled then the exitcode must be BOGUS.
         * If a VM exit is not handled then the exitcode must not be BOGUS.
         */
        if ((handled && vmexit->exitcode != VM_EXITCODE_BOGUS) ||
            (!handled && vmexit->exitcode == VM_EXITCODE_BOGUS)) {
                panic("Mismatch between handled (%d) and exitcode (%d)",
                    handled, vmexit->exitcode);
        }

        if (!handled)
                vmm_stat_incr(vmx->vm, vcpu, VMEXIT_USERSPACE, 1);

        VCPU_CTR1(vmx->vm, vcpu, "goto userland: exitcode %d",
            vmexit->exitcode);

        /*
         * XXX
         * We need to do this to ensure that any VMCS state cached by the
         * processor is flushed to memory. We need to do this in case the
         * VM moves to a different cpu the next time it runs.
         *
         * Can we avoid doing this?
         */
        VMCLEAR(vmcs);
        return (0);

err_exit:
        vmexit->exitcode = VM_EXITCODE_VMX;
        vmexit->u.vmx.exit_reason = (uint32_t)-1;
        vmexit->u.vmx.exit_qualification = (uint32_t)-1;
        vmexit->u.vmx.error = vie;
        VMCLEAR(vmcs);
        return (ENOEXEC);
}

static void
vmx_vmcleanup(void *arg)
{
        int i, error;
        struct vmx *vmx = arg;

        for (i = 0; i < VM_MAXCPU; i++)
                vpid_free(vmx->state[i].vpid);

        /*
         * XXXSMP we also need to clear the VMCS active on the other vcpus.
         */
        error = vmclear(&vmx->vmcs[0]);
        if (error != 0)
                panic("vmx_vmcleanup: vmclear error %d on vcpu 0", error);

        free(vmx, M_VMX);

        return;
}

static register_t *
vmxctx_regptr(struct vmxctx *vmxctx, int reg)
{

        switch (reg) {
        case VM_REG_GUEST_RAX:
                return (&vmxctx->guest_rax);
        case VM_REG_GUEST_RBX:
                return (&vmxctx->guest_rbx);
        case VM_REG_GUEST_RCX:
                return (&vmxctx->guest_rcx);
        case VM_REG_GUEST_RDX:
                return (&vmxctx->guest_rdx);
        case VM_REG_GUEST_RSI:
                return (&vmxctx->guest_rsi);
        case VM_REG_GUEST_RDI:
                return (&vmxctx->guest_rdi);
        case VM_REG_GUEST_RBP:
                return (&vmxctx->guest_rbp);
        case VM_REG_GUEST_R8:
                return (&vmxctx->guest_r8);
        case VM_REG_GUEST_R9:
                return (&vmxctx->guest_r9);
        case VM_REG_GUEST_R10:
                return (&vmxctx->guest_r10);
        case VM_REG_GUEST_R11:
                return (&vmxctx->guest_r11);
        case VM_REG_GUEST_R12:
                return (&vmxctx->guest_r12);
        case VM_REG_GUEST_R13:
                return (&vmxctx->guest_r13);
        case VM_REG_GUEST_R14:
                return (&vmxctx->guest_r14);
        case VM_REG_GUEST_R15:
                return (&vmxctx->guest_r15);
        default:
                break;
        }
        return (NULL);
}

static int
vmxctx_getreg(struct vmxctx *vmxctx, int reg, uint64_t *retval)
{
        register_t *regp;

        if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) {
                *retval = *regp;
                return (0);
        } else
                return (EINVAL);
}

static int
vmxctx_setreg(struct vmxctx *vmxctx, int reg, uint64_t val)
{
        register_t *regp;

        if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) {
                *regp = val;
                return (0);
        } else
                return (EINVAL);
}

static int
vmx_shadow_reg(int reg)
{
        int shreg;

        shreg = -1;

        switch (reg) {
        case VM_REG_GUEST_CR0:
                shreg = VMCS_CR0_SHADOW;
                break;
        case VM_REG_GUEST_CR4:
                shreg = VMCS_CR4_SHADOW;
                break;
        default:
                break;
        }

        return (shreg);
}

static int
vmx_getreg(void *arg, int vcpu, int reg, uint64_t *retval)
{
        int running, hostcpu;
        struct vmx *vmx = arg;

        running = vcpu_is_running(vmx->vm, vcpu, &hostcpu);
        if (running && hostcpu != curcpu)
                panic("vmx_getreg: %s%d is running", vm_name(vmx->vm), vcpu);

        if (vmxctx_getreg(&vmx->ctx[vcpu], reg, retval) == 0)
                return (0);

        return (vmcs_getreg(&vmx->vmcs[vcpu], running, reg, retval));
}

static int
vmx_setreg(void *arg, int vcpu, int reg, uint64_t val)
{
        int error, hostcpu, running, shadow;
        uint64_t ctls;
        struct vmx *vmx = arg;

        running = vcpu_is_running(vmx->vm, vcpu, &hostcpu);
        if (running && hostcpu != curcpu)
                panic("vmx_setreg: %s%d is running", vm_name(vmx->vm), vcpu);

        if (vmxctx_setreg(&vmx->ctx[vcpu], reg, val) == 0)
                return (0);

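        /*
         * The register is not part of the cached guest state in 'vmxctx',
         * so update it in the VMCS proper.
         */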
        error = vmcs_setreg(&vmx->vmcs[vcpu], running, reg, val);

        if (error == 0) {
                /*
                 * If the "load EFER" VM-entry control is 1 then the
                 * value of EFER.LMA must be identical to "IA-32e mode guest"
                 * bit in the VM-entry control.
                 */
                if ((entry_ctls & VM_ENTRY_LOAD_EFER) != 0 &&
                    (reg == VM_REG_GUEST_EFER)) {
                        vmcs_getreg(&vmx->vmcs[vcpu], running,
                            VMCS_IDENT(VMCS_ENTRY_CTLS), &ctls);
                        if (val & EFER_LMA)
                                ctls |= VM_ENTRY_GUEST_LMA;
                        else
                                ctls &= ~VM_ENTRY_GUEST_LMA;
                        vmcs_setreg(&vmx->vmcs[vcpu], running,
                            VMCS_IDENT(VMCS_ENTRY_CTLS), ctls);
                }

                shadow = vmx_shadow_reg(reg);
                if (shadow > 0) {
                        /*
                         * Store the unmodified value in the shadow
                         */
                        error = vmcs_setreg(&vmx->vmcs[vcpu], running,
                            VMCS_IDENT(shadow), val);
                }
        }

        return (error);
}

static int
vmx_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc)
{
        struct vmx *vmx = arg;

        return (vmcs_getdesc(&vmx->vmcs[vcpu], reg, desc));
}

static int
vmx_setdesc(void *arg, int vcpu, int reg, struct seg_desc *desc)
{
        struct vmx *vmx = arg;

        return (vmcs_setdesc(&vmx->vmcs[vcpu], reg, desc));
}

static int
vmx_inject(void *arg, int vcpu, int type, int vector, uint32_t code,
    int code_valid)
{
        int error;
        uint64_t info;
        struct vmx *vmx = arg;
        struct vmcs *vmcs = &vmx->vmcs[vcpu];

        static uint32_t type_map[VM_EVENT_MAX] = {
                0x1,            /* VM_EVENT_NONE */
                0x0,            /* VM_HW_INTR */
                0x2,            /* VM_NMI */
                0x3,            /* VM_HW_EXCEPTION */
                0x4,            /* VM_SW_INTR */
                0x5,            /* VM_PRIV_SW_EXCEPTION */
                0x6,            /* VM_SW_EXCEPTION */
        };

        /*
         * If there is already an exception pending to be delivered to the
         * vcpu then just return.
         */
        error = vmcs_getreg(vmcs, 0, VMCS_IDENT(VMCS_ENTRY_INTR_INFO), &info);
        if (error)
                return (error);

        if (info & VMCS_INTERRUPTION_INFO_VALID)
                return (EAGAIN);

        info = vector | (type_map[type] << 8) | (code_valid ? 1 << 11 : 0);
        info |= VMCS_INTERRUPTION_INFO_VALID;
        error = vmcs_setreg(vmcs, 0, VMCS_IDENT(VMCS_ENTRY_INTR_INFO), info);
        if (error != 0)
                return (error);

        if (code_valid) {
                error = vmcs_setreg(vmcs, 0,
                    VMCS_IDENT(VMCS_ENTRY_EXCEPTION_ERROR),
                    code);
        }
        return (error);
}

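/*
 * Report whether an optional capability is supported by the processor and,
 * if it is, whether it is currently enabled for this vcpu (recorded in the
 * per-vcpu 'cap[vcpu].set' bitmask).
 */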
static int
vmx_getcap(void *arg, int vcpu, int type, int *retval)
{
        struct vmx *vmx = arg;
        int vcap;
        int ret;

        ret = ENOENT;

        vcap = vmx->cap[vcpu].set;

        switch (type) {
        case VM_CAP_HALT_EXIT:
                if (cap_halt_exit)
                        ret = 0;
                break;
        case VM_CAP_PAUSE_EXIT:
                if (cap_pause_exit)
                        ret = 0;
                break;
        case VM_CAP_MTRAP_EXIT:
                if (cap_monitor_trap)
                        ret = 0;
                break;
        case VM_CAP_UNRESTRICTED_GUEST:
                if (cap_unrestricted_guest)
                        ret = 0;
                break;
        case VM_CAP_ENABLE_INVPCID:
                if (cap_invpcid)
                        ret = 0;
                break;
        default:
                break;
        }

        if (ret == 0)
                *retval = (vcap & (1 << type)) ? 1 : 0;

        return (ret);
}

static int
vmx_setcap(void *arg, int vcpu, int type, int val)
{
        struct vmx *vmx = arg;
        struct vmcs *vmcs = &vmx->vmcs[vcpu];
        uint32_t baseval;
        uint32_t *pptr;
        int error;
        int flag;
        int reg;
        int retval;

        retval = ENOENT;
        pptr = NULL;

        switch (type) {
        case VM_CAP_HALT_EXIT:
                if (cap_halt_exit) {
                        retval = 0;
                        pptr = &vmx->cap[vcpu].proc_ctls;
                        baseval = *pptr;
                        flag = PROCBASED_HLT_EXITING;
                        reg = VMCS_PRI_PROC_BASED_CTLS;
                }
                break;
        case VM_CAP_MTRAP_EXIT:
                if (cap_monitor_trap) {
                        retval = 0;
                        pptr = &vmx->cap[vcpu].proc_ctls;
                        baseval = *pptr;
                        flag = PROCBASED_MTF;
                        reg = VMCS_PRI_PROC_BASED_CTLS;
                }
                break;
        case VM_CAP_PAUSE_EXIT:
                if (cap_pause_exit) {
                        retval = 0;
                        pptr = &vmx->cap[vcpu].proc_ctls;
                        baseval = *pptr;
                        flag = PROCBASED_PAUSE_EXITING;
                        reg = VMCS_PRI_PROC_BASED_CTLS;
                }
                break;
        case VM_CAP_UNRESTRICTED_GUEST:
                if (cap_unrestricted_guest) {
                        retval = 0;
                        pptr = &vmx->cap[vcpu].proc_ctls2;
                        baseval = *pptr;
                        flag = PROCBASED2_UNRESTRICTED_GUEST;
                        reg = VMCS_SEC_PROC_BASED_CTLS;
                }
                break;
        case VM_CAP_ENABLE_INVPCID:
                if (cap_invpcid) {
                        retval = 0;
                        pptr = &vmx->cap[vcpu].proc_ctls2;
                        baseval = *pptr;
                        flag = PROCBASED2_ENABLE_INVPCID;
                        reg = VMCS_SEC_PROC_BASED_CTLS;
                }
                break;
        default:
                break;
        }

        if (retval == 0) {
                if (val) {
                        baseval |= flag;
                } else {
                        baseval &= ~flag;
                }
                VMPTRLD(vmcs);
                error = vmwrite(reg, baseval);
                VMCLEAR(vmcs);

                if (error) {
                        retval = error;
                } else {
                        /*
                         * Update optional stored flags, and record
                         * setting
                         */
                        if (pptr != NULL) {
                                *pptr = baseval;
                        }

                        if (val) {
                                vmx->cap[vcpu].set |= (1 << type);
                        } else {
                                vmx->cap[vcpu].set &= ~(1 << type);
                        }
                }
        }

        return (retval);
}

struct vmm_ops vmm_ops_intel = {
        vmx_init,
        vmx_cleanup,
        vmx_vminit,
        vmx_run,
        vmx_vmcleanup,
        vmx_getreg,
        vmx_setreg,
        vmx_getdesc,
        vmx_setdesc,
        vmx_inject,
        vmx_getcap,
        vmx_setcap,
        ept_vmspace_alloc,
        ept_vmspace_free,
};