vmx.c revision 262350
1/*- 2 * Copyright (c) 2011 NetApp, Inc. 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND 15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17 * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE 18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 24 * SUCH DAMAGE. 25 * 26 * $FreeBSD: stable/10/sys/amd64/vmm/intel/vmx.c 262350 2014-02-23 00:46:05Z jhb $ 27 */ 28 29#include <sys/cdefs.h> 30__FBSDID("$FreeBSD: stable/10/sys/amd64/vmm/intel/vmx.c 262350 2014-02-23 00:46:05Z jhb $"); 31 32#include <sys/param.h> 33#include <sys/systm.h> 34#include <sys/smp.h> 35#include <sys/kernel.h> 36#include <sys/malloc.h> 37#include <sys/pcpu.h> 38#include <sys/proc.h> 39#include <sys/sysctl.h> 40 41#include <vm/vm.h> 42#include <vm/pmap.h> 43 44#include <machine/psl.h> 45#include <machine/cpufunc.h> 46#include <machine/md_var.h> 47#include <machine/segments.h> 48#include <machine/specialreg.h> 49#include <machine/vmparam.h> 50 51#include <machine/vmm.h> 52#include "vmm_host.h" 53#include "vmm_lapic.h" 54#include "vmm_msr.h" 55#include "vmm_ktr.h" 56#include "vmm_stat.h" 57 58#include "vmx_msr.h" 59#include "ept.h" 60#include "vmx_cpufunc.h" 61#include "vmx.h" 62#include "x86.h" 63#include "vmx_controls.h" 64 65#define PINBASED_CTLS_ONE_SETTING \ 66 (PINBASED_EXTINT_EXITING | \ 67 PINBASED_NMI_EXITING | \ 68 PINBASED_VIRTUAL_NMI) 69#define PINBASED_CTLS_ZERO_SETTING 0 70 71#define PROCBASED_CTLS_WINDOW_SETTING \ 72 (PROCBASED_INT_WINDOW_EXITING | \ 73 PROCBASED_NMI_WINDOW_EXITING) 74 75#define PROCBASED_CTLS_ONE_SETTING \ 76 (PROCBASED_SECONDARY_CONTROLS | \ 77 PROCBASED_IO_EXITING | \ 78 PROCBASED_MSR_BITMAPS | \ 79 PROCBASED_CTLS_WINDOW_SETTING) 80#define PROCBASED_CTLS_ZERO_SETTING \ 81 (PROCBASED_CR3_LOAD_EXITING | \ 82 PROCBASED_CR3_STORE_EXITING | \ 83 PROCBASED_IO_BITMAPS) 84 85#define PROCBASED_CTLS2_ONE_SETTING PROCBASED2_ENABLE_EPT 86#define PROCBASED_CTLS2_ZERO_SETTING 0 87 88#define VM_EXIT_CTLS_ONE_SETTING_NO_PAT \ 89 (VM_EXIT_HOST_LMA | \ 90 VM_EXIT_SAVE_EFER | \ 91 VM_EXIT_LOAD_EFER) 92 93#define VM_EXIT_CTLS_ONE_SETTING \ 94 (VM_EXIT_CTLS_ONE_SETTING_NO_PAT | \ 95 VM_EXIT_SAVE_PAT | \ 96 VM_EXIT_LOAD_PAT) 97#define VM_EXIT_CTLS_ZERO_SETTING VM_EXIT_SAVE_DEBUG_CONTROLS 98 99#define VM_ENTRY_CTLS_ONE_SETTING_NO_PAT VM_ENTRY_LOAD_EFER 100 101#define VM_ENTRY_CTLS_ONE_SETTING \ 102 (VM_ENTRY_CTLS_ONE_SETTING_NO_PAT | \ 103 VM_ENTRY_LOAD_PAT) 104#define VM_ENTRY_CTLS_ZERO_SETTING \ 105 (VM_ENTRY_LOAD_DEBUG_CONTROLS | \ 106 VM_ENTRY_INTO_SMM | \ 107 VM_ENTRY_DEACTIVATE_DUAL_MONITOR) 108 109#define guest_msr_rw(vmx, msr) \ 110 msr_bitmap_change_access((vmx)->msr_bitmap, (msr), MSR_BITMAP_ACCESS_RW) 111 112#define HANDLED 1 113#define UNHANDLED 0 114 115MALLOC_DEFINE(M_VMX, "vmx", "vmx"); 116 117SYSCTL_DECL(_hw_vmm); 118SYSCTL_NODE(_hw_vmm, OID_AUTO, vmx, CTLFLAG_RW, NULL, NULL); 119 120int vmxon_enabled[MAXCPU]; 121static char vmxon_region[MAXCPU][PAGE_SIZE] __aligned(PAGE_SIZE); 122 123static uint32_t pinbased_ctls, procbased_ctls, procbased_ctls2; 124static uint32_t exit_ctls, entry_ctls; 125 126static uint64_t cr0_ones_mask, cr0_zeros_mask; 127SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr0_ones_mask, CTLFLAG_RD, 128 &cr0_ones_mask, 0, NULL); 129SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr0_zeros_mask, CTLFLAG_RD, 130 &cr0_zeros_mask, 0, NULL); 131 132static uint64_t cr4_ones_mask, cr4_zeros_mask; 133SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr4_ones_mask, CTLFLAG_RD, 134 &cr4_ones_mask, 0, NULL); 135SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr4_zeros_mask, CTLFLAG_RD, 136 &cr4_zeros_mask, 0, NULL); 137 138static int vmx_no_patmsr; 139 140static int vmx_initialized; 141SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, initialized, CTLFLAG_RD, 142 &vmx_initialized, 0, "Intel VMX initialized"); 143 144/* 145 * Virtual NMI blocking conditions. 146 * 147 * Some processor implementations also require NMI to be blocked if 148 * the STI_BLOCKING bit is set. It is possible to detect this at runtime 149 * based on the (exit_reason,exit_qual) tuple being set to 150 * (EXIT_REASON_INVAL_VMCS, EXIT_QUAL_NMI_WHILE_STI_BLOCKING). 151 * 152 * We take the easy way out and also include STI_BLOCKING as one of the 153 * gating items for vNMI injection. 154 */ 155static uint64_t nmi_blocking_bits = VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING | 156 VMCS_INTERRUPTIBILITY_NMI_BLOCKING | 157 VMCS_INTERRUPTIBILITY_STI_BLOCKING; 158 159/* 160 * Optional capabilities 161 */ 162static int cap_halt_exit; 163static int cap_pause_exit; 164static int cap_unrestricted_guest; 165static int cap_monitor_trap; 166static int cap_invpcid; 167 168static struct unrhdr *vpid_unr; 169static u_int vpid_alloc_failed; 170SYSCTL_UINT(_hw_vmm_vmx, OID_AUTO, vpid_alloc_failed, CTLFLAG_RD, 171 &vpid_alloc_failed, 0, NULL); 172 173#ifdef KTR 174static const char * 175exit_reason_to_str(int reason) 176{ 177 static char reasonbuf[32]; 178 179 switch (reason) { 180 case EXIT_REASON_EXCEPTION: 181 return "exception"; 182 case EXIT_REASON_EXT_INTR: 183 return "extint"; 184 case EXIT_REASON_TRIPLE_FAULT: 185 return "triplefault"; 186 case EXIT_REASON_INIT: 187 return "init"; 188 case EXIT_REASON_SIPI: 189 return "sipi"; 190 case EXIT_REASON_IO_SMI: 191 return "iosmi"; 192 case EXIT_REASON_SMI: 193 return "smi"; 194 case EXIT_REASON_INTR_WINDOW: 195 return "intrwindow"; 196 case EXIT_REASON_NMI_WINDOW: 197 return "nmiwindow"; 198 case EXIT_REASON_TASK_SWITCH: 199 return "taskswitch"; 200 case EXIT_REASON_CPUID: 201 return "cpuid"; 202 case EXIT_REASON_GETSEC: 203 return "getsec"; 204 case EXIT_REASON_HLT: 205 return "hlt"; 206 case EXIT_REASON_INVD: 207 return "invd"; 208 case EXIT_REASON_INVLPG: 209 return "invlpg"; 210 case EXIT_REASON_RDPMC: 211 return "rdpmc"; 212 case EXIT_REASON_RDTSC: 213 return "rdtsc"; 214 case EXIT_REASON_RSM: 215 return "rsm"; 216 case EXIT_REASON_VMCALL: 217 return "vmcall"; 218 case EXIT_REASON_VMCLEAR: 219 return "vmclear"; 220 case EXIT_REASON_VMLAUNCH: 221 return "vmlaunch"; 222 case EXIT_REASON_VMPTRLD: 223 return "vmptrld"; 224 case EXIT_REASON_VMPTRST: 225 return "vmptrst"; 226 case EXIT_REASON_VMREAD: 227 return "vmread"; 228 case EXIT_REASON_VMRESUME: 229 return "vmresume"; 230 case EXIT_REASON_VMWRITE: 231 return "vmwrite"; 232 case EXIT_REASON_VMXOFF: 233 return "vmxoff"; 234 case EXIT_REASON_VMXON: 235 return "vmxon"; 236 case EXIT_REASON_CR_ACCESS: 237 return "craccess"; 238 case EXIT_REASON_DR_ACCESS: 239 return "draccess"; 240 case EXIT_REASON_INOUT: 241 return "inout"; 242 case EXIT_REASON_RDMSR: 243 return "rdmsr"; 244 case EXIT_REASON_WRMSR: 245 return "wrmsr"; 246 case EXIT_REASON_INVAL_VMCS: 247 return "invalvmcs"; 248 case EXIT_REASON_INVAL_MSR: 249 return "invalmsr"; 250 case EXIT_REASON_MWAIT: 251 return "mwait"; 252 case EXIT_REASON_MTF: 253 return "mtf"; 254 case EXIT_REASON_MONITOR: 255 return "monitor"; 256 case EXIT_REASON_PAUSE: 257 return "pause"; 258 case EXIT_REASON_MCE: 259 return "mce"; 260 case EXIT_REASON_TPR: 261 return "tpr"; 262 case EXIT_REASON_APIC: 263 return "apic"; 264 case EXIT_REASON_GDTR_IDTR: 265 return "gdtridtr"; 266 case EXIT_REASON_LDTR_TR: 267 return "ldtrtr"; 268 case EXIT_REASON_EPT_FAULT: 269 return "eptfault"; 270 case EXIT_REASON_EPT_MISCONFIG: 271 return "eptmisconfig"; 272 case EXIT_REASON_INVEPT: 273 return "invept"; 274 case EXIT_REASON_RDTSCP: 275 return "rdtscp"; 276 case EXIT_REASON_VMX_PREEMPT: 277 return "vmxpreempt"; 278 case EXIT_REASON_INVVPID: 279 return "invvpid"; 280 case EXIT_REASON_WBINVD: 281 return "wbinvd"; 282 case EXIT_REASON_XSETBV: 283 return "xsetbv"; 284 default: 285 snprintf(reasonbuf, sizeof(reasonbuf), "%d", reason); 286 return (reasonbuf); 287 } 288} 289 290#ifdef SETJMP_TRACE 291static const char * 292vmx_setjmp_rc2str(int rc) 293{ 294 switch (rc) { 295 case VMX_RETURN_DIRECT: 296 return "direct"; 297 case VMX_RETURN_LONGJMP: 298 return "longjmp"; 299 case VMX_RETURN_VMRESUME: 300 return "vmresume"; 301 case VMX_RETURN_VMLAUNCH: 302 return "vmlaunch"; 303 case VMX_RETURN_AST: 304 return "ast"; 305 default: 306 return "unknown"; 307 } 308} 309 310#define SETJMP_TRACE(vmx, vcpu, vmxctx, regname) \ 311 VCPU_CTR1((vmx)->vm, (vcpu), "setjmp trace " #regname " 0x%016lx", \ 312 (vmxctx)->regname) 313 314static void 315vmx_setjmp_trace(struct vmx *vmx, int vcpu, struct vmxctx *vmxctx, int rc) 316{ 317 uint64_t host_rip, host_rsp; 318 319 if (vmxctx != &vmx->ctx[vcpu]) 320 panic("vmx_setjmp_trace: invalid vmxctx %p; should be %p", 321 vmxctx, &vmx->ctx[vcpu]); 322 323 VCPU_CTR1((vmx)->vm, (vcpu), "vmxctx = %p", vmxctx); 324 VCPU_CTR2((vmx)->vm, (vcpu), "setjmp return code %s(%d)", 325 vmx_setjmp_rc2str(rc), rc); 326 327 host_rsp = host_rip = ~0; 328 vmread(VMCS_HOST_RIP, &host_rip); 329 vmread(VMCS_HOST_RSP, &host_rsp); 330 VCPU_CTR2((vmx)->vm, (vcpu), "vmcs host_rip 0x%016lx, host_rsp %#lx", 331 host_rip, host_rsp); 332 333 SETJMP_TRACE(vmx, vcpu, vmxctx, host_r15); 334 SETJMP_TRACE(vmx, vcpu, vmxctx, host_r14); 335 SETJMP_TRACE(vmx, vcpu, vmxctx, host_r13); 336 SETJMP_TRACE(vmx, vcpu, vmxctx, host_r12); 337 SETJMP_TRACE(vmx, vcpu, vmxctx, host_rbp); 338 SETJMP_TRACE(vmx, vcpu, vmxctx, host_rsp); 339 SETJMP_TRACE(vmx, vcpu, vmxctx, host_rbx); 340 SETJMP_TRACE(vmx, vcpu, vmxctx, host_rip); 341 342 SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rdi); 343 SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rsi); 344 SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rdx); 345 SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rcx); 346 SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r8); 347 SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r9); 348 SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rax); 349 SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rbx); 350 SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rbp); 351 SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r10); 352 SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r11); 353 SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r12); 354 SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r13); 355 SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r14); 356 SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r15); 357 SETJMP_TRACE(vmx, vcpu, vmxctx, guest_cr2); 358} 359#endif 360#else 361static void __inline 362vmx_setjmp_trace(struct vmx *vmx, int vcpu, struct vmxctx *vmxctx, int rc) 363{ 364 return; 365} 366#endif /* KTR */ 367 368u_long 369vmx_fix_cr0(u_long cr0) 370{ 371 372 return ((cr0 | cr0_ones_mask) & ~cr0_zeros_mask); 373} 374 375u_long 376vmx_fix_cr4(u_long cr4) 377{ 378 379 return ((cr4 | cr4_ones_mask) & ~cr4_zeros_mask); 380} 381 382static void 383vpid_free(int vpid) 384{ 385 if (vpid < 0 || vpid > 0xffff) 386 panic("vpid_free: invalid vpid %d", vpid); 387 388 /* 389 * VPIDs [0,VM_MAXCPU] are special and are not allocated from 390 * the unit number allocator. 391 */ 392 393 if (vpid > VM_MAXCPU) 394 free_unr(vpid_unr, vpid); 395} 396 397static void 398vpid_alloc(uint16_t *vpid, int num) 399{ 400 int i, x; 401 402 if (num <= 0 || num > VM_MAXCPU) 403 panic("invalid number of vpids requested: %d", num); 404 405 /* 406 * If the "enable vpid" execution control is not enabled then the 407 * VPID is required to be 0 for all vcpus. 408 */ 409 if ((procbased_ctls2 & PROCBASED2_ENABLE_VPID) == 0) { 410 for (i = 0; i < num; i++) 411 vpid[i] = 0; 412 return; 413 } 414 415 /* 416 * Allocate a unique VPID for each vcpu from the unit number allocator. 417 */ 418 for (i = 0; i < num; i++) { 419 x = alloc_unr(vpid_unr); 420 if (x == -1) 421 break; 422 else 423 vpid[i] = x; 424 } 425 426 if (i < num) { 427 atomic_add_int(&vpid_alloc_failed, 1); 428 429 /* 430 * If the unit number allocator does not have enough unique 431 * VPIDs then we need to allocate from the [1,VM_MAXCPU] range. 432 * 433 * These VPIDs are not be unique across VMs but this does not 434 * affect correctness because the combined mappings are also 435 * tagged with the EP4TA which is unique for each VM. 436 * 437 * It is still sub-optimal because the invvpid will invalidate 438 * combined mappings for a particular VPID across all EP4TAs. 439 */ 440 while (i-- > 0) 441 vpid_free(vpid[i]); 442 443 for (i = 0; i < num; i++) 444 vpid[i] = i + 1; 445 } 446} 447 448static void 449vpid_init(void) 450{ 451 /* 452 * VPID 0 is required when the "enable VPID" execution control is 453 * disabled. 454 * 455 * VPIDs [1,VM_MAXCPU] are used as the "overflow namespace" when the 456 * unit number allocator does not have sufficient unique VPIDs to 457 * satisfy the allocation. 458 * 459 * The remaining VPIDs are managed by the unit number allocator. 460 */ 461 vpid_unr = new_unrhdr(VM_MAXCPU + 1, 0xffff, NULL); 462} 463 464static void 465msr_save_area_init(struct msr_entry *g_area, int *g_count) 466{ 467 int cnt; 468 469 static struct msr_entry guest_msrs[] = { 470 { MSR_KGSBASE, 0, 0 }, 471 }; 472 473 cnt = sizeof(guest_msrs) / sizeof(guest_msrs[0]); 474 if (cnt > GUEST_MSR_MAX_ENTRIES) 475 panic("guest msr save area overrun"); 476 bcopy(guest_msrs, g_area, sizeof(guest_msrs)); 477 *g_count = cnt; 478} 479 480static void 481vmx_disable(void *arg __unused) 482{ 483 struct invvpid_desc invvpid_desc = { 0 }; 484 struct invept_desc invept_desc = { 0 }; 485 486 if (vmxon_enabled[curcpu]) { 487 /* 488 * See sections 25.3.3.3 and 25.3.3.4 in Intel Vol 3b. 489 * 490 * VMXON or VMXOFF are not required to invalidate any TLB 491 * caching structures. This prevents potential retention of 492 * cached information in the TLB between distinct VMX episodes. 493 */ 494 invvpid(INVVPID_TYPE_ALL_CONTEXTS, invvpid_desc); 495 invept(INVEPT_TYPE_ALL_CONTEXTS, invept_desc); 496 vmxoff(); 497 } 498 load_cr4(rcr4() & ~CR4_VMXE); 499} 500 501static int 502vmx_cleanup(void) 503{ 504 505 if (vpid_unr != NULL) { 506 delete_unrhdr(vpid_unr); 507 vpid_unr = NULL; 508 } 509 510 smp_rendezvous(NULL, vmx_disable, NULL, NULL); 511 512 return (0); 513} 514 515static void 516vmx_enable(void *arg __unused) 517{ 518 int error; 519 520 load_cr4(rcr4() | CR4_VMXE); 521 522 *(uint32_t *)vmxon_region[curcpu] = vmx_revision(); 523 error = vmxon(vmxon_region[curcpu]); 524 if (error == 0) 525 vmxon_enabled[curcpu] = 1; 526} 527 528static void 529vmx_restore(void) 530{ 531 532 if (vmxon_enabled[curcpu]) 533 vmxon(vmxon_region[curcpu]); 534} 535 536static int 537vmx_init(void) 538{ 539 int error; 540 uint64_t fixed0, fixed1, feature_control; 541 uint32_t tmp; 542 543 /* CPUID.1:ECX[bit 5] must be 1 for processor to support VMX */ 544 if (!(cpu_feature2 & CPUID2_VMX)) { 545 printf("vmx_init: processor does not support VMX operation\n"); 546 return (ENXIO); 547 } 548 549 /* 550 * Verify that MSR_IA32_FEATURE_CONTROL lock and VMXON enable bits 551 * are set (bits 0 and 2 respectively). 552 */ 553 feature_control = rdmsr(MSR_IA32_FEATURE_CONTROL); 554 if ((feature_control & IA32_FEATURE_CONTROL_LOCK) == 0 || 555 (feature_control & IA32_FEATURE_CONTROL_VMX_EN) == 0) { 556 printf("vmx_init: VMX operation disabled by BIOS\n"); 557 return (ENXIO); 558 } 559 560 /* Check support for primary processor-based VM-execution controls */ 561 error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, 562 MSR_VMX_TRUE_PROCBASED_CTLS, 563 PROCBASED_CTLS_ONE_SETTING, 564 PROCBASED_CTLS_ZERO_SETTING, &procbased_ctls); 565 if (error) { 566 printf("vmx_init: processor does not support desired primary " 567 "processor-based controls\n"); 568 return (error); 569 } 570 571 /* Clear the processor-based ctl bits that are set on demand */ 572 procbased_ctls &= ~PROCBASED_CTLS_WINDOW_SETTING; 573 574 /* Check support for secondary processor-based VM-execution controls */ 575 error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, 576 MSR_VMX_PROCBASED_CTLS2, 577 PROCBASED_CTLS2_ONE_SETTING, 578 PROCBASED_CTLS2_ZERO_SETTING, &procbased_ctls2); 579 if (error) { 580 printf("vmx_init: processor does not support desired secondary " 581 "processor-based controls\n"); 582 return (error); 583 } 584 585 /* Check support for VPID */ 586 error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2, 587 PROCBASED2_ENABLE_VPID, 0, &tmp); 588 if (error == 0) 589 procbased_ctls2 |= PROCBASED2_ENABLE_VPID; 590 591 /* Check support for pin-based VM-execution controls */ 592 error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS, 593 MSR_VMX_TRUE_PINBASED_CTLS, 594 PINBASED_CTLS_ONE_SETTING, 595 PINBASED_CTLS_ZERO_SETTING, &pinbased_ctls); 596 if (error) { 597 printf("vmx_init: processor does not support desired " 598 "pin-based controls\n"); 599 return (error); 600 } 601 602 /* Check support for VM-exit controls */ 603 error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS, MSR_VMX_TRUE_EXIT_CTLS, 604 VM_EXIT_CTLS_ONE_SETTING, 605 VM_EXIT_CTLS_ZERO_SETTING, 606 &exit_ctls); 607 if (error) { 608 /* Try again without the PAT MSR bits */ 609 error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS, 610 MSR_VMX_TRUE_EXIT_CTLS, 611 VM_EXIT_CTLS_ONE_SETTING_NO_PAT, 612 VM_EXIT_CTLS_ZERO_SETTING, 613 &exit_ctls); 614 if (error) { 615 printf("vmx_init: processor does not support desired " 616 "exit controls\n"); 617 return (error); 618 } else { 619 if (bootverbose) 620 printf("vmm: PAT MSR access not supported\n"); 621 guest_msr_valid(MSR_PAT); 622 vmx_no_patmsr = 1; 623 } 624 } 625 626 /* Check support for VM-entry controls */ 627 if (!vmx_no_patmsr) { 628 error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS, 629 MSR_VMX_TRUE_ENTRY_CTLS, 630 VM_ENTRY_CTLS_ONE_SETTING, 631 VM_ENTRY_CTLS_ZERO_SETTING, 632 &entry_ctls); 633 } else { 634 error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS, 635 MSR_VMX_TRUE_ENTRY_CTLS, 636 VM_ENTRY_CTLS_ONE_SETTING_NO_PAT, 637 VM_ENTRY_CTLS_ZERO_SETTING, 638 &entry_ctls); 639 } 640 641 if (error) { 642 printf("vmx_init: processor does not support desired " 643 "entry controls\n"); 644 return (error); 645 } 646 647 /* 648 * Check support for optional features by testing them 649 * as individual bits 650 */ 651 cap_halt_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, 652 MSR_VMX_TRUE_PROCBASED_CTLS, 653 PROCBASED_HLT_EXITING, 0, 654 &tmp) == 0); 655 656 cap_monitor_trap = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, 657 MSR_VMX_PROCBASED_CTLS, 658 PROCBASED_MTF, 0, 659 &tmp) == 0); 660 661 cap_pause_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, 662 MSR_VMX_TRUE_PROCBASED_CTLS, 663 PROCBASED_PAUSE_EXITING, 0, 664 &tmp) == 0); 665 666 cap_unrestricted_guest = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, 667 MSR_VMX_PROCBASED_CTLS2, 668 PROCBASED2_UNRESTRICTED_GUEST, 0, 669 &tmp) == 0); 670 671 cap_invpcid = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, 672 MSR_VMX_PROCBASED_CTLS2, PROCBASED2_ENABLE_INVPCID, 0, 673 &tmp) == 0); 674 675 676 /* Initialize EPT */ 677 error = ept_init(); 678 if (error) { 679 printf("vmx_init: ept initialization failed (%d)\n", error); 680 return (error); 681 } 682 683 /* 684 * Stash the cr0 and cr4 bits that must be fixed to 0 or 1 685 */ 686 fixed0 = rdmsr(MSR_VMX_CR0_FIXED0); 687 fixed1 = rdmsr(MSR_VMX_CR0_FIXED1); 688 cr0_ones_mask = fixed0 & fixed1; 689 cr0_zeros_mask = ~fixed0 & ~fixed1; 690 691 /* 692 * CR0_PE and CR0_PG can be set to zero in VMX non-root operation 693 * if unrestricted guest execution is allowed. 694 */ 695 if (cap_unrestricted_guest) 696 cr0_ones_mask &= ~(CR0_PG | CR0_PE); 697 698 /* 699 * Do not allow the guest to set CR0_NW or CR0_CD. 700 */ 701 cr0_zeros_mask |= (CR0_NW | CR0_CD); 702 703 fixed0 = rdmsr(MSR_VMX_CR4_FIXED0); 704 fixed1 = rdmsr(MSR_VMX_CR4_FIXED1); 705 cr4_ones_mask = fixed0 & fixed1; 706 cr4_zeros_mask = ~fixed0 & ~fixed1; 707 708 vpid_init(); 709 710 /* enable VMX operation */ 711 smp_rendezvous(NULL, vmx_enable, NULL, NULL); 712 713 vmx_initialized = 1; 714 715 return (0); 716} 717 718static int 719vmx_setup_cr_shadow(int which, struct vmcs *vmcs, uint32_t initial) 720{ 721 int error, mask_ident, shadow_ident; 722 uint64_t mask_value; 723 724 if (which != 0 && which != 4) 725 panic("vmx_setup_cr_shadow: unknown cr%d", which); 726 727 if (which == 0) { 728 mask_ident = VMCS_CR0_MASK; 729 mask_value = cr0_ones_mask | cr0_zeros_mask; 730 shadow_ident = VMCS_CR0_SHADOW; 731 } else { 732 mask_ident = VMCS_CR4_MASK; 733 mask_value = cr4_ones_mask | cr4_zeros_mask; 734 shadow_ident = VMCS_CR4_SHADOW; 735 } 736 737 error = vmcs_setreg(vmcs, 0, VMCS_IDENT(mask_ident), mask_value); 738 if (error) 739 return (error); 740 741 error = vmcs_setreg(vmcs, 0, VMCS_IDENT(shadow_ident), initial); 742 if (error) 743 return (error); 744 745 return (0); 746} 747#define vmx_setup_cr0_shadow(vmcs,init) vmx_setup_cr_shadow(0, (vmcs), (init)) 748#define vmx_setup_cr4_shadow(vmcs,init) vmx_setup_cr_shadow(4, (vmcs), (init)) 749 750static void * 751vmx_vminit(struct vm *vm, pmap_t pmap) 752{ 753 uint16_t vpid[VM_MAXCPU]; 754 int i, error, guest_msr_count; 755 struct vmx *vmx; 756 757 vmx = malloc(sizeof(struct vmx), M_VMX, M_WAITOK | M_ZERO); 758 if ((uintptr_t)vmx & PAGE_MASK) { 759 panic("malloc of struct vmx not aligned on %d byte boundary", 760 PAGE_SIZE); 761 } 762 vmx->vm = vm; 763 764 vmx->eptp = eptp(vtophys((vm_offset_t)pmap->pm_pml4)); 765 766 /* 767 * Clean up EPTP-tagged guest physical and combined mappings 768 * 769 * VMX transitions are not required to invalidate any guest physical 770 * mappings. So, it may be possible for stale guest physical mappings 771 * to be present in the processor TLBs. 772 * 773 * Combined mappings for this EP4TA are also invalidated for all VPIDs. 774 */ 775 ept_invalidate_mappings(vmx->eptp); 776 777 msr_bitmap_initialize(vmx->msr_bitmap); 778 779 /* 780 * It is safe to allow direct access to MSR_GSBASE and MSR_FSBASE. 781 * The guest FSBASE and GSBASE are saved and restored during 782 * vm-exit and vm-entry respectively. The host FSBASE and GSBASE are 783 * always restored from the vmcs host state area on vm-exit. 784 * 785 * The SYSENTER_CS/ESP/EIP MSRs are identical to FS/GSBASE in 786 * how they are saved/restored so can be directly accessed by the 787 * guest. 788 * 789 * Guest KGSBASE is saved and restored in the guest MSR save area. 790 * Host KGSBASE is restored before returning to userland from the pcb. 791 * There will be a window of time when we are executing in the host 792 * kernel context with a value of KGSBASE from the guest. This is ok 793 * because the value of KGSBASE is inconsequential in kernel context. 794 * 795 * MSR_EFER is saved and restored in the guest VMCS area on a 796 * VM exit and entry respectively. It is also restored from the 797 * host VMCS area on a VM exit. 798 */ 799 if (guest_msr_rw(vmx, MSR_GSBASE) || 800 guest_msr_rw(vmx, MSR_FSBASE) || 801 guest_msr_rw(vmx, MSR_SYSENTER_CS_MSR) || 802 guest_msr_rw(vmx, MSR_SYSENTER_ESP_MSR) || 803 guest_msr_rw(vmx, MSR_SYSENTER_EIP_MSR) || 804 guest_msr_rw(vmx, MSR_KGSBASE) || 805 guest_msr_rw(vmx, MSR_EFER)) 806 panic("vmx_vminit: error setting guest msr access"); 807 808 /* 809 * MSR_PAT is saved and restored in the guest VMCS are on a VM exit 810 * and entry respectively. It is also restored from the host VMCS 811 * area on a VM exit. However, if running on a system with no 812 * MSR_PAT save/restore support, leave access disabled so accesses 813 * will be trapped. 814 */ 815 if (!vmx_no_patmsr && guest_msr_rw(vmx, MSR_PAT)) 816 panic("vmx_vminit: error setting guest pat msr access"); 817 818 vpid_alloc(vpid, VM_MAXCPU); 819 820 for (i = 0; i < VM_MAXCPU; i++) { 821 vmx->vmcs[i].identifier = vmx_revision(); 822 error = vmclear(&vmx->vmcs[i]); 823 if (error != 0) { 824 panic("vmx_vminit: vmclear error %d on vcpu %d\n", 825 error, i); 826 } 827 828 error = vmcs_set_defaults(&vmx->vmcs[i], 829 (u_long)vmx_longjmp, 830 (u_long)&vmx->ctx[i], 831 vmx->eptp, 832 pinbased_ctls, 833 procbased_ctls, 834 procbased_ctls2, 835 exit_ctls, entry_ctls, 836 vtophys(vmx->msr_bitmap), 837 vpid[i]); 838 839 if (error != 0) 840 panic("vmx_vminit: vmcs_set_defaults error %d", error); 841 842 vmx->cap[i].set = 0; 843 vmx->cap[i].proc_ctls = procbased_ctls; 844 vmx->cap[i].proc_ctls2 = procbased_ctls2; 845 846 vmx->state[i].lastcpu = -1; 847 vmx->state[i].vpid = vpid[i]; 848 849 msr_save_area_init(vmx->guest_msrs[i], &guest_msr_count); 850 851 error = vmcs_set_msr_save(&vmx->vmcs[i], 852 vtophys(vmx->guest_msrs[i]), 853 guest_msr_count); 854 if (error != 0) 855 panic("vmcs_set_msr_save error %d", error); 856 857 /* 858 * Set up the CR0/4 shadows, and init the read shadow 859 * to the power-on register value from the Intel Sys Arch. 860 * CR0 - 0x60000010 861 * CR4 - 0 862 */ 863 error = vmx_setup_cr0_shadow(&vmx->vmcs[i], 0x60000010); 864 if (error != 0) 865 panic("vmx_setup_cr0_shadow %d", error); 866 867 error = vmx_setup_cr4_shadow(&vmx->vmcs[i], 0); 868 if (error != 0) 869 panic("vmx_setup_cr4_shadow %d", error); 870 871 vmx->ctx[i].pmap = pmap; 872 vmx->ctx[i].eptp = vmx->eptp; 873 } 874 875 return (vmx); 876} 877 878static int 879vmx_handle_cpuid(struct vm *vm, int vcpu, struct vmxctx *vmxctx) 880{ 881 int handled, func; 882 883 func = vmxctx->guest_rax; 884 885 handled = x86_emulate_cpuid(vm, vcpu, 886 (uint32_t*)(&vmxctx->guest_rax), 887 (uint32_t*)(&vmxctx->guest_rbx), 888 (uint32_t*)(&vmxctx->guest_rcx), 889 (uint32_t*)(&vmxctx->guest_rdx)); 890 return (handled); 891} 892 893static __inline void 894vmx_run_trace(struct vmx *vmx, int vcpu) 895{ 896#ifdef KTR 897 VCPU_CTR1(vmx->vm, vcpu, "Resume execution at %#lx", vmcs_guest_rip()); 898#endif 899} 900 901static __inline void 902vmx_exit_trace(struct vmx *vmx, int vcpu, uint64_t rip, uint32_t exit_reason, 903 int handled) 904{ 905#ifdef KTR 906 VCPU_CTR3(vmx->vm, vcpu, "%s %s vmexit at 0x%0lx", 907 handled ? "handled" : "unhandled", 908 exit_reason_to_str(exit_reason), rip); 909#endif 910} 911 912static __inline void 913vmx_astpending_trace(struct vmx *vmx, int vcpu, uint64_t rip) 914{ 915#ifdef KTR 916 VCPU_CTR1(vmx->vm, vcpu, "astpending vmexit at 0x%0lx", rip); 917#endif 918} 919 920static int 921vmx_set_pcpu_defaults(struct vmx *vmx, int vcpu) 922{ 923 int error, lastcpu; 924 struct vmxstate *vmxstate; 925 struct invvpid_desc invvpid_desc = { 0 }; 926 927 vmxstate = &vmx->state[vcpu]; 928 lastcpu = vmxstate->lastcpu; 929 vmxstate->lastcpu = curcpu; 930 931 if (lastcpu == curcpu) { 932 error = 0; 933 goto done; 934 } 935 936 vmm_stat_incr(vmx->vm, vcpu, VCPU_MIGRATIONS, 1); 937 938 error = vmwrite(VMCS_HOST_TR_BASE, vmm_get_host_trbase()); 939 if (error != 0) 940 goto done; 941 942 error = vmwrite(VMCS_HOST_GDTR_BASE, vmm_get_host_gdtrbase()); 943 if (error != 0) 944 goto done; 945 946 error = vmwrite(VMCS_HOST_GS_BASE, vmm_get_host_gsbase()); 947 if (error != 0) 948 goto done; 949 950 /* 951 * If we are using VPIDs then invalidate all mappings tagged with 'vpid' 952 * 953 * We do this because this vcpu was executing on a different host 954 * cpu when it last ran. We do not track whether it invalidated 955 * mappings associated with its 'vpid' during that run. So we must 956 * assume that the mappings associated with 'vpid' on 'curcpu' are 957 * stale and invalidate them. 958 * 959 * Note that we incur this penalty only when the scheduler chooses to 960 * move the thread associated with this vcpu between host cpus. 961 * 962 * Note also that this will invalidate mappings tagged with 'vpid' 963 * for "all" EP4TAs. 964 */ 965 if (vmxstate->vpid != 0) { 966 invvpid_desc.vpid = vmxstate->vpid; 967 invvpid(INVVPID_TYPE_SINGLE_CONTEXT, invvpid_desc); 968 } 969done: 970 return (error); 971} 972 973static void 974vm_exit_update_rip(struct vm_exit *vmexit) 975{ 976 int error; 977 978 error = vmwrite(VMCS_GUEST_RIP, vmexit->rip + vmexit->inst_length); 979 if (error) 980 panic("vmx_run: error %d writing to VMCS_GUEST_RIP", error); 981} 982 983/* 984 * We depend on 'procbased_ctls' to have the Interrupt Window Exiting bit set. 985 */ 986CTASSERT((PROCBASED_CTLS_ONE_SETTING & PROCBASED_INT_WINDOW_EXITING) != 0); 987 988static void __inline 989vmx_set_int_window_exiting(struct vmx *vmx, int vcpu) 990{ 991 int error; 992 993 vmx->cap[vcpu].proc_ctls |= PROCBASED_INT_WINDOW_EXITING; 994 995 error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); 996 if (error) 997 panic("vmx_set_int_window_exiting: vmwrite error %d", error); 998} 999 1000static void __inline 1001vmx_clear_int_window_exiting(struct vmx *vmx, int vcpu) 1002{ 1003 int error; 1004 1005 vmx->cap[vcpu].proc_ctls &= ~PROCBASED_INT_WINDOW_EXITING; 1006 1007 error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); 1008 if (error) 1009 panic("vmx_clear_int_window_exiting: vmwrite error %d", error); 1010} 1011 1012static void __inline 1013vmx_set_nmi_window_exiting(struct vmx *vmx, int vcpu) 1014{ 1015 int error; 1016 1017 vmx->cap[vcpu].proc_ctls |= PROCBASED_NMI_WINDOW_EXITING; 1018 1019 error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); 1020 if (error) 1021 panic("vmx_set_nmi_window_exiting: vmwrite error %d", error); 1022} 1023 1024static void __inline 1025vmx_clear_nmi_window_exiting(struct vmx *vmx, int vcpu) 1026{ 1027 int error; 1028 1029 vmx->cap[vcpu].proc_ctls &= ~PROCBASED_NMI_WINDOW_EXITING; 1030 1031 error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); 1032 if (error) 1033 panic("vmx_clear_nmi_window_exiting: vmwrite error %d", error); 1034} 1035 1036static int 1037vmx_inject_nmi(struct vmx *vmx, int vcpu) 1038{ 1039 int error; 1040 uint64_t info, interruptibility; 1041 1042 /* Bail out if no NMI requested */ 1043 if (!vm_nmi_pending(vmx->vm, vcpu)) 1044 return (0); 1045 1046 error = vmread(VMCS_GUEST_INTERRUPTIBILITY, &interruptibility); 1047 if (error) { 1048 panic("vmx_inject_nmi: vmread(interruptibility) %d", 1049 error); 1050 } 1051 if (interruptibility & nmi_blocking_bits) 1052 goto nmiblocked; 1053 1054 /* 1055 * Inject the virtual NMI. The vector must be the NMI IDT entry 1056 * or the VMCS entry check will fail. 1057 */ 1058 info = VMCS_INTERRUPTION_INFO_NMI | VMCS_INTERRUPTION_INFO_VALID; 1059 info |= IDT_NMI; 1060 1061 error = vmwrite(VMCS_ENTRY_INTR_INFO, info); 1062 if (error) 1063 panic("vmx_inject_nmi: vmwrite(intrinfo) %d", error); 1064 1065 VCPU_CTR0(vmx->vm, vcpu, "Injecting vNMI"); 1066 1067 /* Clear the request */ 1068 vm_nmi_clear(vmx->vm, vcpu); 1069 return (1); 1070 1071nmiblocked: 1072 /* 1073 * Set the NMI Window Exiting execution control so we can inject 1074 * the virtual NMI as soon as blocking condition goes away. 1075 */ 1076 vmx_set_nmi_window_exiting(vmx, vcpu); 1077 1078 VCPU_CTR0(vmx->vm, vcpu, "Enabling NMI window exiting"); 1079 return (1); 1080} 1081 1082static void 1083vmx_inject_interrupts(struct vmx *vmx, int vcpu) 1084{ 1085 int error, vector; 1086 uint64_t info, rflags, interruptibility; 1087 1088 const int HWINTR_BLOCKED = VMCS_INTERRUPTIBILITY_STI_BLOCKING | 1089 VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING; 1090 1091 /* 1092 * If there is already an interrupt pending then just return. 1093 * 1094 * This could happen if an interrupt was injected on a prior 1095 * VM entry but the actual entry into guest mode was aborted 1096 * because of a pending AST. 1097 */ 1098 error = vmread(VMCS_ENTRY_INTR_INFO, &info); 1099 if (error) 1100 panic("vmx_inject_interrupts: vmread(intrinfo) %d", error); 1101 if (info & VMCS_INTERRUPTION_INFO_VALID) 1102 return; 1103 1104 /* 1105 * NMI injection has priority so deal with those first 1106 */ 1107 if (vmx_inject_nmi(vmx, vcpu)) 1108 return; 1109 1110 /* Ask the local apic for a vector to inject */ 1111 vector = lapic_pending_intr(vmx->vm, vcpu); 1112 if (vector < 0) 1113 return; 1114 1115 if (vector < 32 || vector > 255) 1116 panic("vmx_inject_interrupts: invalid vector %d\n", vector); 1117 1118 /* Check RFLAGS.IF and the interruptibility state of the guest */ 1119 error = vmread(VMCS_GUEST_RFLAGS, &rflags); 1120 if (error) 1121 panic("vmx_inject_interrupts: vmread(rflags) %d", error); 1122 1123 if ((rflags & PSL_I) == 0) 1124 goto cantinject; 1125 1126 error = vmread(VMCS_GUEST_INTERRUPTIBILITY, &interruptibility); 1127 if (error) { 1128 panic("vmx_inject_interrupts: vmread(interruptibility) %d", 1129 error); 1130 } 1131 if (interruptibility & HWINTR_BLOCKED) 1132 goto cantinject; 1133 1134 /* Inject the interrupt */ 1135 info = VMCS_INTERRUPTION_INFO_HW_INTR | VMCS_INTERRUPTION_INFO_VALID; 1136 info |= vector; 1137 error = vmwrite(VMCS_ENTRY_INTR_INFO, info); 1138 if (error) 1139 panic("vmx_inject_interrupts: vmwrite(intrinfo) %d", error); 1140 1141 /* Update the Local APIC ISR */ 1142 lapic_intr_accepted(vmx->vm, vcpu, vector); 1143 1144 VCPU_CTR1(vmx->vm, vcpu, "Injecting hwintr at vector %d", vector); 1145 1146 return; 1147 1148cantinject: 1149 /* 1150 * Set the Interrupt Window Exiting execution control so we can inject 1151 * the interrupt as soon as blocking condition goes away. 1152 */ 1153 vmx_set_int_window_exiting(vmx, vcpu); 1154 1155 VCPU_CTR0(vmx->vm, vcpu, "Enabling interrupt window exiting"); 1156} 1157 1158static int 1159vmx_emulate_cr_access(struct vmx *vmx, int vcpu, uint64_t exitqual) 1160{ 1161 int error, cr, vmcs_guest_cr, vmcs_shadow_cr; 1162 uint64_t crval, regval, ones_mask, zeros_mask; 1163 const struct vmxctx *vmxctx; 1164 1165 /* We only handle mov to %cr0 or %cr4 at this time */ 1166 if ((exitqual & 0xf0) != 0x00) 1167 return (UNHANDLED); 1168 1169 cr = exitqual & 0xf; 1170 if (cr != 0 && cr != 4) 1171 return (UNHANDLED); 1172 1173 vmxctx = &vmx->ctx[vcpu]; 1174 1175 /* 1176 * We must use vmwrite() directly here because vmcs_setreg() will 1177 * call vmclear(vmcs) as a side-effect which we certainly don't want. 1178 */ 1179 switch ((exitqual >> 8) & 0xf) { 1180 case 0: 1181 regval = vmxctx->guest_rax; 1182 break; 1183 case 1: 1184 regval = vmxctx->guest_rcx; 1185 break; 1186 case 2: 1187 regval = vmxctx->guest_rdx; 1188 break; 1189 case 3: 1190 regval = vmxctx->guest_rbx; 1191 break; 1192 case 4: 1193 error = vmread(VMCS_GUEST_RSP, ®val); 1194 if (error) { 1195 panic("vmx_emulate_cr_access: " 1196 "error %d reading guest rsp", error); 1197 } 1198 break; 1199 case 5: 1200 regval = vmxctx->guest_rbp; 1201 break; 1202 case 6: 1203 regval = vmxctx->guest_rsi; 1204 break; 1205 case 7: 1206 regval = vmxctx->guest_rdi; 1207 break; 1208 case 8: 1209 regval = vmxctx->guest_r8; 1210 break; 1211 case 9: 1212 regval = vmxctx->guest_r9; 1213 break; 1214 case 10: 1215 regval = vmxctx->guest_r10; 1216 break; 1217 case 11: 1218 regval = vmxctx->guest_r11; 1219 break; 1220 case 12: 1221 regval = vmxctx->guest_r12; 1222 break; 1223 case 13: 1224 regval = vmxctx->guest_r13; 1225 break; 1226 case 14: 1227 regval = vmxctx->guest_r14; 1228 break; 1229 case 15: 1230 regval = vmxctx->guest_r15; 1231 break; 1232 } 1233 1234 if (cr == 0) { 1235 ones_mask = cr0_ones_mask; 1236 zeros_mask = cr0_zeros_mask; 1237 vmcs_guest_cr = VMCS_GUEST_CR0; 1238 vmcs_shadow_cr = VMCS_CR0_SHADOW; 1239 } else { 1240 ones_mask = cr4_ones_mask; 1241 zeros_mask = cr4_zeros_mask; 1242 vmcs_guest_cr = VMCS_GUEST_CR4; 1243 vmcs_shadow_cr = VMCS_CR4_SHADOW; 1244 } 1245 1246 error = vmwrite(vmcs_shadow_cr, regval); 1247 if (error) { 1248 panic("vmx_emulate_cr_access: error %d writing cr%d shadow", 1249 error, cr); 1250 } 1251 1252 crval = regval | ones_mask; 1253 crval &= ~zeros_mask; 1254 error = vmwrite(vmcs_guest_cr, crval); 1255 if (error) { 1256 panic("vmx_emulate_cr_access: error %d writing cr%d", 1257 error, cr); 1258 } 1259 1260 if (cr == 0 && regval & CR0_PG) { 1261 uint64_t efer, entry_ctls; 1262 1263 /* 1264 * If CR0.PG is 1 and EFER.LME is 1 then EFER.LMA and 1265 * the "IA-32e mode guest" bit in VM-entry control must be 1266 * equal. 1267 */ 1268 error = vmread(VMCS_GUEST_IA32_EFER, &efer); 1269 if (error) { 1270 panic("vmx_emulate_cr_access: error %d efer read", 1271 error); 1272 } 1273 if (efer & EFER_LME) { 1274 efer |= EFER_LMA; 1275 error = vmwrite(VMCS_GUEST_IA32_EFER, efer); 1276 if (error) { 1277 panic("vmx_emulate_cr_access: error %d" 1278 " efer write", error); 1279 } 1280 error = vmread(VMCS_ENTRY_CTLS, &entry_ctls); 1281 if (error) { 1282 panic("vmx_emulate_cr_access: error %d" 1283 " entry ctls read", error); 1284 } 1285 entry_ctls |= VM_ENTRY_GUEST_LMA; 1286 error = vmwrite(VMCS_ENTRY_CTLS, entry_ctls); 1287 if (error) { 1288 panic("vmx_emulate_cr_access: error %d" 1289 " entry ctls write", error); 1290 } 1291 } 1292 } 1293 1294 return (HANDLED); 1295} 1296 1297static int 1298ept_fault_type(uint64_t ept_qual) 1299{ 1300 int fault_type; 1301 1302 if (ept_qual & EPT_VIOLATION_DATA_WRITE) 1303 fault_type = VM_PROT_WRITE; 1304 else if (ept_qual & EPT_VIOLATION_INST_FETCH) 1305 fault_type = VM_PROT_EXECUTE; 1306 else 1307 fault_type= VM_PROT_READ; 1308 1309 return (fault_type); 1310} 1311 1312static int 1313ept_protection(uint64_t ept_qual) 1314{ 1315 int prot = 0; 1316 1317 if (ept_qual & EPT_VIOLATION_GPA_READABLE) 1318 prot |= VM_PROT_READ; 1319 if (ept_qual & EPT_VIOLATION_GPA_WRITEABLE) 1320 prot |= VM_PROT_WRITE; 1321 if (ept_qual & EPT_VIOLATION_GPA_EXECUTABLE) 1322 prot |= VM_PROT_EXECUTE; 1323 1324 return (prot); 1325} 1326 1327static boolean_t 1328ept_emulation_fault(uint64_t ept_qual) 1329{ 1330 int read, write; 1331 1332 /* EPT fault on an instruction fetch doesn't make sense here */ 1333 if (ept_qual & EPT_VIOLATION_INST_FETCH) 1334 return (FALSE); 1335 1336 /* EPT fault must be a read fault or a write fault */ 1337 read = ept_qual & EPT_VIOLATION_DATA_READ ? 1 : 0; 1338 write = ept_qual & EPT_VIOLATION_DATA_WRITE ? 1 : 0; 1339 if ((read | write) == 0) 1340 return (FALSE); 1341 1342 /* 1343 * The EPT violation must have been caused by accessing a 1344 * guest-physical address that is a translation of a guest-linear 1345 * address. 1346 */ 1347 if ((ept_qual & EPT_VIOLATION_GLA_VALID) == 0 || 1348 (ept_qual & EPT_VIOLATION_XLAT_VALID) == 0) { 1349 return (FALSE); 1350 } 1351 1352 return (TRUE); 1353} 1354 1355static int 1356vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) 1357{ 1358 int error, handled; 1359 struct vmcs *vmcs; 1360 struct vmxctx *vmxctx; 1361 uint32_t eax, ecx, edx, idtvec_info, idtvec_err, reason; 1362 uint64_t qual, gpa, rflags; 1363 bool retu; 1364 1365 handled = 0; 1366 vmcs = &vmx->vmcs[vcpu]; 1367 vmxctx = &vmx->ctx[vcpu]; 1368 qual = vmexit->u.vmx.exit_qualification; 1369 reason = vmexit->u.vmx.exit_reason; 1370 vmexit->exitcode = VM_EXITCODE_BOGUS; 1371 1372 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_COUNT, 1); 1373 1374 /* 1375 * VM exits that could be triggered during event injection on the 1376 * previous VM entry need to be handled specially by re-injecting 1377 * the event. 1378 * 1379 * See "Information for VM Exits During Event Delivery" in Intel SDM 1380 * for details. 1381 */ 1382 switch (reason) { 1383 case EXIT_REASON_EPT_FAULT: 1384 case EXIT_REASON_EPT_MISCONFIG: 1385 case EXIT_REASON_APIC: 1386 case EXIT_REASON_TASK_SWITCH: 1387 case EXIT_REASON_EXCEPTION: 1388 idtvec_info = vmcs_idt_vectoring_info(); 1389 if (idtvec_info & VMCS_IDT_VEC_VALID) { 1390 idtvec_info &= ~(1 << 12); /* clear undefined bit */ 1391 vmwrite(VMCS_ENTRY_INTR_INFO, idtvec_info); 1392 if (idtvec_info & VMCS_IDT_VEC_ERRCODE_VALID) { 1393 idtvec_err = vmcs_idt_vectoring_err(); 1394 vmwrite(VMCS_ENTRY_EXCEPTION_ERROR, idtvec_err); 1395 } 1396 vmwrite(VMCS_ENTRY_INST_LENGTH, vmexit->inst_length); 1397 } 1398 default: 1399 break; 1400 } 1401 1402 switch (reason) { 1403 case EXIT_REASON_CR_ACCESS: 1404 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CR_ACCESS, 1); 1405 handled = vmx_emulate_cr_access(vmx, vcpu, qual); 1406 break; 1407 case EXIT_REASON_RDMSR: 1408 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_RDMSR, 1); 1409 retu = false; 1410 ecx = vmxctx->guest_rcx; 1411 error = emulate_rdmsr(vmx->vm, vcpu, ecx, &retu); 1412 if (error) { 1413 vmexit->exitcode = VM_EXITCODE_RDMSR; 1414 vmexit->u.msr.code = ecx; 1415 } else if (!retu) { 1416 handled = 1; 1417 } else { 1418 /* Return to userspace with a valid exitcode */ 1419 KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS, 1420 ("emulate_wrmsr retu with bogus exitcode")); 1421 } 1422 break; 1423 case EXIT_REASON_WRMSR: 1424 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_WRMSR, 1); 1425 retu = false; 1426 eax = vmxctx->guest_rax; 1427 ecx = vmxctx->guest_rcx; 1428 edx = vmxctx->guest_rdx; 1429 error = emulate_wrmsr(vmx->vm, vcpu, ecx, 1430 (uint64_t)edx << 32 | eax, &retu); 1431 if (error) { 1432 vmexit->exitcode = VM_EXITCODE_WRMSR; 1433 vmexit->u.msr.code = ecx; 1434 vmexit->u.msr.wval = (uint64_t)edx << 32 | eax; 1435 } else if (!retu) { 1436 handled = 1; 1437 } else { 1438 /* Return to userspace with a valid exitcode */ 1439 KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS, 1440 ("emulate_wrmsr retu with bogus exitcode")); 1441 } 1442 break; 1443 case EXIT_REASON_HLT: 1444 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_HLT, 1); 1445 if ((error = vmread(VMCS_GUEST_RFLAGS, &rflags)) != 0) 1446 panic("vmx_exit_process: vmread(rflags) %d", error); 1447 vmexit->exitcode = VM_EXITCODE_HLT; 1448 vmexit->u.hlt.rflags = rflags; 1449 break; 1450 case EXIT_REASON_MTF: 1451 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_MTRAP, 1); 1452 vmexit->exitcode = VM_EXITCODE_MTRAP; 1453 break; 1454 case EXIT_REASON_PAUSE: 1455 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_PAUSE, 1); 1456 vmexit->exitcode = VM_EXITCODE_PAUSE; 1457 break; 1458 case EXIT_REASON_INTR_WINDOW: 1459 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INTR_WINDOW, 1); 1460 vmx_clear_int_window_exiting(vmx, vcpu); 1461 VCPU_CTR0(vmx->vm, vcpu, "Disabling interrupt window exiting"); 1462 return (1); 1463 case EXIT_REASON_EXT_INTR: 1464 /* 1465 * External interrupts serve only to cause VM exits and allow 1466 * the host interrupt handler to run. 1467 * 1468 * If this external interrupt triggers a virtual interrupt 1469 * to a VM, then that state will be recorded by the 1470 * host interrupt handler in the VM's softc. We will inject 1471 * this virtual interrupt during the subsequent VM enter. 1472 */ 1473 1474 /* 1475 * This is special. We want to treat this as an 'handled' 1476 * VM-exit but not increment the instruction pointer. 1477 */ 1478 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EXTINT, 1); 1479 return (1); 1480 case EXIT_REASON_NMI_WINDOW: 1481 /* Exit to allow the pending virtual NMI to be injected */ 1482 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_NMI_WINDOW, 1); 1483 vmx_clear_nmi_window_exiting(vmx, vcpu); 1484 VCPU_CTR0(vmx->vm, vcpu, "Disabling NMI window exiting"); 1485 return (1); 1486 case EXIT_REASON_INOUT: 1487 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INOUT, 1); 1488 vmexit->exitcode = VM_EXITCODE_INOUT; 1489 vmexit->u.inout.bytes = (qual & 0x7) + 1; 1490 vmexit->u.inout.in = (qual & 0x8) ? 1 : 0; 1491 vmexit->u.inout.string = (qual & 0x10) ? 1 : 0; 1492 vmexit->u.inout.rep = (qual & 0x20) ? 1 : 0; 1493 vmexit->u.inout.port = (uint16_t)(qual >> 16); 1494 vmexit->u.inout.eax = (uint32_t)(vmxctx->guest_rax); 1495 break; 1496 case EXIT_REASON_CPUID: 1497 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CPUID, 1); 1498 handled = vmx_handle_cpuid(vmx->vm, vcpu, vmxctx); 1499 break; 1500 case EXIT_REASON_EPT_FAULT: 1501 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EPT_FAULT, 1); 1502 /* 1503 * If 'gpa' lies within the address space allocated to 1504 * memory then this must be a nested page fault otherwise 1505 * this must be an instruction that accesses MMIO space. 1506 */ 1507 gpa = vmcs_gpa(); 1508 if (vm_mem_allocated(vmx->vm, gpa)) { 1509 vmexit->exitcode = VM_EXITCODE_PAGING; 1510 vmexit->u.paging.gpa = gpa; 1511 vmexit->u.paging.fault_type = ept_fault_type(qual); 1512 vmexit->u.paging.protection = ept_protection(qual); 1513 } else if (ept_emulation_fault(qual)) { 1514 vmexit->exitcode = VM_EXITCODE_INST_EMUL; 1515 vmexit->u.inst_emul.gpa = gpa; 1516 vmexit->u.inst_emul.gla = vmcs_gla(); 1517 vmexit->u.inst_emul.cr3 = vmcs_guest_cr3(); 1518 } 1519 break; 1520 default: 1521 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_UNKNOWN, 1); 1522 break; 1523 } 1524 1525 if (handled) { 1526 /* 1527 * It is possible that control is returned to userland 1528 * even though we were able to handle the VM exit in the 1529 * kernel. 1530 * 1531 * In such a case we want to make sure that the userland 1532 * restarts guest execution at the instruction *after* 1533 * the one we just processed. Therefore we update the 1534 * guest rip in the VMCS and in 'vmexit'. 1535 */ 1536 vm_exit_update_rip(vmexit); 1537 vmexit->rip += vmexit->inst_length; 1538 vmexit->inst_length = 0; 1539 } else { 1540 if (vmexit->exitcode == VM_EXITCODE_BOGUS) { 1541 /* 1542 * If this VM exit was not claimed by anybody then 1543 * treat it as a generic VMX exit. 1544 */ 1545 vmexit->exitcode = VM_EXITCODE_VMX; 1546 vmexit->u.vmx.error = 0; 1547 } else { 1548 /* 1549 * The exitcode and collateral have been populated. 1550 * The VM exit will be processed further in userland. 1551 */ 1552 } 1553 } 1554 return (handled); 1555} 1556 1557static int 1558vmx_run(void *arg, int vcpu, register_t rip, pmap_t pmap) 1559{ 1560 int error, vie, rc, handled, astpending; 1561 uint32_t exit_reason; 1562 struct vmx *vmx; 1563 struct vmxctx *vmxctx; 1564 struct vmcs *vmcs; 1565 struct vm_exit *vmexit; 1566 1567 vmx = arg; 1568 vmcs = &vmx->vmcs[vcpu]; 1569 vmxctx = &vmx->ctx[vcpu]; 1570 vmxctx->launched = 0; 1571 1572 astpending = 0; 1573 vmexit = vm_exitinfo(vmx->vm, vcpu); 1574 1575 KASSERT(vmxctx->pmap == pmap, 1576 ("pmap %p different than ctx pmap %p", pmap, vmxctx->pmap)); 1577 KASSERT(vmxctx->eptp == vmx->eptp, 1578 ("eptp %p different than ctx eptp %#lx", eptp, vmxctx->eptp)); 1579 1580 /* 1581 * XXX Can we avoid doing this every time we do a vm run? 1582 */ 1583 VMPTRLD(vmcs); 1584 1585 /* 1586 * XXX 1587 * We do this every time because we may setup the virtual machine 1588 * from a different process than the one that actually runs it. 1589 * 1590 * If the life of a virtual machine was spent entirely in the context 1591 * of a single process we could do this once in vmcs_set_defaults(). 1592 */ 1593 if ((error = vmwrite(VMCS_HOST_CR3, rcr3())) != 0) 1594 panic("vmx_run: error %d writing to VMCS_HOST_CR3", error); 1595 1596 if ((error = vmwrite(VMCS_GUEST_RIP, rip)) != 0) 1597 panic("vmx_run: error %d writing to VMCS_GUEST_RIP", error); 1598 1599 if ((error = vmx_set_pcpu_defaults(vmx, vcpu)) != 0) 1600 panic("vmx_run: error %d setting up pcpu defaults", error); 1601 1602 do { 1603 vmx_inject_interrupts(vmx, vcpu); 1604 vmx_run_trace(vmx, vcpu); 1605 rc = vmx_setjmp(vmxctx); 1606#ifdef SETJMP_TRACE 1607 vmx_setjmp_trace(vmx, vcpu, vmxctx, rc); 1608#endif 1609 switch (rc) { 1610 case VMX_RETURN_DIRECT: 1611 if (vmxctx->launched == 0) { 1612 vmxctx->launched = 1; 1613 vmx_launch(vmxctx); 1614 } else 1615 vmx_resume(vmxctx); 1616 panic("vmx_launch/resume should not return"); 1617 break; 1618 case VMX_RETURN_LONGJMP: 1619 break; /* vm exit */ 1620 case VMX_RETURN_AST: 1621 astpending = 1; 1622 break; 1623 case VMX_RETURN_VMRESUME: 1624 vie = vmcs_instruction_error(); 1625 if (vmxctx->launch_error == VM_FAIL_INVALID || 1626 vie != VMRESUME_WITH_NON_LAUNCHED_VMCS) { 1627 printf("vmresume error %d vmcs inst error %d\n", 1628 vmxctx->launch_error, vie); 1629 goto err_exit; 1630 } 1631 vmx_launch(vmxctx); /* try to launch the guest */ 1632 panic("vmx_launch should not return"); 1633 break; 1634 case VMX_RETURN_VMLAUNCH: 1635 vie = vmcs_instruction_error(); 1636#if 1 1637 printf("vmlaunch error %d vmcs inst error %d\n", 1638 vmxctx->launch_error, vie); 1639#endif 1640 goto err_exit; 1641 case VMX_RETURN_INVEPT: 1642 panic("vm %s:%d invept error %d", 1643 vm_name(vmx->vm), vcpu, vmxctx->launch_error); 1644 default: 1645 panic("vmx_setjmp returned %d", rc); 1646 } 1647 1648 /* enable interrupts */ 1649 enable_intr(); 1650 1651 /* collect some basic information for VM exit processing */ 1652 vmexit->rip = rip = vmcs_guest_rip(); 1653 vmexit->inst_length = vmexit_instruction_length(); 1654 vmexit->u.vmx.exit_reason = exit_reason = vmcs_exit_reason(); 1655 vmexit->u.vmx.exit_qualification = vmcs_exit_qualification(); 1656 1657 if (astpending) { 1658 handled = 1; 1659 vmexit->inst_length = 0; 1660 vmexit->exitcode = VM_EXITCODE_BOGUS; 1661 vmx_astpending_trace(vmx, vcpu, rip); 1662 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_ASTPENDING, 1); 1663 break; 1664 } 1665 1666 handled = vmx_exit_process(vmx, vcpu, vmexit); 1667 vmx_exit_trace(vmx, vcpu, rip, exit_reason, handled); 1668 1669 } while (handled); 1670 1671 /* 1672 * If a VM exit has been handled then the exitcode must be BOGUS 1673 * If a VM exit is not handled then the exitcode must not be BOGUS 1674 */ 1675 if ((handled && vmexit->exitcode != VM_EXITCODE_BOGUS) || 1676 (!handled && vmexit->exitcode == VM_EXITCODE_BOGUS)) { 1677 panic("Mismatch between handled (%d) and exitcode (%d)", 1678 handled, vmexit->exitcode); 1679 } 1680 1681 if (!handled) 1682 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_USERSPACE, 1); 1683 1684 VCPU_CTR1(vmx->vm, vcpu, "goto userland: exitcode %d",vmexit->exitcode); 1685 1686 /* 1687 * XXX 1688 * We need to do this to ensure that any VMCS state cached by the 1689 * processor is flushed to memory. We need to do this in case the 1690 * VM moves to a different cpu the next time it runs. 1691 * 1692 * Can we avoid doing this? 1693 */ 1694 VMCLEAR(vmcs); 1695 return (0); 1696 1697err_exit: 1698 vmexit->exitcode = VM_EXITCODE_VMX; 1699 vmexit->u.vmx.exit_reason = (uint32_t)-1; 1700 vmexit->u.vmx.exit_qualification = (uint32_t)-1; 1701 vmexit->u.vmx.error = vie; 1702 VMCLEAR(vmcs); 1703 return (ENOEXEC); 1704} 1705 1706static void 1707vmx_vmcleanup(void *arg) 1708{ 1709 int i, error; 1710 struct vmx *vmx = arg; 1711 1712 for (i = 0; i < VM_MAXCPU; i++) 1713 vpid_free(vmx->state[i].vpid); 1714 1715 /* 1716 * XXXSMP we also need to clear the VMCS active on the other vcpus. 1717 */ 1718 error = vmclear(&vmx->vmcs[0]); 1719 if (error != 0) 1720 panic("vmx_vmcleanup: vmclear error %d on vcpu 0", error); 1721 1722 free(vmx, M_VMX); 1723 1724 return; 1725} 1726 1727static register_t * 1728vmxctx_regptr(struct vmxctx *vmxctx, int reg) 1729{ 1730 1731 switch (reg) { 1732 case VM_REG_GUEST_RAX: 1733 return (&vmxctx->guest_rax); 1734 case VM_REG_GUEST_RBX: 1735 return (&vmxctx->guest_rbx); 1736 case VM_REG_GUEST_RCX: 1737 return (&vmxctx->guest_rcx); 1738 case VM_REG_GUEST_RDX: 1739 return (&vmxctx->guest_rdx); 1740 case VM_REG_GUEST_RSI: 1741 return (&vmxctx->guest_rsi); 1742 case VM_REG_GUEST_RDI: 1743 return (&vmxctx->guest_rdi); 1744 case VM_REG_GUEST_RBP: 1745 return (&vmxctx->guest_rbp); 1746 case VM_REG_GUEST_R8: 1747 return (&vmxctx->guest_r8); 1748 case VM_REG_GUEST_R9: 1749 return (&vmxctx->guest_r9); 1750 case VM_REG_GUEST_R10: 1751 return (&vmxctx->guest_r10); 1752 case VM_REG_GUEST_R11: 1753 return (&vmxctx->guest_r11); 1754 case VM_REG_GUEST_R12: 1755 return (&vmxctx->guest_r12); 1756 case VM_REG_GUEST_R13: 1757 return (&vmxctx->guest_r13); 1758 case VM_REG_GUEST_R14: 1759 return (&vmxctx->guest_r14); 1760 case VM_REG_GUEST_R15: 1761 return (&vmxctx->guest_r15); 1762 default: 1763 break; 1764 } 1765 return (NULL); 1766} 1767 1768static int 1769vmxctx_getreg(struct vmxctx *vmxctx, int reg, uint64_t *retval) 1770{ 1771 register_t *regp; 1772 1773 if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) { 1774 *retval = *regp; 1775 return (0); 1776 } else 1777 return (EINVAL); 1778} 1779 1780static int 1781vmxctx_setreg(struct vmxctx *vmxctx, int reg, uint64_t val) 1782{ 1783 register_t *regp; 1784 1785 if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) { 1786 *regp = val; 1787 return (0); 1788 } else 1789 return (EINVAL); 1790} 1791 1792static int 1793vmx_shadow_reg(int reg) 1794{ 1795 int shreg; 1796 1797 shreg = -1; 1798 1799 switch (reg) { 1800 case VM_REG_GUEST_CR0: 1801 shreg = VMCS_CR0_SHADOW; 1802 break; 1803 case VM_REG_GUEST_CR4: 1804 shreg = VMCS_CR4_SHADOW; 1805 break; 1806 default: 1807 break; 1808 } 1809 1810 return (shreg); 1811} 1812 1813static int 1814vmx_getreg(void *arg, int vcpu, int reg, uint64_t *retval) 1815{ 1816 int running, hostcpu; 1817 struct vmx *vmx = arg; 1818 1819 running = vcpu_is_running(vmx->vm, vcpu, &hostcpu); 1820 if (running && hostcpu != curcpu) 1821 panic("vmx_getreg: %s%d is running", vm_name(vmx->vm), vcpu); 1822 1823 if (vmxctx_getreg(&vmx->ctx[vcpu], reg, retval) == 0) 1824 return (0); 1825 1826 return (vmcs_getreg(&vmx->vmcs[vcpu], running, reg, retval)); 1827} 1828 1829static int 1830vmx_setreg(void *arg, int vcpu, int reg, uint64_t val) 1831{ 1832 int error, hostcpu, running, shadow; 1833 uint64_t ctls; 1834 struct vmx *vmx = arg; 1835 1836 running = vcpu_is_running(vmx->vm, vcpu, &hostcpu); 1837 if (running && hostcpu != curcpu) 1838 panic("vmx_setreg: %s%d is running", vm_name(vmx->vm), vcpu); 1839 1840 if (vmxctx_setreg(&vmx->ctx[vcpu], reg, val) == 0) 1841 return (0); 1842 1843 error = vmcs_setreg(&vmx->vmcs[vcpu], running, reg, val); 1844 1845 if (error == 0) { 1846 /* 1847 * If the "load EFER" VM-entry control is 1 then the 1848 * value of EFER.LMA must be identical to "IA-32e mode guest" 1849 * bit in the VM-entry control. 1850 */ 1851 if ((entry_ctls & VM_ENTRY_LOAD_EFER) != 0 && 1852 (reg == VM_REG_GUEST_EFER)) { 1853 vmcs_getreg(&vmx->vmcs[vcpu], running, 1854 VMCS_IDENT(VMCS_ENTRY_CTLS), &ctls); 1855 if (val & EFER_LMA) 1856 ctls |= VM_ENTRY_GUEST_LMA; 1857 else 1858 ctls &= ~VM_ENTRY_GUEST_LMA; 1859 vmcs_setreg(&vmx->vmcs[vcpu], running, 1860 VMCS_IDENT(VMCS_ENTRY_CTLS), ctls); 1861 } 1862 1863 shadow = vmx_shadow_reg(reg); 1864 if (shadow > 0) { 1865 /* 1866 * Store the unmodified value in the shadow 1867 */ 1868 error = vmcs_setreg(&vmx->vmcs[vcpu], running, 1869 VMCS_IDENT(shadow), val); 1870 } 1871 } 1872 1873 return (error); 1874} 1875 1876static int 1877vmx_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc) 1878{ 1879 struct vmx *vmx = arg; 1880 1881 return (vmcs_getdesc(&vmx->vmcs[vcpu], reg, desc)); 1882} 1883 1884static int 1885vmx_setdesc(void *arg, int vcpu, int reg, struct seg_desc *desc) 1886{ 1887 struct vmx *vmx = arg; 1888 1889 return (vmcs_setdesc(&vmx->vmcs[vcpu], reg, desc)); 1890} 1891 1892static int 1893vmx_inject(void *arg, int vcpu, int type, int vector, uint32_t code, 1894 int code_valid) 1895{ 1896 int error; 1897 uint64_t info; 1898 struct vmx *vmx = arg; 1899 struct vmcs *vmcs = &vmx->vmcs[vcpu]; 1900 1901 static uint32_t type_map[VM_EVENT_MAX] = { 1902 0x1, /* VM_EVENT_NONE */ 1903 0x0, /* VM_HW_INTR */ 1904 0x2, /* VM_NMI */ 1905 0x3, /* VM_HW_EXCEPTION */ 1906 0x4, /* VM_SW_INTR */ 1907 0x5, /* VM_PRIV_SW_EXCEPTION */ 1908 0x6, /* VM_SW_EXCEPTION */ 1909 }; 1910 1911 /* 1912 * If there is already an exception pending to be delivered to the 1913 * vcpu then just return. 1914 */ 1915 error = vmcs_getreg(vmcs, 0, VMCS_IDENT(VMCS_ENTRY_INTR_INFO), &info); 1916 if (error) 1917 return (error); 1918 1919 if (info & VMCS_INTERRUPTION_INFO_VALID) 1920 return (EAGAIN); 1921 1922 info = vector | (type_map[type] << 8) | (code_valid ? 1 << 11 : 0); 1923 info |= VMCS_INTERRUPTION_INFO_VALID; 1924 error = vmcs_setreg(vmcs, 0, VMCS_IDENT(VMCS_ENTRY_INTR_INFO), info); 1925 if (error != 0) 1926 return (error); 1927 1928 if (code_valid) { 1929 error = vmcs_setreg(vmcs, 0, 1930 VMCS_IDENT(VMCS_ENTRY_EXCEPTION_ERROR), 1931 code); 1932 } 1933 return (error); 1934} 1935 1936static int 1937vmx_getcap(void *arg, int vcpu, int type, int *retval) 1938{ 1939 struct vmx *vmx = arg; 1940 int vcap; 1941 int ret; 1942 1943 ret = ENOENT; 1944 1945 vcap = vmx->cap[vcpu].set; 1946 1947 switch (type) { 1948 case VM_CAP_HALT_EXIT: 1949 if (cap_halt_exit) 1950 ret = 0; 1951 break; 1952 case VM_CAP_PAUSE_EXIT: 1953 if (cap_pause_exit) 1954 ret = 0; 1955 break; 1956 case VM_CAP_MTRAP_EXIT: 1957 if (cap_monitor_trap) 1958 ret = 0; 1959 break; 1960 case VM_CAP_UNRESTRICTED_GUEST: 1961 if (cap_unrestricted_guest) 1962 ret = 0; 1963 break; 1964 case VM_CAP_ENABLE_INVPCID: 1965 if (cap_invpcid) 1966 ret = 0; 1967 break; 1968 default: 1969 break; 1970 } 1971 1972 if (ret == 0) 1973 *retval = (vcap & (1 << type)) ? 1 : 0; 1974 1975 return (ret); 1976} 1977 1978static int 1979vmx_setcap(void *arg, int vcpu, int type, int val) 1980{ 1981 struct vmx *vmx = arg; 1982 struct vmcs *vmcs = &vmx->vmcs[vcpu]; 1983 uint32_t baseval; 1984 uint32_t *pptr; 1985 int error; 1986 int flag; 1987 int reg; 1988 int retval; 1989 1990 retval = ENOENT; 1991 pptr = NULL; 1992 1993 switch (type) { 1994 case VM_CAP_HALT_EXIT: 1995 if (cap_halt_exit) { 1996 retval = 0; 1997 pptr = &vmx->cap[vcpu].proc_ctls; 1998 baseval = *pptr; 1999 flag = PROCBASED_HLT_EXITING; 2000 reg = VMCS_PRI_PROC_BASED_CTLS; 2001 } 2002 break; 2003 case VM_CAP_MTRAP_EXIT: 2004 if (cap_monitor_trap) { 2005 retval = 0; 2006 pptr = &vmx->cap[vcpu].proc_ctls; 2007 baseval = *pptr; 2008 flag = PROCBASED_MTF; 2009 reg = VMCS_PRI_PROC_BASED_CTLS; 2010 } 2011 break; 2012 case VM_CAP_PAUSE_EXIT: 2013 if (cap_pause_exit) { 2014 retval = 0; 2015 pptr = &vmx->cap[vcpu].proc_ctls; 2016 baseval = *pptr; 2017 flag = PROCBASED_PAUSE_EXITING; 2018 reg = VMCS_PRI_PROC_BASED_CTLS; 2019 } 2020 break; 2021 case VM_CAP_UNRESTRICTED_GUEST: 2022 if (cap_unrestricted_guest) { 2023 retval = 0; 2024 pptr = &vmx->cap[vcpu].proc_ctls2; 2025 baseval = *pptr; 2026 flag = PROCBASED2_UNRESTRICTED_GUEST; 2027 reg = VMCS_SEC_PROC_BASED_CTLS; 2028 } 2029 break; 2030 case VM_CAP_ENABLE_INVPCID: 2031 if (cap_invpcid) { 2032 retval = 0; 2033 pptr = &vmx->cap[vcpu].proc_ctls2; 2034 baseval = *pptr; 2035 flag = PROCBASED2_ENABLE_INVPCID; 2036 reg = VMCS_SEC_PROC_BASED_CTLS; 2037 } 2038 break; 2039 default: 2040 break; 2041 } 2042 2043 if (retval == 0) { 2044 if (val) { 2045 baseval |= flag; 2046 } else { 2047 baseval &= ~flag; 2048 } 2049 VMPTRLD(vmcs); 2050 error = vmwrite(reg, baseval); 2051 VMCLEAR(vmcs); 2052 2053 if (error) { 2054 retval = error; 2055 } else { 2056 /* 2057 * Update optional stored flags, and record 2058 * setting 2059 */ 2060 if (pptr != NULL) { 2061 *pptr = baseval; 2062 } 2063 2064 if (val) { 2065 vmx->cap[vcpu].set |= (1 << type); 2066 } else { 2067 vmx->cap[vcpu].set &= ~(1 << type); 2068 } 2069 } 2070 } 2071 2072 return (retval); 2073} 2074 2075struct vmm_ops vmm_ops_intel = { 2076 vmx_init, 2077 vmx_cleanup, 2078 vmx_restore, 2079 vmx_vminit, 2080 vmx_run, 2081 vmx_vmcleanup, 2082 vmx_getreg, 2083 vmx_setreg, 2084 vmx_getdesc, 2085 vmx_setdesc, 2086 vmx_inject, 2087 vmx_getcap, 2088 vmx_setcap, 2089 ept_vmspace_alloc, 2090 ept_vmspace_free, 2091}; 2092