vmx.c revision 228870
/*-
 * Copyright (c) 2011 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/smp.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/pcpu.h>
#include <sys/proc.h>

#include <vm/vm.h>
#include <vm/pmap.h>

#include <machine/psl.h>
#include <machine/cpufunc.h>
#include <machine/md_var.h>
#include <machine/pmap.h>
#include <machine/segments.h>
#include <machine/specialreg.h>
#include <machine/vmparam.h>

#include <machine/vmm.h>
#include "vmm_lapic.h"
#include "vmm_msr.h"
#include "vmm_ktr.h"
#include "vmm_stat.h"

#include "vmx_msr.h"
#include "ept.h"
#include "vmx_cpufunc.h"
#include "vmx.h"
#include "x86.h"
#include "vmx_controls.h"

#define CR4_VMXE        (1UL << 13)

#define PINBASED_CTLS_ONE_SETTING               \
        (PINBASED_EXTINT_EXITING |              \
         PINBASED_NMI_EXITING |                 \
         PINBASED_VIRTUAL_NMI)
#define PINBASED_CTLS_ZERO_SETTING      0

#define PROCBASED_CTLS_WINDOW_SETTING           \
        (PROCBASED_INT_WINDOW_EXITING |         \
         PROCBASED_NMI_WINDOW_EXITING)

#define PROCBASED_CTLS_ONE_SETTING              \
        (PROCBASED_SECONDARY_CONTROLS |         \
         PROCBASED_IO_EXITING |                 \
         PROCBASED_MSR_BITMAPS |                \
         PROCBASED_CTLS_WINDOW_SETTING)
#define PROCBASED_CTLS_ZERO_SETTING             \
        (PROCBASED_CR3_LOAD_EXITING |           \
         PROCBASED_CR3_STORE_EXITING |          \
         PROCBASED_IO_BITMAPS)

#define PROCBASED_CTLS2_ONE_SETTING     PROCBASED2_ENABLE_EPT
#define PROCBASED_CTLS2_ZERO_SETTING    0

#define VM_EXIT_CTLS_ONE_SETTING_NO_PAT         \
        (VM_EXIT_HOST_LMA |                     \
         VM_EXIT_SAVE_EFER |                    \
         VM_EXIT_LOAD_EFER)

#define VM_EXIT_CTLS_ONE_SETTING                \
        (VM_EXIT_CTLS_ONE_SETTING_NO_PAT |      \
         VM_EXIT_SAVE_PAT |                     \
         VM_EXIT_LOAD_PAT)
#define VM_EXIT_CTLS_ZERO_SETTING       VM_EXIT_SAVE_DEBUG_CONTROLS

#define VM_ENTRY_CTLS_ONE_SETTING_NO_PAT        VM_ENTRY_LOAD_EFER

#define VM_ENTRY_CTLS_ONE_SETTING               \
        (VM_ENTRY_CTLS_ONE_SETTING_NO_PAT |     \
         VM_ENTRY_LOAD_PAT)
#define VM_ENTRY_CTLS_ZERO_SETTING              \
        (VM_ENTRY_LOAD_DEBUG_CONTROLS |         \
         VM_ENTRY_INTO_SMM |                    \
         VM_ENTRY_DEACTIVATE_DUAL_MONITOR)
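/*
 * Each *_ONE_SETTING/*_ZERO_SETTING pair above names the control bits that
 * must be 1 and 0 respectively.  vmx_init() hands each pair to
 * vmx_set_ctlreg(), which validates them against the allowed-0/allowed-1
 * settings reported by the corresponding VMX capability MSR.
 */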
#define guest_msr_rw(vmx, msr) \
        msr_bitmap_change_access((vmx)->msr_bitmap, (msr), MSR_BITMAP_ACCESS_RW)

#define HANDLED         1
#define UNHANDLED       0

MALLOC_DEFINE(M_VMX, "vmx", "vmx");

extern struct pcpu __pcpu[];

int vmxon_enabled[MAXCPU];
static char vmxon_region[MAXCPU][PAGE_SIZE] __aligned(PAGE_SIZE);

static uint32_t pinbased_ctls, procbased_ctls, procbased_ctls2;
static uint32_t exit_ctls, entry_ctls;

static uint64_t cr0_ones_mask, cr0_zeros_mask;
static uint64_t cr4_ones_mask, cr4_zeros_mask;

static volatile u_int nextvpid;

static int vmx_no_patmsr;

/*
 * Virtual NMI blocking conditions.
 *
 * Some processor implementations also require NMI to be blocked if
 * the STI_BLOCKING bit is set. It is possible to detect this at runtime
 * based on the (exit_reason,exit_qual) tuple being set to
 * (EXIT_REASON_INVAL_VMCS, EXIT_QUAL_NMI_WHILE_STI_BLOCKING).
 *
 * We take the easy way out and also include STI_BLOCKING as one of the
 * gating items for vNMI injection.
 */
static uint64_t nmi_blocking_bits = VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING |
                                    VMCS_INTERRUPTIBILITY_NMI_BLOCKING |
                                    VMCS_INTERRUPTIBILITY_STI_BLOCKING;

/*
 * Optional capabilities
 */
static int cap_halt_exit;
static int cap_pause_exit;
static int cap_unrestricted_guest;
static int cap_monitor_trap;

/* statistics */
static VMM_STAT_DEFINE(VCPU_MIGRATIONS, "vcpu migration across host cpus");
static VMM_STAT_DEFINE(VMEXIT_EXTINT, "vm exits due to external interrupt");
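/*
 * The tracing helpers below are compiled in only when the kernel is built
 * with KTR; exit_reason_to_str() maps a VM-exit reason to a short string
 * and falls back to printing the raw reason number.
 */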
#ifdef KTR
static const char *
exit_reason_to_str(int reason)
{
        static char reasonbuf[32];

        switch (reason) {
        case EXIT_REASON_EXCEPTION:
                return "exception";
        case EXIT_REASON_EXT_INTR:
                return "extint";
        case EXIT_REASON_TRIPLE_FAULT:
                return "triplefault";
        case EXIT_REASON_INIT:
                return "init";
        case EXIT_REASON_SIPI:
                return "sipi";
        case EXIT_REASON_IO_SMI:
                return "iosmi";
        case EXIT_REASON_SMI:
                return "smi";
        case EXIT_REASON_INTR_WINDOW:
                return "intrwindow";
        case EXIT_REASON_NMI_WINDOW:
                return "nmiwindow";
        case EXIT_REASON_TASK_SWITCH:
                return "taskswitch";
        case EXIT_REASON_CPUID:
                return "cpuid";
        case EXIT_REASON_GETSEC:
                return "getsec";
        case EXIT_REASON_HLT:
                return "hlt";
        case EXIT_REASON_INVD:
                return "invd";
        case EXIT_REASON_INVLPG:
                return "invlpg";
        case EXIT_REASON_RDPMC:
                return "rdpmc";
        case EXIT_REASON_RDTSC:
                return "rdtsc";
        case EXIT_REASON_RSM:
                return "rsm";
        case EXIT_REASON_VMCALL:
                return "vmcall";
        case EXIT_REASON_VMCLEAR:
                return "vmclear";
        case EXIT_REASON_VMLAUNCH:
                return "vmlaunch";
        case EXIT_REASON_VMPTRLD:
                return "vmptrld";
        case EXIT_REASON_VMPTRST:
                return "vmptrst";
        case EXIT_REASON_VMREAD:
                return "vmread";
        case EXIT_REASON_VMRESUME:
                return "vmresume";
        case EXIT_REASON_VMWRITE:
                return "vmwrite";
        case EXIT_REASON_VMXOFF:
                return "vmxoff";
        case EXIT_REASON_VMXON:
                return "vmxon";
        case EXIT_REASON_CR_ACCESS:
                return "craccess";
        case EXIT_REASON_DR_ACCESS:
                return "draccess";
        case EXIT_REASON_INOUT:
                return "inout";
        case EXIT_REASON_RDMSR:
                return "rdmsr";
        case EXIT_REASON_WRMSR:
                return "wrmsr";
        case EXIT_REASON_INVAL_VMCS:
                return "invalvmcs";
        case EXIT_REASON_INVAL_MSR:
                return "invalmsr";
        case EXIT_REASON_MWAIT:
                return "mwait";
        case EXIT_REASON_MTF:
                return "mtf";
        case EXIT_REASON_MONITOR:
                return "monitor";
        case EXIT_REASON_PAUSE:
                return "pause";
        case EXIT_REASON_MCE:
                return "mce";
        case EXIT_REASON_TPR:
                return "tpr";
        case EXIT_REASON_APIC:
                return "apic";
        case EXIT_REASON_GDTR_IDTR:
                return "gdtridtr";
        case EXIT_REASON_LDTR_TR:
                return "ldtrtr";
        case EXIT_REASON_EPT_FAULT:
                return "eptfault";
        case EXIT_REASON_EPT_MISCONFIG:
                return "eptmisconfig";
        case EXIT_REASON_INVEPT:
                return "invept";
        case EXIT_REASON_RDTSCP:
                return "rdtscp";
        case EXIT_REASON_VMX_PREEMPT:
                return "vmxpreempt";
        case EXIT_REASON_INVVPID:
                return "invvpid";
        case EXIT_REASON_WBINVD:
                return "wbinvd";
        case EXIT_REASON_XSETBV:
                return "xsetbv";
        default:
                snprintf(reasonbuf, sizeof(reasonbuf), "%d", reason);
                return (reasonbuf);
        }
}

#ifdef SETJMP_TRACE
static const char *
vmx_setjmp_rc2str(int rc)
{
        switch (rc) {
        case VMX_RETURN_DIRECT:
                return "direct";
        case VMX_RETURN_LONGJMP:
                return "longjmp";
        case VMX_RETURN_VMRESUME:
                return "vmresume";
        case VMX_RETURN_VMLAUNCH:
                return "vmlaunch";
        default:
                return "unknown";
        }
}

#define SETJMP_TRACE(vmx, vcpu, vmxctx, regname)                        \
        VMM_CTR1((vmx)->vm, (vcpu), "setjmp trace " #regname " 0x%016lx", \
                 (vmxctx)->regname)

static void
vmx_setjmp_trace(struct vmx *vmx, int vcpu, struct vmxctx *vmxctx, int rc)
{
        uint64_t host_rip, host_rsp;

        if (vmxctx != &vmx->ctx[vcpu])
                panic("vmx_setjmp_trace: invalid vmxctx %p; should be %p",
                      vmxctx, &vmx->ctx[vcpu]);

        VMM_CTR1((vmx)->vm, (vcpu), "vmxctx = %p", vmxctx);
        VMM_CTR2((vmx)->vm, (vcpu), "setjmp return code %s(%d)",
                 vmx_setjmp_rc2str(rc), rc);

        host_rsp = host_rip = ~0;
        vmread(VMCS_HOST_RIP, &host_rip);
        vmread(VMCS_HOST_RSP, &host_rsp);
        VMM_CTR2((vmx)->vm, (vcpu), "vmcs host_rip 0x%016lx, host_rsp 0x%016lx",
                 host_rip, host_rsp);

        SETJMP_TRACE(vmx, vcpu, vmxctx, host_r15);
        SETJMP_TRACE(vmx, vcpu, vmxctx, host_r14);
        SETJMP_TRACE(vmx, vcpu, vmxctx, host_r13);
        SETJMP_TRACE(vmx, vcpu, vmxctx, host_r12);
        SETJMP_TRACE(vmx, vcpu, vmxctx, host_rbp);
        SETJMP_TRACE(vmx, vcpu, vmxctx, host_rsp);
        SETJMP_TRACE(vmx, vcpu, vmxctx, host_rbx);
        SETJMP_TRACE(vmx, vcpu, vmxctx, host_rip);

        SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rdi);
        SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rsi);
        SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rdx);
        SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rcx);
        SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r8);
        SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r9);
        SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rax);
        SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rbx);
        SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rbp);
        SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r10);
        SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r11);
        SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r12);
        SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r13);
        SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r14);
        SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r15);
        SETJMP_TRACE(vmx, vcpu, vmxctx, guest_cr2);
}
#endif
#else
static void __inline
vmx_setjmp_trace(struct vmx *vmx, int vcpu, struct vmxctx *vmxctx, int rc)
{
        return;
}
#endif  /* KTR */
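/*
 * Force a CR0/CR4 value to respect the bits that the MSR_VMX_CR{0,4}_FIXED0
 * and _FIXED1 MSRs require to be 1 or 0 while the processor is in VMX
 * operation (see the mask computation in vmx_init()).
 */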
u_long
vmx_fix_cr0(u_long cr0)
{

        return ((cr0 | cr0_ones_mask) & ~cr0_zeros_mask);
}

u_long
vmx_fix_cr4(u_long cr4)
{

        return ((cr4 | cr4_ones_mask) & ~cr4_zeros_mask);
}

static void
msr_save_area_init(struct msr_entry *g_area, int *g_count)
{
        int cnt;

        static struct msr_entry guest_msrs[] = {
                { MSR_KGSBASE, 0, 0 },
        };

        cnt = sizeof(guest_msrs) / sizeof(guest_msrs[0]);
        if (cnt > GUEST_MSR_MAX_ENTRIES)
                panic("guest msr save area overrun");
        bcopy(guest_msrs, g_area, sizeof(guest_msrs));
        *g_count = cnt;
}

static void
vmx_disable(void *arg __unused)
{
        struct invvpid_desc invvpid_desc = { 0 };
        struct invept_desc invept_desc = { 0 };

        if (vmxon_enabled[curcpu]) {
                /*
                 * See sections 25.3.3.3 and 25.3.3.4 in Intel Vol 3b.
                 *
                 * VMXON or VMXOFF are not required to invalidate any TLB
                 * caching structures, so flush them explicitly here to
                 * avoid retaining cached information in the TLB between
                 * distinct VMX episodes.
                 */
                invvpid(INVVPID_TYPE_ALL_CONTEXTS, invvpid_desc);
                invept(INVEPT_TYPE_ALL_CONTEXTS, invept_desc);
                vmxoff();
        }
        load_cr4(rcr4() & ~CR4_VMXE);
}

static int
vmx_cleanup(void)
{

        smp_rendezvous(NULL, vmx_disable, NULL, NULL);

        return (0);
}

static void
vmx_enable(void *arg __unused)
{
        int error;

        load_cr4(rcr4() | CR4_VMXE);

        *(uint32_t *)vmxon_region[curcpu] = vmx_revision();
        error = vmxon(vmxon_region[curcpu]);
        if (error == 0)
                vmxon_enabled[curcpu] = 1;
}
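/*
 * Module-wide initialization: verify VMX support, compute the pin-based,
 * processor-based, exit and entry control settings, probe the optional
 * capabilities, initialize EPT and finally enable VMX operation on every
 * host cpu via smp_rendezvous().
 */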
static int
vmx_init(void)
{
        int error;
        uint64_t fixed0, fixed1;
        uint32_t tmp;

        /* CPUID.1:ECX[bit 5] must be 1 for processor to support VMX */
        if (!(cpu_feature2 & CPUID2_VMX)) {
                printf("vmx_init: processor does not support VMX operation\n");
                return (ENXIO);
        }

        /* Check support for primary processor-based VM-execution controls */
        error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
                               MSR_VMX_TRUE_PROCBASED_CTLS,
                               PROCBASED_CTLS_ONE_SETTING,
                               PROCBASED_CTLS_ZERO_SETTING, &procbased_ctls);
        if (error) {
                printf("vmx_init: processor does not support desired primary "
                       "processor-based controls\n");
                return (error);
        }

        /* Clear the processor-based ctl bits that are set on demand */
        procbased_ctls &= ~PROCBASED_CTLS_WINDOW_SETTING;

        /* Check support for secondary processor-based VM-execution controls */
        error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
                               MSR_VMX_PROCBASED_CTLS2,
                               PROCBASED_CTLS2_ONE_SETTING,
                               PROCBASED_CTLS2_ZERO_SETTING, &procbased_ctls2);
        if (error) {
                printf("vmx_init: processor does not support desired secondary "
                       "processor-based controls\n");
                return (error);
        }

        /* Check support for VPID */
        error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2,
                               PROCBASED2_ENABLE_VPID, 0, &tmp);
        if (error == 0)
                procbased_ctls2 |= PROCBASED2_ENABLE_VPID;

        /* Check support for pin-based VM-execution controls */
        error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS,
                               MSR_VMX_TRUE_PINBASED_CTLS,
                               PINBASED_CTLS_ONE_SETTING,
                               PINBASED_CTLS_ZERO_SETTING, &pinbased_ctls);
        if (error) {
                printf("vmx_init: processor does not support desired "
                       "pin-based controls\n");
                return (error);
        }

        /* Check support for VM-exit controls */
        error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS, MSR_VMX_TRUE_EXIT_CTLS,
                               VM_EXIT_CTLS_ONE_SETTING,
                               VM_EXIT_CTLS_ZERO_SETTING,
                               &exit_ctls);
        if (error) {
                /* Try again without the PAT MSR bits */
                error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS,
                                       MSR_VMX_TRUE_EXIT_CTLS,
                                       VM_EXIT_CTLS_ONE_SETTING_NO_PAT,
                                       VM_EXIT_CTLS_ZERO_SETTING,
                                       &exit_ctls);
                if (error) {
                        printf("vmx_init: processor does not support desired "
                               "exit controls\n");
                        return (error);
                } else {
                        if (bootverbose)
                                printf("vmm: PAT MSR access not supported\n");
                        guest_msr_valid(MSR_PAT);
                        vmx_no_patmsr = 1;
                }
        }

        /* Check support for VM-entry controls */
        if (!vmx_no_patmsr) {
                error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS,
                                       MSR_VMX_TRUE_ENTRY_CTLS,
                                       VM_ENTRY_CTLS_ONE_SETTING,
                                       VM_ENTRY_CTLS_ZERO_SETTING,
                                       &entry_ctls);
        } else {
                error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS,
                                       MSR_VMX_TRUE_ENTRY_CTLS,
                                       VM_ENTRY_CTLS_ONE_SETTING_NO_PAT,
                                       VM_ENTRY_CTLS_ZERO_SETTING,
                                       &entry_ctls);
        }

        if (error) {
                printf("vmx_init: processor does not support desired "
                       "entry controls\n");
                return (error);
        }

        /*
         * Check support for optional features by testing them
         * as individual bits
         */
        cap_halt_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
                                        MSR_VMX_TRUE_PROCBASED_CTLS,
                                        PROCBASED_HLT_EXITING, 0,
                                        &tmp) == 0);

        cap_monitor_trap = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
                                        MSR_VMX_PROCBASED_CTLS,
                                        PROCBASED_MTF, 0,
                                        &tmp) == 0);

        cap_pause_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
                                        MSR_VMX_TRUE_PROCBASED_CTLS,
                                        PROCBASED_PAUSE_EXITING, 0,
                                        &tmp) == 0);

        cap_unrestricted_guest = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
                                        MSR_VMX_PROCBASED_CTLS2,
                                        PROCBASED2_UNRESTRICTED_GUEST, 0,
                                        &tmp) == 0);

        /* Initialize EPT */
        error = ept_init();
        if (error) {
                printf("vmx_init: ept initialization failed (%d)\n", error);
                return (error);
        }

        /*
         * Stash the cr0 and cr4 bits that must be fixed to 0 or 1
         */
        fixed0 = rdmsr(MSR_VMX_CR0_FIXED0);
        fixed1 = rdmsr(MSR_VMX_CR0_FIXED1);
        cr0_ones_mask = fixed0 & fixed1;
        cr0_zeros_mask = ~fixed0 & ~fixed1;

        /*
         * CR0_PE and CR0_PG can be set to zero in VMX non-root operation
         * if unrestricted guest execution is allowed.
         */
        if (cap_unrestricted_guest)
                cr0_ones_mask &= ~(CR0_PG | CR0_PE);

        /*
         * Do not allow the guest to set CR0_NW or CR0_CD.
         */
        cr0_zeros_mask |= (CR0_NW | CR0_CD);

        fixed0 = rdmsr(MSR_VMX_CR4_FIXED0);
        fixed1 = rdmsr(MSR_VMX_CR4_FIXED1);
        cr4_ones_mask = fixed0 & fixed1;
        cr4_zeros_mask = ~fixed0 & ~fixed1;

        /* enable VMX operation */
        smp_rendezvous(NULL, vmx_enable, NULL, NULL);

        return (0);
}

/*
 * If this processor does not support VPIDs then simply return 0.
 *
 * Otherwise generate the next value of VPID to use. Any value is alright
 * as long as it is non-zero.
 *
 * We always execute in VMX non-root context with EPT enabled. Thus all
 * combined mappings are tagged with the (EP4TA, VPID, PCID) tuple. This
 * in turn means that multiple VMs can share the same VPID as long as
 * they have distinct EPT page tables.
 *
 * XXX
 * We should optimize this so that it returns VPIDs that are not in
 * use. Then we will not unnecessarily invalidate mappings in
 * vmx_set_pcpu_defaults() just because two or more vcpus happen to
 * use the same 'vpid'.
 */
static uint16_t
vmx_vpid(void)
{
        uint16_t vpid = 0;

        if ((procbased_ctls2 & PROCBASED2_ENABLE_VPID) != 0) {
                do {
                        vpid = atomic_fetchadd_int(&nextvpid, 1);
                } while (vpid == 0);
        }

        return (vpid);
}
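/*
 * Program the CR0 guest/host mask and read shadow: guest writes that
 * modify bits set in the mask cause a VM exit (EXIT_REASON_CR_ACCESS),
 * and guest reads of those bits return the shadow value, which is
 * initialized here to the bits that must be 1.
 */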
static int
vmx_setup_cr0_shadow(struct vmcs *vmcs)
{
        int error;
        uint64_t mask, shadow;

        mask = cr0_ones_mask | cr0_zeros_mask;
        error = vmcs_setreg(vmcs, VMCS_IDENT(VMCS_CR0_MASK), mask);
        if (error)
                return (error);

        shadow = cr0_ones_mask;
        error = vmcs_setreg(vmcs, VMCS_IDENT(VMCS_CR0_SHADOW), shadow);
        if (error)
                return (error);

        return (0);
}

static void *
vmx_vminit(struct vm *vm)
{
        uint16_t vpid;
        int i, error, guest_msr_count;
        struct vmx *vmx;

        vmx = malloc(sizeof(struct vmx), M_VMX, M_WAITOK | M_ZERO);
        if ((uintptr_t)vmx & PAGE_MASK) {
                panic("malloc of struct vmx not aligned on %d byte boundary",
                      PAGE_SIZE);
        }
        vmx->vm = vm;

        /*
         * Clean up EPTP-tagged guest physical and combined mappings
         *
         * VMX transitions are not required to invalidate any guest physical
         * mappings. So, it may be possible for stale guest physical mappings
         * to be present in the processor TLBs.
         *
         * Combined mappings for this EP4TA are also invalidated for all VPIDs.
         */
        ept_invalidate_mappings(vtophys(vmx->pml4ept));

        msr_bitmap_initialize(vmx->msr_bitmap);

        /*
         * It is safe to allow direct access to MSR_GSBASE and MSR_FSBASE.
         * The guest FSBASE and GSBASE are saved and restored during
         * vm-exit and vm-entry respectively. The host FSBASE and GSBASE are
         * always restored from the vmcs host state area on vm-exit.
         *
         * Guest KGSBASE is saved and restored in the guest MSR save area.
         * Host KGSBASE is restored before returning to userland from the pcb.
         * There will be a window of time when we are executing in the host
         * kernel context with a value of KGSBASE from the guest. This is ok
         * because the value of KGSBASE is inconsequential in kernel context.
         *
         * MSR_EFER is saved and restored in the guest VMCS area on a
         * VM exit and entry respectively. It is also restored from the
         * host VMCS area on a VM exit.
         */
        if (guest_msr_rw(vmx, MSR_GSBASE) ||
            guest_msr_rw(vmx, MSR_FSBASE) ||
            guest_msr_rw(vmx, MSR_KGSBASE) ||
            guest_msr_rw(vmx, MSR_EFER))
                panic("vmx_vminit: error setting guest msr access");

        /*
         * MSR_PAT is saved and restored in the guest VMCS area on a VM exit
         * and entry respectively. It is also restored from the host VMCS
         * area on a VM exit. However, if running on a system with no
         * MSR_PAT save/restore support, leave access disabled so accesses
         * will be trapped.
         */
        if (!vmx_no_patmsr && guest_msr_rw(vmx, MSR_PAT))
                panic("vmx_vminit: error setting guest pat msr access");

        for (i = 0; i < VM_MAXCPU; i++) {
                vmx->vmcs[i].identifier = vmx_revision();
                error = vmclear(&vmx->vmcs[i]);
                if (error != 0) {
                        panic("vmx_vminit: vmclear error %d on vcpu %d\n",
                              error, i);
                }

                vpid = vmx_vpid();

                error = vmcs_set_defaults(&vmx->vmcs[i],
                                          (u_long)vmx_longjmp,
                                          (u_long)&vmx->ctx[i],
                                          vtophys(vmx->pml4ept),
                                          pinbased_ctls,
                                          procbased_ctls,
                                          procbased_ctls2,
                                          exit_ctls, entry_ctls,
                                          vtophys(vmx->msr_bitmap),
                                          vpid);

                if (error != 0)
                        panic("vmx_vminit: vmcs_set_defaults error %d", error);

                vmx->cap[i].set = 0;
                vmx->cap[i].proc_ctls = procbased_ctls;

                vmx->state[i].request_nmi = 0;
                vmx->state[i].lastcpu = -1;
                vmx->state[i].vpid = vpid;

                msr_save_area_init(vmx->guest_msrs[i], &guest_msr_count);

                error = vmcs_set_msr_save(&vmx->vmcs[i],
                                          vtophys(vmx->guest_msrs[i]),
                                          guest_msr_count);
                if (error != 0)
                        panic("vmcs_set_msr_save error %d", error);

                error = vmx_setup_cr0_shadow(&vmx->vmcs[i]);
        }

        return (vmx);
}
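/*
 * cpuid is always emulated in the kernel: x86_emulate_cpuid() rewrites
 * the guest's %rax/%rbx/%rcx/%rdx copies in the vmxctx in place.
 */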
static int
vmx_handle_cpuid(int vcpu, struct vmxctx *vmxctx)
{
        int handled, func;

        func = vmxctx->guest_rax;

        handled = x86_emulate_cpuid((uint32_t*)(&vmxctx->guest_rax),
            (uint32_t*)(&vmxctx->guest_rbx), (uint32_t*)(&vmxctx->guest_rcx),
            (uint32_t*)(&vmxctx->guest_rdx), vcpu);
#if 0
        printf("%s: func %x rax %lx rbx %lx rcx %lx rdx %lx handled %d\n",
               __func__, func, vmxctx->guest_rax, vmxctx->guest_rbx,
               vmxctx->guest_rcx, vmxctx->guest_rdx, handled);
#endif

        return (handled);
}

static __inline void
vmx_run_trace(struct vmx *vmx, int vcpu)
{
#ifdef KTR
        VMM_CTR1(vmx->vm, vcpu, "Resume execution at 0x%0lx", vmcs_guest_rip());
#endif
}

static __inline void
vmx_exit_trace(struct vmx *vmx, int vcpu, uint64_t rip, uint32_t exit_reason,
               int handled, int astpending)
{
#ifdef KTR
        VMM_CTR3(vmx->vm, vcpu, "%s %s vmexit at 0x%0lx",
                 handled ? "handled" : "unhandled",
                 exit_reason_to_str(exit_reason), rip);

        if (astpending)
                VMM_CTR0(vmx->vm, vcpu, "astpending");
#endif
}
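/*
 * Per-cpu VMCS state that must be refreshed when a vcpu migrates to a
 * different host cpu: the host TR, GDTR and GS bases refer to per-cpu
 * structures, and mappings tagged with the vcpu's vpid on the new cpu
 * must be treated as stale (see the comment above the invvpid below).
 */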
static int
vmx_set_pcpu_defaults(struct vmx *vmx, int vcpu)
{
        int error, lastcpu;
        struct vmxstate *vmxstate;
        struct invvpid_desc invvpid_desc = { 0 };

        vmxstate = &vmx->state[vcpu];
        lastcpu = vmxstate->lastcpu;
        vmxstate->lastcpu = curcpu;

        if (lastcpu == curcpu) {
                error = 0;
                goto done;
        }

        vmm_stat_incr(vmx->vm, vcpu, VCPU_MIGRATIONS, 1);

        error = vmwrite(VMCS_HOST_TR_BASE, (u_long)PCPU_GET(tssp));
        if (error != 0)
                goto done;

        error = vmwrite(VMCS_HOST_GDTR_BASE, (u_long)&gdt[NGDT * curcpu]);
        if (error != 0)
                goto done;

        error = vmwrite(VMCS_HOST_GS_BASE, (u_long)&__pcpu[curcpu]);
        if (error != 0)
                goto done;

        /*
         * If we are using VPIDs then invalidate all mappings tagged with
         * 'vpid'.
         *
         * We do this because this vcpu was executing on a different host
         * cpu when it last ran. We do not track whether it invalidated
         * mappings associated with its 'vpid' during that run. So we must
         * assume that the mappings associated with 'vpid' on 'curcpu' are
         * stale and invalidate them.
         *
         * Note that we incur this penalty only when the scheduler chooses to
         * move the thread associated with this vcpu between host cpus.
         *
         * Note also that this will invalidate mappings tagged with 'vpid'
         * for "all" EP4TAs.
         */
        if (vmxstate->vpid != 0) {
                invvpid_desc.vpid = vmxstate->vpid;
                invvpid(INVVPID_TYPE_SINGLE_CONTEXT, invvpid_desc);
        }
done:
        return (error);
}

static void
vm_exit_update_rip(struct vm_exit *vmexit)
{
        int error;

        error = vmwrite(VMCS_GUEST_RIP, vmexit->rip + vmexit->inst_length);
        if (error)
                panic("vmx_run: error %d writing to VMCS_GUEST_RIP", error);
}

/*
 * We depend on 'procbased_ctls' to have the Interrupt Window Exiting bit set.
 */
CTASSERT((PROCBASED_CTLS_ONE_SETTING & PROCBASED_INT_WINDOW_EXITING) != 0);

static void __inline
vmx_set_int_window_exiting(struct vmx *vmx, int vcpu)
{
        int error;

        vmx->cap[vcpu].proc_ctls |= PROCBASED_INT_WINDOW_EXITING;

        error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
        if (error)
                panic("vmx_set_int_window_exiting: vmwrite error %d", error);
}

static void __inline
vmx_clear_int_window_exiting(struct vmx *vmx, int vcpu)
{
        int error;

        vmx->cap[vcpu].proc_ctls &= ~PROCBASED_INT_WINDOW_EXITING;

        error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
        if (error)
                panic("vmx_clear_int_window_exiting: vmwrite error %d", error);
}

static void __inline
vmx_set_nmi_window_exiting(struct vmx *vmx, int vcpu)
{
        int error;

        vmx->cap[vcpu].proc_ctls |= PROCBASED_NMI_WINDOW_EXITING;

        error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
        if (error)
                panic("vmx_set_nmi_window_exiting: vmwrite error %d", error);
}

static void __inline
vmx_clear_nmi_window_exiting(struct vmx *vmx, int vcpu)
{
        int error;

        vmx->cap[vcpu].proc_ctls &= ~PROCBASED_NMI_WINDOW_EXITING;

        error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
        if (error)
                panic("vmx_clear_nmi_window_exiting: vmwrite error %d", error);
}
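/*
 * Try to inject a pending NMI.  Returns 0 if no NMI was requested and 1
 * otherwise, i.e. if the NMI was injected or if NMI-window exiting was
 * armed because the guest is currently blocking NMIs.
 */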
static int
vmx_inject_nmi(struct vmx *vmx, int vcpu)
{
        int error;
        uint64_t info, interruptibility;

        /* Bail out if no NMI requested */
        if (vmx->state[vcpu].request_nmi == 0)
                return (0);

        error = vmread(VMCS_GUEST_INTERRUPTIBILITY, &interruptibility);
        if (error) {
                panic("vmx_inject_nmi: vmread(interruptibility) %d",
                      error);
        }
        if (interruptibility & nmi_blocking_bits)
                goto nmiblocked;

        /*
         * Inject the virtual NMI. The vector must be the NMI IDT entry
         * or the VMCS entry check will fail.
         */
        info = VMCS_INTERRUPTION_INFO_NMI | VMCS_INTERRUPTION_INFO_VALID;
        info |= IDT_NMI;

        error = vmwrite(VMCS_ENTRY_INTR_INFO, info);
        if (error)
                panic("vmx_inject_nmi: vmwrite(intrinfo) %d", error);

        VMM_CTR0(vmx->vm, vcpu, "Injecting vNMI");

        /* Clear the request */
        vmx->state[vcpu].request_nmi = 0;
        return (1);

nmiblocked:
        /*
         * Set the NMI Window Exiting execution control so we can inject
         * the virtual NMI as soon as the blocking condition goes away.
         */
        vmx_set_nmi_window_exiting(vmx, vcpu);

        VMM_CTR0(vmx->vm, vcpu, "Enabling NMI window exiting");
        return (1);
}

static void
vmx_inject_interrupts(struct vmx *vmx, int vcpu)
{
        int error, vector;
        uint64_t info, rflags, interruptibility;

        const int HWINTR_BLOCKED = VMCS_INTERRUPTIBILITY_STI_BLOCKING |
                                   VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING;

#if 1
        /*
         * XXX
         * If an event is being injected from userland then just return.
         * For example, we may inject a breakpoint exception to cause the
         * guest to enter the debugger so we can inspect its state.
         */
        error = vmread(VMCS_ENTRY_INTR_INFO, &info);
        if (error)
                panic("vmx_inject_interrupts: vmread(intrinfo) %d", error);
        if (info & VMCS_INTERRUPTION_INFO_VALID)
                return;
#endif
        /*
         * NMI injection has priority so deal with those first
         */
        if (vmx_inject_nmi(vmx, vcpu))
                return;

        /* Ask the local apic for a vector to inject */
        vector = lapic_pending_intr(vmx->vm, vcpu);
        if (vector < 0)
                return;

        if (vector < 32 || vector > 255)
                panic("vmx_inject_interrupts: invalid vector %d\n", vector);

        /* Check RFLAGS.IF and the interruptibility state of the guest */
        error = vmread(VMCS_GUEST_RFLAGS, &rflags);
        if (error)
                panic("vmx_inject_interrupts: vmread(rflags) %d", error);

        if ((rflags & PSL_I) == 0)
                goto cantinject;

        error = vmread(VMCS_GUEST_INTERRUPTIBILITY, &interruptibility);
        if (error) {
                panic("vmx_inject_interrupts: vmread(interruptibility) %d",
                      error);
        }
        if (interruptibility & HWINTR_BLOCKED)
                goto cantinject;

        /* Inject the interrupt */
        info = VMCS_INTERRUPTION_INFO_HW_INTR | VMCS_INTERRUPTION_INFO_VALID;
        info |= vector;
        error = vmwrite(VMCS_ENTRY_INTR_INFO, info);
        if (error)
                panic("vmx_inject_interrupts: vmwrite(intrinfo) %d", error);

        /* Update the Local APIC ISR */
        lapic_intr_accepted(vmx->vm, vcpu, vector);

        VMM_CTR1(vmx->vm, vcpu, "Injecting hwintr at vector %d", vector);

        return;

cantinject:
        /*
         * Set the Interrupt Window Exiting execution control so we can inject
         * the interrupt as soon as the blocking condition goes away.
         */
        vmx_set_int_window_exiting(vmx, vcpu);

        VMM_CTR0(vmx->vm, vcpu, "Enabling interrupt window exiting");
}
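/*
 * Emulate a guest "mov to %cr0".  Bits 3:0 of the exit qualification
 * identify the control register and bits 11:8 the source general purpose
 * register; the value is passed through the cr0 fixed-bit masks before
 * being written to the VMCS.
 */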
static int
vmx_emulate_cr_access(struct vmx *vmx, int vcpu, uint64_t exitqual)
{
        int error;
        uint64_t regval;
        const struct vmxctx *vmxctx;

        /* We only handle mov to %cr0 at this time */
        if ((exitqual & 0xff) != 0x00)
                return (UNHANDLED);

        vmxctx = &vmx->ctx[vcpu];

        /*
         * We must use vmwrite() directly here because vmcs_setreg() will
         * call vmclear(vmcs) as a side-effect which we certainly don't want.
         */
        switch ((exitqual >> 8) & 0xf) {
        case 0:
                regval = vmxctx->guest_rax;
                break;
        case 1:
                regval = vmxctx->guest_rcx;
                break;
        case 2:
                regval = vmxctx->guest_rdx;
                break;
        case 3:
                regval = vmxctx->guest_rbx;
                break;
        case 4:
                error = vmread(VMCS_GUEST_RSP, &regval);
                if (error) {
                        panic("vmx_emulate_cr_access: "
                              "error %d reading guest rsp", error);
                }
                break;
        case 5:
                regval = vmxctx->guest_rbp;
                break;
        case 6:
                regval = vmxctx->guest_rsi;
                break;
        case 7:
                regval = vmxctx->guest_rdi;
                break;
        case 8:
                regval = vmxctx->guest_r8;
                break;
        case 9:
                regval = vmxctx->guest_r9;
                break;
        case 10:
                regval = vmxctx->guest_r10;
                break;
        case 11:
                regval = vmxctx->guest_r11;
                break;
        case 12:
                regval = vmxctx->guest_r12;
                break;
        case 13:
                regval = vmxctx->guest_r13;
                break;
        case 14:
                regval = vmxctx->guest_r14;
                break;
        case 15:
                regval = vmxctx->guest_r15;
                break;
        }

        regval |= cr0_ones_mask;
        regval &= ~cr0_zeros_mask;
        error = vmwrite(VMCS_GUEST_CR0, regval);
        if (error)
                panic("vmx_emulate_cr_access: error %d writing cr0", error);

        return (HANDLED);
}
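/*
 * Process a VM exit in kernel context.  Returns HANDLED if guest
 * execution can resume directly, or UNHANDLED after filling in 'vmexit'
 * for exits that must be completed in userland.
 */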
static int
vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
{
        int handled;
        struct vmcs *vmcs;
        struct vmxctx *vmxctx;
        uint32_t eax, ecx, edx;
        uint64_t qual;

        handled = 0;
        vmcs = &vmx->vmcs[vcpu];
        vmxctx = &vmx->ctx[vcpu];
        qual = vmexit->u.vmx.exit_qualification;
        vmexit->exitcode = VM_EXITCODE_BOGUS;

        switch (vmexit->u.vmx.exit_reason) {
        case EXIT_REASON_CR_ACCESS:
                handled = vmx_emulate_cr_access(vmx, vcpu, qual);
                break;
        case EXIT_REASON_RDMSR:
                ecx = vmxctx->guest_rcx;
                handled = emulate_rdmsr(vmx->vm, vcpu, ecx);
                if (!handled) {
                        vmexit->exitcode = VM_EXITCODE_RDMSR;
                        vmexit->u.msr.code = ecx;
                }
                break;
        case EXIT_REASON_WRMSR:
                eax = vmxctx->guest_rax;
                ecx = vmxctx->guest_rcx;
                edx = vmxctx->guest_rdx;
                handled = emulate_wrmsr(vmx->vm, vcpu, ecx,
                                        (uint64_t)edx << 32 | eax);
                if (!handled) {
                        vmexit->exitcode = VM_EXITCODE_WRMSR;
                        vmexit->u.msr.code = ecx;
                        vmexit->u.msr.wval = (uint64_t)edx << 32 | eax;
                }
                break;
        case EXIT_REASON_HLT:
                vmexit->exitcode = VM_EXITCODE_HLT;
                break;
        case EXIT_REASON_MTF:
                vmexit->exitcode = VM_EXITCODE_MTRAP;
                break;
        case EXIT_REASON_PAUSE:
                vmexit->exitcode = VM_EXITCODE_PAUSE;
                break;
        case EXIT_REASON_INTR_WINDOW:
                vmx_clear_int_window_exiting(vmx, vcpu);
                VMM_CTR0(vmx->vm, vcpu, "Disabling interrupt window exiting");
                /* FALLTHRU */
        case EXIT_REASON_EXT_INTR:
                /*
                 * External interrupts serve only to cause VM exits and allow
                 * the host interrupt handler to run.
                 *
                 * If this external interrupt triggers a virtual interrupt
                 * to a VM, then that state will be recorded by the
                 * host interrupt handler in the VM's softc. We will inject
                 * this virtual interrupt during the subsequent VM enter.
                 */

                /*
                 * This is special. We want to treat this as a 'handled'
                 * VM-exit but not increment the instruction pointer.
                 */
                vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EXTINT, 1);
                return (1);
        case EXIT_REASON_NMI_WINDOW:
                /* Exit to allow the pending virtual NMI to be injected */
                vmx_clear_nmi_window_exiting(vmx, vcpu);
                VMM_CTR0(vmx->vm, vcpu, "Disabling NMI window exiting");
                return (1);
        case EXIT_REASON_INOUT:
                vmexit->exitcode = VM_EXITCODE_INOUT;
                vmexit->u.inout.bytes = (qual & 0x7) + 1;
                vmexit->u.inout.in = (qual & 0x8) ? 1 : 0;
                vmexit->u.inout.string = (qual & 0x10) ? 1 : 0;
                vmexit->u.inout.rep = (qual & 0x20) ? 1 : 0;
                vmexit->u.inout.port = (uint16_t)(qual >> 16);
                vmexit->u.inout.eax = (uint32_t)(vmxctx->guest_rax);
                break;
        case EXIT_REASON_CPUID:
                handled = vmx_handle_cpuid(vcpu, vmxctx);
                break;
        default:
                break;
        }

        if (handled) {
                /*
                 * It is possible that control is returned to userland
                 * even though we were able to handle the VM exit in the
                 * kernel (for example, 'astpending' is set in the run loop).
                 *
                 * In such a case we want to make sure that the userland
                 * restarts guest execution at the instruction *after*
                 * the one we just processed. Therefore we update the
                 * guest rip in the VMCS and in 'vmexit'.
                 */
                vm_exit_update_rip(vmexit);
                vmexit->rip += vmexit->inst_length;
                vmexit->inst_length = 0;
        } else {
                if (vmexit->exitcode == VM_EXITCODE_BOGUS) {
                        /*
                         * If this VM exit was not claimed by anybody then
                         * treat it as a generic VMX exit.
                         */
                        vmexit->exitcode = VM_EXITCODE_VMX;
                        vmexit->u.vmx.error = 0;
                } else {
                        /*
                         * The exitcode and collateral have been populated.
                         * The VM exit will be processed further in userland.
                         */
                }
        }
        return (handled);
}
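/*
 * Enter the guest and loop until a VM exit that must be handled in
 * userland or an AST is pending.  vmx_setjmp() saves the host context;
 * the guest is then entered with vmlaunch on the first pass and vmresume
 * thereafter, and a VM exit returns through vmx_longjmp().
 */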
static int
vmx_run(void *arg, int vcpu, register_t rip, struct vm_exit *vmexit)
{
        int error, vie, rc, handled, astpending;
        uint32_t exit_reason;
        struct vmx *vmx;
        struct vmxctx *vmxctx;
        struct vmcs *vmcs;

        vmx = arg;
        vmcs = &vmx->vmcs[vcpu];
        vmxctx = &vmx->ctx[vcpu];
        vmxctx->launched = 0;

        /*
         * XXX Can we avoid doing this every time we do a vm run?
         */
        VMPTRLD(vmcs);

        /*
         * XXX
         * We do this every time because we may setup the virtual machine
         * from a different process than the one that actually runs it.
         *
         * If the life of a virtual machine was spent entirely in the context
         * of a single process we could do this once in vmcs_set_defaults().
         */
        if ((error = vmwrite(VMCS_HOST_CR3, rcr3())) != 0)
                panic("vmx_run: error %d writing to VMCS_HOST_CR3", error);

        if ((error = vmwrite(VMCS_GUEST_RIP, rip)) != 0)
                panic("vmx_run: error %d writing to VMCS_GUEST_RIP", error);

        if ((error = vmx_set_pcpu_defaults(vmx, vcpu)) != 0)
                panic("vmx_run: error %d setting up pcpu defaults", error);

        do {
                lapic_timer_tick(vmx->vm, vcpu);
                vmx_inject_interrupts(vmx, vcpu);
                vmx_run_trace(vmx, vcpu);
                rc = vmx_setjmp(vmxctx);
#ifdef SETJMP_TRACE
                vmx_setjmp_trace(vmx, vcpu, vmxctx, rc);
#endif
                switch (rc) {
                case VMX_RETURN_DIRECT:
                        if (vmxctx->launched == 0) {
                                vmxctx->launched = 1;
                                vmx_launch(vmxctx);
                        } else
                                vmx_resume(vmxctx);
                        panic("vmx_launch/resume should not return");
                        break;
                case VMX_RETURN_LONGJMP:
                        break;          /* vm exit */
                case VMX_RETURN_VMRESUME:
                        vie = vmcs_instruction_error();
                        if (vmxctx->launch_error == VM_FAIL_INVALID ||
                            vie != VMRESUME_WITH_NON_LAUNCHED_VMCS) {
                                printf("vmresume error %d vmcs inst error %d\n",
                                       vmxctx->launch_error, vie);
                                goto err_exit;
                        }
                        vmx_launch(vmxctx);     /* try to launch the guest */
                        panic("vmx_launch should not return");
                        break;
                case VMX_RETURN_VMLAUNCH:
                        vie = vmcs_instruction_error();
#if 1
                        printf("vmlaunch error %d vmcs inst error %d\n",
                               vmxctx->launch_error, vie);
#endif
                        goto err_exit;
                default:
                        panic("vmx_setjmp returned %d", rc);
                }

                /*
                 * XXX locking?
                 * See comments in exception.S about checking for ASTs
                 * atomically while interrupts are disabled. But it is
                 * not clear that they apply in our case.
                 */
                astpending = curthread->td_flags & TDF_ASTPENDING;

                /* enable interrupts */
                enable_intr();

                /* collect some basic information for VM exit processing */
                vmexit->rip = rip = vmcs_guest_rip();
                vmexit->inst_length = vmexit_instruction_length();
                vmexit->u.vmx.exit_reason = exit_reason = vmcs_exit_reason();
                vmexit->u.vmx.exit_qualification = vmcs_exit_qualification();

                handled = vmx_exit_process(vmx, vcpu, vmexit);

                vmx_exit_trace(vmx, vcpu, rip, exit_reason, handled,
                               astpending);
        } while (handled && !astpending);

        /*
         * If a VM exit has been handled then the exitcode must be BOGUS.
         * If a VM exit is not handled then the exitcode must not be BOGUS.
         */
        if ((handled && vmexit->exitcode != VM_EXITCODE_BOGUS) ||
            (!handled && vmexit->exitcode == VM_EXITCODE_BOGUS)) {
                panic("Mismatch between handled (%d) and exitcode (%d)",
                      handled, vmexit->exitcode);
        }

        VMM_CTR1(vmx->vm, vcpu, "goto userland: exitcode %d", vmexit->exitcode);

        /*
         * XXX
         * We need to do this to ensure that any VMCS state cached by the
         * processor is flushed to memory. We need to do this in case the
         * VM moves to a different cpu the next time it runs.
         *
         * Can we avoid doing this?
         */
        VMCLEAR(vmcs);
        return (0);

err_exit:
        vmexit->exitcode = VM_EXITCODE_VMX;
        vmexit->u.vmx.exit_reason = (uint32_t)-1;
        vmexit->u.vmx.exit_qualification = (uint32_t)-1;
        vmexit->u.vmx.error = vie;
        VMCLEAR(vmcs);
        return (ENOEXEC);
}
static void
vmx_vmcleanup(void *arg)
{
        int error;
        struct vmx *vmx = arg;

        /*
         * XXXSMP we also need to clear the VMCS active on the other vcpus.
         */
        error = vmclear(&vmx->vmcs[0]);
        if (error != 0)
                panic("vmx_vmcleanup: vmclear error %d on vcpu 0", error);

        ept_vmcleanup(vmx);
        free(vmx, M_VMX);

        return;
}

static register_t *
vmxctx_regptr(struct vmxctx *vmxctx, int reg)
{

        switch (reg) {
        case VM_REG_GUEST_RAX:
                return (&vmxctx->guest_rax);
        case VM_REG_GUEST_RBX:
                return (&vmxctx->guest_rbx);
        case VM_REG_GUEST_RCX:
                return (&vmxctx->guest_rcx);
        case VM_REG_GUEST_RDX:
                return (&vmxctx->guest_rdx);
        case VM_REG_GUEST_RSI:
                return (&vmxctx->guest_rsi);
        case VM_REG_GUEST_RDI:
                return (&vmxctx->guest_rdi);
        case VM_REG_GUEST_RBP:
                return (&vmxctx->guest_rbp);
        case VM_REG_GUEST_R8:
                return (&vmxctx->guest_r8);
        case VM_REG_GUEST_R9:
                return (&vmxctx->guest_r9);
        case VM_REG_GUEST_R10:
                return (&vmxctx->guest_r10);
        case VM_REG_GUEST_R11:
                return (&vmxctx->guest_r11);
        case VM_REG_GUEST_R12:
                return (&vmxctx->guest_r12);
        case VM_REG_GUEST_R13:
                return (&vmxctx->guest_r13);
        case VM_REG_GUEST_R14:
                return (&vmxctx->guest_r14);
        case VM_REG_GUEST_R15:
                return (&vmxctx->guest_r15);
        default:
                break;
        }
        return (NULL);
}

static int
vmxctx_getreg(struct vmxctx *vmxctx, int reg, uint64_t *retval)
{
        register_t *regp;

        if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) {
                *retval = *regp;
                return (0);
        } else
                return (EINVAL);
}

static int
vmxctx_setreg(struct vmxctx *vmxctx, int reg, uint64_t val)
{
        register_t *regp;

        if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) {
                *regp = val;
                return (0);
        } else
                return (EINVAL);
}

static int
vmx_getreg(void *arg, int vcpu, int reg, uint64_t *retval)
{
        struct vmx *vmx = arg;

        if (vmxctx_getreg(&vmx->ctx[vcpu], reg, retval) == 0)
                return (0);

        /*
         * If the vcpu is running then don't mess with the VMCS.
         *
         * vmcs_getreg will VMCLEAR the vmcs when it is done which will cause
         * the subsequent vmlaunch/vmresume to fail.
         */
        if (vcpu_is_running(vmx->vm, vcpu, NULL))
                panic("vmx_getreg: %s%d is running", vm_name(vmx->vm), vcpu);

        return (vmcs_getreg(&vmx->vmcs[vcpu], reg, retval));
}

static int
vmx_setreg(void *arg, int vcpu, int reg, uint64_t val)
{
        int error;
        uint64_t ctls;
        struct vmx *vmx = arg;

        /*
         * XXX Allow caller to set contents of the guest registers saved in
         * the 'vmxctx' even though the vcpu might be running. We need this
         * specifically to support the rdmsr emulation that will set the
         * %eax and %edx registers during vm exit processing.
         */
        if (vmxctx_setreg(&vmx->ctx[vcpu], reg, val) == 0)
                return (0);

        /*
         * If the vcpu is running then don't mess with the VMCS.
         *
         * vmcs_setreg will VMCLEAR the vmcs when it is done which will cause
         * the subsequent vmlaunch/vmresume to fail.
         */
        if (vcpu_is_running(vmx->vm, vcpu, NULL))
                panic("vmx_setreg: %s%d is running", vm_name(vmx->vm), vcpu);

        error = vmcs_setreg(&vmx->vmcs[vcpu], reg, val);

        if (error == 0) {
                /*
                 * If the "load EFER" VM-entry control is 1 then the
                 * value of EFER.LMA must be identical to the "IA-32e mode
                 * guest" bit in the VM-entry control.
                 */
                if ((entry_ctls & VM_ENTRY_LOAD_EFER) != 0 &&
                    (reg == VM_REG_GUEST_EFER)) {
                        vmcs_getreg(&vmx->vmcs[vcpu],
                                    VMCS_IDENT(VMCS_ENTRY_CTLS), &ctls);
                        if (val & EFER_LMA)
                                ctls |= VM_ENTRY_GUEST_LMA;
                        else
                                ctls &= ~VM_ENTRY_GUEST_LMA;
                        vmcs_setreg(&vmx->vmcs[vcpu],
                                    VMCS_IDENT(VMCS_ENTRY_CTLS), ctls);
                }
        }

        return (error);
}

static int
vmx_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc)
{
        struct vmx *vmx = arg;

        return (vmcs_getdesc(&vmx->vmcs[vcpu], reg, desc));
}

static int
vmx_setdesc(void *arg, int vcpu, int reg, struct seg_desc *desc)
{
        struct vmx *vmx = arg;

        return (vmcs_setdesc(&vmx->vmcs[vcpu], reg, desc));
}
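/*
 * Inject an event using the VM-entry interruption information field:
 * bits 7:0 hold the vector, bits 10:8 the interruption type (translated
 * from VM_EVENT_* via type_map below) and bit 11 indicates that an error
 * code is delivered.
 */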
static int
vmx_inject(void *arg, int vcpu, int type, int vector, uint32_t code,
           int code_valid)
{
        int error;
        uint32_t info;
        struct vmx *vmx = arg;
        struct vmcs *vmcs = &vmx->vmcs[vcpu];

        static uint32_t type_map[VM_EVENT_MAX] = {
                0x1,            /* VM_EVENT_NONE */
                0x0,            /* VM_HW_INTR */
                0x2,            /* VM_NMI */
                0x3,            /* VM_HW_EXCEPTION */
                0x4,            /* VM_SW_INTR */
                0x5,            /* VM_PRIV_SW_EXCEPTION */
                0x6,            /* VM_SW_EXCEPTION */
        };

        info = vector | (type_map[type] << 8) | (code_valid ? 1 << 11 : 0);
        info |= VMCS_INTERRUPTION_INFO_VALID;
        error = vmcs_setreg(vmcs, VMCS_IDENT(VMCS_ENTRY_INTR_INFO), info);
        if (error != 0)
                return (error);

        if (code_valid) {
                error = vmcs_setreg(vmcs,
                                    VMCS_IDENT(VMCS_ENTRY_EXCEPTION_ERROR),
                                    code);
        }
        return (error);
}

static int
vmx_nmi(void *arg, int vcpu)
{
        struct vmx *vmx = arg;

        atomic_set_int(&vmx->state[vcpu].request_nmi, 1);

        return (0);
}

static int
vmx_getcap(void *arg, int vcpu, int type, int *retval)
{
        struct vmx *vmx = arg;
        int vcap;
        int ret;

        ret = ENOENT;

        vcap = vmx->cap[vcpu].set;

        switch (type) {
        case VM_CAP_HALT_EXIT:
                if (cap_halt_exit)
                        ret = 0;
                break;
        case VM_CAP_PAUSE_EXIT:
                if (cap_pause_exit)
                        ret = 0;
                break;
        case VM_CAP_MTRAP_EXIT:
                if (cap_monitor_trap)
                        ret = 0;
                break;
        case VM_CAP_UNRESTRICTED_GUEST:
                if (cap_unrestricted_guest)
                        ret = 0;
                break;
        default:
                break;
        }

        if (ret == 0)
                *retval = (vcap & (1 << type)) ? 1 : 0;

        return (ret);
}
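/*
 * Toggle an optional capability for a vcpu.  The target VMCS is loaded
 * and VMCLEARed around the vmwrite(); on success the cached copy of the
 * proc-based controls (if any) and the per-vcpu 'set' bitmap are updated.
 */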
static int
vmx_setcap(void *arg, int vcpu, int type, int val)
{
        struct vmx *vmx = arg;
        struct vmcs *vmcs = &vmx->vmcs[vcpu];
        uint32_t baseval;
        uint32_t *pptr;
        int error;
        int flag;
        int reg;
        int retval;

        retval = ENOENT;
        pptr = NULL;

        switch (type) {
        case VM_CAP_HALT_EXIT:
                if (cap_halt_exit) {
                        retval = 0;
                        pptr = &vmx->cap[vcpu].proc_ctls;
                        baseval = *pptr;
                        flag = PROCBASED_HLT_EXITING;
                        reg = VMCS_PRI_PROC_BASED_CTLS;
                }
                break;
        case VM_CAP_MTRAP_EXIT:
                if (cap_monitor_trap) {
                        retval = 0;
                        pptr = &vmx->cap[vcpu].proc_ctls;
                        baseval = *pptr;
                        flag = PROCBASED_MTF;
                        reg = VMCS_PRI_PROC_BASED_CTLS;
                }
                break;
        case VM_CAP_PAUSE_EXIT:
                if (cap_pause_exit) {
                        retval = 0;
                        pptr = &vmx->cap[vcpu].proc_ctls;
                        baseval = *pptr;
                        flag = PROCBASED_PAUSE_EXITING;
                        reg = VMCS_PRI_PROC_BASED_CTLS;
                }
                break;
        case VM_CAP_UNRESTRICTED_GUEST:
                if (cap_unrestricted_guest) {
                        retval = 0;
                        baseval = procbased_ctls2;
                        flag = PROCBASED2_UNRESTRICTED_GUEST;
                        reg = VMCS_SEC_PROC_BASED_CTLS;
                }
                break;
        default:
                break;
        }

        if (retval == 0) {
                if (val) {
                        baseval |= flag;
                } else {
                        baseval &= ~flag;
                }
                VMPTRLD(vmcs);
                error = vmwrite(reg, baseval);
                VMCLEAR(vmcs);

                if (error) {
                        retval = error;
                } else {
                        /*
                         * Update optional stored flags, and record
                         * setting
                         */
                        if (pptr != NULL) {
                                *pptr = baseval;
                        }

                        if (val) {
                                vmx->cap[vcpu].set |= (1 << type);
                        } else {
                                vmx->cap[vcpu].set &= ~(1 << type);
                        }
                }
        }

        return (retval);
}

struct vmm_ops vmm_ops_intel = {
        vmx_init,
        vmx_cleanup,
        vmx_vminit,
        vmx_run,
        vmx_vmcleanup,
        ept_vmmmap,
        vmx_getreg,
        vmx_setreg,
        vmx_getdesc,
        vmx_setdesc,
        vmx_inject,
        vmx_nmi,
        vmx_getcap,
        vmx_setcap
};