vmx.c revision 268953
1/*- 2 * Copyright (c) 2011 NetApp, Inc. 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND 15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17 * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE 18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 24 * SUCH DAMAGE. 25 * 26 * $FreeBSD: stable/10/sys/amd64/vmm/intel/vmx.c 268953 2014-07-21 19:08:02Z jhb $ 27 */ 28 29#include <sys/cdefs.h> 30__FBSDID("$FreeBSD: stable/10/sys/amd64/vmm/intel/vmx.c 268953 2014-07-21 19:08:02Z jhb $"); 31 32#include <sys/param.h> 33#include <sys/systm.h> 34#include <sys/smp.h> 35#include <sys/kernel.h> 36#include <sys/malloc.h> 37#include <sys/pcpu.h> 38#include <sys/proc.h> 39#include <sys/sysctl.h> 40 41#include <vm/vm.h> 42#include <vm/pmap.h> 43 44#include <machine/psl.h> 45#include <machine/cpufunc.h> 46#include <machine/md_var.h> 47#include <machine/segments.h> 48#include <machine/smp.h> 49#include <machine/specialreg.h> 50#include <machine/vmparam.h> 51 52#include <machine/vmm.h> 53#include <machine/vmm_dev.h> 54#include "vmm_host.h" 55#include "vmm_ioport.h" 56#include "vmm_ipi.h" 57#include "vmm_msr.h" 58#include "vmm_ktr.h" 59#include "vmm_stat.h" 60#include "vatpic.h" 61#include "vlapic.h" 62#include "vlapic_priv.h" 63 64#include "vmx_msr.h" 65#include "ept.h" 66#include "vmx_cpufunc.h" 67#include "vmx.h" 68#include "x86.h" 69#include "vmx_controls.h" 70 71#define PINBASED_CTLS_ONE_SETTING \ 72 (PINBASED_EXTINT_EXITING | \ 73 PINBASED_NMI_EXITING | \ 74 PINBASED_VIRTUAL_NMI) 75#define PINBASED_CTLS_ZERO_SETTING 0 76 77#define PROCBASED_CTLS_WINDOW_SETTING \ 78 (PROCBASED_INT_WINDOW_EXITING | \ 79 PROCBASED_NMI_WINDOW_EXITING) 80 81#define PROCBASED_CTLS_ONE_SETTING \ 82 (PROCBASED_SECONDARY_CONTROLS | \ 83 PROCBASED_IO_EXITING | \ 84 PROCBASED_MSR_BITMAPS | \ 85 PROCBASED_CTLS_WINDOW_SETTING) 86#define PROCBASED_CTLS_ZERO_SETTING \ 87 (PROCBASED_CR3_LOAD_EXITING | \ 88 PROCBASED_CR3_STORE_EXITING | \ 89 PROCBASED_IO_BITMAPS) 90 91#define PROCBASED_CTLS2_ONE_SETTING PROCBASED2_ENABLE_EPT 92#define PROCBASED_CTLS2_ZERO_SETTING 0 93 94#define VM_EXIT_CTLS_ONE_SETTING_NO_PAT \ 95 (VM_EXIT_HOST_LMA | \ 96 VM_EXIT_SAVE_EFER | \ 97 VM_EXIT_LOAD_EFER) 98 99#define VM_EXIT_CTLS_ONE_SETTING \ 100 (VM_EXIT_CTLS_ONE_SETTING_NO_PAT | \ 101 VM_EXIT_ACKNOWLEDGE_INTERRUPT | \ 102 VM_EXIT_SAVE_PAT | \ 103 VM_EXIT_LOAD_PAT) 104#define VM_EXIT_CTLS_ZERO_SETTING VM_EXIT_SAVE_DEBUG_CONTROLS 105 106#define VM_ENTRY_CTLS_ONE_SETTING_NO_PAT 
VM_ENTRY_LOAD_EFER 107 108#define VM_ENTRY_CTLS_ONE_SETTING \ 109 (VM_ENTRY_CTLS_ONE_SETTING_NO_PAT | \ 110 VM_ENTRY_LOAD_PAT) 111#define VM_ENTRY_CTLS_ZERO_SETTING \ 112 (VM_ENTRY_LOAD_DEBUG_CONTROLS | \ 113 VM_ENTRY_INTO_SMM | \ 114 VM_ENTRY_DEACTIVATE_DUAL_MONITOR) 115 116#define guest_msr_rw(vmx, msr) \ 117 msr_bitmap_change_access((vmx)->msr_bitmap, (msr), MSR_BITMAP_ACCESS_RW) 118 119#define guest_msr_ro(vmx, msr) \ 120 msr_bitmap_change_access((vmx)->msr_bitmap, (msr), MSR_BITMAP_ACCESS_READ) 121 122#define HANDLED 1 123#define UNHANDLED 0 124 125static MALLOC_DEFINE(M_VMX, "vmx", "vmx"); 126static MALLOC_DEFINE(M_VLAPIC, "vlapic", "vlapic"); 127 128SYSCTL_DECL(_hw_vmm); 129SYSCTL_NODE(_hw_vmm, OID_AUTO, vmx, CTLFLAG_RW, NULL, NULL); 130 131int vmxon_enabled[MAXCPU]; 132static char vmxon_region[MAXCPU][PAGE_SIZE] __aligned(PAGE_SIZE); 133 134static uint32_t pinbased_ctls, procbased_ctls, procbased_ctls2; 135static uint32_t exit_ctls, entry_ctls; 136 137static uint64_t cr0_ones_mask, cr0_zeros_mask; 138SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr0_ones_mask, CTLFLAG_RD, 139 &cr0_ones_mask, 0, NULL); 140SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr0_zeros_mask, CTLFLAG_RD, 141 &cr0_zeros_mask, 0, NULL); 142 143static uint64_t cr4_ones_mask, cr4_zeros_mask; 144SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr4_ones_mask, CTLFLAG_RD, 145 &cr4_ones_mask, 0, NULL); 146SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr4_zeros_mask, CTLFLAG_RD, 147 &cr4_zeros_mask, 0, NULL); 148 149static int vmx_no_patmsr; 150 151static int vmx_initialized; 152SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, initialized, CTLFLAG_RD, 153 &vmx_initialized, 0, "Intel VMX initialized"); 154 155/* 156 * Optional capabilities 157 */ 158static int cap_halt_exit; 159static int cap_pause_exit; 160static int cap_unrestricted_guest; 161static int cap_monitor_trap; 162static int cap_invpcid; 163 164static int virtual_interrupt_delivery; 165SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, virtual_interrupt_delivery, CTLFLAG_RD, 166 &virtual_interrupt_delivery, 0, "APICv virtual interrupt delivery support"); 167 168static int posted_interrupts; 169SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, posted_interrupts, CTLFLAG_RD, 170 &posted_interrupts, 0, "APICv posted interrupt support"); 171 172static int pirvec; 173SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, posted_interrupt_vector, CTLFLAG_RD, 174 &pirvec, 0, "APICv posted interrupt vector"); 175 176static struct unrhdr *vpid_unr; 177static u_int vpid_alloc_failed; 178SYSCTL_UINT(_hw_vmm_vmx, OID_AUTO, vpid_alloc_failed, CTLFLAG_RD, 179 &vpid_alloc_failed, 0, NULL); 180 181/* 182 * Use the last page below 4GB as the APIC access address. This address is 183 * occupied by the boot firmware so it is guaranteed that it will not conflict 184 * with a page in system memory. 
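 * (This page also contains the firmware reset vector at 0xFFFFFFF0.)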
185 */ 186#define APIC_ACCESS_ADDRESS 0xFFFFF000 187 188static void vmx_inject_pir(struct vlapic *vlapic); 189 190#ifdef KTR 191static const char * 192exit_reason_to_str(int reason) 193{ 194 static char reasonbuf[32]; 195 196 switch (reason) { 197 case EXIT_REASON_EXCEPTION: 198 return "exception"; 199 case EXIT_REASON_EXT_INTR: 200 return "extint"; 201 case EXIT_REASON_TRIPLE_FAULT: 202 return "triplefault"; 203 case EXIT_REASON_INIT: 204 return "init"; 205 case EXIT_REASON_SIPI: 206 return "sipi"; 207 case EXIT_REASON_IO_SMI: 208 return "iosmi"; 209 case EXIT_REASON_SMI: 210 return "smi"; 211 case EXIT_REASON_INTR_WINDOW: 212 return "intrwindow"; 213 case EXIT_REASON_NMI_WINDOW: 214 return "nmiwindow"; 215 case EXIT_REASON_TASK_SWITCH: 216 return "taskswitch"; 217 case EXIT_REASON_CPUID: 218 return "cpuid"; 219 case EXIT_REASON_GETSEC: 220 return "getsec"; 221 case EXIT_REASON_HLT: 222 return "hlt"; 223 case EXIT_REASON_INVD: 224 return "invd"; 225 case EXIT_REASON_INVLPG: 226 return "invlpg"; 227 case EXIT_REASON_RDPMC: 228 return "rdpmc"; 229 case EXIT_REASON_RDTSC: 230 return "rdtsc"; 231 case EXIT_REASON_RSM: 232 return "rsm"; 233 case EXIT_REASON_VMCALL: 234 return "vmcall"; 235 case EXIT_REASON_VMCLEAR: 236 return "vmclear"; 237 case EXIT_REASON_VMLAUNCH: 238 return "vmlaunch"; 239 case EXIT_REASON_VMPTRLD: 240 return "vmptrld"; 241 case EXIT_REASON_VMPTRST: 242 return "vmptrst"; 243 case EXIT_REASON_VMREAD: 244 return "vmread"; 245 case EXIT_REASON_VMRESUME: 246 return "vmresume"; 247 case EXIT_REASON_VMWRITE: 248 return "vmwrite"; 249 case EXIT_REASON_VMXOFF: 250 return "vmxoff"; 251 case EXIT_REASON_VMXON: 252 return "vmxon"; 253 case EXIT_REASON_CR_ACCESS: 254 return "craccess"; 255 case EXIT_REASON_DR_ACCESS: 256 return "draccess"; 257 case EXIT_REASON_INOUT: 258 return "inout"; 259 case EXIT_REASON_RDMSR: 260 return "rdmsr"; 261 case EXIT_REASON_WRMSR: 262 return "wrmsr"; 263 case EXIT_REASON_INVAL_VMCS: 264 return "invalvmcs"; 265 case EXIT_REASON_INVAL_MSR: 266 return "invalmsr"; 267 case EXIT_REASON_MWAIT: 268 return "mwait"; 269 case EXIT_REASON_MTF: 270 return "mtf"; 271 case EXIT_REASON_MONITOR: 272 return "monitor"; 273 case EXIT_REASON_PAUSE: 274 return "pause"; 275 case EXIT_REASON_MCE: 276 return "mce"; 277 case EXIT_REASON_TPR: 278 return "tpr"; 279 case EXIT_REASON_APIC_ACCESS: 280 return "apic-access"; 281 case EXIT_REASON_GDTR_IDTR: 282 return "gdtridtr"; 283 case EXIT_REASON_LDTR_TR: 284 return "ldtrtr"; 285 case EXIT_REASON_EPT_FAULT: 286 return "eptfault"; 287 case EXIT_REASON_EPT_MISCONFIG: 288 return "eptmisconfig"; 289 case EXIT_REASON_INVEPT: 290 return "invept"; 291 case EXIT_REASON_RDTSCP: 292 return "rdtscp"; 293 case EXIT_REASON_VMX_PREEMPT: 294 return "vmxpreempt"; 295 case EXIT_REASON_INVVPID: 296 return "invvpid"; 297 case EXIT_REASON_WBINVD: 298 return "wbinvd"; 299 case EXIT_REASON_XSETBV: 300 return "xsetbv"; 301 case EXIT_REASON_APIC_WRITE: 302 return "apic-write"; 303 default: 304 snprintf(reasonbuf, sizeof(reasonbuf), "%d", reason); 305 return (reasonbuf); 306 } 307} 308#endif /* KTR */ 309 310static int 311vmx_allow_x2apic_msrs(struct vmx *vmx) 312{ 313 int i, error; 314 315 error = 0; 316 317 /* 318 * Allow readonly access to the following x2APIC MSRs from the guest. 
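 * In x2APIC mode these registers are accessed as MSRs in the range
 * 0x800-0x8ff (MSR address = 0x800 + (MMIO offset >> 4)), which is why
 * they can be controlled through the MSR bitmap at all.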
 */
	error += guest_msr_ro(vmx, MSR_APIC_ID);
	error += guest_msr_ro(vmx, MSR_APIC_VERSION);
	error += guest_msr_ro(vmx, MSR_APIC_LDR);
	error += guest_msr_ro(vmx, MSR_APIC_SVR);

	for (i = 0; i < 8; i++)
		error += guest_msr_ro(vmx, MSR_APIC_ISR0 + i);

	for (i = 0; i < 8; i++)
		error += guest_msr_ro(vmx, MSR_APIC_TMR0 + i);

	for (i = 0; i < 8; i++)
		error += guest_msr_ro(vmx, MSR_APIC_IRR0 + i);

	error += guest_msr_ro(vmx, MSR_APIC_ESR);
	error += guest_msr_ro(vmx, MSR_APIC_LVT_TIMER);
	error += guest_msr_ro(vmx, MSR_APIC_LVT_THERMAL);
	error += guest_msr_ro(vmx, MSR_APIC_LVT_PCINT);
	error += guest_msr_ro(vmx, MSR_APIC_LVT_LINT0);
	error += guest_msr_ro(vmx, MSR_APIC_LVT_LINT1);
	error += guest_msr_ro(vmx, MSR_APIC_LVT_ERROR);
	error += guest_msr_ro(vmx, MSR_APIC_ICR_TIMER);
	error += guest_msr_ro(vmx, MSR_APIC_DCR_TIMER);
	error += guest_msr_ro(vmx, MSR_APIC_ICR);

	/*
	 * Allow TPR, EOI and SELF_IPI MSRs to be read and written by the guest.
	 *
	 * These registers get special treatment described in the section
	 * "Virtualizing MSR-Based APIC Accesses".
	 */
	error += guest_msr_rw(vmx, MSR_APIC_TPR);
	error += guest_msr_rw(vmx, MSR_APIC_EOI);
	error += guest_msr_rw(vmx, MSR_APIC_SELF_IPI);

	return (error);
}

u_long
vmx_fix_cr0(u_long cr0)
{

	return ((cr0 | cr0_ones_mask) & ~cr0_zeros_mask);
}

u_long
vmx_fix_cr4(u_long cr4)
{

	return ((cr4 | cr4_ones_mask) & ~cr4_zeros_mask);
}

static void
vpid_free(int vpid)
{
	if (vpid < 0 || vpid > 0xffff)
		panic("vpid_free: invalid vpid %d", vpid);

	/*
	 * VPIDs [0,VM_MAXCPU] are special and are not allocated from
	 * the unit number allocator.
	 */

	if (vpid > VM_MAXCPU)
		free_unr(vpid_unr, vpid);
}

static void
vpid_alloc(uint16_t *vpid, int num)
{
	int i, x;

	if (num <= 0 || num > VM_MAXCPU)
		panic("invalid number of vpids requested: %d", num);

	/*
	 * If the "enable vpid" execution control is not enabled then the
	 * VPID is required to be 0 for all vcpus.
	 */
	if ((procbased_ctls2 & PROCBASED2_ENABLE_VPID) == 0) {
		for (i = 0; i < num; i++)
			vpid[i] = 0;
		return;
	}

	/*
	 * Allocate a unique VPID for each vcpu from the unit number allocator.
	 */
	for (i = 0; i < num; i++) {
		x = alloc_unr(vpid_unr);
		if (x == -1)
			break;
		else
			vpid[i] = x;
	}

	if (i < num) {
		atomic_add_int(&vpid_alloc_failed, 1);

		/*
		 * If the unit number allocator does not have enough unique
		 * VPIDs then we need to allocate from the [1,VM_MAXCPU] range.
		 *
		 * These VPIDs are not unique across VMs but this does not
		 * affect correctness because the combined mappings are also
		 * tagged with the EP4TA which is unique for each VM.
		 *
		 * It is still sub-optimal because the invvpid will invalidate
		 * combined mappings for a particular VPID across all EP4TAs.
		 */
		while (i-- > 0)
			vpid_free(vpid[i]);

		for (i = 0; i < num; i++)
			vpid[i] = i + 1;
	}
}

static void
vpid_init(void)
{
	/*
	 * VPID 0 is required when the "enable VPID" execution control is
	 * disabled.
	 *
	 * VPIDs [1,VM_MAXCPU] are used as the "overflow namespace" when the
	 * unit number allocator does not have sufficient unique VPIDs to
	 * satisfy the allocation.
448 * 449 * The remaining VPIDs are managed by the unit number allocator. 450 */ 451 vpid_unr = new_unrhdr(VM_MAXCPU + 1, 0xffff, NULL); 452} 453 454static void 455msr_save_area_init(struct msr_entry *g_area, int *g_count) 456{ 457 int cnt; 458 459 static struct msr_entry guest_msrs[] = { 460 { MSR_KGSBASE, 0, 0 }, 461 }; 462 463 cnt = sizeof(guest_msrs) / sizeof(guest_msrs[0]); 464 if (cnt > GUEST_MSR_MAX_ENTRIES) 465 panic("guest msr save area overrun"); 466 bcopy(guest_msrs, g_area, sizeof(guest_msrs)); 467 *g_count = cnt; 468} 469 470static void 471vmx_disable(void *arg __unused) 472{ 473 struct invvpid_desc invvpid_desc = { 0 }; 474 struct invept_desc invept_desc = { 0 }; 475 476 if (vmxon_enabled[curcpu]) { 477 /* 478 * See sections 25.3.3.3 and 25.3.3.4 in Intel Vol 3b. 479 * 480 * VMXON or VMXOFF are not required to invalidate any TLB 481 * caching structures. This prevents potential retention of 482 * cached information in the TLB between distinct VMX episodes. 483 */ 484 invvpid(INVVPID_TYPE_ALL_CONTEXTS, invvpid_desc); 485 invept(INVEPT_TYPE_ALL_CONTEXTS, invept_desc); 486 vmxoff(); 487 } 488 load_cr4(rcr4() & ~CR4_VMXE); 489} 490 491static int 492vmx_cleanup(void) 493{ 494 495 if (pirvec != 0) 496 vmm_ipi_free(pirvec); 497 498 if (vpid_unr != NULL) { 499 delete_unrhdr(vpid_unr); 500 vpid_unr = NULL; 501 } 502 503 smp_rendezvous(NULL, vmx_disable, NULL, NULL); 504 505 return (0); 506} 507 508static void 509vmx_enable(void *arg __unused) 510{ 511 int error; 512 uint64_t feature_control; 513 514 feature_control = rdmsr(MSR_IA32_FEATURE_CONTROL); 515 if ((feature_control & IA32_FEATURE_CONTROL_LOCK) == 0 || 516 (feature_control & IA32_FEATURE_CONTROL_VMX_EN) == 0) { 517 wrmsr(MSR_IA32_FEATURE_CONTROL, 518 feature_control | IA32_FEATURE_CONTROL_VMX_EN | 519 IA32_FEATURE_CONTROL_LOCK); 520 } 521 522 load_cr4(rcr4() | CR4_VMXE); 523 524 *(uint32_t *)vmxon_region[curcpu] = vmx_revision(); 525 error = vmxon(vmxon_region[curcpu]); 526 if (error == 0) 527 vmxon_enabled[curcpu] = 1; 528} 529 530static void 531vmx_restore(void) 532{ 533 534 if (vmxon_enabled[curcpu]) 535 vmxon(vmxon_region[curcpu]); 536} 537 538static int 539vmx_init(int ipinum) 540{ 541 int error, use_tpr_shadow; 542 uint64_t fixed0, fixed1, feature_control; 543 uint32_t tmp, procbased2_vid_bits; 544 545 /* CPUID.1:ECX[bit 5] must be 1 for processor to support VMX */ 546 if (!(cpu_feature2 & CPUID2_VMX)) { 547 printf("vmx_init: processor does not support VMX operation\n"); 548 return (ENXIO); 549 } 550 551 /* 552 * Verify that MSR_IA32_FEATURE_CONTROL lock and VMXON enable bits 553 * are set (bits 0 and 2 respectively). 
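 * If the MSR is left unlocked by the firmware, vmx_enable() will set the
 * VMXON enable and lock bits itself; VMX operation is unusable only when
 * the MSR is locked with the enable bit clear.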
554 */ 555 feature_control = rdmsr(MSR_IA32_FEATURE_CONTROL); 556 if ((feature_control & IA32_FEATURE_CONTROL_LOCK) == 1 && 557 (feature_control & IA32_FEATURE_CONTROL_VMX_EN) == 0) { 558 printf("vmx_init: VMX operation disabled by BIOS\n"); 559 return (ENXIO); 560 } 561 562 /* Check support for primary processor-based VM-execution controls */ 563 error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, 564 MSR_VMX_TRUE_PROCBASED_CTLS, 565 PROCBASED_CTLS_ONE_SETTING, 566 PROCBASED_CTLS_ZERO_SETTING, &procbased_ctls); 567 if (error) { 568 printf("vmx_init: processor does not support desired primary " 569 "processor-based controls\n"); 570 return (error); 571 } 572 573 /* Clear the processor-based ctl bits that are set on demand */ 574 procbased_ctls &= ~PROCBASED_CTLS_WINDOW_SETTING; 575 576 /* Check support for secondary processor-based VM-execution controls */ 577 error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, 578 MSR_VMX_PROCBASED_CTLS2, 579 PROCBASED_CTLS2_ONE_SETTING, 580 PROCBASED_CTLS2_ZERO_SETTING, &procbased_ctls2); 581 if (error) { 582 printf("vmx_init: processor does not support desired secondary " 583 "processor-based controls\n"); 584 return (error); 585 } 586 587 /* Check support for VPID */ 588 error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2, 589 PROCBASED2_ENABLE_VPID, 0, &tmp); 590 if (error == 0) 591 procbased_ctls2 |= PROCBASED2_ENABLE_VPID; 592 593 /* Check support for pin-based VM-execution controls */ 594 error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS, 595 MSR_VMX_TRUE_PINBASED_CTLS, 596 PINBASED_CTLS_ONE_SETTING, 597 PINBASED_CTLS_ZERO_SETTING, &pinbased_ctls); 598 if (error) { 599 printf("vmx_init: processor does not support desired " 600 "pin-based controls\n"); 601 return (error); 602 } 603 604 /* Check support for VM-exit controls */ 605 error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS, MSR_VMX_TRUE_EXIT_CTLS, 606 VM_EXIT_CTLS_ONE_SETTING, 607 VM_EXIT_CTLS_ZERO_SETTING, 608 &exit_ctls); 609 if (error) { 610 /* Try again without the PAT MSR bits */ 611 error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS, 612 MSR_VMX_TRUE_EXIT_CTLS, 613 VM_EXIT_CTLS_ONE_SETTING_NO_PAT, 614 VM_EXIT_CTLS_ZERO_SETTING, 615 &exit_ctls); 616 if (error) { 617 printf("vmx_init: processor does not support desired " 618 "exit controls\n"); 619 return (error); 620 } else { 621 if (bootverbose) 622 printf("vmm: PAT MSR access not supported\n"); 623 guest_msr_valid(MSR_PAT); 624 vmx_no_patmsr = 1; 625 } 626 } 627 628 /* Check support for VM-entry controls */ 629 if (!vmx_no_patmsr) { 630 error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS, 631 MSR_VMX_TRUE_ENTRY_CTLS, 632 VM_ENTRY_CTLS_ONE_SETTING, 633 VM_ENTRY_CTLS_ZERO_SETTING, 634 &entry_ctls); 635 } else { 636 error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS, 637 MSR_VMX_TRUE_ENTRY_CTLS, 638 VM_ENTRY_CTLS_ONE_SETTING_NO_PAT, 639 VM_ENTRY_CTLS_ZERO_SETTING, 640 &entry_ctls); 641 } 642 643 if (error) { 644 printf("vmx_init: processor does not support desired " 645 "entry controls\n"); 646 return (error); 647 } 648 649 /* 650 * Check support for optional features by testing them 651 * as individual bits 652 */ 653 cap_halt_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, 654 MSR_VMX_TRUE_PROCBASED_CTLS, 655 PROCBASED_HLT_EXITING, 0, 656 &tmp) == 0); 657 658 cap_monitor_trap = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, 659 MSR_VMX_PROCBASED_CTLS, 660 PROCBASED_MTF, 0, 661 &tmp) == 0); 662 663 cap_pause_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, 664 MSR_VMX_TRUE_PROCBASED_CTLS, 665 PROCBASED_PAUSE_EXITING, 0, 666 &tmp) == 0); 667 668 cap_unrestricted_guest = 
(vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, 669 MSR_VMX_PROCBASED_CTLS2, 670 PROCBASED2_UNRESTRICTED_GUEST, 0, 671 &tmp) == 0); 672 673 cap_invpcid = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, 674 MSR_VMX_PROCBASED_CTLS2, PROCBASED2_ENABLE_INVPCID, 0, 675 &tmp) == 0); 676 677 /* 678 * Check support for virtual interrupt delivery. 679 */ 680 procbased2_vid_bits = (PROCBASED2_VIRTUALIZE_APIC_ACCESSES | 681 PROCBASED2_VIRTUALIZE_X2APIC_MODE | 682 PROCBASED2_APIC_REGISTER_VIRTUALIZATION | 683 PROCBASED2_VIRTUAL_INTERRUPT_DELIVERY); 684 685 use_tpr_shadow = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, 686 MSR_VMX_TRUE_PROCBASED_CTLS, PROCBASED_USE_TPR_SHADOW, 0, 687 &tmp) == 0); 688 689 error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2, 690 procbased2_vid_bits, 0, &tmp); 691 if (error == 0 && use_tpr_shadow) { 692 virtual_interrupt_delivery = 1; 693 TUNABLE_INT_FETCH("hw.vmm.vmx.use_apic_vid", 694 &virtual_interrupt_delivery); 695 } 696 697 if (virtual_interrupt_delivery) { 698 procbased_ctls |= PROCBASED_USE_TPR_SHADOW; 699 procbased_ctls2 |= procbased2_vid_bits; 700 procbased_ctls2 &= ~PROCBASED2_VIRTUALIZE_X2APIC_MODE; 701 702 /* 703 * Check for Posted Interrupts only if Virtual Interrupt 704 * Delivery is enabled. 705 */ 706 error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS, 707 MSR_VMX_TRUE_PINBASED_CTLS, PINBASED_POSTED_INTERRUPT, 0, 708 &tmp); 709 if (error == 0) { 710 pirvec = vmm_ipi_alloc(); 711 if (pirvec == 0) { 712 if (bootverbose) { 713 printf("vmx_init: unable to allocate " 714 "posted interrupt vector\n"); 715 } 716 } else { 717 posted_interrupts = 1; 718 TUNABLE_INT_FETCH("hw.vmm.vmx.use_apic_pir", 719 &posted_interrupts); 720 } 721 } 722 } 723 724 if (posted_interrupts) 725 pinbased_ctls |= PINBASED_POSTED_INTERRUPT; 726 727 /* Initialize EPT */ 728 error = ept_init(ipinum); 729 if (error) { 730 printf("vmx_init: ept initialization failed (%d)\n", error); 731 return (error); 732 } 733 734 /* 735 * Stash the cr0 and cr4 bits that must be fixed to 0 or 1 736 */ 737 fixed0 = rdmsr(MSR_VMX_CR0_FIXED0); 738 fixed1 = rdmsr(MSR_VMX_CR0_FIXED1); 739 cr0_ones_mask = fixed0 & fixed1; 740 cr0_zeros_mask = ~fixed0 & ~fixed1; 741 742 /* 743 * CR0_PE and CR0_PG can be set to zero in VMX non-root operation 744 * if unrestricted guest execution is allowed. 745 */ 746 if (cap_unrestricted_guest) 747 cr0_ones_mask &= ~(CR0_PG | CR0_PE); 748 749 /* 750 * Do not allow the guest to set CR0_NW or CR0_CD. 
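 * (CD and NW control processor caching; keeping them clear in the real CR0
 * means the guest cannot disable caching on the host cpu.)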
751 */ 752 cr0_zeros_mask |= (CR0_NW | CR0_CD); 753 754 fixed0 = rdmsr(MSR_VMX_CR4_FIXED0); 755 fixed1 = rdmsr(MSR_VMX_CR4_FIXED1); 756 cr4_ones_mask = fixed0 & fixed1; 757 cr4_zeros_mask = ~fixed0 & ~fixed1; 758 759 vpid_init(); 760 761 /* enable VMX operation */ 762 smp_rendezvous(NULL, vmx_enable, NULL, NULL); 763 764 vmx_initialized = 1; 765 766 return (0); 767} 768 769static void 770vmx_trigger_hostintr(int vector) 771{ 772 uintptr_t func; 773 struct gate_descriptor *gd; 774 775 gd = &idt[vector]; 776 777 KASSERT(vector >= 32 && vector <= 255, ("vmx_trigger_hostintr: " 778 "invalid vector %d", vector)); 779 KASSERT(gd->gd_p == 1, ("gate descriptor for vector %d not present", 780 vector)); 781 KASSERT(gd->gd_type == SDT_SYSIGT, ("gate descriptor for vector %d " 782 "has invalid type %d", vector, gd->gd_type)); 783 KASSERT(gd->gd_dpl == SEL_KPL, ("gate descriptor for vector %d " 784 "has invalid dpl %d", vector, gd->gd_dpl)); 785 KASSERT(gd->gd_selector == GSEL(GCODE_SEL, SEL_KPL), ("gate descriptor " 786 "for vector %d has invalid selector %d", vector, gd->gd_selector)); 787 KASSERT(gd->gd_ist == 0, ("gate descriptor for vector %d has invalid " 788 "IST %d", vector, gd->gd_ist)); 789 790 func = ((long)gd->gd_hioffset << 16 | gd->gd_looffset); 791 vmx_call_isr(func); 792} 793 794static int 795vmx_setup_cr_shadow(int which, struct vmcs *vmcs, uint32_t initial) 796{ 797 int error, mask_ident, shadow_ident; 798 uint64_t mask_value; 799 800 if (which != 0 && which != 4) 801 panic("vmx_setup_cr_shadow: unknown cr%d", which); 802 803 if (which == 0) { 804 mask_ident = VMCS_CR0_MASK; 805 mask_value = cr0_ones_mask | cr0_zeros_mask; 806 shadow_ident = VMCS_CR0_SHADOW; 807 } else { 808 mask_ident = VMCS_CR4_MASK; 809 mask_value = cr4_ones_mask | cr4_zeros_mask; 810 shadow_ident = VMCS_CR4_SHADOW; 811 } 812 813 error = vmcs_setreg(vmcs, 0, VMCS_IDENT(mask_ident), mask_value); 814 if (error) 815 return (error); 816 817 error = vmcs_setreg(vmcs, 0, VMCS_IDENT(shadow_ident), initial); 818 if (error) 819 return (error); 820 821 return (0); 822} 823#define vmx_setup_cr0_shadow(vmcs,init) vmx_setup_cr_shadow(0, (vmcs), (init)) 824#define vmx_setup_cr4_shadow(vmcs,init) vmx_setup_cr_shadow(4, (vmcs), (init)) 825 826static void * 827vmx_vminit(struct vm *vm, pmap_t pmap) 828{ 829 uint16_t vpid[VM_MAXCPU]; 830 int i, error, guest_msr_count; 831 struct vmx *vmx; 832 struct vmcs *vmcs; 833 834 vmx = malloc(sizeof(struct vmx), M_VMX, M_WAITOK | M_ZERO); 835 if ((uintptr_t)vmx & PAGE_MASK) { 836 panic("malloc of struct vmx not aligned on %d byte boundary", 837 PAGE_SIZE); 838 } 839 vmx->vm = vm; 840 841 vmx->eptp = eptp(vtophys((vm_offset_t)pmap->pm_pml4)); 842 843 /* 844 * Clean up EPTP-tagged guest physical and combined mappings 845 * 846 * VMX transitions are not required to invalidate any guest physical 847 * mappings. So, it may be possible for stale guest physical mappings 848 * to be present in the processor TLBs. 849 * 850 * Combined mappings for this EP4TA are also invalidated for all VPIDs. 851 */ 852 ept_invalidate_mappings(vmx->eptp); 853 854 msr_bitmap_initialize(vmx->msr_bitmap); 855 856 /* 857 * It is safe to allow direct access to MSR_GSBASE and MSR_FSBASE. 858 * The guest FSBASE and GSBASE are saved and restored during 859 * vm-exit and vm-entry respectively. The host FSBASE and GSBASE are 860 * always restored from the vmcs host state area on vm-exit. 
861 * 862 * The SYSENTER_CS/ESP/EIP MSRs are identical to FS/GSBASE in 863 * how they are saved/restored so can be directly accessed by the 864 * guest. 865 * 866 * Guest KGSBASE is saved and restored in the guest MSR save area. 867 * Host KGSBASE is restored before returning to userland from the pcb. 868 * There will be a window of time when we are executing in the host 869 * kernel context with a value of KGSBASE from the guest. This is ok 870 * because the value of KGSBASE is inconsequential in kernel context. 871 * 872 * MSR_EFER is saved and restored in the guest VMCS area on a 873 * VM exit and entry respectively. It is also restored from the 874 * host VMCS area on a VM exit. 875 * 876 * The TSC MSR is exposed read-only. Writes are disallowed as that 877 * will impact the host TSC. 878 * XXX Writes would be implemented with a wrmsr trap, and 879 * then modifying the TSC offset in the VMCS. 880 */ 881 if (guest_msr_rw(vmx, MSR_GSBASE) || 882 guest_msr_rw(vmx, MSR_FSBASE) || 883 guest_msr_rw(vmx, MSR_SYSENTER_CS_MSR) || 884 guest_msr_rw(vmx, MSR_SYSENTER_ESP_MSR) || 885 guest_msr_rw(vmx, MSR_SYSENTER_EIP_MSR) || 886 guest_msr_rw(vmx, MSR_KGSBASE) || 887 guest_msr_rw(vmx, MSR_EFER) || 888 guest_msr_ro(vmx, MSR_TSC)) 889 panic("vmx_vminit: error setting guest msr access"); 890 891 /* 892 * MSR_PAT is saved and restored in the guest VMCS are on a VM exit 893 * and entry respectively. It is also restored from the host VMCS 894 * area on a VM exit. However, if running on a system with no 895 * MSR_PAT save/restore support, leave access disabled so accesses 896 * will be trapped. 897 */ 898 if (!vmx_no_patmsr && guest_msr_rw(vmx, MSR_PAT)) 899 panic("vmx_vminit: error setting guest pat msr access"); 900 901 vpid_alloc(vpid, VM_MAXCPU); 902 903 if (virtual_interrupt_delivery) { 904 error = vm_map_mmio(vm, DEFAULT_APIC_BASE, PAGE_SIZE, 905 APIC_ACCESS_ADDRESS); 906 /* XXX this should really return an error to the caller */ 907 KASSERT(error == 0, ("vm_map_mmio(apicbase) error %d", error)); 908 } 909 910 for (i = 0; i < VM_MAXCPU; i++) { 911 vmcs = &vmx->vmcs[i]; 912 vmcs->identifier = vmx_revision(); 913 error = vmclear(vmcs); 914 if (error != 0) { 915 panic("vmx_vminit: vmclear error %d on vcpu %d\n", 916 error, i); 917 } 918 919 error = vmcs_init(vmcs); 920 KASSERT(error == 0, ("vmcs_init error %d", error)); 921 922 VMPTRLD(vmcs); 923 error = 0; 924 error += vmwrite(VMCS_HOST_RSP, (u_long)&vmx->ctx[i]); 925 error += vmwrite(VMCS_EPTP, vmx->eptp); 926 error += vmwrite(VMCS_PIN_BASED_CTLS, pinbased_ctls); 927 error += vmwrite(VMCS_PRI_PROC_BASED_CTLS, procbased_ctls); 928 error += vmwrite(VMCS_SEC_PROC_BASED_CTLS, procbased_ctls2); 929 error += vmwrite(VMCS_EXIT_CTLS, exit_ctls); 930 error += vmwrite(VMCS_ENTRY_CTLS, entry_ctls); 931 error += vmwrite(VMCS_MSR_BITMAP, vtophys(vmx->msr_bitmap)); 932 error += vmwrite(VMCS_VPID, vpid[i]); 933 if (virtual_interrupt_delivery) { 934 error += vmwrite(VMCS_APIC_ACCESS, APIC_ACCESS_ADDRESS); 935 error += vmwrite(VMCS_VIRTUAL_APIC, 936 vtophys(&vmx->apic_page[i])); 937 error += vmwrite(VMCS_EOI_EXIT0, 0); 938 error += vmwrite(VMCS_EOI_EXIT1, 0); 939 error += vmwrite(VMCS_EOI_EXIT2, 0); 940 error += vmwrite(VMCS_EOI_EXIT3, 0); 941 } 942 if (posted_interrupts) { 943 error += vmwrite(VMCS_PIR_VECTOR, pirvec); 944 error += vmwrite(VMCS_PIR_DESC, 945 vtophys(&vmx->pir_desc[i])); 946 } 947 VMCLEAR(vmcs); 948 KASSERT(error == 0, ("vmx_vminit: error customizing the vmcs")); 949 950 vmx->cap[i].set = 0; 951 vmx->cap[i].proc_ctls = procbased_ctls; 952 
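		/*
		 * Per-vcpu copies of the execution controls; the proc_ctls
		 * copy is updated at runtime to toggle interrupt/NMI window
		 * exiting.
		 */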
vmx->cap[i].proc_ctls2 = procbased_ctls2; 953 954 vmx->state[i].lastcpu = -1; 955 vmx->state[i].vpid = vpid[i]; 956 957 msr_save_area_init(vmx->guest_msrs[i], &guest_msr_count); 958 959 error = vmcs_set_msr_save(vmcs, vtophys(vmx->guest_msrs[i]), 960 guest_msr_count); 961 if (error != 0) 962 panic("vmcs_set_msr_save error %d", error); 963 964 /* 965 * Set up the CR0/4 shadows, and init the read shadow 966 * to the power-on register value from the Intel Sys Arch. 967 * CR0 - 0x60000010 968 * CR4 - 0 969 */ 970 error = vmx_setup_cr0_shadow(vmcs, 0x60000010); 971 if (error != 0) 972 panic("vmx_setup_cr0_shadow %d", error); 973 974 error = vmx_setup_cr4_shadow(vmcs, 0); 975 if (error != 0) 976 panic("vmx_setup_cr4_shadow %d", error); 977 978 vmx->ctx[i].pmap = pmap; 979 } 980 981 return (vmx); 982} 983 984static int 985vmx_handle_cpuid(struct vm *vm, int vcpu, struct vmxctx *vmxctx) 986{ 987 int handled, func; 988 989 func = vmxctx->guest_rax; 990 991 handled = x86_emulate_cpuid(vm, vcpu, 992 (uint32_t*)(&vmxctx->guest_rax), 993 (uint32_t*)(&vmxctx->guest_rbx), 994 (uint32_t*)(&vmxctx->guest_rcx), 995 (uint32_t*)(&vmxctx->guest_rdx)); 996 return (handled); 997} 998 999static __inline void 1000vmx_run_trace(struct vmx *vmx, int vcpu) 1001{ 1002#ifdef KTR 1003 VCPU_CTR1(vmx->vm, vcpu, "Resume execution at %#lx", vmcs_guest_rip()); 1004#endif 1005} 1006 1007static __inline void 1008vmx_exit_trace(struct vmx *vmx, int vcpu, uint64_t rip, uint32_t exit_reason, 1009 int handled) 1010{ 1011#ifdef KTR 1012 VCPU_CTR3(vmx->vm, vcpu, "%s %s vmexit at 0x%0lx", 1013 handled ? "handled" : "unhandled", 1014 exit_reason_to_str(exit_reason), rip); 1015#endif 1016} 1017 1018static __inline void 1019vmx_astpending_trace(struct vmx *vmx, int vcpu, uint64_t rip) 1020{ 1021#ifdef KTR 1022 VCPU_CTR1(vmx->vm, vcpu, "astpending vmexit at 0x%0lx", rip); 1023#endif 1024} 1025 1026static VMM_STAT_INTEL(VCPU_INVVPID_SAVED, "Number of vpid invalidations saved"); 1027 1028static void 1029vmx_set_pcpu_defaults(struct vmx *vmx, int vcpu, pmap_t pmap) 1030{ 1031 struct vmxstate *vmxstate; 1032 struct invvpid_desc invvpid_desc; 1033 1034 vmxstate = &vmx->state[vcpu]; 1035 if (vmxstate->lastcpu == curcpu) 1036 return; 1037 1038 vmxstate->lastcpu = curcpu; 1039 1040 vmm_stat_incr(vmx->vm, vcpu, VCPU_MIGRATIONS, 1); 1041 1042 vmcs_write(VMCS_HOST_TR_BASE, vmm_get_host_trbase()); 1043 vmcs_write(VMCS_HOST_GDTR_BASE, vmm_get_host_gdtrbase()); 1044 vmcs_write(VMCS_HOST_GS_BASE, vmm_get_host_gsbase()); 1045 1046 /* 1047 * If we are using VPIDs then invalidate all mappings tagged with 'vpid' 1048 * 1049 * We do this because this vcpu was executing on a different host 1050 * cpu when it last ran. We do not track whether it invalidated 1051 * mappings associated with its 'vpid' during that run. So we must 1052 * assume that the mappings associated with 'vpid' on 'curcpu' are 1053 * stale and invalidate them. 1054 * 1055 * Note that we incur this penalty only when the scheduler chooses to 1056 * move the thread associated with this vcpu between host cpus. 1057 * 1058 * Note also that this will invalidate mappings tagged with 'vpid' 1059 * for "all" EP4TAs. 
1060 */ 1061 if (vmxstate->vpid != 0) { 1062 if (pmap->pm_eptgen == vmx->eptgen[curcpu]) { 1063 invvpid_desc._res1 = 0; 1064 invvpid_desc._res2 = 0; 1065 invvpid_desc.vpid = vmxstate->vpid; 1066 invvpid_desc.linear_addr = 0; 1067 invvpid(INVVPID_TYPE_SINGLE_CONTEXT, invvpid_desc); 1068 } else { 1069 /* 1070 * The invvpid can be skipped if an invept is going to 1071 * be performed before entering the guest. The invept 1072 * will invalidate combined mappings tagged with 1073 * 'vmx->eptp' for all vpids. 1074 */ 1075 vmm_stat_incr(vmx->vm, vcpu, VCPU_INVVPID_SAVED, 1); 1076 } 1077 } 1078} 1079 1080/* 1081 * We depend on 'procbased_ctls' to have the Interrupt Window Exiting bit set. 1082 */ 1083CTASSERT((PROCBASED_CTLS_ONE_SETTING & PROCBASED_INT_WINDOW_EXITING) != 0); 1084 1085static void __inline 1086vmx_set_int_window_exiting(struct vmx *vmx, int vcpu) 1087{ 1088 1089 if ((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) == 0) { 1090 vmx->cap[vcpu].proc_ctls |= PROCBASED_INT_WINDOW_EXITING; 1091 vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); 1092 VCPU_CTR0(vmx->vm, vcpu, "Enabling interrupt window exiting"); 1093 } 1094} 1095 1096static void __inline 1097vmx_clear_int_window_exiting(struct vmx *vmx, int vcpu) 1098{ 1099 1100 KASSERT((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) != 0, 1101 ("intr_window_exiting not set: %#x", vmx->cap[vcpu].proc_ctls)); 1102 vmx->cap[vcpu].proc_ctls &= ~PROCBASED_INT_WINDOW_EXITING; 1103 vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); 1104 VCPU_CTR0(vmx->vm, vcpu, "Disabling interrupt window exiting"); 1105} 1106 1107static void __inline 1108vmx_set_nmi_window_exiting(struct vmx *vmx, int vcpu) 1109{ 1110 1111 if ((vmx->cap[vcpu].proc_ctls & PROCBASED_NMI_WINDOW_EXITING) == 0) { 1112 vmx->cap[vcpu].proc_ctls |= PROCBASED_NMI_WINDOW_EXITING; 1113 vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); 1114 VCPU_CTR0(vmx->vm, vcpu, "Enabling NMI window exiting"); 1115 } 1116} 1117 1118static void __inline 1119vmx_clear_nmi_window_exiting(struct vmx *vmx, int vcpu) 1120{ 1121 1122 KASSERT((vmx->cap[vcpu].proc_ctls & PROCBASED_NMI_WINDOW_EXITING) != 0, 1123 ("nmi_window_exiting not set %#x", vmx->cap[vcpu].proc_ctls)); 1124 vmx->cap[vcpu].proc_ctls &= ~PROCBASED_NMI_WINDOW_EXITING; 1125 vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); 1126 VCPU_CTR0(vmx->vm, vcpu, "Disabling NMI window exiting"); 1127} 1128 1129#define NMI_BLOCKING (VMCS_INTERRUPTIBILITY_NMI_BLOCKING | \ 1130 VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING) 1131#define HWINTR_BLOCKING (VMCS_INTERRUPTIBILITY_STI_BLOCKING | \ 1132 VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING) 1133 1134static void 1135vmx_inject_nmi(struct vmx *vmx, int vcpu) 1136{ 1137 uint32_t gi, info; 1138 1139 gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); 1140 KASSERT((gi & NMI_BLOCKING) == 0, ("vmx_inject_nmi: invalid guest " 1141 "interruptibility-state %#x", gi)); 1142 1143 info = vmcs_read(VMCS_ENTRY_INTR_INFO); 1144 KASSERT((info & VMCS_INTR_VALID) == 0, ("vmx_inject_nmi: invalid " 1145 "VM-entry interruption information %#x", info)); 1146 1147 /* 1148 * Inject the virtual NMI. The vector must be the NMI IDT entry 1149 * or the VMCS entry check will fail. 
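 * (IDT_NMI is vector 2; the VM-entry consistency checks require an
 * NMI-type event to use that vector.)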
1150 */ 1151 info = IDT_NMI | VMCS_INTR_T_NMI | VMCS_INTR_VALID; 1152 vmcs_write(VMCS_ENTRY_INTR_INFO, info); 1153 1154 VCPU_CTR0(vmx->vm, vcpu, "Injecting vNMI"); 1155 1156 /* Clear the request */ 1157 vm_nmi_clear(vmx->vm, vcpu); 1158} 1159 1160static void 1161vmx_inject_interrupts(struct vmx *vmx, int vcpu, struct vlapic *vlapic) 1162{ 1163 struct vm_exception exc; 1164 int vector, need_nmi_exiting, extint_pending; 1165 uint64_t rflags; 1166 uint32_t gi, info; 1167 1168 if (vm_exception_pending(vmx->vm, vcpu, &exc)) { 1169 KASSERT(exc.vector >= 0 && exc.vector < 32, 1170 ("%s: invalid exception vector %d", __func__, exc.vector)); 1171 1172 info = vmcs_read(VMCS_ENTRY_INTR_INFO); 1173 KASSERT((info & VMCS_INTR_VALID) == 0, ("%s: cannot inject " 1174 "pending exception %d: %#x", __func__, exc.vector, info)); 1175 1176 info = exc.vector | VMCS_INTR_T_HWEXCEPTION | VMCS_INTR_VALID; 1177 if (exc.error_code_valid) { 1178 info |= VMCS_INTR_DEL_ERRCODE; 1179 vmcs_write(VMCS_ENTRY_EXCEPTION_ERROR, exc.error_code); 1180 } 1181 vmcs_write(VMCS_ENTRY_INTR_INFO, info); 1182 } 1183 1184 if (vm_nmi_pending(vmx->vm, vcpu)) { 1185 /* 1186 * If there are no conditions blocking NMI injection then 1187 * inject it directly here otherwise enable "NMI window 1188 * exiting" to inject it as soon as we can. 1189 * 1190 * We also check for STI_BLOCKING because some implementations 1191 * don't allow NMI injection in this case. If we are running 1192 * on a processor that doesn't have this restriction it will 1193 * immediately exit and the NMI will be injected in the 1194 * "NMI window exiting" handler. 1195 */ 1196 need_nmi_exiting = 1; 1197 gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); 1198 if ((gi & (HWINTR_BLOCKING | NMI_BLOCKING)) == 0) { 1199 info = vmcs_read(VMCS_ENTRY_INTR_INFO); 1200 if ((info & VMCS_INTR_VALID) == 0) { 1201 vmx_inject_nmi(vmx, vcpu); 1202 need_nmi_exiting = 0; 1203 } else { 1204 VCPU_CTR1(vmx->vm, vcpu, "Cannot inject NMI " 1205 "due to VM-entry intr info %#x", info); 1206 } 1207 } else { 1208 VCPU_CTR1(vmx->vm, vcpu, "Cannot inject NMI due to " 1209 "Guest Interruptibility-state %#x", gi); 1210 } 1211 1212 if (need_nmi_exiting) 1213 vmx_set_nmi_window_exiting(vmx, vcpu); 1214 } 1215 1216 extint_pending = vm_extint_pending(vmx->vm, vcpu); 1217 1218 if (!extint_pending && virtual_interrupt_delivery) { 1219 vmx_inject_pir(vlapic); 1220 return; 1221 } 1222 1223 /* 1224 * If interrupt-window exiting is already in effect then don't bother 1225 * checking for pending interrupts. This is just an optimization and 1226 * not needed for correctness. 1227 */ 1228 if ((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) != 0) { 1229 VCPU_CTR0(vmx->vm, vcpu, "Skip interrupt injection due to " 1230 "pending int_window_exiting"); 1231 return; 1232 } 1233 1234 if (!extint_pending) { 1235 /* Ask the local apic for a vector to inject */ 1236 if (!vlapic_pending_intr(vlapic, &vector)) 1237 return; 1238 1239 /* 1240 * From the Intel SDM, Volume 3, Section "Maskable 1241 * Hardware Interrupts": 1242 * - maskable interrupt vectors [16,255] can be delivered 1243 * through the local APIC. 1244 */ 1245 KASSERT(vector >= 16 && vector <= 255, 1246 ("invalid vector %d from local APIC", vector)); 1247 } else { 1248 /* Ask the legacy pic for a vector to inject */ 1249 vatpic_pending_intr(vmx->vm, &vector); 1250 1251 /* 1252 * From the Intel SDM, Volume 3, Section "Maskable 1253 * Hardware Interrupts": 1254 * - maskable interrupt vectors [0,255] can be delivered 1255 * through the INTR pin. 
1256 */ 1257 KASSERT(vector >= 0 && vector <= 255, 1258 ("invalid vector %d from INTR", vector)); 1259 } 1260 1261 /* Check RFLAGS.IF and the interruptibility state of the guest */ 1262 rflags = vmcs_read(VMCS_GUEST_RFLAGS); 1263 if ((rflags & PSL_I) == 0) { 1264 VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to " 1265 "rflags %#lx", vector, rflags); 1266 goto cantinject; 1267 } 1268 1269 gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); 1270 if (gi & HWINTR_BLOCKING) { 1271 VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to " 1272 "Guest Interruptibility-state %#x", vector, gi); 1273 goto cantinject; 1274 } 1275 1276 info = vmcs_read(VMCS_ENTRY_INTR_INFO); 1277 if (info & VMCS_INTR_VALID) { 1278 /* 1279 * This is expected and could happen for multiple reasons: 1280 * - A vectoring VM-entry was aborted due to astpending 1281 * - A VM-exit happened during event injection. 1282 * - An exception was injected above. 1283 * - An NMI was injected above or after "NMI window exiting" 1284 */ 1285 VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to " 1286 "VM-entry intr info %#x", vector, info); 1287 goto cantinject; 1288 } 1289 1290 /* Inject the interrupt */ 1291 info = VMCS_INTR_T_HWINTR | VMCS_INTR_VALID; 1292 info |= vector; 1293 vmcs_write(VMCS_ENTRY_INTR_INFO, info); 1294 1295 if (!extint_pending) { 1296 /* Update the Local APIC ISR */ 1297 vlapic_intr_accepted(vlapic, vector); 1298 } else { 1299 vm_extint_clear(vmx->vm, vcpu); 1300 vatpic_intr_accepted(vmx->vm, vector); 1301 1302 /* 1303 * After we accepted the current ExtINT the PIC may 1304 * have posted another one. If that is the case, set 1305 * the Interrupt Window Exiting execution control so 1306 * we can inject that one too. 1307 */ 1308 if (vm_extint_pending(vmx->vm, vcpu)) 1309 vmx_set_int_window_exiting(vmx, vcpu); 1310 } 1311 1312 VCPU_CTR1(vmx->vm, vcpu, "Injecting hwintr at vector %d", vector); 1313 1314 return; 1315 1316cantinject: 1317 /* 1318 * Set the Interrupt Window Exiting execution control so we can inject 1319 * the interrupt as soon as blocking condition goes away. 1320 */ 1321 vmx_set_int_window_exiting(vmx, vcpu); 1322} 1323 1324/* 1325 * If the Virtual NMIs execution control is '1' then the logical processor 1326 * tracks virtual-NMI blocking in the Guest Interruptibility-state field of 1327 * the VMCS. An IRET instruction in VMX non-root operation will remove any 1328 * virtual-NMI blocking. 1329 * 1330 * This unblocking occurs even if the IRET causes a fault. In this case the 1331 * hypervisor needs to restore virtual-NMI blocking before resuming the guest. 
1332 */ 1333static void 1334vmx_restore_nmi_blocking(struct vmx *vmx, int vcpuid) 1335{ 1336 uint32_t gi; 1337 1338 VCPU_CTR0(vmx->vm, vcpuid, "Restore Virtual-NMI blocking"); 1339 gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); 1340 gi |= VMCS_INTERRUPTIBILITY_NMI_BLOCKING; 1341 vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi); 1342} 1343 1344static void 1345vmx_clear_nmi_blocking(struct vmx *vmx, int vcpuid) 1346{ 1347 uint32_t gi; 1348 1349 VCPU_CTR0(vmx->vm, vcpuid, "Clear Virtual-NMI blocking"); 1350 gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); 1351 gi &= ~VMCS_INTERRUPTIBILITY_NMI_BLOCKING; 1352 vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi); 1353} 1354 1355static int 1356vmx_emulate_xsetbv(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) 1357{ 1358 struct vmxctx *vmxctx; 1359 uint64_t xcrval; 1360 const struct xsave_limits *limits; 1361 1362 vmxctx = &vmx->ctx[vcpu]; 1363 limits = vmm_get_xsave_limits(); 1364 1365 /* 1366 * Note that the processor raises a GP# fault on its own if 1367 * xsetbv is executed for CPL != 0, so we do not have to 1368 * emulate that fault here. 1369 */ 1370 1371 /* Only xcr0 is supported. */ 1372 if (vmxctx->guest_rcx != 0) { 1373 vm_inject_gp(vmx->vm, vcpu); 1374 return (HANDLED); 1375 } 1376 1377 /* We only handle xcr0 if both the host and guest have XSAVE enabled. */ 1378 if (!limits->xsave_enabled || !(vmcs_read(VMCS_GUEST_CR4) & CR4_XSAVE)) { 1379 vm_inject_ud(vmx->vm, vcpu); 1380 return (HANDLED); 1381 } 1382 1383 xcrval = vmxctx->guest_rdx << 32 | (vmxctx->guest_rax & 0xffffffff); 1384 if ((xcrval & ~limits->xcr0_allowed) != 0) { 1385 vm_inject_gp(vmx->vm, vcpu); 1386 return (HANDLED); 1387 } 1388 1389 if (!(xcrval & XFEATURE_ENABLED_X87)) { 1390 vm_inject_gp(vmx->vm, vcpu); 1391 return (HANDLED); 1392 } 1393 1394 /* AVX (YMM_Hi128) requires SSE. */ 1395 if (xcrval & XFEATURE_ENABLED_AVX && 1396 (xcrval & XFEATURE_AVX) != XFEATURE_AVX) { 1397 vm_inject_gp(vmx->vm, vcpu); 1398 return (HANDLED); 1399 } 1400 1401 /* 1402 * AVX512 requires base AVX (YMM_Hi128) as well as OpMask, 1403 * ZMM_Hi256, and Hi16_ZMM. 1404 */ 1405 if (xcrval & XFEATURE_AVX512 && 1406 (xcrval & (XFEATURE_AVX512 | XFEATURE_AVX)) != 1407 (XFEATURE_AVX512 | XFEATURE_AVX)) { 1408 vm_inject_gp(vmx->vm, vcpu); 1409 return (HANDLED); 1410 } 1411 1412 /* 1413 * Intel MPX requires both bound register state flags to be 1414 * set. 1415 */ 1416 if (((xcrval & XFEATURE_ENABLED_BNDREGS) != 0) != 1417 ((xcrval & XFEATURE_ENABLED_BNDCSR) != 0)) { 1418 vm_inject_gp(vmx->vm, vcpu); 1419 return (HANDLED); 1420 } 1421 1422 /* 1423 * This runs "inside" vmrun() with the guest's FPU state, so 1424 * modifying xcr0 directly modifies the guest's xcr0, not the 1425 * host's. 1426 */ 1427 load_xcr(0, xcrval); 1428 return (HANDLED); 1429} 1430 1431static int 1432vmx_emulate_cr_access(struct vmx *vmx, int vcpu, uint64_t exitqual) 1433{ 1434 int cr, vmcs_guest_cr, vmcs_shadow_cr; 1435 uint64_t crval, regval, ones_mask, zeros_mask; 1436 const struct vmxctx *vmxctx; 1437 1438 /* We only handle mov to %cr0 or %cr4 at this time */ 1439 if ((exitqual & 0xf0) != 0x00) 1440 return (UNHANDLED); 1441 1442 cr = exitqual & 0xf; 1443 if (cr != 0 && cr != 4) 1444 return (UNHANDLED); 1445 1446 regval = 0; /* silence gcc */ 1447 vmxctx = &vmx->ctx[vcpu]; 1448 1449 /* 1450 * We must use vmcs_write() directly here because vmcs_setreg() will 1451 * call vmclear(vmcs) as a side-effect which we certainly don't want. 
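 * (The VMCS is current and active while this exit is being handled; a
 * vmclear would also reset its launch state and force a full vmlaunch on
 * the next entry.)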
1452 */ 1453 switch ((exitqual >> 8) & 0xf) { 1454 case 0: 1455 regval = vmxctx->guest_rax; 1456 break; 1457 case 1: 1458 regval = vmxctx->guest_rcx; 1459 break; 1460 case 2: 1461 regval = vmxctx->guest_rdx; 1462 break; 1463 case 3: 1464 regval = vmxctx->guest_rbx; 1465 break; 1466 case 4: 1467 regval = vmcs_read(VMCS_GUEST_RSP); 1468 break; 1469 case 5: 1470 regval = vmxctx->guest_rbp; 1471 break; 1472 case 6: 1473 regval = vmxctx->guest_rsi; 1474 break; 1475 case 7: 1476 regval = vmxctx->guest_rdi; 1477 break; 1478 case 8: 1479 regval = vmxctx->guest_r8; 1480 break; 1481 case 9: 1482 regval = vmxctx->guest_r9; 1483 break; 1484 case 10: 1485 regval = vmxctx->guest_r10; 1486 break; 1487 case 11: 1488 regval = vmxctx->guest_r11; 1489 break; 1490 case 12: 1491 regval = vmxctx->guest_r12; 1492 break; 1493 case 13: 1494 regval = vmxctx->guest_r13; 1495 break; 1496 case 14: 1497 regval = vmxctx->guest_r14; 1498 break; 1499 case 15: 1500 regval = vmxctx->guest_r15; 1501 break; 1502 } 1503 1504 if (cr == 0) { 1505 ones_mask = cr0_ones_mask; 1506 zeros_mask = cr0_zeros_mask; 1507 vmcs_guest_cr = VMCS_GUEST_CR0; 1508 vmcs_shadow_cr = VMCS_CR0_SHADOW; 1509 } else { 1510 ones_mask = cr4_ones_mask; 1511 zeros_mask = cr4_zeros_mask; 1512 vmcs_guest_cr = VMCS_GUEST_CR4; 1513 vmcs_shadow_cr = VMCS_CR4_SHADOW; 1514 } 1515 vmcs_write(vmcs_shadow_cr, regval); 1516 1517 crval = regval | ones_mask; 1518 crval &= ~zeros_mask; 1519 vmcs_write(vmcs_guest_cr, crval); 1520 1521 if (cr == 0 && regval & CR0_PG) { 1522 uint64_t efer, entry_ctls; 1523 1524 /* 1525 * If CR0.PG is 1 and EFER.LME is 1 then EFER.LMA and 1526 * the "IA-32e mode guest" bit in VM-entry control must be 1527 * equal. 1528 */ 1529 efer = vmcs_read(VMCS_GUEST_IA32_EFER); 1530 if (efer & EFER_LME) { 1531 efer |= EFER_LMA; 1532 vmcs_write(VMCS_GUEST_IA32_EFER, efer); 1533 entry_ctls = vmcs_read(VMCS_ENTRY_CTLS); 1534 entry_ctls |= VM_ENTRY_GUEST_LMA; 1535 vmcs_write(VMCS_ENTRY_CTLS, entry_ctls); 1536 } 1537 } 1538 1539 return (HANDLED); 1540} 1541 1542static enum vie_cpu_mode 1543vmx_cpu_mode(void) 1544{ 1545 1546 if (vmcs_read(VMCS_GUEST_IA32_EFER) & EFER_LMA) 1547 return (CPU_MODE_64BIT); 1548 else 1549 return (CPU_MODE_COMPATIBILITY); 1550} 1551 1552static enum vie_paging_mode 1553vmx_paging_mode(void) 1554{ 1555 1556 if (!(vmcs_read(VMCS_GUEST_CR0) & CR0_PG)) 1557 return (PAGING_MODE_FLAT); 1558 if (!(vmcs_read(VMCS_GUEST_CR4) & CR4_PAE)) 1559 return (PAGING_MODE_32); 1560 if (vmcs_read(VMCS_GUEST_IA32_EFER) & EFER_LME) 1561 return (PAGING_MODE_64); 1562 else 1563 return (PAGING_MODE_PAE); 1564} 1565 1566static int 1567ept_fault_type(uint64_t ept_qual) 1568{ 1569 int fault_type; 1570 1571 if (ept_qual & EPT_VIOLATION_DATA_WRITE) 1572 fault_type = VM_PROT_WRITE; 1573 else if (ept_qual & EPT_VIOLATION_INST_FETCH) 1574 fault_type = VM_PROT_EXECUTE; 1575 else 1576 fault_type= VM_PROT_READ; 1577 1578 return (fault_type); 1579} 1580 1581static boolean_t 1582ept_emulation_fault(uint64_t ept_qual) 1583{ 1584 int read, write; 1585 1586 /* EPT fault on an instruction fetch doesn't make sense here */ 1587 if (ept_qual & EPT_VIOLATION_INST_FETCH) 1588 return (FALSE); 1589 1590 /* EPT fault must be a read fault or a write fault */ 1591 read = ept_qual & EPT_VIOLATION_DATA_READ ? 1 : 0; 1592 write = ept_qual & EPT_VIOLATION_DATA_WRITE ? 1 : 0; 1593 if ((read | write) == 0) 1594 return (FALSE); 1595 1596 /* 1597 * The EPT violation must have been caused by accessing a 1598 * guest-physical address that is a translation of a guest-linear 1599 * address. 
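 * Violations triggered by the processor's own walk of the guest page
 * tables, or that report no valid guest-linear address, are not candidates
 * for instruction emulation.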
1600 */ 1601 if ((ept_qual & EPT_VIOLATION_GLA_VALID) == 0 || 1602 (ept_qual & EPT_VIOLATION_XLAT_VALID) == 0) { 1603 return (FALSE); 1604 } 1605 1606 return (TRUE); 1607} 1608 1609static __inline int 1610apic_access_virtualization(struct vmx *vmx, int vcpuid) 1611{ 1612 uint32_t proc_ctls2; 1613 1614 proc_ctls2 = vmx->cap[vcpuid].proc_ctls2; 1615 return ((proc_ctls2 & PROCBASED2_VIRTUALIZE_APIC_ACCESSES) ? 1 : 0); 1616} 1617 1618static __inline int 1619x2apic_virtualization(struct vmx *vmx, int vcpuid) 1620{ 1621 uint32_t proc_ctls2; 1622 1623 proc_ctls2 = vmx->cap[vcpuid].proc_ctls2; 1624 return ((proc_ctls2 & PROCBASED2_VIRTUALIZE_X2APIC_MODE) ? 1 : 0); 1625} 1626 1627static int 1628vmx_handle_apic_write(struct vmx *vmx, int vcpuid, struct vlapic *vlapic, 1629 uint64_t qual) 1630{ 1631 int error, handled, offset; 1632 uint32_t *apic_regs, vector; 1633 bool retu; 1634 1635 handled = HANDLED; 1636 offset = APIC_WRITE_OFFSET(qual); 1637 1638 if (!apic_access_virtualization(vmx, vcpuid)) { 1639 /* 1640 * In general there should not be any APIC write VM-exits 1641 * unless APIC-access virtualization is enabled. 1642 * 1643 * However self-IPI virtualization can legitimately trigger 1644 * an APIC-write VM-exit so treat it specially. 1645 */ 1646 if (x2apic_virtualization(vmx, vcpuid) && 1647 offset == APIC_OFFSET_SELF_IPI) { 1648 apic_regs = (uint32_t *)(vlapic->apic_page); 1649 vector = apic_regs[APIC_OFFSET_SELF_IPI / 4]; 1650 vlapic_self_ipi_handler(vlapic, vector); 1651 return (HANDLED); 1652 } else 1653 return (UNHANDLED); 1654 } 1655 1656 switch (offset) { 1657 case APIC_OFFSET_ID: 1658 vlapic_id_write_handler(vlapic); 1659 break; 1660 case APIC_OFFSET_LDR: 1661 vlapic_ldr_write_handler(vlapic); 1662 break; 1663 case APIC_OFFSET_DFR: 1664 vlapic_dfr_write_handler(vlapic); 1665 break; 1666 case APIC_OFFSET_SVR: 1667 vlapic_svr_write_handler(vlapic); 1668 break; 1669 case APIC_OFFSET_ESR: 1670 vlapic_esr_write_handler(vlapic); 1671 break; 1672 case APIC_OFFSET_ICR_LOW: 1673 retu = false; 1674 error = vlapic_icrlo_write_handler(vlapic, &retu); 1675 if (error != 0 || retu) 1676 handled = UNHANDLED; 1677 break; 1678 case APIC_OFFSET_CMCI_LVT: 1679 case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT: 1680 vlapic_lvt_write_handler(vlapic, offset); 1681 break; 1682 case APIC_OFFSET_TIMER_ICR: 1683 vlapic_icrtmr_write_handler(vlapic); 1684 break; 1685 case APIC_OFFSET_TIMER_DCR: 1686 vlapic_dcr_write_handler(vlapic); 1687 break; 1688 default: 1689 handled = UNHANDLED; 1690 break; 1691 } 1692 return (handled); 1693} 1694 1695static bool 1696apic_access_fault(struct vmx *vmx, int vcpuid, uint64_t gpa) 1697{ 1698 1699 if (apic_access_virtualization(vmx, vcpuid) && 1700 (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE)) 1701 return (true); 1702 else 1703 return (false); 1704} 1705 1706static int 1707vmx_handle_apic_access(struct vmx *vmx, int vcpuid, struct vm_exit *vmexit) 1708{ 1709 uint64_t qual; 1710 int access_type, offset, allowed; 1711 1712 if (!apic_access_virtualization(vmx, vcpuid)) 1713 return (UNHANDLED); 1714 1715 qual = vmexit->u.vmx.exit_qualification; 1716 access_type = APIC_ACCESS_TYPE(qual); 1717 offset = APIC_ACCESS_OFFSET(qual); 1718 1719 allowed = 0; 1720 if (access_type == 0) { 1721 /* 1722 * Read data access to the following registers is expected. 
1723 */ 1724 switch (offset) { 1725 case APIC_OFFSET_APR: 1726 case APIC_OFFSET_PPR: 1727 case APIC_OFFSET_RRR: 1728 case APIC_OFFSET_CMCI_LVT: 1729 case APIC_OFFSET_TIMER_CCR: 1730 allowed = 1; 1731 break; 1732 default: 1733 break; 1734 } 1735 } else if (access_type == 1) { 1736 /* 1737 * Write data access to the following registers is expected. 1738 */ 1739 switch (offset) { 1740 case APIC_OFFSET_VER: 1741 case APIC_OFFSET_APR: 1742 case APIC_OFFSET_PPR: 1743 case APIC_OFFSET_RRR: 1744 case APIC_OFFSET_ISR0 ... APIC_OFFSET_ISR7: 1745 case APIC_OFFSET_TMR0 ... APIC_OFFSET_TMR7: 1746 case APIC_OFFSET_IRR0 ... APIC_OFFSET_IRR7: 1747 case APIC_OFFSET_CMCI_LVT: 1748 case APIC_OFFSET_TIMER_CCR: 1749 allowed = 1; 1750 break; 1751 default: 1752 break; 1753 } 1754 } 1755 1756 if (allowed) { 1757 vmexit->exitcode = VM_EXITCODE_INST_EMUL; 1758 vmexit->u.inst_emul.gpa = DEFAULT_APIC_BASE + offset; 1759 vmexit->u.inst_emul.gla = VIE_INVALID_GLA; 1760 vmexit->u.inst_emul.cr3 = vmcs_guest_cr3(); 1761 vmexit->u.inst_emul.cpu_mode = vmx_cpu_mode(); 1762 vmexit->u.inst_emul.paging_mode = vmx_paging_mode(); 1763 } 1764 1765 /* 1766 * Regardless of whether the APIC-access is allowed this handler 1767 * always returns UNHANDLED: 1768 * - if the access is allowed then it is handled by emulating the 1769 * instruction that caused the VM-exit (outside the critical section) 1770 * - if the access is not allowed then it will be converted to an 1771 * exitcode of VM_EXITCODE_VMX and will be dealt with in userland. 1772 */ 1773 return (UNHANDLED); 1774} 1775 1776static int 1777vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) 1778{ 1779 int error, handled; 1780 struct vmxctx *vmxctx; 1781 struct vlapic *vlapic; 1782 uint32_t eax, ecx, edx, idtvec_info, idtvec_err, intr_info, reason; 1783 uint64_t qual, gpa; 1784 bool retu; 1785 1786 CTASSERT((PINBASED_CTLS_ONE_SETTING & PINBASED_VIRTUAL_NMI) != 0); 1787 CTASSERT((PINBASED_CTLS_ONE_SETTING & PINBASED_NMI_EXITING) != 0); 1788 1789 handled = UNHANDLED; 1790 vmxctx = &vmx->ctx[vcpu]; 1791 1792 qual = vmexit->u.vmx.exit_qualification; 1793 reason = vmexit->u.vmx.exit_reason; 1794 vmexit->exitcode = VM_EXITCODE_BOGUS; 1795 1796 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_COUNT, 1); 1797 1798 /* 1799 * VM exits that could be triggered during event injection on the 1800 * previous VM entry need to be handled specially by re-injecting 1801 * the event. 1802 * 1803 * See "Information for VM Exits During Event Delivery" in Intel SDM 1804 * for details. 1805 */ 1806 switch (reason) { 1807 case EXIT_REASON_EPT_FAULT: 1808 case EXIT_REASON_EPT_MISCONFIG: 1809 case EXIT_REASON_APIC_ACCESS: 1810 case EXIT_REASON_TASK_SWITCH: 1811 case EXIT_REASON_EXCEPTION: 1812 idtvec_info = vmcs_idt_vectoring_info(); 1813 if (idtvec_info & VMCS_IDT_VEC_VALID) { 1814 idtvec_info &= ~(1 << 12); /* clear undefined bit */ 1815 vmcs_write(VMCS_ENTRY_INTR_INFO, idtvec_info); 1816 if (idtvec_info & VMCS_IDT_VEC_ERRCODE_VALID) { 1817 idtvec_err = vmcs_idt_vectoring_err(); 1818 vmcs_write(VMCS_ENTRY_EXCEPTION_ERROR, 1819 idtvec_err); 1820 } 1821 /* 1822 * If 'virtual NMIs' are being used and the VM-exit 1823 * happened while injecting an NMI during the previous 1824 * VM-entry, then clear "blocking by NMI" in the Guest 1825 * Interruptibility-state. 
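 * (Delivery of the injected NMI set the blocking bit; it has to be cleared
 * again because the NMI is re-injected on the next VM entry.)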
1826 */ 1827 if ((idtvec_info & VMCS_INTR_T_MASK) == 1828 VMCS_INTR_T_NMI) { 1829 vmx_clear_nmi_blocking(vmx, vcpu); 1830 } 1831 vmcs_write(VMCS_ENTRY_INST_LENGTH, vmexit->inst_length); 1832 } 1833 default: 1834 idtvec_info = 0; 1835 break; 1836 } 1837 1838 switch (reason) { 1839 case EXIT_REASON_CR_ACCESS: 1840 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CR_ACCESS, 1); 1841 handled = vmx_emulate_cr_access(vmx, vcpu, qual); 1842 break; 1843 case EXIT_REASON_RDMSR: 1844 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_RDMSR, 1); 1845 retu = false; 1846 ecx = vmxctx->guest_rcx; 1847 VCPU_CTR1(vmx->vm, vcpu, "rdmsr 0x%08x", ecx); 1848 error = emulate_rdmsr(vmx->vm, vcpu, ecx, &retu); 1849 if (error) { 1850 vmexit->exitcode = VM_EXITCODE_RDMSR; 1851 vmexit->u.msr.code = ecx; 1852 } else if (!retu) { 1853 handled = HANDLED; 1854 } else { 1855 /* Return to userspace with a valid exitcode */ 1856 KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS, 1857 ("emulate_wrmsr retu with bogus exitcode")); 1858 } 1859 break; 1860 case EXIT_REASON_WRMSR: 1861 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_WRMSR, 1); 1862 retu = false; 1863 eax = vmxctx->guest_rax; 1864 ecx = vmxctx->guest_rcx; 1865 edx = vmxctx->guest_rdx; 1866 VCPU_CTR2(vmx->vm, vcpu, "wrmsr 0x%08x value 0x%016lx", 1867 ecx, (uint64_t)edx << 32 | eax); 1868 error = emulate_wrmsr(vmx->vm, vcpu, ecx, 1869 (uint64_t)edx << 32 | eax, &retu); 1870 if (error) { 1871 vmexit->exitcode = VM_EXITCODE_WRMSR; 1872 vmexit->u.msr.code = ecx; 1873 vmexit->u.msr.wval = (uint64_t)edx << 32 | eax; 1874 } else if (!retu) { 1875 handled = HANDLED; 1876 } else { 1877 /* Return to userspace with a valid exitcode */ 1878 KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS, 1879 ("emulate_wrmsr retu with bogus exitcode")); 1880 } 1881 break; 1882 case EXIT_REASON_HLT: 1883 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_HLT, 1); 1884 vmexit->exitcode = VM_EXITCODE_HLT; 1885 vmexit->u.hlt.rflags = vmcs_read(VMCS_GUEST_RFLAGS); 1886 break; 1887 case EXIT_REASON_MTF: 1888 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_MTRAP, 1); 1889 vmexit->exitcode = VM_EXITCODE_MTRAP; 1890 break; 1891 case EXIT_REASON_PAUSE: 1892 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_PAUSE, 1); 1893 vmexit->exitcode = VM_EXITCODE_PAUSE; 1894 break; 1895 case EXIT_REASON_INTR_WINDOW: 1896 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INTR_WINDOW, 1); 1897 vmx_clear_int_window_exiting(vmx, vcpu); 1898 return (1); 1899 case EXIT_REASON_EXT_INTR: 1900 /* 1901 * External interrupts serve only to cause VM exits and allow 1902 * the host interrupt handler to run. 1903 * 1904 * If this external interrupt triggers a virtual interrupt 1905 * to a VM, then that state will be recorded by the 1906 * host interrupt handler in the VM's softc. We will inject 1907 * this virtual interrupt during the subsequent VM enter. 1908 */ 1909 intr_info = vmcs_read(VMCS_EXIT_INTR_INFO); 1910 1911 /* 1912 * XXX: Ignore this exit if VMCS_INTR_VALID is not set. 1913 * This appears to be a bug in VMware Fusion? 1914 */ 1915 if (!(intr_info & VMCS_INTR_VALID)) 1916 return (1); 1917 KASSERT((intr_info & VMCS_INTR_VALID) != 0 && 1918 (intr_info & VMCS_INTR_T_MASK) == VMCS_INTR_T_HWINTR, 1919 ("VM exit interruption info invalid: %#x", intr_info)); 1920 vmx_trigger_hostintr(intr_info & 0xff); 1921 1922 /* 1923 * This is special. We want to treat this as an 'handled' 1924 * VM-exit but not increment the instruction pointer. 
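 * Returning 1 here bypasses the common code at the bottom of this function
 * that advances the guest %rip for handled exits.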
1925 */ 1926 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EXTINT, 1); 1927 return (1); 1928 case EXIT_REASON_NMI_WINDOW: 1929 /* Exit to allow the pending virtual NMI to be injected */ 1930 if (vm_nmi_pending(vmx->vm, vcpu)) 1931 vmx_inject_nmi(vmx, vcpu); 1932 vmx_clear_nmi_window_exiting(vmx, vcpu); 1933 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_NMI_WINDOW, 1); 1934 return (1); 1935 case EXIT_REASON_INOUT: 1936 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INOUT, 1); 1937 vmexit->exitcode = VM_EXITCODE_INOUT; 1938 vmexit->u.inout.bytes = (qual & 0x7) + 1; 1939 vmexit->u.inout.in = (qual & 0x8) ? 1 : 0; 1940 vmexit->u.inout.string = (qual & 0x10) ? 1 : 0; 1941 vmexit->u.inout.rep = (qual & 0x20) ? 1 : 0; 1942 vmexit->u.inout.port = (uint16_t)(qual >> 16); 1943 vmexit->u.inout.eax = (uint32_t)(vmxctx->guest_rax); 1944 error = emulate_ioport(vmx->vm, vcpu, vmexit); 1945 if (error == 0) { 1946 handled = 1; 1947 vmxctx->guest_rax = vmexit->u.inout.eax; 1948 } 1949 break; 1950 case EXIT_REASON_CPUID: 1951 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CPUID, 1); 1952 handled = vmx_handle_cpuid(vmx->vm, vcpu, vmxctx); 1953 break; 1954 case EXIT_REASON_EXCEPTION: 1955 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EXCEPTION, 1); 1956 intr_info = vmcs_read(VMCS_EXIT_INTR_INFO); 1957 KASSERT((intr_info & VMCS_INTR_VALID) != 0, 1958 ("VM exit interruption info invalid: %#x", intr_info)); 1959 1960 /* 1961 * If Virtual NMIs control is 1 and the VM-exit is due to a 1962 * fault encountered during the execution of IRET then we must 1963 * restore the state of "virtual-NMI blocking" before resuming 1964 * the guest. 1965 * 1966 * See "Resuming Guest Software after Handling an Exception". 1967 */ 1968 if ((idtvec_info & VMCS_IDT_VEC_VALID) == 0 && 1969 (intr_info & 0xff) != IDT_DF && 1970 (intr_info & EXIT_QUAL_NMIUDTI) != 0) 1971 vmx_restore_nmi_blocking(vmx, vcpu); 1972 1973 /* 1974 * The NMI has already been handled in vmx_exit_handle_nmi(). 1975 */ 1976 if ((intr_info & VMCS_INTR_T_MASK) == VMCS_INTR_T_NMI) 1977 return (1); 1978 break; 1979 case EXIT_REASON_EPT_FAULT: 1980 /* 1981 * If 'gpa' lies within the address space allocated to 1982 * memory then this must be a nested page fault otherwise 1983 * this must be an instruction that accesses MMIO space. 1984 */ 1985 gpa = vmcs_gpa(); 1986 if (vm_mem_allocated(vmx->vm, gpa) || 1987 apic_access_fault(vmx, vcpu, gpa)) { 1988 vmexit->exitcode = VM_EXITCODE_PAGING; 1989 vmexit->u.paging.gpa = gpa; 1990 vmexit->u.paging.fault_type = ept_fault_type(qual); 1991 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_NESTED_FAULT, 1); 1992 } else if (ept_emulation_fault(qual)) { 1993 vmexit->exitcode = VM_EXITCODE_INST_EMUL; 1994 vmexit->u.inst_emul.gpa = gpa; 1995 vmexit->u.inst_emul.gla = vmcs_gla(); 1996 vmexit->u.inst_emul.cr3 = vmcs_guest_cr3(); 1997 vmexit->u.inst_emul.cpu_mode = vmx_cpu_mode(); 1998 vmexit->u.inst_emul.paging_mode = vmx_paging_mode(); 1999 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INST_EMUL, 1); 2000 } 2001 /* 2002 * If Virtual NMIs control is 1 and the VM-exit is due to an 2003 * EPT fault during the execution of IRET then we must restore 2004 * the state of "virtual-NMI blocking" before resuming. 2005 * 2006 * See description of "NMI unblocking due to IRET" in 2007 * "Exit Qualification for EPT Violations". 
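		 * The "iret" in the guest's NMI handler drops virtual-NMI
		 * blocking before it can fault.  If that "iret" then takes an
		 * EPT violation the unblocking must be undone so that the
		 * "iret" can be restarted with NMI blocking still in effect;
		 * bit 12 of the exit qualification reports this condition.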
2008 */ 2009 if ((idtvec_info & VMCS_IDT_VEC_VALID) == 0 && 2010 (qual & EXIT_QUAL_NMIUDTI) != 0) 2011 vmx_restore_nmi_blocking(vmx, vcpu); 2012 break; 2013 case EXIT_REASON_VIRTUALIZED_EOI: 2014 vmexit->exitcode = VM_EXITCODE_IOAPIC_EOI; 2015 vmexit->u.ioapic_eoi.vector = qual & 0xFF; 2016 vmexit->inst_length = 0; /* trap-like */ 2017 break; 2018 case EXIT_REASON_APIC_ACCESS: 2019 handled = vmx_handle_apic_access(vmx, vcpu, vmexit); 2020 break; 2021 case EXIT_REASON_APIC_WRITE: 2022 /* 2023 * APIC-write VM exit is trap-like so the %rip is already 2024 * pointing to the next instruction. 2025 */ 2026 vmexit->inst_length = 0; 2027 vlapic = vm_lapic(vmx->vm, vcpu); 2028 handled = vmx_handle_apic_write(vmx, vcpu, vlapic, qual); 2029 break; 2030 case EXIT_REASON_XSETBV: 2031 handled = vmx_emulate_xsetbv(vmx, vcpu, vmexit); 2032 break; 2033 default: 2034 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_UNKNOWN, 1); 2035 break; 2036 } 2037 2038 if (handled) { 2039 /* 2040 * It is possible that control is returned to userland 2041 * even though we were able to handle the VM exit in the 2042 * kernel. 2043 * 2044 * In such a case we want to make sure that the userland 2045 * restarts guest execution at the instruction *after* 2046 * the one we just processed. Therefore we update the 2047 * guest rip in the VMCS and in 'vmexit'. 2048 */ 2049 vmexit->rip += vmexit->inst_length; 2050 vmexit->inst_length = 0; 2051 vmcs_write(VMCS_GUEST_RIP, vmexit->rip); 2052 } else { 2053 if (vmexit->exitcode == VM_EXITCODE_BOGUS) { 2054 /* 2055 * If this VM exit was not claimed by anybody then 2056 * treat it as a generic VMX exit. 2057 */ 2058 vmexit->exitcode = VM_EXITCODE_VMX; 2059 vmexit->u.vmx.status = VM_SUCCESS; 2060 vmexit->u.vmx.inst_type = 0; 2061 vmexit->u.vmx.inst_error = 0; 2062 } else { 2063 /* 2064 * The exitcode and collateral have been populated. 2065 * The VM exit will be processed further in userland. 
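			 * vmx_run() will drop out of its inner loop and the
			 * exitcode set above tells the layers above (vmm(4)
			 * and ultimately bhyve(8)) what remains to be done
			 * before this vcpu can be resumed.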
2066 */ 2067 } 2068 } 2069 return (handled); 2070} 2071 2072static __inline int 2073vmx_exit_astpending(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) 2074{ 2075 2076 vmexit->rip = vmcs_guest_rip(); 2077 vmexit->inst_length = 0; 2078 vmexit->exitcode = VM_EXITCODE_BOGUS; 2079 vmx_astpending_trace(vmx, vcpu, vmexit->rip); 2080 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_ASTPENDING, 1); 2081 2082 return (HANDLED); 2083} 2084 2085static __inline int 2086vmx_exit_rendezvous(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) 2087{ 2088 2089 vmexit->rip = vmcs_guest_rip(); 2090 vmexit->inst_length = 0; 2091 vmexit->exitcode = VM_EXITCODE_RENDEZVOUS; 2092 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_RENDEZVOUS, 1); 2093 2094 return (UNHANDLED); 2095} 2096 2097static __inline int 2098vmx_exit_inst_error(struct vmxctx *vmxctx, int rc, struct vm_exit *vmexit) 2099{ 2100 2101 KASSERT(vmxctx->inst_fail_status != VM_SUCCESS, 2102 ("vmx_exit_inst_error: invalid inst_fail_status %d", 2103 vmxctx->inst_fail_status)); 2104 2105 vmexit->inst_length = 0; 2106 vmexit->exitcode = VM_EXITCODE_VMX; 2107 vmexit->u.vmx.status = vmxctx->inst_fail_status; 2108 vmexit->u.vmx.inst_error = vmcs_instruction_error(); 2109 vmexit->u.vmx.exit_reason = ~0; 2110 vmexit->u.vmx.exit_qualification = ~0; 2111 2112 switch (rc) { 2113 case VMX_VMRESUME_ERROR: 2114 case VMX_VMLAUNCH_ERROR: 2115 case VMX_INVEPT_ERROR: 2116 vmexit->u.vmx.inst_type = rc; 2117 break; 2118 default: 2119 panic("vm_exit_inst_error: vmx_enter_guest returned %d", rc); 2120 } 2121 2122 return (UNHANDLED); 2123} 2124 2125/* 2126 * If the NMI-exiting VM execution control is set to '1' then an NMI in 2127 * non-root operation causes a VM-exit. NMI blocking is in effect so it is 2128 * sufficient to simply vector to the NMI handler via a software interrupt. 2129 * However, this must be done before maskable interrupts are enabled 2130 * otherwise the "iret" issued by an interrupt handler will incorrectly 2131 * clear NMI blocking. 
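 *
 * An NMI that causes a VM exit leaves hardware NMI blocking in effect on the
 * host until the next "iret", so the "int $2" below runs the host NMI handler
 * while further NMIs are still blocked and it cannot be re-entered.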
2132 */ 2133static __inline void 2134vmx_exit_handle_nmi(struct vmx *vmx, int vcpuid, struct vm_exit *vmexit) 2135{ 2136 uint32_t intr_info; 2137 2138 KASSERT((read_rflags() & PSL_I) == 0, ("interrupts enabled")); 2139 2140 if (vmexit->u.vmx.exit_reason != EXIT_REASON_EXCEPTION) 2141 return; 2142 2143 intr_info = vmcs_read(VMCS_EXIT_INTR_INFO); 2144 KASSERT((intr_info & VMCS_INTR_VALID) != 0, 2145 ("VM exit interruption info invalid: %#x", intr_info)); 2146 2147 if ((intr_info & VMCS_INTR_T_MASK) == VMCS_INTR_T_NMI) { 2148 KASSERT((intr_info & 0xff) == IDT_NMI, ("VM exit due " 2149 "to NMI has invalid vector: %#x", intr_info)); 2150 VCPU_CTR0(vmx->vm, vcpuid, "Vectoring to NMI handler"); 2151 __asm __volatile("int $2"); 2152 } 2153} 2154 2155static int 2156vmx_run(void *arg, int vcpu, register_t startrip, pmap_t pmap, 2157 void *rendezvous_cookie, void *suspend_cookie) 2158{ 2159 int rc, handled, launched; 2160 struct vmx *vmx; 2161 struct vm *vm; 2162 struct vmxctx *vmxctx; 2163 struct vmcs *vmcs; 2164 struct vm_exit *vmexit; 2165 struct vlapic *vlapic; 2166 uint64_t rip; 2167 uint32_t exit_reason; 2168 2169 vmx = arg; 2170 vm = vmx->vm; 2171 vmcs = &vmx->vmcs[vcpu]; 2172 vmxctx = &vmx->ctx[vcpu]; 2173 vlapic = vm_lapic(vm, vcpu); 2174 vmexit = vm_exitinfo(vm, vcpu); 2175 launched = 0; 2176 2177 KASSERT(vmxctx->pmap == pmap, 2178 ("pmap %p different than ctx pmap %p", pmap, vmxctx->pmap)); 2179 2180 VMPTRLD(vmcs); 2181 2182 /* 2183 * XXX 2184 * We do this every time because we may setup the virtual machine 2185 * from a different process than the one that actually runs it. 2186 * 2187 * If the life of a virtual machine was spent entirely in the context 2188 * of a single process we could do this once in vmx_vminit(). 2189 */ 2190 vmcs_write(VMCS_HOST_CR3, rcr3()); 2191 2192 vmcs_write(VMCS_GUEST_RIP, startrip); 2193 vmx_set_pcpu_defaults(vmx, vcpu, pmap); 2194 do { 2195 /* 2196 * Interrupts are disabled from this point on until the 2197 * guest starts executing. This is done for the following 2198 * reasons: 2199 * 2200 * If an AST is asserted on this thread after the check below, 2201 * then the IPI_AST notification will not be lost, because it 2202 * will cause a VM exit due to external interrupt as soon as 2203 * the guest state is loaded. 2204 * 2205 * A posted interrupt after 'vmx_inject_interrupts()' will 2206 * not be "lost" because it will be held pending in the host 2207 * APIC because interrupts are disabled. The pending interrupt 2208 * will be recognized as soon as the guest state is loaded. 2209 * 2210 * The same reasoning applies to the IPI generated by 2211 * pmap_invalidate_ept(). 
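		 * In short: a notification either arrives before
		 * disable_intr() and is observed by the checks below, or it
		 * arrives afterwards and forces a VM exit (or is left pending
		 * in the host APIC) as soon as the guest is entered, so it
		 * cannot be lost.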
2212 */ 2213 disable_intr(); 2214 if (vcpu_suspended(suspend_cookie)) { 2215 enable_intr(); 2216 vm_exit_suspended(vmx->vm, vcpu, vmcs_guest_rip()); 2217 handled = UNHANDLED; 2218 break; 2219 } 2220 2221 if (vcpu_rendezvous_pending(rendezvous_cookie)) { 2222 enable_intr(); 2223 handled = vmx_exit_rendezvous(vmx, vcpu, vmexit); 2224 break; 2225 } 2226 2227 if (curthread->td_flags & (TDF_ASTPENDING | TDF_NEEDRESCHED)) { 2228 enable_intr(); 2229 handled = vmx_exit_astpending(vmx, vcpu, vmexit); 2230 break; 2231 } 2232 2233 vmx_inject_interrupts(vmx, vcpu, vlapic); 2234 vmx_run_trace(vmx, vcpu); 2235 rc = vmx_enter_guest(vmxctx, vmx, launched); 2236 2237 /* Collect some information for VM exit processing */ 2238 vmexit->rip = rip = vmcs_guest_rip(); 2239 vmexit->inst_length = vmexit_instruction_length(); 2240 vmexit->u.vmx.exit_reason = exit_reason = vmcs_exit_reason(); 2241 vmexit->u.vmx.exit_qualification = vmcs_exit_qualification(); 2242 2243 if (rc == VMX_GUEST_VMEXIT) { 2244 vmx_exit_handle_nmi(vmx, vcpu, vmexit); 2245 enable_intr(); 2246 handled = vmx_exit_process(vmx, vcpu, vmexit); 2247 } else { 2248 enable_intr(); 2249 handled = vmx_exit_inst_error(vmxctx, rc, vmexit); 2250 } 2251 launched = 1; 2252 vmx_exit_trace(vmx, vcpu, rip, exit_reason, handled); 2253 } while (handled); 2254 2255 /* 2256 * If a VM exit has been handled then the exitcode must be BOGUS 2257 * If a VM exit is not handled then the exitcode must not be BOGUS 2258 */ 2259 if ((handled && vmexit->exitcode != VM_EXITCODE_BOGUS) || 2260 (!handled && vmexit->exitcode == VM_EXITCODE_BOGUS)) { 2261 panic("Mismatch between handled (%d) and exitcode (%d)", 2262 handled, vmexit->exitcode); 2263 } 2264 2265 if (!handled) 2266 vmm_stat_incr(vm, vcpu, VMEXIT_USERSPACE, 1); 2267 2268 VCPU_CTR1(vm, vcpu, "returning from vmx_run: exitcode %d", 2269 vmexit->exitcode); 2270 2271 VMCLEAR(vmcs); 2272 return (0); 2273} 2274 2275static void 2276vmx_vmcleanup(void *arg) 2277{ 2278 int i; 2279 struct vmx *vmx = arg; 2280 2281 if (apic_access_virtualization(vmx, 0)) 2282 vm_unmap_mmio(vmx->vm, DEFAULT_APIC_BASE, PAGE_SIZE); 2283 2284 for (i = 0; i < VM_MAXCPU; i++) 2285 vpid_free(vmx->state[i].vpid); 2286 2287 free(vmx, M_VMX); 2288 2289 return; 2290} 2291 2292static register_t * 2293vmxctx_regptr(struct vmxctx *vmxctx, int reg) 2294{ 2295 2296 switch (reg) { 2297 case VM_REG_GUEST_RAX: 2298 return (&vmxctx->guest_rax); 2299 case VM_REG_GUEST_RBX: 2300 return (&vmxctx->guest_rbx); 2301 case VM_REG_GUEST_RCX: 2302 return (&vmxctx->guest_rcx); 2303 case VM_REG_GUEST_RDX: 2304 return (&vmxctx->guest_rdx); 2305 case VM_REG_GUEST_RSI: 2306 return (&vmxctx->guest_rsi); 2307 case VM_REG_GUEST_RDI: 2308 return (&vmxctx->guest_rdi); 2309 case VM_REG_GUEST_RBP: 2310 return (&vmxctx->guest_rbp); 2311 case VM_REG_GUEST_R8: 2312 return (&vmxctx->guest_r8); 2313 case VM_REG_GUEST_R9: 2314 return (&vmxctx->guest_r9); 2315 case VM_REG_GUEST_R10: 2316 return (&vmxctx->guest_r10); 2317 case VM_REG_GUEST_R11: 2318 return (&vmxctx->guest_r11); 2319 case VM_REG_GUEST_R12: 2320 return (&vmxctx->guest_r12); 2321 case VM_REG_GUEST_R13: 2322 return (&vmxctx->guest_r13); 2323 case VM_REG_GUEST_R14: 2324 return (&vmxctx->guest_r14); 2325 case VM_REG_GUEST_R15: 2326 return (&vmxctx->guest_r15); 2327 default: 2328 break; 2329 } 2330 return (NULL); 2331} 2332 2333static int 2334vmxctx_getreg(struct vmxctx *vmxctx, int reg, uint64_t *retval) 2335{ 2336 register_t *regp; 2337 2338 if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) { 2339 *retval = *regp; 2340 return (0); 
2341 } else 2342 return (EINVAL); 2343} 2344 2345static int 2346vmxctx_setreg(struct vmxctx *vmxctx, int reg, uint64_t val) 2347{ 2348 register_t *regp; 2349 2350 if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) { 2351 *regp = val; 2352 return (0); 2353 } else 2354 return (EINVAL); 2355} 2356 2357static int 2358vmx_shadow_reg(int reg) 2359{ 2360 int shreg; 2361 2362 shreg = -1; 2363 2364 switch (reg) { 2365 case VM_REG_GUEST_CR0: 2366 shreg = VMCS_CR0_SHADOW; 2367 break; 2368 case VM_REG_GUEST_CR4: 2369 shreg = VMCS_CR4_SHADOW; 2370 break; 2371 default: 2372 break; 2373 } 2374 2375 return (shreg); 2376} 2377 2378static int 2379vmx_getreg(void *arg, int vcpu, int reg, uint64_t *retval) 2380{ 2381 int running, hostcpu; 2382 struct vmx *vmx = arg; 2383 2384 running = vcpu_is_running(vmx->vm, vcpu, &hostcpu); 2385 if (running && hostcpu != curcpu) 2386 panic("vmx_getreg: %s%d is running", vm_name(vmx->vm), vcpu); 2387 2388 if (vmxctx_getreg(&vmx->ctx[vcpu], reg, retval) == 0) 2389 return (0); 2390 2391 return (vmcs_getreg(&vmx->vmcs[vcpu], running, reg, retval)); 2392} 2393 2394static int 2395vmx_setreg(void *arg, int vcpu, int reg, uint64_t val) 2396{ 2397 int error, hostcpu, running, shadow; 2398 uint64_t ctls; 2399 struct vmx *vmx = arg; 2400 2401 running = vcpu_is_running(vmx->vm, vcpu, &hostcpu); 2402 if (running && hostcpu != curcpu) 2403 panic("vmx_setreg: %s%d is running", vm_name(vmx->vm), vcpu); 2404 2405 if (vmxctx_setreg(&vmx->ctx[vcpu], reg, val) == 0) 2406 return (0); 2407 2408 error = vmcs_setreg(&vmx->vmcs[vcpu], running, reg, val); 2409 2410 if (error == 0) { 2411 /* 2412 * If the "load EFER" VM-entry control is 1 then the 2413 * value of EFER.LMA must be identical to "IA-32e mode guest" 2414 * bit in the VM-entry control. 2415 */ 2416 if ((entry_ctls & VM_ENTRY_LOAD_EFER) != 0 && 2417 (reg == VM_REG_GUEST_EFER)) { 2418 vmcs_getreg(&vmx->vmcs[vcpu], running, 2419 VMCS_IDENT(VMCS_ENTRY_CTLS), &ctls); 2420 if (val & EFER_LMA) 2421 ctls |= VM_ENTRY_GUEST_LMA; 2422 else 2423 ctls &= ~VM_ENTRY_GUEST_LMA; 2424 vmcs_setreg(&vmx->vmcs[vcpu], running, 2425 VMCS_IDENT(VMCS_ENTRY_CTLS), ctls); 2426 } 2427 2428 shadow = vmx_shadow_reg(reg); 2429 if (shadow > 0) { 2430 /* 2431 * Store the unmodified value in the shadow 2432 */ 2433 error = vmcs_setreg(&vmx->vmcs[vcpu], running, 2434 VMCS_IDENT(shadow), val); 2435 } 2436 } 2437 2438 return (error); 2439} 2440 2441static int 2442vmx_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc) 2443{ 2444 int hostcpu, running; 2445 struct vmx *vmx = arg; 2446 2447 running = vcpu_is_running(vmx->vm, vcpu, &hostcpu); 2448 if (running && hostcpu != curcpu) 2449 panic("vmx_getdesc: %s%d is running", vm_name(vmx->vm), vcpu); 2450 2451 return (vmcs_getdesc(&vmx->vmcs[vcpu], running, reg, desc)); 2452} 2453 2454static int 2455vmx_setdesc(void *arg, int vcpu, int reg, struct seg_desc *desc) 2456{ 2457 int hostcpu, running; 2458 struct vmx *vmx = arg; 2459 2460 running = vcpu_is_running(vmx->vm, vcpu, &hostcpu); 2461 if (running && hostcpu != curcpu) 2462 panic("vmx_setdesc: %s%d is running", vm_name(vmx->vm), vcpu); 2463 2464 return (vmcs_setdesc(&vmx->vmcs[vcpu], running, reg, desc)); 2465} 2466 2467static int 2468vmx_getcap(void *arg, int vcpu, int type, int *retval) 2469{ 2470 struct vmx *vmx = arg; 2471 int vcap; 2472 int ret; 2473 2474 ret = ENOENT; 2475 2476 vcap = vmx->cap[vcpu].set; 2477 2478 switch (type) { 2479 case VM_CAP_HALT_EXIT: 2480 if (cap_halt_exit) 2481 ret = 0; 2482 break; 2483 case VM_CAP_PAUSE_EXIT: 2484 if (cap_pause_exit) 2485 
ret = 0; 2486 break; 2487 case VM_CAP_MTRAP_EXIT: 2488 if (cap_monitor_trap) 2489 ret = 0; 2490 break; 2491 case VM_CAP_UNRESTRICTED_GUEST: 2492 if (cap_unrestricted_guest) 2493 ret = 0; 2494 break; 2495 case VM_CAP_ENABLE_INVPCID: 2496 if (cap_invpcid) 2497 ret = 0; 2498 break; 2499 default: 2500 break; 2501 } 2502 2503 if (ret == 0) 2504 *retval = (vcap & (1 << type)) ? 1 : 0; 2505 2506 return (ret); 2507} 2508 2509static int 2510vmx_setcap(void *arg, int vcpu, int type, int val) 2511{ 2512 struct vmx *vmx = arg; 2513 struct vmcs *vmcs = &vmx->vmcs[vcpu]; 2514 uint32_t baseval; 2515 uint32_t *pptr; 2516 int error; 2517 int flag; 2518 int reg; 2519 int retval; 2520 2521 retval = ENOENT; 2522 pptr = NULL; 2523 2524 switch (type) { 2525 case VM_CAP_HALT_EXIT: 2526 if (cap_halt_exit) { 2527 retval = 0; 2528 pptr = &vmx->cap[vcpu].proc_ctls; 2529 baseval = *pptr; 2530 flag = PROCBASED_HLT_EXITING; 2531 reg = VMCS_PRI_PROC_BASED_CTLS; 2532 } 2533 break; 2534 case VM_CAP_MTRAP_EXIT: 2535 if (cap_monitor_trap) { 2536 retval = 0; 2537 pptr = &vmx->cap[vcpu].proc_ctls; 2538 baseval = *pptr; 2539 flag = PROCBASED_MTF; 2540 reg = VMCS_PRI_PROC_BASED_CTLS; 2541 } 2542 break; 2543 case VM_CAP_PAUSE_EXIT: 2544 if (cap_pause_exit) { 2545 retval = 0; 2546 pptr = &vmx->cap[vcpu].proc_ctls; 2547 baseval = *pptr; 2548 flag = PROCBASED_PAUSE_EXITING; 2549 reg = VMCS_PRI_PROC_BASED_CTLS; 2550 } 2551 break; 2552 case VM_CAP_UNRESTRICTED_GUEST: 2553 if (cap_unrestricted_guest) { 2554 retval = 0; 2555 pptr = &vmx->cap[vcpu].proc_ctls2; 2556 baseval = *pptr; 2557 flag = PROCBASED2_UNRESTRICTED_GUEST; 2558 reg = VMCS_SEC_PROC_BASED_CTLS; 2559 } 2560 break; 2561 case VM_CAP_ENABLE_INVPCID: 2562 if (cap_invpcid) { 2563 retval = 0; 2564 pptr = &vmx->cap[vcpu].proc_ctls2; 2565 baseval = *pptr; 2566 flag = PROCBASED2_ENABLE_INVPCID; 2567 reg = VMCS_SEC_PROC_BASED_CTLS; 2568 } 2569 break; 2570 default: 2571 break; 2572 } 2573 2574 if (retval == 0) { 2575 if (val) { 2576 baseval |= flag; 2577 } else { 2578 baseval &= ~flag; 2579 } 2580 VMPTRLD(vmcs); 2581 error = vmwrite(reg, baseval); 2582 VMCLEAR(vmcs); 2583 2584 if (error) { 2585 retval = error; 2586 } else { 2587 /* 2588 * Update optional stored flags, and record 2589 * setting 2590 */ 2591 if (pptr != NULL) { 2592 *pptr = baseval; 2593 } 2594 2595 if (val) { 2596 vmx->cap[vcpu].set |= (1 << type); 2597 } else { 2598 vmx->cap[vcpu].set &= ~(1 << type); 2599 } 2600 } 2601 } 2602 2603 return (retval); 2604} 2605 2606struct vlapic_vtx { 2607 struct vlapic vlapic; 2608 struct pir_desc *pir_desc; 2609 struct vmx *vmx; 2610}; 2611 2612#define VMX_CTR_PIR(vm, vcpuid, pir_desc, notify, vector, level, msg) \ 2613do { \ 2614 VCPU_CTR2(vm, vcpuid, msg " assert %s-triggered vector %d", \ 2615 level ? "level" : "edge", vector); \ 2616 VCPU_CTR1(vm, vcpuid, msg " pir0 0x%016lx", pir_desc->pir[0]); \ 2617 VCPU_CTR1(vm, vcpuid, msg " pir1 0x%016lx", pir_desc->pir[1]); \ 2618 VCPU_CTR1(vm, vcpuid, msg " pir2 0x%016lx", pir_desc->pir[2]); \ 2619 VCPU_CTR1(vm, vcpuid, msg " pir3 0x%016lx", pir_desc->pir[3]); \ 2620 VCPU_CTR1(vm, vcpuid, msg " notify: %s", notify ? "yes" : "no");\ 2621} while (0) 2622 2623/* 2624 * vlapic->ops handlers that utilize the APICv hardware assist described in 2625 * Chapter 29 of the Intel SDM. 
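 *
 * With APICv the processor itself accepts interrupts into the virtual APIC
 * and virtualizes EOIs while the guest is running.  The handlers below only
 * maintain the structures that the hardware consumes: the posted-interrupt
 * descriptor (PIR), the virtual-APIC page and the EOI-exit bitmap in the VMCS.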
2626 */ 2627static int 2628vmx_set_intr_ready(struct vlapic *vlapic, int vector, bool level) 2629{ 2630 struct vlapic_vtx *vlapic_vtx; 2631 struct pir_desc *pir_desc; 2632 uint64_t mask; 2633 int idx, notify; 2634 2635 vlapic_vtx = (struct vlapic_vtx *)vlapic; 2636 pir_desc = vlapic_vtx->pir_desc; 2637 2638 /* 2639 * Keep track of interrupt requests in the PIR descriptor. This is 2640 * because the virtual APIC page pointed to by the VMCS cannot be 2641 * modified if the vcpu is running. 2642 */ 2643 idx = vector / 64; 2644 mask = 1UL << (vector % 64); 2645 atomic_set_long(&pir_desc->pir[idx], mask); 2646 notify = atomic_cmpset_long(&pir_desc->pending, 0, 1); 2647 2648 VMX_CTR_PIR(vlapic->vm, vlapic->vcpuid, pir_desc, notify, vector, 2649 level, "vmx_set_intr_ready"); 2650 return (notify); 2651} 2652 2653static int 2654vmx_pending_intr(struct vlapic *vlapic, int *vecptr) 2655{ 2656 struct vlapic_vtx *vlapic_vtx; 2657 struct pir_desc *pir_desc; 2658 struct LAPIC *lapic; 2659 uint64_t pending, pirval; 2660 uint32_t ppr, vpr; 2661 int i; 2662 2663 /* 2664 * This function is only expected to be called from the 'HLT' exit 2665 * handler which does not care about the vector that is pending. 2666 */ 2667 KASSERT(vecptr == NULL, ("vmx_pending_intr: vecptr must be NULL")); 2668 2669 vlapic_vtx = (struct vlapic_vtx *)vlapic; 2670 pir_desc = vlapic_vtx->pir_desc; 2671 2672 pending = atomic_load_acq_long(&pir_desc->pending); 2673 if (!pending) 2674 return (0); /* common case */ 2675 2676 /* 2677 * If there is an interrupt pending then it will be recognized only 2678 * if its priority is greater than the processor priority. 2679 * 2680 * Special case: if the processor priority is zero then any pending 2681 * interrupt will be recognized. 2682 */ 2683 lapic = vlapic->apic_page; 2684 ppr = lapic->ppr & 0xf0; 2685 if (ppr == 0) 2686 return (1); 2687 2688 VCPU_CTR1(vlapic->vm, vlapic->vcpuid, "HLT with non-zero PPR %d", 2689 lapic->ppr); 2690 2691 for (i = 3; i >= 0; i--) { 2692 pirval = pir_desc->pir[i]; 2693 if (pirval != 0) { 2694 vpr = (i * 64 + flsl(pirval) - 1) & 0xf0; 2695 return (vpr > ppr); 2696 } 2697 } 2698 return (0); 2699} 2700 2701static void 2702vmx_intr_accepted(struct vlapic *vlapic, int vector) 2703{ 2704 2705 panic("vmx_intr_accepted: not expected to be called"); 2706} 2707 2708static void 2709vmx_set_tmr(struct vlapic *vlapic, int vector, bool level) 2710{ 2711 struct vlapic_vtx *vlapic_vtx; 2712 struct vmx *vmx; 2713 struct vmcs *vmcs; 2714 uint64_t mask, val; 2715 2716 KASSERT(vector >= 0 && vector <= 255, ("invalid vector %d", vector)); 2717 KASSERT(!vcpu_is_running(vlapic->vm, vlapic->vcpuid, NULL), 2718 ("vmx_set_tmr: vcpu cannot be running")); 2719 2720 vlapic_vtx = (struct vlapic_vtx *)vlapic; 2721 vmx = vlapic_vtx->vmx; 2722 vmcs = &vmx->vmcs[vlapic->vcpuid]; 2723 mask = 1UL << (vector % 64); 2724 2725 VMPTRLD(vmcs); 2726 val = vmcs_read(VMCS_EOI_EXIT(vector)); 2727 if (level) 2728 val |= mask; 2729 else 2730 val &= ~mask; 2731 vmcs_write(VMCS_EOI_EXIT(vector), val); 2732 VMCLEAR(vmcs); 2733} 2734 2735static void 2736vmx_enable_x2apic_mode(struct vlapic *vlapic) 2737{ 2738 struct vmx *vmx; 2739 struct vmcs *vmcs; 2740 uint32_t proc_ctls2; 2741 int vcpuid, error; 2742 2743 vcpuid = vlapic->vcpuid; 2744 vmx = ((struct vlapic_vtx *)vlapic)->vmx; 2745 vmcs = &vmx->vmcs[vcpuid]; 2746 2747 proc_ctls2 = vmx->cap[vcpuid].proc_ctls2; 2748 KASSERT((proc_ctls2 & PROCBASED2_VIRTUALIZE_APIC_ACCESSES) != 0, 2749 ("%s: invalid proc_ctls2 %#x", __func__, proc_ctls2)); 2750 2751 proc_ctls2 &= 
~PROCBASED2_VIRTUALIZE_APIC_ACCESSES; 2752 proc_ctls2 |= PROCBASED2_VIRTUALIZE_X2APIC_MODE; 2753 vmx->cap[vcpuid].proc_ctls2 = proc_ctls2; 2754 2755 VMPTRLD(vmcs); 2756 vmcs_write(VMCS_SEC_PROC_BASED_CTLS, proc_ctls2); 2757 VMCLEAR(vmcs); 2758 2759 if (vlapic->vcpuid == 0) { 2760 /* 2761 * The nested page table mappings are shared by all vcpus 2762 * so unmap the APIC access page just once. 2763 */ 2764 error = vm_unmap_mmio(vmx->vm, DEFAULT_APIC_BASE, PAGE_SIZE); 2765 KASSERT(error == 0, ("%s: vm_unmap_mmio error %d", 2766 __func__, error)); 2767 2768 /* 2769 * The MSR bitmap is shared by all vcpus so modify it only 2770 * once in the context of vcpu 0. 2771 */ 2772 error = vmx_allow_x2apic_msrs(vmx); 2773 KASSERT(error == 0, ("%s: vmx_allow_x2apic_msrs error %d", 2774 __func__, error)); 2775 } 2776} 2777 2778static void 2779vmx_post_intr(struct vlapic *vlapic, int hostcpu) 2780{ 2781 2782 ipi_cpu(hostcpu, pirvec); 2783} 2784 2785/* 2786 * Transfer the pending interrupts in the PIR descriptor to the IRR 2787 * in the virtual APIC page. 2788 */ 2789static void 2790vmx_inject_pir(struct vlapic *vlapic) 2791{ 2792 struct vlapic_vtx *vlapic_vtx; 2793 struct pir_desc *pir_desc; 2794 struct LAPIC *lapic; 2795 uint64_t val, pirval; 2796 int rvi, pirbase = -1; 2797 uint16_t intr_status_old, intr_status_new; 2798 2799 vlapic_vtx = (struct vlapic_vtx *)vlapic; 2800 pir_desc = vlapic_vtx->pir_desc; 2801 if (atomic_cmpset_long(&pir_desc->pending, 1, 0) == 0) { 2802 VCPU_CTR0(vlapic->vm, vlapic->vcpuid, "vmx_inject_pir: " 2803 "no posted interrupt pending"); 2804 return; 2805 } 2806 2807 pirval = 0; 2808 pirbase = -1; 2809 lapic = vlapic->apic_page; 2810 2811 val = atomic_readandclear_long(&pir_desc->pir[0]); 2812 if (val != 0) { 2813 lapic->irr0 |= val; 2814 lapic->irr1 |= val >> 32; 2815 pirbase = 0; 2816 pirval = val; 2817 } 2818 2819 val = atomic_readandclear_long(&pir_desc->pir[1]); 2820 if (val != 0) { 2821 lapic->irr2 |= val; 2822 lapic->irr3 |= val >> 32; 2823 pirbase = 64; 2824 pirval = val; 2825 } 2826 2827 val = atomic_readandclear_long(&pir_desc->pir[2]); 2828 if (val != 0) { 2829 lapic->irr4 |= val; 2830 lapic->irr5 |= val >> 32; 2831 pirbase = 128; 2832 pirval = val; 2833 } 2834 2835 val = atomic_readandclear_long(&pir_desc->pir[3]); 2836 if (val != 0) { 2837 lapic->irr6 |= val; 2838 lapic->irr7 |= val >> 32; 2839 pirbase = 192; 2840 pirval = val; 2841 } 2842 2843 VLAPIC_CTR_IRR(vlapic, "vmx_inject_pir"); 2844 2845 /* 2846 * Update RVI so the processor can evaluate pending virtual 2847 * interrupts on VM-entry. 2848 * 2849 * It is possible for pirval to be 0 here, even though the 2850 * pending bit has been set. The scenario is: 2851 * CPU-Y is sending a posted interrupt to CPU-X, which 2852 * is running a guest and processing posted interrupts in h/w. 2853 * CPU-X will eventually exit and the state seen in s/w is 2854 * the pending bit set, but no PIR bits set. 
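	 * That case is benign: posted-interrupt processing in the processor
	 * has already copied the PIR contents into the virtual APIC's IRR and
	 * evaluated pending virtual interrupts, so there is nothing left for
	 * software to transfer and RVI is deliberately left alone.  The
	 * interleaving is: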
2855 * 2856 * CPU-X CPU-Y 2857 * (vm running) (host running) 2858 * rx posted interrupt 2859 * CLEAR pending bit 2860 * SET PIR bit 2861 * READ/CLEAR PIR bits 2862 * SET pending bit 2863 * (vm exit) 2864 * pending bit set, PIR 0 2865 */ 2866 if (pirval != 0) { 2867 rvi = pirbase + flsl(pirval) - 1; 2868 intr_status_old = vmcs_read(VMCS_GUEST_INTR_STATUS); 2869 intr_status_new = (intr_status_old & 0xFF00) | rvi; 2870 if (intr_status_new > intr_status_old) { 2871 vmcs_write(VMCS_GUEST_INTR_STATUS, intr_status_new); 2872 VCPU_CTR2(vlapic->vm, vlapic->vcpuid, "vmx_inject_pir: " 2873 "guest_intr_status changed from 0x%04x to 0x%04x", 2874 intr_status_old, intr_status_new); 2875 } 2876 } 2877} 2878 2879static struct vlapic * 2880vmx_vlapic_init(void *arg, int vcpuid) 2881{ 2882 struct vmx *vmx; 2883 struct vlapic *vlapic; 2884 struct vlapic_vtx *vlapic_vtx; 2885 2886 vmx = arg; 2887 2888 vlapic = malloc(sizeof(struct vlapic_vtx), M_VLAPIC, M_WAITOK | M_ZERO); 2889 vlapic->vm = vmx->vm; 2890 vlapic->vcpuid = vcpuid; 2891 vlapic->apic_page = (struct LAPIC *)&vmx->apic_page[vcpuid]; 2892 2893 vlapic_vtx = (struct vlapic_vtx *)vlapic; 2894 vlapic_vtx->pir_desc = &vmx->pir_desc[vcpuid]; 2895 vlapic_vtx->vmx = vmx; 2896 2897 if (virtual_interrupt_delivery) { 2898 vlapic->ops.set_intr_ready = vmx_set_intr_ready; 2899 vlapic->ops.pending_intr = vmx_pending_intr; 2900 vlapic->ops.intr_accepted = vmx_intr_accepted; 2901 vlapic->ops.set_tmr = vmx_set_tmr; 2902 vlapic->ops.enable_x2apic_mode = vmx_enable_x2apic_mode; 2903 } 2904 2905 if (posted_interrupts) 2906 vlapic->ops.post_intr = vmx_post_intr; 2907 2908 vlapic_init(vlapic); 2909 2910 return (vlapic); 2911} 2912 2913static void 2914vmx_vlapic_cleanup(void *arg, struct vlapic *vlapic) 2915{ 2916 2917 vlapic_cleanup(vlapic); 2918 free(vlapic, M_VLAPIC); 2919} 2920 2921struct vmm_ops vmm_ops_intel = { 2922 vmx_init, 2923 vmx_cleanup, 2924 vmx_restore, 2925 vmx_vminit, 2926 vmx_run, 2927 vmx_vmcleanup, 2928 vmx_getreg, 2929 vmx_setreg, 2930 vmx_getdesc, 2931 vmx_setdesc, 2932 vmx_getcap, 2933 vmx_setcap, 2934 ept_vmspace_alloc, 2935 ept_vmspace_free, 2936 vmx_vlapic_init, 2937 vmx_vlapic_cleanup, 2938}; 2939