/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2011 NetApp, Inc.
 * All rights reserved.
 * Copyright (c) 2018 Joyent, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/smp.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/pcpu.h>
#include <sys/proc.h>
#include <sys/sysctl.h>

#include <vm/vm.h>
#include <vm/pmap.h>

#include <machine/psl.h>
#include <machine/cpufunc.h>
#include <machine/md_var.h>
#include <machine/reg.h>
#include <machine/segments.h>
#include <machine/smp.h>
#include <machine/specialreg.h>
#include <machine/vmparam.h>

#include <machine/vmm.h>
#include <machine/vmm_dev.h>
#include <machine/vmm_instruction_emul.h>
#include "vmm_lapic.h"
#include "vmm_host.h"
#include "vmm_ioport.h"
#include "vmm_ktr.h"
#include "vmm_stat.h"
#include "vatpic.h"
#include "vlapic.h"
#include "vlapic_priv.h"

#include "ept.h"
#include "vmx_cpufunc.h"
#include "vmx.h"
#include "vmx_msr.h"
#include "x86.h"
#include "vmx_controls.h"

#define	PINBASED_CTLS_ONE_SETTING		\
	(PINBASED_EXTINT_EXITING	|	\
	PINBASED_NMI_EXITING		|	\
	PINBASED_VIRTUAL_NMI)
#define	PINBASED_CTLS_ZERO_SETTING	0

#define	PROCBASED_CTLS_WINDOW_SETTING		\
	(PROCBASED_INT_WINDOW_EXITING	|	\
	PROCBASED_NMI_WINDOW_EXITING)

#define	PROCBASED_CTLS_ONE_SETTING		\
	(PROCBASED_SECONDARY_CONTROLS	|	\
	PROCBASED_MWAIT_EXITING		|	\
	PROCBASED_MONITOR_EXITING	|	\
	PROCBASED_IO_EXITING		|	\
	PROCBASED_MSR_BITMAPS		|	\
	PROCBASED_CTLS_WINDOW_SETTING	|	\
	PROCBASED_CR8_LOAD_EXITING	|	\
	PROCBASED_CR8_STORE_EXITING)
#define	PROCBASED_CTLS_ZERO_SETTING	\
	(PROCBASED_CR3_LOAD_EXITING |	\
	PROCBASED_CR3_STORE_EXITING |	\
	PROCBASED_IO_BITMAPS)

#define	PROCBASED_CTLS2_ONE_SETTING	PROCBASED2_ENABLE_EPT
#define	PROCBASED_CTLS2_ZERO_SETTING	0

#define	VM_EXIT_CTLS_ONE_SETTING		\
	(VM_EXIT_SAVE_DEBUG_CONTROLS	|	\
	VM_EXIT_HOST_LMA		|	\
	VM_EXIT_SAVE_EFER		|	\
	VM_EXIT_LOAD_EFER		|	\
	VM_EXIT_ACKNOWLEDGE_INTERRUPT)

#define	VM_EXIT_CTLS_ZERO_SETTING	0
#define	VM_ENTRY_CTLS_ONE_SETTING		\
	(VM_ENTRY_LOAD_DEBUG_CONTROLS	|	\
	VM_ENTRY_LOAD_EFER)

#define	VM_ENTRY_CTLS_ZERO_SETTING		\
	(VM_ENTRY_INTO_SMM		|	\
	VM_ENTRY_DEACTIVATE_DUAL_MONITOR)

#define	HANDLED		1
#define	UNHANDLED	0

static MALLOC_DEFINE(M_VMX, "vmx", "vmx");
static MALLOC_DEFINE(M_VLAPIC, "vlapic", "vlapic");

SYSCTL_DECL(_hw_vmm);
SYSCTL_NODE(_hw_vmm, OID_AUTO, vmx, CTLFLAG_RW, NULL, NULL);

int vmxon_enabled[MAXCPU];
static char vmxon_region[MAXCPU][PAGE_SIZE] __aligned(PAGE_SIZE);

static uint32_t pinbased_ctls, procbased_ctls, procbased_ctls2;
static uint32_t exit_ctls, entry_ctls;

static uint64_t cr0_ones_mask, cr0_zeros_mask;
SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr0_ones_mask, CTLFLAG_RD,
	&cr0_ones_mask, 0, NULL);
SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr0_zeros_mask, CTLFLAG_RD,
	&cr0_zeros_mask, 0, NULL);

static uint64_t cr4_ones_mask, cr4_zeros_mask;
SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr4_ones_mask, CTLFLAG_RD,
	&cr4_ones_mask, 0, NULL);
SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr4_zeros_mask, CTLFLAG_RD,
	&cr4_zeros_mask, 0, NULL);

static int vmx_initialized;
SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, initialized, CTLFLAG_RD,
	&vmx_initialized, 0, "Intel VMX initialized");

/*
 * Optional capabilities
 */
static SYSCTL_NODE(_hw_vmm_vmx, OID_AUTO, cap, CTLFLAG_RW, NULL, NULL);

static int cap_halt_exit;
SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, halt_exit, CTLFLAG_RD, &cap_halt_exit, 0,
	"HLT triggers a VM-exit");

static int cap_pause_exit;
SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, pause_exit, CTLFLAG_RD, &cap_pause_exit,
	0, "PAUSE triggers a VM-exit");

static int cap_rdpid;
SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, rdpid, CTLFLAG_RD, &cap_rdpid, 0,
	"Guests are allowed to use RDPID");

static int cap_rdtscp;
SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, rdtscp, CTLFLAG_RD, &cap_rdtscp, 0,
	"Guests are allowed to use RDTSCP");

static int cap_unrestricted_guest;
SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, unrestricted_guest, CTLFLAG_RD,
	&cap_unrestricted_guest, 0, "Unrestricted guests");

static int cap_monitor_trap;
SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, monitor_trap, CTLFLAG_RD,
	&cap_monitor_trap, 0, "Monitor trap flag");

static int cap_invpcid;
SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, invpcid, CTLFLAG_RD, &cap_invpcid,
	0, "Guests are allowed to use INVPCID");

static int tpr_shadowing;
SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, tpr_shadowing, CTLFLAG_RD,
	&tpr_shadowing, 0, "TPR shadowing support");

static int virtual_interrupt_delivery;
SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, virtual_interrupt_delivery, CTLFLAG_RD,
	&virtual_interrupt_delivery, 0, "APICv virtual interrupt delivery support");

static int posted_interrupts;
SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, posted_interrupts, CTLFLAG_RD,
	&posted_interrupts, 0, "APICv posted interrupt support");

static int pirvec = -1;
SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, posted_interrupt_vector, CTLFLAG_RD,
	&pirvec, 0, "APICv posted interrupt vector");

static struct unrhdr *vpid_unr;
static u_int vpid_alloc_failed;
SYSCTL_UINT(_hw_vmm_vmx, OID_AUTO, vpid_alloc_failed, CTLFLAG_RD,
	&vpid_alloc_failed, 0, NULL);

int guest_l1d_flush;
SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, l1d_flush, CTLFLAG_RD,
	&guest_l1d_flush, 0, NULL);
int guest_l1d_flush_sw;
SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, l1d_flush_sw, CTLFLAG_RD,
	&guest_l1d_flush_sw, 0, NULL);

static struct msr_entry msr_load_list[1] __aligned(16);

/*
 * The definitions of SDT probes for VMX.
 */

SDT_PROBE_DEFINE3(vmm, vmx, exit, entry,
    "struct vmx *", "int", "struct vm_exit *");

SDT_PROBE_DEFINE4(vmm, vmx, exit, taskswitch,
    "struct vmx *", "int", "struct vm_exit *", "struct vm_task_switch *");

SDT_PROBE_DEFINE4(vmm, vmx, exit, craccess,
    "struct vmx *", "int", "struct vm_exit *", "uint64_t");

SDT_PROBE_DEFINE4(vmm, vmx, exit, rdmsr,
    "struct vmx *", "int", "struct vm_exit *", "uint32_t");

SDT_PROBE_DEFINE5(vmm, vmx, exit, wrmsr,
    "struct vmx *", "int", "struct vm_exit *", "uint32_t", "uint64_t");

SDT_PROBE_DEFINE3(vmm, vmx, exit, halt,
    "struct vmx *", "int", "struct vm_exit *");

SDT_PROBE_DEFINE3(vmm, vmx, exit, mtrap,
    "struct vmx *", "int", "struct vm_exit *");

SDT_PROBE_DEFINE3(vmm, vmx, exit, pause,
    "struct vmx *", "int", "struct vm_exit *");

SDT_PROBE_DEFINE3(vmm, vmx, exit, intrwindow,
    "struct vmx *", "int", "struct vm_exit *");

SDT_PROBE_DEFINE4(vmm, vmx, exit, interrupt,
    "struct vmx *", "int", "struct vm_exit *", "uint32_t");

SDT_PROBE_DEFINE3(vmm, vmx, exit, nmiwindow,
    "struct vmx *", "int", "struct vm_exit *");

SDT_PROBE_DEFINE3(vmm, vmx, exit, inout,
    "struct vmx *", "int", "struct vm_exit *");

SDT_PROBE_DEFINE3(vmm, vmx, exit, cpuid,
    "struct vmx *", "int", "struct vm_exit *");

SDT_PROBE_DEFINE5(vmm, vmx, exit, exception,
    "struct vmx *", "int", "struct vm_exit *", "uint32_t", "int");

SDT_PROBE_DEFINE5(vmm, vmx, exit, nestedfault,
    "struct vmx *", "int", "struct vm_exit *", "uint64_t", "uint64_t");

SDT_PROBE_DEFINE4(vmm, vmx, exit, mmiofault,
    "struct vmx *", "int", "struct vm_exit *", "uint64_t");

SDT_PROBE_DEFINE3(vmm, vmx, exit, eoi,
    "struct vmx *", "int", "struct vm_exit *");

SDT_PROBE_DEFINE3(vmm, vmx, exit, apicaccess,
    "struct vmx *", "int", "struct vm_exit *");

SDT_PROBE_DEFINE4(vmm, vmx, exit, apicwrite,
    "struct vmx *", "int", "struct vm_exit *", "struct vlapic *");

SDT_PROBE_DEFINE3(vmm, vmx, exit, xsetbv,
    "struct vmx *", "int", "struct vm_exit *");

SDT_PROBE_DEFINE3(vmm, vmx, exit, monitor,
    "struct vmx *", "int", "struct vm_exit *");

SDT_PROBE_DEFINE3(vmm, vmx, exit, mwait,
    "struct vmx *", "int", "struct vm_exit *");

SDT_PROBE_DEFINE3(vmm, vmx, exit, vminsn,
    "struct vmx *", "int", "struct vm_exit *");

SDT_PROBE_DEFINE4(vmm, vmx, exit, unknown,
    "struct vmx *", "int", "struct vm_exit *", "uint32_t");

SDT_PROBE_DEFINE4(vmm, vmx, exit, return,
    "struct vmx *", "int", "struct vm_exit *", "int");

/*
 * Use the last page below 4GB as the APIC access address. This address is
 * occupied by the boot firmware so it is guaranteed that it will not conflict
 * with a page in system memory.
 */
#define	APIC_ACCESS_ADDRESS	0xFFFFF000

static int vmx_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc);
static int vmx_getreg(void *arg, int vcpu, int reg, uint64_t *retval);
static int vmxctx_setreg(struct vmxctx *vmxctx, int reg, uint64_t val);
static void vmx_inject_pir(struct vlapic *vlapic);

static inline bool
host_has_rdpid(void)
{
	return ((cpu_stdext_feature2 & CPUID_STDEXT2_RDPID) != 0);
}

static inline bool
host_has_rdtscp(void)
{
	return ((amd_feature & AMDID_RDTSCP) != 0);
}

#ifdef KTR
static const char *
exit_reason_to_str(int reason)
{
	static char reasonbuf[32];

	switch (reason) {
	case EXIT_REASON_EXCEPTION:
		return "exception";
	case EXIT_REASON_EXT_INTR:
		return "extint";
	case EXIT_REASON_TRIPLE_FAULT:
		return "triplefault";
	case EXIT_REASON_INIT:
		return "init";
	case EXIT_REASON_SIPI:
		return "sipi";
	case EXIT_REASON_IO_SMI:
		return "iosmi";
	case EXIT_REASON_SMI:
		return "smi";
	case EXIT_REASON_INTR_WINDOW:
		return "intrwindow";
	case EXIT_REASON_NMI_WINDOW:
		return "nmiwindow";
	case EXIT_REASON_TASK_SWITCH:
		return "taskswitch";
	case EXIT_REASON_CPUID:
		return "cpuid";
	case EXIT_REASON_GETSEC:
		return "getsec";
	case EXIT_REASON_HLT:
		return "hlt";
	case EXIT_REASON_INVD:
		return "invd";
	case EXIT_REASON_INVLPG:
		return "invlpg";
	case EXIT_REASON_RDPMC:
		return "rdpmc";
	case EXIT_REASON_RDTSC:
		return "rdtsc";
	case EXIT_REASON_RSM:
		return "rsm";
	case EXIT_REASON_VMCALL:
		return "vmcall";
	case EXIT_REASON_VMCLEAR:
		return "vmclear";
	case EXIT_REASON_VMLAUNCH:
		return "vmlaunch";
	case EXIT_REASON_VMPTRLD:
		return "vmptrld";
	case EXIT_REASON_VMPTRST:
		return "vmptrst";
	case EXIT_REASON_VMREAD:
		return "vmread";
	case EXIT_REASON_VMRESUME:
		return "vmresume";
	case EXIT_REASON_VMWRITE:
		return "vmwrite";
	case EXIT_REASON_VMXOFF:
		return "vmxoff";
	case EXIT_REASON_VMXON:
		return "vmxon";
	case EXIT_REASON_CR_ACCESS:
		return "craccess";
	case EXIT_REASON_DR_ACCESS:
		return "draccess";
	case EXIT_REASON_INOUT:
		return "inout";
	case EXIT_REASON_RDMSR:
		return "rdmsr";
	case EXIT_REASON_WRMSR:
		return "wrmsr";
	case EXIT_REASON_INVAL_VMCS:
		return "invalvmcs";
	case EXIT_REASON_INVAL_MSR:
		return "invalmsr";
	case EXIT_REASON_MWAIT:
		return "mwait";
	case EXIT_REASON_MTF:
		return "mtf";
	case EXIT_REASON_MONITOR:
		return "monitor";
	case EXIT_REASON_PAUSE:
		return "pause";
	case EXIT_REASON_MCE_DURING_ENTRY:
		return "mce-during-entry";
	case EXIT_REASON_TPR:
		return "tpr";
	case EXIT_REASON_APIC_ACCESS:
		return "apic-access";
	case EXIT_REASON_GDTR_IDTR:
		return "gdtridtr";
	case EXIT_REASON_LDTR_TR:
		return "ldtrtr";
	case EXIT_REASON_EPT_FAULT:
		return "eptfault";
	case EXIT_REASON_EPT_MISCONFIG:
		return "eptmisconfig";
	case EXIT_REASON_INVEPT:
		return "invept";
	case EXIT_REASON_RDTSCP:
		return "rdtscp";
	case EXIT_REASON_VMX_PREEMPT:
		return "vmxpreempt";
	case EXIT_REASON_INVVPID:
		return "invvpid";
	case EXIT_REASON_WBINVD:
		return "wbinvd";
	case EXIT_REASON_XSETBV:
		return "xsetbv";
	case EXIT_REASON_APIC_WRITE:
		return "apic-write";
	default:
		snprintf(reasonbuf, sizeof(reasonbuf), "%d", reason);
		return (reasonbuf);
	}
}
#endif	/* KTR */

static int
vmx_allow_x2apic_msrs(struct vmx *vmx)
{
	int i, error;

	error = 0;

	/*
	 * Allow readonly access to the following x2APIC MSRs from the guest.
	 */
	error += guest_msr_ro(vmx, MSR_APIC_ID);
	error += guest_msr_ro(vmx, MSR_APIC_VERSION);
	error += guest_msr_ro(vmx, MSR_APIC_LDR);
	error += guest_msr_ro(vmx, MSR_APIC_SVR);

	for (i = 0; i < 8; i++)
		error += guest_msr_ro(vmx, MSR_APIC_ISR0 + i);

	for (i = 0; i < 8; i++)
		error += guest_msr_ro(vmx, MSR_APIC_TMR0 + i);

	for (i = 0; i < 8; i++)
		error += guest_msr_ro(vmx, MSR_APIC_IRR0 + i);

	error += guest_msr_ro(vmx, MSR_APIC_ESR);
	error += guest_msr_ro(vmx, MSR_APIC_LVT_TIMER);
	error += guest_msr_ro(vmx, MSR_APIC_LVT_THERMAL);
	error += guest_msr_ro(vmx, MSR_APIC_LVT_PCINT);
	error += guest_msr_ro(vmx, MSR_APIC_LVT_LINT0);
	error += guest_msr_ro(vmx, MSR_APIC_LVT_LINT1);
	error += guest_msr_ro(vmx, MSR_APIC_LVT_ERROR);
	error += guest_msr_ro(vmx, MSR_APIC_ICR_TIMER);
	error += guest_msr_ro(vmx, MSR_APIC_DCR_TIMER);
	error += guest_msr_ro(vmx, MSR_APIC_ICR);

	/*
	 * Allow TPR, EOI and SELF_IPI MSRs to be read and written by the guest.
	 *
	 * These registers get special treatment described in the section
	 * "Virtualizing MSR-Based APIC Accesses".
	 */
	error += guest_msr_rw(vmx, MSR_APIC_TPR);
	error += guest_msr_rw(vmx, MSR_APIC_EOI);
	error += guest_msr_rw(vmx, MSR_APIC_SELF_IPI);

	return (error);
}

u_long
vmx_fix_cr0(u_long cr0)
{

	return ((cr0 | cr0_ones_mask) & ~cr0_zeros_mask);
}

u_long
vmx_fix_cr4(u_long cr4)
{

	return ((cr4 | cr4_ones_mask) & ~cr4_zeros_mask);
}

static void
vpid_free(int vpid)
{
	if (vpid < 0 || vpid > 0xffff)
		panic("vpid_free: invalid vpid %d", vpid);

	/*
	 * VPIDs [0,VM_MAXCPU] are special and are not allocated from
	 * the unit number allocator.
	 */

	if (vpid > VM_MAXCPU)
		free_unr(vpid_unr, vpid);
}

static void
vpid_alloc(uint16_t *vpid, int num)
{
	int i, x;

	if (num <= 0 || num > VM_MAXCPU)
		panic("invalid number of vpids requested: %d", num);

	/*
	 * If the "enable vpid" execution control is not enabled then the
	 * VPID is required to be 0 for all vcpus.
	 */
	if ((procbased_ctls2 & PROCBASED2_ENABLE_VPID) == 0) {
		for (i = 0; i < num; i++)
			vpid[i] = 0;
		return;
	}

	/*
	 * Allocate a unique VPID for each vcpu from the unit number allocator.
	 */
	for (i = 0; i < num; i++) {
		x = alloc_unr(vpid_unr);
		if (x == -1)
			break;
		else
			vpid[i] = x;
	}

	if (i < num) {
		atomic_add_int(&vpid_alloc_failed, 1);

		/*
		 * If the unit number allocator does not have enough unique
		 * VPIDs then we need to allocate from the [1,VM_MAXCPU] range.
		 *
		 * These VPIDs are not unique across VMs but this does not
		 * affect correctness because the combined mappings are also
		 * tagged with the EP4TA which is unique for each VM.
		 *
		 * It is still sub-optimal because the invvpid will invalidate
		 * combined mappings for a particular VPID across all EP4TAs.
		 */
		while (i-- > 0)
			vpid_free(vpid[i]);

		for (i = 0; i < num; i++)
			vpid[i] = i + 1;
	}
}

static void
vpid_init(void)
{
	/*
	 * VPID 0 is required when the "enable VPID" execution control is
	 * disabled.
	 *
	 * VPIDs [1,VM_MAXCPU] are used as the "overflow namespace" when the
	 * unit number allocator does not have sufficient unique VPIDs to
	 * satisfy the allocation.
	 *
	 * The remaining VPIDs are managed by the unit number allocator.
	 */
	vpid_unr = new_unrhdr(VM_MAXCPU + 1, 0xffff, NULL);
}

static void
vmx_disable(void *arg __unused)
{
	struct invvpid_desc invvpid_desc = { 0 };
	struct invept_desc invept_desc = { 0 };

	if (vmxon_enabled[curcpu]) {
		/*
		 * See sections 25.3.3.3 and 25.3.3.4 in Intel Vol 3b.
		 *
		 * VMXON or VMXOFF are not required to invalidate any TLB
		 * caching structures. This prevents potential retention of
		 * cached information in the TLB between distinct VMX episodes.
		 */
		invvpid(INVVPID_TYPE_ALL_CONTEXTS, invvpid_desc);
		invept(INVEPT_TYPE_ALL_CONTEXTS, invept_desc);
		vmxoff();
	}
	load_cr4(rcr4() & ~CR4_VMXE);
}

static int
vmx_cleanup(void)
{

	if (pirvec >= 0)
		lapic_ipi_free(pirvec);

	if (vpid_unr != NULL) {
		delete_unrhdr(vpid_unr);
		vpid_unr = NULL;
	}

	if (nmi_flush_l1d_sw == 1)
		nmi_flush_l1d_sw = 0;

	smp_rendezvous(NULL, vmx_disable, NULL, NULL);

	return (0);
}

static void
vmx_enable(void *arg __unused)
{
	int error;
	uint64_t feature_control;

	feature_control = rdmsr(MSR_IA32_FEATURE_CONTROL);
	if ((feature_control & IA32_FEATURE_CONTROL_LOCK) == 0 ||
	    (feature_control & IA32_FEATURE_CONTROL_VMX_EN) == 0) {
		wrmsr(MSR_IA32_FEATURE_CONTROL,
		    feature_control | IA32_FEATURE_CONTROL_VMX_EN |
		    IA32_FEATURE_CONTROL_LOCK);
	}

	load_cr4(rcr4() | CR4_VMXE);

	*(uint32_t *)vmxon_region[curcpu] = vmx_revision();
	error = vmxon(vmxon_region[curcpu]);
	if (error == 0)
		vmxon_enabled[curcpu] = 1;
}

static void
vmx_restore(void)
{

	if (vmxon_enabled[curcpu])
		vmxon(vmxon_region[curcpu]);
}

static int
vmx_init(int ipinum)
{
	int error;
	uint64_t basic, fixed0, fixed1, feature_control;
	uint32_t tmp, procbased2_vid_bits;

	/* CPUID.1:ECX[bit 5] must be 1 for processor to support VMX */
	if (!(cpu_feature2 & CPUID2_VMX)) {
		printf("vmx_init: processor does not support VMX operation\n");
		return (ENXIO);
	}

	/*
	 * Verify that MSR_IA32_FEATURE_CONTROL lock and VMXON enable bits
	 * are set (bits 0 and 2 respectively).
	 */
	feature_control = rdmsr(MSR_IA32_FEATURE_CONTROL);
	if ((feature_control & IA32_FEATURE_CONTROL_LOCK) == 1 &&
	    (feature_control & IA32_FEATURE_CONTROL_VMX_EN) == 0) {
		printf("vmx_init: VMX operation disabled by BIOS\n");
		return (ENXIO);
	}

	/*
	 * Verify capabilities MSR_VMX_BASIC:
	 * - bit 54 indicates support for INS/OUTS decoding
	 */
	basic = rdmsr(MSR_VMX_BASIC);
	if ((basic & (1UL << 54)) == 0) {
		printf("vmx_init: processor does not support desired basic "
		    "capabilities\n");
		return (EINVAL);
	}

	/* Check support for primary processor-based VM-execution controls */
	error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
	    MSR_VMX_TRUE_PROCBASED_CTLS,
	    PROCBASED_CTLS_ONE_SETTING,
	    PROCBASED_CTLS_ZERO_SETTING, &procbased_ctls);
	if (error) {
		printf("vmx_init: processor does not support desired primary "
		    "processor-based controls\n");
		return (error);
	}

	/* Clear the processor-based ctl bits that are set on demand */
	procbased_ctls &= ~PROCBASED_CTLS_WINDOW_SETTING;

	/* Check support for secondary processor-based VM-execution controls */
	error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
	    MSR_VMX_PROCBASED_CTLS2,
	    PROCBASED_CTLS2_ONE_SETTING,
	    PROCBASED_CTLS2_ZERO_SETTING, &procbased_ctls2);
	if (error) {
		printf("vmx_init: processor does not support desired secondary "
		    "processor-based controls\n");
		return (error);
	}

	/* Check support for VPID */
	error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2,
	    PROCBASED2_ENABLE_VPID, 0, &tmp);
	if (error == 0)
		procbased_ctls2 |= PROCBASED2_ENABLE_VPID;

	/* Check support for pin-based VM-execution controls */
	error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS,
	    MSR_VMX_TRUE_PINBASED_CTLS,
	    PINBASED_CTLS_ONE_SETTING,
	    PINBASED_CTLS_ZERO_SETTING, &pinbased_ctls);
	if (error) {
		printf("vmx_init: processor does not support desired "
		    "pin-based controls\n");
		return (error);
	}

	/* Check support for VM-exit controls */
	error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS, MSR_VMX_TRUE_EXIT_CTLS,
	    VM_EXIT_CTLS_ONE_SETTING,
	    VM_EXIT_CTLS_ZERO_SETTING,
	    &exit_ctls);
	if (error) {
		printf("vmx_init: processor does not support desired "
		    "exit controls\n");
		return (error);
	}

	/* Check support for VM-entry controls */
	error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS, MSR_VMX_TRUE_ENTRY_CTLS,
	    VM_ENTRY_CTLS_ONE_SETTING, VM_ENTRY_CTLS_ZERO_SETTING,
	    &entry_ctls);
	if (error) {
		printf("vmx_init: processor does not support desired "
		    "entry controls\n");
		return (error);
	}

	/*
	 * Check support for optional features by testing them
	 * as individual bits
	 */
	cap_halt_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
	    MSR_VMX_TRUE_PROCBASED_CTLS,
	    PROCBASED_HLT_EXITING, 0,
	    &tmp) == 0);

	cap_monitor_trap = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
	    MSR_VMX_PROCBASED_CTLS,
	    PROCBASED_MTF, 0,
	    &tmp) == 0);

	cap_pause_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
	    MSR_VMX_TRUE_PROCBASED_CTLS,
	    PROCBASED_PAUSE_EXITING, 0,
	    &tmp) == 0);

	/*
	 * Check support for RDPID and/or RDTSCP.
	 *
	 * Support a pass-through-based implementation of these via the
	 * "enable RDTSCP" VM-execution control and the "RDTSC exiting"
	 * VM-execution control.
	 *
	 * The "enable RDTSCP" VM-execution control applies to both RDPID
	 * and RDTSCP (see SDM volume 3, section 25.3, "Changes to
	 * Instruction Behavior in VMX Non-root operation"); this is why
	 * only this VM-execution control needs to be enabled in order to
	 * enable passing through whichever of RDPID and/or RDTSCP are
	 * supported by the host.
	 *
	 * The "RDTSC exiting" VM-execution control applies to both RDTSC
	 * and RDTSCP (again, per SDM volume 3, section 25.3), and is
	 * already set up for RDTSC and RDTSCP pass-through by the current
	 * implementation of RDTSC.
	 *
	 * Although RDPID and RDTSCP are optional capabilities, since there
	 * does not currently seem to be a use case for enabling/disabling
	 * these via libvmmapi, choose not to support this and, instead,
	 * just statically always enable or always disable this support
	 * across all vCPUs on all VMs. (Note that there may be some
	 * complications to providing this functionality, e.g., the MSR
	 * bitmap is currently per-VM rather than per-vCPU while the
	 * capability API wants to be able to control capabilities on a
	 * per-vCPU basis).
	 */
	error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
	    MSR_VMX_PROCBASED_CTLS2,
	    PROCBASED2_ENABLE_RDTSCP, 0, &tmp);
	cap_rdpid = error == 0 && host_has_rdpid();
	cap_rdtscp = error == 0 && host_has_rdtscp();
	if (cap_rdpid || cap_rdtscp)
		procbased_ctls2 |= PROCBASED2_ENABLE_RDTSCP;

	cap_unrestricted_guest = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
	    MSR_VMX_PROCBASED_CTLS2,
	    PROCBASED2_UNRESTRICTED_GUEST, 0,
	    &tmp) == 0);

	cap_invpcid = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
	    MSR_VMX_PROCBASED_CTLS2, PROCBASED2_ENABLE_INVPCID, 0,
	    &tmp) == 0);

	/*
	 * Check support for TPR shadow.
	 */
	error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
	    MSR_VMX_TRUE_PROCBASED_CTLS, PROCBASED_USE_TPR_SHADOW, 0,
	    &tmp);
	if (error == 0) {
		tpr_shadowing = 1;
		TUNABLE_INT_FETCH("hw.vmm.vmx.use_tpr_shadowing",
		    &tpr_shadowing);
	}

	if (tpr_shadowing) {
		procbased_ctls |= PROCBASED_USE_TPR_SHADOW;
		procbased_ctls &= ~PROCBASED_CR8_LOAD_EXITING;
		procbased_ctls &= ~PROCBASED_CR8_STORE_EXITING;
	}

	/*
	 * Check support for virtual interrupt delivery.
	 */
	procbased2_vid_bits = (PROCBASED2_VIRTUALIZE_APIC_ACCESSES |
	    PROCBASED2_VIRTUALIZE_X2APIC_MODE |
	    PROCBASED2_APIC_REGISTER_VIRTUALIZATION |
	    PROCBASED2_VIRTUAL_INTERRUPT_DELIVERY);

	error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2,
	    procbased2_vid_bits, 0, &tmp);
	if (error == 0 && tpr_shadowing) {
		virtual_interrupt_delivery = 1;
		TUNABLE_INT_FETCH("hw.vmm.vmx.use_apic_vid",
		    &virtual_interrupt_delivery);
	}

	if (virtual_interrupt_delivery) {
		procbased_ctls |= PROCBASED_USE_TPR_SHADOW;
		procbased_ctls2 |= procbased2_vid_bits;
		procbased_ctls2 &= ~PROCBASED2_VIRTUALIZE_X2APIC_MODE;

		/*
		 * Check for Posted Interrupts only if Virtual Interrupt
		 * Delivery is enabled.
		 */
		error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS,
		    MSR_VMX_TRUE_PINBASED_CTLS, PINBASED_POSTED_INTERRUPT, 0,
		    &tmp);
		if (error == 0) {
			pirvec = lapic_ipi_alloc(pti ? &IDTVEC(justreturn1_pti) :
			    &IDTVEC(justreturn));
			if (pirvec < 0) {
				if (bootverbose) {
					printf("vmx_init: unable to allocate "
					    "posted interrupt vector\n");
				}
			} else {
				posted_interrupts = 1;
				TUNABLE_INT_FETCH("hw.vmm.vmx.use_apic_pir",
				    &posted_interrupts);
			}
		}
	}

	if (posted_interrupts)
		pinbased_ctls |= PINBASED_POSTED_INTERRUPT;

	/* Initialize EPT */
	error = ept_init(ipinum);
	if (error) {
		printf("vmx_init: ept initialization failed (%d)\n", error);
		return (error);
	}

	guest_l1d_flush = (cpu_ia32_arch_caps &
	    IA32_ARCH_CAP_SKIP_L1DFL_VMENTRY) == 0;
	TUNABLE_INT_FETCH("hw.vmm.l1d_flush", &guest_l1d_flush);

	/*
	 * L1D cache flush is enabled. Use IA32_FLUSH_CMD MSR when
	 * available. Otherwise fall back to the software flush
	 * method which loads enough data from the kernel text to
	 * flush existing L1D content, both on VMX entry and on NMI
	 * return.
	 */
	if (guest_l1d_flush) {
		if ((cpu_stdext_feature3 & CPUID_STDEXT3_L1D_FLUSH) == 0) {
			guest_l1d_flush_sw = 1;
			TUNABLE_INT_FETCH("hw.vmm.l1d_flush_sw",
			    &guest_l1d_flush_sw);
		}
		if (guest_l1d_flush_sw) {
			if (nmi_flush_l1d_sw <= 1)
				nmi_flush_l1d_sw = 1;
		} else {
			msr_load_list[0].index = MSR_IA32_FLUSH_CMD;
			msr_load_list[0].val = IA32_FLUSH_CMD_L1D;
		}
	}

	/*
	 * Stash the cr0 and cr4 bits that must be fixed to 0 or 1
	 */
	fixed0 = rdmsr(MSR_VMX_CR0_FIXED0);
	fixed1 = rdmsr(MSR_VMX_CR0_FIXED1);
	cr0_ones_mask = fixed0 & fixed1;
	cr0_zeros_mask = ~fixed0 & ~fixed1;

	/*
	 * CR0_PE and CR0_PG can be set to zero in VMX non-root operation
	 * if unrestricted guest execution is allowed.
	 */
	if (cap_unrestricted_guest)
		cr0_ones_mask &= ~(CR0_PG | CR0_PE);

	/*
	 * Do not allow the guest to set CR0_NW or CR0_CD.
	 */
	cr0_zeros_mask |= (CR0_NW | CR0_CD);

	fixed0 = rdmsr(MSR_VMX_CR4_FIXED0);
	fixed1 = rdmsr(MSR_VMX_CR4_FIXED1);
	cr4_ones_mask = fixed0 & fixed1;
	cr4_zeros_mask = ~fixed0 & ~fixed1;

	vpid_init();

	vmx_msr_init();

	/* enable VMX operation */
	smp_rendezvous(NULL, vmx_enable, NULL, NULL);

	vmx_initialized = 1;

	return (0);
}

static void
vmx_trigger_hostintr(int vector)
{
	uintptr_t func;
	struct gate_descriptor *gd;

	gd = &idt[vector];

	KASSERT(vector >= 32 && vector <= 255, ("vmx_trigger_hostintr: "
	    "invalid vector %d", vector));
	KASSERT(gd->gd_p == 1, ("gate descriptor for vector %d not present",
	    vector));
	KASSERT(gd->gd_type == SDT_SYSIGT, ("gate descriptor for vector %d "
	    "has invalid type %d", vector, gd->gd_type));
	KASSERT(gd->gd_dpl == SEL_KPL, ("gate descriptor for vector %d "
	    "has invalid dpl %d", vector, gd->gd_dpl));
	KASSERT(gd->gd_selector == GSEL(GCODE_SEL, SEL_KPL), ("gate descriptor "
	    "for vector %d has invalid selector %d", vector, gd->gd_selector));
	KASSERT(gd->gd_ist == 0, ("gate descriptor for vector %d has invalid "
	    "IST %d", vector, gd->gd_ist));

	func = ((long)gd->gd_hioffset << 16 | gd->gd_looffset);
	vmx_call_isr(func);
}

static int
vmx_setup_cr_shadow(int which, struct vmcs *vmcs, uint32_t initial)
{
	int error, mask_ident, shadow_ident;
	uint64_t mask_value;

	if (which != 0 && which != 4)
		panic("vmx_setup_cr_shadow: unknown cr%d", which);

	if (which == 0) {
		mask_ident = VMCS_CR0_MASK;
		mask_value = cr0_ones_mask | cr0_zeros_mask;
		shadow_ident = VMCS_CR0_SHADOW;
	} else {
		mask_ident = VMCS_CR4_MASK;
		mask_value = cr4_ones_mask | cr4_zeros_mask;
		shadow_ident = VMCS_CR4_SHADOW;
	}

	error = vmcs_setreg(vmcs, 0, VMCS_IDENT(mask_ident), mask_value);
	if (error)
		return (error);

	error = vmcs_setreg(vmcs, 0, VMCS_IDENT(shadow_ident), initial);
	if (error)
		return (error);

	return (0);
}
#define	vmx_setup_cr0_shadow(vmcs,init)	vmx_setup_cr_shadow(0, (vmcs), (init))
#define	vmx_setup_cr4_shadow(vmcs,init)	vmx_setup_cr_shadow(4, (vmcs), (init))

static void *
vmx_vminit(struct vm *vm, pmap_t pmap)
{
	uint16_t vpid[VM_MAXCPU];
	int i, error;
	struct vmx *vmx;
	struct vmcs *vmcs;
	uint32_t exc_bitmap;
	uint16_t maxcpus;

	vmx = malloc(sizeof(struct vmx), M_VMX, M_WAITOK | M_ZERO);
	if ((uintptr_t)vmx & PAGE_MASK) {
		panic("malloc of struct vmx not aligned on %d byte boundary",
		    PAGE_SIZE);
	}
	vmx->vm = vm;

	vmx->eptp = eptp(vtophys((vm_offset_t)pmap->pm_pml4));

	/*
	 * Clean up EPTP-tagged guest physical and combined mappings
	 *
	 * VMX transitions are not required to invalidate any guest physical
	 * mappings. So, it may be possible for stale guest physical mappings
	 * to be present in the processor TLBs.
	 *
	 * Combined mappings for this EP4TA are also invalidated for all VPIDs.
	 */
	ept_invalidate_mappings(vmx->eptp);

	msr_bitmap_initialize(vmx->msr_bitmap);

	/*
	 * It is safe to allow direct access to MSR_GSBASE and MSR_FSBASE.
	 * The guest FSBASE and GSBASE are saved and restored during
	 * vm-exit and vm-entry respectively. The host FSBASE and GSBASE are
	 * always restored from the vmcs host state area on vm-exit.
	 *
	 * The SYSENTER_CS/ESP/EIP MSRs are identical to FS/GSBASE in
	 * how they are saved/restored so can be directly accessed by the
	 * guest.
	 *
	 * MSR_EFER is saved and restored in the guest VMCS area on a
	 * VM exit and entry respectively. It is also restored from the
	 * host VMCS area on a VM exit.
	 *
	 * The TSC MSR is exposed read-only. Writes are disallowed as
	 * that will impact the host TSC. If the guest does a write
	 * the "use TSC offsetting" execution control is enabled and the
	 * difference between the host TSC and the guest TSC is written
	 * into the TSC offset in the VMCS.
	 *
	 * Guest TSC_AUX support is enabled if any of guest RDPID and/or
	 * guest RDTSCP support are enabled (since, as per Table 2-2 in SDM
	 * volume 4, TSC_AUX is supported if any of RDPID and/or RDTSCP are
	 * supported). If guest TSC_AUX support is enabled, TSC_AUX is
	 * exposed read-only so that the VMM can do one fewer MSR read per
	 * exit than if this register were exposed read-write; the guest
	 * restore value can be updated during guest writes (expected to be
	 * rare) instead of during all exits (common).
	 */
	if (guest_msr_rw(vmx, MSR_GSBASE) ||
	    guest_msr_rw(vmx, MSR_FSBASE) ||
	    guest_msr_rw(vmx, MSR_SYSENTER_CS_MSR) ||
	    guest_msr_rw(vmx, MSR_SYSENTER_ESP_MSR) ||
	    guest_msr_rw(vmx, MSR_SYSENTER_EIP_MSR) ||
	    guest_msr_rw(vmx, MSR_EFER) ||
	    guest_msr_ro(vmx, MSR_TSC) ||
	    ((cap_rdpid || cap_rdtscp) && guest_msr_ro(vmx, MSR_TSC_AUX)))
		panic("vmx_vminit: error setting guest msr access");

	vpid_alloc(vpid, VM_MAXCPU);

	if (virtual_interrupt_delivery) {
		error = vm_map_mmio(vm, DEFAULT_APIC_BASE, PAGE_SIZE,
		    APIC_ACCESS_ADDRESS);
		/* XXX this should really return an error to the caller */
		KASSERT(error == 0, ("vm_map_mmio(apicbase) error %d", error));
	}

	maxcpus = vm_get_maxcpus(vm);
	for (i = 0; i < maxcpus; i++) {
		vmcs = &vmx->vmcs[i];
		vmcs->identifier = vmx_revision();
		error = vmclear(vmcs);
		if (error != 0) {
			panic("vmx_vminit: vmclear error %d on vcpu %d\n",
			    error, i);
		}

		vmx_msr_guest_init(vmx, i);

		error = vmcs_init(vmcs);
		KASSERT(error == 0, ("vmcs_init error %d", error));

		VMPTRLD(vmcs);
		error = 0;
		error += vmwrite(VMCS_HOST_RSP, (u_long)&vmx->ctx[i]);
		error += vmwrite(VMCS_EPTP, vmx->eptp);
		error += vmwrite(VMCS_PIN_BASED_CTLS, pinbased_ctls);
		error += vmwrite(VMCS_PRI_PROC_BASED_CTLS, procbased_ctls);
		error += vmwrite(VMCS_SEC_PROC_BASED_CTLS, procbased_ctls2);
		error += vmwrite(VMCS_EXIT_CTLS, exit_ctls);
		error += vmwrite(VMCS_ENTRY_CTLS, entry_ctls);
		error += vmwrite(VMCS_MSR_BITMAP, vtophys(vmx->msr_bitmap));
		error += vmwrite(VMCS_VPID, vpid[i]);

		if (guest_l1d_flush && !guest_l1d_flush_sw) {
			vmcs_write(VMCS_ENTRY_MSR_LOAD, pmap_kextract(
			    (vm_offset_t)&msr_load_list[0]));
			vmcs_write(VMCS_ENTRY_MSR_LOAD_COUNT,
			    nitems(msr_load_list));
			vmcs_write(VMCS_EXIT_MSR_STORE, 0);
			vmcs_write(VMCS_EXIT_MSR_STORE_COUNT, 0);
		}

		/* exception bitmap */
		if (vcpu_trace_exceptions(vm, i))
			exc_bitmap = 0xffffffff;
		else
			exc_bitmap = 1 << IDT_MC;
		error += vmwrite(VMCS_EXCEPTION_BITMAP, exc_bitmap);

		vmx->ctx[i].guest_dr6 = DBREG_DR6_RESERVED1;
		error += vmwrite(VMCS_GUEST_DR7, DBREG_DR7_RESERVED1);

		if (tpr_shadowing) {
			error += vmwrite(VMCS_VIRTUAL_APIC,
			    vtophys(&vmx->apic_page[i]));
		}

		if (virtual_interrupt_delivery) {
			error += vmwrite(VMCS_APIC_ACCESS, APIC_ACCESS_ADDRESS);
			error += vmwrite(VMCS_EOI_EXIT0, 0);
			error += vmwrite(VMCS_EOI_EXIT1, 0);
			error += vmwrite(VMCS_EOI_EXIT2, 0);
			error += vmwrite(VMCS_EOI_EXIT3, 0);
		}
		if (posted_interrupts) {
			error += vmwrite(VMCS_PIR_VECTOR, pirvec);
			error += vmwrite(VMCS_PIR_DESC,
			    vtophys(&vmx->pir_desc[i]));
		}
		VMCLEAR(vmcs);
		KASSERT(error == 0, ("vmx_vminit: error customizing the vmcs"));

		vmx->cap[i].set = 0;
		vmx->cap[i].set |= cap_rdpid != 0 ? 1 << VM_CAP_RDPID : 0;
		vmx->cap[i].set |= cap_rdtscp != 0 ? 1 << VM_CAP_RDTSCP : 0;
		vmx->cap[i].proc_ctls = procbased_ctls;
		vmx->cap[i].proc_ctls2 = procbased_ctls2;
		vmx->cap[i].exc_bitmap = exc_bitmap;

		vmx->state[i].nextrip = ~0;
		vmx->state[i].lastcpu = NOCPU;
		vmx->state[i].vpid = vpid[i];

		/*
		 * Set up the CR0/4 shadows, and init the read shadow
		 * to the power-on register value from the Intel Sys Arch.
		 *  CR0 - 0x60000010
		 *  CR4 - 0
		 */
		error = vmx_setup_cr0_shadow(vmcs, 0x60000010);
		if (error != 0)
			panic("vmx_setup_cr0_shadow %d", error);

		error = vmx_setup_cr4_shadow(vmcs, 0);
		if (error != 0)
			panic("vmx_setup_cr4_shadow %d", error);

		vmx->ctx[i].pmap = pmap;
	}

	return (vmx);
}

static int
vmx_handle_cpuid(struct vm *vm, int vcpu, struct vmxctx *vmxctx)
{
	int handled, func;

	func = vmxctx->guest_rax;

	handled = x86_emulate_cpuid(vm, vcpu,
	    (uint32_t*)(&vmxctx->guest_rax),
	    (uint32_t*)(&vmxctx->guest_rbx),
	    (uint32_t*)(&vmxctx->guest_rcx),
	    (uint32_t*)(&vmxctx->guest_rdx));
	return (handled);
}

static __inline void
vmx_run_trace(struct vmx *vmx, int vcpu)
{
#ifdef KTR
	VCPU_CTR1(vmx->vm, vcpu, "Resume execution at %#lx", vmcs_guest_rip());
#endif
}

static __inline void
vmx_exit_trace(struct vmx *vmx, int vcpu, uint64_t rip, uint32_t exit_reason,
    int handled)
{
#ifdef KTR
	VCPU_CTR3(vmx->vm, vcpu, "%s %s vmexit at 0x%0lx",
	    handled ? "handled" : "unhandled",
	    exit_reason_to_str(exit_reason), rip);
#endif
}

static __inline void
vmx_astpending_trace(struct vmx *vmx, int vcpu, uint64_t rip)
{
#ifdef KTR
	VCPU_CTR1(vmx->vm, vcpu, "astpending vmexit at 0x%0lx", rip);
#endif
}

static VMM_STAT_INTEL(VCPU_INVVPID_SAVED, "Number of vpid invalidations saved");
static VMM_STAT_INTEL(VCPU_INVVPID_DONE, "Number of vpid invalidations done");

/*
 * Invalidate guest mappings identified by its vpid from the TLB.
 */
static __inline void
vmx_invvpid(struct vmx *vmx, int vcpu, pmap_t pmap, int running)
{
	struct vmxstate *vmxstate;
	struct invvpid_desc invvpid_desc;

	vmxstate = &vmx->state[vcpu];
	if (vmxstate->vpid == 0)
		return;

	if (!running) {
		/*
		 * Set the 'lastcpu' to an invalid host cpu.
		 *
		 * This will invalidate TLB entries tagged with the vcpu's
		 * vpid the next time it runs via vmx_set_pcpu_defaults().
		 */
		vmxstate->lastcpu = NOCPU;
		return;
	}

	KASSERT(curthread->td_critnest > 0, ("%s: vcpu %d running outside "
	    "critical section", __func__, vcpu));

	/*
	 * Invalidate all mappings tagged with 'vpid'
	 *
	 * We do this because this vcpu was executing on a different host
	 * cpu when it last ran. We do not track whether it invalidated
	 * mappings associated with its 'vpid' during that run. So we must
	 * assume that the mappings associated with 'vpid' on 'curcpu' are
	 * stale and invalidate them.
	 *
	 * Note that we incur this penalty only when the scheduler chooses to
	 * move the thread associated with this vcpu between host cpus.
	 *
	 * Note also that this will invalidate mappings tagged with 'vpid'
	 * for "all" EP4TAs.
	 */
	if (pmap->pm_eptgen == vmx->eptgen[curcpu]) {
		invvpid_desc._res1 = 0;
		invvpid_desc._res2 = 0;
		invvpid_desc.vpid = vmxstate->vpid;
		invvpid_desc.linear_addr = 0;
		invvpid(INVVPID_TYPE_SINGLE_CONTEXT, invvpid_desc);
		vmm_stat_incr(vmx->vm, vcpu, VCPU_INVVPID_DONE, 1);
	} else {
		/*
		 * The invvpid can be skipped if an invept is going to
		 * be performed before entering the guest. The invept
		 * will invalidate combined mappings tagged with
		 * 'vmx->eptp' for all vpids.
		 */
		vmm_stat_incr(vmx->vm, vcpu, VCPU_INVVPID_SAVED, 1);
	}
}

static void
vmx_set_pcpu_defaults(struct vmx *vmx, int vcpu, pmap_t pmap)
{
	struct vmxstate *vmxstate;

	vmxstate = &vmx->state[vcpu];
	if (vmxstate->lastcpu == curcpu)
		return;

	vmxstate->lastcpu = curcpu;

	vmm_stat_incr(vmx->vm, vcpu, VCPU_MIGRATIONS, 1);

	vmcs_write(VMCS_HOST_TR_BASE, vmm_get_host_trbase());
	vmcs_write(VMCS_HOST_GDTR_BASE, vmm_get_host_gdtrbase());
	vmcs_write(VMCS_HOST_GS_BASE, vmm_get_host_gsbase());
	vmx_invvpid(vmx, vcpu, pmap, 1);
}

/*
 * We depend on 'procbased_ctls' to have the Interrupt Window Exiting bit set.
 */
CTASSERT((PROCBASED_CTLS_ONE_SETTING & PROCBASED_INT_WINDOW_EXITING) != 0);

static void __inline
vmx_set_int_window_exiting(struct vmx *vmx, int vcpu)
{

	if ((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) == 0) {
		vmx->cap[vcpu].proc_ctls |= PROCBASED_INT_WINDOW_EXITING;
		vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
		VCPU_CTR0(vmx->vm, vcpu, "Enabling interrupt window exiting");
	}
}

static void __inline
vmx_clear_int_window_exiting(struct vmx *vmx, int vcpu)
{

	KASSERT((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) != 0,
	    ("intr_window_exiting not set: %#x", vmx->cap[vcpu].proc_ctls));
	vmx->cap[vcpu].proc_ctls &= ~PROCBASED_INT_WINDOW_EXITING;
	vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
	VCPU_CTR0(vmx->vm, vcpu, "Disabling interrupt window exiting");
}

static void __inline
vmx_set_nmi_window_exiting(struct vmx *vmx, int vcpu)
{

	if ((vmx->cap[vcpu].proc_ctls & PROCBASED_NMI_WINDOW_EXITING) == 0) {
		vmx->cap[vcpu].proc_ctls |= PROCBASED_NMI_WINDOW_EXITING;
		vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
		VCPU_CTR0(vmx->vm, vcpu, "Enabling NMI window exiting");
	}
}

static void __inline
vmx_clear_nmi_window_exiting(struct vmx *vmx, int vcpu)
{

	KASSERT((vmx->cap[vcpu].proc_ctls & PROCBASED_NMI_WINDOW_EXITING) != 0,
	    ("nmi_window_exiting not set %#x", vmx->cap[vcpu].proc_ctls));
	vmx->cap[vcpu].proc_ctls &= ~PROCBASED_NMI_WINDOW_EXITING;
	vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
	VCPU_CTR0(vmx->vm, vcpu, "Disabling NMI window exiting");
}

int
vmx_set_tsc_offset(struct vmx *vmx, int vcpu, uint64_t offset)
{
	int error;

	if ((vmx->cap[vcpu].proc_ctls & PROCBASED_TSC_OFFSET) == 0) {
		vmx->cap[vcpu].proc_ctls |= PROCBASED_TSC_OFFSET;
		vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
		VCPU_CTR0(vmx->vm, vcpu, "Enabling TSC offsetting");
	}

	error = vmwrite(VMCS_TSC_OFFSET, offset);

	return (error);
}

#define	NMI_BLOCKING	(VMCS_INTERRUPTIBILITY_NMI_BLOCKING |		\
			 VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING)
#define	HWINTR_BLOCKING	(VMCS_INTERRUPTIBILITY_STI_BLOCKING |		\
			 VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING)

static void
vmx_inject_nmi(struct vmx *vmx, int vcpu)
{
	uint32_t gi, info;

	gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
	KASSERT((gi & NMI_BLOCKING) == 0, ("vmx_inject_nmi: invalid guest "
	    "interruptibility-state %#x", gi));

	info = vmcs_read(VMCS_ENTRY_INTR_INFO);
	KASSERT((info & VMCS_INTR_VALID) == 0, ("vmx_inject_nmi: invalid "
	    "VM-entry interruption information %#x", info));

	/*
	 * Inject the virtual NMI. The vector must be the NMI IDT entry
	 * or the VMCS entry check will fail.
	 */
	info = IDT_NMI | VMCS_INTR_T_NMI | VMCS_INTR_VALID;
	vmcs_write(VMCS_ENTRY_INTR_INFO, info);

	VCPU_CTR0(vmx->vm, vcpu, "Injecting vNMI");

	/* Clear the request */
	vm_nmi_clear(vmx->vm, vcpu);
}

static void
vmx_inject_interrupts(struct vmx *vmx, int vcpu, struct vlapic *vlapic,
    uint64_t guestrip)
{
	int vector, need_nmi_exiting, extint_pending;
	uint64_t rflags, entryinfo;
	uint32_t gi, info;

	if (vmx->state[vcpu].nextrip != guestrip) {
		gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
		if (gi & HWINTR_BLOCKING) {
			VCPU_CTR2(vmx->vm, vcpu, "Guest interrupt blocking "
			    "cleared due to rip change: %#lx/%#lx",
			    vmx->state[vcpu].nextrip, guestrip);
			gi &= ~HWINTR_BLOCKING;
			vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi);
		}
	}

	if (vm_entry_intinfo(vmx->vm, vcpu, &entryinfo)) {
		KASSERT((entryinfo & VMCS_INTR_VALID) != 0, ("%s: entry "
		    "intinfo is not valid: %#lx", __func__, entryinfo));

		info = vmcs_read(VMCS_ENTRY_INTR_INFO);
		KASSERT((info & VMCS_INTR_VALID) == 0, ("%s: cannot inject "
		    "pending exception: %#lx/%#x", __func__, entryinfo, info));

		info = entryinfo;
		vector = info & 0xff;
		if (vector == IDT_BP || vector == IDT_OF) {
			/*
			 * VT-x requires #BP and #OF to be injected as software
			 * exceptions.
			 */
			info &= ~VMCS_INTR_T_MASK;
			info |= VMCS_INTR_T_SWEXCEPTION;
		}

		if (info & VMCS_INTR_DEL_ERRCODE)
			vmcs_write(VMCS_ENTRY_EXCEPTION_ERROR, entryinfo >> 32);

		vmcs_write(VMCS_ENTRY_INTR_INFO, info);
	}

	if (vm_nmi_pending(vmx->vm, vcpu)) {
		/*
		 * If there are no conditions blocking NMI injection then
		 * inject it directly here otherwise enable "NMI window
		 * exiting" to inject it as soon as we can.
		 *
		 * We also check for STI_BLOCKING because some implementations
		 * don't allow NMI injection in this case. If we are running
		 * on a processor that doesn't have this restriction it will
		 * immediately exit and the NMI will be injected in the
		 * "NMI window exiting" handler.
		 */
		need_nmi_exiting = 1;
		gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
		if ((gi & (HWINTR_BLOCKING | NMI_BLOCKING)) == 0) {
			info = vmcs_read(VMCS_ENTRY_INTR_INFO);
			if ((info & VMCS_INTR_VALID) == 0) {
				vmx_inject_nmi(vmx, vcpu);
				need_nmi_exiting = 0;
			} else {
				VCPU_CTR1(vmx->vm, vcpu, "Cannot inject NMI "
				    "due to VM-entry intr info %#x", info);
			}
		} else {
			VCPU_CTR1(vmx->vm, vcpu, "Cannot inject NMI due to "
			    "Guest Interruptibility-state %#x", gi);
		}

		if (need_nmi_exiting)
			vmx_set_nmi_window_exiting(vmx, vcpu);
	}

	extint_pending = vm_extint_pending(vmx->vm, vcpu);

	if (!extint_pending && virtual_interrupt_delivery) {
		vmx_inject_pir(vlapic);
		return;
	}

	/*
	 * If interrupt-window exiting is already in effect then don't bother
	 * checking for pending interrupts. This is just an optimization and
	 * not needed for correctness.
	 */
	if ((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) != 0) {
		VCPU_CTR0(vmx->vm, vcpu, "Skip interrupt injection due to "
		    "pending int_window_exiting");
		return;
	}

	if (!extint_pending) {
		/* Ask the local apic for a vector to inject */
		if (!vlapic_pending_intr(vlapic, &vector))
			return;

		/*
		 * From the Intel SDM, Volume 3, Section "Maskable
		 * Hardware Interrupts":
		 * - maskable interrupt vectors [16,255] can be delivered
		 *   through the local APIC.
		 */
		KASSERT(vector >= 16 && vector <= 255,
		    ("invalid vector %d from local APIC", vector));
	} else {
		/* Ask the legacy pic for a vector to inject */
		vatpic_pending_intr(vmx->vm, &vector);

		/*
		 * From the Intel SDM, Volume 3, Section "Maskable
		 * Hardware Interrupts":
		 * - maskable interrupt vectors [0,255] can be delivered
		 *   through the INTR pin.
		 */
		KASSERT(vector >= 0 && vector <= 255,
		    ("invalid vector %d from INTR", vector));
	}

	/* Check RFLAGS.IF and the interruptibility state of the guest */
	rflags = vmcs_read(VMCS_GUEST_RFLAGS);
	if ((rflags & PSL_I) == 0) {
		VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to "
		    "rflags %#lx", vector, rflags);
		goto cantinject;
	}

	gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
	if (gi & HWINTR_BLOCKING) {
		VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to "
		    "Guest Interruptibility-state %#x", vector, gi);
		goto cantinject;
	}

	info = vmcs_read(VMCS_ENTRY_INTR_INFO);
	if (info & VMCS_INTR_VALID) {
		/*
		 * This is expected and could happen for multiple reasons:
		 * - A vectoring VM-entry was aborted due to astpending
		 * - A VM-exit happened during event injection.
		 * - An exception was injected above.
		 * - An NMI was injected above or after "NMI window exiting"
		 */
		VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to "
		    "VM-entry intr info %#x", vector, info);
		goto cantinject;
	}

	/* Inject the interrupt */
	info = VMCS_INTR_T_HWINTR | VMCS_INTR_VALID;
	info |= vector;
	vmcs_write(VMCS_ENTRY_INTR_INFO, info);

	if (!extint_pending) {
		/* Update the Local APIC ISR */
		vlapic_intr_accepted(vlapic, vector);
	} else {
		vm_extint_clear(vmx->vm, vcpu);
		vatpic_intr_accepted(vmx->vm, vector);

		/*
		 * After we accepted the current ExtINT the PIC may
		 * have posted another one. If that is the case, set
		 * the Interrupt Window Exiting execution control so
		 * we can inject that one too.
		 *
		 * Also, interrupt window exiting allows us to inject any
		 * pending APIC vector that was preempted by the ExtINT
		 * as soon as possible. This applies both for the software
		 * emulated vlapic and the hardware assisted virtual APIC.
		 */
		vmx_set_int_window_exiting(vmx, vcpu);
	}

	VCPU_CTR1(vmx->vm, vcpu, "Injecting hwintr at vector %d", vector);

	return;

cantinject:
	/*
	 * Set the Interrupt Window Exiting execution control so we can inject
	 * the interrupt as soon as blocking condition goes away.
	 */
	vmx_set_int_window_exiting(vmx, vcpu);
}

/*
 * If the Virtual NMIs execution control is '1' then the logical processor
 * tracks virtual-NMI blocking in the Guest Interruptibility-state field of
 * the VMCS. An IRET instruction in VMX non-root operation will remove any
 * virtual-NMI blocking.
 *
 * This unblocking occurs even if the IRET causes a fault. In this case the
 * hypervisor needs to restore virtual-NMI blocking before resuming the guest.
 */
static void
vmx_restore_nmi_blocking(struct vmx *vmx, int vcpuid)
{
	uint32_t gi;

	VCPU_CTR0(vmx->vm, vcpuid, "Restore Virtual-NMI blocking");
	gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
	gi |= VMCS_INTERRUPTIBILITY_NMI_BLOCKING;
	vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi);
}

static void
vmx_clear_nmi_blocking(struct vmx *vmx, int vcpuid)
{
	uint32_t gi;

	VCPU_CTR0(vmx->vm, vcpuid, "Clear Virtual-NMI blocking");
	gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
	gi &= ~VMCS_INTERRUPTIBILITY_NMI_BLOCKING;
	vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi);
}

static void
vmx_assert_nmi_blocking(struct vmx *vmx, int vcpuid)
{
	uint32_t gi;

	gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
	KASSERT(gi & VMCS_INTERRUPTIBILITY_NMI_BLOCKING,
	    ("NMI blocking is not in effect %#x", gi));
}

static int
vmx_emulate_xsetbv(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
{
	struct vmxctx *vmxctx;
	uint64_t xcrval;
	const struct xsave_limits *limits;

	vmxctx = &vmx->ctx[vcpu];
	limits = vmm_get_xsave_limits();

	/*
	 * Note that the processor raises a GP# fault on its own if
	 * xsetbv is executed for CPL != 0, so we do not have to
	 * emulate that fault here.
	 */

	/* Only xcr0 is supported. */
	if (vmxctx->guest_rcx != 0) {
		vm_inject_gp(vmx->vm, vcpu);
		return (HANDLED);
	}

	/* We only handle xcr0 if both the host and guest have XSAVE enabled. */
	if (!limits->xsave_enabled || !(vmcs_read(VMCS_GUEST_CR4) & CR4_XSAVE)) {
		vm_inject_ud(vmx->vm, vcpu);
		return (HANDLED);
	}

	xcrval = vmxctx->guest_rdx << 32 | (vmxctx->guest_rax & 0xffffffff);
	if ((xcrval & ~limits->xcr0_allowed) != 0) {
		vm_inject_gp(vmx->vm, vcpu);
		return (HANDLED);
	}

	if (!(xcrval & XFEATURE_ENABLED_X87)) {
		vm_inject_gp(vmx->vm, vcpu);
		return (HANDLED);
	}

	/* AVX (YMM_Hi128) requires SSE. */
	if (xcrval & XFEATURE_ENABLED_AVX &&
	    (xcrval & XFEATURE_AVX) != XFEATURE_AVX) {
		vm_inject_gp(vmx->vm, vcpu);
		return (HANDLED);
	}

	/*
	 * AVX512 requires base AVX (YMM_Hi128) as well as OpMask,
	 * ZMM_Hi256, and Hi16_ZMM.
	 */
	if (xcrval & XFEATURE_AVX512 &&
	    (xcrval & (XFEATURE_AVX512 | XFEATURE_AVX)) !=
	    (XFEATURE_AVX512 | XFEATURE_AVX)) {
		vm_inject_gp(vmx->vm, vcpu);
		return (HANDLED);
	}

	/*
	 * Intel MPX requires both bound register state flags to be
	 * set.
	 */
	if (((xcrval & XFEATURE_ENABLED_BNDREGS) != 0) !=
	    ((xcrval & XFEATURE_ENABLED_BNDCSR) != 0)) {
		vm_inject_gp(vmx->vm, vcpu);
		return (HANDLED);
	}

	/*
	 * This runs "inside" vmrun() with the guest's FPU state, so
	 * modifying xcr0 directly modifies the guest's xcr0, not the
	 * host's.
	 */
	load_xcr(0, xcrval);
	return (HANDLED);
}

static uint64_t
vmx_get_guest_reg(struct vmx *vmx, int vcpu, int ident)
{
	const struct vmxctx *vmxctx;

	vmxctx = &vmx->ctx[vcpu];

	switch (ident) {
	case 0:
		return (vmxctx->guest_rax);
	case 1:
		return (vmxctx->guest_rcx);
	case 2:
		return (vmxctx->guest_rdx);
	case 3:
		return (vmxctx->guest_rbx);
	case 4:
		return (vmcs_read(VMCS_GUEST_RSP));
	case 5:
		return (vmxctx->guest_rbp);
	case 6:
		return (vmxctx->guest_rsi);
	case 7:
		return (vmxctx->guest_rdi);
	case 8:
		return (vmxctx->guest_r8);
	case 9:
		return (vmxctx->guest_r9);
	case 10:
		return (vmxctx->guest_r10);
	case 11:
		return (vmxctx->guest_r11);
	case 12:
		return (vmxctx->guest_r12);
	case 13:
		return (vmxctx->guest_r13);
	case 14:
		return (vmxctx->guest_r14);
	case 15:
		return (vmxctx->guest_r15);
	default:
		panic("invalid vmx register %d", ident);
	}
}

static void
vmx_set_guest_reg(struct vmx *vmx, int vcpu, int ident, uint64_t regval)
{
	struct vmxctx *vmxctx;

	vmxctx = &vmx->ctx[vcpu];

	switch (ident) {
	case 0:
		vmxctx->guest_rax = regval;
		break;
	case 1:
		vmxctx->guest_rcx = regval;
		break;
	case 2:
		vmxctx->guest_rdx = regval;
		break;
	case 3:
		vmxctx->guest_rbx = regval;
		break;
	case 4:
		vmcs_write(VMCS_GUEST_RSP, regval);
		break;
	case 5:
		vmxctx->guest_rbp = regval;
		break;
	case 6:
		vmxctx->guest_rsi = regval;
		break;
	case 7:
		vmxctx->guest_rdi = regval;
		break;
	case 8:
		vmxctx->guest_r8 = regval;
		break;
	case 9:
		vmxctx->guest_r9 = regval;
		break;
	case 10:
		vmxctx->guest_r10 = regval;
		break;
	case 11:
		vmxctx->guest_r11 = regval;
		break;
	case 12:
		vmxctx->guest_r12 = regval;
		break;
	case 13:
		vmxctx->guest_r13 = regval;
		break;
	case 14:
		vmxctx->guest_r14 = regval;
		break;
	case 15:
		vmxctx->guest_r15 = regval;
		break;
	default:
		panic("invalid vmx register %d", ident);
	}
}

static int
vmx_emulate_cr0_access(struct vmx *vmx, int vcpu, uint64_t exitqual)
{
	uint64_t crval, regval;

	/* We only handle mov to %cr0 at this time */
	if ((exitqual & 0xf0) != 0x00)
		return (UNHANDLED);

	regval = vmx_get_guest_reg(vmx, vcpu, (exitqual >> 8) & 0xf);

	vmcs_write(VMCS_CR0_SHADOW, regval);

	crval = regval | cr0_ones_mask;
	crval &= ~cr0_zeros_mask;
	vmcs_write(VMCS_GUEST_CR0, crval);

	if (regval & CR0_PG) {
		uint64_t efer, entry_ctls;

		/*
		 * If CR0.PG is 1 and EFER.LME is 1 then EFER.LMA and
		 * the "IA-32e mode guest" bit in VM-entry control must be
		 * equal.
		 */
		efer = vmcs_read(VMCS_GUEST_IA32_EFER);
		if (efer & EFER_LME) {
			efer |= EFER_LMA;
			vmcs_write(VMCS_GUEST_IA32_EFER, efer);
			entry_ctls = vmcs_read(VMCS_ENTRY_CTLS);
			entry_ctls |= VM_ENTRY_GUEST_LMA;
			vmcs_write(VMCS_ENTRY_CTLS, entry_ctls);
		}
	}

	return (HANDLED);
}

static int
vmx_emulate_cr4_access(struct vmx *vmx, int vcpu, uint64_t exitqual)
{
	uint64_t crval, regval;

	/* We only handle mov to %cr4 at this time */
	if ((exitqual & 0xf0) != 0x00)
		return (UNHANDLED);

	regval = vmx_get_guest_reg(vmx, vcpu, (exitqual >> 8) & 0xf);

	vmcs_write(VMCS_CR4_SHADOW, regval);

	crval = regval | cr4_ones_mask;
	crval &= ~cr4_zeros_mask;
	vmcs_write(VMCS_GUEST_CR4, crval);

	return (HANDLED);
}

static int
vmx_emulate_cr8_access(struct vmx *vmx, int vcpu, uint64_t exitqual)
{
	struct vlapic *vlapic;
	uint64_t cr8;
	int regnum;

	/* We only handle mov %cr8 to/from a register at this time. */
	if ((exitqual & 0xe0) != 0x00) {
		return (UNHANDLED);
	}

	vlapic = vm_lapic(vmx->vm, vcpu);
	regnum = (exitqual >> 8) & 0xf;
	if (exitqual & 0x10) {
		cr8 = vlapic_get_cr8(vlapic);
		vmx_set_guest_reg(vmx, vcpu, regnum, cr8);
	} else {
		cr8 = vmx_get_guest_reg(vmx, vcpu, regnum);
		vlapic_set_cr8(vlapic, cr8);
	}

	return (HANDLED);
}

/*
 * From section "Guest Register State" in the Intel SDM: CPL = SS.DPL
 */
static int
vmx_cpl(void)
{
	uint32_t ssar;

	ssar = vmcs_read(VMCS_GUEST_SS_ACCESS_RIGHTS);
	return ((ssar >> 5) & 0x3);
}

static enum vm_cpu_mode
vmx_cpu_mode(void)
{
	uint32_t csar;

	if (vmcs_read(VMCS_GUEST_IA32_EFER) & EFER_LMA) {
		csar = vmcs_read(VMCS_GUEST_CS_ACCESS_RIGHTS);
		if (csar & 0x2000)
			return (CPU_MODE_64BIT);	/* CS.L = 1 */
		else
			return (CPU_MODE_COMPATIBILITY);
	} else if (vmcs_read(VMCS_GUEST_CR0) & CR0_PE) {
		return (CPU_MODE_PROTECTED);
	} else {
		return (CPU_MODE_REAL);
	}
}

static enum vm_paging_mode
vmx_paging_mode(void)
{

	if (!(vmcs_read(VMCS_GUEST_CR0) & CR0_PG))
		return (PAGING_MODE_FLAT);
	if (!(vmcs_read(VMCS_GUEST_CR4) & CR4_PAE))
		return (PAGING_MODE_32);
	if (vmcs_read(VMCS_GUEST_IA32_EFER) & EFER_LME)
		return (PAGING_MODE_64);
	else
		return (PAGING_MODE_PAE);
}

static uint64_t
inout_str_index(struct vmx *vmx, int vcpuid, int in)
{
	uint64_t val;
	int error;
	enum vm_reg_name reg;
VM_REG_GUEST_RDI : VM_REG_GUEST_RSI; 1949 error = vmx_getreg(vmx, vcpuid, reg, &val); 1950 KASSERT(error == 0, ("%s: vmx_getreg error %d", __func__, error)); 1951 return (val); 1952} 1953 1954static uint64_t 1955inout_str_count(struct vmx *vmx, int vcpuid, int rep) 1956{ 1957 uint64_t val; 1958 int error; 1959 1960 if (rep) { 1961 error = vmx_getreg(vmx, vcpuid, VM_REG_GUEST_RCX, &val); 1962 KASSERT(!error, ("%s: vmx_getreg error %d", __func__, error)); 1963 } else { 1964 val = 1; 1965 } 1966 return (val); 1967} 1968 1969static int 1970inout_str_addrsize(uint32_t inst_info) 1971{ 1972 uint32_t size; 1973 1974 size = (inst_info >> 7) & 0x7; 1975 switch (size) { 1976 case 0: 1977 return (2); /* 16 bit */ 1978 case 1: 1979 return (4); /* 32 bit */ 1980 case 2: 1981 return (8); /* 64 bit */ 1982 default: 1983 panic("%s: invalid size encoding %d", __func__, size); 1984 } 1985} 1986 1987static void 1988inout_str_seginfo(struct vmx *vmx, int vcpuid, uint32_t inst_info, int in, 1989 struct vm_inout_str *vis) 1990{ 1991 int error, s; 1992 1993 if (in) { 1994 vis->seg_name = VM_REG_GUEST_ES; 1995 } else { 1996 s = (inst_info >> 15) & 0x7; 1997 vis->seg_name = vm_segment_name(s); 1998 } 1999 2000 error = vmx_getdesc(vmx, vcpuid, vis->seg_name, &vis->seg_desc); 2001 KASSERT(error == 0, ("%s: vmx_getdesc error %d", __func__, error)); 2002} 2003 2004static void 2005vmx_paging_info(struct vm_guest_paging *paging) 2006{ 2007 paging->cr3 = vmcs_guest_cr3(); 2008 paging->cpl = vmx_cpl(); 2009 paging->cpu_mode = vmx_cpu_mode(); 2010 paging->paging_mode = vmx_paging_mode(); 2011} 2012 2013static void 2014vmexit_inst_emul(struct vm_exit *vmexit, uint64_t gpa, uint64_t gla) 2015{ 2016 struct vm_guest_paging *paging; 2017 uint32_t csar; 2018 2019 paging = &vmexit->u.inst_emul.paging; 2020 2021 vmexit->exitcode = VM_EXITCODE_INST_EMUL; 2022 vmexit->inst_length = 0; 2023 vmexit->u.inst_emul.gpa = gpa; 2024 vmexit->u.inst_emul.gla = gla; 2025 vmx_paging_info(paging); 2026 switch (paging->cpu_mode) { 2027 case CPU_MODE_REAL: 2028 vmexit->u.inst_emul.cs_base = vmcs_read(VMCS_GUEST_CS_BASE); 2029 vmexit->u.inst_emul.cs_d = 0; 2030 break; 2031 case CPU_MODE_PROTECTED: 2032 case CPU_MODE_COMPATIBILITY: 2033 vmexit->u.inst_emul.cs_base = vmcs_read(VMCS_GUEST_CS_BASE); 2034 csar = vmcs_read(VMCS_GUEST_CS_ACCESS_RIGHTS); 2035 vmexit->u.inst_emul.cs_d = SEG_DESC_DEF32(csar); 2036 break; 2037 default: 2038 vmexit->u.inst_emul.cs_base = 0; 2039 vmexit->u.inst_emul.cs_d = 0; 2040 break; 2041 } 2042 vie_init(&vmexit->u.inst_emul.vie, NULL, 0); 2043} 2044 2045static int 2046ept_fault_type(uint64_t ept_qual) 2047{ 2048 int fault_type; 2049 2050 if (ept_qual & EPT_VIOLATION_DATA_WRITE) 2051 fault_type = VM_PROT_WRITE; 2052 else if (ept_qual & EPT_VIOLATION_INST_FETCH) 2053 fault_type = VM_PROT_EXECUTE; 2054 else 2055 fault_type= VM_PROT_READ; 2056 2057 return (fault_type); 2058} 2059 2060static bool 2061ept_emulation_fault(uint64_t ept_qual) 2062{ 2063 int read, write; 2064 2065 /* EPT fault on an instruction fetch doesn't make sense here */ 2066 if (ept_qual & EPT_VIOLATION_INST_FETCH) 2067 return (false); 2068 2069 /* EPT fault must be a read fault or a write fault */ 2070 read = ept_qual & EPT_VIOLATION_DATA_READ ? 1 : 0; 2071 write = ept_qual & EPT_VIOLATION_DATA_WRITE ? 1 : 0; 2072 if ((read | write) == 0) 2073 return (false); 2074 2075 /* 2076 * The EPT violation must have been caused by accessing a 2077 * guest-physical address that is a translation of a guest-linear 2078 * address. 
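	 *
	 * If EPT_VIOLATION_GLA_VALID is clear there was no guest-linear
	 * address involved at all, and if EPT_VIOLATION_XLAT_VALID is
	 * clear the access was to a guest paging-structure entry rather
	 * than to the final translation; neither case is an MMIO access
	 * that can be handled by decoding the faulting instruction, so
	 * both bits must be set for emulation to proceed.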
2079 */ 2080 if ((ept_qual & EPT_VIOLATION_GLA_VALID) == 0 || 2081 (ept_qual & EPT_VIOLATION_XLAT_VALID) == 0) { 2082 return (false); 2083 } 2084 2085 return (true); 2086} 2087 2088static __inline int 2089apic_access_virtualization(struct vmx *vmx, int vcpuid) 2090{ 2091 uint32_t proc_ctls2; 2092 2093 proc_ctls2 = vmx->cap[vcpuid].proc_ctls2; 2094 return ((proc_ctls2 & PROCBASED2_VIRTUALIZE_APIC_ACCESSES) ? 1 : 0); 2095} 2096 2097static __inline int 2098x2apic_virtualization(struct vmx *vmx, int vcpuid) 2099{ 2100 uint32_t proc_ctls2; 2101 2102 proc_ctls2 = vmx->cap[vcpuid].proc_ctls2; 2103 return ((proc_ctls2 & PROCBASED2_VIRTUALIZE_X2APIC_MODE) ? 1 : 0); 2104} 2105 2106static int 2107vmx_handle_apic_write(struct vmx *vmx, int vcpuid, struct vlapic *vlapic, 2108 uint64_t qual) 2109{ 2110 int error, handled, offset; 2111 uint32_t *apic_regs, vector; 2112 bool retu; 2113 2114 handled = HANDLED; 2115 offset = APIC_WRITE_OFFSET(qual); 2116 2117 if (!apic_access_virtualization(vmx, vcpuid)) { 2118 /* 2119 * In general there should not be any APIC write VM-exits 2120 * unless APIC-access virtualization is enabled. 2121 * 2122 * However self-IPI virtualization can legitimately trigger 2123 * an APIC-write VM-exit so treat it specially. 2124 */ 2125 if (x2apic_virtualization(vmx, vcpuid) && 2126 offset == APIC_OFFSET_SELF_IPI) { 2127 apic_regs = (uint32_t *)(vlapic->apic_page); 2128 vector = apic_regs[APIC_OFFSET_SELF_IPI / 4]; 2129 vlapic_self_ipi_handler(vlapic, vector); 2130 return (HANDLED); 2131 } else 2132 return (UNHANDLED); 2133 } 2134 2135 switch (offset) { 2136 case APIC_OFFSET_ID: 2137 vlapic_id_write_handler(vlapic); 2138 break; 2139 case APIC_OFFSET_LDR: 2140 vlapic_ldr_write_handler(vlapic); 2141 break; 2142 case APIC_OFFSET_DFR: 2143 vlapic_dfr_write_handler(vlapic); 2144 break; 2145 case APIC_OFFSET_SVR: 2146 vlapic_svr_write_handler(vlapic); 2147 break; 2148 case APIC_OFFSET_ESR: 2149 vlapic_esr_write_handler(vlapic); 2150 break; 2151 case APIC_OFFSET_ICR_LOW: 2152 retu = false; 2153 error = vlapic_icrlo_write_handler(vlapic, &retu); 2154 if (error != 0 || retu) 2155 handled = UNHANDLED; 2156 break; 2157 case APIC_OFFSET_CMCI_LVT: 2158 case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT: 2159 vlapic_lvt_write_handler(vlapic, offset); 2160 break; 2161 case APIC_OFFSET_TIMER_ICR: 2162 vlapic_icrtmr_write_handler(vlapic); 2163 break; 2164 case APIC_OFFSET_TIMER_DCR: 2165 vlapic_dcr_write_handler(vlapic); 2166 break; 2167 default: 2168 handled = UNHANDLED; 2169 break; 2170 } 2171 return (handled); 2172} 2173 2174static bool 2175apic_access_fault(struct vmx *vmx, int vcpuid, uint64_t gpa) 2176{ 2177 2178 if (apic_access_virtualization(vmx, vcpuid) && 2179 (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE)) 2180 return (true); 2181 else 2182 return (false); 2183} 2184 2185static int 2186vmx_handle_apic_access(struct vmx *vmx, int vcpuid, struct vm_exit *vmexit) 2187{ 2188 uint64_t qual; 2189 int access_type, offset, allowed; 2190 2191 if (!apic_access_virtualization(vmx, vcpuid)) 2192 return (UNHANDLED); 2193 2194 qual = vmexit->u.vmx.exit_qualification; 2195 access_type = APIC_ACCESS_TYPE(qual); 2196 offset = APIC_ACCESS_OFFSET(qual); 2197 2198 allowed = 0; 2199 if (access_type == 0) { 2200 /* 2201 * Read data access to the following registers is expected. 
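		 * (An access_type of 0 in the APIC-access exit qualification
		 * denotes a linear data read and 1 a linear data write; any
		 * other type falls through with 'allowed' left at 0.)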
2202 */ 2203 switch (offset) { 2204 case APIC_OFFSET_APR: 2205 case APIC_OFFSET_PPR: 2206 case APIC_OFFSET_RRR: 2207 case APIC_OFFSET_CMCI_LVT: 2208 case APIC_OFFSET_TIMER_CCR: 2209 allowed = 1; 2210 break; 2211 default: 2212 break; 2213 } 2214 } else if (access_type == 1) { 2215 /* 2216 * Write data access to the following registers is expected. 2217 */ 2218 switch (offset) { 2219 case APIC_OFFSET_VER: 2220 case APIC_OFFSET_APR: 2221 case APIC_OFFSET_PPR: 2222 case APIC_OFFSET_RRR: 2223 case APIC_OFFSET_ISR0 ... APIC_OFFSET_ISR7: 2224 case APIC_OFFSET_TMR0 ... APIC_OFFSET_TMR7: 2225 case APIC_OFFSET_IRR0 ... APIC_OFFSET_IRR7: 2226 case APIC_OFFSET_CMCI_LVT: 2227 case APIC_OFFSET_TIMER_CCR: 2228 allowed = 1; 2229 break; 2230 default: 2231 break; 2232 } 2233 } 2234 2235 if (allowed) { 2236 vmexit_inst_emul(vmexit, DEFAULT_APIC_BASE + offset, 2237 VIE_INVALID_GLA); 2238 } 2239 2240 /* 2241 * Regardless of whether the APIC-access is allowed this handler 2242 * always returns UNHANDLED: 2243 * - if the access is allowed then it is handled by emulating the 2244 * instruction that caused the VM-exit (outside the critical section) 2245 * - if the access is not allowed then it will be converted to an 2246 * exitcode of VM_EXITCODE_VMX and will be dealt with in userland. 2247 */ 2248 return (UNHANDLED); 2249} 2250 2251static enum task_switch_reason 2252vmx_task_switch_reason(uint64_t qual) 2253{ 2254 int reason; 2255 2256 reason = (qual >> 30) & 0x3; 2257 switch (reason) { 2258 case 0: 2259 return (TSR_CALL); 2260 case 1: 2261 return (TSR_IRET); 2262 case 2: 2263 return (TSR_JMP); 2264 case 3: 2265 return (TSR_IDT_GATE); 2266 default: 2267 panic("%s: invalid reason %d", __func__, reason); 2268 } 2269} 2270 2271static int 2272emulate_wrmsr(struct vmx *vmx, int vcpuid, u_int num, uint64_t val, bool *retu) 2273{ 2274 int error; 2275 2276 if (lapic_msr(num)) 2277 error = lapic_wrmsr(vmx->vm, vcpuid, num, val, retu); 2278 else 2279 error = vmx_wrmsr(vmx, vcpuid, num, val, retu); 2280 2281 return (error); 2282} 2283 2284static int 2285emulate_rdmsr(struct vmx *vmx, int vcpuid, u_int num, bool *retu) 2286{ 2287 struct vmxctx *vmxctx; 2288 uint64_t result; 2289 uint32_t eax, edx; 2290 int error; 2291 2292 if (lapic_msr(num)) 2293 error = lapic_rdmsr(vmx->vm, vcpuid, num, &result, retu); 2294 else 2295 error = vmx_rdmsr(vmx, vcpuid, num, &result, retu); 2296 2297 if (error == 0) { 2298 eax = result; 2299 vmxctx = &vmx->ctx[vcpuid]; 2300 error = vmxctx_setreg(vmxctx, VM_REG_GUEST_RAX, eax); 2301 KASSERT(error == 0, ("vmxctx_setreg(rax) error %d", error)); 2302 2303 edx = result >> 32; 2304 error = vmxctx_setreg(vmxctx, VM_REG_GUEST_RDX, edx); 2305 KASSERT(error == 0, ("vmxctx_setreg(rdx) error %d", error)); 2306 } 2307 2308 return (error); 2309} 2310 2311static int 2312vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) 2313{ 2314 int error, errcode, errcode_valid, handled, in; 2315 struct vmxctx *vmxctx; 2316 struct vlapic *vlapic; 2317 struct vm_inout_str *vis; 2318 struct vm_task_switch *ts; 2319 uint32_t eax, ecx, edx, idtvec_info, idtvec_err, intr_info, inst_info; 2320 uint32_t intr_type, intr_vec, reason; 2321 uint64_t exitintinfo, qual, gpa; 2322 bool retu; 2323 2324 CTASSERT((PINBASED_CTLS_ONE_SETTING & PINBASED_VIRTUAL_NMI) != 0); 2325 CTASSERT((PINBASED_CTLS_ONE_SETTING & PINBASED_NMI_EXITING) != 0); 2326 2327 handled = UNHANDLED; 2328 vmxctx = &vmx->ctx[vcpu]; 2329 2330 qual = vmexit->u.vmx.exit_qualification; 2331 reason = vmexit->u.vmx.exit_reason; 2332 vmexit->exitcode = 
VM_EXITCODE_BOGUS; 2333 2334 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_COUNT, 1); 2335 SDT_PROBE3(vmm, vmx, exit, entry, vmx, vcpu, vmexit); 2336 2337 /* 2338 * VM-entry failures during or after loading guest state. 2339 * 2340 * These VM-exits are uncommon but must be handled specially 2341 * as most VM-exit fields are not populated as usual. 2342 */ 2343 if (__predict_false(reason == EXIT_REASON_MCE_DURING_ENTRY)) { 2344 VCPU_CTR0(vmx->vm, vcpu, "Handling MCE during VM-entry"); 2345 __asm __volatile("int $18"); 2346 return (1); 2347 } 2348 2349 /* 2350 * VM exits that can be triggered during event delivery need to 2351 * be handled specially by re-injecting the event if the IDT 2352 * vectoring information field's valid bit is set. 2353 * 2354 * See "Information for VM Exits During Event Delivery" in Intel SDM 2355 * for details. 2356 */ 2357 idtvec_info = vmcs_idt_vectoring_info(); 2358 if (idtvec_info & VMCS_IDT_VEC_VALID) { 2359 idtvec_info &= ~(1 << 12); /* clear undefined bit */ 2360 exitintinfo = idtvec_info; 2361 if (idtvec_info & VMCS_IDT_VEC_ERRCODE_VALID) { 2362 idtvec_err = vmcs_idt_vectoring_err(); 2363 exitintinfo |= (uint64_t)idtvec_err << 32; 2364 } 2365 error = vm_exit_intinfo(vmx->vm, vcpu, exitintinfo); 2366 KASSERT(error == 0, ("%s: vm_set_intinfo error %d", 2367 __func__, error)); 2368 2369 /* 2370 * If 'virtual NMIs' are being used and the VM-exit 2371 * happened while injecting an NMI during the previous 2372 * VM-entry, then clear "blocking by NMI" in the 2373 * Guest Interruptibility-State so the NMI can be 2374 * reinjected on the subsequent VM-entry. 2375 * 2376 * However, if the NMI was being delivered through a task 2377 * gate, then the new task must start execution with NMIs 2378 * blocked so don't clear NMI blocking in this case. 2379 */ 2380 intr_type = idtvec_info & VMCS_INTR_T_MASK; 2381 if (intr_type == VMCS_INTR_T_NMI) { 2382 if (reason != EXIT_REASON_TASK_SWITCH) 2383 vmx_clear_nmi_blocking(vmx, vcpu); 2384 else 2385 vmx_assert_nmi_blocking(vmx, vcpu); 2386 } 2387 2388 /* 2389 * Update VM-entry instruction length if the event being 2390 * delivered was a software interrupt or software exception. 2391 */ 2392 if (intr_type == VMCS_INTR_T_SWINTR || 2393 intr_type == VMCS_INTR_T_PRIV_SWEXCEPTION || 2394 intr_type == VMCS_INTR_T_SWEXCEPTION) { 2395 vmcs_write(VMCS_ENTRY_INST_LENGTH, vmexit->inst_length); 2396 } 2397 } 2398 2399 switch (reason) { 2400 case EXIT_REASON_TASK_SWITCH: 2401 ts = &vmexit->u.task_switch; 2402 ts->tsssel = qual & 0xffff; 2403 ts->reason = vmx_task_switch_reason(qual); 2404 ts->ext = 0; 2405 ts->errcode_valid = 0; 2406 vmx_paging_info(&ts->paging); 2407 /* 2408 * If the task switch was due to a CALL, JMP, IRET, software 2409 * interrupt (INT n) or software exception (INT3, INTO), 2410 * then the saved %rip references the instruction that caused 2411 * the task switch. The instruction length field in the VMCS 2412 * is valid in this case. 2413 * 2414 * In all other cases (e.g., NMI, hardware exception) the 2415 * saved %rip is one that would have been saved in the old TSS 2416 * had the task switch completed normally so the instruction 2417 * length field is not needed in this case and is explicitly 2418 * set to 0. 
2419 */ 2420 if (ts->reason == TSR_IDT_GATE) { 2421 KASSERT(idtvec_info & VMCS_IDT_VEC_VALID, 2422 ("invalid idtvec_info %#x for IDT task switch", 2423 idtvec_info)); 2424 intr_type = idtvec_info & VMCS_INTR_T_MASK; 2425 if (intr_type != VMCS_INTR_T_SWINTR && 2426 intr_type != VMCS_INTR_T_SWEXCEPTION && 2427 intr_type != VMCS_INTR_T_PRIV_SWEXCEPTION) { 2428 /* Task switch triggered by external event */ 2429 ts->ext = 1; 2430 vmexit->inst_length = 0; 2431 if (idtvec_info & VMCS_IDT_VEC_ERRCODE_VALID) { 2432 ts->errcode_valid = 1; 2433 ts->errcode = vmcs_idt_vectoring_err(); 2434 } 2435 } 2436 } 2437 vmexit->exitcode = VM_EXITCODE_TASK_SWITCH; 2438 SDT_PROBE4(vmm, vmx, exit, taskswitch, vmx, vcpu, vmexit, ts); 2439 VCPU_CTR4(vmx->vm, vcpu, "task switch reason %d, tss 0x%04x, " 2440 "%s errcode 0x%016lx", ts->reason, ts->tsssel, 2441 ts->ext ? "external" : "internal", 2442 ((uint64_t)ts->errcode << 32) | ts->errcode_valid); 2443 break; 2444 case EXIT_REASON_CR_ACCESS: 2445 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CR_ACCESS, 1); 2446 SDT_PROBE4(vmm, vmx, exit, craccess, vmx, vcpu, vmexit, qual); 2447 switch (qual & 0xf) { 2448 case 0: 2449 handled = vmx_emulate_cr0_access(vmx, vcpu, qual); 2450 break; 2451 case 4: 2452 handled = vmx_emulate_cr4_access(vmx, vcpu, qual); 2453 break; 2454 case 8: 2455 handled = vmx_emulate_cr8_access(vmx, vcpu, qual); 2456 break; 2457 } 2458 break; 2459 case EXIT_REASON_RDMSR: 2460 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_RDMSR, 1); 2461 retu = false; 2462 ecx = vmxctx->guest_rcx; 2463 VCPU_CTR1(vmx->vm, vcpu, "rdmsr 0x%08x", ecx); 2464 SDT_PROBE4(vmm, vmx, exit, rdmsr, vmx, vcpu, vmexit, ecx); 2465 error = emulate_rdmsr(vmx, vcpu, ecx, &retu); 2466 if (error) { 2467 vmexit->exitcode = VM_EXITCODE_RDMSR; 2468 vmexit->u.msr.code = ecx; 2469 } else if (!retu) { 2470 handled = HANDLED; 2471 } else { 2472 /* Return to userspace with a valid exitcode */ 2473 KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS, 2474 ("emulate_rdmsr retu with bogus exitcode")); 2475 } 2476 break; 2477 case EXIT_REASON_WRMSR: 2478 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_WRMSR, 1); 2479 retu = false; 2480 eax = vmxctx->guest_rax; 2481 ecx = vmxctx->guest_rcx; 2482 edx = vmxctx->guest_rdx; 2483 VCPU_CTR2(vmx->vm, vcpu, "wrmsr 0x%08x value 0x%016lx", 2484 ecx, (uint64_t)edx << 32 | eax); 2485 SDT_PROBE5(vmm, vmx, exit, wrmsr, vmx, vmexit, vcpu, ecx, 2486 (uint64_t)edx << 32 | eax); 2487 error = emulate_wrmsr(vmx, vcpu, ecx, 2488 (uint64_t)edx << 32 | eax, &retu); 2489 if (error) { 2490 vmexit->exitcode = VM_EXITCODE_WRMSR; 2491 vmexit->u.msr.code = ecx; 2492 vmexit->u.msr.wval = (uint64_t)edx << 32 | eax; 2493 } else if (!retu) { 2494 handled = HANDLED; 2495 } else { 2496 /* Return to userspace with a valid exitcode */ 2497 KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS, 2498 ("emulate_wrmsr retu with bogus exitcode")); 2499 } 2500 break; 2501 case EXIT_REASON_HLT: 2502 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_HLT, 1); 2503 SDT_PROBE3(vmm, vmx, exit, halt, vmx, vcpu, vmexit); 2504 vmexit->exitcode = VM_EXITCODE_HLT; 2505 vmexit->u.hlt.rflags = vmcs_read(VMCS_GUEST_RFLAGS); 2506 if (virtual_interrupt_delivery) 2507 vmexit->u.hlt.intr_status = 2508 vmcs_read(VMCS_GUEST_INTR_STATUS); 2509 else 2510 vmexit->u.hlt.intr_status = 0; 2511 break; 2512 case EXIT_REASON_MTF: 2513 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_MTRAP, 1); 2514 SDT_PROBE3(vmm, vmx, exit, mtrap, vmx, vcpu, vmexit); 2515 vmexit->exitcode = VM_EXITCODE_MTRAP; 2516 vmexit->inst_length = 0; 2517 break; 2518 case EXIT_REASON_PAUSE: 2519 vmm_stat_incr(vmx->vm, 
vcpu, VMEXIT_PAUSE, 1); 2520 SDT_PROBE3(vmm, vmx, exit, pause, vmx, vcpu, vmexit); 2521 vmexit->exitcode = VM_EXITCODE_PAUSE; 2522 break; 2523 case EXIT_REASON_INTR_WINDOW: 2524 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INTR_WINDOW, 1); 2525 SDT_PROBE3(vmm, vmx, exit, intrwindow, vmx, vcpu, vmexit); 2526 vmx_clear_int_window_exiting(vmx, vcpu); 2527 return (1); 2528 case EXIT_REASON_EXT_INTR: 2529 /* 2530 * External interrupts serve only to cause VM exits and allow 2531 * the host interrupt handler to run. 2532 * 2533 * If this external interrupt triggers a virtual interrupt 2534 * to a VM, then that state will be recorded by the 2535 * host interrupt handler in the VM's softc. We will inject 2536 * this virtual interrupt during the subsequent VM enter. 2537 */ 2538 intr_info = vmcs_read(VMCS_EXIT_INTR_INFO); 2539 SDT_PROBE4(vmm, vmx, exit, interrupt, 2540 vmx, vcpu, vmexit, intr_info); 2541 2542 /* 2543 * XXX: Ignore this exit if VMCS_INTR_VALID is not set. 2544 * This appears to be a bug in VMware Fusion? 2545 */ 2546 if (!(intr_info & VMCS_INTR_VALID)) 2547 return (1); 2548 KASSERT((intr_info & VMCS_INTR_VALID) != 0 && 2549 (intr_info & VMCS_INTR_T_MASK) == VMCS_INTR_T_HWINTR, 2550 ("VM exit interruption info invalid: %#x", intr_info)); 2551 vmx_trigger_hostintr(intr_info & 0xff); 2552 2553 /* 2554 * This is special. We want to treat this as an 'handled' 2555 * VM-exit but not increment the instruction pointer. 2556 */ 2557 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EXTINT, 1); 2558 return (1); 2559 case EXIT_REASON_NMI_WINDOW: 2560 SDT_PROBE3(vmm, vmx, exit, nmiwindow, vmx, vcpu, vmexit); 2561 /* Exit to allow the pending virtual NMI to be injected */ 2562 if (vm_nmi_pending(vmx->vm, vcpu)) 2563 vmx_inject_nmi(vmx, vcpu); 2564 vmx_clear_nmi_window_exiting(vmx, vcpu); 2565 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_NMI_WINDOW, 1); 2566 return (1); 2567 case EXIT_REASON_INOUT: 2568 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INOUT, 1); 2569 vmexit->exitcode = VM_EXITCODE_INOUT; 2570 vmexit->u.inout.bytes = (qual & 0x7) + 1; 2571 vmexit->u.inout.in = in = (qual & 0x8) ? 1 : 0; 2572 vmexit->u.inout.string = (qual & 0x10) ? 1 : 0; 2573 vmexit->u.inout.rep = (qual & 0x20) ? 
1 : 0; 2574 vmexit->u.inout.port = (uint16_t)(qual >> 16); 2575 vmexit->u.inout.eax = (uint32_t)(vmxctx->guest_rax); 2576 if (vmexit->u.inout.string) { 2577 inst_info = vmcs_read(VMCS_EXIT_INSTRUCTION_INFO); 2578 vmexit->exitcode = VM_EXITCODE_INOUT_STR; 2579 vis = &vmexit->u.inout_str; 2580 vmx_paging_info(&vis->paging); 2581 vis->rflags = vmcs_read(VMCS_GUEST_RFLAGS); 2582 vis->cr0 = vmcs_read(VMCS_GUEST_CR0); 2583 vis->index = inout_str_index(vmx, vcpu, in); 2584 vis->count = inout_str_count(vmx, vcpu, vis->inout.rep); 2585 vis->addrsize = inout_str_addrsize(inst_info); 2586 inout_str_seginfo(vmx, vcpu, inst_info, in, vis); 2587 } 2588 SDT_PROBE3(vmm, vmx, exit, inout, vmx, vcpu, vmexit); 2589 break; 2590 case EXIT_REASON_CPUID: 2591 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CPUID, 1); 2592 SDT_PROBE3(vmm, vmx, exit, cpuid, vmx, vcpu, vmexit); 2593 handled = vmx_handle_cpuid(vmx->vm, vcpu, vmxctx); 2594 break; 2595 case EXIT_REASON_EXCEPTION: 2596 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EXCEPTION, 1); 2597 intr_info = vmcs_read(VMCS_EXIT_INTR_INFO); 2598 KASSERT((intr_info & VMCS_INTR_VALID) != 0, 2599 ("VM exit interruption info invalid: %#x", intr_info)); 2600 2601 intr_vec = intr_info & 0xff; 2602 intr_type = intr_info & VMCS_INTR_T_MASK; 2603 2604 /* 2605 * If Virtual NMIs control is 1 and the VM-exit is due to a 2606 * fault encountered during the execution of IRET then we must 2607 * restore the state of "virtual-NMI blocking" before resuming 2608 * the guest. 2609 * 2610 * See "Resuming Guest Software after Handling an Exception". 2611 * See "Information for VM Exits Due to Vectored Events". 2612 */ 2613 if ((idtvec_info & VMCS_IDT_VEC_VALID) == 0 && 2614 (intr_vec != IDT_DF) && 2615 (intr_info & EXIT_QUAL_NMIUDTI) != 0) 2616 vmx_restore_nmi_blocking(vmx, vcpu); 2617 2618 /* 2619 * The NMI has already been handled in vmx_exit_handle_nmi(). 2620 */ 2621 if (intr_type == VMCS_INTR_T_NMI) 2622 return (1); 2623 2624 /* 2625 * Call the machine check handler by hand. Also don't reflect 2626 * the machine check back into the guest. 2627 */ 2628 if (intr_vec == IDT_MC) { 2629 VCPU_CTR0(vmx->vm, vcpu, "Vectoring to MCE handler"); 2630 __asm __volatile("int $18"); 2631 return (1); 2632 } 2633 2634 /* 2635 * If the hypervisor has requested user exits for 2636 * debug exceptions, bounce them out to userland. 2637 */ 2638 if (intr_type == VMCS_INTR_T_SWEXCEPTION && intr_vec == IDT_BP && 2639 (vmx->cap[vcpu].set & (1 << VM_CAP_BPT_EXIT))) { 2640 vmexit->exitcode = VM_EXITCODE_BPT; 2641 vmexit->u.bpt.inst_length = vmexit->inst_length; 2642 vmexit->inst_length = 0; 2643 break; 2644 } 2645 2646 if (intr_vec == IDT_PF) { 2647 error = vmxctx_setreg(vmxctx, VM_REG_GUEST_CR2, qual); 2648 KASSERT(error == 0, ("%s: vmxctx_setreg(cr2) error %d", 2649 __func__, error)); 2650 } 2651 2652 /* 2653 * Software exceptions exhibit trap-like behavior. This in 2654 * turn requires populating the VM-entry instruction length 2655 * so that the %rip in the trap frame is past the INT3/INTO 2656 * instruction. 
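		 *
		 * For example, a one-byte INT3 reflected back into the guest
		 * with an instruction length of 1 causes the %rip saved for
		 * the guest's #BP handler to point at the byte following the
		 * INT3, matching what the instruction would do natively.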
2657 */ 2658 if (intr_type == VMCS_INTR_T_SWEXCEPTION) 2659 vmcs_write(VMCS_ENTRY_INST_LENGTH, vmexit->inst_length); 2660 2661 /* Reflect all other exceptions back into the guest */ 2662 errcode_valid = errcode = 0; 2663 if (intr_info & VMCS_INTR_DEL_ERRCODE) { 2664 errcode_valid = 1; 2665 errcode = vmcs_read(VMCS_EXIT_INTR_ERRCODE); 2666 } 2667 VCPU_CTR2(vmx->vm, vcpu, "Reflecting exception %d/%#x into " 2668 "the guest", intr_vec, errcode); 2669 SDT_PROBE5(vmm, vmx, exit, exception, 2670 vmx, vcpu, vmexit, intr_vec, errcode); 2671 error = vm_inject_exception(vmx->vm, vcpu, intr_vec, 2672 errcode_valid, errcode, 0); 2673 KASSERT(error == 0, ("%s: vm_inject_exception error %d", 2674 __func__, error)); 2675 return (1); 2676 2677 case EXIT_REASON_EPT_FAULT: 2678 /* 2679 * If 'gpa' lies within the address space allocated to 2680 * memory then this must be a nested page fault otherwise 2681 * this must be an instruction that accesses MMIO space. 2682 */ 2683 gpa = vmcs_gpa(); 2684 if (vm_mem_allocated(vmx->vm, vcpu, gpa) || 2685 apic_access_fault(vmx, vcpu, gpa)) { 2686 vmexit->exitcode = VM_EXITCODE_PAGING; 2687 vmexit->inst_length = 0; 2688 vmexit->u.paging.gpa = gpa; 2689 vmexit->u.paging.fault_type = ept_fault_type(qual); 2690 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_NESTED_FAULT, 1); 2691 SDT_PROBE5(vmm, vmx, exit, nestedfault, 2692 vmx, vcpu, vmexit, gpa, qual); 2693 } else if (ept_emulation_fault(qual)) { 2694 vmexit_inst_emul(vmexit, gpa, vmcs_gla()); 2695 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INST_EMUL, 1); 2696 SDT_PROBE4(vmm, vmx, exit, mmiofault, 2697 vmx, vcpu, vmexit, gpa); 2698 } 2699 /* 2700 * If Virtual NMIs control is 1 and the VM-exit is due to an 2701 * EPT fault during the execution of IRET then we must restore 2702 * the state of "virtual-NMI blocking" before resuming. 2703 * 2704 * See description of "NMI unblocking due to IRET" in 2705 * "Exit Qualification for EPT Violations". 2706 */ 2707 if ((idtvec_info & VMCS_IDT_VEC_VALID) == 0 && 2708 (qual & EXIT_QUAL_NMIUDTI) != 0) 2709 vmx_restore_nmi_blocking(vmx, vcpu); 2710 break; 2711 case EXIT_REASON_VIRTUALIZED_EOI: 2712 vmexit->exitcode = VM_EXITCODE_IOAPIC_EOI; 2713 vmexit->u.ioapic_eoi.vector = qual & 0xFF; 2714 SDT_PROBE3(vmm, vmx, exit, eoi, vmx, vcpu, vmexit); 2715 vmexit->inst_length = 0; /* trap-like */ 2716 break; 2717 case EXIT_REASON_APIC_ACCESS: 2718 SDT_PROBE3(vmm, vmx, exit, apicaccess, vmx, vcpu, vmexit); 2719 handled = vmx_handle_apic_access(vmx, vcpu, vmexit); 2720 break; 2721 case EXIT_REASON_APIC_WRITE: 2722 /* 2723 * APIC-write VM exit is trap-like so the %rip is already 2724 * pointing to the next instruction. 
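		 * Zeroing 'inst_length' below keeps the handled-exit path at
		 * the bottom of this function from advancing %rip again and
		 * skipping the instruction that follows.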
2725 */ 2726 vmexit->inst_length = 0; 2727 vlapic = vm_lapic(vmx->vm, vcpu); 2728 SDT_PROBE4(vmm, vmx, exit, apicwrite, 2729 vmx, vcpu, vmexit, vlapic); 2730 handled = vmx_handle_apic_write(vmx, vcpu, vlapic, qual); 2731 break; 2732 case EXIT_REASON_XSETBV: 2733 SDT_PROBE3(vmm, vmx, exit, xsetbv, vmx, vcpu, vmexit); 2734 handled = vmx_emulate_xsetbv(vmx, vcpu, vmexit); 2735 break; 2736 case EXIT_REASON_MONITOR: 2737 SDT_PROBE3(vmm, vmx, exit, monitor, vmx, vcpu, vmexit); 2738 vmexit->exitcode = VM_EXITCODE_MONITOR; 2739 break; 2740 case EXIT_REASON_MWAIT: 2741 SDT_PROBE3(vmm, vmx, exit, mwait, vmx, vcpu, vmexit); 2742 vmexit->exitcode = VM_EXITCODE_MWAIT; 2743 break; 2744 case EXIT_REASON_TPR: 2745 vlapic = vm_lapic(vmx->vm, vcpu); 2746 vlapic_sync_tpr(vlapic); 2747 vmexit->inst_length = 0; 2748 handled = HANDLED; 2749 break; 2750 case EXIT_REASON_VMCALL: 2751 case EXIT_REASON_VMCLEAR: 2752 case EXIT_REASON_VMLAUNCH: 2753 case EXIT_REASON_VMPTRLD: 2754 case EXIT_REASON_VMPTRST: 2755 case EXIT_REASON_VMREAD: 2756 case EXIT_REASON_VMRESUME: 2757 case EXIT_REASON_VMWRITE: 2758 case EXIT_REASON_VMXOFF: 2759 case EXIT_REASON_VMXON: 2760 SDT_PROBE3(vmm, vmx, exit, vminsn, vmx, vcpu, vmexit); 2761 vmexit->exitcode = VM_EXITCODE_VMINSN; 2762 break; 2763 default: 2764 SDT_PROBE4(vmm, vmx, exit, unknown, 2765 vmx, vcpu, vmexit, reason); 2766 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_UNKNOWN, 1); 2767 break; 2768 } 2769 2770 if (handled) { 2771 /* 2772 * It is possible that control is returned to userland 2773 * even though we were able to handle the VM exit in the 2774 * kernel. 2775 * 2776 * In such a case we want to make sure that the userland 2777 * restarts guest execution at the instruction *after* 2778 * the one we just processed. Therefore we update the 2779 * guest rip in the VMCS and in 'vmexit'. 2780 */ 2781 vmexit->rip += vmexit->inst_length; 2782 vmexit->inst_length = 0; 2783 vmcs_write(VMCS_GUEST_RIP, vmexit->rip); 2784 } else { 2785 if (vmexit->exitcode == VM_EXITCODE_BOGUS) { 2786 /* 2787 * If this VM exit was not claimed by anybody then 2788 * treat it as a generic VMX exit. 2789 */ 2790 vmexit->exitcode = VM_EXITCODE_VMX; 2791 vmexit->u.vmx.status = VM_SUCCESS; 2792 vmexit->u.vmx.inst_type = 0; 2793 vmexit->u.vmx.inst_error = 0; 2794 } else { 2795 /* 2796 * The exitcode and collateral have been populated. 2797 * The VM exit will be processed further in userland. 2798 */ 2799 } 2800 } 2801 2802 SDT_PROBE4(vmm, vmx, exit, return, 2803 vmx, vcpu, vmexit, handled); 2804 return (handled); 2805} 2806 2807static __inline void 2808vmx_exit_inst_error(struct vmxctx *vmxctx, int rc, struct vm_exit *vmexit) 2809{ 2810 2811 KASSERT(vmxctx->inst_fail_status != VM_SUCCESS, 2812 ("vmx_exit_inst_error: invalid inst_fail_status %d", 2813 vmxctx->inst_fail_status)); 2814 2815 vmexit->inst_length = 0; 2816 vmexit->exitcode = VM_EXITCODE_VMX; 2817 vmexit->u.vmx.status = vmxctx->inst_fail_status; 2818 vmexit->u.vmx.inst_error = vmcs_instruction_error(); 2819 vmexit->u.vmx.exit_reason = ~0; 2820 vmexit->u.vmx.exit_qualification = ~0; 2821 2822 switch (rc) { 2823 case VMX_VMRESUME_ERROR: 2824 case VMX_VMLAUNCH_ERROR: 2825 vmexit->u.vmx.inst_type = rc; 2826 break; 2827 default: 2828 panic("vm_exit_inst_error: vmx_enter_guest returned %d", rc); 2829 } 2830} 2831 2832/* 2833 * If the NMI-exiting VM execution control is set to '1' then an NMI in 2834 * non-root operation causes a VM-exit. NMI blocking is in effect so it is 2835 * sufficient to simply vector to the NMI handler via a software interrupt. 
2836 * However, this must be done before maskable interrupts are enabled 2837 * otherwise the "iret" issued by an interrupt handler will incorrectly 2838 * clear NMI blocking. 2839 */ 2840static __inline void 2841vmx_exit_handle_nmi(struct vmx *vmx, int vcpuid, struct vm_exit *vmexit) 2842{ 2843 uint32_t intr_info; 2844 2845 KASSERT((read_rflags() & PSL_I) == 0, ("interrupts enabled")); 2846 2847 if (vmexit->u.vmx.exit_reason != EXIT_REASON_EXCEPTION) 2848 return; 2849 2850 intr_info = vmcs_read(VMCS_EXIT_INTR_INFO); 2851 KASSERT((intr_info & VMCS_INTR_VALID) != 0, 2852 ("VM exit interruption info invalid: %#x", intr_info)); 2853 2854 if ((intr_info & VMCS_INTR_T_MASK) == VMCS_INTR_T_NMI) { 2855 KASSERT((intr_info & 0xff) == IDT_NMI, ("VM exit due " 2856 "to NMI has invalid vector: %#x", intr_info)); 2857 VCPU_CTR0(vmx->vm, vcpuid, "Vectoring to NMI handler"); 2858 __asm __volatile("int $2"); 2859 } 2860} 2861 2862static __inline void 2863vmx_dr_enter_guest(struct vmxctx *vmxctx) 2864{ 2865 register_t rflags; 2866 2867 /* Save host control debug registers. */ 2868 vmxctx->host_dr7 = rdr7(); 2869 vmxctx->host_debugctl = rdmsr(MSR_DEBUGCTLMSR); 2870 2871 /* 2872 * Disable debugging in DR7 and DEBUGCTL to avoid triggering 2873 * exceptions in the host based on the guest DRx values. The 2874 * guest DR7 and DEBUGCTL are saved/restored in the VMCS. 2875 */ 2876 load_dr7(0); 2877 wrmsr(MSR_DEBUGCTLMSR, 0); 2878 2879 /* 2880 * Disable single stepping the kernel to avoid corrupting the 2881 * guest DR6. A debugger might still be able to corrupt the 2882 * guest DR6 by setting a breakpoint after this point and then 2883 * single stepping. 2884 */ 2885 rflags = read_rflags(); 2886 vmxctx->host_tf = rflags & PSL_T; 2887 write_rflags(rflags & ~PSL_T); 2888 2889 /* Save host debug registers. */ 2890 vmxctx->host_dr0 = rdr0(); 2891 vmxctx->host_dr1 = rdr1(); 2892 vmxctx->host_dr2 = rdr2(); 2893 vmxctx->host_dr3 = rdr3(); 2894 vmxctx->host_dr6 = rdr6(); 2895 2896 /* Restore guest debug registers. */ 2897 load_dr0(vmxctx->guest_dr0); 2898 load_dr1(vmxctx->guest_dr1); 2899 load_dr2(vmxctx->guest_dr2); 2900 load_dr3(vmxctx->guest_dr3); 2901 load_dr6(vmxctx->guest_dr6); 2902} 2903 2904static __inline void 2905vmx_dr_leave_guest(struct vmxctx *vmxctx) 2906{ 2907 2908 /* Save guest debug registers. */ 2909 vmxctx->guest_dr0 = rdr0(); 2910 vmxctx->guest_dr1 = rdr1(); 2911 vmxctx->guest_dr2 = rdr2(); 2912 vmxctx->guest_dr3 = rdr3(); 2913 vmxctx->guest_dr6 = rdr6(); 2914 2915 /* 2916 * Restore host debug registers. Restore DR7, DEBUGCTL, and 2917 * PSL_T last. 
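	 * DR0-DR3 and DR6 are reloaded first so that host breakpoints are
	 * not re-armed (via DR7), branch tracing is not re-enabled (via
	 * DEBUGCTL) and single-stepping does not resume (via PSL_T) while
	 * guest values are still live in the debug registers.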
2918 */ 2919 load_dr0(vmxctx->host_dr0); 2920 load_dr1(vmxctx->host_dr1); 2921 load_dr2(vmxctx->host_dr2); 2922 load_dr3(vmxctx->host_dr3); 2923 load_dr6(vmxctx->host_dr6); 2924 wrmsr(MSR_DEBUGCTLMSR, vmxctx->host_debugctl); 2925 load_dr7(vmxctx->host_dr7); 2926 write_rflags(read_rflags() | vmxctx->host_tf); 2927} 2928 2929static __inline void 2930vmx_pmap_activate(struct vmx *vmx, pmap_t pmap) 2931{ 2932 long eptgen; 2933 int cpu; 2934 2935 cpu = curcpu; 2936 2937 CPU_SET_ATOMIC(cpu, &pmap->pm_active); 2938 eptgen = atomic_load_long(&pmap->pm_eptgen); 2939 if (eptgen != vmx->eptgen[cpu]) { 2940 vmx->eptgen[cpu] = eptgen; 2941 invept(INVEPT_TYPE_SINGLE_CONTEXT, 2942 (struct invept_desc){ .eptp = vmx->eptp, ._res = 0 }); 2943 } 2944} 2945 2946static __inline void 2947vmx_pmap_deactivate(struct vmx *vmx, pmap_t pmap) 2948{ 2949 CPU_CLR_ATOMIC(curcpu, &pmap->pm_active); 2950} 2951 2952static int 2953vmx_run(void *arg, int vcpu, register_t rip, pmap_t pmap, 2954 struct vm_eventinfo *evinfo) 2955{ 2956 int rc, handled, launched; 2957 struct vmx *vmx; 2958 struct vm *vm; 2959 struct vmxctx *vmxctx; 2960 struct vmcs *vmcs; 2961 struct vm_exit *vmexit; 2962 struct vlapic *vlapic; 2963 uint32_t exit_reason; 2964 struct region_descriptor gdtr, idtr; 2965 uint16_t ldt_sel; 2966 2967 vmx = arg; 2968 vm = vmx->vm; 2969 vmcs = &vmx->vmcs[vcpu]; 2970 vmxctx = &vmx->ctx[vcpu]; 2971 vlapic = vm_lapic(vm, vcpu); 2972 vmexit = vm_exitinfo(vm, vcpu); 2973 launched = 0; 2974 2975 KASSERT(vmxctx->pmap == pmap, 2976 ("pmap %p different than ctx pmap %p", pmap, vmxctx->pmap)); 2977 2978 vmx_msr_guest_enter(vmx, vcpu); 2979 2980 VMPTRLD(vmcs); 2981 2982 /* 2983 * XXX 2984 * We do this every time because we may setup the virtual machine 2985 * from a different process than the one that actually runs it. 2986 * 2987 * If the life of a virtual machine was spent entirely in the context 2988 * of a single process we could do this once in vmx_vminit(). 2989 */ 2990 vmcs_write(VMCS_HOST_CR3, rcr3()); 2991 2992 vmcs_write(VMCS_GUEST_RIP, rip); 2993 vmx_set_pcpu_defaults(vmx, vcpu, pmap); 2994 do { 2995 KASSERT(vmcs_guest_rip() == rip, ("%s: vmcs guest rip mismatch " 2996 "%#lx/%#lx", __func__, vmcs_guest_rip(), rip)); 2997 2998 handled = UNHANDLED; 2999 /* 3000 * Interrupts are disabled from this point on until the 3001 * guest starts executing. This is done for the following 3002 * reasons: 3003 * 3004 * If an AST is asserted on this thread after the check below, 3005 * then the IPI_AST notification will not be lost, because it 3006 * will cause a VM exit due to external interrupt as soon as 3007 * the guest state is loaded. 3008 * 3009 * A posted interrupt after 'vmx_inject_interrupts()' will 3010 * not be "lost" because it will be held pending in the host 3011 * APIC because interrupts are disabled. The pending interrupt 3012 * will be recognized as soon as the guest state is loaded. 3013 * 3014 * The same reasoning applies to the IPI generated by 3015 * pmap_invalidate_ept(). 3016 */ 3017 disable_intr(); 3018 vmx_inject_interrupts(vmx, vcpu, vlapic, rip); 3019 3020 /* 3021 * Check for vcpu suspension after injecting events because 3022 * vmx_inject_interrupts() can suspend the vcpu due to a 3023 * triple fault. 
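		 * Each of the early-exit checks below re-enables interrupts
		 * before breaking out of the loop, balancing the
		 * disable_intr() above.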
3024 */ 3025 if (vcpu_suspended(evinfo)) { 3026 enable_intr(); 3027 vm_exit_suspended(vmx->vm, vcpu, rip); 3028 break; 3029 } 3030 3031 if (vcpu_rendezvous_pending(evinfo)) { 3032 enable_intr(); 3033 vm_exit_rendezvous(vmx->vm, vcpu, rip); 3034 break; 3035 } 3036 3037 if (vcpu_reqidle(evinfo)) { 3038 enable_intr(); 3039 vm_exit_reqidle(vmx->vm, vcpu, rip); 3040 break; 3041 } 3042 3043 if (vcpu_should_yield(vm, vcpu)) { 3044 enable_intr(); 3045 vm_exit_astpending(vmx->vm, vcpu, rip); 3046 vmx_astpending_trace(vmx, vcpu, rip); 3047 handled = HANDLED; 3048 break; 3049 } 3050 3051 if (vcpu_debugged(vm, vcpu)) { 3052 enable_intr(); 3053 vm_exit_debug(vmx->vm, vcpu, rip); 3054 break; 3055 } 3056 3057 /* 3058 * If TPR Shadowing is enabled, the TPR Threshold 3059 * must be updated right before entering the guest. 3060 */ 3061 if (tpr_shadowing && !virtual_interrupt_delivery) { 3062 if ((vmx->cap[vcpu].proc_ctls & PROCBASED_USE_TPR_SHADOW) != 0) { 3063 vmcs_write(VMCS_TPR_THRESHOLD, vlapic_get_cr8(vlapic)); 3064 } 3065 } 3066 3067 /* 3068 * VM exits restore the base address but not the 3069 * limits of GDTR and IDTR. The VMCS only stores the 3070 * base address, so VM exits set the limits to 0xffff. 3071 * Save and restore the full GDTR and IDTR to restore 3072 * the limits. 3073 * 3074 * The VMCS does not save the LDTR at all, and VM 3075 * exits clear LDTR as if a NULL selector were loaded. 3076 * The userspace hypervisor probably doesn't use a 3077 * LDT, but save and restore it to be safe. 3078 */ 3079 sgdt(&gdtr); 3080 sidt(&idtr); 3081 ldt_sel = sldt(); 3082 3083 /* 3084 * The TSC_AUX MSR must be saved/restored while interrupts 3085 * are disabled so that it is not possible for the guest 3086 * TSC_AUX MSR value to be overwritten by the resume 3087 * portion of the IPI_SUSPEND codepath. This is why the 3088 * transition of this MSR is handled separately from those 3089 * handled by vmx_msr_guest_{enter,exit}(), which are ok to 3090 * be transitioned with preemption disabled but interrupts 3091 * enabled. 3092 * 3093 * These vmx_msr_guest_{enter,exit}_tsc_aux() calls can be 3094 * anywhere in this loop so long as they happen with 3095 * interrupts disabled. This location is chosen for 3096 * simplicity. 3097 */ 3098 vmx_msr_guest_enter_tsc_aux(vmx, vcpu); 3099 3100 vmx_dr_enter_guest(vmxctx); 3101 3102 /* 3103 * Mark the EPT as active on this host CPU and invalidate 3104 * EPTP-tagged TLB entries if required. 
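		 *
		 * vmx_pmap_activate() compares the pmap's 'pm_eptgen'
		 * generation against the copy cached in vmx->eptgen[curcpu]
		 * and issues a single-context INVEPT only when they differ,
		 * so an unchanged EPT does not pay for an invalidation on
		 * every VM entry.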
3105 */ 3106 vmx_pmap_activate(vmx, pmap); 3107 3108 vmx_run_trace(vmx, vcpu); 3109 rc = vmx_enter_guest(vmxctx, vmx, launched); 3110 3111 vmx_pmap_deactivate(vmx, pmap); 3112 vmx_dr_leave_guest(vmxctx); 3113 vmx_msr_guest_exit_tsc_aux(vmx, vcpu); 3114 3115 bare_lgdt(&gdtr); 3116 lidt(&idtr); 3117 lldt(ldt_sel); 3118 3119 /* Collect some information for VM exit processing */ 3120 vmexit->rip = rip = vmcs_guest_rip(); 3121 vmexit->inst_length = vmexit_instruction_length(); 3122 vmexit->u.vmx.exit_reason = exit_reason = vmcs_exit_reason(); 3123 vmexit->u.vmx.exit_qualification = vmcs_exit_qualification(); 3124 3125 /* Update 'nextrip' */ 3126 vmx->state[vcpu].nextrip = rip; 3127 3128 if (rc == VMX_GUEST_VMEXIT) { 3129 vmx_exit_handle_nmi(vmx, vcpu, vmexit); 3130 enable_intr(); 3131 handled = vmx_exit_process(vmx, vcpu, vmexit); 3132 } else { 3133 enable_intr(); 3134 vmx_exit_inst_error(vmxctx, rc, vmexit); 3135 } 3136 launched = 1; 3137 vmx_exit_trace(vmx, vcpu, rip, exit_reason, handled); 3138 rip = vmexit->rip; 3139 } while (handled); 3140 3141 /* 3142 * If a VM exit has been handled then the exitcode must be BOGUS 3143 * If a VM exit is not handled then the exitcode must not be BOGUS 3144 */ 3145 if ((handled && vmexit->exitcode != VM_EXITCODE_BOGUS) || 3146 (!handled && vmexit->exitcode == VM_EXITCODE_BOGUS)) { 3147 panic("Mismatch between handled (%d) and exitcode (%d)", 3148 handled, vmexit->exitcode); 3149 } 3150 3151 if (!handled) 3152 vmm_stat_incr(vm, vcpu, VMEXIT_USERSPACE, 1); 3153 3154 VCPU_CTR1(vm, vcpu, "returning from vmx_run: exitcode %d", 3155 vmexit->exitcode); 3156 3157 VMCLEAR(vmcs); 3158 vmx_msr_guest_exit(vmx, vcpu); 3159 3160 return (0); 3161} 3162 3163static void 3164vmx_vmcleanup(void *arg) 3165{ 3166 int i; 3167 struct vmx *vmx = arg; 3168 uint16_t maxcpus; 3169 3170 if (apic_access_virtualization(vmx, 0)) 3171 vm_unmap_mmio(vmx->vm, DEFAULT_APIC_BASE, PAGE_SIZE); 3172 3173 maxcpus = vm_get_maxcpus(vmx->vm); 3174 for (i = 0; i < maxcpus; i++) 3175 vpid_free(vmx->state[i].vpid); 3176 3177 free(vmx, M_VMX); 3178 3179 return; 3180} 3181 3182static register_t * 3183vmxctx_regptr(struct vmxctx *vmxctx, int reg) 3184{ 3185 3186 switch (reg) { 3187 case VM_REG_GUEST_RAX: 3188 return (&vmxctx->guest_rax); 3189 case VM_REG_GUEST_RBX: 3190 return (&vmxctx->guest_rbx); 3191 case VM_REG_GUEST_RCX: 3192 return (&vmxctx->guest_rcx); 3193 case VM_REG_GUEST_RDX: 3194 return (&vmxctx->guest_rdx); 3195 case VM_REG_GUEST_RSI: 3196 return (&vmxctx->guest_rsi); 3197 case VM_REG_GUEST_RDI: 3198 return (&vmxctx->guest_rdi); 3199 case VM_REG_GUEST_RBP: 3200 return (&vmxctx->guest_rbp); 3201 case VM_REG_GUEST_R8: 3202 return (&vmxctx->guest_r8); 3203 case VM_REG_GUEST_R9: 3204 return (&vmxctx->guest_r9); 3205 case VM_REG_GUEST_R10: 3206 return (&vmxctx->guest_r10); 3207 case VM_REG_GUEST_R11: 3208 return (&vmxctx->guest_r11); 3209 case VM_REG_GUEST_R12: 3210 return (&vmxctx->guest_r12); 3211 case VM_REG_GUEST_R13: 3212 return (&vmxctx->guest_r13); 3213 case VM_REG_GUEST_R14: 3214 return (&vmxctx->guest_r14); 3215 case VM_REG_GUEST_R15: 3216 return (&vmxctx->guest_r15); 3217 case VM_REG_GUEST_CR2: 3218 return (&vmxctx->guest_cr2); 3219 case VM_REG_GUEST_DR0: 3220 return (&vmxctx->guest_dr0); 3221 case VM_REG_GUEST_DR1: 3222 return (&vmxctx->guest_dr1); 3223 case VM_REG_GUEST_DR2: 3224 return (&vmxctx->guest_dr2); 3225 case VM_REG_GUEST_DR3: 3226 return (&vmxctx->guest_dr3); 3227 case VM_REG_GUEST_DR6: 3228 return (&vmxctx->guest_dr6); 3229 default: 3230 break; 3231 } 3232 return (NULL); 
3233} 3234 3235static int 3236vmxctx_getreg(struct vmxctx *vmxctx, int reg, uint64_t *retval) 3237{ 3238 register_t *regp; 3239 3240 if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) { 3241 *retval = *regp; 3242 return (0); 3243 } else 3244 return (EINVAL); 3245} 3246 3247static int 3248vmxctx_setreg(struct vmxctx *vmxctx, int reg, uint64_t val) 3249{ 3250 register_t *regp; 3251 3252 if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) { 3253 *regp = val; 3254 return (0); 3255 } else 3256 return (EINVAL); 3257} 3258 3259static int 3260vmx_get_intr_shadow(struct vmx *vmx, int vcpu, int running, uint64_t *retval) 3261{ 3262 uint64_t gi; 3263 int error; 3264 3265 error = vmcs_getreg(&vmx->vmcs[vcpu], running, 3266 VMCS_IDENT(VMCS_GUEST_INTERRUPTIBILITY), &gi); 3267 *retval = (gi & HWINTR_BLOCKING) ? 1 : 0; 3268 return (error); 3269} 3270 3271static int 3272vmx_modify_intr_shadow(struct vmx *vmx, int vcpu, int running, uint64_t val) 3273{ 3274 struct vmcs *vmcs; 3275 uint64_t gi; 3276 int error, ident; 3277 3278 /* 3279 * Forcing the vcpu into an interrupt shadow is not supported. 3280 */ 3281 if (val) { 3282 error = EINVAL; 3283 goto done; 3284 } 3285 3286 vmcs = &vmx->vmcs[vcpu]; 3287 ident = VMCS_IDENT(VMCS_GUEST_INTERRUPTIBILITY); 3288 error = vmcs_getreg(vmcs, running, ident, &gi); 3289 if (error == 0) { 3290 gi &= ~HWINTR_BLOCKING; 3291 error = vmcs_setreg(vmcs, running, ident, gi); 3292 } 3293done: 3294 VCPU_CTR2(vmx->vm, vcpu, "Setting intr_shadow to %#lx %s", val, 3295 error ? "failed" : "succeeded"); 3296 return (error); 3297} 3298 3299static int 3300vmx_shadow_reg(int reg) 3301{ 3302 int shreg; 3303 3304 shreg = -1; 3305 3306 switch (reg) { 3307 case VM_REG_GUEST_CR0: 3308 shreg = VMCS_CR0_SHADOW; 3309 break; 3310 case VM_REG_GUEST_CR4: 3311 shreg = VMCS_CR4_SHADOW; 3312 break; 3313 default: 3314 break; 3315 } 3316 3317 return (shreg); 3318} 3319 3320static int 3321vmx_getreg(void *arg, int vcpu, int reg, uint64_t *retval) 3322{ 3323 int running, hostcpu; 3324 struct vmx *vmx = arg; 3325 3326 running = vcpu_is_running(vmx->vm, vcpu, &hostcpu); 3327 if (running && hostcpu != curcpu) 3328 panic("vmx_getreg: %s%d is running", vm_name(vmx->vm), vcpu); 3329 3330 if (reg == VM_REG_GUEST_INTR_SHADOW) 3331 return (vmx_get_intr_shadow(vmx, vcpu, running, retval)); 3332 3333 if (vmxctx_getreg(&vmx->ctx[vcpu], reg, retval) == 0) 3334 return (0); 3335 3336 return (vmcs_getreg(&vmx->vmcs[vcpu], running, reg, retval)); 3337} 3338 3339static int 3340vmx_setreg(void *arg, int vcpu, int reg, uint64_t val) 3341{ 3342 int error, hostcpu, running, shadow; 3343 uint64_t ctls; 3344 pmap_t pmap; 3345 struct vmx *vmx = arg; 3346 3347 running = vcpu_is_running(vmx->vm, vcpu, &hostcpu); 3348 if (running && hostcpu != curcpu) 3349 panic("vmx_setreg: %s%d is running", vm_name(vmx->vm), vcpu); 3350 3351 if (reg == VM_REG_GUEST_INTR_SHADOW) 3352 return (vmx_modify_intr_shadow(vmx, vcpu, running, val)); 3353 3354 if (vmxctx_setreg(&vmx->ctx[vcpu], reg, val) == 0) 3355 return (0); 3356 3357 /* Do not permit user write access to VMCS fields by offset. */ 3358 if (reg < 0) 3359 return (EINVAL); 3360 3361 error = vmcs_setreg(&vmx->vmcs[vcpu], running, reg, val); 3362 3363 if (error == 0) { 3364 /* 3365 * If the "load EFER" VM-entry control is 1 then the 3366 * value of EFER.LMA must be identical to "IA-32e mode guest" 3367 * bit in the VM-entry control. 
3368 */ 3369 if ((entry_ctls & VM_ENTRY_LOAD_EFER) != 0 && 3370 (reg == VM_REG_GUEST_EFER)) { 3371 vmcs_getreg(&vmx->vmcs[vcpu], running, 3372 VMCS_IDENT(VMCS_ENTRY_CTLS), &ctls); 3373 if (val & EFER_LMA) 3374 ctls |= VM_ENTRY_GUEST_LMA; 3375 else 3376 ctls &= ~VM_ENTRY_GUEST_LMA; 3377 vmcs_setreg(&vmx->vmcs[vcpu], running, 3378 VMCS_IDENT(VMCS_ENTRY_CTLS), ctls); 3379 } 3380 3381 shadow = vmx_shadow_reg(reg); 3382 if (shadow > 0) { 3383 /* 3384 * Store the unmodified value in the shadow 3385 */ 3386 error = vmcs_setreg(&vmx->vmcs[vcpu], running, 3387 VMCS_IDENT(shadow), val); 3388 } 3389 3390 if (reg == VM_REG_GUEST_CR3) { 3391 /* 3392 * Invalidate the guest vcpu's TLB mappings to emulate 3393 * the behavior of updating %cr3. 3394 * 3395 * XXX the processor retains global mappings when %cr3 3396 * is updated but vmx_invvpid() does not. 3397 */ 3398 pmap = vmx->ctx[vcpu].pmap; 3399 vmx_invvpid(vmx, vcpu, pmap, running); 3400 } 3401 } 3402 3403 return (error); 3404} 3405 3406static int 3407vmx_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc) 3408{ 3409 int hostcpu, running; 3410 struct vmx *vmx = arg; 3411 3412 running = vcpu_is_running(vmx->vm, vcpu, &hostcpu); 3413 if (running && hostcpu != curcpu) 3414 panic("vmx_getdesc: %s%d is running", vm_name(vmx->vm), vcpu); 3415 3416 return (vmcs_getdesc(&vmx->vmcs[vcpu], running, reg, desc)); 3417} 3418 3419static int 3420vmx_setdesc(void *arg, int vcpu, int reg, struct seg_desc *desc) 3421{ 3422 int hostcpu, running; 3423 struct vmx *vmx = arg; 3424 3425 running = vcpu_is_running(vmx->vm, vcpu, &hostcpu); 3426 if (running && hostcpu != curcpu) 3427 panic("vmx_setdesc: %s%d is running", vm_name(vmx->vm), vcpu); 3428 3429 return (vmcs_setdesc(&vmx->vmcs[vcpu], running, reg, desc)); 3430} 3431 3432static int 3433vmx_getcap(void *arg, int vcpu, int type, int *retval) 3434{ 3435 struct vmx *vmx = arg; 3436 int vcap; 3437 int ret; 3438 3439 ret = ENOENT; 3440 3441 vcap = vmx->cap[vcpu].set; 3442 3443 switch (type) { 3444 case VM_CAP_HALT_EXIT: 3445 if (cap_halt_exit) 3446 ret = 0; 3447 break; 3448 case VM_CAP_PAUSE_EXIT: 3449 if (cap_pause_exit) 3450 ret = 0; 3451 break; 3452 case VM_CAP_MTRAP_EXIT: 3453 if (cap_monitor_trap) 3454 ret = 0; 3455 break; 3456 case VM_CAP_RDPID: 3457 if (cap_rdpid) 3458 ret = 0; 3459 break; 3460 case VM_CAP_RDTSCP: 3461 if (cap_rdtscp) 3462 ret = 0; 3463 break; 3464 case VM_CAP_UNRESTRICTED_GUEST: 3465 if (cap_unrestricted_guest) 3466 ret = 0; 3467 break; 3468 case VM_CAP_ENABLE_INVPCID: 3469 if (cap_invpcid) 3470 ret = 0; 3471 break; 3472 case VM_CAP_BPT_EXIT: 3473 ret = 0; 3474 break; 3475 default: 3476 break; 3477 } 3478 3479 if (ret == 0) 3480 *retval = (vcap & (1 << type)) ? 
1 : 0; 3481 3482 return (ret); 3483} 3484 3485static int 3486vmx_setcap(void *arg, int vcpu, int type, int val) 3487{ 3488 struct vmx *vmx = arg; 3489 struct vmcs *vmcs = &vmx->vmcs[vcpu]; 3490 uint32_t baseval; 3491 uint32_t *pptr; 3492 int error; 3493 int flag; 3494 int reg; 3495 int retval; 3496 3497 retval = ENOENT; 3498 pptr = NULL; 3499 3500 switch (type) { 3501 case VM_CAP_HALT_EXIT: 3502 if (cap_halt_exit) { 3503 retval = 0; 3504 pptr = &vmx->cap[vcpu].proc_ctls; 3505 baseval = *pptr; 3506 flag = PROCBASED_HLT_EXITING; 3507 reg = VMCS_PRI_PROC_BASED_CTLS; 3508 } 3509 break; 3510 case VM_CAP_MTRAP_EXIT: 3511 if (cap_monitor_trap) { 3512 retval = 0; 3513 pptr = &vmx->cap[vcpu].proc_ctls; 3514 baseval = *pptr; 3515 flag = PROCBASED_MTF; 3516 reg = VMCS_PRI_PROC_BASED_CTLS; 3517 } 3518 break; 3519 case VM_CAP_PAUSE_EXIT: 3520 if (cap_pause_exit) { 3521 retval = 0; 3522 pptr = &vmx->cap[vcpu].proc_ctls; 3523 baseval = *pptr; 3524 flag = PROCBASED_PAUSE_EXITING; 3525 reg = VMCS_PRI_PROC_BASED_CTLS; 3526 } 3527 break; 3528 case VM_CAP_RDPID: 3529 case VM_CAP_RDTSCP: 3530 if (cap_rdpid || cap_rdtscp) 3531 /* 3532 * Choose not to support enabling/disabling 3533 * RDPID/RDTSCP via libvmmapi since, as per the 3534 * discussion in vmx_init(), RDPID/RDTSCP are 3535 * either always enabled or always disabled. 3536 */ 3537 error = EOPNOTSUPP; 3538 break; 3539 case VM_CAP_UNRESTRICTED_GUEST: 3540 if (cap_unrestricted_guest) { 3541 retval = 0; 3542 pptr = &vmx->cap[vcpu].proc_ctls2; 3543 baseval = *pptr; 3544 flag = PROCBASED2_UNRESTRICTED_GUEST; 3545 reg = VMCS_SEC_PROC_BASED_CTLS; 3546 } 3547 break; 3548 case VM_CAP_ENABLE_INVPCID: 3549 if (cap_invpcid) { 3550 retval = 0; 3551 pptr = &vmx->cap[vcpu].proc_ctls2; 3552 baseval = *pptr; 3553 flag = PROCBASED2_ENABLE_INVPCID; 3554 reg = VMCS_SEC_PROC_BASED_CTLS; 3555 } 3556 break; 3557 case VM_CAP_BPT_EXIT: 3558 retval = 0; 3559 3560 /* Don't change the bitmap if we are tracing all exceptions. */ 3561 if (vmx->cap[vcpu].exc_bitmap != 0xffffffff) { 3562 pptr = &vmx->cap[vcpu].exc_bitmap; 3563 baseval = *pptr; 3564 flag = (1 << IDT_BP); 3565 reg = VMCS_EXCEPTION_BITMAP; 3566 } 3567 break; 3568 default: 3569 break; 3570 } 3571 3572 if (retval) 3573 return (retval); 3574 3575 if (pptr != NULL) { 3576 if (val) { 3577 baseval |= flag; 3578 } else { 3579 baseval &= ~flag; 3580 } 3581 VMPTRLD(vmcs); 3582 error = vmwrite(reg, baseval); 3583 VMCLEAR(vmcs); 3584 3585 if (error) 3586 return (error); 3587 3588 /* 3589 * Update optional stored flags, and record 3590 * setting 3591 */ 3592 *pptr = baseval; 3593 } 3594 3595 if (val) { 3596 vmx->cap[vcpu].set |= (1 << type); 3597 } else { 3598 vmx->cap[vcpu].set &= ~(1 << type); 3599 } 3600 3601 return (0); 3602} 3603 3604struct vlapic_vtx { 3605 struct vlapic vlapic; 3606 struct pir_desc *pir_desc; 3607 struct vmx *vmx; 3608 u_int pending_prio; 3609}; 3610 3611#define VPR_PRIO_BIT(vpr) (1 << ((vpr) >> 4)) 3612 3613#define VMX_CTR_PIR(vm, vcpuid, pir_desc, notify, vector, level, msg) \ 3614do { \ 3615 VCPU_CTR2(vm, vcpuid, msg " assert %s-triggered vector %d", \ 3616 level ? "level" : "edge", vector); \ 3617 VCPU_CTR1(vm, vcpuid, msg " pir0 0x%016lx", pir_desc->pir[0]); \ 3618 VCPU_CTR1(vm, vcpuid, msg " pir1 0x%016lx", pir_desc->pir[1]); \ 3619 VCPU_CTR1(vm, vcpuid, msg " pir2 0x%016lx", pir_desc->pir[2]); \ 3620 VCPU_CTR1(vm, vcpuid, msg " pir3 0x%016lx", pir_desc->pir[3]); \ 3621 VCPU_CTR1(vm, vcpuid, msg " notify: %s", notify ? 
"yes" : "no");\ 3622} while (0) 3623 3624/* 3625 * vlapic->ops handlers that utilize the APICv hardware assist described in 3626 * Chapter 29 of the Intel SDM. 3627 */ 3628static int 3629vmx_set_intr_ready(struct vlapic *vlapic, int vector, bool level) 3630{ 3631 struct vlapic_vtx *vlapic_vtx; 3632 struct pir_desc *pir_desc; 3633 uint64_t mask; 3634 int idx, notify = 0; 3635 3636 vlapic_vtx = (struct vlapic_vtx *)vlapic; 3637 pir_desc = vlapic_vtx->pir_desc; 3638 3639 /* 3640 * Keep track of interrupt requests in the PIR descriptor. This is 3641 * because the virtual APIC page pointed to by the VMCS cannot be 3642 * modified if the vcpu is running. 3643 */ 3644 idx = vector / 64; 3645 mask = 1UL << (vector % 64); 3646 atomic_set_long(&pir_desc->pir[idx], mask); 3647 3648 /* 3649 * A notification is required whenever the 'pending' bit makes a 3650 * transition from 0->1. 3651 * 3652 * Even if the 'pending' bit is already asserted, notification about 3653 * the incoming interrupt may still be necessary. For example, if a 3654 * vCPU is HLTed with a high PPR, a low priority interrupt would cause 3655 * the 0->1 'pending' transition with a notification, but the vCPU 3656 * would ignore the interrupt for the time being. The same vCPU would 3657 * need to then be notified if a high-priority interrupt arrived which 3658 * satisfied the PPR. 3659 * 3660 * The priorities of interrupts injected while 'pending' is asserted 3661 * are tracked in a custom bitfield 'pending_prio'. Should the 3662 * to-be-injected interrupt exceed the priorities already present, the 3663 * notification is sent. The priorities recorded in 'pending_prio' are 3664 * cleared whenever the 'pending' bit makes another 0->1 transition. 3665 */ 3666 if (atomic_cmpset_long(&pir_desc->pending, 0, 1) != 0) { 3667 notify = 1; 3668 vlapic_vtx->pending_prio = 0; 3669 } else { 3670 const u_int old_prio = vlapic_vtx->pending_prio; 3671 const u_int prio_bit = VPR_PRIO_BIT(vector & APIC_TPR_INT); 3672 3673 if ((old_prio & prio_bit) == 0 && prio_bit > old_prio) { 3674 atomic_set_int(&vlapic_vtx->pending_prio, prio_bit); 3675 notify = 1; 3676 } 3677 } 3678 3679 VMX_CTR_PIR(vlapic->vm, vlapic->vcpuid, pir_desc, notify, vector, 3680 level, "vmx_set_intr_ready"); 3681 return (notify); 3682} 3683 3684static int 3685vmx_pending_intr(struct vlapic *vlapic, int *vecptr) 3686{ 3687 struct vlapic_vtx *vlapic_vtx; 3688 struct pir_desc *pir_desc; 3689 struct LAPIC *lapic; 3690 uint64_t pending, pirval; 3691 uint32_t ppr, vpr; 3692 int i; 3693 3694 /* 3695 * This function is only expected to be called from the 'HLT' exit 3696 * handler which does not care about the vector that is pending. 3697 */ 3698 KASSERT(vecptr == NULL, ("vmx_pending_intr: vecptr must be NULL")); 3699 3700 vlapic_vtx = (struct vlapic_vtx *)vlapic; 3701 pir_desc = vlapic_vtx->pir_desc; 3702 3703 pending = atomic_load_acq_long(&pir_desc->pending); 3704 if (!pending) { 3705 /* 3706 * While a virtual interrupt may have already been 3707 * processed the actual delivery maybe pending the 3708 * interruptibility of the guest. Recognize a pending 3709 * interrupt by reevaluating virtual interrupts 3710 * following Section 29.2.1 in the Intel SDM Volume 3. 
3711 */ 3712 struct vm_exit *vmexit; 3713 uint8_t rvi, ppr; 3714 3715 vmexit = vm_exitinfo(vlapic->vm, vlapic->vcpuid); 3716 KASSERT(vmexit->exitcode == VM_EXITCODE_HLT, 3717 ("vmx_pending_intr: exitcode not 'HLT'")); 3718 rvi = vmexit->u.hlt.intr_status & APIC_TPR_INT; 3719 lapic = vlapic->apic_page; 3720 ppr = lapic->ppr & APIC_TPR_INT; 3721 if (rvi > ppr) { 3722 return (1); 3723 } 3724 3725 return (0); 3726 } 3727 3728 /* 3729 * If there is an interrupt pending then it will be recognized only 3730 * if its priority is greater than the processor priority. 3731 * 3732 * Special case: if the processor priority is zero then any pending 3733 * interrupt will be recognized. 3734 */ 3735 lapic = vlapic->apic_page; 3736 ppr = lapic->ppr & APIC_TPR_INT; 3737 if (ppr == 0) 3738 return (1); 3739 3740 VCPU_CTR1(vlapic->vm, vlapic->vcpuid, "HLT with non-zero PPR %d", 3741 lapic->ppr); 3742 3743 vpr = 0; 3744 for (i = 3; i >= 0; i--) { 3745 pirval = pir_desc->pir[i]; 3746 if (pirval != 0) { 3747 vpr = (i * 64 + flsl(pirval) - 1) & APIC_TPR_INT; 3748 break; 3749 } 3750 } 3751 3752 /* 3753 * If the highest-priority pending interrupt falls short of the 3754 * processor priority of this vCPU, ensure that 'pending_prio' does not 3755 * have any stale bits which would preclude a higher-priority interrupt 3756 * from incurring a notification later. 3757 */ 3758 if (vpr <= ppr) { 3759 const u_int prio_bit = VPR_PRIO_BIT(vpr); 3760 const u_int old = vlapic_vtx->pending_prio; 3761 3762 if (old > prio_bit && (old & prio_bit) == 0) { 3763 vlapic_vtx->pending_prio = prio_bit; 3764 } 3765 return (0); 3766 } 3767 return (1); 3768} 3769 3770static void 3771vmx_intr_accepted(struct vlapic *vlapic, int vector) 3772{ 3773 3774 panic("vmx_intr_accepted: not expected to be called"); 3775} 3776 3777static void 3778vmx_set_tmr(struct vlapic *vlapic, int vector, bool level) 3779{ 3780 struct vlapic_vtx *vlapic_vtx; 3781 struct vmx *vmx; 3782 struct vmcs *vmcs; 3783 uint64_t mask, val; 3784 3785 KASSERT(vector >= 0 && vector <= 255, ("invalid vector %d", vector)); 3786 KASSERT(!vcpu_is_running(vlapic->vm, vlapic->vcpuid, NULL), 3787 ("vmx_set_tmr: vcpu cannot be running")); 3788 3789 vlapic_vtx = (struct vlapic_vtx *)vlapic; 3790 vmx = vlapic_vtx->vmx; 3791 vmcs = &vmx->vmcs[vlapic->vcpuid]; 3792 mask = 1UL << (vector % 64); 3793 3794 VMPTRLD(vmcs); 3795 val = vmcs_read(VMCS_EOI_EXIT(vector)); 3796 if (level) 3797 val |= mask; 3798 else 3799 val &= ~mask; 3800 vmcs_write(VMCS_EOI_EXIT(vector), val); 3801 VMCLEAR(vmcs); 3802} 3803 3804static void 3805vmx_enable_x2apic_mode_ts(struct vlapic *vlapic) 3806{ 3807 struct vmx *vmx; 3808 struct vmcs *vmcs; 3809 uint32_t proc_ctls; 3810 int vcpuid; 3811 3812 vcpuid = vlapic->vcpuid; 3813 vmx = ((struct vlapic_vtx *)vlapic)->vmx; 3814 vmcs = &vmx->vmcs[vcpuid]; 3815 3816 proc_ctls = vmx->cap[vcpuid].proc_ctls; 3817 proc_ctls &= ~PROCBASED_USE_TPR_SHADOW; 3818 proc_ctls |= PROCBASED_CR8_LOAD_EXITING; 3819 proc_ctls |= PROCBASED_CR8_STORE_EXITING; 3820 vmx->cap[vcpuid].proc_ctls = proc_ctls; 3821 3822 VMPTRLD(vmcs); 3823 vmcs_write(VMCS_PRI_PROC_BASED_CTLS, proc_ctls); 3824 VMCLEAR(vmcs); 3825} 3826 3827static void 3828vmx_enable_x2apic_mode_vid(struct vlapic *vlapic) 3829{ 3830 struct vmx *vmx; 3831 struct vmcs *vmcs; 3832 uint32_t proc_ctls2; 3833 int vcpuid, error; 3834 3835 vcpuid = vlapic->vcpuid; 3836 vmx = ((struct vlapic_vtx *)vlapic)->vmx; 3837 vmcs = &vmx->vmcs[vcpuid]; 3838 3839 proc_ctls2 = vmx->cap[vcpuid].proc_ctls2; 3840 KASSERT((proc_ctls2 & 
PROCBASED2_VIRTUALIZE_APIC_ACCESSES) != 0, 3841 ("%s: invalid proc_ctls2 %#x", __func__, proc_ctls2)); 3842 3843 proc_ctls2 &= ~PROCBASED2_VIRTUALIZE_APIC_ACCESSES; 3844 proc_ctls2 |= PROCBASED2_VIRTUALIZE_X2APIC_MODE; 3845 vmx->cap[vcpuid].proc_ctls2 = proc_ctls2; 3846 3847 VMPTRLD(vmcs); 3848 vmcs_write(VMCS_SEC_PROC_BASED_CTLS, proc_ctls2); 3849 VMCLEAR(vmcs); 3850 3851 if (vlapic->vcpuid == 0) { 3852 /* 3853 * The nested page table mappings are shared by all vcpus 3854 * so unmap the APIC access page just once. 3855 */ 3856 error = vm_unmap_mmio(vmx->vm, DEFAULT_APIC_BASE, PAGE_SIZE); 3857 KASSERT(error == 0, ("%s: vm_unmap_mmio error %d", 3858 __func__, error)); 3859 3860 /* 3861 * The MSR bitmap is shared by all vcpus so modify it only 3862 * once in the context of vcpu 0. 3863 */ 3864 error = vmx_allow_x2apic_msrs(vmx); 3865 KASSERT(error == 0, ("%s: vmx_allow_x2apic_msrs error %d", 3866 __func__, error)); 3867 } 3868} 3869 3870static void 3871vmx_post_intr(struct vlapic *vlapic, int hostcpu) 3872{ 3873 3874 ipi_cpu(hostcpu, pirvec); 3875} 3876 3877/* 3878 * Transfer the pending interrupts in the PIR descriptor to the IRR 3879 * in the virtual APIC page. 3880 */ 3881static void 3882vmx_inject_pir(struct vlapic *vlapic) 3883{ 3884 struct vlapic_vtx *vlapic_vtx; 3885 struct pir_desc *pir_desc; 3886 struct LAPIC *lapic; 3887 uint64_t val, pirval; 3888 int rvi, pirbase = -1; 3889 uint16_t intr_status_old, intr_status_new; 3890 3891 vlapic_vtx = (struct vlapic_vtx *)vlapic; 3892 pir_desc = vlapic_vtx->pir_desc; 3893 if (atomic_cmpset_long(&pir_desc->pending, 1, 0) == 0) { 3894 VCPU_CTR0(vlapic->vm, vlapic->vcpuid, "vmx_inject_pir: " 3895 "no posted interrupt pending"); 3896 return; 3897 } 3898 3899 pirval = 0; 3900 pirbase = -1; 3901 lapic = vlapic->apic_page; 3902 3903 val = atomic_readandclear_long(&pir_desc->pir[0]); 3904 if (val != 0) { 3905 lapic->irr0 |= val; 3906 lapic->irr1 |= val >> 32; 3907 pirbase = 0; 3908 pirval = val; 3909 } 3910 3911 val = atomic_readandclear_long(&pir_desc->pir[1]); 3912 if (val != 0) { 3913 lapic->irr2 |= val; 3914 lapic->irr3 |= val >> 32; 3915 pirbase = 64; 3916 pirval = val; 3917 } 3918 3919 val = atomic_readandclear_long(&pir_desc->pir[2]); 3920 if (val != 0) { 3921 lapic->irr4 |= val; 3922 lapic->irr5 |= val >> 32; 3923 pirbase = 128; 3924 pirval = val; 3925 } 3926 3927 val = atomic_readandclear_long(&pir_desc->pir[3]); 3928 if (val != 0) { 3929 lapic->irr6 |= val; 3930 lapic->irr7 |= val >> 32; 3931 pirbase = 192; 3932 pirval = val; 3933 } 3934 3935 VLAPIC_CTR_IRR(vlapic, "vmx_inject_pir"); 3936 3937 /* 3938 * Update RVI so the processor can evaluate pending virtual 3939 * interrupts on VM-entry. 3940 * 3941 * It is possible for pirval to be 0 here, even though the 3942 * pending bit has been set. The scenario is: 3943 * CPU-Y is sending a posted interrupt to CPU-X, which 3944 * is running a guest and processing posted interrupts in h/w. 3945 * CPU-X will eventually exit and the state seen in s/w is 3946 * the pending bit set, but no PIR bits set. 
3947 * 3948 * CPU-X CPU-Y 3949 * (vm running) (host running) 3950 * rx posted interrupt 3951 * CLEAR pending bit 3952 * SET PIR bit 3953 * READ/CLEAR PIR bits 3954 * SET pending bit 3955 * (vm exit) 3956 * pending bit set, PIR 0 3957 */ 3958 if (pirval != 0) { 3959 rvi = pirbase + flsl(pirval) - 1; 3960 intr_status_old = vmcs_read(VMCS_GUEST_INTR_STATUS); 3961 intr_status_new = (intr_status_old & 0xFF00) | rvi; 3962 if (intr_status_new > intr_status_old) { 3963 vmcs_write(VMCS_GUEST_INTR_STATUS, intr_status_new); 3964 VCPU_CTR2(vlapic->vm, vlapic->vcpuid, "vmx_inject_pir: " 3965 "guest_intr_status changed from 0x%04x to 0x%04x", 3966 intr_status_old, intr_status_new); 3967 } 3968 } 3969} 3970 3971static struct vlapic * 3972vmx_vlapic_init(void *arg, int vcpuid) 3973{ 3974 struct vmx *vmx; 3975 struct vlapic *vlapic; 3976 struct vlapic_vtx *vlapic_vtx; 3977 3978 vmx = arg; 3979 3980 vlapic = malloc(sizeof(struct vlapic_vtx), M_VLAPIC, M_WAITOK | M_ZERO); 3981 vlapic->vm = vmx->vm; 3982 vlapic->vcpuid = vcpuid; 3983 vlapic->apic_page = (struct LAPIC *)&vmx->apic_page[vcpuid]; 3984 3985 vlapic_vtx = (struct vlapic_vtx *)vlapic; 3986 vlapic_vtx->pir_desc = &vmx->pir_desc[vcpuid]; 3987 vlapic_vtx->vmx = vmx; 3988 3989 if (tpr_shadowing) { 3990 vlapic->ops.enable_x2apic_mode = vmx_enable_x2apic_mode_ts; 3991 } 3992 3993 if (virtual_interrupt_delivery) { 3994 vlapic->ops.set_intr_ready = vmx_set_intr_ready; 3995 vlapic->ops.pending_intr = vmx_pending_intr; 3996 vlapic->ops.intr_accepted = vmx_intr_accepted; 3997 vlapic->ops.set_tmr = vmx_set_tmr; 3998 vlapic->ops.enable_x2apic_mode = vmx_enable_x2apic_mode_vid; 3999 } 4000 4001 if (posted_interrupts) 4002 vlapic->ops.post_intr = vmx_post_intr; 4003 4004 vlapic_init(vlapic); 4005 4006 return (vlapic); 4007} 4008 4009static void 4010vmx_vlapic_cleanup(void *arg, struct vlapic *vlapic) 4011{ 4012 4013 vlapic_cleanup(vlapic); 4014 free(vlapic, M_VLAPIC); 4015} 4016 4017struct vmm_ops vmm_ops_intel = { 4018 .init = vmx_init, 4019 .cleanup = vmx_cleanup, 4020 .resume = vmx_restore, 4021 .vminit = vmx_vminit, 4022 .vmrun = vmx_run, 4023 .vmcleanup = vmx_vmcleanup, 4024 .vmgetreg = vmx_getreg, 4025 .vmsetreg = vmx_setreg, 4026 .vmgetdesc = vmx_getdesc, 4027 .vmsetdesc = vmx_setdesc, 4028 .vmgetcap = vmx_getcap, 4029 .vmsetcap = vmx_setcap, 4030 .vmspace_alloc = ept_vmspace_alloc, 4031 .vmspace_free = ept_vmspace_free, 4032 .vlapic_init = vmx_vlapic_init, 4033 .vlapic_cleanup = vmx_vlapic_cleanup, 4034}; 4035
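/*
 * Worked example of the %cr-access exit decoding used above, assuming the
 * bit layout implied by vmx_emulate_cr{0,4,8}_access(): bits 3:0 of the
 * exit qualification select the control register, bits 5:4 the access
 * type (0 being "mov to cr") and bits 11:8 the general purpose register
 * operand.
 *
 *	mov %rbx, %cr0	-> qual 0x0000000000000300
 *		(qual & 0xf) == 0x0 selects vmx_emulate_cr0_access(),
 *		(qual & 0xf0) == 0x00 means "mov to %cr0", and
 *		(qual >> 8) & 0xf == 3 names %rbx in vmx_get_guest_reg().
 *
 *	mov %cr8, %rax	-> qual 0x0000000000000018
 *		(qual & 0xf) == 0x8 selects vmx_emulate_cr8_access(), and
 *		(qual & 0x10) != 0 means the guest reads %cr8, so the
 *		current vlapic CR8 value is copied into guest %rax (ident 0).
 */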