1/** 2 * \file 3 * \brief Contains VMKit kernel interface for version using VMX extensions. 4 */ 5 6/* 7 * Copyright (c) 2014, University of Washington. 8 * All rights reserved. 9 * 10 * This file is distributed under the terms in the attached LICENSE file. 11 * If you do not find this file, copies can be found by writing to: 12 * ETH Zurich D-INFK, CAB F.78, Universitaetstrasse 6, CH-8092 Zurich. 13 * Attn: Systems Group. 14 */ 15 16#include <string.h> 17#include <kernel.h> 18#include <paging_kernel_arch.h> 19#include <vmx_vmkit.h> 20#include <vmx_checks.h> 21#include <x86.h> 22#include <dispatch.h> 23#include <exec.h> 24#include <irq.h> 25#include <barrelfish_kpi/vmkit.h> 26#include <barrelfish_kpi/syscalls.h> 27 28#include <dev/ia32_dev.h> 29 30#define ARRAKIS_EPT 31// Execution, entry, and exit controls that we want to use 32// for each VM 33#if defined(CONFIG_ARRAKISMON) && !defined(ARRAKIS_EPT) 34// Arrakis w/o EPT 35#define GUEST_PIN_BASE_CTLS_ENABLE \ 36 (PIN_CTLS_EXT_INTR | PIN_CTLS_NMI | PIN_CTLS_VIRT_NMI) 37 38#define GUEST_PIN_BASE_CTLS_DISABLE \ 39 (0) 40 41#define GUEST_PP_CTLS_ENABLE \ 42 (PP_CLTS_MSRBMP | PP_CLTS_IOBMP | PP_CLTS_HLT) 43 44#define GUEST_PP_CTLS_DISABLE \ 45 (0) 46 47#define GUEST_SP_CTLS_ENABLE \ 48 (0) 49 50#define GUEST_SP_CTLS_DISABLE \ 51 (0) 52 53#define GUEST_EXIT_CTLS_ENABLE \ 54 (EXIT_CLTS_HOST_SIZE | EXIT_CLTS_SAVE_EFER | EXIT_CLTS_LOAD_EFER) 55 56#define GUEST_EXIT_CTLS_DISABLE \ 57 (0) 58 59#define GUEST_ENTRY_CTLS_ENABLE \ 60 (ENTRY_CLTS_LOAD_EFER | ENTRY_CLTS_LOAD_DBG | ENTRY_CLTS_IA32E_MODE) 61 62#define GUEST_ENTRY_CTLS_DISABLE \ 63 (0) 64#elif defined(CONFIG_ARRAKISMON) 65#define GUEST_PIN_BASE_CTLS_ENABLE \ 66 (PIN_CTLS_EXT_INTR | PIN_CTLS_NMI | PIN_CTLS_VIRT_NMI) 67 68#define GUEST_PIN_BASE_CTLS_DISABLE \ 69 (0) 70 71#define GUEST_PP_CTLS_ENABLE \ 72 (PP_CLTS_MSRBMP | PP_CLTS_IOBMP | PP_CLTS_HLT | PP_CLTS_SEC_CTLS) 73 74#define GUEST_PP_CTLS_DISABLE \ 75 (0) 76 77#define GUEST_SP_CTLS_ENABLE \ 78 
(SP_CLTS_ENABLE_EPT) 79 80#define GUEST_SP_CTLS_DISABLE \ 81 (0) 82 83#define GUEST_EXIT_CTLS_ENABLE \ 84 (EXIT_CLTS_HOST_SIZE | EXIT_CLTS_SAVE_EFER | EXIT_CLTS_LOAD_EFER) 85 86#define GUEST_EXIT_CTLS_DISABLE \ 87 (0) 88 89#define GUEST_ENTRY_CTLS_ENABLE \ 90 (ENTRY_CLTS_LOAD_EFER | ENTRY_CLTS_LOAD_DBG | ENTRY_CLTS_IA32E_MODE) 91 92#define GUEST_ENTRY_CTLS_DISABLE \ 93 (0) 94#else 95#define GUEST_PIN_BASE_CTLS_ENABLE \ 96 (PIN_CTLS_EXT_INTR | PIN_CTLS_NMI | PIN_CTLS_VIRT_NMI) 97 98#define GUEST_PIN_BASE_CTLS_DISABLE \ 99 (0) 100 101#define GUEST_PP_CTLS_ENABLE \ 102 (PP_CLTS_MSRBMP | PP_CLTS_IOBMP | PP_CLTS_HLT | PP_CLTS_SEC_CTLS) 103 104#define GUEST_PP_CTLS_DISABLE \ 105 (0) 106 107#define GUEST_SP_CTLS_ENABLE \ 108 (SP_CLTS_ENABLE_EPT | SP_CLTS_UNRSTD_GUEST) 109 110#define GUEST_SP_CTLS_DISABLE \ 111 (0) 112 113#define GUEST_EXIT_CTLS_ENABLE \ 114 (EXIT_CLTS_HOST_SIZE | EXIT_CLTS_SAVE_EFER | EXIT_CLTS_LOAD_EFER | \ 115 EXIT_CLTS_SAVE_PAT | EXIT_CLTS_LOAD_PAT) 116 117#define GUEST_EXIT_CTLS_DISABLE \ 118 (0) 119 120#define GUEST_ENTRY_CTLS_ENABLE \ 121 (ENTRY_CLTS_LOAD_EFER) 122 123#define GUEST_ENTRY_CTLS_DISABLE \ 124 (0) 125#endif 126 127extern void *vmx_return_func; 128 129static struct guest_control *ctrl = NULL; 130 131static int launched = 0; 132 133#ifndef CONFIG_ARRAKISMON 134// List of MSRs that are loaded on VM-exit. 135static uint32_t msr_list[VMX_MSR_COUNT] = 136 {MSR_KERNEL_GS_BASE, MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SFMASK}; 137 138// VM-exit MSR-load area that contains host MSR values that are saved prior 139// to VM-entry and loaded on VM exit. 140static struct msr_entry host_msr_area[VMX_MSR_COUNT] 141__attribute__ ((aligned(16))); 142#endif 143 144// VMX controls that are written to the VMCS. In addition to the controls 145// that are requested, these values may have bits that are reserved set. 
146vmx_controls pin_based_ctls = 0, pp_based_ctls = 0, sp_based_ctls = 0, 147 entry_ctls = 0, exit_ctls = 0; 148 149static uint8_t vmxon_region[BASE_PAGE_SIZE] 150__attribute__ ((aligned(BASE_PAGE_SIZE))); 151 152// Returns true if extended page tables (EPT) are enabled. 153static inline int ept_enabled(void) 154{ 155 return ((GUEST_SP_CTLS_ENABLE & SP_CLTS_ENABLE_EPT) != 0); 156} 157 158static inline errval_t instr_err(void) 159{ 160 errval_t err; 161 __asm volatile("jnc vmx_err_check_zf%=\n\t" 162 "mov %[VMfailInvalid], %[err]\n\t" 163 "jmp vmx_err_done%=\n\t" 164 "vmx_err_check_zf%=:\n\t" 165 "jnz vmx_err_succeed%=\n\t" 166 "mov %[VMfailValid], %[err]\n\t" 167 "jmp vmx_err_done%=\n\t" 168 "vmx_err_succeed%=:\n\t" 169 "mov %[VMsucceed], %[err]\n\t" 170 "vmx_err_done%=:\n\t" 171 : [err] "=r" (err) 172 : [VMfailInvalid] "i" (SYS_ERR_VMKIT_VMX_VMFAIL_INVALID), 173 [VMfailValid] "i" (SYS_ERR_VMKIT_VMX_VMFAIL_VALID), 174 [VMsucceed] "i" (SYS_ERR_OK) 175 : "memory"); 176 return err; 177} 178 179// Executes the vmptrld instruction, which makes the VMCS referenced by 180// 'vmcs_base' active and current. 181errval_t vmptrld(lpaddr_t vmcs_base) 182{ 183 __asm volatile("vmptrld %[vmcs_base]\n\t" 184 : 185 : [vmcs_base] "m" (vmcs_base) 186 : "memory"); 187 return instr_err(); 188} 189 190// Returns the physical address base of the current VMCS. 191lpaddr_t vmptrst(void) 192{ 193 lpaddr_t dest_addr; 194 __asm volatile("vmptrst %[dest_addr]\n\t" 195 : 196 : [dest_addr] "m" (dest_addr) 197 : "memory"); 198 return dest_addr; 199} 200 201// Executes the vmclear instruction, which makes the VMCS referenced 202// by 'vmcs_base' clear and inactive. This instruction also ensures 203// that the referenced VMCS data is saved. 
204errval_t vmclear(lpaddr_t vmcs_base) 205{ 206 __asm volatile("vmclear %[vmcs_base]\n\t" 207 : 208 : [vmcs_base] "m" (vmcs_base) 209 : "memory"); 210 return instr_err(); 211} 212 213// Reads a component with a specified encoding from the current VMCS 214// to an address dest_addr using the vmread instruction. 215errval_t vmread(uintptr_t encoding, lvaddr_t *dest_addr) 216{ 217 __asm volatile("vmread %[encoding], %[dest_addr]\n\t" 218 : 219 : [encoding] "r" (encoding), [dest_addr] "m" (*dest_addr) 220 : "memory"); 221 return instr_err(); 222} 223 224// Writes a component with a specifed encoding and value to the current 225// VMCS using the vmwrite instruction. 226errval_t vmwrite(uintptr_t encoding, uintptr_t value) 227{ 228 __asm volatile("vmwrite %[value], %[encoding]\n\t" 229 : 230 : [encoding] "r" (encoding), [value] "r" (value) 231 : "memory"); 232 return instr_err(); 233} 234 235// Using a provided VMXON region, causes the logical processor to enter 236// into root-mode by executing the vmxon instruction. 237errval_t vmxon(lpaddr_t base_addr) 238{ 239 __asm volatile("vmxon %[base_addr]\n\t" 240 : 241 : [base_addr] "m" (base_addr) 242 : "memory"); 243 return instr_err(); 244} 245 246// Exits VMX operation by executing the vmxoff instruction. 247errval_t vmxoff(void) 248{ 249 __asm volatile("vmxoff"); 250 return instr_err(); 251} 252 253// Reads and returns the MSR that reports the allowed settings 254// for ALL of the bits of the controls indicated by 'type.' 
255static uint64_t msr_ctls_true(enum vmx_ctls_t type) 256{ 257 uint64_t true_msr = 0; 258 switch(type) { 259 case VMX_CTLS_PIN_BASED: 260 true_msr = ia32_vmx_true_pinbased_ctls_rd(NULL); 261 break; 262 case VMX_CTLS_PRIMARY_PROCESSOR: 263 true_msr = ia32_vmx_true_ppbased_ctls_rd(NULL); 264 break; 265 case VMX_CTLS_SECONDARY_PROCESSOR: 266 assert(!"No such MSR for secondary processor controls!\n"); 267 break; 268 case VMX_CTLS_EXIT: 269 true_msr = ia32_vmx_true_exit_ctls_rd(NULL); 270 break; 271 case VMX_CTLS_ENTRY: 272 true_msr = ia32_vmx_true_entry_ctls_rd(NULL); 273 break; 274 } 275 return true_msr; 276} 277 278// Reads and returns the MSR that reports the allowed settings 279// for MOST of the bits of the controls indicated by 'type.' 280static uint64_t msr_ctls(enum vmx_ctls_t type) 281{ 282 uint64_t msr = 0; 283 switch(type) { 284 case VMX_CTLS_PIN_BASED: 285 msr = ia32_vmx_pinbased_ctls_rd(NULL); 286 break; 287 case VMX_CTLS_PRIMARY_PROCESSOR: 288 msr = ia32_vmx_ppbased_ctls_rd(NULL); 289 break; 290 case VMX_CTLS_SECONDARY_PROCESSOR: 291 msr = ia32_vmx_spbased_ctls_rd(NULL); 292 break; 293 case VMX_CTLS_EXIT: 294 msr = ia32_vmx_exit_ctls_rd(NULL); 295 break; 296 case VMX_CTLS_ENTRY: 297 msr = ia32_vmx_entry_ctls_rd(NULL); 298 break; 299 } 300 return msr; 301} 302 303// Writes the controls indicated by 'type' to the VMCS using 'mask_1s' 304// and 'mask_0s', which correspond to the controls that should be enabled 305// and disabled, respectively. 
306static uint32_t set_vmx_controls(uint32_t mask_1s, 307 uint32_t mask_0s, enum vmx_ctls_t type) 308{ 309 uint32_t controls = 0; 310 311 ia32_vmx_basic_t vmx_basic = ia32_vmx_basic_rd(NULL); 312 bool true_ctls = !!(ia32_vmx_basic_ctls_clear_extract(vmx_basic)); 313 if (true_ctls && (type != VMX_CTLS_SECONDARY_PROCESSOR)) { 314 uint64_t true_msr = msr_ctls_true(type); 315 controls = ((DWORD_LS(true_msr) | mask_1s) & DWORD_MS(true_msr)); 316 } else { 317 uint64_t msr = msr_ctls(type); 318 controls = ((DWORD_LS(msr) | mask_1s) & DWORD_MS(msr)); 319 } 320 assert((mask_1s & (~controls)) == 0); 321 assert((mask_0s & controls) == 0); 322 return controls; 323} 324 325/** 326 * \brief Tries to enable hardware assisted virtualization. 327 * 328 * Checks whether hardware assisted virtualization is available on the platform 329 * and enables this feature. 330 * 331 * \Return Returns VMKIT_ERR_OK on successful initialization of the subsystem 332 * or VMKIT_ERR_UNAVAIL if virtualization is unavailable. 
333 */ 334errval_t vmx_enable_virtualization (void) 335{ 336 uint32_t cpuid_ecx; 337 cpuid(CPUID_VMX, NULL, NULL, &cpuid_ecx, NULL); 338 if (!(cpuid_ecx & VMX_SUPPORT)) { 339 return SYS_ERR_VMKIT_UNAVAIL; 340 } 341 342 // The 'lock' and 'enable VMXON outside' bits of the IA32_FEATURE_CONTROL_MSR 343 // must be set 344 ia32_feature_cntl_t feat_cntl_msr; 345 feat_cntl_msr = ia32_feature_cntl_rd(NULL); 346 if (!ia32_feature_cntl_lock_extract(feat_cntl_msr) || 347 !ia32_feature_cntl_vmxoutsmx_extract(feat_cntl_msr)) { 348 return SYS_ERR_VMKIT_UNAVAIL; 349 } 350 351 pin_based_ctls = set_vmx_controls( 352 GUEST_PIN_BASE_CTLS_ENABLE, GUEST_PIN_BASE_CTLS_DISABLE, VMX_CTLS_PIN_BASED); 353 354 pp_based_ctls = set_vmx_controls( 355 GUEST_PP_CTLS_ENABLE, GUEST_PP_CTLS_DISABLE, VMX_CTLS_PRIMARY_PROCESSOR); 356 357 sp_based_ctls = set_vmx_controls( 358 GUEST_SP_CTLS_ENABLE, GUEST_SP_CTLS_DISABLE, VMX_CTLS_SECONDARY_PROCESSOR); 359 360 entry_ctls = set_vmx_controls( 361 GUEST_ENTRY_CTLS_ENABLE, GUEST_ENTRY_CTLS_DISABLE, VMX_CTLS_ENTRY); 362 363 exit_ctls = set_vmx_controls( 364 GUEST_EXIT_CTLS_ENABLE, GUEST_EXIT_CTLS_DISABLE, VMX_CTLS_EXIT); 365 366 // Initialize the VMXON region 367 memset(vmxon_region, 0x0, BASE_PAGE_SIZE); 368 ia32_vmx_basic_t vmx_basic; 369 vmx_basic = ia32_vmx_basic_rd(NULL); 370 uint32_t vmcs_rev_id = ia32_vmx_basic_vmcs_rev_id_extract(vmx_basic); 371 memcpy(vmxon_region, &vmcs_rev_id, sizeof(uint32_t)); 372 373 // The logical processor must use PAE paging 374 uint64_t cr0 = rdcr0(); 375 if ((cr0 & CR0_PE) == 0 || (rdcr0() & CR0_PG) == 0) { 376 return SYS_ERR_VMKIT_UNAVAIL; 377 } 378 379 // The CR0 register value has to support all of the CR0 fixed bits 380 if (cr0 != vmx_fixed_cr0()) { 381 return SYS_ERR_VMKIT_UNAVAIL; 382 } 383 384 // Enable virtualization, if not already enabled 385 if (!vmx_enabled()) { 386 enable_vmx(); 387 } 388 // The CR4 register value has to support all of the CR4 fixed bits 389 if (rdcr4() != vmx_fixed_cr4()) { 390 return 
SYS_ERR_VMKIT_UNAVAIL; 391 } 392 393 // Execute VMXON to place processor into VMX root operation 394 errval_t err = vmxon(mem_to_local_phys((lvaddr_t)vmxon_region)); 395 assert(err_is_ok(err)); 396 397 return SYS_ERR_OK; 398} 399 400static inline void vmx_set_exception_bitmap(void) 401{ 402 errval_t err = vmwrite(VMX_EXCP_BMP, ~(1UL << 7)); 403 assert(err_is_ok(err)); 404} 405 406#ifndef CONFIG_ARRAKISMON 407static uint64_t vmx_read_msr(uint32_t index) { 408 uint64_t val = 0; 409 switch (index) { 410 case MSR_KERNEL_GS_BASE: 411 val = ia32_kernel_gs_base_rd(NULL); 412 break; 413 case MSR_STAR: 414 val = ia32_star_rd(NULL); 415 break; 416 case MSR_LSTAR: 417 val = ia32_lstar_rd(NULL); 418 break; 419 case MSR_CSTAR: 420 val = ia32_cstar_rd(NULL); 421 break; 422 case MSR_SFMASK: 423 val = ia32_fmask_rd(NULL); 424 break; 425 default: 426 assert(!"MSR index not supported"); 427 panic("MSR index %d not supported\n", index); 428 } 429 return val; 430} 431 432static void vmx_host_msr_area_init(struct msr_entry *msr_area) 433{ 434 for (int i = 0; i < VMX_MSR_COUNT; i++) { 435 msr_area[i].index = msr_list[i]; 436 msr_area[i].val = vmx_read_msr(msr_list[i]); 437 } 438} 439#endif 440 441static inline lpaddr_t mem_to_local_phys_no_assertion(lvaddr_t addr) 442{ 443 return (lpaddr_t)(addr - (lpaddr_t)X86_64_MEMORY_OFFSET); 444} 445 446// Writes the host state, which is used after a VM-exit, to the 447// current VMCS 448static void vmx_set_host_state(void) 449{ 450 // On a page-fault the processor checks whether: 451 // (#PF error-code) & (#PF error-code mask) = (#PF error-code match) 452 453 // Setting the mask to 0, the match to 0xFFFFFFFF, and bit 14 in the 454 // exception bitmap results in no VM-exits on guest page-faults. 
455 errval_t err = vmwrite(VMX_PF_ERR_MASK, 0); 456 err += vmwrite(VMX_PF_ERR_MATCH, 0xFFFFFFFF); 457 err += vmwrite(VMX_CR3_TARGET_CNT, 0); 458 459 uint64_t cr0 = rdcr0(), cr3 = rdcr3(), cr4 = rdcr4(); 460 461 uint64_t cr0_fixed0 = ia32_vmx_cr0_fixed0_rd(NULL); 462 uint64_t cr0_fixed1 = ia32_vmx_cr0_fixed1_rd(NULL); 463 uint64_t cr4_fixed0 = ia32_vmx_cr4_fixed0_rd(NULL); 464 uint64_t cr4_fixed1 = ia32_vmx_cr4_fixed1_rd(NULL); 465 466 assert((~cr0 & cr0_fixed0) == 0); 467 assert((cr0 & ~cr0_fixed1) == 0); 468 assert((~cr4 & cr4_fixed0) == 0); 469 assert((cr4 & ~cr4_fixed1) == 0); 470 471 assert(((cr0 | cr0_fixed0) & cr0_fixed1) == cr0); 472 assert(((cr4 | cr4_fixed0) & cr4_fixed1) == cr4); 473 assert(rdcr4() & CR4_PAE); 474 475 err += vmwrite(VMX_HOST_CR0, cr0); 476 err += vmwrite(VMX_HOST_CR3, cr3); 477 err += vmwrite(VMX_HOST_CR4, cr4); 478 479 err += vmwrite(VMX_HOST_ES_SEL, rd_es() & ~0x7); 480 err += vmwrite(VMX_HOST_CS_SEL, rd_cs() & ~0x7); 481 err += vmwrite(VMX_HOST_SS_SEL, rd_ss() & ~0x7); 482 err += vmwrite(VMX_HOST_DS_SEL, rd_ds() & ~0x7); 483 err += vmwrite(VMX_HOST_TR_SEL, rd_tr() & ~0x7); 484 485 err += vmwrite(VMX_HOST_TR_BASE, tr_addr(rd_tr(), gdtr_addr(rd_gdtr()))); 486 err += vmwrite(VMX_HOST_GDTR_BASE, gdtr_addr(rd_gdtr())); 487 err += vmwrite(VMX_HOST_IDTR_BASE, idtr_addr(rd_idtr())); 488 err += vmwrite(VMX_HOST_SYSENTER_CS, 0); 489 err += vmwrite(VMX_HOST_SYSENTER_ESP, 0); 490 err += vmwrite(VMX_HOST_SYSENTER_EIP, 0); 491 err += vmwrite(VMX_HOST_PAT_F, ia32_cr_pat_rd(NULL)); 492 493 ia32_efer_t efer_msr = ia32_efer_rd(NULL); 494 err += vmwrite(VMX_HOST_EFER_F, efer_msr); 495 assert(ia32_efer_lme_extract(efer_msr)); 496 assert(ia32_efer_lma_extract(efer_msr)); 497 498 err += vmwrite(VMX_HOST_GS_SEL, 0x0); 499 err += vmwrite(VMX_HOST_GS_BASE, 0x0); 500 501 err += vmwrite(VMX_HOST_FS_SEL, 0x0); 502 err += vmwrite(VMX_HOST_FS_BASE, 0x0); 503 504 err += vmwrite(VMX_HOST_RIP, (uint64_t)(&vmx_return_func)); 505#ifndef CONFIG_ARRAKISMON 506 
vmx_host_msr_area_init(host_msr_area); 507 508 lpaddr_t msr_area_base = mem_to_local_phys_no_assertion( 509 (lvaddr_t) host_msr_area); 510 if (!((lvaddr_t) host_msr_area >= X86_64_MEMORY_OFFSET)) { 511 printk(LOG_NOTE, "assertion failed! 0x%lx >= 0x%lx\n", 512 (lvaddr_t) host_msr_area, 513 X86_64_MEMORY_OFFSET); 514 } 515 516 err += vmwrite(VMX_EXIT_MSR_LOAD_F, canonical_form(msr_area_base)); 517 err += vmwrite(VMX_EXIT_MSR_LOAD_CNT, VMX_MSR_COUNT); 518#endif 519 assert(err_is_ok(err)); 520} 521 522// Writes the VMX controls to the current VMCS. 523void vmx_set_exec_ctls(void) 524{ 525 // VM-execution controls 526 errval_t err = vmwrite(VMX_EXEC_PIN_BASED, pin_based_ctls); 527 err += vmwrite(VMX_EXEC_PRIM_PROC, pp_based_ctls); 528 err += vmwrite(VMX_EXEC_SEC_PROC, sp_based_ctls); 529 530 // VM-entry and VM-exit control fields 531 err += vmwrite(VMX_EXIT_CONTROLS, exit_ctls); 532 err += vmwrite(VMX_ENTRY_CONTROLS, entry_ctls); 533 534 vmx_set_exception_bitmap(); 535 536 err += vmwrite(VMX_ENTRY_INTR_INFO, 0); 537 err += vmwrite(VMX_ENTRY_EXCP_ERR, 0); 538 err += vmwrite(VMX_ENTRY_INSTR_LEN, 0); 539 assert(err_is_ok(err)); 540} 541 542errval_t initialize_vmcs(lpaddr_t vmcs_paddr) 543{ 544 struct vmcs *vmcs = (struct vmcs *)local_phys_to_mem(vmcs_paddr); 545 546 ia32_vmx_basic_t vmx_basic; 547 vmx_basic = ia32_vmx_basic_rd(NULL); 548 uint32_t vmcs_rev_id = ia32_vmx_basic_vmcs_rev_id_extract(vmx_basic); 549 550 memset(vmcs, 0x0, BASE_PAGE_SIZE); 551 vmcs->prelude.p.revision_id = vmcs_rev_id; 552 vmcs->prelude.p.shadow = 0; 553 errval_t err = vmclear(vmcs_paddr); 554 err += vmptrld(vmcs_paddr); 555 556 err += vmwrite(VMX_GUEST_VMCS_LPTR_F, ~0x0); 557 err += vmwrite(VMX_GUEST_VMCS_LPTR_H, ~0x0); 558 err += vmwrite(VMX_GUEST_SYSENTER_CS, 0x0); 559 err += vmwrite(VMX_GUEST_SYSENTER_ESP, 0x0); 560 err += vmwrite(VMX_GUEST_SYSENTER_EIP, 0x0); 561#ifdef CONFIG_ARRAKISMON 562 err += vmwrite(VMX_GUEST_DR7, 0x0); 563 err += vmwrite(VMX_GUEST_EFER_F, ia32_efer_rd(NULL) | EFER_LME 
| EFER_LMA); 564 565 err += vmwrite(VMX_GUEST_ACTIV_STATE, 0x0); 566 err += vmwrite(VMX_GUEST_INTR_STATE, 0x0); 567 568 err += vmwrite(VMX_GUEST_CS_LIM, 0xFFFFFFFF); 569 err += vmwrite(VMX_GUEST_DS_LIM, 0xFFFFFFFF); 570 err += vmwrite(VMX_GUEST_ES_LIM, 0xFFFFFFFF); 571 err += vmwrite(VMX_GUEST_SS_LIM, 0xFFFFFFFF); 572 err += vmwrite(VMX_GUEST_FS_LIM, 0xFFFFFFFF); 573 err += vmwrite(VMX_GUEST_GS_LIM, 0xFFFFFFFF); 574 err += vmwrite(VMX_GUEST_TR_LIM, 0xFFFF); 575 err += vmwrite(VMX_GUEST_LDTR_LIM, 0xFFFF); 576 err += vmwrite(VMX_GUEST_GDTR_LIM, 0xFFFF); 577 err += vmwrite(VMX_GUEST_IDTR_LIM, 0xFFFF); 578 579 err += vmwrite(VMX_GUEST_CS_ACCESS, 0xA09B); 580 err += vmwrite(VMX_GUEST_DS_ACCESS, 0xC093); 581 err += vmwrite(VMX_GUEST_ES_ACCESS, 0xC093); 582 err += vmwrite(VMX_GUEST_FS_ACCESS, 0xC093); 583 err += vmwrite(VMX_GUEST_GS_ACCESS, 0xC093); 584 err += vmwrite(VMX_GUEST_SS_ACCESS, 0xC093); 585 err += vmwrite(VMX_GUEST_TR_ACCESS, 0x8B); 586 err += vmwrite(VMX_GUEST_LDTR_ACCESS, 0x82); 587 588 err += vmwrite(VMX_GUEST_CS_SEL, 0x8); 589 err += vmwrite(VMX_GUEST_SS_SEL, 0x10); 590 err += vmwrite(VMX_GUEST_DS_SEL, 0x10); 591 err += vmwrite(VMX_GUEST_ES_SEL, 0x10); 592 err += vmwrite(VMX_GUEST_FS_SEL, 0x10); 593 err += vmwrite(VMX_GUEST_GS_SEL, 0x10); 594 err += vmwrite(VMX_GUEST_TR_SEL, 0x10); 595 err += vmwrite(VMX_GUEST_LDTR_SEL, 0x10); 596 597 err += vmwrite(VMX_GUEST_CS_BASE, 0x0); 598 err += vmwrite(VMX_GUEST_SS_BASE, 0x0); 599 err += vmwrite(VMX_GUEST_DS_BASE, 0x0); 600 err += vmwrite(VMX_GUEST_ES_BASE, 0x0); 601 err += vmwrite(VMX_GUEST_FS_BASE, 0x0); 602 err += vmwrite(VMX_GUEST_GS_BASE, 0x0); 603 err += vmwrite(VMX_GUEST_TR_BASE, 0x0); 604 err += vmwrite(VMX_GUEST_LDTR_BASE, 0x0); 605 err += vmwrite(VMX_GUEST_GDTR_BASE, 0x0); 606 err += vmwrite(VMX_GUEST_IDTR_BASE, 0x0); 607 608 uint64_t guest_cr0 = 0x60000010 | CR0_PE | CR0_PG; 609 err += vmwrite(VMX_GUEST_CR0, (uint32_t)(guest_cr0 | ia32_vmx_cr0_fixed0_rd(NULL)) & 610 ia32_vmx_cr0_fixed1_rd(NULL)); 611 612 
uint64_t guest_cr4 = CR4_PAE; 613 err += vmwrite(VMX_GUEST_CR4, (guest_cr4 | ia32_vmx_cr4_fixed0_rd(NULL)) & 614 ia32_vmx_cr4_fixed1_rd(NULL)); 615 616 err += vmwrite(VMX_CR0_GH_MASK, 0UL); 617 err += vmwrite(VMX_CR4_GH_MASK, 0UL); 618#else 619 err += vmwrite(VMX_GUEST_DR7, 0x400); 620 err += vmwrite(VMX_GUEST_EFER_F, 0x0); 621 err += vmwrite(VMX_GUEST_PAT_F, 0x0007040600070406ul); 622 623 err += vmwrite(VMX_GUEST_ACTIV_STATE, 0x0); 624 err += vmwrite(VMX_GUEST_INTR_STATE, 0x0); 625 626 err += vmwrite(VMX_GUEST_CS_LIM, 0xFFFF); 627 err += vmwrite(VMX_GUEST_DS_LIM, 0xFFFF); 628 err += vmwrite(VMX_GUEST_ES_LIM, 0xFFFF); 629 err += vmwrite(VMX_GUEST_FS_LIM, 0xFFFF); 630 err += vmwrite(VMX_GUEST_GS_LIM, 0xFFFF); 631 err += vmwrite(VMX_GUEST_SS_LIM, 0xFFFF); 632 err += vmwrite(VMX_GUEST_TR_LIM, 0xFFFF); 633 err += vmwrite(VMX_GUEST_LDTR_LIM, 0xFFFF); 634 err += vmwrite(VMX_GUEST_GDTR_LIM, 0xFFFF); 635 err += vmwrite(VMX_GUEST_IDTR_LIM, 0xFFFF); 636 637 err += vmwrite(VMX_GUEST_CS_ACCESS, 0x9B); 638 err += vmwrite(VMX_GUEST_DS_ACCESS, 0x93); 639 err += vmwrite(VMX_GUEST_ES_ACCESS, 0x93); 640 err += vmwrite(VMX_GUEST_FS_ACCESS, 0x93); 641 err += vmwrite(VMX_GUEST_GS_ACCESS, 0x93); 642 err += vmwrite(VMX_GUEST_SS_ACCESS, 0x93); 643 err += vmwrite(VMX_GUEST_TR_ACCESS, 0x8B); 644 err += vmwrite(VMX_GUEST_LDTR_ACCESS, 0x82); 645 646 err += vmwrite(VMX_GUEST_CS_SEL, 0x0); 647 err += vmwrite(VMX_GUEST_DS_SEL, 0x0); 648 err += vmwrite(VMX_GUEST_ES_SEL, 0x0); 649 err += vmwrite(VMX_GUEST_FS_SEL, 0x0); 650 err += vmwrite(VMX_GUEST_GS_SEL, 0x0); 651 err += vmwrite(VMX_GUEST_SS_SEL, 0x0); 652 err += vmwrite(VMX_GUEST_TR_SEL, 0x0); 653 err += vmwrite(VMX_GUEST_LDTR_SEL, 0x0); 654 655 err += vmwrite(VMX_GUEST_CS_BASE, 0x0); 656 err += vmwrite(VMX_GUEST_DS_BASE, 0x0); 657 err += vmwrite(VMX_GUEST_ES_BASE, 0x0); 658 err += vmwrite(VMX_GUEST_FS_BASE, 0x0); 659 err += vmwrite(VMX_GUEST_GS_BASE, 0x0); 660 err += vmwrite(VMX_GUEST_SS_BASE, 0x0); 661 err += vmwrite(VMX_GUEST_TR_BASE, 0x0); 
662 err += vmwrite(VMX_GUEST_LDTR_BASE, 0x0); 663 err += vmwrite(VMX_GUEST_GDTR_BASE, 0x0); 664 err += vmwrite(VMX_GUEST_IDTR_BASE, 0x0); 665 666 err += vmwrite(VMX_GUEST_RFLAGS, 0x200002); 667 err += vmwrite(VMX_GUEST_RIP, 0xFFF0); 668 err += vmwrite(VMX_GUEST_RSP, 0x0); 669 670 uint64_t guest_cr0 = (0x60000010 | ia32_vmx_cr0_fixed0_rd(NULL)) & 671 ia32_vmx_cr0_fixed1_rd(NULL); 672 err += vmwrite(VMX_GUEST_CR0, guest_cr0 & ~(CR0_PE | CR0_PG)); 673 674 uint64_t guest_cr4 = CR4_PAE; 675 err += vmwrite(VMX_GUEST_CR4, (guest_cr4 | ia32_vmx_cr4_fixed0_rd(NULL)) & 676 ia32_vmx_cr4_fixed1_rd(NULL)); 677 assert((guest_cr4 & CR4_PCIDE) == 0); 678 679 uint64_t cr0_shadow; 680 err += vmread(VMX_GUEST_CR0, &cr0_shadow); 681 682 err += vmwrite(VMX_CR0_RD_SHADOW, cr0_shadow); 683 err += vmwrite(VMX_CR0_GH_MASK, CR0_PE); 684 err += vmwrite(VMX_CR4_GH_MASK, 0x20); 685#endif 686 assert(err_is_ok(err)); 687 688 vmx_set_exec_ctls(); 689 690 return SYS_ERR_OK; 691} 692 693static uint32_t fail = 0; 694 695static inline void enter_guest(void) 696{ 697 // Set the host state prior to every VM-entry in case the values 698 // written to the VMCS change. 699 //printf("%s:%d\n", __FUNCTION__, __LINE__); 700 vmx_set_host_state(); 701 702 // This is necessary or else a #GPF will be incurred in the 703 // monitor domain. 
704 //printf("%s:%d\n", __FUNCTION__, __LINE__); 705 uint16_t ldtr_sel = rd_ldtr(); 706 707 // Perform most checks that are performed by the processor 708 //printf("%s:%d\n", __FUNCTION__, __LINE__); 709 if (!launched) { 710 check_guest_state_area(); 711 check_host_state_area(); 712 check_vmx_controls(); 713 } 714 //printf("%s:%d\n", __FUNCTION__, __LINE__); 715 716 __asm volatile("mov %[ctrl], %%rdi\n\t" 717 718 // save host host 719 "mov %%rsp, %%r8\n\t" 720 "mov %[host_rsp_encoding], %%r9\n\t" 721 "vmwrite %%r8, %%r9\n\t" 722 723 "mov %%rbx, (148 + 1*8)(%%rdi)\n\t" 724 "mov %%rbp, (148 + 6*8)(%%rdi)\n\t" 725 "mov %%r12, (148 + 12*8)(%%rdi)\n\t" 726 "mov %%r13, (148 + 13*8)(%%rdi)\n\t" 727 "mov %%r14, (148 + 14*8)(%%rdi)\n\t" 728 "mov %%r15, (148 + 15*8)(%%rdi)\n\t" 729 "mov %%cr2, %%rsi\n\t" 730 "mov %%rsi, 38*8(%%rdi)\n\t" 731 732 // load guest state 733 "mov 37*8(%%rdi), %%rsi\n\t" 734 "mov %%rsi, %%cr2\n\t" 735 736 "mov 0*8(%%rdi), %%rax\n\t" 737 "mov 1*8(%%rdi), %%rbx\n\t" 738 "mov 2*8(%%rdi), %%rcx\n\t" 739 "mov 3*8(%%rdi), %%rdx\n\t" 740 "mov 4*8(%%rdi), %%rsi\n\t" 741 "mov 6*8(%%rdi), %%rbp\n\t" 742 "mov 8*8(%%rdi), %%r8\n\t" 743 "mov 9*8(%%rdi), %%r9\n\t" 744 "mov 10*8(%%rdi), %%r10\n\t" 745 "mov 11*8(%%rdi), %%r11\n\t" 746 "mov 12*8(%%rdi), %%r12\n\t" 747 "mov 13*8(%%rdi), %%r13\n\t" 748 "mov 14*8(%%rdi), %%r14\n\t" 749 "mov 15*8(%%rdi), %%r15\n\t" 750 "mov 5*8(%%rdi), %%rdi\n\t" 751 752 // enter the guest VM 753 "cmpl $0, %[launched]\n\t" 754 "jne 1f\n\t" 755 "sti\n\t" 756 "vmlaunch\n\t" 757 "jmp 2f\n\t" 758 "1: " 759 "sti\n\t" 760 "vmresume\n\t" 761 "2: " 762 "setbe %[fail]\n\t" 763 "vmx_return_func:\n\t" 764 "cli\n\t" 765 766 "push %%rdi\n\t" 767 "mov %[ctrl], %%rdi\n\t" 768 769 // save guest state 770 "mov %%rax, 0*8(%%rdi)\n\t" 771 "mov %%rbx, 1*8(%%rdi)\n\t" 772 "mov %%rcx, 2*8(%%rdi)\n\t" 773 "mov %%rdx, 3*8(%%rdi)\n\t" 774 "mov %%rsi, 4*8(%%rdi)\n\t" 775 "mov %%rbp, 6*8(%%rdi)\n\t" 776 "mov %%r8, 8*8(%%rdi)\n\t" 777 "mov %%r9, 9*8(%%rdi)\n\t" 
778 "mov %%r10, 10*8(%%rdi)\n\t" 779 "mov %%r11, 11*8(%%rdi)\n\t" 780 "mov %%r12, 12*8(%%rdi)\n\t" 781 "mov %%r13, 13*8(%%rdi)\n\t" 782 "mov %%r14, 14*8(%%rdi)\n\t" 783 "mov %%r15, 15*8(%%rdi)\n\t" 784 785 "mov %%cr2, %%rsi\n\t" 786 "mov %%rsi, 37*8(%%rdi)\n\t" 787 788 "pop %%rsi\n\t" 789 "mov %%rsi, 5*8(%%rdi)\n\t" 790 791 // load host state 792 "mov (148 + 1*8)(%%rdi), %%rbx\n\t" 793 "mov (148 + 6*8)(%%rdi), %%rbp\n\t" 794 "mov (148 + 12*8)(%%rdi), %%r12\n\t" 795 "mov (148 + 13*8)(%%rdi), %%r13\n\t" 796 "mov (148 + 14*8)(%%rdi), %%r14\n\t" 797 "mov (148 + 15*8)(%%rdi), %%r15\n\t" 798 "mov 38*8(%%rdi), %%rsi\n\t" 799 "mov %%rsi, %%cr2\n\t" 800 : [fail] "=m" (fail) 801 : [ctrl] "m" (ctrl), [launched] "m" (launched), 802 [host_rsp_encoding] "i" (VMX_HOST_RSP) 803 : "memory" 804 ); 805 assert(!fail); 806 wr_ldtr(ldtr_sel); 807 808 launched = 1; 809} 810 811static inline void print_vmcs_info(struct guest_control *g) 812{ 813 uint64_t guest_rip, guest_rsp, guest_rflags; 814 uint64_t reason, exit_qual; 815 uint64_t exit_intr_info, intr_err; 816 uint64_t idt_vec_info, idt_vec_err; 817 uint64_t instr_len, instr_info; 818 uint64_t instr_error, gpaddr, gladdr; 819 uint64_t entry_intr_info, activ_state, intr_state; 820 uint64_t guest_cr0, guest_cr3, guest_cr4; 821 uint64_t guest_efer; 822 823 uint64_t guest_es_sel, guest_es_base, guest_es_lim, guest_es_access; 824 uint64_t guest_cs_sel, guest_cs_base, guest_cs_lim, guest_cs_access; 825 uint64_t guest_ss_sel, guest_ss_base, guest_ss_lim, guest_ss_access; 826 uint64_t guest_ds_sel, guest_ds_base, guest_ds_lim, guest_ds_access; 827 uint64_t guest_fs_sel, guest_fs_base, guest_fs_lim, guest_fs_access; 828 uint64_t guest_gs_sel, guest_gs_base, guest_gs_lim, guest_gs_access; 829 uint64_t guest_tr_sel, guest_tr_base, guest_tr_lim, guest_tr_access; 830 uint64_t guest_ldtr_sel, guest_ldtr_base, guest_ldtr_lim, guest_ldtr_access; 831 uint64_t guest_idtr_base, guest_idtr_lim; 832 uint64_t guest_gdtr_base, guest_gdtr_lim; 833 834 
errval_t err = vmread(VMX_GUEST_ES_SEL, &guest_es_sel); 835 err += vmread(VMX_GUEST_ES_BASE, &guest_es_base); 836 err += vmread(VMX_GUEST_ES_LIM, &guest_es_lim); 837 err += vmread(VMX_GUEST_ES_ACCESS, &guest_es_access); 838 err += vmread(VMX_GUEST_CS_SEL, &guest_cs_sel); 839 err += vmread(VMX_GUEST_CS_BASE, &guest_cs_base); 840 err += vmread(VMX_GUEST_CS_LIM, &guest_cs_lim); 841 err += vmread(VMX_GUEST_CS_ACCESS, &guest_cs_access); 842 err += vmread(VMX_GUEST_SS_SEL, &guest_ss_sel); 843 err += vmread(VMX_GUEST_SS_BASE, &guest_ss_base); 844 err += vmread(VMX_GUEST_SS_LIM, &guest_ss_lim); 845 err += vmread(VMX_GUEST_SS_ACCESS, &guest_ss_access); 846 err += vmread(VMX_GUEST_DS_SEL, &guest_ds_sel); 847 err += vmread(VMX_GUEST_DS_BASE, &guest_ds_base); 848 err += vmread(VMX_GUEST_DS_LIM, &guest_ds_lim); 849 err += vmread(VMX_GUEST_DS_ACCESS, &guest_ds_access); 850 err += vmread(VMX_GUEST_FS_SEL, &guest_fs_sel); 851 err += vmread(VMX_GUEST_FS_BASE, &guest_fs_base); 852 err += vmread(VMX_GUEST_FS_LIM, &guest_fs_lim); 853 err += vmread(VMX_GUEST_FS_ACCESS, &guest_fs_access); 854 err += vmread(VMX_GUEST_GS_SEL, &guest_gs_sel); 855 err += vmread(VMX_GUEST_GS_BASE, &guest_gs_base); 856 err += vmread(VMX_GUEST_GS_LIM, &guest_gs_lim); 857 err += vmread(VMX_GUEST_GS_ACCESS, &guest_gs_access); 858 err += vmread(VMX_GUEST_TR_SEL, &guest_tr_sel); 859 err += vmread(VMX_GUEST_TR_BASE, &guest_tr_base); 860 err += vmread(VMX_GUEST_TR_LIM, &guest_tr_lim); 861 err += vmread(VMX_GUEST_TR_ACCESS, &guest_tr_access); 862 err += vmread(VMX_GUEST_LDTR_SEL, &guest_ldtr_sel); 863 err += vmread(VMX_GUEST_LDTR_BASE, &guest_ldtr_base); 864 err += vmread(VMX_GUEST_LDTR_LIM, &guest_ldtr_lim); 865 err += vmread(VMX_GUEST_LDTR_ACCESS, &guest_ldtr_access); 866 err += vmread(VMX_GUEST_IDTR_BASE, &guest_idtr_base); 867 err += vmread(VMX_GUEST_IDTR_LIM, &guest_idtr_lim); 868 err += vmread(VMX_GUEST_GDTR_BASE, &guest_gdtr_base); 869 err += vmread(VMX_GUEST_GDTR_LIM, &guest_gdtr_lim); 870 871 err += 
vmread(VMX_GUEST_RIP, &guest_rip); 872 err += vmread(VMX_GUEST_RSP, &guest_rsp); 873 err += vmread(VMX_GUEST_RFLAGS, &guest_rflags); 874 err += vmread(VMX_EXIT_REASON, &reason); 875 err += vmread(VMX_EXIT_QUAL, &exit_qual); 876 err += vmread(VMX_EXIT_INTR_INFO, &exit_intr_info); 877 err += vmread(VMX_EXIT_INTR_ERR, &intr_err); 878 err += vmread(VMX_IDT_VEC_INFO, &idt_vec_info); 879 err += vmread(VMX_IDT_VEC_ERR, &idt_vec_err); 880 err += vmread(VMX_INSTR_ERROR, &instr_error); 881 err += vmread(VMX_GPADDR_F, &gpaddr); 882 err += vmread(VMX_GL_ADDR, &gladdr); 883 err += vmread(VMX_ENTRY_INTR_INFO, &entry_intr_info); 884 err += vmread(VMX_GUEST_ACTIV_STATE, &activ_state); 885 err += vmread(VMX_GUEST_INTR_STATE, &intr_state); 886 err += vmread(VMX_EXIT_INSTR_LEN, &instr_len); 887 err += vmread(VMX_EXIT_INSTR_INFO, &instr_info); 888 err += vmread(VMX_GUEST_CR0, &guest_cr0); 889 err += vmread(VMX_GUEST_CR3, &guest_cr3); 890 err += vmread(VMX_GUEST_CR4, &guest_cr4); 891 err += vmread(VMX_GUEST_EFER_F, &guest_efer); 892 assert(err_is_ok(err)); 893 894 printf("VMCS info:\n"); 895 printf("\tvmexit reason = %d\n", (int)reason & 0xFFFF); 896 printf("\texit qualification = 0x%"PRIx64"\n", exit_qual); 897 printf("\tBit 31 of reason = %x\n", ((int)reason >> 31) & 1); 898 899 printf("\tVM-exit interruption information = 0x%"PRIx64"\n", exit_intr_info); 900 printf("\tVM-exit interruption error = 0x%"PRIx64"\n", intr_err); 901 902 printf("\tVM-entry interruption info=0x%"PRIx64"\n", entry_intr_info); 903 904 printf("\tIDT vector information = 0x%"PRIx64"\n", idt_vec_info); 905 printf("\tIDT vector error = 0x%"PRIx64"\n", idt_vec_err); 906 907 printf("\tInstruction error = 0x%"PRIx64", gladdr = 0x%"PRIx64", gpaddr = 0x%"PRIx64"\n", 908 instr_error, gpaddr, gladdr); 909 printf("\tActivity state=0x%"PRIx64", Interruptibility state=0x%"PRIx64"\n", 910 activ_state, intr_state); 911 printf("\tVM-exit instruction length = 0x%"PRIx64"\n", instr_len); 912 printf("\tVM-exit instruction info = 
0x%"PRIx64"\n", instr_info); 913 914 printf("\tguest_rip = 0x%"PRIx64", guest_rflags = 0x%"PRIx64"\n", 915 guest_rip, guest_rflags); 916 printf("\tRAX=0x%"PRIx64" RBX=0x%"PRIx64" RCX=0x%"PRIx64" RDX=0x%"PRIx64"\n", 917 g->regs.rax, g->regs.rbx, g->regs.rcx, g->regs.rdx); 918 printf("\tRSP=0x%"PRIx64" RBP=0x%"PRIx64" RSI=0x%"PRIx64" RDI=0x%"PRIx64"\n", 919 guest_rsp, g->regs.rbp, g->regs.rsi, g->regs.rdi); 920 printf("\tR8 =0x%"PRIx64" R9 =0x%"PRIx64" R10=0x%"PRIx64" R11=0x%"PRIx64"\n", 921 g->regs.r8, g->regs.r9, g->regs.r10, g->regs.r11); 922 printf("\tR12=0x%"PRIx64" R13=0x%"PRIx64" R14=0x%"PRIx64" R15=0x%"PRIx64"\n", 923 g->regs.r12, g->regs.r13, g->regs.r14, g->regs.r15); 924 printf("\tCR0=0x%"PRIx64", CR3=0x%"PRIx64", CR4=0x%"PRIx64"\n", 925 guest_cr0, guest_cr3, guest_cr4); 926 927 printf("\tES: sel=0x%"PRIx64", base=0x%"PRIx64", lim=0x%"PRIx64", access=0x%"PRIx64"\n", 928 guest_es_sel, guest_es_base, guest_es_lim, guest_es_access); 929 printf("\tCS: sel=0x%"PRIx64", base=0x%"PRIx64", lim=0x%"PRIx64", access=0x%"PRIx64"\n", 930 guest_cs_sel, guest_cs_base, guest_cs_lim, guest_cs_access); 931 printf("\tSS: sel= 0x%"PRIx64", base=0x%"PRIx64", lim=0x%"PRIx64", access=0x%"PRIx64"\n", 932 guest_ss_sel, guest_ss_base, guest_ss_lim, guest_ss_access); 933 printf("\tDS: sel=0x%"PRIx64", base=0x%"PRIx64", lim=0x%"PRIx64", access=0x%"PRIx64"\n", 934 guest_ds_sel, guest_ds_base, guest_ds_lim, guest_ds_access); 935 printf("\tFS: sel=0x%"PRIx64", base=0x%"PRIx64", lim=0x%"PRIx64", access=0x%"PRIx64"\n", 936 guest_fs_sel, guest_fs_base, guest_fs_lim, guest_fs_access); 937 printf("\tGS: sel=0x%"PRIx64", base=0x%"PRIx64", lim=0x%"PRIx64", access=0x%"PRIx64"\n", 938 guest_gs_sel, guest_gs_base, guest_gs_lim, guest_gs_access); 939 printf("\tTR: sel=0x%"PRIx64", base=0x%"PRIx64", lim=0x%"PRIx64", access=0x%"PRIx64"\n", 940 guest_tr_sel, guest_tr_base, guest_tr_lim, guest_tr_access); 941 printf("\tLDTR: sel=0x%"PRIx64", base=0x%"PRIx64", lim=0x%"PRIx64", access=0x%"PRIx64"\n", 
           guest_ldtr_sel, guest_ldtr_base, guest_ldtr_lim, guest_ldtr_access);
    printf("\tIDTR: base=0x%"PRIx64", lim=0x%"PRIx64"\n",
           guest_idtr_base, guest_idtr_lim);
    printf("\tGDTR: base=0x%"PRIx64", lim=0x%"PRIx64"\n",
           guest_gdtr_base, guest_gdtr_lim);

    printf("\tEFER = 0x%"PRIx64"\n", guest_efer);
}

/**
 * \brief Extract the interruption-type field (bits 10:8) from a VMX
 *        interruption-information value (e.g. VMX_EXIT_INTR_INFO).
 */
static inline uint64_t interruption_type(uint64_t intr_info) {
    return (intr_info >> 8) & 0x7;
}

/**
 * \brief Hand the current VM exit to the user-space monitor; never returns.
 *
 * Removes the guest's DCB from the scheduler, delivers an LMP notification
 * on the guest's monitor endpoint, and dispatches the monitor's listener
 * dispatcher.
 *
 * \param dcb  DCB of the guest domain that just exited (must be a VM guest).
 */
static void __attribute__ ((noreturn))
call_monitor(struct dcb *dcb)
{
    ctrl->num_vm_exits_with_monitor_invocation++;
    /* the guest exited not due to an interrupt but some condition the
     * monitor has to handle, therefore notify the monitor */

    assert(dcb->is_vm_guest);

    // disable the domain
    scheduler_remove(dcb);

    // call the monitor
    errval_t err = lmp_deliver_notification(&dcb->guest_desc.monitor_ep.cap);
    if (err_is_fail(err)) {
        printk(LOG_ERR, "Unexpected error delivering VMEXIT");
    }

    // run the monitor (does not return)
    dispatch(dcb->guest_desc.monitor_ep.cap.u.endpointlmp.listener);
}

/**
 * \brief Debugging aid: walk and print an x86-64 4-level page table.
 *
 * Iterates every PML4 slot below the kernel's slot for X86_64_MEMORY_OFFSET
 * and prints each non-zero entry at every level, handling 1G ("huge") and
 * 2M ("large") leaf mappings as well as 4K leaf entries.
 *
 * \param root_pt_phys  Physical address of the root (PML4) table.
 */
__attribute__((unused))
static void dump_page_tables(lpaddr_t root_pt_phys)
{
    lvaddr_t root_pt = local_phys_to_mem(root_pt_phys);
    printk(LOG_NOTE, "dumping page tables rooted at 0x%"PRIxLPADDR"\n", root_pt_phys);

    // loop over pdpts
    union x86_64_ptable_entry *pt;
    size_t kernel_pml4e = X86_64_PML4_BASE(X86_64_MEMORY_OFFSET);
    for (int pdpt_index = 0; pdpt_index < kernel_pml4e; pdpt_index++) {
        union x86_64_pdir_entry *pdpt = (union x86_64_pdir_entry *)root_pt + pdpt_index;
        if (!pdpt->raw) { continue; }
        else {
            genpaddr_t paddr = (genpaddr_t)pdpt->d.base_addr << BASE_PAGE_BITS;
            printf("%d: 0x%"PRIxGENPADDR" (%d %d), raw=0x%"PRIx64"\n",
                   pdpt_index, paddr,
                   pdpt->d.read_write, pdpt->d.user_supervisor,
                   pdpt->raw);
        }
        // NOTE(review): unlike the paddr computation above, this shift lacks
        // the (genpaddr_t) cast; if the base_addr bitfield promotes to a
        // 32-bit type the shift may truncate high bits — TODO confirm the
        // bitfield's promoted width (same pattern for pdir_gp/ptable_gp below).
        genpaddr_t pdpt_gp = pdpt->d.base_addr << BASE_PAGE_BITS;
        lvaddr_t pdpt_lv = local_phys_to_mem(gen_phys_to_local_phys(pdpt_gp));

        for (int pdir_index = 0; pdir_index < X86_64_PTABLE_SIZE; pdir_index++) {
            // get pdir
            union x86_64_pdir_entry *pdir = (union x86_64_pdir_entry *)pdpt_lv + pdir_index;
            pt = (union x86_64_ptable_entry*)pdir;
            if (!pdir->raw) { continue; }
            // check if pdir or huge page
            if (pt->huge.always1) {
                // is huge page mapping
                genpaddr_t paddr = (genpaddr_t)pt->huge.base_addr << HUGE_PAGE_BITS;
                printf("%d.%d: 0x%"PRIxGENPADDR" (%d %d %d)\n", pdpt_index,
                       pdir_index, paddr, pt->huge.read_write,
                       pt->huge.dirty, pt->huge.accessed);
                // goto next pdpt entry
                continue;
            } else {
                genpaddr_t paddr = (genpaddr_t)pdir->d.base_addr << BASE_PAGE_BITS;
                printf("%d.%d: 0x%"PRIxGENPADDR" (%d %d), raw=0x%"PRIx64"\n",
                       pdpt_index, pdir_index, paddr,
                       pdir->d.read_write, pdir->d.user_supervisor,
                       pdir->raw);
            }
            genpaddr_t pdir_gp = pdir->d.base_addr << BASE_PAGE_BITS;
            lvaddr_t pdir_lv = local_phys_to_mem(gen_phys_to_local_phys(pdir_gp));

            for (int ptable_index = 0; ptable_index < X86_64_PTABLE_SIZE; ptable_index++) {
                // get ptable
                union x86_64_pdir_entry *ptable = (union x86_64_pdir_entry *)pdir_lv + ptable_index;
                pt = (union x86_64_ptable_entry *)ptable;
                if (!ptable->raw) { continue; }
                // check if ptable or large page
                if (pt->large.always1) {
                    // is large page mapping
                    genpaddr_t paddr = (genpaddr_t)pt->large.base_addr << LARGE_PAGE_BITS;
                    printf("%d.%d.%d: 0x%"PRIxGENPADDR" (%d %d %d)\n",
                           pdpt_index, pdir_index, ptable_index, paddr,
                           pt->large.read_write, pt->large.dirty, pt->large.accessed);
                    // goto next pdir entry
                    continue;
                } else {
                    genpaddr_t paddr = (genpaddr_t)ptable->d.base_addr << BASE_PAGE_BITS;
                    printf("%d.%d.%d: 0x%"PRIxGENPADDR" (%d %d), raw=0x%"PRIx64"\n",
                           pdpt_index, pdir_index, ptable_index, paddr,
                           ptable->d.read_write,
                           ptable->d.user_supervisor,
                           ptable->raw);
                }
                genpaddr_t ptable_gp = ptable->d.base_addr << BASE_PAGE_BITS;
                lvaddr_t ptable_lv = local_phys_to_mem(gen_phys_to_local_phys(ptable_gp));

                // leaf level: 4K page entries; entries with a zero frame
                // address are treated as not present and skipped
                for (int entry = 0; entry < X86_64_PTABLE_SIZE; entry++) {
                    union x86_64_ptable_entry *e =
                        (union x86_64_ptable_entry *)ptable_lv + entry;
                    genpaddr_t paddr = (genpaddr_t)e->base.base_addr << BASE_PAGE_BITS;
                    if (!paddr) {
                        continue;
                    }
                    printf("%d.%d.%d.%d: 0x%"PRIxGENPADDR" (%d %d %d), raw=0x%"PRIx64"\n",
                           pdpt_index, pdir_index, ptable_index, entry,
                           paddr, e->base.read_write, e->base.dirty, e->base.accessed,
                           e->raw);
                }
            }
        }
    }
}

// System-call dispatcher used to service guest VMCALLs (defined elsewhere).
struct sysret sys_vmcall(uint64_t syscall, uint64_t arg0, uint64_t arg1,
                         uint64_t *args, uint64_t rflags, uint64_t rip,
                         struct capability *root);

extern uint64_t user_stack_save;

/**
 * \brief Enter (and repeatedly re-enter) a VMX guest; never returns.
 *
 * Maps the guest-control frame, programs the EPT pointer and guest CR3 from
 * the DCB when they changed, then loops: enter the guest, read the VM-exit
 * reason, and either handle the exit in-kernel (resuming the guest via
 * vmx_vmenter_loop) or hand it to the user-space monitor via call_monitor().
 *
 * \param dcb  DCB of the VM-guest domain to run.
 */
void __attribute__ ((noreturn))
vmx_vmkit_vmenter (struct dcb *dcb)
{
    errval_t err;
    // map the shared guest-control frame into kernel address space
    lpaddr_t lpaddr = gen_phys_to_local_phys(dcb->guest_desc.ctrl.cap.u.frame.base);
    ctrl = (void *)local_phys_to_mem(lpaddr);

    assert(dcb != NULL);
    assert(dcb->vspace != 0);
    assert(dcb->is_vm_guest);

    if (ept_enabled()) {
        uint64_t old_eptp_root, old_guest_cr3;
        // err accumulates vmread results; checked in bulk below (file convention)
        err = vmread(VMX_EPTP_F, &old_eptp_root);
        err+= vmread(VMX_GUEST_CR3, &old_guest_cr3);
        assert(err_is_ok(err));
        // dcb->vspace is root of EPT, dcb->guest.vspace is root of guest AS
        // get dcb->vspace masked with width of physical address space and
        // mask out low 12 bits
        uint64_t eptp_root = 0x6ull | (3 << 3);
        eptp_root |= (dcb->guest_desc.vspace & pa_width_mask()) & ~BASE_PAGE_MASK;
        // set bits 5:3 to 0x3 (i.e. 1 less than length of ept walks)
        //eptp_root |= 0x18;
        //printk(LOG_NOTE, "setting EPTP_F to 0x%lx\n", eptp_root);
        // only rewrite VMCS fields when the value actually changed
        if (old_eptp_root != eptp_root) {
            printk(LOG_NOTE, "setting EPTP_F to 0x%lx\n", eptp_root);
            err = vmwrite(VMX_EPTP_F, eptp_root);
            assert(err_is_ok(err));
        }
        if (old_guest_cr3 != dcb->vspace) {
            printk(LOG_NOTE, "setting GUEST_CR3 to 0x%lx\n", dcb->vspace);
            err = vmwrite(VMX_GUEST_CR3, dcb->vspace);
            assert(err_is_ok(err));
        }
        /*
        printk(LOG_NOTE, "doing INVEPT\n");
        uint64_t invept_desc[2] = { 0 };
        invept_desc[0] = eptp_root;
        uint64_t mode = 1;
        __asm volatile("invept %[desc], %[mode]"
                :
                : [mode] "r" (mode), [desc] "m" (invept_desc)
                : "memory");
        */
        //printf("EPT tables:\n");
        //dump_page_tables(eptp_root & ~BASE_PAGE_MASK);
        /*
        printf("GUEST tables:\n");
        dump_page_tables(dcb->guest_desc.vspace);
        */
        //print_vmcs_info(ctrl);
    } else {
        // without EPT the guest runs directly on the DCB's page table
        err = vmwrite(VMX_GUEST_CR3, dcb->vspace);
        assert(err_is_ok(err));
    }

vmx_vmenter_loop:

    enter_guest();

    //printk(LOG_NOTE, "VMEXIT\n");

    // NOTE(review): vmread() is handed a uint64_t* that aliases a 16-bit
    // object; if vmread stores a full 64-bit value this clobbers 6 bytes of
    // adjacent stack — verify vmread's store width, or read into a uint64_t
    // temporary and truncate.
    uint16_t exit_reason;
    err = vmread(VMX_EXIT_REASON, (uint64_t *)&exit_reason);

    //printk(LOG_NOTE, "vmx exit reason: %u\n", exit_reason);

    switch(exit_reason) {
    case VMX_EXIT_REASON_INVAL_VMCS:
      {
          // A guest-state condition may come to violate one of the processor's
          // VM-entry checks during guest execution. With the Linux guest we
          // used, the GS limit is set to 0x10ffef, which causes one of the
          // checks to fail; clamp it to a value that passes and re-enter.
          uint64_t gs_lim;
          err += vmread(VMX_GUEST_GS_LIM, &gs_lim);
          assert(gs_lim == 0x10ffef);
          err += vmwrite(VMX_GUEST_GS_LIM, 0xfffef);
          assert(err_is_ok(err));
      }
      goto vmx_vmenter_loop;

    case VMX_EXIT_REASON_EXCEPTION:
      {
          uint64_t intr_info, type;
          err += vmread(VMX_EXIT_INTR_INFO, &intr_info);
          assert(err_is_ok(err));

          type = interruption_type(intr_info);

          // non-NMI exceptions go to the monitor (call_monitor never returns,
          // so the break below is unreachable)
          if (type != TYPE_NMI) {
              //printk(LOG_NOTE, "REASON: EXCEPTION, type: %lu, vec: %lu\n",
              //       type, intr_info & 0xF);
              call_monitor(dcb);
              break;
          }
      }
      /* fallthrough: NMI exits are handled like external interrupts below */
    case VMX_EXIT_REASON_EXT_INTR:
    case VMX_EXIT_REASON_SMI:
      {
          ctrl->num_vm_exits_without_monitor_invocation++;

#ifdef CONFIG_ARRAKISMON
          //printf("EXIT_REASON: INTR || SMI\n");
          uint64_t guest_rip, guest_rsp, guest_rflags;
          err += vmread(VMX_GUEST_RIP, &guest_rip);
          err += vmread(VMX_GUEST_RSP, &guest_rsp);
          err += vmread(VMX_GUEST_RFLAGS, &guest_rflags);

          uint64_t guest_fs_sel, guest_gs_sel;
          err += vmread(VMX_GUEST_FS_SEL, &guest_fs_sel);
          err += vmread(VMX_GUEST_GS_SEL, &guest_gs_sel);
          assert(err_is_ok(err));

          arch_registers_state_t *area = NULL;

          // Store user state into corresponding save area
          if(dispatcher_is_disabled_ip(dcb->disp, guest_rip)) {
              area = dispatcher_get_disabled_save_area(dcb->disp);
              dcb->disabled = true;
          } else {
              area = dispatcher_get_enabled_save_area(dcb->disp);
              dcb->disabled = false;
          }
          // copy GPRs from the guest-control frame, then overlay the fields
          // that live in the VMCS rather than in ctrl->regs
          memcpy(area, &ctrl->regs, sizeof(arch_registers_state_t));
          area->rip = guest_rip;
          area->rax = ctrl->regs.rax;
          area->rsp = guest_rsp;
          area->eflags = guest_rflags;
          area->fs = guest_fs_sel;
          area->gs = guest_gs_sel;
#endif
          wait_for_interrupt();
      }
      break;
#ifdef CONFIG_ARRAKISMON
    case VMX_EXIT_REASON_VMCALL:
      {
          // Translate this to a SYSCALL
          struct registers_x86_64 *regs = &ctrl->regs;
          uint64_t args[10] = {
              regs->r10, regs->r8, regs->r9, regs->r12, regs->r13, regs->r14,
              regs->r15, regs->rax, regs->rbp, regs->rbx
          };

          //printf("VMMCALL: %lu %lx %lx\n", regs->rdi, regs->rsi, regs->rdx);

          uint64_t guest_rip, guest_rsp, guest_rflags, instr_len;
          err += vmread(VMX_GUEST_RIP, &guest_rip);
          err += vmread(VMX_GUEST_RSP, &guest_rsp);
          err += vmread(VMX_GUEST_RFLAGS, &guest_rflags);
          // Advance guest RIP to next instruction
          err += vmread(VMX_EXIT_INSTR_LEN, &instr_len);
          assert(err_is_ok(err));
          err += vmwrite(VMX_GUEST_RIP, guest_rip + instr_len);
          assert(err_is_ok(err));

          user_stack_save = guest_rsp;

          //printf("doing VMMCALL: %lu %lx %lx\n", regs->rdi, regs->rsi, regs->rdx);

          struct sysret ret = sys_vmcall(regs->rdi, regs->rsi, regs->rdx,
              args, guest_rflags, guest_rip + instr_len, &dcb->cspace.cap);

          //printf("VMMCALL done\n");

          // return values are delivered to the guest in RAX/RDX
          regs->rax = ret.error;
          regs->rdx = ret.value;
      }
      goto vmx_vmenter_loop;
#endif
    default:
        //printk(LOG_NOTE, "EXIT_REASON: %d\n", exit_reason);
#if 0
        if (exit_reason == VMX_EXIT_REASON_EPT_FAULT) {
            uint64_t fault_addr, guest_rip, exit_qual;
            err = vmread(VMX_GPADDR_F, &fault_addr);
            err+= vmread(VMX_GUEST_RIP, &guest_rip);
            err+= vmread(VMX_EXIT_QUAL, &exit_qual);
            //err+= vmread(VMX_PF_ERR_MATCH
            assert(err_is_ok(err));
            printk(LOG_NOTE, "exit qualification: 0x%lx\n", exit_qual);
            printk(LOG_NOTE, "guest page fault on 0x%lx, IP 0x%lx\n",
                   fault_addr, guest_rip);
            paging_dump_tables(dcb);
        }
#endif
        // everything else is handled by the monitor (does not return)
        call_monitor(dcb);
        break;
    }
}