/**
 * \file
 */

/*
 * Copyright (c) 2009, 2010, 2013, ETH Zurich.
 * All rights reserved.
 *
 * This file is distributed under the terms in the attached LICENSE file.
 * If you do not find this file, copies can be found by writing to:
 * ETH Zurich D-INFK, Universitaetstrasse 6, CH-8092 Zurich. Attn: Systems Group.
 */

#include <stdlib.h>
#include <string.h>
#include "vmkitmon.h"
#include <barrelfish/barrelfish.h>
#include <barrelfish/lmp_endpoints.h>
#include <barrelfish/lmp_chan.h>
#include <barrelfish/dispatcher_arch.h>
#include <barrelfish/memobj.h>
#include <barrelfish/vregion.h>
#include <barrelfish/vspace.h>

#include "x86.h"
#ifdef CONFIG_SVM
#include "svm.h"
#endif
#include "paging.h"
//#include "realmode.h"
#include "hdd.h"
#include "console.h"
#include "pc16550d.h"
#include "apic.h"
#include "lpc.h"
#include "pci.h"
#include "pci_host.h"

#define ARRAKIS_USE_NESTED_PAGING
//#define EPT_FINE_GRAINED

#define VMCB_SIZE       0x1000  // 4KB
#ifdef CONFIG_SVM
#define IOPM_SIZE       0x3000  // 12KB
#define MSRPM_SIZE      0x2000  // 8KB
#else
#define IOBMP_A_SIZE    0x1000  // 4KB
#define IOBMP_B_SIZE    0x1000  // 4KB
#define MSRPM_SIZE      0x1000  // 4KB
#endif
#define RM_MEM_SIZE     (0x100000 + BASE_PAGE_SIZE)  // 1MB + A20 gate space

#define APIC_BASE 0xfee00000

#define VREGION_FLAGS_ALL (VREGION_FLAGS_READ_WRITE | VREGION_FLAGS_EXECUTE)

// list of guests
struct guest *guests = NULL;

static paging_x86_64_flags_t vregion_to_pmap_flag(vregion_flags_t vregion_flags)
{
    paging_x86_64_flags_t pmap_flags =
        PTABLE_USER_SUPERVISOR | PTABLE_EXECUTE_DISABLE;

    if (!(vregion_flags & VREGION_FLAGS_GUARD)) {
        if (vregion_flags & VREGION_FLAGS_WRITE) {
            pmap_flags |= PTABLE_READ_WRITE;
        }
        if (vregion_flags & VREGION_FLAGS_EXECUTE) {
            pmap_flags &= ~PTABLE_EXECUTE_DISABLE;
        }
        if (vregion_flags & VREGION_FLAGS_NOCACHE) {
            pmap_flags |= PTABLE_CACHE_DISABLED;
        }
    }

    return pmap_flags;
}

#ifndef CONFIG_SVM
extern uint16_t saved_exit_reason;
extern uint64_t saved_exit_qual, saved_rip;

// List of MSRs that are saved on VM-exit and loaded on VM-entry.
static uint32_t msr_list[VMX_MSR_COUNT] =
    {X86_MSR_KERNEL_GS_BASE, X86_MSR_STAR, X86_MSR_LSTAR, X86_MSR_CSTAR, X86_MSR_SFMASK};

// Saved priority of the most recent irq that is asserted.
uint8_t interrupt_priority = 0;
#endif

#ifndef CONFIG_SVM
static inline int vmx_guest_msr_index(uint32_t msr_index)
{
    for (int i = 0; i < VMX_MSR_COUNT; i++) {
        if (msr_list[i] == msr_index) {
            return i;
        }
    }
    return -1;
}

__attribute__((unused))
static void initialize_guest_msr_area(struct guest *g)
{
    struct msr_entry *guest_msr_area = (struct msr_entry *)g->msr_area_va;

    // The values of the MSRs in the guest MSR area are all set to 0.
    for (int i = 0; i < VMX_MSR_COUNT; i++) {
        guest_msr_area[i].index = msr_list[i];
        guest_msr_area[i].val = 0x0;
    }

    errval_t err = invoke_dispatcher_vmwrite(g->dcb_cap, VMX_EXIT_MSR_STORE_F, g->msr_area_pa);
    err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_EXIT_MSR_STORE_CNT, VMX_MSR_COUNT);
    err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_ENTRY_MSR_LOAD_F, g->msr_area_pa);
    err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_ENTRY_MSR_LOAD_CNT, VMX_MSR_COUNT);
    assert(err_is_ok(err));
}
#endif
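
#if 0
/* Illustrative sketch (not compiled): after a VM-exit, the MSR values the
 * CPU saved to the store area set up by initialize_guest_msr_area() above
 * could be read back like this. read_saved_guest_msr() is a hypothetical
 * helper, not part of this monitor. */
static uint64_t read_saved_guest_msr(struct guest *g, uint32_t msr)
{
    int idx = vmx_guest_msr_index(msr);
    assert(idx != -1);
    struct msr_entry *area = (struct msr_entry *)g->msr_area_va;
    return area[idx].val;
}
#endif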

lvaddr_t guest_offset = 0;

/// stores the last used guest ASID
static uint32_t last_guest_asid = 0;

// FIXME: this is somewhat broken by design... we should emit proper exceptions
// to the guest as opposed to just halting the VM
#define guest_assert(g, e) \
    ((e) ? (void)0 : (handle_vmexit_unhandeled(g), assert(e)))

static errval_t
guest_slot_alloc(struct guest *g, struct capref *ret)
{
    return g->slot_alloc.a.alloc(&g->slot_alloc.a, ret);
}

errval_t guest_vspace_map_wrapper(struct vspace *vspace, lvaddr_t vaddr,
                                  struct capref frame, size_t size)
{
    debug_printf("%s:%d\n", __FUNCTION__, __LINE__);
    errval_t err;
    struct vregion *vregion = NULL;
    struct memobj_one_frame *memobj = NULL;

    debug_printf("%s:%d\n", __FUNCTION__, __LINE__);
    // Allocate space
    vregion = malloc(sizeof(struct vregion));
    if (!vregion) {
        err = LIB_ERR_MALLOC_FAIL;
        goto error;
    }
    debug_printf("%s:%d\n", __FUNCTION__, __LINE__);
    memobj = malloc(sizeof(struct memobj_one_frame));
    if (!memobj) {
        err = LIB_ERR_MALLOC_FAIL;
        goto error;
    }

    debug_printf("%s:%d\n", __FUNCTION__, __LINE__);
    // Create the objects
    err = memobj_create_one_frame(memobj, size, 0);
    if (err_is_fail(err)) {
        err = err_push(err, LIB_ERR_MEMOBJ_CREATE_ONE_FRAME);
        goto error;
    }
    debug_printf("%s:%d\n", __FUNCTION__, __LINE__);
    err = memobj->m.f.fill(&memobj->m, 0, frame, size);
    if (err_is_fail(err)) {
        err = err_push(err, LIB_ERR_MEMOBJ_FILL);
        goto error;
    }
    debug_printf("%s:%d\n", __FUNCTION__, __LINE__);
    debug_printf("mapping guest vregion (%p) in guest vspace (%p) at 0x%lx, size 0x%lx\n",
                 vregion, vspace, vaddr, size);
    debug_printf("current regions in guest vspace:\n");
    for (struct vregion *v = vspace->head; v; v = v->next) {
        debug_printf(" 0x%lx, 0x%lx\n", v->base, v->size);
    }
    err = vregion_map_fixed(vregion, vspace, &memobj->m, 0, size, vaddr,
                            VREGION_FLAGS_READ | VREGION_FLAGS_WRITE | VREGION_FLAGS_EXECUTE);
    if (err_is_fail(err)) {
        err = err_push(err, LIB_ERR_VSPACE_MAP);
        goto error;
    }
    debug_printf("%s:%d\n", __FUNCTION__, __LINE__);
    err = memobj->m.f.pagefault(&memobj->m, vregion, 0, 0);
    if (err_is_fail(err)) {
        err = err_push(err, LIB_ERR_MEMOBJ_PAGEFAULT_HANDLER);
        goto error;
    }
    debug_printf("mapped %zu bytes at 0x%"PRIxGENVADDR"\n", size, vaddr);

    return SYS_ERR_OK;

error: // XXX: proper cleanup
    if (vregion) {
        free(vregion);
    }
    if (memobj) {
        free(memobj);
    }
    return err;
}

#define GUEST_VSPACE_SIZE (1ULL<<39) // 512 GB
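
/* vspace_map_wrapper() below lazily reserves a single GUEST_VSPACE_SIZE
 * anonymous region in the monitor's own vspace and records its base in
 * guest_offset; guest_to_host() (assumed to translate via guest_offset, see
 * vmkitmon.h) then turns guest-physical addresses into monitor-virtual
 * addresses inside this window. */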

static errval_t vspace_map_wrapper(lvaddr_t vaddr, struct capref frame,
                                   size_t size)
{
    debug_printf("%s:%d\n", __FUNCTION__, __LINE__);
    errval_t err;
    static struct memobj_anon *memobj = NULL;
    static struct vregion *vregion = NULL;
    static bool initialized = false;

    debug_printf("%s:%d\n", __FUNCTION__, __LINE__);
    if (!initialized) {
        debug_printf("%s:%d\n", __FUNCTION__, __LINE__);
        // Allocate space
        memobj = malloc(sizeof(struct memobj_anon));
        if (!memobj) {
            return LIB_ERR_MALLOC_FAIL;
        }
        debug_printf("%s:%d\n", __FUNCTION__, __LINE__);
        vregion = malloc(sizeof(struct vregion));
        if (!vregion) {
            return LIB_ERR_MALLOC_FAIL;
        }

        debug_printf("%s:%d\n", __FUNCTION__, __LINE__);
        // Create a memobj and vregion
        err = memobj_create_anon(memobj, GUEST_VSPACE_SIZE, 0);
        if (err_is_fail(err)) {
            return err_push(err, LIB_ERR_MEMOBJ_CREATE_ANON);
        }
        debug_printf("%s:%d\n", __FUNCTION__, __LINE__);
        err = vregion_map(vregion, get_current_vspace(), &memobj->m, 0,
                          GUEST_VSPACE_SIZE, VREGION_FLAGS_READ_WRITE);
        if (err_is_fail(err)) {
            return err_push(err, LIB_ERR_VREGION_MAP);
        }

        debug_printf("%s:%d\n", __FUNCTION__, __LINE__);
        guest_offset = vregion_get_base_addr(vregion);
        debug_printf("guest_offset = 0x%lx\n", guest_offset);
        initialized = true;
    }

    debug_printf("%s:%d\n", __FUNCTION__, __LINE__);
    // Create mapping
    err = memobj->m.f.fill(&memobj->m, vaddr, frame, size);
    if (err_is_fail(err)) {
        return err_push(err, LIB_ERR_MEMOBJ_FILL);
    }
    debug_printf("%s:%d\n", __FUNCTION__, __LINE__);
    err = memobj->m.f.pagefault(&memobj->m, vregion, vaddr, 0);
    if (err_is_fail(err)) {
        return err_push(err, LIB_ERR_MEMOBJ_PAGEFAULT_HANDLER);
    }

    return SYS_ERR_OK;
}

// allocates some bytes of memory for the guest starting at a specific addr
// also performs the mapping into the vspace of the monitor
errval_t
alloc_guest_mem(struct guest *g, lvaddr_t guest_paddr, size_t bytes)
{
    errval_t err;

    // only allow multiples of the page size to be allocated
    assert(bytes > 0 && (bytes & BASE_PAGE_MASK) == 0);
    // do not allow allocation outside of the guest's physical memory
    assert(guest_paddr + bytes <= g->mem_high_va);

    debug_printf("%s:%d\n", __FUNCTION__, __LINE__);
    // Allocate frame
    struct capref cap;
    err = guest_slot_alloc(g, &cap);
    if (err_is_fail(err)) {
        return err_push(err, LIB_ERR_SLOT_ALLOC);
    }
    debug_printf("%s:%d\n", __FUNCTION__, __LINE__);
    err = frame_create(cap, bytes, NULL);
    if (err_is_fail(err)) {
        return err_push(err, LIB_ERR_FRAME_CREATE);
    }

    debug_printf("%s:%d\n", __FUNCTION__, __LINE__);
    // Map into the guest vspace
    err = guest_vspace_map_wrapper(g->vspace, guest_paddr, cap, bytes);
    if (err_is_fail(err)) {
        return err;
    }

    debug_printf("%s:%d\n", __FUNCTION__, __LINE__);
    // Create a copy of the capability to map in our vspace
    struct capref host_cap;
    err = slot_alloc(&host_cap);
    if (err_is_fail(err)) {
        return err;
    }
    err = cap_copy(host_cap, cap);
    if (err_is_fail(err)) {
        return err;
    }

    debug_printf("%s:%d\n", __FUNCTION__, __LINE__);
    // Map into my vspace
    debug_printf("mapping into our vspace at 0x%lx\n", guest_to_host(guest_paddr));
    err = vspace_map_wrapper(guest_to_host(guest_paddr), host_cap, bytes);
    if (err_is_fail(err)) {
        return err;
    }

    debug_printf("%s:%d\n", __FUNCTION__, __LINE__);
    struct frame_identity frameid = { .base = 0, .bytes = 0 };
    errval_t r = frame_identify(cap, &frameid);
    assert(err_is_ok(r));
    debug_printf("alloc_guest_mem: frameid.base: 0x%lx, frameid.bytes: %zu,"
                 " g->mem_low_va: 0x%lx, g->mem_high_va: 0x%lx\n",
                 frameid.base, frameid.bytes, g->mem_low_va, g->mem_high_va);

    return SYS_ERR_OK;
}
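
#if 0
/* Usage sketch (hypothetical caller, not compiled): backing the guest's low
 * memory with RAM. RM_MEM_SIZE is the 1MB real-mode area plus the A20 page
 * defined above; real callers size allocations from the guest image. */
static void example_populate_low_mem(struct guest *g)
{
    errval_t err = alloc_guest_mem(g, 0x0, RM_MEM_SIZE);
    assert(err_is_ok(err));
}
#endif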

static void
initialize_iopm (struct guest *self) {
    // intercept all IO port accesses (for now)
#ifdef CONFIG_SVM
    memset((void*)self->iopm_va, 0xFF, IOPM_SIZE);
#else
    memset((void*)self->iobmp_a_va, 0xFF, IOBMP_A_SIZE);
    memset((void*)self->iobmp_b_va, 0xFF, IOBMP_B_SIZE);
#endif
}

// access_mode: 0 all access, 1 read intercept, 2 write intercept, 3 all interc.
static inline void
set_msr_access (struct guest *g, uint32_t msr, int access_mode)
{
    assert(access_mode >= 0 && access_mode <= 3);

    // a region of 2K bytes represents the access bits of 8K MSRs, therefore
    // each MSR takes two bits (one for rdmsr and one for wrmsr)
    uintptr_t byte_offset = (msr & 0xffff) / 4;
    int bit_offset = ((msr & 0xffff) % 4) * 2;

    if (msr < 0x2000) {
        // do nothing
    } else if (msr >= 0xc0000000 && msr < 0xc0002000) {
        byte_offset += 0x800;
    } else if (msr >= 0xc0010000 && msr < 0xc0012000) {
        byte_offset += 0x1000;
    } else {
        assert(!"not reached");
    }

    assert(byte_offset < MSRPM_SIZE);

    // read the byte holding the relevant bits
    uint8_t val = *(uint8_t *)(g->msrpm_va + byte_offset);
    // set the access params according to the arguments
    val = (val & ~(0x3 << bit_offset)) | (access_mode << bit_offset);
    // store the modified value back in the map
    *(uint8_t *)(g->msrpm_va + byte_offset) = val;

    //printf("MSR: msr %x, byte_offset %lx, bit_offset %x, val %x\n", msr, byte_offset, bit_offset, val);
}
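
/* Worked example for the MSRPM indexing above, assuming the usual SVM
 * permission-map layout: for MSR 0xc0000080 (EFER),
 *   msr & 0xffff = 0x0080
 *   byte_offset  = 0x0080 / 4 + 0x800 = 0x820   (0xc0000000 MSR block)
 *   bit_offset   = (0x0080 % 4) * 2   = 0
 * so bits 1:0 of byte 0x820 carry EFER's rdmsr/wrmsr intercept bits. */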

static void
initialize_msrpm (struct guest *g) {
    // intercept all MSR accesses (for now)
    memset((void*)g->msrpm_va, 0xff, MSRPM_SIZE);

#if 0
    // allow performance counters and events MSR accesses
    set_msr_access (g, 0xc0010000, 0);
    set_msr_access (g, 0xc0010001, 0);
    set_msr_access (g, 0xc0010002, 0);
    set_msr_access (g, 0xc0010003, 0);
    set_msr_access (g, 0xc0010004, 0);
    set_msr_access (g, 0xc0010005, 0);
    set_msr_access (g, 0xc0010006, 0);
    set_msr_access (g, 0xc0010007, 0);
#endif
}

#define INIT_DATA_SEGREG(vmcb,x)                 \
do {                                             \
    amd_vmcb_seg_attrib_t __sa = {               \
        .segtype = 2,                            \
        .s = 1,                                  \
        .dpl = 0,                                \
        .p = 1,                                  \
        .l = 0,                                  \
        .db = 1,                                 \
        .g = 1,                                  \
    };                                           \
    amd_vmcb_##x##_attrib_wr((vmcb), __sa);      \
    amd_vmcb_##x##_selector_wr((vmcb), 0x10);    \
    amd_vmcb_##x##_base_wr((vmcb), 0x0);         \
    amd_vmcb_##x##_limit_wr((vmcb), 0xffffffff); \
} while (0)

#define INIT_CODE_SEGREG(vmcb,x)                 \
do {                                             \
    amd_vmcb_seg_attrib_t __sa = {               \
        .segtype = 0xa,                          \
        .s = 1,                                  \
        .dpl = 0,                                \
        .p = 1,                                  \
        .l = 1,                                  \
        .db = 0,                                 \
        .g = 1,                                  \
    };                                           \
    amd_vmcb_##x##_attrib_wr((vmcb), __sa);      \
    amd_vmcb_##x##_selector_wr((vmcb), 8);       \
    amd_vmcb_##x##_base_wr((vmcb), 0x0);         \
    amd_vmcb_##x##_limit_wr((vmcb), 0xffffffff); \
} while (0)

#define INIT_SYS_SEGREG(vmcb,x)                  \
do {                                             \
    amd_vmcb_seg_attrib_t __sa = {               \
        .segtype = 2,                            \
        .s = 1,                                  \
        .dpl = 0,                                \
        .p = 1,                                  \
        .l = 0,                                  \
        .db = 1,                                 \
        .g = 1,                                  \
    };                                           \
    amd_vmcb_##x##_attrib_wr((vmcb), __sa);      \
    amd_vmcb_##x##_selector_wr((vmcb), 0x10);    \
    amd_vmcb_##x##_base_wr((vmcb), 0x0);         \
    amd_vmcb_##x##_limit_wr((vmcb), 0xffffffff); \
} while (0)

#ifdef CONFIG_SVM
/* This method initializes a new VMCB memory region and sets the initial
 * machine state as defined by the AMD64 architecture specification */
static void
initialize_vmcb (struct guest *self) {
    amd_vmcb_initialize(&self->vmcb, (mackerel_addr_t)self->vmcb_va);

    // 1. Initialize intercepts

    /* For now we intercept just everything */

    amd_vmcb_cr_access_wr_raw(&self->vmcb, ~0u);
    amd_vmcb_cr_access_rdcr2_wrf(&self->vmcb, 0);
    amd_vmcb_cr_access_wrcr2_wrf(&self->vmcb, 0);
    amd_vmcb_cr_access_rdcr4_wrf(&self->vmcb, 0);
    amd_vmcb_cr_access_wrcr4_wrf(&self->vmcb, 0);

    // FIXME: ignoring DR accesses may be insecure
    //amd_vmcb_dr_access_wr_raw(&self->vmcb, ~0u);
    amd_vmcb_exceptions_wr_raw(&self->vmcb, 0);
    /* amd_vmcb_exceptions_vector7_wrf(&self->vmcb, 0); */
    /* amd_vmcb_exceptions_vector14_wrf(&self->vmcb, 0); */

    amd_vmcb_intercepts_wr_raw(&self->vmcb, 0x1fffffffffff);
    amd_vmcb_intercepts_pushf_wrf(&self->vmcb, 0);
    amd_vmcb_intercepts_popf_wrf(&self->vmcb, 0);
    amd_vmcb_intercepts_invlpg_wrf(&self->vmcb, 0);
    amd_vmcb_intercepts_rdtsc_wrf(&self->vmcb, 0);
    amd_vmcb_intercepts_rdtscp_wrf(&self->vmcb, 0);
    amd_vmcb_intercepts_iret_wrf(&self->vmcb, 0);
    amd_vmcb_intercepts_wbinvd_wrf(&self->vmcb, 0);
    amd_vmcb_intercepts_pause_wrf(&self->vmcb, 0);
    amd_vmcb_intercepts_vintr_wrf(&self->vmcb, 0);

    // 2. Setup some config fields

    // physical addresses of the IOPM and the MSRPM
    amd_vmcb_iopm_base_pa_wr(&self->vmcb, self->iopm_pa);
    amd_vmcb_msrpm_base_pa_wr(&self->vmcb, self->msrpm_pa);
    // assign guest ASID
    // FIXME: use real asid allocator. BF does not know about tagged TLBs atm
    amd_vmcb_tlb_guest_asid_wrf(&self->vmcb, ++last_guest_asid);
    // enable virtual intr masking
    amd_vmcb_vintr_vintr_masking_wrf(&self->vmcb, 1);
    // enable nested paging
    amd_vmcb_np_enable_wrf(&self->vmcb, 1);

    /* 3. Guest state initialization
     * according to Intel's Manual 3A: Table 9-1. */

    // Bit 1 of RFLAGS is reserved and must be set; setting the ID flag
    // (bit 21) indicates that we support the CPUID instruction.
    amd_vmcb_rflags_wr_raw(&self->vmcb, 0x00200002);
    amd_vmcb_rip_wr(&self->vmcb, 0x0000fff0);
    amd_vmcb_cr0_wr_raw(&self->vmcb, 0x60000010);

    INIT_CODE_SEGREG(&self->vmcb, cs);
    INIT_DATA_SEGREG(&self->vmcb, ss);
    INIT_DATA_SEGREG(&self->vmcb, ds);
    INIT_DATA_SEGREG(&self->vmcb, es);
    INIT_DATA_SEGREG(&self->vmcb, fs);
    INIT_DATA_SEGREG(&self->vmcb, gs);

    INIT_SYS_SEGREG(&self->vmcb, gdtr);
    INIT_SYS_SEGREG(&self->vmcb, idtr);
    INIT_SYS_SEGREG(&self->vmcb, ldtr);
    INIT_SYS_SEGREG(&self->vmcb, tr);

    amd_vmcb_dr6_wr(&self->vmcb, 0xffff0ff0);
    amd_vmcb_dr7_wr(&self->vmcb, 0x00000400);

    // taken from the linux SVM source
    amd_vmcb_gpat_wr(&self->vmcb, 0x0007040600070406ul);

    // svm requires guest EFER.SVME to be set
    amd_vmcb_efer_svme_wrf(&self->vmcb, 1);
}

#endif

#ifdef EPT_FINE_GRAINED
static
errval_t ept_map_one_frame_fixed_attr(struct guest *g, lvaddr_t addr, size_t size,
                                      struct capref frame, vregion_flags_t flags,
                                      struct memobj **retmemobj,
                                      struct vregion **retvregion)
{
    errval_t err1, err2;
    struct memobj *memobj = NULL;
    struct vregion *vregion = NULL;

    size = ROUND_UP(size, BASE_PAGE_SIZE);

    // Allocate space
    memobj = malloc(sizeof(struct memobj_one_frame));
    if (!memobj) {
        err1 = LIB_ERR_MALLOC_FAIL;
        goto error;
    }
    vregion = malloc(sizeof(struct vregion));
    if (!vregion) {
        err1 = LIB_ERR_MALLOC_FAIL;
        goto error;
    }

    // Create mappings
    err1 = memobj_create_one_frame((struct memobj_one_frame*)memobj, size, 0);
    if (err_is_fail(err1)) {
        err1 = err_push(err1, LIB_ERR_MEMOBJ_CREATE_ONE_FRAME);
        goto error;
    }

    err1 = memobj->f.fill(memobj, 0, frame, size);
    if (err_is_fail(err1)) {
        err1 = err_push(err1, LIB_ERR_MEMOBJ_FILL);
        goto error;
    }

    err1 = vregion_map_fixed(vregion, g->vspace, memobj, 0, size, addr, flags);
    if (err_is_fail(err1)) {
        err1 = err_push(err1, LIB_ERR_VREGION_MAP);
        goto error;
    }

    err1 = memobj->f.pagefault(memobj, vregion, 0, 0);
    if (err_is_fail(err1)) {
        err1 = err_push(err1, LIB_ERR_MEMOBJ_PAGEFAULT_HANDLER);
        goto error;
    }

    if (retmemobj) {
        *retmemobj = memobj;
    }
    if (retvregion) {
        *retvregion = vregion;
    }
    return SYS_ERR_OK;

 error:
    DEBUG_ERR(err1, "in %s", __FUNCTION__);
    if (memobj) {
        err2 = memobj_destroy_one_frame(memobj);
        if (err_is_fail(err2)) {
            DEBUG_ERR(err2, "memobj_destroy_one_frame failed");
        }
    }
    if (vregion) {
        err2 = vregion_destroy(vregion);
        if (err_is_fail(err2)) {
            DEBUG_ERR(err2, "vregion_destroy failed");
        }
    }
    return err1;
}

static void ept_map(struct guest *g, struct capref cap)
{
    errval_t err;
    struct capref ept_copy;
    err = guest_slot_alloc(g, &ept_copy);
    assert(err_is_ok(err));
    err = cap_copy(ept_copy, cap);
    assert(err_is_ok(err));

    struct frame_identity fi;
    err = frame_identify(ept_copy, &fi);
    assert(err_is_ok(err));

    printf("%s: creating identity mapping for 0x%"PRIxGENPADDR", %lu bytes\n",
           __FUNCTION__, fi.base, fi.bytes);

    err = ept_map_one_frame_fixed_attr(g, fi.base, fi.bytes,
            ept_copy, VREGION_FLAGS_READ_WRITE | VREGION_FLAGS_EXECUTE,
            NULL, NULL);
    assert(err_is_ok(err));
}
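
// Recursively identity-map a page-table (vnode) hierarchy into the guest's
// EPT, so the guest can reach its own page tables at their physical
// addresses.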
static void ept_map_vnode(struct guest *g, struct vnode *v)
{
    assert(v->v.is_vnode);

    ept_map(g, v->v.cap);

    for (int i = 0; i < PTABLE_SIZE; i++) {
        struct vnode *c = v->u.vnode.children[i];
        if (c && c->v.is_vnode) {
            ept_map_vnode(g, c);
        }
    }
}
#endif

static void
idc_handler(void *arg)
{
    struct guest *g = arg;
    errval_t err;

    // consume message
    struct lmp_recv_buf buf = { .buflen = 0 };
    err = lmp_endpoint_recv(g->monitor_ep, &buf, NULL);
    assert(err_is_ok(err));

    // run real handler
    guest_handle_vmexit(g);

    // re-register
    struct event_closure cl = {
        .handler = idc_handler,
        .arg = arg,
    };
    err = lmp_endpoint_register(g->monitor_ep, get_default_waitset(), cl);
    assert(err_is_ok(err));
}


extern errval_t vspace_add_vregion(struct vspace *vspace, struct vregion *region);
errval_t get_pdpt(struct pmap_x86 *pmap, genvaddr_t base,
                  struct vnode **pdpt);
errval_t get_pdir(struct pmap_x86 *pmap, genvaddr_t base,
                  struct vnode **pdir);
extern errval_t get_ptable(struct pmap_x86 *pmap, genvaddr_t base,
                           struct vnode **ptable);

static void ept_force_mapping(struct guest *g, struct capref mem)
{
    errval_t err;
    struct frame_identity fi;

    // get info about memory
    err = frame_identify(mem, &fi);
    if (err_is_fail(err)) {
        DEBUG_ERR(err, "id mem cap\n");
    }
    assert(err_is_ok(err));

    printf("%s: creating identity mapping for 0x%"PRIxGENPADDR", %lu bytes\n",
           __FUNCTION__, fi.base, fi.bytes);

    // mark off region in vspace
    struct vregion *v = malloc(sizeof(*v));
    assert(v != NULL);
    v->base = fi.base;
    v->size = fi.bytes;
    err = vspace_add_vregion(g->vspace, v);
    assert(err_is_ok(err));

    struct pmap_x86 *pmap = (struct pmap_x86 *)vspace_get_pmap(g->vspace);
    struct vnode *pt;
    paging_x86_64_flags_t pmap_flags = vregion_to_pmap_flag(VREGION_FLAGS_ALL);
    size_t npages = 0;

    /* XXX: properly keep track of mappings */
    struct capref mapping;
    err = slot_alloc(&mapping);
    assert(err_is_ok(err));

    if (fi.bytes >= X86_64_HUGE_PAGE_SIZE && fi.bytes % X86_64_HUGE_PAGE_SIZE == 0) {
        // do huge page mappings
        // get pdpt through pmap
        err = get_pdpt(pmap, v->base, &pt);
        assert(err_is_ok(err));
        assert(pt->v.is_vnode);
        npages = v->size / HUGE_PAGE_SIZE;
        printf(" %zu 1G pages\n", npages);
        assert(npages <= 512);
        err = vnode_map(pt->v.cap, mem, X86_64_PDPT_BASE(v->base),
                        pmap_flags, 0, npages, mapping);
    } else if (fi.bytes >= X86_64_LARGE_PAGE_SIZE && fi.bytes % X86_64_LARGE_PAGE_SIZE == 0) {
        // do large page mappings
        err = get_pdir(pmap, v->base, &pt);
        assert(err_is_ok(err));
        assert(pt->v.is_vnode);
        npages = v->size / LARGE_PAGE_SIZE;
        printf(" %zu 2M pages\n", npages);
        assert(npages < 512);
        err = vnode_map(pt->v.cap, mem, X86_64_PDIR_BASE(v->base),
                        pmap_flags, 0, npages, mapping);
    } else {
        // get leaf pt through pmap
        err = get_ptable(pmap, v->base, &pt);
        assert(err_is_ok(err));
        npages = v->size / BASE_PAGE_SIZE;
        printf(" %zu 4k pages\n", npages);
        // should never be a full ptable
        assert(npages < 512);
        err = vnode_map(pt->v.cap, mem, X86_64_PTABLE_BASE(v->base),
                        pmap_flags, 0, npages, mapping);
    }
    assert(err_is_ok(err));
}
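
// Handler for the hyper interface's npt_map request: a guest domain asks the
// monitor to identity-map 'mem' into its nested page tables. b->st carries
// the dispatcher frame address that identifies the calling guest.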
void npt_map_handler(struct hyper_binding *b, struct capref mem)
{
    uint64_t dispframe = (uint64_t)b->st;
    struct guest *g;
    for (g = guests; g; g = g->next) {
        if (g->dispframe == dispframe) {
            break;
        }
    }
    if (g == NULL) {
        b->tx_vtbl.npt_map_response(b, NOP_CONT, ARRA_ERR_GUEST_NOT_FOUND);
        return;
    }

    ept_force_mapping(g, mem);
    b->tx_vtbl.npt_map_response(b, NOP_CONT, SYS_ERR_OK);
}

#if defined(ARRAKIS_USE_NESTED_PAGING) && !defined(EPT_FINE_GRAINED)
static void ept_setup_low512g(struct guest *g)
{
    errval_t err;
    struct pmap_x86 *pmap = (struct pmap_x86 *)vspace_get_pmap(g->vspace);
    struct vnode *vn;
    // get first pdpt (512g)
    err = get_pdpt(pmap, 0, &vn);
    assert(err_is_ok(err));
    union x86_64_ptable_entry *pt;
    struct capref ept_copy;
    err = slot_alloc(&ept_copy);
    err += cap_copy(ept_copy, vn->v.cap);
    err += vspace_map_one_frame_attr((void**)&pt, BASE_PAGE_SIZE, ept_copy,
                                     VREGION_FLAGS_READ_WRITE, NULL, NULL);
    assert(err_is_ok(err));
    genvaddr_t base = 0;
    for (int i = 0; i < PTABLE_SIZE; i++) {
        union x86_64_ptable_entry tmp;
        tmp.raw = X86_64_PTABLE_CLEAR;

        tmp.huge.present = 1;
        tmp.huge.read_write = 1;
        tmp.huge.user_supervisor = 1;
        tmp.huge.always1 = 1;
        tmp.huge.base_addr = base >> X86_64_HUGE_PAGE_BITS;

        // EPT memory type: write-back caching (bits 5:3 = 6)
        tmp.raw |= (0x6 << 3);
        // set accessed and dirty to avoid extra memory refs --> same as dune
        // cf. dune's kern/ept.c:454--459.
        tmp.raw |= (0x3 << 8);

        pt[i] = tmp;

        base += HUGE_PAGE_SIZE;
    }
}
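
/* The loop above leaves the guest's first PDPT identity-mapping
 * guest-physical 0..512GB with 1GB pages, write-back memory type and the
 * accessed/dirty bits pre-set, so the whole guest-physical address space is
 * reachable before any npt_map requests arrive. */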
#endif

/* This method duplicates some code from spawndomain since we need to spawn
 * very special domains */
void
spawn_guest_domain (struct guest *g, struct spawninfo *si)
{
    errval_t err;
    struct capref ept_pml4_cap;

#ifdef ARRAKIS_USE_NESTED_PAGING
    g->vspace = malloc(sizeof(*(g->vspace)));
    assert(g->vspace);
    err = guest_slot_alloc(g, &ept_pml4_cap);
    assert(err_is_ok(err));
    err = vnode_create(ept_pml4_cap, ObjType_VNode_x86_64_pml4);
    assert(err_is_ok(err));

    struct pmap *pmap = malloc(sizeof(struct pmap_x86));
    assert(pmap);
    err = pmap_x86_64_init(pmap, g->vspace, ept_pml4_cap, NULL);
    assert(err_is_ok(err));
    err = vspace_init(g->vspace, pmap);
    assert(err_is_ok(err));

#ifdef EPT_FINE_GRAINED
    // populate the guest physical address space
    // regions for binary
    for (struct vregion *v = si->vspace->head; v; v = v->next) {
        printf("memobj type: %d\n", v->memobj->type);
        switch (v->memobj->type) {
            case ANONYMOUS:
            {
                struct memobj_anon *m = (struct memobj_anon *)v->memobj;
                for (struct memobj_frame_list *f = m->frame_list; f; f = f->next) {
                    ept_map(g, f->frame);
                }
                break;
            }
            case ONE_FRAME:
            {
                struct memobj_one_frame *m = (struct memobj_one_frame *)v->memobj;
                ept_map(g, m->frame);
                break;
            }
            default:
                debug_printf("need to implement handling for memobj type %d\n",
                             v->memobj->type);
                break;
        }
    }
    // page tables: go through si pmap and setup identity mappings for all
    // ptables
    struct pmap_x86 *si_pmap = (struct pmap_x86 *)vspace_get_pmap(si->vspace);
    ept_map_vnode(g, &si_pmap->root);

    // map frames in basecn so we get some headstart before having to talk to
    // arrakis.hyper
    struct capref basecn_cap = {
        .cnode = si->rootcn,
        .slot  = ROOTCN_SLOT_BASE_PAGE_CN,
    };
    struct cnoderef si_basecn = build_cnoderef(basecn_cap, DEFAULT_CNODE_BITS);
    for (int i = 0; i < DEFAULT_CNODE_SLOTS; i++) {
        struct capref mem = {
            .cnode = si_basecn,
            .slot  = i,
        };
        // cannot retype basecn ram caps to frames here, as this would break
        // the ability of the guest domain to retype them later on, so we
        // force insert the mappings here
        ept_force_mapping(g, mem);
    }
#else
    // 1g pages for 1:1 ept
    ept_setup_low512g(g);
#endif
#else
    ept_pml4_cap = si->vtree;
    // set guest's vspace to vspace we created when loading binary
    g->vspace = si->vspace;
#endif

    // create end point
    struct capref ep_cap;

    // use minimum-sized endpoint, because we don't need to buffer >1 vmexit
    err = endpoint_create(LMP_RECV_LENGTH, &ep_cap, &g->monitor_ep);
    assert(err_is_ok(err));

    // register to receive on this endpoint
    struct event_closure cl = {
        .handler = idc_handler,
        .arg = g,
    };
    err = lmp_endpoint_register(g->monitor_ep, get_default_waitset(), cl);
    assert(err_is_ok(err));

    // setup the DCB; need to copy cap here as si->dcb will be destroyed when
    // spawning process is complete!
    err = slot_alloc(&g->dcb_cap);
    assert(err_is_ok(err));
    err = cap_copy(g->dcb_cap, si->dcb);
    assert(err_is_ok(err));

    // set the guest's name and remember its dispatcher frame address
    strncpy(g->name, si->name, G_NAME_LEN);
    g->name[G_NAME_LEN-1] = 0;

    struct frame_identity fi;
    err = frame_identify(si->dispframe, &fi);
    assert(err_is_ok(err));
    g->dispframe = fi.base;

    err = invoke_dispatcher_setup_guest(g->dcb_cap, ep_cap, ept_pml4_cap,
                                        g->vmcb_cap, g->ctrl_cap);
    if (err_is_fail(err)) {
        DEBUG_ERR(err, "guest setup");
    }
    assert(err_is_ok(err));

    err = invoke_dispatcher(si->dcb, cap_dispatcher, si->rootcn_cap,
                            si->vtree, si->dispframe, false);
    assert(err_is_ok(err));

    // Setup virtual machine
    arch_registers_state_t *regs =
        dispatcher_get_disabled_save_area(si->handle);
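    // Seed the virtual CPU with the register state prepared for an ordinary
    // dispatcher: on SVM it is written into the VMCB directly, on VMX into
    // the VMCS via vmwrite invocations on the DCB capability.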
#ifdef CONFIG_SVM
    amd_vmcb_rax_wr(&g->vmcb, regs->rax);
    memcpy(&g->ctrl->regs, regs, sizeof(arch_registers_state_t));
    amd_vmcb_rsp_wr(&g->vmcb, regs->rsp);
    amd_vmcb_rip_wr(&g->vmcb, regs->rip);
    amd_vmcb_rflags_wr_raw(&g->vmcb, regs->eflags);

    // Enable long mode
    amd_vmcb_cr0_pe_wrf(&g->vmcb, 1);
    amd_vmcb_cr0_pg_wrf(&g->vmcb, 1);
    amd_vmcb_cr4_pae_wrf(&g->vmcb, 1);
    amd_vmcb_efer_lme_wrf(&g->vmcb, 1);
    amd_vmcb_efer_lma_wrf(&g->vmcb, 1);

    // More "default" settings
    amd_vmcb_cr4_mce_wrf(&g->vmcb, 1);
    amd_vmcb_cr4_pge_wrf(&g->vmcb, 1);
    amd_vmcb_cr4_pce_wrf(&g->vmcb, 1);
    amd_vmcb_cr4_osfxsr_wrf(&g->vmcb, 1);
    amd_vmcb_efer_sce_wrf(&g->vmcb, 1);
    amd_vmcb_efer_nxe_wrf(&g->vmcb, 1);

    // disable GDTR intercept
    amd_vmcb_intercepts_rdgdtr_wrf(&g->vmcb, 0);
    amd_vmcb_intercepts_wrgdtr_wrf(&g->vmcb, 0);
    // disable LDTR intercept
    amd_vmcb_intercepts_rdldtr_wrf(&g->vmcb, 0);
    amd_vmcb_intercepts_wrldtr_wrf(&g->vmcb, 0);
    // disable IDTR intercept
    amd_vmcb_intercepts_rdidtr_wrf(&g->vmcb, 0);
    amd_vmcb_intercepts_wridtr_wrf(&g->vmcb, 0);
    // disable TR intercept
    amd_vmcb_intercepts_rdtr_wrf(&g->vmcb, 0);
    amd_vmcb_intercepts_wrtr_wrf(&g->vmcb, 0);
    // disable non-essential CR0 access intercepts
    amd_vmcb_cr_access_rdcr0_wrf(&g->vmcb, 0);
    amd_vmcb_cr_access_wrcr0_wrf(&g->vmcb, 0);
    // disable CR3 access intercepts
    amd_vmcb_cr_access_rdcr3_wrf(&g->vmcb, 0);
    amd_vmcb_cr_access_wrcr3_wrf(&g->vmcb, 0);
    // disable INTn intercept
    /* amd_vmcb_intercepts_intn_wrf(&g->vmcb, 0); */

    // Disable nested paging
    amd_vmcb_np_enable_wrf(&g->vmcb, 0);
#else
    err = 0;
    err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_IOBMP_A_F, g->iobmp_a_pa);
    err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_IOBMP_B_F, g->iobmp_b_pa);
    err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_MSRBMP_F, g->msrpm_pa);
    err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_VPID, ++last_guest_asid);

    memcpy(&g->ctrl->regs, regs, sizeof(arch_registers_state_t));
    err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_GUEST_RSP, regs->rsp);
    err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_GUEST_RIP, regs->rip);
    err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_GUEST_RFLAGS, regs->eflags);
    assert(err_is_ok(err));
    debug_printf("guest domain vpid %d init ip: %lx, sp: %lx\n",
                 last_guest_asid, regs->rip, regs->rsp);
#endif
    for (int i = 0; i < si->vregions; i++) {
        printf("vregion %d: base = %" PRIxGENVADDR ", region = %" PRIxGENVADDR "\n",
               i, si->base[i], vregion_get_base_addr(si->vregion[i]));
    }
}
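
#if 0
/* Usage sketch (hypothetical caller, not compiled): combining guest_create()
 * and spawn_guest_domain() to bring up a guest; 'si' must already describe
 * a loaded image. */
static void example_start_guest(struct spawninfo *si)
{
    struct guest *g = guest_create();
    assert(g != NULL);
    spawn_guest_domain(g, si);
    errval_t err = guest_make_runnable(g, true);
    assert(err_is_ok(err));
}
#endif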

#if 0
static void
install_grub_stage2 (struct guest *g, void *img, size_t img_size)
{
    assert(img != NULL);

    /* the grub image goes to 0x8000 according to
     * http://www.gnu.org/software/grub/manual/html_node/Memory-map.html */
    memcpy((void *)(guest_to_host(g->mem_low_va + 0x8000)), img, img_size);
    // according to the grub stage2 source its entry point is at 0x8200
    amd_vmcb_rip_wr(&g->vmcb, 0x8200);
    // switch to the first segment
    amd_vmcb_cs_selector_wr(&g->vmcb, 0x0);
    amd_vmcb_cs_base_wr(&g->vmcb, 0x0);
    amd_vmcb_cs_limit_wr(&g->vmcb, 0xffff);
}
#endif

#if 0
static void
install_debug_app (struct guest *g)
{
    //static uint8_t app[] = { 0xcd, 0x20 };
    static uint8_t app[] = { 0xcd, 0x20, 0x90, 0x90, 0x90, 0x90, 0x90 };
    /* static uint8_t app[] = { 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90 }; */
    memcpy((void *)(guest_to_host(g->mem_low_va + 0xf000)), app, sizeof(app));
    amd_vmcb_rip_wr(&g->vmcb, 0xf000);
    amd_vmcb_rsp_wr(&g->vmcb, 0x10000);

#if 0
    static uint8_t gdt[] = {
        0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
        0xff,0xff,0x00,0x00,0x00,0x9a,0xaf,0x00, // 64bit code segment, D _cleared_ => "16bit"
        0xff,0xff,0x00,0x00,0x00,0x92,0xcf,0x00, // data
        0xff,0xff,0x00,0x00,0x00,0x9a,0xcf,0x00, // 32bit code segment for protected-mode
        0xff,0xff,0x00,0x80,0x0b,0x92,0xff,0x00, // screen
        0xff,0xff,0x00,0x60,0x00,0x9a,0xcf,0x00, // segment at linear address 0x6000
        0xff,0xff,0x00,0x00,0x00,0x92,0xaf,0x00  // stack segment in 64bit mode
    };
    memcpy((void *)(guest_to_host(g->mem_low_va)), gdt, sizeof(gdt));

    amd_vmcb_gdtr_base_wr(&g->vmcb, 0);
    amd_vmcb_gdtr_limit_wr(&g->vmcb, sizeof(gdt));
#endif

    // disable nested paging in real mode
    /* amd_vmcb_np_enable_wrf(&g->vmcb, 1); */
    // enable paged real mode
    //amd_vmcb_cr0_pg_wrf(&g->vmcb, 0x1);
    //g->save_area->cr0 |= X86_CR0_PE_MASK;

#if 0
    // Write page table
    static uint64_t pml4[512] = {
        0xffffffffffffffffUL,
        0x00002007,
        0
    };
    memcpy((void *)(guest_to_host(g->mem_low_va + 0x1000)), pml4, sizeof(pml4));

    static uint64_t pdpt[512] = {
        0x00003007,
        0
    };
    memcpy((void *)(guest_to_host(g->mem_low_va + 0x2000)), pdpt, sizeof(pdpt));

    static uint64_t pdir[512] = {
        0x00000087,
        0x00200087,
        0x00400087,
        0x00600087,
        0x00800087,
        0x00a00087,
        0x00c00087,
        0x00e00087,
        0x01000087,
        0
    };
    memcpy((void *)(guest_to_host(g->mem_low_va + 0x3000)), pdir, sizeof(pdir));

    amd_vmcb_cr3_wr(&g->vmcb, 0x1000);
#endif

    // Enable long mode
    amd_vmcb_cr0_pe_wrf(&g->vmcb, 1);
    amd_vmcb_cr0_pg_wrf(&g->vmcb, 1);
    amd_vmcb_cr4_pae_wrf(&g->vmcb, 1);
    amd_vmcb_efer_lme_wrf(&g->vmcb, 1);
    amd_vmcb_efer_lma_wrf(&g->vmcb, 1);

    // Disable nested paging
    amd_vmcb_np_enable_wrf(&g->vmcb, 0);

    /* amd_vmcb_cs_selector_wr(&g->vmcb, 0x0); */
    /* amd_vmcb_cs_base_wr(&g->vmcb, 0x0); */
    /* amd_vmcb_cs_limit_wr(&g->vmcb, 0xfffff); */
    /* amd_vmcb_cs_attrib_wr(&g->vmcb, */
    //g->save_area->cs.selector = 0x1000;
    //g->save_area->cs.base = 0x10000;
    //g->save_area->cs.base = 0x1ffff;
}
#endif

static bool
virq_pending (void *ud, uint8_t *irq, uint8_t *irq_prio)
{
    assert(ud != NULL);

    struct guest *g = ud;
#ifdef CONFIG_SVM
    if (amd_vmcb_vintr_rd(&g->vmcb).virq == 1) {
#else
    uint64_t info;
    errval_t err = invoke_dispatcher_vmread(g->dcb_cap, VMX_ENTRY_INTR_INFO, &info);
    assert(err_is_ok(err));
    if (!!(info & (1UL << 31))) {
#endif
        if (irq != NULL) {
#ifdef CONFIG_SVM
            *irq = amd_vmcb_vintr_rd(&g->vmcb).vintr_vector;
#else
            *irq = info & 0xff;
#endif
        }
        if (irq_prio != NULL) {
#ifdef CONFIG_SVM
            *irq_prio = amd_vmcb_vintr_rd(&g->vmcb).vintr_prio;
#else
            *irq_prio = interrupt_priority;
#endif
        }
        return true;
    } else {
        return false;
    }
}

#ifndef CONFIG_SVM
static bool
virq_accepting (void *ud)
{
    assert(ud != NULL);

    struct guest *g = ud;

    uint64_t guest_rflags;
    errval_t err = invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_RFLAGS, &guest_rflags);
    assert(err_is_ok(err));
    // IF (bit 9) set means the guest currently accepts interrupts
    return (guest_rflags & (1UL << 9));
}
#endif

static void
virq_handler (void *ud, uint8_t irq, uint8_t irq_prio)
{
    assert(ud != NULL);

    struct guest *g = ud;

    // tell the hw extensions that there is a virtual IRQ pending
#ifdef CONFIG_SVM
    amd_vmcb_vintr_virq_wrf(&g->vmcb, 1);
    amd_vmcb_vintr_vintr_prio_wrf(&g->vmcb, irq_prio);
    amd_vmcb_vintr_vintr_vector_wrf(&g->vmcb, irq);
    amd_vmcb_vintr_v_ign_tpr_wrf(&g->vmcb, 1);
#else
    uint64_t guest_rflags;
    errval_t err = invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_RFLAGS, &guest_rflags);
    assert(guest_rflags & (1UL << 9));

    uint64_t info = (0 << 8 /*HWINTR*/) | (1UL << 31 /*INTR VALID*/) | irq;
    err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_ENTRY_INTR_INFO, info);

    err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_GUEST_ACTIV_STATE, 0x0);
    err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_GUEST_INTR_STATE, 0x0);
    assert(err_is_ok(err));

    interrupt_priority = irq_prio;
#endif
    // if the guest is currently waiting then we have to restart it to make
    // forward progress
    if (!g->runnable) {
        g->runnable = true;
        guest_make_runnable(g, true);
    }
}
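
// One-time construction of a guest's runtime structures: slot allocator,
// VMCB/guest-control frames, I/O and MSR permission bitmaps, and the
// emulated interrupt hardware (LPC and APIC).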
static void
guest_setup (struct guest *g)
{
    errval_t err;

    // initialize the guest's slot_allocator
    err = two_level_slot_alloc_init(&g->slot_alloc);
    assert_err(err, "two_level_slot_alloc_init");

    struct frame_identity fi;

    // allocate memory for the vmcb
    err = guest_slot_alloc(g, &g->vmcb_cap);
    assert_err(err, "guest_slot_alloc");
    err = frame_create(g->vmcb_cap, VMCB_SIZE, NULL);
    assert_err(err, "frame_create");
    err = frame_identify(g->vmcb_cap, &fi);
    assert_err(err, "frame_identify");
    g->vmcb_pa = fi.base;
    err = vspace_map_one_frame_attr((void**)&g->vmcb_va, VMCB_SIZE, g->vmcb_cap,
                                    VREGION_FLAGS_READ_WRITE_NOCACHE,
                                    NULL, NULL);
    if (err_is_fail(err)) {
        DEBUG_ERR(err, "vspace_map_one_frame_attr failed");
    }

    // guest control
    err = frame_alloc(&g->ctrl_cap, sizeof(struct guest_control), NULL);
    assert_err(err, "frame_alloc");
    size_t size = ROUND_UP(sizeof(struct guest_control), BASE_PAGE_SIZE);
    err = vspace_map_one_frame_attr((void**)&g->ctrl, size, g->ctrl_cap,
                                    VREGION_FLAGS_READ_WRITE_NOCACHE,
                                    NULL, NULL);
    if (err_is_fail(err)) {
        DEBUG_ERR(err, "vspace_map_one_frame_attr failed");
    }
    g->ctrl->num_vm_exits_with_monitor_invocation = 0;
    g->ctrl->num_vm_exits_without_monitor_invocation = 0;

#ifdef CONFIG_SVM
    // allocate memory for the iopm
    err = frame_alloc(&g->iopm_cap, IOPM_SIZE, NULL);
    assert_err(err, "frame_alloc");
    err = frame_identify(g->iopm_cap, &fi);
    assert_err(err, "frame_identify");
    g->iopm_pa = fi.base;
    err = vspace_map_one_frame_attr((void**)&g->iopm_va, IOPM_SIZE, g->iopm_cap,
                                    VREGION_FLAGS_READ_WRITE_NOCACHE,
                                    NULL, NULL);
    if (err_is_fail(err)) {
        DEBUG_ERR(err, "vspace_map_one_frame_attr failed");
    }
#else
    // allocate memory for I/O bitmap A
    err = frame_alloc(&g->iobmp_a_cap, IOBMP_A_SIZE, NULL);
    assert_err(err, "frame_alloc");
    err = frame_identify(g->iobmp_a_cap, &fi);
    assert_err(err, "frame_identify");
    g->iobmp_a_pa = fi.base;
    err = vspace_map_one_frame_attr((void**)&g->iobmp_a_va, IOBMP_A_SIZE, g->iobmp_a_cap,
                                    VREGION_FLAGS_READ_WRITE_NOCACHE,
                                    NULL, NULL);
    if (err_is_fail(err)) {
        DEBUG_ERR(err, "vspace_map_one_frame_attr failed");
    }

    // allocate memory for I/O bitmap B
    err = frame_alloc(&g->iobmp_b_cap, IOBMP_B_SIZE, NULL);
    assert_err(err, "frame_alloc");
    err = frame_identify(g->iobmp_b_cap, &fi);
    assert_err(err, "frame_identify");
    g->iobmp_b_pa = fi.base;
    err = vspace_map_one_frame_attr((void**)&g->iobmp_b_va, IOBMP_B_SIZE, g->iobmp_b_cap,
                                    VREGION_FLAGS_READ_WRITE_NOCACHE,
                                    NULL, NULL);
    if (err_is_fail(err)) {
        DEBUG_ERR(err, "vspace_map_one_frame_attr failed");
    }

    // allocate memory for the guest MSR store/load area
    err = frame_alloc(&g->msr_area_cap, VMX_MSR_AREA_SIZE, NULL);
    assert_err(err, "frame_alloc");
    err = frame_identify(g->msr_area_cap, &fi);
    assert_err(err, "frame_identify");
    g->msr_area_pa = fi.base;
    err = vspace_map_one_frame_attr((void**)&g->msr_area_va, VMX_MSR_AREA_SIZE,
                                    g->msr_area_cap,
                                    VREGION_FLAGS_READ_WRITE_NOCACHE,
                                    NULL, NULL);
    if (err_is_fail(err)) {
        DEBUG_ERR(err, "vspace_map_one_frame_attr failed");
    }
#endif
    // allocate memory for the msrpm
    err = frame_alloc(&g->msrpm_cap, MSRPM_SIZE, NULL);
    assert_err(err, "frame_alloc");
    err = frame_identify(g->msrpm_cap, &fi);
    assert_err(err, "frame_identify");
    g->msrpm_pa = fi.base;
    err = vspace_map_one_frame_attr((void**)&g->msrpm_va, MSRPM_SIZE,
                                    g->msrpm_cap,
                                    VREGION_FLAGS_READ_WRITE_NOCACHE,
                                    NULL, NULL);
    if (err_is_fail(err)) {
        DEBUG_ERR(err, "vspace_map_one_frame_attr failed");
    }

    // initialize the allocated structures
    initialize_iopm(g);
    initialize_msrpm(g);
#ifdef CONFIG_SVM
    initialize_vmcb(g);
#endif
    // spawn the guest domain
    /* spawn_guest_domain(g); */
    /* assert (grub_image != NULL); */
    //install_grub_stage2(g, grub_image, grub_image_size);
    /* install_debug_app(g); */

    // add virtual hardware
    g->apic = apic_new(APIC_BASE);
    g->lpc = lpc_new(virq_handler, virq_pending,
#ifndef CONFIG_SVM
                     virq_accepting,
#endif
                     g, g->apic);
#if 0
    if (hdd0_image != NULL) {
        g->hdds[0] = hdd_new_from_memory(hdd0_image, hdd0_image_size);
        g->hdd_count++;
    }
    g->console = console_new();
    g->serial_ports[0] = pc16550d_new(0x3f8, 4, g->lpc);
    pc16550d_attach_to_console(g->serial_ports[0]);
    g->serial_ports[1] = pc16550d_new(0x2f8, 3, g->lpc);
    g->serial_ports[2] = pc16550d_new(0x3e8, 4, g->lpc);
    g->serial_ports[3] = pc16550d_new(0x2e8, 3, g->lpc);
    g->serial_port_count = 4;

    g->pci = pci_new();
    init_host_devices(g->pci);

    // set up bios memory
    // FIXME: find a modular way to do this
    *(uint16_t *)guest_to_host(g->mem_low_va + 0x400) = 0x3f8;  // COM1
    *(uint16_t *)guest_to_host(g->mem_low_va + 0x402) = 0x2f8;  // COM2
#endif

    g->runnable = true;
}

/**
 * \brief Create a new guest.
 *
 * This function creates a new guest. It will do everything necessary to make
 * the guest accept images to run. It will create a new domain and assign some
 * memory to that domain; loading an image and starting the guest are done
 * separately via spawn_guest_domain() and guest_make_runnable().
 *
 * \return The pointer to the newly created structure describing the guest.
 */
struct guest *
guest_create (void)
{
    struct guest *newguest = malloc(sizeof(struct guest));
    assert(newguest != NULL);
    memset(newguest, 0, sizeof(struct guest));
    guest_setup(newguest);
    // insert in list
    newguest->next = guests;
    guests = newguest;
    return newguest;
}

#if 0
static int
run_realmode (struct guest *g)
{
    int r;

    realmode_switch_to(g);
    r = realmode_exec();
    assert(r == REALMODE_ERR_OK);
    realmode_switch_from(g);

    guest_handle_vmexit(g);

    return 0;
};
#endif

#ifndef CONFIG_SVM
// Return true if the "Enable EPT" Secondary Processor-based control is
// set in the VMCS, else false.
static inline bool vmx_ept_enabled(struct guest *g)
{
    uint64_t sp_controls;
    errval_t err = invoke_dispatcher_vmread(g->dcb_cap, VMX_EXEC_SEC_PROC, &sp_controls);
    assert(err_is_ok(err));
    return ((sp_controls & SP_CLTS_ENABLE_EPT) != 0);
}

// Set or clear the "Descriptor-table exiting" Secondary Processor-based
// control if val is 1 or 0, respectively.
static inline void vmx_intercept_desc_table_wrf(struct guest *g, int val)
{
    assert(val == 0 || val == 1);

    uint64_t sec_proc_ctrls;
    errval_t err = invoke_dispatcher_vmread(g->dcb_cap, VMX_EXEC_SEC_PROC, &sec_proc_ctrls);
    if (val) {
        uint64_t prim_proc_ctrls;
        err += invoke_dispatcher_vmread(g->dcb_cap, VMX_EXEC_PRIM_PROC, &prim_proc_ctrls);
        assert(prim_proc_ctrls & PP_CLTS_SEC_CTLS);
        err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_EXEC_SEC_PROC,
                                         sec_proc_ctrls | SP_CLTS_DESC_TABLE);
    } else {
        err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_EXEC_SEC_PROC,
                                         sec_proc_ctrls & ~SP_CLTS_DESC_TABLE);
    }
    assert(err_is_ok(err));
}


// Before entering the guest, synchronize the CR0 shadow with the guest
// CR0 value that is potentially changed in the real-mode emulator.
static inline void vmx_set_cr0_shadow(struct guest *g)
{
    uint64_t cr0_shadow;
    errval_t err = invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_CR0, &cr0_shadow);
    err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_CR0_RD_SHADOW, cr0_shadow);
    assert(err_is_ok(err));
}
#endif

/**
 * \brief Marks a guest as runnable.
 *
 * A call to this method will update the guest's runnable state and, if made
 * runnable, yield the remaining time slice to the guest domain.
 *
 * \return SYS_ERR_OK on success, an error value otherwise.
 */
errval_t
guest_make_runnable (struct guest *g, bool run)
{
    assert(g->runnable);

    errval_t err;

#if 0
    /* If the guest is currently in real mode (CR0.PE flag clear) then we do
     * not schedule the domain for virtualized execution but run the
     * real-mode emulation instead */
#ifdef CONFIG_SVM
    if (UNLIKELY(run && amd_vmcb_cr0_rd(&g->vmcb).pe == 0)) {
        if (!g->emulated_before_exit) {
            // do the inverse of the code below
            amd_vmcb_intercepts_rdgdtr_wrf(&g->vmcb, 1);
            amd_vmcb_intercepts_wrgdtr_wrf(&g->vmcb, 1);
            amd_vmcb_intercepts_rdldtr_wrf(&g->vmcb, 1);
            amd_vmcb_intercepts_wrldtr_wrf(&g->vmcb, 1);
            amd_vmcb_intercepts_rdidtr_wrf(&g->vmcb, 1);
            amd_vmcb_intercepts_wridtr_wrf(&g->vmcb, 1);
            amd_vmcb_intercepts_rdtr_wrf(&g->vmcb, 1);
            amd_vmcb_intercepts_wrtr_wrf(&g->vmcb, 1);
            amd_vmcb_cr_access_rdcr0_wrf(&g->vmcb, 1);
            amd_vmcb_cr_access_wrcr0_wrf(&g->vmcb, 1);
            amd_vmcb_cr_access_rdcr3_wrf(&g->vmcb, 1);
            amd_vmcb_cr_access_wrcr3_wrf(&g->vmcb, 1);
            amd_vmcb_intercepts_intn_wrf(&g->vmcb, 1);

            // mark guest as emulated
            g->emulated_before_exit = true;
        }
#else
    uint64_t guest_cr0;
    err = invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_CR0, &guest_cr0);
    assert(err_is_ok(err));
    if (UNLIKELY(run && (guest_cr0 & CR0_PE) == 0)) {
        if (!g->emulated_before_exit) {
            vmx_intercept_desc_table_wrf(g, 1);
            g->emulated_before_exit = true;
        }
#endif
#if 0 /* why create a thread for this? it seems fine without! -AB */
        struct thread *t = thread_create((thread_func_t)run_realmode, g);
        assert(t != NULL);
        err = thread_detach(t);
        assert(err_is_ok(err));
#else
        run_realmode(g);
#endif
        return SYS_ERR_OK;
    }

    /* every time we move the machine from the emulated to virtualized we need
     * to adjust some intercepts */
    if (UNLIKELY(run && g->emulated_before_exit)) {
#ifdef CONFIG_SVM
        // we enforce NP to be enabled (no shadow paging support)
        assert(amd_vmcb_np_rd(&g->vmcb).enable == 1);

        // disable GDTR intercept
        amd_vmcb_intercepts_rdgdtr_wrf(&g->vmcb, 0);
        amd_vmcb_intercepts_wrgdtr_wrf(&g->vmcb, 0);
        // disable LDTR intercept
        amd_vmcb_intercepts_rdldtr_wrf(&g->vmcb, 0);
        amd_vmcb_intercepts_wrldtr_wrf(&g->vmcb, 0);
        // disable IDTR intercept
        amd_vmcb_intercepts_rdidtr_wrf(&g->vmcb, 0);
        amd_vmcb_intercepts_wridtr_wrf(&g->vmcb, 0);
        // disable TR intercept
        amd_vmcb_intercepts_rdtr_wrf(&g->vmcb, 0);
        amd_vmcb_intercepts_wrtr_wrf(&g->vmcb, 0);
        // disable non-essential CR0 access intercepts
        amd_vmcb_cr_access_rdcr0_wrf(&g->vmcb, 0);
        amd_vmcb_cr_access_wrcr0_wrf(&g->vmcb, 0);
        // disable CR3 access intercepts
        assert(amd_vmcb_np_rd(&g->vmcb).enable != 0);
        amd_vmcb_cr_access_rdcr3_wrf(&g->vmcb, 0);
        amd_vmcb_cr_access_wrcr3_wrf(&g->vmcb, 0);
        // disable INTn intercept
        // we have to be outside of real mode for this to work
        assert(amd_vmcb_cr0_rd(&g->vmcb).pe != 0);
        amd_vmcb_intercepts_intn_wrf(&g->vmcb, 0);
#else
        bool ept_enabled = vmx_ept_enabled(g);
        assert(ept_enabled);
        vmx_intercept_desc_table_wrf(g, 0);
        assert(guest_cr0 & CR0_PE);
        vmx_set_cr0_shadow(g);
#endif
        // mark guest as not emulated
        g->emulated_before_exit = false;
    }
#endif

    // update the guest domain's runnable state
    err = invoke_dispatcher(g->dcb_cap, NULL_CAP, NULL_CAP, NULL_CAP, NULL_CAP, run);
    assert_err(err, "dispatcher_make_runnable");
    // yield the dispatcher
    if (run) {
        thread_yield_dispatcher(NULL_CAP);
    }

    return SYS_ERR_OK;
}

/* VMEXIT handlers */

#define HANDLER_ERR_OK    (0)
#define HANDLER_ERR_FATAL (-1)
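
// Convention for the handlers below: returning HANDLER_ERR_OK lets the
// guest resume, HANDLER_ERR_FATAL (after dumping the guest state) does not.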

#ifdef CONFIG_SVM
static int
handle_vmexit_unhandeled (struct guest *g)
{
    printf("Unhandled guest vmexit:\n");
    printf(" code:\t %lx\n", amd_vmcb_exitcode_rd(&g->vmcb));
    printf(" info1:\t %lx\n", amd_vmcb_exitinfo1_rd(&g->vmcb));
    printf(" info2:\t %lx\n", amd_vmcb_exitinfo2_rd(&g->vmcb));
    printf(" intinfo: %lx\n", amd_vmcb_exitintinfo_rd(&g->vmcb));

    printf("VMCB save area:\n");
    printf(" cr0:\t%lx\n", amd_vmcb_cr0_rd_raw(&g->vmcb));
    printf(" cr2:\t%lx\n", amd_vmcb_cr2_rd_raw(&g->vmcb));
    printf(" cr3:\t%lx\n", amd_vmcb_cr3_rd_raw(&g->vmcb));
    printf(" cr4:\t%lx\n", amd_vmcb_cr4_rd_raw(&g->vmcb));
    printf(" efer:\t%lx\n", amd_vmcb_efer_rd_raw(&g->vmcb));
    printf(" rip:\t%lx\n", amd_vmcb_rip_rd_raw(&g->vmcb));
    printf(" rsp:\t%lx\n", amd_vmcb_rsp_rd_raw(&g->vmcb));
    printf(" cs:\tselector %x, base %lx, limit %x, attrib %x\n",
           amd_vmcb_cs_selector_rd(&g->vmcb), amd_vmcb_cs_base_rd(&g->vmcb),
           amd_vmcb_cs_limit_rd(&g->vmcb), amd_vmcb_cs_attrib_rd_raw(&g->vmcb));
    printf(" ds:\tselector %x, base %lx, limit %x, attrib %x\n",
           amd_vmcb_ds_selector_rd(&g->vmcb), amd_vmcb_ds_base_rd(&g->vmcb),
           amd_vmcb_ds_limit_rd(&g->vmcb), amd_vmcb_ds_attrib_rd_raw(&g->vmcb));
    printf(" es:\tselector %x, base %lx, limit %x, attrib %x\n",
           amd_vmcb_es_selector_rd(&g->vmcb), amd_vmcb_es_base_rd(&g->vmcb),
           amd_vmcb_es_limit_rd(&g->vmcb), amd_vmcb_es_attrib_rd_raw(&g->vmcb));
    printf(" ss:\tselector %x, base %lx, limit %x, attrib %x\n",
           amd_vmcb_ss_selector_rd(&g->vmcb), amd_vmcb_ss_base_rd(&g->vmcb),
           amd_vmcb_ss_limit_rd(&g->vmcb), amd_vmcb_ss_attrib_rd_raw(&g->vmcb));
    printf(" rax:\t%lx\n", amd_vmcb_rax_rd_raw(&g->vmcb));
    printf(" rbx:\t%lx\n", g->ctrl->regs.rbx);
    printf(" rcx:\t%lx\n", g->ctrl->regs.rcx);
    printf(" rdx:\t%lx\n", g->ctrl->regs.rdx);
    printf(" rsi:\t%lx\n", g->ctrl->regs.rsi);
    printf(" rdi:\t%lx\n", g->ctrl->regs.rdi);

    return HANDLER_ERR_FATAL;
}
#else
static int
handle_vmexit_unhandeled (struct guest *g)
{
    printf("Unhandled guest vmexit:\n");
    printf(" exit reason:\t %"PRIu16"\n", saved_exit_reason);
    printf(" exit qualification:\t %"PRIx64"\n", saved_exit_qual);
    printf(" next rip (I/O instruction):\t %"PRIx64"\n", saved_rip);

    uint64_t gpaddr;
    errval_t err = invoke_dispatcher_vmread(g->dcb_cap, VMX_GPADDR_F, &gpaddr);
    printf(" guest physical-address:\t %"PRIx64"\n", gpaddr);

    uint64_t guest_cr0, guest_cr3, guest_cr4;
    err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_CR0, &guest_cr0);
    err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_CR3, &guest_cr3);
    err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_CR4, &guest_cr4);

    uint64_t guest_efer, guest_rip;
    err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_EFER_F, &guest_efer);
    err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_RIP, &guest_rip);

    uint64_t guest_cs_sel, guest_cs_base, guest_cs_lim, guest_cs_access;
    err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_CS_SEL, &guest_cs_sel);
    err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_CS_BASE, &guest_cs_base);
    err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_CS_LIM, &guest_cs_lim);
    err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_CS_ACCESS, &guest_cs_access);

    uint64_t guest_ds_sel, guest_ds_base, guest_ds_lim, guest_ds_access;
    err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_DS_SEL, &guest_ds_sel);
    err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_DS_BASE, &guest_ds_base);
    err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_DS_LIM, &guest_ds_lim);
    err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_DS_ACCESS, &guest_ds_access);

    uint64_t guest_es_sel, guest_es_base, guest_es_lim, guest_es_access;
    err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_ES_SEL, &guest_es_sel);
    err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_ES_BASE, &guest_es_base);
    err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_ES_LIM, &guest_es_lim);
    err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_ES_ACCESS, &guest_es_access);

    uint64_t guest_ss_sel, guest_ss_base, guest_ss_lim, guest_ss_access;
    err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_SS_SEL, &guest_ss_sel);
    err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_SS_BASE, &guest_ss_base);
    err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_SS_LIM, &guest_ss_lim);
    err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_SS_ACCESS, &guest_ss_access);
    assert(err_is_ok(err));

    printf("VMCS save area:\n");
    printf(" cr0:\t%lx\n", guest_cr0);
    printf(" cr3:\t%lx\n", guest_cr3);
    printf(" cr4:\t%lx\n", guest_cr4);
    printf(" efer:\t%lx\n", guest_efer);
    printf(" rip:\t%lx\n", guest_rip);
    printf(" cs:\tselector %lx, base %lx, limit %lx, access %lx\n",
           guest_cs_sel, guest_cs_base, guest_cs_lim, guest_cs_access);
    printf(" ds:\tselector %lx, base %lx, limit %lx, access %lx\n",
           guest_ds_sel, guest_ds_base, guest_ds_lim, guest_ds_access);
    printf(" es:\tselector %lx, base %lx, limit %lx, access %lx\n",
           guest_es_sel, guest_es_base, guest_es_lim, guest_es_access);
    printf(" ss:\tselector %lx, base %lx, limit %lx, access %lx\n",
           guest_ss_sel, guest_ss_base, guest_ss_lim, guest_ss_access);
    printf(" rax:\t%lx\n", g->ctrl->regs.rax);
    printf(" rbx:\t%lx\n", g->ctrl->regs.rbx);
    printf(" rcx:\t%lx\n", g->ctrl->regs.rcx);
    printf(" rdx:\t%lx\n", g->ctrl->regs.rdx);
    printf(" rsi:\t%lx\n", g->ctrl->regs.rsi);
    printf(" rdi:\t%lx\n", g->ctrl->regs.rdi);

    return HANDLER_ERR_FATAL;
}
#endif
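
// Software walk of the guest's long-mode page tables, read through the
// guest-physical memory window (guest_to_host), translating a guest-virtual
// address to guest-physical; handles 1GB, 2MB and 4KB mappings.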
    uint64_t guest_cr3;
    err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_CR3, &guest_cr3);
    assert(err_is_ok(err));
    page_table = (uint32_t *)guest_to_host(guest_cr3);
#endif

    // get pd entry
    union x86_legm_pd_entry pde = { .raw = page_table[va.u.pd_idx] };
    assert(pde.u.p == 1);
    // check for 4MB page (PS bit set)
    if (pde.u.ps == 1) {
        return (pde.u4mb.base_pa << 22) | va.u4mb.pa_offset;
    }

    // get a pointer to the page table
    page_table = (uint32_t *)guest_to_host(pde.u.pt_base_pa << 12);
    // get the page table entry
    union x86_legm_pt_entry pte = { .raw = page_table[va.u.pt_idx] };
    assert(pte.u.p == 1);

    return (pte.u.base_pa << 12) | va.u.pa_offset;
}

// returns (through *arr) a pointer to a byte array starting at the current
// instruction
static inline int
get_instr_arr (struct guest *g, uint8_t **arr)
{
#ifdef CONFIG_SVM
    if (UNLIKELY(amd_vmcb_cr0_rd(&g->vmcb).pg == 0)) {
#else
    uint64_t guest_cr0;
    errval_t err = invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_CR0, &guest_cr0);
    if (UNLIKELY((guest_cr0 & CR0_PG) == 0)) {
#endif
        //printf("Segmentation active!\n");
        // paging is disabled - only take segmentation into account
#ifdef CONFIG_SVM
        *arr = (uint8_t *)(guest_to_host(g->mem_low_va) +
               amd_vmcb_cs_base_rd(&g->vmcb) +
               amd_vmcb_rip_rd(&g->vmcb));
#else
        uint64_t guest_cs_base, guest_rip;
        err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_CS_BASE, &guest_cs_base);
        err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_RIP, &guest_rip);
        *arr = (uint8_t *)(guest_to_host(g->mem_low_va) +
               guest_cs_base + guest_rip);
#endif
    } else {
        // with paging
#ifdef CONFIG_SVM
        if (amd_vmcb_efer_rd(&g->vmcb).lma == 1) {
#else
        uint64_t guest_efer;
        err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_EFER_F, &guest_efer);
        if (guest_efer & EFER_LMA) {
#endif
            // long mode
#ifdef CONFIG_SVM
            if (amd_vmcb_cs_attrib_rd(&g->vmcb).l == 1) {
                // 64-bit mode
                *arr = (uint8_t *)guest_to_host(lookup_paddr_long_mode(g,
                       amd_vmcb_rip_rd(&g->vmcb)));
#else
            uint64_t cs_access_rights, guest_rip;
            err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_CS_ACCESS, &cs_access_rights);
            err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_RIP, &guest_rip);
            if (cs_access_rights & ACCESS_RIGHTS_LONG_MODE) {
                *arr = (uint8_t *)guest_to_host(lookup_paddr_long_mode(g,
                       guest_rip));
#endif
            } else {
                // compatibility mode
                guest_assert(g, !"compatibility mode not supported yet");
            }
        } else {
            // Legacy (a.k.a. Paged Protected) Mode
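            // paging is enabled but long mode is not active, so protected
            // mode with legacy page tables must be in effect (CR0.PE is
            // asserted below)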
#ifdef CONFIG_SVM
            assert(amd_vmcb_cr0_rd(&g->vmcb).pe == 1);

            *arr = (uint8_t *)guest_to_host(lookup_paddr_legacy_mode(g,
                   amd_vmcb_rip_rd(&g->vmcb)));
#else
            assert(guest_cr0 & CR0_PE);

            uint64_t guest_rip;
            err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_RIP, &guest_rip);
            *arr = (uint8_t *)guest_to_host(lookup_paddr_legacy_mode(g,
                   guest_rip));
#endif
        }
    }
#ifndef CONFIG_SVM
    assert(err_is_ok(err));
#endif
    return HANDLER_ERR_OK;
}

static inline uint64_t
get_reg_val_by_reg_num (struct guest *g, uint8_t regnum) {
    switch (regnum) {
    case 0:
        return guest_get_rax(g);
    case 1:
        return guest_get_rcx(g);
    case 2:
        return guest_get_rdx(g);
    case 3:
        return guest_get_rbx(g);
    case 4:
        return guest_get_rsp(g);
    case 5:
        return guest_get_rbp(g);
    case 6:
        return guest_get_rsi(g);
    case 7:
        return guest_get_rdi(g);
    default:
        assert(!"not reached");
        return 0;
    }
}

static inline void
set_reg_val_by_reg_num (struct guest *g, uint8_t regnum, uint64_t val) {
    switch (regnum) {
    case 0:
        guest_set_rax(g, val);
        break;
    case 1:
        guest_set_rcx(g, val);
        break;
    case 2:
        guest_set_rdx(g, val);
        break;
    case 3:
        guest_set_rbx(g, val);
        break;
    case 4:
        guest_set_rsp(g, val);
        break;
    case 5:
        guest_set_rbp(g, val);
        break;
    case 6:
        guest_set_rsi(g, val);
        break;
    case 7:
        guest_set_rdi(g, val);
        break;
    default:
        assert(!"not reached");
        break;
    }
}

static int
handle_vmexit_cr_access (struct guest *g)
{
    int r;
    uint8_t *code = NULL;
#ifndef CONFIG_SVM
    errval_t err = 0;
    if (g->emulated_before_exit) {
        assert(saved_exit_reason == VMX_EXIT_REASON_CR_ACCESS);
        assert(((saved_exit_qual >> 0) & 0xf) == 0);
    }
#endif
    // fetch the location of the code
    r = get_instr_arr(g, &code);
    if (r != HANDLER_ERR_OK) {
        return r;
    }
    assert(code != NULL);

    // we only emulate MOV to/from CR (0x0f 0x20 / 0x0f 0x22)
    assert(code[0] == 0x0f && (code[1] == 0x20 || code[1] == 0x22));

    uint64_t val;
    bool read = (code[1] == 0x20);
    union x86_modrm mod;
    mod.raw = code[2];

    // FIXME: use a proper exception
    assert(mod.u.mod == 3);

    // source
    if (read) {
        // read from CR
        switch (mod.u.regop) {
        case 0:
#ifdef CONFIG_SVM
            val = amd_vmcb_cr0_rd_raw(&g->vmcb);
#else
            err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_CR0, &val);
#endif
            break;
        default:
            printf("CR access: unknown CR source register\n");
            return handle_vmexit_unhandeled(g);
        }
    } else {
        // read from GPR
        val = get_reg_val_by_reg_num(g, mod.u.rm);
    }

    // destination
    if (read) {
        // write to GPR
        switch (mod.u.rm) {
        case 0:
            guest_set_rax(g, val);
            break;
        case 1:
            guest_set_rcx(g, val);
            break;
        case 2:
            guest_set_rdx(g, val);
            break;
        case 3:
            guest_set_rbx(g, val);
            break;
        default:
            printf("CR access: unknown GPR destination register\n");
            return handle_vmexit_unhandeled(g);
        }
    } else {
        // write to CR
        switch (mod.u.regop) {
        case 0:
#ifdef CONFIG_SVM
            amd_vmcb_cr0_wr_raw(&g->vmcb, val);
#else
            err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_GUEST_CR0, val);
#endif
            break;
        default:
            printf("CR access: unknown CR destination register\n");
            return handle_vmexit_unhandeled(g);
        }
    }

    // advance the rip beyond the instruction (0x0f + opcode + ModR/M)
#ifdef CONFIG_SVM
    amd_vmcb_rip_wr(&g->vmcb, amd_vmcb_rip_rd(&g->vmcb) + 3);
#else
    uint64_t guest_rip;
    err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_RIP, &guest_rip);
    err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_GUEST_RIP, guest_rip + 3);
    assert(err_is_ok(err));
#endif
    return HANDLER_ERR_OK;
}

static int
handle_vmexit_ldt (struct guest *g)
{
    int r;
    uint8_t *code = NULL;
    uint8_t *mem;

    // this handler supports only real mode
#ifdef CONFIG_SVM
    assert(amd_vmcb_cr0_rd(&g->vmcb).pe == 0);
#else
    uint64_t guest_cr0;
    errval_t err = invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_CR0, &guest_cr0);
    assert((guest_cr0 & CR0_PE) == 0);
#endif
    // fetch the location of the code
    r = get_instr_arr(g, &code);
    if (r != HANDLER_ERR_OK) {
        return r;
    }
    mem = (uint8_t *)guest_to_host(g->mem_low_va);
    assert(code != NULL);

    // we only emulate LGDT/LIDT (0x0f 0x01)
    assert(code[0] == 0x0f && code[1] == 0x01);

    // check for relevant instruction prefixes
    bool addr32 = code[-2] == 0x67 || code[-1] == 0x67;
    bool op32 = code[-2] == 0x66 || code[-1] == 0x66;
    // fetch the ModR/M byte
    union x86_modrm modrm = { .raw = code[2] };

    assert(modrm.u.regop == 2 || modrm.u.regop == 3);
    guest_assert(g, op32);

    uint32_t addr;
    if (addr32) {
        // bytes 3-6 hold a 32-bit address of a memory location where the
        // first word holds the limit and the following dword holds the base
        addr = *(uint32_t *)&code[3];
    } else {
        // bytes 3-4 hold a 16-bit address of a memory location where the
        // first word holds the limit and the following dword holds the base;
        // this address is relative to the DS base
#ifdef CONFIG_SVM
        addr = *(uint16_t *)&code[3] + amd_vmcb_ds_base_rd(&g->vmcb);
#else
        uint64_t guest_ds_base;
        err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_DS_BASE, &guest_ds_base);
        addr = *(uint16_t *)&code[3] + guest_ds_base;
#endif
    }

    // sanity check on the addr
    // FIXME: raise a proper exception
    if (addr > g->mem_high_va) {
        printf("Memory access beyond physical address space\n");
        return HANDLER_ERR_FATAL;
    }

    // load the actual register
    if (modrm.u.regop == 2) {
        // LGDT
#ifdef CONFIG_SVM
        amd_vmcb_gdtr_limit_wr(&g->vmcb, *(uint16_t*)(mem + addr));
        amd_vmcb_gdtr_base_wr(&g->vmcb, *(uint32_t*)(mem + addr + 2));
#else
        err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_GUEST_GDTR_LIM,
                                         *(uint16_t*)(mem + addr));
        err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_GUEST_GDTR_BASE,
                                         *(uint32_t*)(mem + addr + 2));
#endif
    } else if (modrm.u.regop == 3) {
        // LIDT
#ifdef CONFIG_SVM
        amd_vmcb_idtr_limit_wr(&g->vmcb, *(uint16_t*)(mem + addr));
        amd_vmcb_idtr_base_wr(&g->vmcb, *(uint32_t*)(mem + addr + 2));
#else
        err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_GUEST_IDTR_LIM,
                                         *(uint16_t*)(mem + addr));
        err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_GUEST_IDTR_BASE,
                                         *(uint32_t*)(mem + addr + 2));
#endif
    } else {
        assert(!"not reached");
    }

    // advance the rip beyond the instruction
#ifdef CONFIG_SVM
    if (addr32) {
        amd_vmcb_rip_wr(&g->vmcb, amd_vmcb_rip_rd(&g->vmcb) + 7);
    } else {
        amd_vmcb_rip_wr(&g->vmcb, amd_vmcb_rip_rd(&g->vmcb) + 5);
    }
#else
    uint64_t guest_rip;
    err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_RIP, &guest_rip);
    if (addr32) {
        err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_GUEST_RIP, guest_rip + 7);
    } else {
        err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_GUEST_RIP, guest_rip + 5);
    }
    assert(err_is_ok(err));
#endif
    return HANDLER_ERR_OK;
}

#ifndef CONFIG_SVM
static inline void vmx_vmcs_rflags_cf_wrf(struct guest *g, int val) {
    assert(val == 0 || val == 1);
    uint64_t guest_rflags;
    errval_t err = invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_RFLAGS, &guest_rflags);
    if (val) {
        err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_GUEST_RFLAGS,
                                         guest_rflags | RFLAGS_CF);
    } else {
        err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_GUEST_RFLAGS,
                                         guest_rflags & (~RFLAGS_CF));
    }
    assert(err_is_ok(err));
}
#endif

static int
handle_vmexit_swint (struct guest *g)
{
    int r;
    uint8_t *code = NULL;

    r = get_instr_arr(g, &code);
    if (r != HANDLER_ERR_OK) {
        return r;
    }
    assert(code != NULL);

    // check for the correct instruction
    assert(code[0] == 0xcd);

    // the interrupt number follows the INT opcode (0xcd)
    uint8_t int_num = code[1];

    // check whether the guest is in real mode
#ifdef CONFIG_SVM
    if (amd_vmcb_cr0_rd(&g->vmcb).pe == 0) {
#else
    uint64_t guest_ds_base, es_guest_base;
    uint64_t guest_cr0, guest_rip;
    errval_t err = invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_CR0, &guest_cr0);
    if ((guest_cr0 & CR0_PE) == 0) {
#endif
        // in real mode, interrupts starting at 0x10 have a different meaning
        // - examine the sw interrupt
        switch (int_num) {
        case 0x10:
            r = console_handle_int10(g->console, g);
            if (r != HANDLER_ERR_OK) {
                printf("Unhandled method on INT 0x10\n");
                return handle_vmexit_unhandeled(g);
            }
            break;
        case 0x12:
            switch (guest_get_ax(g)) {
            case 0: // GET MEMORY SIZE
                // our VM always has 1MB of base memory
                // AX holds the amount of 1KB memory blocks starting at
                // addr 0, which is 640 (640 KiB)
                guest_set_ax(g, 640);
                break;
            default:
                printf("Unhandled method on INT 0x12\n");
                return handle_vmexit_unhandeled(g);
            }
            break;
        case 0x13:
            // Bootable CD-ROM - GET STATUS
            if (guest_get_ax(g) == 0x4b01) {
                // no cdrom support
#ifdef CONFIG_SVM
                amd_vmcb_rflags_cf_wrf(&g->vmcb, 1);
#else
                vmx_vmcs_rflags_cf_wrf(g, 1);
#endif
            }
            // DISK RESET
            else if (guest_get_ah(g) == 0) {
                for (int i = 0; i < g->hdd_count; i++) {
                    hdd_reset(g->hdds[i]);
                }
            }
            // DISK - GET DRIVE PARAMETERS (PC,XT286,CONV,PS,ESDI,SCSI)
            else if (guest_get_ah(g) == 0x08) {
                uint8_t dl = guest_get_dl(g);

                // only respond to installed hard disks
                if ((dl >> 7) && ((dl & 0x7f) < g->hdd_count)) {
                    uint16_t c;
                    uint8_t h, s;

                    r = hdd_get_geometry_chs(g->hdds[dl & 0x7f], &c, &h, &s);
                    assert(r == 0);

                    // set some return values for success
                    guest_set_ah(g, 0);
#ifdef CONFIG_SVM
                    amd_vmcb_rflags_cf_wrf(&g->vmcb, 0);
#else
                    vmx_vmcs_rflags_cf_wrf(g, 0);
#endif
                    guest_set_bl(g, 0);
                    // store the geometry into the correct registers
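                    // the geometry is returned with the cylinder and sector
                    // numbers packed into CX (sector in the low 6 bits), the
                    // head count in DH and the number of installed drives
                    // in DL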
                    guest_set_cx(g, c << 6 | (s & 0x3f));
                    guest_set_dh(g, h);
                    guest_set_dl(g, g->hdd_count);
                } else {
#ifdef CONFIG_SVM
                    amd_vmcb_rflags_cf_wrf(&g->vmcb, 1);
#else
                    vmx_vmcs_rflags_cf_wrf(g, 1);
#endif
                    // it is not really clear what AH should contain when the
                    // drive is not present, so return a non-zero error code
                    guest_set_ah(g, 1);
                }
            }
            // INT 13 Extensions - INSTALLATION CHECK
            else if (guest_get_ah(g) == 0x41 && guest_get_bx(g) == 0x55aa) {
#ifdef CONFIG_SVM
                amd_vmcb_rflags_cf_wrf(&g->vmcb, 0);
#else
                vmx_vmcs_rflags_cf_wrf(g, 0);
#endif
                guest_set_bx(g, 0xaa55);
                guest_set_ah(g, 0x01); // drive extensions 1.x
                guest_set_al(g, 0);
                guest_set_cx(g, 0x5);
            }
            // IBM/MS INT 13 Extensions - EXTENDED READ
            else if (guest_get_ah(g) == 0x42) {
                uint8_t dl = guest_get_dl(g);

                // only respond to installed hard disks
                if ((dl >> 7) && ((dl & 0x7f) < g->hdd_count)) {
#ifdef CONFIG_SVM
                    amd_vmcb_rflags_cf_wrf(&g->vmcb, 0);
#else
                    vmx_vmcs_rflags_cf_wrf(g, 0);
#endif
                    guest_set_ah(g, 0);

                    struct disk_access_block {
                        uint8_t size;
                        uint8_t reserved;
                        uint16_t count;
                        // pointer to the data buffer, formatted as
                        // SEGMENT:ADDRESS
                        uint32_t transfer_buffer;
                        uint64_t abs_block_number;
                    } __attribute__ ((packed));

                    // memory location of the disk access block
#ifdef CONFIG_SVM
                    uintptr_t mem = guest_to_host(g->mem_low_va) +
                                    amd_vmcb_ds_base_rd(&g->vmcb) +
                                    guest_get_si(g);
#else
                    err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_DS_BASE, &guest_ds_base);
                    uintptr_t mem = guest_to_host(g->mem_low_va) +
                                    guest_ds_base + guest_get_si(g);
#endif

                    struct disk_access_block *dap = (void *)mem;

                    if (dap->size < 0x10) {
#ifdef CONFIG_SVM
                        amd_vmcb_rflags_cf_wrf(&g->vmcb, 1);
#else
                        vmx_vmcs_rflags_cf_wrf(g, 1);
#endif
                        guest_set_ah(g, 1);
                    } else {
                        // dap->transfer_buffer points into a real-mode
                        // segment - resolve it according to those rules
                        mem = guest_to_host(g->mem_low_va) +
                              ((dap->transfer_buffer >> 16) << 4) +
                              (dap->transfer_buffer & 0xffff);

                        size_t count = dap->count;
                        r = hdd_read_blocks(g->hdds[dl & 0x7f],
                                            dap->abs_block_number,
                                            &count, mem);
                        dap->count = count;

                        if (r != HANDLER_ERR_OK) {
#ifdef CONFIG_SVM
                            amd_vmcb_rflags_cf_wrf(&g->vmcb, 1);
#else
                            vmx_vmcs_rflags_cf_wrf(g, 1);
#endif
                            guest_set_ah(g, 1);
                        }
                    }
                } else {
#ifdef CONFIG_SVM
                    amd_vmcb_rflags_cf_wrf(&g->vmcb, 1);
#else
                    vmx_vmcs_rflags_cf_wrf(g, 1);
#endif
                    // it is not really clear what AH should contain when the
                    // drive is not present, so return a non-zero error code
                    guest_set_ah(g, 1);
                }
            }
            // IBM/MS INT 13 Extensions - GET DRIVE PARAMETERS
            else if (guest_get_ah(g) == 0x48) {
                uint8_t dl = guest_get_dl(g);

                // only respond to installed hard disks
                if ((dl >> 7) && ((dl & 0x7f) < g->hdd_count)) {
                    // structure to hold drive info
                    struct drive_params {
                        uint16_t size;
                        uint16_t flags;
                        uint32_t cylinders;
                        uint32_t heads;
                        uint32_t sectors;
                        uint64_t total_sectors;
                        uint16_t bytes_per_sector;
                    } __attribute__ ((packed));

                    // memory where the drive info shall be stored
#ifdef CONFIG_SVM
                    uintptr_t mem = guest_to_host(g->mem_low_va) +
                                    amd_vmcb_ds_base_rd(&g->vmcb) +
                                    guest_get_si(g);
#else
                    err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_DS_BASE, &guest_ds_base);
                    uintptr_t mem = guest_to_host(g->mem_low_va) +
                                    guest_ds_base + guest_get_si(g);
#endif

                    struct drive_params *drp = (void *)mem;

                    // sanity check
                    if (drp->size < sizeof(struct drive_params)) {
#ifdef CONFIG_SVM
                        amd_vmcb_rflags_cf_wrf(&g->vmcb, 1);
#else
                        vmx_vmcs_rflags_cf_wrf(g, 1);
#endif
                    } else {
#ifdef CONFIG_SVM
                        amd_vmcb_rflags_cf_wrf(&g->vmcb, 0);
#else
                        vmx_vmcs_rflags_cf_wrf(g, 0);
#endif
                        guest_set_ah(g, 0);

                        drp->size = sizeof(struct drive_params);
                        // CHS invalid, no removable drive, etc.
                        drp->flags = 0;
                        drp->cylinders = 0;
                        drp->heads = 0;
                        drp->sectors = 0;
                        drp->total_sectors = hdd_get_blocks_count(
                            g->hdds[dl & 0x7f]);
                        drp->bytes_per_sector = 512; // FIXME: hardcoded
                    }
                } else {
#ifdef CONFIG_SVM
                    amd_vmcb_rflags_cf_wrf(&g->vmcb, 1);
#else
                    vmx_vmcs_rflags_cf_wrf(g, 1);
#endif
                    // it is not really clear what AH should contain when the
                    // drive is not present, so return a non-zero error code
                    guest_set_ah(g, 0x1);
                }
            } else {
                printf("Unhandled method on INT 0x13\n");
                return handle_vmexit_unhandeled(g);
            }
            break;
        case 0x15:
            // ENABLE A20 GATE
            if (guest_get_ax(g) == 0x2401) {
                g->a20_gate_enabled = true;
#ifdef CONFIG_SVM
                amd_vmcb_rflags_cf_wrf(&g->vmcb, 0);
#else
                vmx_vmcs_rflags_cf_wrf(g, 0);
#endif
                guest_set_ah(g, 0);
            }
            // APM INSTALLATION CHECK
            else if (guest_get_ax(g) == 0x5300) {
                // we do not support APM - set the carry flag to indicate
                // an error
#ifdef CONFIG_SVM
                amd_vmcb_rflags_cf_wrf(&g->vmcb, 1);
#else
                vmx_vmcs_rflags_cf_wrf(g, 1);
#endif
            }
            // APM DISCONNECT
            else if (guest_get_ax(g) == 0x5304) {
                // we do not support APM - set the carry flag to indicate
                // an error
#ifdef CONFIG_SVM
                amd_vmcb_rflags_cf_wrf(&g->vmcb, 1);
#else
                vmx_vmcs_rflags_cf_wrf(g, 1);
#endif
            }
            // GET MEMORY SIZE FOR >64M CONFIGURATIONS
            else if (guest_get_ax(g) == 0xe801) {
                // we do not support this BIOS call;
                // both GRUB and Linux may also use the 0xe820 call
#ifdef CONFIG_SVM
                amd_vmcb_rflags_cf_wrf(&g->vmcb, 1);
#else
                vmx_vmcs_rflags_cf_wrf(g, 1);
#endif
            }
            // GET SYSTEM MEMORY MAP
            // EDX has to contain 0x534d4150 (== 'SMAP')
            else if (guest_get_ax(g) == 0xe820 &&
                     guest_get_edx(g) == 0x534d4150) {
                // we return two entries: base memory and extended memory
                if (guest_get_ebx(g) > 1 || guest_get_ecx(g) < 20) {
                    // wrong input params -> report an error
#ifdef CONFIG_SVM
                    amd_vmcb_rflags_cf_wrf(&g->vmcb, 1);
#else
                    vmx_vmcs_rflags_cf_wrf(g, 1);
#endif
                } else {
                    // taken from http://www.ctyme.com/intr/rb-1741.htm
#ifdef CONFIG_SVM
                    uintptr_t addr = guest_to_host(g->mem_low_va) +
                                     amd_vmcb_es_base_rd(&g->vmcb) +
                                     guest_get_di(g);
#else
                    err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_ES_BASE, &es_guest_base);
                    uintptr_t addr = guest_to_host(g->mem_low_va) +
                                     es_guest_base + guest_get_di(g);
#endif
                    // set EAX to 'SMAP'
                    guest_set_eax(g, 0x534D4150);
                    // returned bytes (always 20)
                    guest_set_ecx(g, 20);

                    switch (guest_get_ebx(g)) {
                    case 0x0:
                        // base memory
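                        // each E820 entry written below is 20 bytes: a
                        // 64-bit base address, a 64-bit length and a 32-bit
                        // type, where type 1 means RAM usable by the OS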
                        assert(g->mem_low_va == 0);
                        // base address
                        *(uint64_t *)addr = 0;
                        // size of the memory block
                        *(uint64_t *)(addr + 8) = 0xa0000; // 640 KiB
                        // mem type, 1 == "memory, available to the OS"
                        *(uint32_t *)(addr + 16) = 1;
                        // indicate that there is more data
                        guest_set_ebx(g, 1);
                        break;
                    case 0x1:
                        // extended memory
                        assert(g->mem_high_va > 0x100000);
                        // base address
                        *(uint64_t *)addr = 0x100000; // 1 MiB
                        // size of the memory block
                        *(uint64_t *)(addr + 8) = g->mem_high_va - 0x100000;
                        // mem type, 1 == "memory, available to the OS"
                        *(uint32_t *)(addr + 16) = 1;
                        // indicate that there is no more data
                        guest_set_ebx(g, 0);
                        break;
                    default:
                        assert(!"not reached");
                        break;
                    }

                    // mark success
#ifdef CONFIG_SVM
                    amd_vmcb_rflags_cf_wrf(&g->vmcb, 0);
#else
                    vmx_vmcs_rflags_cf_wrf(g, 0);
#endif
                }
            }
            // SYSTEM - Get Intel SpeedStep (IST) information
            else if (guest_get_ax(g) == 0xe980) {
                // not supported yet
#ifdef CONFIG_SVM
                amd_vmcb_rflags_cf_wrf(&g->vmcb, 1);
#else
                vmx_vmcs_rflags_cf_wrf(g, 1);
#endif
            }
            // SYSTEM - GET CONFIGURATION (XT >1986/1/10, AT mdl 3x9,
            // CONV,XT286,PS)
            // GRUB BUG: it puts 0xc0 into AX instead of AH
            else if (guest_get_ax(g) == 0xc0) {
                // we do not support this
#ifdef CONFIG_SVM
                amd_vmcb_rflags_cf_wrf(&g->vmcb, 1);
#else
                vmx_vmcs_rflags_cf_wrf(g, 1);
#endif
                guest_set_ah(g, 0x80);
            }
            // GET EXTENDED MEMORY SIZE
            else if (guest_get_ah(g) == 0x88) {
                // calculate the number of 1KB chunks starting at 1MB, but
                // not beyond 16MB
                assert(((g->mem_high_va - g->mem_low_va) & 0x3ff) == 0);
                guest_set_ax(g, MIN(0x3c00 /* 16MB */,
                                    (g->mem_high_va - g->mem_low_va) / 1024));
                // indicate that no error occurred
#ifdef CONFIG_SVM
                amd_vmcb_rflags_cf_wrf(&g->vmcb, 0);
#else
                vmx_vmcs_rflags_cf_wrf(g, 0);
#endif
            }
            // SYSTEM - GET CONFIGURATION (XT >1986/1/10, AT mdl 3x9,
            // CONV,XT286,PS)
            else if (guest_get_ah(g) == 0xc0) {
                // we do not support this
#ifdef CONFIG_SVM
                amd_vmcb_rflags_cf_wrf(&g->vmcb, 1);
#else
                vmx_vmcs_rflags_cf_wrf(g, 1);
#endif
                guest_set_ah(g, 0x80);
            // SYSTEM - SET BIOS MODE
            } else if (guest_get_ah(g) == 0xec) {
                // the purpose of this BIOS call is not entirely clear, and
                // Linux expects no action whatsoever
            } else {
                printf("Unhandled method on INT 0x15\n");
                return handle_vmexit_unhandeled(g);
            }
            break;
        case 0x16:
            // KEYBOARD - SET TYPEMATIC RATE AND DELAY
            if (guest_get_ah(g) == 0x3) {
                // ignore this
            } else {
                printf("Unhandled method on INT 0x16\n");
                return handle_vmexit_unhandeled(g);
            }
            break;
        case 0x1a:
            // TIME - GET REAL-TIME CLOCK TIME (AT,XT286,PS)
            if (guest_get_ah(g) == 0x2) {
                uint8_t h, m, s;
                lpc_rtc_get_time_bcd(g->lpc, &h, &m, &s);
                guest_set_ch(g, h);
                guest_set_cl(g, m);
                guest_set_dh(g, s);
                guest_set_dl(g, 0);
                // mark success
#ifdef CONFIG_SVM
                amd_vmcb_rflags_cf_wrf(&g->vmcb, 0);
#else
                vmx_vmcs_rflags_cf_wrf(g, 0);
#endif
            } else {
                printf("Unhandled method on INT 0x1a\n");
                return handle_vmexit_unhandeled(g);
            }
            break;
        default:
            printf("handle_vmexit_swint: unhandled real-mode interrupt "
                   "0x%x (%d).\n", int_num, int_num);
            return handle_vmexit_unhandeled(g);
        }
    } else {
        printf("vmkitmon: encountered INT instruction outside real mode\n");
        return handle_vmexit_unhandeled(g);
    }

    // advance the rip beyond the instruction
#ifdef CONFIG_SVM
    amd_vmcb_rip_wr(&g->vmcb, amd_vmcb_rip_rd(&g->vmcb) + 2);
#else
    err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_RIP, &guest_rip);
    err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_GUEST_RIP, guest_rip + 2);
    assert(err_is_ok(err));
#endif
    return HANDLER_ERR_OK;
}

static inline enum opsize
io_access_size_to_opsize (enum x86_io_access io)
{
    if (io & X86_IO_ACCESS_SZ8) {
        return OPSIZE_8;
    } else if (io & X86_IO_ACCESS_SZ16) {
        return OPSIZE_16;
    } else if (io & X86_IO_ACCESS_SZ32) {
        return OPSIZE_32;
    } else {
        assert(!"NYI");
        return 0;
    }
}

static int
handle_vmexit_ioio (struct guest *g)
{
    int r;
#ifdef CONFIG_SVM
    uint64_t info1 = amd_vmcb_exitinfo1_rd(&g->vmcb);
    enum x86_io_access io;
    uint16_t port = info1 >> 16;
#else
    errval_t err = 0;
    if (!g->emulated_before_exit) {
        err += invoke_dispatcher_vmread(g->dcb_cap, VMX_EXIT_QUAL, &saved_exit_qual);
        uint64_t instr_len, guest_rip;
        err += invoke_dispatcher_vmread(g->dcb_cap, VMX_EXIT_INSTR_LEN, &instr_len);
        err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_RIP, &guest_rip);
        saved_rip = guest_rip + instr_len;
    }
    uint16_t port = (saved_exit_qual >> 16) & 0xffff;
#endif
    bool write;
    enum opsize size;
    uint32_t val;
    bool newapi = false; // needed as a transition

#ifdef CONFIG_SVM
    // copy the access flags
    // FIXME: this severely exploits the way the x86_io_access flags are set up
    io = (info1 >> 1);
    io |= info1 & SVM_IOIO_TYPE_MASK;

    // gather some params for the io access
    write = (io & X86_IO_ACCESS_TYPE) == 0;
    size = OPSIZE_8; // make gcc happy
    if (io & X86_IO_ACCESS_SZ8) {
        size = OPSIZE_8;
    } else if (io & X86_IO_ACCESS_SZ16) {
        size = OPSIZE_16;
    } else if (io & X86_IO_ACCESS_SZ32) {
        size = OPSIZE_32;
    }
#else
    write = ((saved_exit_qual >> 3) & 0x1) == 0;
    size = OPSIZE_8;
    if ((saved_exit_qual & 0x7) == 0) {
        size = OPSIZE_8;
    } else if ((saved_exit_qual & 0x7) == 1) {
        size = OPSIZE_16;
    } else if ((saved_exit_qual & 0x7) == 3) {
        size = OPSIZE_32;
    } else {
        assert(!"Invalid size of access value");
    }
#endif
    // fetch the source val if necessary
    if (write) {
        switch (size) {
        case OPSIZE_8:
            val = guest_get_al(g);
            break;
        case OPSIZE_16:
            val = guest_get_ax(g);
            break;
        case OPSIZE_32:
            val = guest_get_eax(g);
            break;
        default:
            assert(!"not reached");
            break;
        }
    }

    // assign the request to the corresponding subsystem
    switch (port) {
    // LPC devices
    case 0x20: // primary PIC
    case 0x21: // primary PIC
    case 0x40: // Timer
    case 0x41: // Timer
    case 0x42: // Timer
    case 0x43: // Timer
    case 0x61: // NMI Controller
    case 0x70: // RTC
    case 0x71: // RTC
    case 0x72: // RTC
    case 0x73: // RTC
    case 0x74: // RTC
    case 0x75: // RTC
    case 0x76: // RTC
    case 0x77: // RTC
    case 0xa0: // secondary PIC
    case 0xa1: // secondary PIC
        if (write) {
            r = lpc_handle_pio_write(g->lpc, port, size, val);
            guest_assert(g, r == 0);
        } else {
            r = lpc_handle_pio_read(g->lpc, port, size, &val);
            assert(r == 0);
        }
        newapi = true;
        break;
    // Keyboard
    case 0x60:
    case 0x64:
        // we currently do not support a keyboard
        if (!write) {
            val = ~0;
        }
        newapi = true;
        break;
    case 0x80:
        // some apps use a write to this port to delay execution, so we just
        // do nothing
        break;
    // Coprocessor
    case 0xf0:
    case 0xf1:
        // coprocessor IGNNE# - do nothing for now
        break;

    // serial COM1 port
    // FIXME: this should not be hardcoded!
    case 0x3f8:
    case 0x3f9:
    case 0x3fa:
    case 0x3fb:
    case 0x3fc:
    case 0x3fd:
    case 0x3fe:
    case 0x3ff:
    // COM2
    case 0x2f8:
    case 0x2f9:
    case 0x2fa:
    case 0x2fb:
    case 0x2fc:
    case 0x2fd:
    case 0x2fe:
    case 0x2ff:
    // COM3
    case 0x3e8:
    case 0x3e9:
    case 0x3ea:
    case 0x3eb:
    case 0x3ec:
    case 0x3ed:
    case 0x3ee:
    case 0x3ef:
    // COM4
    case 0x2e8:
    case 0x2e9:
    case 0x2ea:
    case 0x2eb:
    case 0x2ec:
    case 0x2ed:
    case 0x2ee:
    case 0x2ef: {
        int com;

        // map the port to a COM index: 0x3f8 -> 0 (COM1), 0x2f8 -> 1 (COM2),
        // 0x3e8 -> 2 (COM3), 0x2e8 -> 3 (COM4)
        com = (port & 0xf0) == 0xf0 ? !(port & 0x100) : !(port & 0x100) + 2;
        assert(com >= 0 && com < 4);
        if (write) {
            r = pc16550d_handle_pio_write(g->serial_ports[com], port,
                                          size, val);
            assert(r == 0);
        } else {
            r = pc16550d_handle_pio_read(g->serial_ports[com], port,
                                         size, &val);
            assert(r == 0);
        }
        newapi = true;
        break;
    }

    // PCI config space (address)
    case 0xcf8:
    case 0xcf9:
    case 0xcfa:
    case 0xcfb:
    // PCI config space (data)
    case 0xcfc:
    case 0xcfd:
    case 0xcfe:
    case 0xcff:
        if (write) {
            r = pci_handle_pio_write(g->pci, port, size, val);
        } else {
            r = pci_handle_pio_read(g->pci, port, size, &val);
        }
        assert(r == 0);
        newapi = true;
        break;

    default:
        // the default is to return all-ones and to ignore writes
        if (!write) {
            val = 0xffffffff;
        }
        newapi = true;
    }

    // set the destination when necessary
    if (newapi && !write) {
        switch (size) {
        case OPSIZE_8:
            guest_set_al(g, val);
            break;
        case OPSIZE_16:
            guest_set_ax(g, val);
            break;
        case OPSIZE_32:
            guest_set_eax(g, val);
            break;
        default:
            assert(!"not reached");
            break;
        }
    }

    // on SVM, the IP of the following instruction is stored in the exitinfo2
    // field; on VMX it was computed into saved_rip above
#ifdef CONFIG_SVM
    amd_vmcb_rip_wr(&g->vmcb, amd_vmcb_exitinfo2_rd(&g->vmcb));
#else
    err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_GUEST_RIP, saved_rip);
    assert(err_is_ok(err));
#endif
    return HANDLER_ERR_OK;
}

static int
handle_vmexit_msr (struct guest *g) {
#ifdef CONFIG_SVM
    bool write = amd_vmcb_exitinfo1_rd(&g->vmcb) == 1;
#else
    int msr_index;
    errval_t err = 0;
    bool write = (saved_exit_reason == VMX_EXIT_REASON_WRMSR);
    struct msr_entry *guest_msr_area = (struct msr_entry *)g->msr_area_va;
#endif
    uint32_t msr = guest_get_ecx(g);
    uint64_t val;

    // the exit may be caused by a write to or a read from an MSR
    if (write) {
        // fetch the value to write from EDX:EAX
        val = ((uint64_t)guest_get_edx(g) << 32) | guest_get_eax(g);

        // store the value into the corresponding location
        switch (msr) {
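        // MSRs that are part of the guest state kept in the VMCB/VMCS are
        // written directly; on VMX, the remaining guest MSRs live in the MSR
        // load/store area that was registered with the VMCS at guest setup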
        case X86_MSR_SYSENTER_CS:
#ifdef CONFIG_SVM
            amd_vmcb_sysenter_cs_wr(&g->vmcb, val);
#else
            err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_GUEST_SYSENTER_CS, val);
#endif
            break;
        case X86_MSR_SYSENTER_ESP:
#ifdef CONFIG_SVM
            amd_vmcb_sysenter_esp_wr(&g->vmcb, val);
#else
            err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_GUEST_SYSENTER_ESP, val);
#endif
            break;
        case X86_MSR_SYSENTER_EIP:
#ifdef CONFIG_SVM
            amd_vmcb_sysenter_eip_wr(&g->vmcb, val);
#else
            err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_GUEST_SYSENTER_EIP, val);
#endif
            break;
        case X86_MSR_EFER:
#ifdef CONFIG_SVM
            amd_vmcb_efer_wr_raw(&g->vmcb, val);
#else
            err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_GUEST_EFER_F, val);
#endif
            break;
        case X86_MSR_FS_BASE:
#ifdef CONFIG_SVM
            amd_vmcb_fs_base_wr(&g->vmcb, val);
#else
            err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_GUEST_FS_BASE, val);
#endif
            break;
        case X86_MSR_GS_BASE:
#ifdef CONFIG_SVM
            amd_vmcb_gs_base_wr(&g->vmcb, val);
#else
            err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_GUEST_GS_BASE, val);
#endif
            break;
#ifdef CONFIG_SVM
        case X86_MSR_KERNEL_GS_BASE:
            amd_vmcb_kernel_gs_base_wr(&g->vmcb, val);
            break;
        case X86_MSR_STAR:
            amd_vmcb_star_wr(&g->vmcb, val);
            break;
        case X86_MSR_LSTAR:
            amd_vmcb_lstar_wr(&g->vmcb, val);
            break;
        case X86_MSR_CSTAR:
            amd_vmcb_cstar_wr(&g->vmcb, val);
            break;
        case X86_MSR_SFMASK:
            amd_vmcb_sfmask_wr(&g->vmcb, val);
            break;
        default:
            printf("MSR: unhandled MSR write access to %x\n", msr);
            return handle_vmexit_unhandeled(g);
#else
        default:
            msr_index = vmx_guest_msr_index(msr);
            if (msr_index == -1) {
                printf("MSR: unhandled MSR write access to %x\n", msr);
                return handle_vmexit_unhandeled(g);
            }
            guest_msr_area[msr_index].val = val;
            break;
#endif
        }
    } else {
        // read the value from the corresponding location
        switch (msr) {
        case X86_MSR_SYSENTER_CS:
#ifdef CONFIG_SVM
            val = amd_vmcb_sysenter_cs_rd(&g->vmcb);
#else
            err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_SYSENTER_CS, &val);
#endif
            break;
        case X86_MSR_SYSENTER_ESP:
#ifdef CONFIG_SVM
            val = amd_vmcb_sysenter_esp_rd(&g->vmcb);
#else
            err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_SYSENTER_ESP, &val);
#endif
            break;
        case X86_MSR_SYSENTER_EIP:
#ifdef CONFIG_SVM
            val = amd_vmcb_sysenter_eip_rd(&g->vmcb);
#else
            err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_SYSENTER_EIP, &val);
#endif
            break;
        case X86_MSR_EFER:
#ifdef CONFIG_SVM
            val = amd_vmcb_efer_rd_raw(&g->vmcb);
#else
            err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_EFER_F, &val);
#endif
            break;
        case X86_MSR_FS_BASE:
#ifdef CONFIG_SVM
            val = amd_vmcb_fs_base_rd(&g->vmcb);
#else
            err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_FS_BASE, &val);
#endif
            break;
        case X86_MSR_GS_BASE:
#ifdef CONFIG_SVM
            val = amd_vmcb_gs_base_rd(&g->vmcb);
#else
            err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_GS_BASE, &val);
#endif
            break;
#ifdef CONFIG_SVM
        case X86_MSR_KERNEL_GS_BASE:
            val = amd_vmcb_kernel_gs_base_rd(&g->vmcb);
            break;
        case X86_MSR_STAR:
            val = amd_vmcb_star_rd(&g->vmcb);
            break;
        case X86_MSR_LSTAR:
            val = amd_vmcb_lstar_rd(&g->vmcb);
            break;
        case X86_MSR_CSTAR:
            val = amd_vmcb_cstar_rd(&g->vmcb);
            break;
        case X86_MSR_SFMASK:
            val = amd_vmcb_sfmask_rd(&g->vmcb);
            break;
        default:
            printf("MSR: unhandled MSR read access to %x\n", msr);
            return handle_vmexit_unhandeled(g);
#else
        default:
            msr_index = vmx_guest_msr_index(msr);
            if (msr_index == -1) {
                printf("MSR: unhandled MSR read access to %x\n", msr);
                return handle_vmexit_unhandeled(g);
            }
            val = guest_msr_area[msr_index].val;
            break;
#endif
        }

        // store the value in EDX:EAX
        guest_set_eax(g, val);
        guest_set_edx(g, val >> 32);
    }

    // advance the rip beyond the current instruction
#ifdef CONFIG_SVM
    amd_vmcb_rip_wr(&g->vmcb, amd_vmcb_rip_rd(&g->vmcb) + 2);
#else
    uint64_t guest_rip;
    err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_RIP, &guest_rip);
    err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_GUEST_RIP, guest_rip + 2);
    assert(err_is_ok(err));
#endif
    return HANDLER_ERR_OK;
}

static int
handle_vmexit_cpuid (struct guest *g) {
    uint32_t eax, ebx, ecx, edx;
    uint32_t func = guest_get_eax(g);

    switch (func) {
    // Processor Vendor and Largest Standard Function Number
    case 0:
    case 0x80000000:
        // max standard function offset
        eax = func == 0 ? 0x1 : 0x80000000;
        // string "AuthenticAMD"
        ebx = 0x68747541;
        ecx = 0x444d4163;
        edx = 0x69746e65;
        break;

    // Family, Model, Stepping Identifiers
    case 1:
        // we simulate an AMD K6-3D
        // Family 5, Model 8, Stepping 12
        eax = 0x58c;
        // no brand, clflush size 16, no multiprocessing, no local apic
        ebx = 0x0f00;
        // support the popcnt instr
        ecx = 0x800000;
        // support some basic features
        edx = 0x89a91b;
        break;

    default:
        // use the answer of the host if there is any other request
        // FIXME: this is probably not a good idea ;)
        cpuid(func, &eax, &ebx, &ecx, &edx);
        printf("handle_vmexit_cpuid: CPUID: func %x, host reports: eax %x, "
               "ebx %x, ecx %x, edx %x\n", func, eax, ebx, ecx, edx);
        break;
    }

    guest_set_eax(g, eax);
    guest_set_ebx(g, ebx);
    guest_set_ecx(g, ecx);
    guest_set_edx(g, edx);

    // advance the rip beyond the instruction
#ifdef CONFIG_SVM
    amd_vmcb_rip_wr(&g->vmcb, amd_vmcb_rip_rd(&g->vmcb) + 2);
#else
    uint64_t guest_rip;
    errval_t err = invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_RIP, &guest_rip);
    err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_GUEST_RIP, guest_rip + 2);
    assert(err_is_ok(err));
#endif
    return HANDLER_ERR_OK;
}

static int
handle_vmexit_vmmcall (struct guest *g) {
    printf("VMMCALL: tsc %lu, exits with mon invocation %lu, exits w/o mon "
           "invocation %lu\n", rdtsc(),
           g->ctrl->num_vm_exits_with_monitor_invocation,
           g->ctrl->num_vm_exits_without_monitor_invocation);

    // advance the rip beyond the instruction
#ifdef CONFIG_SVM
    amd_vmcb_rip_wr(&g->vmcb, amd_vmcb_rip_rd(&g->vmcb) + 3);
#else
    uint64_t guest_rip;
    errval_t err = invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_RIP, &guest_rip);
    err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_GUEST_RIP, guest_rip + 3);
    assert(err_is_ok(err));
#endif
    return HANDLER_ERR_OK;
}

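/*
 * HLT: the guest has halted, i.e. it has no runnable work. The handler below
 * polls the IRQ sources and, if no virtual IRQ is pending, parks the VM
 * (g->runnable = false) until an external event makes it runnable again.
 */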
static int
handle_vmexit_hlt (struct guest *g) {
    // the guest has nothing to do - poll our IRQ sources for pending IRQs;
    // if they do not assert a virtual IRQ then we will do nothing
    lpc_pic_process_irqs(g->lpc);

    // advance the rip beyond the instruction
#ifdef CONFIG_SVM
    amd_vmcb_rip_wr(&g->vmcb, amd_vmcb_rip_rd(&g->vmcb) + 1);
#else
    uint64_t guest_rip;
    errval_t err = invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_RIP, &guest_rip);
    err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_GUEST_RIP, guest_rip + 1);
#endif

    // running HLT with IRQs masked does not make any sense
    // FIXME: this assert is silly, shutting down the VM would be the right way
#ifdef CONFIG_SVM
    guest_assert(g, amd_vmcb_rflags_rd(&g->vmcb).intrf == 1);
#else
    uint64_t guest_rflags;
    err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_RFLAGS, &guest_rflags);
    assert(err_is_ok(err));
    guest_assert(g, guest_rflags & RFLAGS_IF);
#endif
    if (virq_pending(g, NULL, NULL)) {
        // there is an IRQ pending, proceed as normal, the CPU will take it
    } else {
        // there is really nothing to do - stop the VM and wait
        g->runnable = false;
    }

    return HANDLER_ERR_OK;
}

static inline int
decode_mov_instr_length (struct guest *g, uint8_t *code)
{
    int len;

    // we only support long mode for now
    //assert(amd_vmcb_efer_rd(&g->vmcb).lma == 1);

    // all non-special MOV instructions use one opcode byte and at least a
    // ModR/M byte
    len = 2;
    // check for the REX prefix
    if ((code[0] >> 4) == 0x4) {
        len++;
        code++;
    }
    // precaution, since not all variants of MOV have been checked; at least
    // these two variants are supported
    assert(code[0] == 0x89 || code[0] == 0x8b);

    union x86_modrm modrm = { .raw = code[1] };
    // check for displacements
    if (modrm.u.mod == 0x1) {
        // 1B displacement
        len++;
    } else if (modrm.u.mod == 0x2) {
        // 4B displacement
        len += 4;
    }

    // check for SIB byte
    if (modrm.u.rm == 0x4 && modrm.u.mod != 0x3) {
        len++;
    }

    return len;
}

// finds out whether a move instruction is a read or a write with respect to
// memory
static inline bool
decode_mov_is_write (struct guest *g, uint8_t *code)
{
    // check for the REX prefix
    if ((code[0] >> 4) == 0x4) {
        code++;
    }

    // we only support one move variant (in each direction) for now
    assert(code[0] == 0x89 || code[0] == 0x8b);

    union x86_modrm modrm = { .raw = code[1] };
    // not defined for reg-to-reg moves
    assert(modrm.u.mod != 3);

    return code[0] == 0x89; // 0x89 ==> MOV reg -> mem
}

static inline enum opsize
decode_mov_op_size (struct guest *g, uint8_t *code)
{
    /*
    printf("EFER: 0x%lx\n", amd_vmcb_efer_rd_raw(&g->vmcb));
    printf("Code: 0x%lx\n", *((uint64_t *)code));
    printf("Code[0]: 0x%x, Code[1]: 0x%x, Code[2]: 0x%x, Code[3]: 0x%x\n", code[0],code[1],code[2],code[3]);
    printf("Guest EAX: 0x%x\n", guest_get_eax(g));
    printf("Guest EBX: 0x%x\n", guest_get_ebx(g));
    printf("Guest ECX: 0x%x\n", guest_get_ecx(g));

    printf("Guest EDX: 0x%x\n", guest_get_edx(g));
    printf("Guest RDI: 0x%lx\n", guest_get_rdi(g));
    printf("Guest RSI: 0x%lx\n", guest_get_rsi(g));
    printf("Guest RSP: 0x%lx\n", guest_get_rsp(g));
    printf("Guest RBP: 0x%lx\n", guest_get_rbp(g));
    */

    // we only support long mode for now
    //assert(amd_vmcb_efer_rd(&g->vmcb).lma == 1);

    // only a REX prefix with REX.W (bit 3) set selects a 64-bit operand;
    // 0x40 is set in every REX prefix, so it must not be used as the test
    if ((code[0] >> 4) == 0x4 && (code[0] & 0x08)) {
        return OPSIZE_64;
    }
    return OPSIZE_32;
}


static inline uint64_t
decode_mov_src_val (struct guest *g, uint8_t *code) {
    // we only support long mode for now
    //assert(amd_vmcb_efer_rd(&g->vmcb).lma == 1);

    // check for the REX prefix
    if ((code[0] >> 4) == 0x4) {
        code++;
    }

    // we only support one variant for now
    assert(code[0] == 0x89);

    union x86_modrm modrm = { .raw = code[1] };
    return get_reg_val_by_reg_num(g, modrm.u.regop);
}


static inline void
decode_mov_dest_val (struct guest *g, uint8_t *code, uint64_t val)
{
    // we only support long mode for now
    //assert(amd_vmcb_efer_rd(&g->vmcb).lma == 1);

    // check for the REX prefix
    if ((code[0] >> 4) == 0x4) {
        code++;
    }

    // we only support one variant for now
    assert(code[0] == 0x8b);

    union x86_modrm modrm = { .raw = code[1] };
    set_reg_val_by_reg_num(g, modrm.u.regop, val);
}

static int
handle_vmexit_npf (struct guest *g) {
    int r;
#ifdef CONFIG_SVM
    uint64_t fault_addr = amd_vmcb_exitinfo2_rd(&g->vmcb);
    uint64_t guest_rip = amd_vmcb_rip_rd(&g->vmcb);
#else
    uint64_t fault_addr, guest_rip;
    errval_t err = invoke_dispatcher_vmread(g->dcb_cap, VMX_GPADDR_F, &fault_addr);
    err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_RIP, &guest_rip);
    assert(err_is_ok(err));
#endif
    invoke_dispatcher_dump_ptables(g->dcb_cap, 0);
    debug_printf("handling guest page fault on 0x%lx, IP 0x%lx\n",
                 fault_addr, guest_rip);
    uint8_t *code = NULL;

    if (vspace_get_region(g->vspace, (void*)fault_addr) != NULL) {
        USER_PANIC("NPF vmexit on address that's mapped in EPT\n");
    }

    USER_PANIC("npf handling NYI for Arrakis guest!\n");
    // check for a fault inside the guest physical memory region
    if (fault_addr >= g->mem_low_va && fault_addr < g->mem_high_va) {
        // allocate the missing memory
        alloc_guest_mem(g, fault_addr & ~BASE_PAGE_MASK, BASE_PAGE_SIZE);
        // do not advance the RIP; it is safe (and necessary) to
        // replay the faulting instruction
        return HANDLER_ERR_OK;
    }

    // fetch the location of the code
    r = get_instr_arr(g, &code);
    assert(r == 0);

    // virtual devices
    switch (fault_addr & ~BASE_PAGE_MASK) {
    case APIC_BASE: {
        uint64_t val;
        enum opsize size;

        assert(g->apic != NULL);
        size = decode_mov_op_size(g, code);
        if (decode_mov_is_write(g, code)) {
            val = decode_mov_src_val(g, code);
            r = apic_handle_mmio_write(g->apic, fault_addr, size, val);
            assert(r == 0);
        } else {
            r = apic_handle_mmio_read(g->apic, fault_addr, size, &val);
            assert(r == 0);
            decode_mov_dest_val(g, code, val);
        }

        // advance the rip beyond the instruction
#ifdef CONFIG_SVM
        amd_vmcb_rip_wr(&g->vmcb, guest_rip +
                        decode_mov_instr_length(g, code));
#else
        err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_GUEST_RIP, guest_rip +
                                         decode_mov_instr_length(g, code));
        assert(err_is_ok(err));
#endif
        return HANDLER_ERR_OK;
    }
    }

    printf("arrakismon: access to an unknown memory location: %lx\n", fault_addr);
    return handle_vmexit_unhandeled(g);
}

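/*
 * Illustrative example for the decode_mov_* helpers above; a sketch for
 * documentation only, not wired into any handler (the byte sequence is a
 * hypothetical instruction, not guest-derived data). The encoding
 * 48 89 08 is "mov [rax], rcx": 0x48 is a REX prefix with W set, 0x89 is
 * MOV reg -> mem, and ModR/M 0x08 has mod=0, regop=1 (RCX), rm=0 ([RAX]).
 */
__attribute__((unused))
static void
decode_mov_example (struct guest *g)
{
    uint8_t example[] = { 0x48, 0x89, 0x08 };

    // 3 bytes: REX prefix + opcode + ModR/M, no displacement, no SIB
    assert(decode_mov_instr_length(g, example) == 3);
    // 0x89 moves from a register to memory
    assert(decode_mov_is_write(g, example));
    // REX.W selects a 64-bit operand
    assert(decode_mov_op_size(g, example) == OPSIZE_64);
    // the source register is RCX (regop == 1)
    assert(decode_mov_src_val(g, example) == guest_get_rcx(g));
}
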
typedef int (*vmexit_handler)(struct guest *g);

#ifdef CONFIG_SVM
static vmexit_handler vmexit_handlers[0x8c] = {
    [SVM_VMEXIT_CR0_READ] = handle_vmexit_cr_access,
    [SVM_VMEXIT_CR0_WRITE] = handle_vmexit_cr_access,
    [SVM_VMEXIT_CR0_SEL_WRITE] = handle_vmexit_cr_access,
    [SVM_VMEXIT_SWINT] = handle_vmexit_swint,
    [SVM_VMEXIT_IDTR_WRITE] = handle_vmexit_ldt,
    [SVM_VMEXIT_GDTR_WRITE] = handle_vmexit_ldt,
    [SVM_VMEXIT_IOIO] = handle_vmexit_ioio,
    [SVM_VMEXIT_MSR] = handle_vmexit_msr,
    [SVM_VMEXIT_CPUID] = handle_vmexit_cpuid,
    [SVM_VMEXIT_VMMCALL] = handle_vmexit_vmmcall,
    [SVM_VMEXIT_HLT] = handle_vmexit_hlt
};
#else
static vmexit_handler vmexit_handlers[0x8c] = {
    [VMX_EXIT_REASON_CPUID] = handle_vmexit_cpuid,
    [VMX_EXIT_REASON_HLT] = handle_vmexit_hlt,
    [VMX_EXIT_REASON_VMCALL] = handle_vmexit_vmmcall,
    [VMX_EXIT_REASON_CR_ACCESS] = handle_vmexit_cr_access,
    [VMX_EXIT_REASON_INOUT] = handle_vmexit_ioio,
    [VMX_EXIT_REASON_RDMSR] = handle_vmexit_msr,
    [VMX_EXIT_REASON_WRMSR] = handle_vmexit_msr,
    [VMX_EXIT_REASON_GDTR_IDTR] = handle_vmexit_ldt,
    [VMX_EXIT_REASON_EPT_FAULT] = handle_vmexit_npf,
    [VMX_EXIT_REASON_SWINT] = handle_vmexit_swint
};
#endif

void
guest_handle_vmexit (struct guest *g) {
    vmexit_handler handler;
#ifdef CONFIG_SVM
    uint64_t exitcode = amd_vmcb_exitcode_rd(&g->vmcb);
    if (exitcode == SVM_VMEXIT_NPF) {
        handler = handle_vmexit_npf;
    } else if (LIKELY(vmexit_handlers[exitcode] != NULL)) {
        handler = vmexit_handlers[exitcode];
    } else {
        handle_vmexit_unhandeled(g);
        return;
    }
#else
    if (!g->emulated_before_exit) {
        // saved_exit_reason is only 16 bits wide; read the exit reason into
        // a local uint64_t and truncate, rather than writing 64 bits through
        // a cast pointer
        uint64_t exit_reason;
        errval_t err = invoke_dispatcher_vmread(g->dcb_cap, VMX_EXIT_REASON,
                                                &exit_reason);
        if (err_is_fail(err)) {
            DEBUG_ERR(err, "vmread exit_reason");
        }
        assert(err_is_ok(err));
        saved_exit_reason = (uint16_t)exit_reason;
    }

    if (LIKELY(vmexit_handlers[saved_exit_reason] != NULL)) {
        handler = vmexit_handlers[saved_exit_reason];
    } else {
        handle_vmexit_unhandeled(g);
        return;
    }
#endif
    int r = handler(g);
    if (LIKELY(r == HANDLER_ERR_OK)) {
        if (g->runnable) {
            guest_make_runnable(g, true);
        }
    }
}