1
2/**
3 * \file
4 */
5
6/*
7 * Copyright (c) 2009, 2010, 2013, ETH Zurich.
8 * All rights reserved.
9 *
10 * This file is distributed under the terms in the attached LICENSE file.
11 * If you do not find this file, copies can be found by writing to:
12 * ETH Zurich D-INFK, Universitaetstrasse 6, CH-8092 Zurich. Attn: Systems Group.
13 */
14
15#include <stdlib.h>
16#include <string.h>
17#include "vmkitmon.h"
18#include <barrelfish/barrelfish.h>
19#include <barrelfish/lmp_endpoints.h>
20#include <barrelfish/lmp_chan.h>
21#include <barrelfish/dispatcher_arch.h>
22#include <barrelfish/memobj.h>
23#include <barrelfish/vregion.h>
24#include <barrelfish/vspace.h>
25
26#include "x86.h"
27#ifdef CONFIG_SVM
28#include "svm.h"
29#endif
30#include "paging.h"
31//#include "realmode.h"
32#include "hdd.h"
33#include "console.h"
34#include "pc16550d.h"
35#include "apic.h"
36#include "lpc.h"
37#include "pci.h"
38#include "pci_host.h"
39
40#define ARRAKIS_USE_NESTED_PAGING
41//#define EPT_FINE_GRAINED
42
43#define VMCB_SIZE       0x1000      // 4KB
44#ifdef CONFIG_SVM
45#define IOPM_SIZE       0x3000      // 12KB
46#define MSRPM_SIZE      0x2000      // 8KB
47#else
48#define IOBMP_A_SIZE    0x1000      // 4KB
49#define IOBMP_B_SIZE    0x1000      // 4KB
50#define MSRPM_SIZE      0x1000      // 4KB
51#endif
52#define RM_MEM_SIZE     (0x100000 + BASE_PAGE_SIZE)    // 1MB + A20 gate space
53
54#define APIC_BASE       0xfee00000
55
56#define VREGION_FLAGS_ALL (VREGION_FLAGS_READ_WRITE | VREGION_FLAGS_EXECUTE)
57
58// list of guests
59struct guest *guests = NULL;
60
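/* Translate vregion mapping flags into x86-64 page-table entry flags.
 * Guard regions get no access permissions; otherwise the write, execute and
 * no-cache flags are mapped onto the corresponding PTE bits. Entries are
 * always marked user-accessible and default to execute-disable. */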
61static paging_x86_64_flags_t vregion_to_pmap_flag(vregion_flags_t vregion_flags)
62{
63    paging_x86_64_flags_t pmap_flags =
64        PTABLE_USER_SUPERVISOR | PTABLE_EXECUTE_DISABLE;
65
66    if (!(vregion_flags & VREGION_FLAGS_GUARD)) {
67        if (vregion_flags & VREGION_FLAGS_WRITE) {
68            pmap_flags |= PTABLE_READ_WRITE;
69        }
70        if (vregion_flags & VREGION_FLAGS_EXECUTE) {
71            pmap_flags &= ~PTABLE_EXECUTE_DISABLE;
72        }
73        if (vregion_flags & VREGION_FLAGS_NOCACHE) {
74            pmap_flags |= PTABLE_CACHE_DISABLED;
75        }
76    }
77
78    return pmap_flags;
79}
80
81#ifndef CONFIG_SVM
82extern uint16_t saved_exit_reason;
83extern uint64_t saved_exit_qual, saved_rip;
84
85// List of MSRs that are saved on VM-exit and loaded on VM-entry.
86static uint32_t msr_list[VMX_MSR_COUNT] =
87    {X86_MSR_KERNEL_GS_BASE, X86_MSR_STAR, X86_MSR_LSTAR, X86_MSR_CSTAR, X86_MSR_SFMASK};
88
89// Saved priority of the most recent irq that is asserted.
90uint8_t interrupt_priority = 0;
91#endif
92
93#ifndef CONFIG_SVM
94static inline int vmx_guest_msr_index(uint32_t msr_index)
95{
96    for (int i = 0; i < VMX_MSR_COUNT; i++) {
97        if (msr_list[i] == msr_index) {
98            return i;
99	}
100    }
101    return -1;
102}
103
104__attribute__((unused))
105static void initialize_guest_msr_area(struct guest *g)
106{
107    struct msr_entry *guest_msr_area = (struct msr_entry *)g->msr_area_va;
108
109    // The values of the MSRs in the guest MSR area are all set to 0.
110    for (int i = 0; i < VMX_MSR_COUNT; i++) {
111        guest_msr_area[i].index = msr_list[i];
112	guest_msr_area[i].val = 0x0;
113    }
114
115    errval_t err = invoke_dispatcher_vmwrite(g->dcb_cap, VMX_EXIT_MSR_STORE_F, g->msr_area_pa);
116    err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_EXIT_MSR_STORE_CNT, VMX_MSR_COUNT);
117    err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_ENTRY_MSR_LOAD_F, g->msr_area_pa);
118    err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_ENTRY_MSR_LOAD_CNT, VMX_MSR_COUNT);
119    assert(err_is_ok(err));
120}
121#endif
122
123lvaddr_t guest_offset = 0;
124
125/// stores the last used guest ASID
126static uint32_t last_guest_asid = 0;
127
// FIXME: this is somewhat broken by design... we should emit proper exceptions
//        to the guest instead of just halting the VM
130#define guest_assert(g, e) \
131    ((e) ? (void)0 : (handle_vmexit_unhandeled(g), assert(e)))
132
133static errval_t
134guest_slot_alloc(struct guest *g, struct capref *ret)
135{
136    return g->slot_alloc.a.alloc(&g->slot_alloc.a, ret);
137}
138
139errval_t guest_vspace_map_wrapper(struct vspace *vspace, lvaddr_t vaddr,
140                                  struct capref frame,  size_t size)
141{
142    debug_printf("%s:%d\n",__FUNCTION__, __LINE__);
143    errval_t err;
144    struct vregion *vregion = NULL;
145    struct memobj_one_frame *memobj = NULL;
146
147    debug_printf("%s:%d\n",__FUNCTION__, __LINE__);
148    // Allocate space
149    vregion = malloc(sizeof(struct vregion));
150    if (!vregion) {
151        err = LIB_ERR_MALLOC_FAIL;
152        goto error;
153    }
154    debug_printf("%s:%d\n",__FUNCTION__, __LINE__);
155    memobj = malloc(sizeof(struct memobj_one_frame));
156    if (!memobj) {
157        err = LIB_ERR_MALLOC_FAIL;
158        goto error;
159    }
160
161    debug_printf("%s:%d\n",__FUNCTION__, __LINE__);
162    // Create the objects
163    err = memobj_create_one_frame(memobj, size, 0);
164    if (err_is_fail(err)) {
165        err = err_push(err, LIB_ERR_MEMOBJ_CREATE_ANON);
166        goto error;
167    }
168    debug_printf("%s:%d\n",__FUNCTION__, __LINE__);
169    err = memobj->m.f.fill(&memobj->m, 0, frame, size);
170    if (err_is_fail(err)) {
171        err = err_push(err, LIB_ERR_MEMOBJ_FILL);
172        goto error;
173    }
174    debug_printf("%s:%d\n",__FUNCTION__, __LINE__);
175    debug_printf("mapping guest vregion (%p) in guest vspace (%p) at 0x%lx, size 0x%lx\n",
176            vregion, vspace, vaddr, size);
177    debug_printf("current regions in guest vspace:\n");
178    for (struct vregion *v = vspace->head; v; v = v->next) {
179        debug_printf("   0x%lx, 0x%lx\n", v->base, v->size);
180    }
181    err = vregion_map_fixed(vregion, vspace, &memobj->m, 0, size, vaddr,
182                            VREGION_FLAGS_READ | VREGION_FLAGS_WRITE | VREGION_FLAGS_EXECUTE);
183    if (err_is_fail(err)) {
184        err = err_push(err, LIB_ERR_VSPACE_MAP);
185        goto error;
186    }
187    debug_printf("%s:%d\n",__FUNCTION__, __LINE__);
188    err = memobj->m.f.pagefault(&memobj->m, vregion, 0, 0);
189    if (err_is_fail(err)) {
190        err = err_push(err, LIB_ERR_MEMOBJ_PAGEFAULT_HANDLER);
191        goto error;
192    }
193    debug_printf("mapped %zu bytes at 0x%"PRIxGENVADDR"\n", size, vaddr);
194
195    return SYS_ERR_OK;
196
197error: // XXX: proper cleanup
198    if (vregion) {
199        free(vregion);
200    }
201    if (memobj) {
202        free(memobj);
203    }
204    return err;
205}
206
207#define GUEST_VSPACE_SIZE (1ULL<<39) // 512 GB
208
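/* Map a frame of guest memory into the monitor's own vspace.
 * On first use this lazily reserves a GUEST_VSPACE_SIZE window backed by an
 * anonymous memobj and records its base address in guest_offset (used by
 * guest_to_host() for address translation); subsequent calls just fill and
 * fault in the given frame at the requested offset. */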
209static errval_t vspace_map_wrapper(lvaddr_t vaddr, struct capref frame,
210                                   size_t size)
211{
212    debug_printf("%s:%d\n",__FUNCTION__, __LINE__);
213    errval_t err;
214    static struct memobj_anon *memobj = NULL;
215    static struct vregion *vregion = NULL;
216    static bool initialized = false;
217
218    debug_printf("%s:%d\n",__FUNCTION__, __LINE__);
219    if (!initialized) {
220        debug_printf("%s:%d\n",__FUNCTION__, __LINE__);
221        // Allocate space
222        memobj = malloc(sizeof(struct memobj_anon));
223        if (!memobj) {
224            return LIB_ERR_MALLOC_FAIL;
225        }
226        debug_printf("%s:%d\n",__FUNCTION__, __LINE__);
227        vregion = malloc(sizeof(struct vregion));
228        if (!vregion) {
229            return LIB_ERR_MALLOC_FAIL;
230        }
231
232        debug_printf("%s:%d\n",__FUNCTION__, __LINE__);
233        // Create a memobj and vregion
234        err = memobj_create_anon(memobj, GUEST_VSPACE_SIZE, 0);
235        if (err_is_fail(err)) {
236            return err_push(err, LIB_ERR_MEMOBJ_CREATE_ANON);
237        }
238        debug_printf("%s:%d\n",__FUNCTION__, __LINE__);
239        err = vregion_map(vregion, get_current_vspace(), &memobj->m, 0,
240                          GUEST_VSPACE_SIZE, VREGION_FLAGS_READ_WRITE);
241        if (err_is_fail(err)) {
242            return err_push(err, LIB_ERR_VREGION_MAP);
243        }
244
245        debug_printf("%s:%d\n",__FUNCTION__, __LINE__);
246        guest_offset = vregion_get_base_addr(vregion);
247        debug_printf("guest_offset = 0x%lx\n", guest_offset);
248        initialized = true;
249    }
250
251    debug_printf("%s:%d\n",__FUNCTION__, __LINE__);
252    // Create mapping
253    err = memobj->m.f.fill(&memobj->m, vaddr, frame, size);
254    if (err_is_fail(err)) {
255        return err_push(err, LIB_ERR_MEMOBJ_FILL);
256    }
257    debug_printf("%s:%d\n",__FUNCTION__, __LINE__);
258    err = memobj->m.f.pagefault(&memobj->m, vregion, vaddr, 0);
259    if (err_is_fail(err)) {
260        return err_push(err, LIB_ERR_MEMOBJ_PAGEFAULT_HANDLER);
261    }
262
263    return SYS_ERR_OK;
264}
265// allocates some bytes of memory for the guest starting at a specific addr
266// also performs the mapping into the vspace of the monitor
267errval_t
268alloc_guest_mem(struct guest *g, lvaddr_t guest_paddr, size_t bytes)
269{
270    errval_t err;
271
    // only allow multiples of the page size to be allocated
    assert(bytes > 0 && (bytes & BASE_PAGE_MASK) == 0);
    // do not allow allocation outside of the guest's physical memory
    assert(guest_paddr + bytes <= g->mem_high_va);
276
277    debug_printf("%s:%d\n",__FUNCTION__, __LINE__);
278    // Allocate frame
279    struct capref cap;
280    err = guest_slot_alloc(g, &cap);
281    if (err_is_fail(err)) {
282        return err_push(err, LIB_ERR_SLOT_ALLOC);
283    }
284    debug_printf("%s:%d\n",__FUNCTION__, __LINE__);
285    err = frame_create(cap, bytes, NULL);
286    if (err_is_fail(err)) {
287        return err_push(err, LIB_ERR_FRAME_CREATE);
288    }
289
290    debug_printf("%s:%d\n",__FUNCTION__, __LINE__);
291    // Map into the guest vspace
292    err = guest_vspace_map_wrapper(g->vspace, guest_paddr, cap, bytes);
293    if (err_is_fail(err)) {
294        return err;
295    }
296
297    debug_printf("%s:%d\n",__FUNCTION__, __LINE__);
298    // Create a copy of the capability to map in our vspace
299    struct capref host_cap;
300    err = slot_alloc(&host_cap);
301    if (err_is_fail(err)) {
302        return err;
303    }
304    err = cap_copy(host_cap, cap);
305    if (err_is_fail(err)) {
306        return err;
307    }
308
309    debug_printf("%s:%d\n",__FUNCTION__, __LINE__);
310    // Map into my vspace
311    debug_printf("mapping into our vspace at 0x%lx\n", guest_to_host(guest_paddr));
312    err = vspace_map_wrapper(guest_to_host(guest_paddr), host_cap, bytes);
313    if (err_is_fail(err)) {
314        return err;
315    }
316
317    debug_printf("%s:%d\n",__FUNCTION__, __LINE__);
318    struct frame_identity frameid = { .base = 0, .bytes = 0 };
319    errval_t r = frame_identify(cap, &frameid);
320    assert(err_is_ok(r));
321    debug_printf("alloc_guest_mem: frameid.base: 0x%lx, frameid.bytes: %zu,"
322            " g->mem_low_va: 0x%lx, g->mem_high_va: 0x%lx\n",
323            frameid.base, frameid.bytes, g->mem_low_va, g->mem_high_va);
324
325    return SYS_ERR_OK;
326}
327
328static void
329initialize_iopm (struct guest *self) {
330    // intercept all IO port accesses (for now)
331#ifdef CONFIG_SVM
332    memset((void*)self->iopm_va, 0xFF, IOPM_SIZE);
333#else
334    memset((void*)self->iobmp_a_va, 0xFF, IOBMP_A_SIZE);
335    memset((void*)self->iobmp_b_va, 0xFF, IOBMP_B_SIZE);
336#endif
337}
338
// access_mode: 0 = allow all accesses, 1 = intercept reads, 2 = intercept writes, 3 = intercept both
340static inline void
341set_msr_access (struct guest *g, uint32_t msr, int access_mode)
342{
343    assert(access_mode >= 0 && access_mode <= 3);
344
    // a region of 2K bytes holds the access bits of 8K MSRs, therefore each
    // MSR takes two bits (one for rdmsr and one for wrmsr)
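    // Worked example (illustrative): for MSR 0xc0000081 (STAR), the low
    // 16 bits are 0x0081, so byte_offset = 0x81/4 = 0x20 and
    // bit_offset = (0x81 % 4) * 2 = 2; the 0xc0000000 range then adds
    // 0x800, giving byte_offset 0x820 within the MSRPM.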
347    uintptr_t byte_offset = (msr & 0xffff) / 4;
348    int bit_offset = ((msr & 0xffff) % 4) * 2;
349
350    if (msr < 0x2000) {
351        // do nothing
352    } else if (msr >= 0xc0000000 && msr < 0xc0002000) {
353        byte_offset += 0x800;
354    } else if (msr >= 0xc0010000 && msr < 0xc0012000) {
355        byte_offset += 0x1000;
356    } else {
357        assert(!"not reached");
358    }
359
360    assert(byte_offset < MSRPM_SIZE);
361
362    // read the byte holding the relevant bits
363    uint8_t val = *(uint8_t *)(g->msrpm_va + byte_offset);
364    // set the access params according to the arguments
365    val = (val & ~(0x3 << bit_offset)) | (access_mode << bit_offset);
366    // store the modified value back in the map
367    *(uint8_t *)(g->msrpm_va + byte_offset) = val;
368
369    //printf("MSR: msr %x, byte_offset %lx, bit_offset %x, val %x\n", msr, byte_offset, bit_offset, val);
370}
371
372static void
373initialize_msrpm (struct guest *g) {
374    // intercept all MSR accesses (for now)
375    memset((void*)g->msrpm_va, 0xff, MSRPM_SIZE);
376
377#if 0
    // allow performance counter and event MSR accesses
379    set_msr_access (g, 0xc0010000, 0);
380    set_msr_access (g, 0xc0010001, 0);
381    set_msr_access (g, 0xc0010002, 0);
382    set_msr_access (g, 0xc0010003, 0);
383    set_msr_access (g, 0xc0010004, 0);
384    set_msr_access (g, 0xc0010005, 0);
385    set_msr_access (g, 0xc0010006, 0);
386    set_msr_access (g, 0xc0010007, 0);
387#endif
388}
389
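/* Helper macros to load flat default segment descriptors into the VMCB:
 * data and system segments get a read/write data type, code segments an
 * execute/read type with the long-mode (L) bit set; all use base 0 and a
 * 4 GiB limit. */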
390#define INIT_DATA_SEGREG(vmcb,x)                 \
391do {                                             \
392    amd_vmcb_seg_attrib_t __sa = {               \
393        .segtype = 2,                            \
394        .s = 1,                                  \
395        .dpl = 0,				 \
396        .p = 1,                                  \
397        .l = 0,					 \
398        .db = 1,				 \
399        .g = 1,					 \
400    };                                           \
401    amd_vmcb_##x## _attrib_wr((vmcb), __sa);     \
402    amd_vmcb_##x## _selector_wr((vmcb), 0x10);   \
403    amd_vmcb_##x## _base_wr((vmcb), 0x0);        \
404    amd_vmcb_##x## _limit_wr((vmcb), 0xffffffff);   \
405} while (0)
406
407#define INIT_CODE_SEGREG(vmcb,x)                 \
408do {                                             \
409    amd_vmcb_seg_attrib_t __sa = {               \
410        .segtype = 0xa,                          \
411        .s = 1,                                  \
412        .dpl = 0,				 \
413        .p = 1,                                  \
414        .l = 1,					 \
415        .db = 0,				 \
416        .g = 1,					 \
417    };                                           \
418    amd_vmcb_##x## _attrib_wr((vmcb), __sa);     \
419    amd_vmcb_##x## _selector_wr((vmcb), 8); 	 \
420    amd_vmcb_##x## _base_wr((vmcb), 0x0);        \
421    amd_vmcb_##x## _limit_wr((vmcb), 0xffffffff);   \
422} while (0)
423
424#define INIT_SYS_SEGREG(vmcb,x)                  \
425do {                                             \
426    amd_vmcb_seg_attrib_t __sa = {               \
427        .segtype = 2,                            \
428        .s = 1,                                  \
429        .dpl = 0,				 \
430        .p = 1,                                  \
431        .l = 0,					 \
432        .db = 1,				 \
433        .g = 1,					 \
434    };                                           \
435    amd_vmcb_##x## _attrib_wr((vmcb), __sa);     \
436    amd_vmcb_##x## _selector_wr((vmcb), 0x10);   \
437    amd_vmcb_##x## _base_wr((vmcb), 0x0);        \
438    amd_vmcb_##x## _limit_wr((vmcb), 0xffffffff);   \
439} while (0)
440
441#ifdef CONFIG_SVM
/* This method initializes a new VMCB memory region and sets the initial
 * machine state as defined by the AMD64 architecture specification */
444static void
445initialize_vmcb (struct guest *self) {
446    amd_vmcb_initialize(&self->vmcb, (mackerel_addr_t)self->vmcb_va);
447
448    // 1. Initialize intercepts
449
    /* For now we intercept just about everything */
451
452    amd_vmcb_cr_access_wr_raw(&self->vmcb, ~0u);
453    amd_vmcb_cr_access_rdcr2_wrf(&self->vmcb, 0);
454    amd_vmcb_cr_access_wrcr2_wrf(&self->vmcb, 0);
455    amd_vmcb_cr_access_rdcr4_wrf(&self->vmcb, 0);
456    amd_vmcb_cr_access_wrcr4_wrf(&self->vmcb, 0);
457
458    // FIXME: ignoring DR accesses may be insecure
459    //amd_vmcb_dr_access_wr_raw(&self->vmcb, ~0u);
460    amd_vmcb_exceptions_wr_raw(&self->vmcb, 0);
461    /* amd_vmcb_exceptions_vector7_wrf(&self->vmcb, 0); */
462    /* amd_vmcb_exceptions_vector14_wrf(&self->vmcb, 0); */
463
464    amd_vmcb_intercepts_wr_raw(&self->vmcb, 0x1fffffffffff);
465    amd_vmcb_intercepts_pushf_wrf(&self->vmcb, 0);
466    amd_vmcb_intercepts_popf_wrf(&self->vmcb, 0);
467    amd_vmcb_intercepts_invlpg_wrf(&self->vmcb, 0);
468    amd_vmcb_intercepts_rdtsc_wrf(&self->vmcb, 0);
469    amd_vmcb_intercepts_rdtscp_wrf(&self->vmcb, 0);
470    amd_vmcb_intercepts_iret_wrf(&self->vmcb, 0);
471    amd_vmcb_intercepts_wbinvd_wrf(&self->vmcb, 0);
472    amd_vmcb_intercepts_pause_wrf(&self->vmcb, 0);
473    amd_vmcb_intercepts_vintr_wrf(&self->vmcb, 0);
474
475    // 2. Setup some config fields
476
    // physical addresses of the IOPM and MSRPM
478    amd_vmcb_iopm_base_pa_wr(&self->vmcb, self->iopm_pa);
479    amd_vmcb_msrpm_base_pa_wr(&self->vmcb, self->msrpm_pa);
480    // assign guest ASID
481    // FIXME: use real asid allocator. BF does not know about tagged TLBs atm
482    amd_vmcb_tlb_guest_asid_wrf(&self->vmcb, ++last_guest_asid);
483    // enable virtual intr masking
484    amd_vmcb_vintr_vintr_masking_wrf(&self->vmcb, 1);
485    // enable nested paging
486    amd_vmcb_np_enable_wrf(&self->vmcb, 1);
487
    /* 3. Guest state initialization
     * according to Intel's Manual 3A, Table 9-1. */
490
    // Bit 1 of RFLAGS must be set; bit 21 (ID) indicates that the CPUID
    // instruction is supported.
493    amd_vmcb_rflags_wr_raw(&self->vmcb, 0x00200002);
494    amd_vmcb_rip_wr(&self->vmcb, 0x0000fff0);
495    amd_vmcb_cr0_wr_raw(&self->vmcb, 0x60000010);
496
497    INIT_CODE_SEGREG(&self->vmcb, cs);
498    INIT_DATA_SEGREG(&self->vmcb, ss);
499    INIT_DATA_SEGREG(&self->vmcb, ds);
500    INIT_DATA_SEGREG(&self->vmcb, es);
501    INIT_DATA_SEGREG(&self->vmcb, fs);
502    INIT_DATA_SEGREG(&self->vmcb, gs);
503
504    INIT_SYS_SEGREG(&self->vmcb, gdtr);
505    INIT_SYS_SEGREG(&self->vmcb, idtr);
506    INIT_SYS_SEGREG(&self->vmcb, ldtr);
507    INIT_SYS_SEGREG(&self->vmcb, tr);
508
509    amd_vmcb_dr6_wr(&self->vmcb, 0xffff0ff0);
510    amd_vmcb_dr7_wr(&self->vmcb, 0x00000400);
511
512    // taken from the linux SVM source
513    amd_vmcb_gpat_wr(&self->vmcb, 0x0007040600070406ul);
514
515    // svm requires guest EFER.SVME to be set
516    amd_vmcb_efer_svme_wrf(&self->vmcb, 1);
517}
518
519#endif
520
521#ifdef EPT_FINE_GRAINED
522static
523errval_t ept_map_one_frame_fixed_attr(struct guest *g, lvaddr_t addr, size_t size,
524                                    struct capref frame, vregion_flags_t flags,
525                                    struct memobj **retmemobj,
526                                    struct vregion **retvregion)
527{
528    errval_t err1, err2;
529    struct memobj *memobj   = NULL;
530    struct vregion *vregion = NULL;
531
532    size = ROUND_UP(size, BASE_PAGE_SIZE);
533
534    // Allocate space
535    memobj = malloc(sizeof(struct memobj_one_frame));
536    if (!memobj) {
537        err1 = LIB_ERR_MALLOC_FAIL;
538        goto error;
539    }
540    vregion = malloc(sizeof(struct vregion));
541    if (!vregion) {
542        err1 = LIB_ERR_MALLOC_FAIL;
543        goto error;
544    }
545
546    // Create mappings
547    err1 = memobj_create_one_frame((struct memobj_one_frame*)memobj, size, 0);
548    if (err_is_fail(err1)) {
549        err1 = err_push(err1, LIB_ERR_MEMOBJ_CREATE_ONE_FRAME);
550        goto error;
551    }
552
553    err1 = memobj->f.fill(memobj, 0, frame, size);
554    if (err_is_fail(err1)) {
555        err1 = err_push(err1, LIB_ERR_MEMOBJ_FILL);
556        goto error;
557    }
558
559    err1 = vregion_map_fixed(vregion, g->vspace, memobj, 0, size, addr, flags);
560    if (err_is_fail(err1)) {
561        err1 = err_push(err1, LIB_ERR_VREGION_MAP);
562        goto error;
563    }
564
565    err1 = memobj->f.pagefault(memobj, vregion, 0, 0);
566    if (err_is_fail(err1)) {
567        err1 = err_push(err1, LIB_ERR_MEMOBJ_PAGEFAULT_HANDLER);
568        goto error;
569    }
570
571    if (retmemobj) {
572        *retmemobj = memobj;
573    }
574    if (retvregion) {
575        *retvregion = vregion;
576    }
577    return SYS_ERR_OK;
578
579 error:
580    DEBUG_ERR(err1, "in %s", __FUNCTION__);
581    if (memobj) {
582        err2 = memobj_destroy_one_frame(memobj);
583        if (err_is_fail(err2)) {
584            DEBUG_ERR(err2, "memobj_destroy_anon failed");
585        }
586    }
587    if (vregion) {
588        err2 = vregion_destroy(vregion);
589        if (err_is_fail(err2)) {
590            DEBUG_ERR(err2, "vregion_destroy failed");
591        }
592    }
593    return err1;
594}
595
596static void ept_map(struct guest *g, struct capref cap)
597{
598    errval_t err;
599    struct capref ept_copy;
600    err = guest_slot_alloc(g, &ept_copy);
601    assert(err_is_ok(err));
602    err = cap_copy(ept_copy, cap);
603    assert(err_is_ok(err));
604
605    struct frame_identity fi;
606    err = frame_identify(ept_copy, &fi);
607
608    printf("%s: creating identity mapping for 0x%"PRIxGENPADDR", %lu bytes\n",
609            __FUNCTION__, fi.base, fi.bytes);
610
611    err = ept_map_one_frame_fixed_attr(g, fi.base, fi.bytes,
612            ept_copy, VREGION_FLAGS_READ_WRITE | VREGION_FLAGS_EXECUTE,
613            NULL, NULL);
614    assert(err_is_ok(err));
615}
616
617static void ept_map_vnode(struct guest *g, struct vnode *v)
618{
619    assert(v->v.is_vnode);
620
621    ept_map(g, v->v.cap);
622
623    for (int i = 0; i < PTABLE_SIZE; i++) {
624        struct vnode *c = v->u.vnode.children[i];
625        if (c && c->v.is_vnode) {
626            ept_map_vnode(g, c);
627        }
628    }
629}
630#endif
631
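/* LMP endpoint handler: invoked whenever a VM-exit is signalled on the
 * guest's monitor endpoint. It consumes the (empty) message, dispatches to
 * guest_handle_vmexit() and re-registers itself on the default waitset. */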
632static void
633idc_handler(void *arg)
634{
635    struct guest *g = arg;
636    errval_t err;
637
638    // consume message
639    struct lmp_recv_buf buf = { .buflen = 0 };
640    err = lmp_endpoint_recv(g->monitor_ep, &buf, NULL);
641    assert(err_is_ok(err));
642
643    // run real handler
644    guest_handle_vmexit(g);
645
646    // re-register
647    struct event_closure cl = {
648        .handler = idc_handler,
649        .arg = arg,
650    };
651    err = lmp_endpoint_register(g->monitor_ep, get_default_waitset(), cl);
652    assert(err_is_ok(err));
653}
654
655
656extern errval_t vspace_add_vregion(struct vspace *vspace, struct vregion *region);
657errval_t get_pdpt(struct pmap_x86 *pmap, genvaddr_t base,
658                                struct vnode **pdpt);
659errval_t get_pdir(struct pmap_x86 *pmap, genvaddr_t base,
660                                struct vnode **pdir);
661extern errval_t get_ptable(struct pmap_x86 *pmap, genvaddr_t base,
662                           struct vnode **ptable);
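/* Force an identity mapping of the given memory capability into the guest's
 * nested page table, bypassing the memobj/vregion machinery: the region is
 * reserved in the guest vspace and, depending on its size and alignment,
 * mapped with 1 GiB, 2 MiB or 4 KiB pages directly via vnode_map(). */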
663static void ept_force_mapping(struct guest *g, struct capref mem)
664{
665    errval_t err;
666    struct frame_identity fi;
667
668    // get info about memory
669    err = frame_identify(mem, &fi);
670    if (err_is_fail(err)) {
671        DEBUG_ERR(err, "id mem cap\n");
672    }
673    assert(err_is_ok(err));
674
675    printf("%s: creating identity mapping for 0x%"PRIxGENPADDR", %lu bytes\n",
676            __FUNCTION__, fi.base, fi.bytes);
677
678    // mark off region in vspace
679    struct vregion *v = malloc(sizeof(*v));
680    v->base = fi.base;
681    v->size = fi.bytes;
682    err = vspace_add_vregion(g->vspace, v);
683    assert(err_is_ok(err));
684
685    struct pmap_x86 *pmap = (struct pmap_x86 *)vspace_get_pmap(g->vspace);
686    struct vnode *pt;
687    paging_x86_64_flags_t pmap_flags = vregion_to_pmap_flag(VREGION_FLAGS_ALL);
688    size_t npages = 0;
689
690    /* XXX: properly keep track of mappings */
691    struct capref mapping;
692    err = slot_alloc(&mapping);
693    assert(err_is_ok(err));
694
695    if (fi.bytes >= X86_64_HUGE_PAGE_SIZE && fi.bytes % X86_64_HUGE_PAGE_SIZE == 0) {
696        // do huge page mappings
697        // get pdpt through pmap
698        err = get_pdpt(pmap, v->base, &pt);
699        assert(err_is_ok(err));
700        assert(pt->v.is_vnode);
701        npages = v->size / HUGE_PAGE_SIZE;
702        printf("     %zu 1G pages\n", npages);
703        assert(npages <= 512);
704        err = vnode_map(pt->v.cap, mem, X86_64_PDPT_BASE(v->base),
705                pmap_flags, 0, npages, mapping);
706    } else if (fi.bytes >= X86_64_LARGE_PAGE_SIZE && fi.bytes % X86_64_LARGE_PAGE_SIZE == 0) {
707        // do large page mappings
708        err = get_pdir(pmap, v->base, &pt);
709        assert(err_is_ok(err));
710        assert(pt->v.is_vnode);
711        npages = v->size / LARGE_PAGE_SIZE;
712        printf("     %zu 2M pages\n", npages);
713        assert(npages < 512);
714        err = vnode_map(pt->v.cap, mem, X86_64_PDIR_BASE(v->base),
715                pmap_flags, 0, npages, mapping);
716    } else {
717        // get leaf pt through pmap
718        err = get_ptable(pmap, v->base, &pt);
719        assert(err_is_ok(err));
720        npages = v->size / BASE_PAGE_SIZE;
721        printf("     %zu 4k pages\n", npages);
722        // should never be full ptable
723        assert(npages < 512);
724        err = vnode_map(pt->v.cap, mem, X86_64_PTABLE_BASE(v->base),
725                pmap_flags, 0, npages, mapping);
726        assert(err_is_ok(err));
727    }
728}
729
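/* Handler for the hyper interface's npt_map call: the binding's state holds
 * the caller's dispatcher frame address, which is used to look up the
 * corresponding guest; the supplied memory capability is then identity-mapped
 * into that guest's nested page table. */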
730void npt_map_handler(struct hyper_binding *b, struct capref mem)
731{
732    uint64_t dispframe = (uint64_t)b->st;
733    struct guest *g;
734    for (g = guests; g; g = g->next) {
735        if (g->dispframe == dispframe) {
736            break;
737        }
738    }
739    if (g == NULL) {
740        b->tx_vtbl.npt_map_response(b, NOP_CONT, ARRA_ERR_GUEST_NOT_FOUND);
741        return;
742    }
743
744    ept_force_mapping(g, mem);
745    b->tx_vtbl.npt_map_response(b, NOP_CONT, SYS_ERR_OK);
746}
747
748#if defined(ARRAKIS_USE_NESTED_PAGING) && !defined(EPT_FINE_GRAINED)
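/* Identity-map the low 512 GiB of guest-physical address space using 1 GiB
 * pages: the first PDPT of the nested page table is mapped into the monitor
 * and each of its 512 entries is filled with a present, writable huge-page
 * entry pointing at the matching 1 GiB of host memory. */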
749static void ept_setup_low512g(struct guest *g)
750{
751    errval_t err;
752    struct pmap_x86 *pmap = (struct pmap_x86 *)vspace_get_pmap(g->vspace);
753    struct vnode *vn;
754    // get first pdpt (512g)
755    err = get_pdpt(pmap, 0, &vn);
756    assert(err_is_ok(err));
757    union x86_64_ptable_entry *pt;
758    struct capref ept_copy;
759    err = slot_alloc(&ept_copy);
760    err+= cap_copy(ept_copy, vn->v.cap);
761    err+= vspace_map_one_frame_attr((void**)&pt, BASE_PAGE_SIZE, ept_copy,
762                                    VREGION_FLAGS_READ_WRITE, NULL, NULL);
763    assert(err_is_ok(err));
764    genvaddr_t base = 0;
765    for (int i = 0; i < PTABLE_SIZE; i++) {
766        union x86_64_ptable_entry tmp;
767        tmp.raw = X86_64_PTABLE_CLEAR;
768
769        tmp.huge.present = 1;
770        tmp.huge.read_write = 1;
771        tmp.huge.user_supervisor = 1;
772        tmp.huge.always1 = 1;
773        tmp.huge.base_addr = base >> X86_64_HUGE_PAGE_BITS;
774
        // EPT memory type (bits 5:3) = 6: write-back cacheable
777        tmp.raw |= (0x6 << 3);
778        // set accessed and dirty to avoid extra memory refs --> same as dune
779        // cf. dune's kern/ept.c:454--459.
780        tmp.raw |= (0x3 << 8);
781
782        pt[i] = tmp;
783
784        base += HUGE_PAGE_SIZE;
785    }
786}
787#endif
788/* This method duplicates some code from spawndomain since we need to spawn very
789 * special domains */
790void
791spawn_guest_domain (struct guest *g, struct spawninfo *si)
792{
793    errval_t err;
794    struct capref ept_pml4_cap;
795
796#ifdef ARRAKIS_USE_NESTED_PAGING
797    g->vspace = malloc(sizeof(*(g->vspace)));
798    assert(g->vspace);
799    err = guest_slot_alloc(g, &ept_pml4_cap);
800    assert(err_is_ok(err));
801    err = vnode_create(ept_pml4_cap, ObjType_VNode_x86_64_pml4);
802    assert(err_is_ok(err));
803
804    struct pmap *pmap = malloc(sizeof(struct pmap_x86));
805    assert(pmap);
806    err = pmap_x86_64_init(pmap, g->vspace, ept_pml4_cap, NULL);
807    assert(err_is_ok(err));
808    err = vspace_init(g->vspace, pmap);
809    assert(err_is_ok(err));
810
811#ifdef EPT_FINE_GRAINED
812    // populate the guest physical address space
813    // regions for binary
814    for (struct vregion *v = si->vspace->head; v; v = v->next) {
815        printf("memobj type: %d\n",v->memobj->type);
816        switch (v->memobj->type) {
817            case ANONYMOUS:
818            {
819                struct memobj_anon *m = (struct memobj_anon *)v->memobj;
820                for (struct memobj_frame_list *f = m->frame_list; f; f = f->next) {
821                    ept_map(g, f->frame);
822                }
823                break;
824            }
825            case ONE_FRAME:
826            {
827                struct memobj_one_frame *m = (struct memobj_one_frame *)v->memobj;
828                ept_map(g, m->frame);
829                break;
830            }
831            default:
832                debug_printf("need to implement handling for memobj type %d\n",
833                        v->memobj->type);
834                break;
835        }
836    }
837    // page tables: go through si pmap and setup identity mappings for all
838    // ptables
839    struct pmap_x86 *si_pmap = (struct pmap_x86 *)vspace_get_pmap(si->vspace);
840    ept_map_vnode(g, &si_pmap->root);
841
842    // map frames in basecn so we get some headstart before having to talk to
843    // arrakis.hyper
844    struct capref basecn_cap = {
845        .cnode = si->rootcn,
846        .slot = ROOTCN_SLOT_BASE_PAGE_CN,
847    };
848    struct cnoderef si_basecn = build_cnoderef(basecn_cap, DEFAULT_CNODE_BITS);
849    for (int i = 0; i < DEFAULT_CNODE_SLOTS; i++) {
850        struct capref mem = {
851            .cnode = si_basecn,
852            .slot = i,
853        };
854        // cannot retype basecn ram caps to frames here, as this would break
855        // the ability of the guest domain to retype them later on, so we
856        // force insert the mappings here
857        ept_force_mapping(g, mem);
858    }
859#else
860    // 1g pages for 1:1 ept
861    ept_setup_low512g(g);
862#endif
863#else
864    ept_pml4_cap = si->vtree;
865    // set guest's vspace to vspace we created when loading binary
866    g->vspace = si->vspace;
867#endif
868
869    // create end point
870    struct capref ep_cap;
871
872    // use minimum-sized endpoint, because we don't need to buffer >1 vmexit
873    err = endpoint_create(LMP_RECV_LENGTH, &ep_cap, &g->monitor_ep);
874    assert(err_is_ok(err));
875
876    // register to receive on this endpoint
877    struct event_closure cl = {
878        .handler = idc_handler,
879        .arg = g,
880    };
881    err = lmp_endpoint_register(g->monitor_ep, get_default_waitset(), cl);
882    assert(err_is_ok(err));
883
884    // setup the DCB; need to copy cap here as si->dcb will be destroyed when
885    // spawning process is complete!
886    err = slot_alloc(&g->dcb_cap);
887    assert(err_is_ok(err));
888    err = cap_copy(g->dcb_cap, si->dcb);
889    assert(err_is_ok(err));
890
    // record the guest's name
892    strncpy(g->name, si->name, G_NAME_LEN);
893    g->name[G_NAME_LEN-1] = 0;
894
895    struct frame_identity fi;
896    err = frame_identify(si->dispframe, &fi);
897    assert(err_is_ok(err));
898    g->dispframe = fi.base;
899
900    err = invoke_dispatcher_setup_guest(g->dcb_cap, ep_cap, ept_pml4_cap,
901                                        g->vmcb_cap, g->ctrl_cap);
902    if (err_is_fail(err)) {
903        DEBUG_ERR(err, "guest setup");
904    }
905    assert(err_is_ok(err));
906
907    err = invoke_dispatcher(si->dcb, cap_dispatcher, si->rootcn_cap,
908			    si->vtree, si->dispframe, false);
909    assert(err_is_ok(err));
910
911    // Setup virtual machine
912    arch_registers_state_t *regs =
913        dispatcher_get_disabled_save_area(si->handle);
914#ifdef CONFIG_SVM
915    amd_vmcb_rax_wr(&g->vmcb, regs->rax);
916    memcpy(&g->ctrl->regs, regs, sizeof(arch_registers_state_t));
917    amd_vmcb_rsp_wr(&g->vmcb, regs->rsp);
918    amd_vmcb_rip_wr(&g->vmcb, regs->rip);
919    amd_vmcb_rflags_wr_raw(&g->vmcb, regs->eflags);
920
921    // Enable long mode
922    amd_vmcb_cr0_pe_wrf(&g->vmcb, 1);
923    amd_vmcb_cr0_pg_wrf(&g->vmcb, 1);
924    amd_vmcb_cr4_pae_wrf(&g->vmcb, 1);
925    amd_vmcb_efer_lme_wrf(&g->vmcb, 1);
926    amd_vmcb_efer_lma_wrf(&g->vmcb, 1);
927
928    // More "default" settings
929    amd_vmcb_cr4_mce_wrf(&g->vmcb, 1);
930    amd_vmcb_cr4_pge_wrf(&g->vmcb, 1);
931    amd_vmcb_cr4_pce_wrf(&g->vmcb, 1);
932    amd_vmcb_cr4_osfxsr_wrf(&g->vmcb, 1);
933    amd_vmcb_efer_sce_wrf(&g->vmcb, 1);
934    amd_vmcb_efer_nxe_wrf(&g->vmcb, 1);
935
936    // disable GDTR intercept
937    amd_vmcb_intercepts_rdgdtr_wrf(&g->vmcb, 0);
938    amd_vmcb_intercepts_wrgdtr_wrf(&g->vmcb, 0);
    // disable LDTR intercept
940    amd_vmcb_intercepts_rdldtr_wrf(&g->vmcb, 0);
941    amd_vmcb_intercepts_wrldtr_wrf(&g->vmcb, 0);
942    // disable IDTR intercept
943    amd_vmcb_intercepts_rdidtr_wrf(&g->vmcb, 0);
944    amd_vmcb_intercepts_wridtr_wrf(&g->vmcb, 0);
945    // disable TR intercept
946    amd_vmcb_intercepts_rdtr_wrf(&g->vmcb, 0);
947    amd_vmcb_intercepts_wrtr_wrf(&g->vmcb, 0);
    // disable non-essential CR0 access intercepts
949    amd_vmcb_cr_access_rdcr0_wrf(&g->vmcb, 0);
950    amd_vmcb_cr_access_wrcr0_wrf(&g->vmcb, 0);
951    // disable CR3 access intercepts
952    amd_vmcb_cr_access_rdcr3_wrf(&g->vmcb, 0);
953    amd_vmcb_cr_access_wrcr3_wrf(&g->vmcb, 0);
954    // disable INTn intercept
955    /* amd_vmcb_intercepts_intn_wrf(&g->vmcb, 0); */
956
957    // Disable nested paging
958    amd_vmcb_np_enable_wrf(&g->vmcb, 0);
959#else
960    err = 0;
961    err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_IOBMP_A_F, g->iobmp_a_pa);
962    err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_IOBMP_B_F, g->iobmp_b_pa);
963    err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_MSRBMP_F, g->msrpm_pa);
964    err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_VPID, ++last_guest_asid);
965
966    memcpy(&g->ctrl->regs, regs, sizeof(arch_registers_state_t));
967    err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_GUEST_RSP, regs->rsp);
968    err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_GUEST_RIP, regs->rip);
969    err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_GUEST_RFLAGS, regs->eflags);
970    assert(err_is_ok(err));
971    debug_printf("guest domain vpid %d init ip: %lx, sp: %lx\n",
972            last_guest_asid, regs->rip, regs->rsp);
973#endif
974    for(int i = 0; i < si->vregions; i++) {
975      printf("vregion %d: base = %" PRIxGENVADDR ", region = %" PRIxGENVADDR "\n",
976	     i, si->base[i], vregion_get_base_addr(si->vregion[i]));
977    }
978}
979
980#if 0
981static void
982install_grub_stage2 (struct guest *g, void *img, size_t img_size)
983{
984    assert(img != NULL);
985
986    /* the grub image goes to 0x8000 according to
987     * http://www.gnu.org/software/grub/manual/html_node/Memory-map.html */
988    memcpy((void *)(guest_to_host(g->mem_low_va + 0x8000)), img, img_size);
989    // according to grub stage2 source its entry point is at 0x8200
990    amd_vmcb_rip_wr(&g->vmcb, 0x8200);
991    // switch to the first segment
992    amd_vmcb_cs_selector_wr(&g->vmcb, 0x0);
993    amd_vmcb_cs_base_wr(&g->vmcb, 0x0);
994    amd_vmcb_cs_limit_wr(&g->vmcb, 0xffff);
995}
996#endif
997
998#if 0
999static void
1000install_debug_app (struct guest *g)
1001{
1002    //static uint8_t app[] = { 0xcd, 0x20 };
1003    static uint8_t app[] = { 0xcd, 0x20, 0x90, 0x90, 0x90, 0x90, 0x90 };
1004    /* static uint8_t app[] = { 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90 }; */
1005    memcpy((void *)(guest_to_host(g->mem_low_va + 0xf000)), app, sizeof(app));
1006    amd_vmcb_rip_wr(&g->vmcb, 0xf000);
1007    amd_vmcb_rsp_wr(&g->vmcb, 0x10000);
1008
1009#if 0
1010    static uint8_t gdt[] = {
1011      0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
1012      0xff,0xff,0x00,0x00,0x00,0x9a,0xaf,0x00, // 64bit code segment, D _cleared_ => "16bit"
1013      0xff,0xff,0x00,0x00,0x00,0x92,0xcf,0x00, // data
1014      0xff,0xff,0x00,0x00,0x00,0x9a,0xcf,0x00, // 32bit code segment for protected-mode
1015      0xff,0xff,0x00,0x80,0x0b,0x92,0xff,0x00, // screen
1016      0xff,0xff,0x00,0x60,0x00,0x9a,0xcf,0x00, // segment at linear address 0x6000
1017      0xff,0xff,0x00,0x00,0x00,0x92,0xaf,0x00  // stack segment in 64bit mode
1018    };
1019    memcpy((void *)(guest_to_host(g->mem_low_va)), gdt, sizeof(gdt));
1020
1021    amd_vmcb_gdtr_base_wr(&g->vmcb, 0);
1022    amd_vmcb_gdtr_limit_wr(&g->vmcb, sizeof(gdt));
1023#endif
1024
    // disable nested paging in real mode
1026    /* amd_vmcb_np_enable_wrf(&g->vmcb, 1); */
1027    // enable paged real mode
1028    //amd_vmcb_cr0_pg_wrf(&g->vmcb, 0x1);
1029    //g->save_area->cr0 |= X86_CR0_PE_MASK;
1030
1031#if 0
1032    // Write page table
1033    static uint64_t pml4[512] = {
1034      0xffffffffffffffffUL,
1035      0x00002007,
1036      0
1037    };
1038    memcpy((void *)(guest_to_host(g->mem_low_va + 0x1000)), pml4, sizeof(pml4));
1039
1040    static uint64_t pdpt[512] = {
1041      0x00003007,
1042      0
1043    };
1044    memcpy((void *)(guest_to_host(g->mem_low_va + 0x2000)), pdpt, sizeof(pdpt));
1045
1046    static uint64_t pdir[512] = {
1047      0x00000087,
1048      0x00200087,
1049      0x00400087,
1050      0x00600087,
1051      0x00800087,
1052      0x00a00087,
1053      0x00c00087,
1054      0x00e00087,
1055      0x01000087,
1056      0
1057    };
1058    memcpy((void *)(guest_to_host(g->mem_low_va + 0x3000)), pdir, sizeof(pdir));
1059
1060    amd_vmcb_cr3_wr(&g->vmcb, 0x1000);
1061#endif
1062
1063    // Enable long mode
1064    amd_vmcb_cr0_pe_wrf(&g->vmcb, 1);
1065    amd_vmcb_cr0_pg_wrf(&g->vmcb, 1);
1066    amd_vmcb_cr4_pae_wrf(&g->vmcb, 1);
1067    amd_vmcb_efer_lme_wrf(&g->vmcb, 1);
1068    amd_vmcb_efer_lma_wrf(&g->vmcb, 1);
1069
1070    // Disable nested paging
1071    amd_vmcb_np_enable_wrf(&g->vmcb, 0);
1072
1073    /* amd_vmcb_cs_selector_wr(&g->vmcb, 0x0); */
1074    /* amd_vmcb_cs_base_wr(&g->vmcb, 0x0); */
1075    /* amd_vmcb_cs_limit_wr(&g->vmcb, 0xfffff); */
1076    /* amd_vmcb_cs_attrib_wr(&g->vmcb,  */
1077    //g->save_area->cs.selector = 0x1000;
1078    //g->save_area->cs.base = 0x10000;
1079    //g->save_area->cs.base = 0x1ffff;
1080}
1081#endif
1082
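/* Callback registered with the virtual LPC: report whether a virtual IRQ is
 * currently pending for the guest and, if so, optionally return its vector
 * and priority. */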
1083static bool
1084virq_pending (void *ud, uint8_t *irq, uint8_t *irq_prio)
1085{
1086    assert(ud != NULL);
1087
1088    struct guest *g = ud;
1089#ifdef CONFIG_SVM
1090    if (amd_vmcb_vintr_rd(&g->vmcb).virq == 1) {
1091#else
1092    uint64_t info;
1093    errval_t err = invoke_dispatcher_vmread(g->dcb_cap, VMX_ENTRY_INTR_INFO, &info);
1094    assert(err_is_ok(err));
1095    if (!!(info & (1UL << 31))) {
1096#endif
1097        if (irq != NULL) {
1098#ifdef CONFIG_SVM
1099            *irq = amd_vmcb_vintr_rd(&g->vmcb).vintr_vector;
1100#else
1101	    *irq = info & 0xff;
1102#endif
1103        }
1104        if (irq_prio != NULL) {
1105#ifdef CONFIG_SVM
1106            *irq_prio = amd_vmcb_vintr_rd(&g->vmcb).vintr_prio;
1107#else
1108	    *irq_prio = interrupt_priority;
1109#endif
1110        }
1111        return true;
1112    } else {
1113        return false;
1114    }
1115}
1116
1117#ifndef CONFIG_SVM
1118static bool
1119virq_accepting (void *ud)
1120{
1121    assert(ud != NULL);
1122
1123    struct guest *g = ud;
1124
1125    uint64_t guest_rflags;
1126    errval_t err = invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_RFLAGS, &guest_rflags);
1127    assert(err_is_ok(err));
1128    return (guest_rflags & (1UL << 9));
1129}
1130#endif
1131
1132static void
1133virq_handler (void *ud, uint8_t irq, uint8_t irq_prio)
1134{
1135    assert(ud != NULL);
1136
1137    struct guest *g = ud;
1138
1139    // tell the hw extensions that there is a virtual IRQ pending
1140#ifdef CONFIG_SVM
1141    amd_vmcb_vintr_virq_wrf(&g->vmcb, 1);
1142    amd_vmcb_vintr_vintr_prio_wrf(&g->vmcb, irq_prio);
1143    amd_vmcb_vintr_vintr_vector_wrf(&g->vmcb, irq);
1144    amd_vmcb_vintr_v_ign_tpr_wrf(&g->vmcb, 1);
1145#else
1146    uint64_t guest_rflags;
1147    errval_t err = invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_RFLAGS, &guest_rflags);
1148    assert(guest_rflags & (1UL << 9));
1149
1150    uint64_t info = (0 << 8 /*HWINTR*/) | (1UL << 31 /*INTR VALID*/) | irq;
1151    err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_ENTRY_INTR_INFO, info);
1152
1153    err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_GUEST_ACTIV_STATE, 0x0);
1154    err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_GUEST_INTR_STATE, 0x0);
1155    assert(err_is_ok(err));
1156
1157    interrupt_priority = irq_prio;
1158#endif
1159    // if the guest is currently waiting then we have to restart it to make
1160    // forward progress
1161    if (!g->runnable) {
1162        g->runnable = true;
1163        guest_make_runnable(g, true);
1164    }
1165}
1166
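/* One-time per-guest initialization: sets up the guest's slot allocator,
 * allocates and maps the VMCB, the guest control area and the I/O and MSR
 * permission bitmaps, initializes them, and instantiates the virtual
 * hardware (APIC and LPC) before marking the guest runnable. */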
1167static void
1168guest_setup (struct guest *g)
1169{
1170    errval_t err;
1171
    // initialize the guest's slot allocator
    err = two_level_slot_alloc_init(&g->slot_alloc);
1174    assert_err(err, "two_level_slot_alloc_init");
1175
1176    struct frame_identity fi;
1177
1178    // allocate memory for the vmcb
1179    err = guest_slot_alloc(g, &g->vmcb_cap);
1180    assert_err(err, "guest_cspace_alloc");
1181    err = frame_create(g->vmcb_cap, VMCB_SIZE, NULL);
1182    assert_err(err, "frame_create");
1183    err = frame_identify(g->vmcb_cap, &fi);
1184    assert_err(err, "frame_identify");
1185    g->vmcb_pa = fi.base;
1186    err = vspace_map_one_frame_attr((void**)&g->vmcb_va, VMCB_SIZE, g->vmcb_cap,
1187                                    VREGION_FLAGS_READ_WRITE_NOCACHE,
1188                                    NULL, NULL);
1189    if (err_is_fail(err)) {
1190        DEBUG_ERR(err, "vspace_map_one_frame_attr failed");
1191    }
1192
1193    // guest control
1194    err = frame_alloc(&g->ctrl_cap, sizeof(struct guest_control), NULL);
1195    assert_err(err, "frame_alloc");
1196    size_t size = ROUND_UP(sizeof(struct guest_control), BASE_PAGE_SIZE);
1197    err = vspace_map_one_frame_attr((void**)&g->ctrl, size, g->ctrl_cap,
1198                                    VREGION_FLAGS_READ_WRITE_NOCACHE,
1199                                    NULL, NULL);
1200    if (err_is_fail(err)) {
1201        DEBUG_ERR(err, "vspace_map_one_frame_attr failed");
1202    }
1203    g->ctrl->num_vm_exits_with_monitor_invocation = 0;
1204    g->ctrl->num_vm_exits_without_monitor_invocation = 0;
1205
1206#ifdef CONFIG_SVM
1207    // allocate memory for the iopm
1208    err = frame_alloc(&g->iopm_cap, IOPM_SIZE, NULL);
1209    assert_err(err, "frame_alloc");
1210    err = frame_identify(g->iopm_cap, &fi);
1211    assert_err(err, "frame_identify");
1212    g->iopm_pa = fi.base;
1213    err = vspace_map_one_frame_attr((void**)&g->iopm_va, IOPM_SIZE, g->iopm_cap,
1214                                    VREGION_FLAGS_READ_WRITE_NOCACHE,
1215                                    NULL, NULL);
1216    if (err_is_fail(err)) {
1217        DEBUG_ERR(err, "vspace_map_one_frame_attr failed");
1218    }
1219#else
1220    // allocate memory for I/O bitmap A
1221    err = frame_alloc(&g->iobmp_a_cap, IOBMP_A_SIZE, NULL);
1222    assert_err(err, "frame_alloc");
1223    err = frame_identify(g->iobmp_a_cap, &fi);
1224    assert_err(err, "frame_identify");
1225    g->iobmp_a_pa = fi.base;
1226    err = vspace_map_one_frame_attr((void**)&g->iobmp_a_va, IOBMP_A_SIZE, g->iobmp_a_cap,
1227                                    VREGION_FLAGS_READ_WRITE_NOCACHE,
1228                                    NULL, NULL);
1229    if (err_is_fail(err)) {
1230        DEBUG_ERR(err, "vspace_map_one_frame_attr failed");
1231    }
1232
1233    // allocate memory for I/O bitmap B
1234    err = frame_alloc(&g->iobmp_b_cap, IOBMP_B_SIZE, NULL);
1235    assert_err(err, "frame_alloc");
1236    err = frame_identify(g->iobmp_b_cap, &fi);
1237    assert_err(err, "frame_identify");
1238    g->iobmp_b_pa = fi.base;
1239    err = vspace_map_one_frame_attr((void**)&g->iobmp_b_va, IOBMP_B_SIZE, g->iobmp_b_cap,
1240                                    VREGION_FLAGS_READ_WRITE_NOCACHE,
1241                                    NULL, NULL);
1242    if (err_is_fail(err)) {
1243        DEBUG_ERR(err, "vspace_map_one_frame_attr failed");
1244    }
1245
1246    // allocate memory for the guest MSR store/load area
1247    err = frame_alloc(&g->msr_area_cap, VMX_MSR_AREA_SIZE, NULL);
1248    assert_err(err, "frame_alloc");
1249    err = frame_identify(g->msr_area_cap, &fi);
1250    assert_err(err, "frame_identify");
1251    g->msr_area_pa = fi.base;
1252    err = vspace_map_one_frame_attr((void**)&g->msr_area_va, VMX_MSR_AREA_SIZE,
1253                                    g->msr_area_cap,
1254                                    VREGION_FLAGS_READ_WRITE_NOCACHE,
1255                                    NULL, NULL);
1256    if (err_is_fail(err)) {
1257        DEBUG_ERR(err, "vspace_map_one_frame_attr failed");
1258    }
1259#endif
1260    // allocate memory for the msrpm
1261    err = frame_alloc(&g->msrpm_cap, MSRPM_SIZE, NULL);
1262    assert_err(err, "frame_alloc");
1263    err = frame_identify(g->msrpm_cap, &fi);
1264    assert_err(err, "frame_identify");
1265    g->msrpm_pa = fi.base;
1266    err = vspace_map_one_frame_attr((void**)&g->msrpm_va, MSRPM_SIZE,
1267                                    g->msrpm_cap,
1268                                    VREGION_FLAGS_READ_WRITE_NOCACHE,
1269                                    NULL, NULL);
1270    if (err_is_fail(err)) {
1271        DEBUG_ERR(err, "vspace_map_one_frame_attr failed");
1272    }
1273
1274    // initialize the allocated structures
1275    initialize_iopm(g);
1276    initialize_msrpm(g);
1277#ifdef CONFIG_SVM
1278    initialize_vmcb(g);
1279#endif
1280    // spawn the guest domain
1281    /* spawn_guest_domain(g); */
1282    /* assert (grub_image != NULL); */
1283    //install_grub_stage2(g, grub_image, grub_image_size);
1284    /* install_debug_app(g); */
1285
1286    // add virtual hardware
1287    g->apic = apic_new(APIC_BASE);
1288    g->lpc = lpc_new(virq_handler, virq_pending,
1289#ifndef CONFIG_SVM
1290		     virq_accepting,
1291#endif
1292		     g, g->apic);
1293#if 0
1294    if (hdd0_image != NULL) {
1295        g->hdds[0] = hdd_new_from_memory(hdd0_image, hdd0_image_size);
1296        g->hdd_count++;
1297    }
1298    g->console = console_new();
1299    g->serial_ports[0] = pc16550d_new(0x3f8, 4, g->lpc);
1300    pc16550d_attach_to_console(g->serial_ports[0]);
1301    g->serial_ports[1] = pc16550d_new(0x2f8, 3, g->lpc);
1302    g->serial_ports[2] = pc16550d_new(0x3e8, 4, g->lpc);
1303    g->serial_ports[3] = pc16550d_new(0x2e8, 3, g->lpc);
1304    g->serial_port_count = 4;
1305
1306    g->pci = pci_new();
1307    init_host_devices(g->pci);
1308
1309    // set up bios memory
1310    // FIXME: find a modular way to do this
1311    *(uint16_t *)guest_to_host(g->mem_low_va + 0x400) = 0x3f8;  // COM1
1312    *(uint16_t *)guest_to_host(g->mem_low_va + 0x402) = 0x2f8;  // COM2
1313#endif
1314
1315    g->runnable = true;
1316}
1317
1318/**
1319 * \brief Create a new guest.
1320 *
 * This function creates a new guest. It does everything necessary to make
 * the guest ready to accept images to run: it creates a new domain and
 * assigns some memory to that domain. Afterwards it loads a BIOS into that
 * memory and sets the guest's initial IP to the POST entry of the BIOS.
1325 *
1326 * \return The pointer to the newly created structure describing the guest.
1327 */
1328struct guest *
1329guest_create (void)
1330{
1331  struct guest *newguest = malloc(sizeof(struct guest));
1332  memset(newguest, 0, sizeof(struct guest));
1333  guest_setup(newguest);
1334  // insert in list
1335  newguest->next = guests;
1336  guests = newguest;
1337  return newguest;
1338}
1339
1340#if 0
1341static int
1342run_realmode (struct guest *g)
1343{
1344    int r;
1345
1346    realmode_switch_to(g);
1347    r = realmode_exec();
1348    assert(r == REALMODE_ERR_OK);
1349    realmode_switch_from(g);
1350
1351    guest_handle_vmexit(g);
1352
1353    return 0;
1354};
1355#endif
1356
1357#ifndef CONFIG_SVM
1358// Return true if the "Enable EPT" Secondary Processor-based control is
1359// set in the VMCS, else false.
1360static inline bool vmx_ept_enabled(struct guest *g)
1361{
1362    uint64_t sp_controls;
1363    errval_t err = invoke_dispatcher_vmread(g->dcb_cap, VMX_EXEC_SEC_PROC, &sp_controls);
1364    assert(err_is_ok(err));
1365    return ((sp_controls & SP_CLTS_ENABLE_EPT) != 0);
1366}
1367
1368// Set or clear the "Descriptor-table exiting" Secondary Processor-based
1369// control if val is 1 or 0, respectively.
1370static inline void vmx_intercept_desc_table_wrf(struct guest *g, int val)
1371{
1372    assert(val == 0 || val == 1);
1373
1374    uint64_t sec_proc_ctrls;
1375    errval_t err = invoke_dispatcher_vmread(g->dcb_cap, VMX_EXEC_SEC_PROC, &sec_proc_ctrls);
1376    if (val) {
1377        uint64_t prim_proc_ctrls;
1378	err += invoke_dispatcher_vmread(g->dcb_cap, VMX_EXEC_PRIM_PROC, &prim_proc_ctrls);
1379	assert(prim_proc_ctrls & PP_CLTS_SEC_CTLS);
1380	err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_EXEC_SEC_PROC,
1381					 sec_proc_ctrls | SP_CLTS_DESC_TABLE);
1382    } else {
1383        err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_EXEC_SEC_PROC,
1384					 sec_proc_ctrls & ~SP_CLTS_DESC_TABLE);
1385    }
1386    assert(err_is_ok(err));
1387}
1388
1389
1390// Before entering the guest, synchronize the CR0 shadow with the guest
1391// CR0 value that is potentially changed in the real-mode emulator.
1392static inline void vmx_set_cr0_shadow(struct guest *g)
1393{
1394    uint64_t cr0_shadow;
1395    errval_t err = invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_CR0, &cr0_shadow);
1396    err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_CR0_RD_SHADOW, cr0_shadow);
1397    assert(err_is_ok(err));
1398}
1399#endif
1400
1401/**
1402 * \brief Marks a guest as runnable.
1403 *
1404 * A call to this method will update the guest's runnable state and, if made
1405 * runnable, yield the remaining time slice to the guest domain.
1406 *
 * \return SYS_ERR_OK on success, an error value on failure
1408 */
1409errval_t
1410guest_make_runnable (struct guest *g, bool run)
1411{
1412    assert(g->runnable);
1413
1414    errval_t err;
1415
1416#if 0
1417    /* If the guest is currently in real mode (CR0.PE flag clear) then we do not
1418     * schedule the domain to run the virtualization but run the real-mode
1419     * emulation */
1420#ifdef CONFIG_SVM
1421    if (UNLIKELY(run && amd_vmcb_cr0_rd(&g->vmcb).pe == 0)) {
1422        if (!g->emulated_before_exit) {
1423            // do the inverse of the code below
1424            amd_vmcb_intercepts_rdgdtr_wrf(&g->vmcb, 1);
1425            amd_vmcb_intercepts_wrgdtr_wrf(&g->vmcb, 1);
1426            amd_vmcb_intercepts_rdldtr_wrf(&g->vmcb, 1);
1427            amd_vmcb_intercepts_wrldtr_wrf(&g->vmcb, 1);
1428            amd_vmcb_intercepts_rdidtr_wrf(&g->vmcb, 1);
1429            amd_vmcb_intercepts_wridtr_wrf(&g->vmcb, 1);
1430            amd_vmcb_intercepts_rdtr_wrf(&g->vmcb, 1);
1431            amd_vmcb_intercepts_wrtr_wrf(&g->vmcb, 1);
1432            amd_vmcb_cr_access_rdcr0_wrf(&g->vmcb, 1);
1433            amd_vmcb_cr_access_wrcr0_wrf(&g->vmcb, 1);
1434            amd_vmcb_cr_access_rdcr3_wrf(&g->vmcb, 1);
1435            amd_vmcb_cr_access_wrcr3_wrf(&g->vmcb, 1);
1436            amd_vmcb_intercepts_intn_wrf(&g->vmcb, 1);
1437
1438            // mark guest as emulated
1439            g->emulated_before_exit = true;
1440        }
1441#else
1442    uint64_t guest_cr0;
1443    err = invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_CR0, &guest_cr0);
1444    assert(err_is_ok(err));
1445    if (UNLIKELY(run && (guest_cr0 & CR0_PE) == 0)) {
1446        if (!g->emulated_before_exit) {
1447	    vmx_intercept_desc_table_wrf(g, 1);
1448	    g->emulated_before_exit = true;
1449	}
1450#endif
1451#if 0 /* why create a thread for this? it seems fine without! -AB */
1452        struct thread *t = thread_create((thread_func_t)run_realmode, g);
1453        assert(t != NULL);
1454        err = thread_detach(t);
1455        assert(err_is_ok(err));
1456#else
1457        run_realmode(g);
1458#endif
1459        return SYS_ERR_OK;
1460    }
1461
1462    /* every time we move the machine from the emulated to virtualized we need
1463     * to adjust some intercepts */
1464    if (UNLIKELY(run && g->emulated_before_exit)) {
1465#ifdef CONFIG_SVM
1466        // we enforce NP to be enabled (no shadow paging support)
1467        assert(amd_vmcb_np_rd(&g->vmcb).enable == 1);
1468
1469        // disable GDTR intercept
1470        amd_vmcb_intercepts_rdgdtr_wrf(&g->vmcb, 0);
1471        amd_vmcb_intercepts_wrgdtr_wrf(&g->vmcb, 0);
        // disable LDTR intercept
1473        amd_vmcb_intercepts_rdldtr_wrf(&g->vmcb, 0);
1474        amd_vmcb_intercepts_wrldtr_wrf(&g->vmcb, 0);
1475        // disable IDTR intercept
1476        amd_vmcb_intercepts_rdidtr_wrf(&g->vmcb, 0);
1477        amd_vmcb_intercepts_wridtr_wrf(&g->vmcb, 0);
1478        // disable TR intercept
1479        amd_vmcb_intercepts_rdtr_wrf(&g->vmcb, 0);
1480        amd_vmcb_intercepts_wrtr_wrf(&g->vmcb, 0);
        // disable non-essential CR0 access intercepts
1482        amd_vmcb_cr_access_rdcr0_wrf(&g->vmcb, 0);
1483        amd_vmcb_cr_access_wrcr0_wrf(&g->vmcb, 0);
1484        // disable CR3 access intercepts
1485        assert(amd_vmcb_np_rd(&g->vmcb).enable != 0);
1486        amd_vmcb_cr_access_rdcr3_wrf(&g->vmcb, 0);
1487        amd_vmcb_cr_access_wrcr3_wrf(&g->vmcb, 0);
1488        // disable INTn intercept
1489        // we have to be outside of real mode for this to work
1490        assert(amd_vmcb_cr0_rd(&g->vmcb).pe != 0);
1491        amd_vmcb_intercepts_intn_wrf(&g->vmcb, 0);
1492#else
1493        bool ept_enabled = vmx_ept_enabled(g);
1494	assert(ept_enabled);
1495	vmx_intercept_desc_table_wrf(g, 0);
1496	assert(guest_cr0 & CR0_PE);
1497	vmx_set_cr0_shadow(g);
1498#endif
1499        // mark guest as not emulated
1500        g->emulated_before_exit = false;
1501    }
1502#endif
1503
    // update the guest domain's runnable state
1505    err = invoke_dispatcher(g->dcb_cap, NULL_CAP, NULL_CAP, NULL_CAP, NULL_CAP, run);
1506    assert_err(err, "dispatcher_make_runnable");
1507    // yield the dispatcher
1508    if (run) {
1509        thread_yield_dispatcher(NULL_CAP);
1510    }
1511
1512    return SYS_ERR_OK;
1513}
1514
/* VMEXIT handlers */
1516
1517#define HANDLER_ERR_OK          (0)
1518#define HANDLER_ERR_FATAL       (-1)
1519
1520#ifdef CONFIG_SVM
1521static int
1522handle_vmexit_unhandeled (struct guest *g)
1523{
1524    printf("Unhandled guest vmexit:\n");
1525    printf(" code:\t  %lx\n", amd_vmcb_exitcode_rd(&g->vmcb));
1526    printf(" info1:\t  %lx\n", amd_vmcb_exitinfo1_rd(&g->vmcb));
1527    printf(" info2:\t  %lx\n", amd_vmcb_exitinfo2_rd(&g->vmcb));
1528    printf(" intinfo: %lx\n", amd_vmcb_exitintinfo_rd(&g->vmcb));
1529
1530    printf("VMCB save area:\n");
1531    printf(" cr0:\t%lx\n", amd_vmcb_cr0_rd_raw(&g->vmcb));
1532    printf(" cr2:\t%lx\n", amd_vmcb_cr2_rd_raw(&g->vmcb));
1533    printf(" cr3:\t%lx\n", amd_vmcb_cr3_rd_raw(&g->vmcb));
1534    printf(" cr4:\t%lx\n", amd_vmcb_cr4_rd_raw(&g->vmcb));
1535    printf(" efer:\t%lx\n", amd_vmcb_efer_rd_raw(&g->vmcb));
1536    printf(" rip:\t%lx\n", amd_vmcb_rip_rd_raw(&g->vmcb));
1537    printf(" rsp:\t%lx\n", amd_vmcb_rsp_rd_raw(&g->vmcb));
1538    printf(" cs:\tselector %x, base %lx, limit %x, attrib %x\n",
1539           amd_vmcb_cs_selector_rd(&g->vmcb), amd_vmcb_cs_base_rd(&g->vmcb),
1540           amd_vmcb_cs_limit_rd(&g->vmcb), amd_vmcb_cs_attrib_rd_raw(&g->vmcb));
1541    printf(" ds:\tselector %x, base %lx, limit %x, attrib %x\n",
1542           amd_vmcb_ds_selector_rd(&g->vmcb), amd_vmcb_ds_base_rd(&g->vmcb),
1543           amd_vmcb_ds_limit_rd(&g->vmcb), amd_vmcb_ds_attrib_rd_raw(&g->vmcb));
1544    printf(" es:\tselector %x, base %lx, limit %x, attrib %x\n",
1545           amd_vmcb_es_selector_rd(&g->vmcb), amd_vmcb_es_base_rd(&g->vmcb),
1546           amd_vmcb_es_limit_rd(&g->vmcb), amd_vmcb_es_attrib_rd_raw(&g->vmcb));
1547    printf(" ss:\tselector %x, base %lx, limit %x, attrib %x\n",
1548           amd_vmcb_ss_selector_rd(&g->vmcb), amd_vmcb_ss_base_rd(&g->vmcb),
1549           amd_vmcb_ss_limit_rd(&g->vmcb), amd_vmcb_ss_attrib_rd_raw(&g->vmcb));
1550    printf(" rax:\t%lx\n", amd_vmcb_rax_rd_raw(&g->vmcb));
1551    printf(" rbx:\t%lx\n", g->ctrl->regs.rbx);
1552    printf(" rcx:\t%lx\n", g->ctrl->regs.rcx);
1553    printf(" rdx:\t%lx\n", g->ctrl->regs.rdx);
1554    printf(" rsi:\t%lx\n", g->ctrl->regs.rsi);
1555    printf(" rdi:\t%lx\n", g->ctrl->regs.rdi);
1556
1557    return HANDLER_ERR_FATAL;
1558}
1559#else
1560static int
1561handle_vmexit_unhandeled (struct guest *g)
1562{
1563    printf("Unhandeled guest vmexit:\n");
1564    printf(" exit reason:\t %"PRIu16"\n", saved_exit_reason);
1565    printf(" exit qualification:\t %"PRIx64"\n", saved_exit_qual);
1566    printf(" next rip (I/O instruction):\t %"PRIx64"\n", saved_rip);
1567
1568    uint64_t gpaddr;
1569    errval_t err = invoke_dispatcher_vmread(g->dcb_cap, VMX_GPADDR_F, &gpaddr);
1570    printf(" guest physical-address:\t %"PRIx64"\n", gpaddr);
1571
1572    uint64_t guest_cr0, guest_cr3, guest_cr4;
1573    err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_CR0, &guest_cr0);
1574    err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_CR3, &guest_cr3);
1575    err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_CR4, &guest_cr4);
1576
1577    uint64_t guest_efer, guest_rip;
1578    err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_EFER_F, &guest_efer);
1579    err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_RIP, &guest_rip);
1580
1581    uint64_t guest_cs_sel, guest_cs_base, guest_cs_lim,
1582        guest_cs_access;
1583    err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_CS_SEL, &guest_cs_sel);
1584    err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_CS_BASE, &guest_cs_base);
1585    err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_CS_LIM, &guest_cs_lim);
1586    err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_CS_ACCESS, &guest_cs_access);
1587
1588    uint64_t guest_ds_sel, guest_ds_base, guest_ds_lim,
1589        guest_ds_access;
1590    err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_DS_SEL, &guest_ds_sel);
1591    err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_DS_BASE, &guest_ds_base);
1592    err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_DS_LIM, &guest_ds_lim);
1593    err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_DS_ACCESS, &guest_ds_access);
1594
1595    uint64_t guest_es_sel, guest_es_base, guest_es_lim,
1596        guest_es_access;
1597    err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_ES_SEL, &guest_es_sel);
1598    err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_ES_BASE, &guest_es_base);
1599    err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_ES_LIM, &guest_es_lim);
1600    err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_ES_ACCESS, &guest_es_access);
1601
1602    uint64_t guest_ss_sel, guest_ss_base, guest_ss_lim,
1603        guest_ss_access;
1604    err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_SS_SEL, &guest_ss_sel);
1605    err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_SS_BASE, &guest_ss_base);
1606    err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_SS_LIM, &guest_ss_lim);
1607    err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_SS_ACCESS, &guest_ss_access);
1608    assert(err_is_ok(err));
1609
1610    printf("VMCS save area:\n");
1611    printf(" cr0:\t%lx\n", guest_cr0);
1612    printf(" cr3:\t%lx\n", guest_cr3);
1613    printf(" cr4:\t%lx\n", guest_cr4);
1614    printf(" efer:\t%lx\n", guest_efer);
1615    printf(" rip:\t%lx\n", guest_rip);
1616    printf(" cs:\tselector %lx, base %lx, limit %lx, access %lx\n",
1617           guest_cs_sel, guest_cs_base, guest_cs_lim, guest_cs_access);
1618    printf(" ds:\tselector %lx, base %lx, limit %lx, access %lx\n",
1619           guest_ds_sel, guest_ds_base, guest_ds_lim, guest_ds_access);
1620    printf(" es:\tselector %lx, base %lx, limit %lx, access %lx\n",
1621           guest_es_sel, guest_es_base, guest_es_lim, guest_es_access);
1622    printf(" ss:\tselector %lx, base %lx, limit %lx, access %lx\n",
1623           guest_ss_sel, guest_ss_base, guest_ss_lim, guest_ss_access);
1624    printf(" rax:\t%lx\n", g->ctrl->regs.rax);
1625    printf(" rbx:\t%lx\n", g->ctrl->regs.rbx);
1626    printf(" rcx:\t%lx\n", g->ctrl->regs.rcx);
1627    printf(" rdx:\t%lx\n", g->ctrl->regs.rdx);
1628    printf(" rsi:\t%lx\n", g->ctrl->regs.rsi);
1629    printf(" rdi:\t%lx\n", g->ctrl->regs.rdi);
1630
1631    return HANDLER_ERR_FATAL;
1632}
1633#endif
1634
1635static inline uint64_t
1636lookup_paddr_long_mode (struct guest *g, uint64_t vaddr)
1637{
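    /*
     * Long-mode translation walks the guest's own 4-level page tables:
     * bits 47:39 of the virtual address index the PML4, bits 38:30 the PDPT,
     * bits 29:21 the PD and bits 20:12 the PT; the low 12 bits are the page
     * offset. A set PS bit ends the walk early with a 1GB (PDPT level) or
     * 2MB (PD level) page. For example, vaddr 0x40201000 decomposes into
     * pml4_idx 0, pdp_idx 1, pd_idx 1, pt_idx 1 and offset 0.
     */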
1638    union x86_lm_va va = { .raw = vaddr };
1639    uint64_t *page_table;
1640
1641    // get a pointer to the pml4 table
1642#ifdef CONFIG_SVM
1643    page_table = (uint64_t *)guest_to_host(amd_vmcb_cr3_rd(&g->vmcb));
1644#else
1645    uint64_t guest_cr3;
1646    errval_t err = invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_CR3, &guest_cr3);
1647    assert(err_is_ok(err));
1648    page_table = (uint64_t *)guest_to_host(guest_cr3);
1649#endif
1650    // get pml4 entry
1651    union x86_lm_pml4_entry pml4e = { .raw = page_table[va.u.pml4_idx] };
1652    assert (pml4e.u.p == 1);
1653
1654    // get a pointer to the pdp table
1655    page_table = (uint64_t *)guest_to_host(pml4e.u.pdp_base_pa << 12);
1656    // get pdp entry
1657    union x86_lm_pdp_entry pdpe = { .raw = page_table[va.u.pdp_idx] };
1658    assert(pdpe.u.p == 1);
1659    // check for 1GB page (PS bit set)
1660    if (pdpe.u.ps == 1) {
1661        return (pdpe.u1gb.base_pa << 30) | va.u1gb.pa_offset;
1662    }
1663
1664    // get a pointer to the pd table
1665    page_table = (uint64_t *)guest_to_host(pdpe.u.pd_base_pa << 12);
1666    // get pd entry
1667    union x86_lm_pd_entry pde = { .raw = page_table[va.u.pd_idx] };
1668    if (pde.u.p == 0) {
1669        printf("g2h %lx, pml4e %p %lx, pdpe %p %lx, pde %p %lx\n",
1670	       guest_to_host(0), &pml4e, pml4e.raw, &pdpe, pdpe.raw, &pde, pde.raw);
1671    }
1672    assert(pde.u.p == 1);
1673    // check for 2MB page (PS bit set)
1674    if (pde.u.ps == 1) {
1675        return (pde.u2mb.base_pa << 21) | va.u2mb.pa_offset;
1676    }
1677
1678    // get a pointer to the page table
1679    page_table = (uint64_t *)guest_to_host(pde.u.pt_base_pa << 12);
1680    // get the page table entry
1681    union x86_lm_pt_entry pte = { .raw = page_table[va.u.pt_idx] };
1682    assert(pte.u.p == 1);
1683
1684    return (pte.u.base_pa << 12) | va.u.pa_offset;
1685}
1686
1687static inline uint32_t
1688lookup_paddr_legacy_mode (struct guest *g, uint32_t vaddr)
1689{
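    /*
     * Legacy 32-bit (non-PAE) paging is a 2-level walk: bits 31:22 of the
     * virtual address index the page directory and bits 21:12 the page table;
     * the low 12 bits are the page offset. A PDE with the PS bit set maps a
     * 4MB page directly.
     */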
1690    // PAE not supported
1691#ifdef CONFIG_SVM
1692    guest_assert(g, amd_vmcb_cr4_rd(&g->vmcb).pae == 0);
1693#else
1694    uint64_t guest_cr4;
1695    errval_t err = invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_CR4, &guest_cr4);
1696    guest_assert(g, (guest_cr4 & CR4_PAE) == 0);
1697#endif
1698    union x86_legm_va va = { .raw = vaddr };
1699    uint32_t *page_table;
1700
1701    // get a pointer to the pd table
1702#ifdef CONFIG_SVM
1703    page_table = (uint32_t *)guest_to_host(amd_vmcb_cr3_rd(&g->vmcb));
1704#else
1705    uint64_t guest_cr3;
1706    err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_CR3, &guest_cr3);
1707    assert(err_is_ok(err));
1708    page_table = (uint32_t *)guest_to_host(guest_cr3);
1709#endif
1710
1711    // get pd entry
1712    union x86_legm_pd_entry pde = { .raw = page_table[va.u.pd_idx] };
1713    assert (pde.u.p == 1);
1714    // check for 4MB page (PS bit set)
1715    if (pde.u.ps == 1) {
1716        return (pde.u4mb.base_pa << 22) | va.u4mb.pa_offset;
1717    }
1718
1719    // get a pointer to the page table
1720    page_table = (uint32_t *)guest_to_host(pde.u.pt_base_pa << 12);
1721    // get the page table entry
1722    union x86_legm_pt_entry pte = { .raw = page_table[va.u.pt_idx] };
1723    assert(pte.u.p == 1);
1724
1725    return (pte.u.base_pa << 12) | va.u.pa_offset;
1726}
1727
// returns a pointer to a byte array starting at the current instruction
1729static inline int
1730get_instr_arr (struct guest *g, uint8_t **arr)
1731{
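    /*
     * Translate the guest's current RIP into a pointer usable by the monitor.
     * Three cases are handled below: (1) paging disabled - real/segmented
     * addressing, so the host address is guest memory base + CS base + RIP;
     * (2) long mode (EFER.LMA set) with a 64-bit code segment - walk the
     * guest's 4-level page tables; (3) legacy paged protected mode - walk the
     * guest's 2-level page tables.
     */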
1732#ifdef CONFIG_SVM
1733    if (UNLIKELY(amd_vmcb_cr0_rd(&g->vmcb).pg == 0)) {
1734#else
1735    uint64_t guest_cr0;
1736    errval_t err = invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_CR0, &guest_cr0);
1737    if (UNLIKELY((guest_cr0 & CR0_PG) == 0)) {
1738#endif
1739    	//printf("Segmentation active!\n");
1740        // without paging
1741        // take segmentation into account
1742#ifdef CONFIG_SVM
1743        *arr = (uint8_t *)(guest_to_host(g->mem_low_va) +
1744               amd_vmcb_cs_base_rd(&g->vmcb) +
1745               amd_vmcb_rip_rd(&g->vmcb));
1746#else
1747	uint64_t guest_cs_base, guest_rip;
1748	err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_CS_BASE, &guest_cs_base);
1749	err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_RIP, &guest_rip);
1750        *arr = (uint8_t *)(guest_to_host(g->mem_low_va) +
1751			   guest_cs_base + guest_rip);
1752#endif
1753    } else {
1754        // with paging
1755#ifdef CONFIG_SVM
1756        if (amd_vmcb_efer_rd(&g->vmcb).lma == 1) {
1757#else
1758	uint64_t guest_efer;
1759	err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_EFER_F, &guest_efer);
1760	if (guest_efer & EFER_LMA) {
1761#endif
1762            // long mode
1763#ifdef CONFIG_SVM
1764            if (amd_vmcb_cs_attrib_rd(&g->vmcb).l == 1) {
1765                // 64-bit mode
1766                *arr = (uint8_t *)guest_to_host(lookup_paddr_long_mode(g,
1767                                                amd_vmcb_rip_rd(&g->vmcb)));
1768#else
1769	    uint64_t cs_access_rights, guest_rip;
1770	    err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_CS_ACCESS, &cs_access_rights);
1771	    err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_RIP, &guest_rip);
1772	    if (cs_access_rights & ACCESS_RIGHTS_LONG_MODE) {
1773                *arr = (uint8_t *)guest_to_host(lookup_paddr_long_mode(g,
1774                                                guest_rip));
1775#endif
1776            } else {
                // compatibility mode
                guest_assert(g, !"compatibility mode not supported yet");
1779            }
1780        } else {
1781            // Legacy (aka. Paged Protected) Mode
1782#ifdef CONFIG_SVM
1783            assert(amd_vmcb_cr0_rd(&g->vmcb).pe == 1);
1784
1785            *arr = (uint8_t *)guest_to_host(lookup_paddr_legacy_mode(g,
1786                                            amd_vmcb_rip_rd(&g->vmcb)));
1787#else
1788	    assert(guest_cr0 & CR0_PE);
1789
1790	    uint64_t guest_rip;
1791	    err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_RIP, &guest_rip);
1792            *arr = (uint8_t *)guest_to_host(lookup_paddr_legacy_mode(g,
1793                                            guest_rip));
1794#endif
1795        }
1796    }
1797#ifndef CONFIG_SVM
1798    assert(err_is_ok(err));
1799#endif
1800    return HANDLER_ERR_OK;
1801}
1802
1803static inline uint64_t
1804get_reg_val_by_reg_num (struct guest *g, uint8_t regnum) {
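    // register numbers follow the x86 ModR/M reg/rm encoding:
    // 0 = rAX, 1 = rCX, 2 = rDX, 3 = rBX, 4 = rSP, 5 = rBP, 6 = rSI, 7 = rDI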
1805    switch (regnum) {
1806    case 0:
1807        return guest_get_rax(g);
1808    case 1:
1809        return guest_get_rcx(g);
1810    case 2:
1811        return guest_get_rdx(g);
1812    case 3:
1813        return guest_get_rbx(g);
1814    case 4:
1815        return guest_get_rsp(g);
1816    case 5:
1817        return guest_get_rbp(g);
1818    case 6:
1819        return guest_get_rsi(g);
1820    case 7:
1821        return guest_get_rdi(g);
1822    default:
1823        assert(!"not reached");
1824        return 0;
1825    }
1826}
1827
1828static inline void
1829set_reg_val_by_reg_num (struct guest *g, uint8_t regnum, uint64_t val) {
1830    switch (regnum) {
1831    case 0:
1832        guest_set_rax(g, val);
1833        break;
1834    case 1:
1835        guest_set_rcx(g, val);
1836        break;
1837    case 2:
1838        guest_set_rdx(g, val);
1839        break;
1840    case 3:
1841        guest_set_rbx(g, val);
1842        break;
1843    case 4:
1844        guest_set_rsp(g, val);
1845        break;
1846    case 5:
1847        guest_set_rbp(g, val);
1848        break;
1849    case 6:
1850        guest_set_rsi(g, val);
1851        break;
1852    case 7:
1853        guest_set_rdi(g, val);
1854        break;
1855    default:
1856        assert(!"not reached");
1857        break;
1858    }
1859}
1860
1861static int
1862handle_vmexit_cr_access (struct guest *g)
1863{
1864    int r;
1865    uint8_t *code = NULL;
1866#ifndef CONFIG_SVM
1867    errval_t err = 0;
1868    if (g->emulated_before_exit) {
1869        assert(saved_exit_reason == VMX_EXIT_REASON_CR_ACCESS);
1870        assert(((saved_exit_qual >> 0) & 0xf) == 0);
1871    }
1872#endif
1873    // fetch the location to the code
1874    r = get_instr_arr(g, &code);
1875    if (r != HANDLER_ERR_OK) {
1876        return r;
1877    }
1878    assert(code != NULL);
1879
1880    assert(code[0] == 0x0f && (code[1] == 0x20 || code[1] == 0x22));
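    // MOV to/from control register encoding: 0f 20 /r reads CRn into a GPR,
    // 0f 22 /r writes a GPR into CRn; in the ModR/M byte the reg field selects
    // the control register and the rm field selects the GPR (mod must be 11b)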
1881
1882    uint64_t val;
1883    bool read = (code[1] == 0x20);
1884    union x86_modrm mod;
1885    mod.raw = code[2];
1886
1887    // FIXME: use proper exception
1888    assert(mod.u.mod == 3);
1889
1890    // source
1891    if (read) {
1892        // read from CR
1893        switch (mod.u.regop) {
1894        case 0:
1895#ifdef CONFIG_SVM
1896            val = amd_vmcb_cr0_rd_raw(&g->vmcb);
1897#else
1898	    err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_CR0, &val);
1899#endif
1900            break;
1901        default:
1902            printf("CR access: unknown CR source register\n");
1903            return handle_vmexit_unhandeled(g);
1904        }
1905    } else {
1906        // read from GPR
1907        val = get_reg_val_by_reg_num(g, mod.u.rm);
1908    }
1909
1910    // destination
1911    if (read) {
1912        // write to GPR
1913        switch (mod.u.rm) {
1914        case 0:
1915            guest_set_rax(g, val);
1916            break;
1917        case 1:
1918            guest_set_rcx(g, val);
1919            break;
1920        case 2:
1921            guest_set_rdx(g, val);
1922            break;
1923        case 3:
1924            guest_set_rbx(g, val);
1925            break;
1926        default:
1927            printf("CR access: unknown GPR destination register\n");
1928            return handle_vmexit_unhandeled(g);
1929        }
1930    } else {
1931        // write to CR
1932        switch (mod.u.regop) {
1933        case 0:
1934#ifdef CONFIG_SVM
1935            amd_vmcb_cr0_wr_raw(&g->vmcb, val);
1936#else
1937	    err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_GUEST_CR0, val);
1938#endif
1939            break;
1940        default:
1941            printf("CR access: unknown CR destination register\n");
1942            return handle_vmexit_unhandeled(g);
1943        }
1944    }
1945
1946    // advance the rip beyond the instruction
1947#ifdef CONFIG_SVM
1948    amd_vmcb_rip_wr(&g->vmcb, amd_vmcb_rip_rd(&g->vmcb) + 3);
1949#else
1950    uint64_t guest_rip;
1951    err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_RIP, &guest_rip);
1952    err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_GUEST_RIP, guest_rip + 3);
1953    assert(err_is_ok(err));
1954#endif
1955    return HANDLER_ERR_OK;
1956}
1957
1958static int
1959handle_vmexit_ldt (struct guest *g)
1960{
1961    int r;
1962    uint8_t *code = NULL;
1963    uint8_t *mem;
1964
1965    // this handler supports only real-mode
1966#ifdef CONFIG_SVM
1967    assert(amd_vmcb_cr0_rd(&g->vmcb).pe == 0);
1968#else
1969    uint64_t guest_cr0;
1970    errval_t err = invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_CR0, &guest_cr0);
1971    assert((guest_cr0 & CR0_PE) == 0);
1972#endif
1973    // fetch the location to the code
1974    r = get_instr_arr(g, &code);
1975    if (r != HANDLER_ERR_OK) {
1976        return r;
1977    }
1978    mem = (uint8_t *)guest_to_host(g->mem_low_va);
1979    assert(code != NULL);
1980
1981    assert (code[0] == 0x0f && code[1] == 0x01);
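    // 0f 01 /2 is LGDT and 0f 01 /3 is LIDT; both take a memory operand
    // holding a pseudo-descriptor: a 16-bit limit followed by a 32-bit base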
1982
1983    // check for relevant instruction prefixes
1984    bool addr32 = code[-2] == 0x67 || code[-1] == 0x67;
1985    bool op32 = code[-2] == 0x66 || code[-1] == 0x66;
1986    // fetch modrm
1987    union x86_modrm modrm = { .raw = code[2] };
1988
1989    assert(modrm.u.regop == 2 || modrm.u.regop == 3);
1990    guest_assert(g, op32);
1991
1992    uint32_t addr;
1993    if (addr32) {
1994        // byte 3-6 hold a 32 bit address to a mem location where the first word
1995        // holds the limit and the following dword holds the base
1996        addr = *(uint32_t *)&code[3];
1997    } else {
1998        // byte 3-4 hold a 16 bit address to a mem location where the first word
1999        // holds the limit and the following dword holds the base
2000        // this address is relative to DS base
2001#ifdef CONFIG_SVM
2002        addr = *(uint16_t *)&code[3] + amd_vmcb_ds_base_rd(&g->vmcb);
2003#else
2004	uint64_t guest_ds_base;
2005	err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_DS_BASE, &guest_ds_base);
2006	addr = *(uint16_t *)&code[3] + guest_ds_base;
2007#endif
2008    }
2009
    // sanity check on the addr
2011    // FIXME: raise a proper exception
2012    if (addr > g->mem_high_va) {
2013        printf("Memory access beyond physical address space\n");
2014        return HANDLER_ERR_FATAL;
2015    }
2016
2017    // load the actual register
2018    if (modrm.u.regop == 2) {
2019        // LGDT
2020#ifdef CONFIG_SVM
2021        amd_vmcb_gdtr_limit_wr(&g->vmcb, *(uint16_t*)(mem + addr));
2022        amd_vmcb_gdtr_base_wr(&g->vmcb, *(uint32_t*)(mem + addr + 2));
2023#else
2024	err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_GUEST_GDTR_LIM,
2025					 *(uint16_t*)(mem + addr));
2026        err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_GUEST_GDTR_BASE,
2027					 *(uint32_t*)(mem + addr + 2));
2028#endif
2029
2030    } else if (modrm.u.regop == 3) {
2031        // LIDT
2032#ifdef CONFIG_SVM
2033        amd_vmcb_idtr_limit_wr(&g->vmcb, *(uint16_t*)(mem + addr));
2034        amd_vmcb_idtr_base_wr(&g->vmcb, *(uint32_t*)(mem + addr + 2));
2035#else
2036	err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_GUEST_IDTR_LIM,
2037					 *(uint16_t*)(mem + addr));
2038	err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_GUEST_IDTR_BASE,
2039					 *(uint32_t*)(mem + addr + 2));
2040#endif
2041    } else {
2042        assert(!"not reached");
2043    }
2044
2045    // advance the rip beyond the instruction
2046#ifdef CONFIG_SVM
2047    if (addr32) {
2048        amd_vmcb_rip_wr(&g->vmcb, amd_vmcb_rip_rd(&g->vmcb) + 7);
2049    } else {
2050        amd_vmcb_rip_wr(&g->vmcb, amd_vmcb_rip_rd(&g->vmcb) + 5);
2051    }
2052#else
2053    uint64_t guest_rip;
2054    err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_RIP, &guest_rip);
2055    if (addr32) {
2056        err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_GUEST_RIP, guest_rip + 7);
2057    } else {
2058        err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_GUEST_RIP, guest_rip + 5);
2059    }
2060    assert(err_is_ok(err));
2061#endif
2062    return HANDLER_ERR_OK;
2063}
2064
2065#ifndef CONFIG_SVM
2066static inline void vmx_vmcs_rflags_cf_wrf(struct guest *g, int val) {
2067    assert(val == 0 || val == 1);
2068    uint64_t guest_rflags;
2069    errval_t err = invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_RFLAGS, &guest_rflags);
2070    if (val) {
2071        err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_GUEST_RFLAGS,
2072					 guest_rflags | RFLAGS_CF);
2073    } else {
2074        err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_GUEST_RFLAGS,
2075					 guest_rflags & (~RFLAGS_CF));
2076    }
2077    assert(err_is_ok(err));
2078}
2079#endif
2080
2081static int
2082handle_vmexit_swint (struct guest *g)
2083{
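    /*
     * Software interrupts (opcode 0xcd followed by the vector number) are only
     * emulated while the guest runs in real mode, where they correspond to
     * BIOS services. Parameters are passed in the guest's registers (typically
     * AH/AX selects the function) and the carry flag reports errors.
     */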
2084    int r;
2085    uint8_t *code = NULL;
2086
2087    r = get_instr_arr(g, &code);
2088    if (r != HANDLER_ERR_OK) {
2089        return r;
2090    }
2091    assert (code != NULL);
2092
    // check for the correct instruction
2094    assert(code[0] == 0xcd);
2095
    // the interrupt number follows the INT opcode (0xcd)
2097    uint8_t int_num = code[1];
2098
2099    // check whether the guest is in real mode
2100#ifdef CONFIG_SVM
2101    if (amd_vmcb_cr0_rd(&g->vmcb).pe == 0) {
2102#else
2103    uint64_t guest_ds_base, es_guest_base;
2104    uint64_t guest_cr0, guest_rip;
2105    errval_t err = invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_CR0, &guest_cr0);
2106    if ((guest_cr0 & CR0_PE) == 0) {
2107#endif
        // in real mode, interrupts starting at 0x10 have a different meaning
        // than in protected mode; examine the software interrupt
2110        switch (int_num) {
2111            case 0x10:
2112                r = console_handle_int10(g->console, g);
2113                if (r != HANDLER_ERR_OK) {
2114                    printf("Unhandled method on INT 0x10\n");
2115                    return handle_vmexit_unhandeled(g);
2116                }
2117                break;
2118            case 0x12:
2119                switch (guest_get_ax(g)) {
2120                    case 0: // GET MEMORY SIZE
                        // the VM always has a full 1MB of real-mode memory,
                        // but INT 12h reports conventional (base) memory: AX
                        // holds the number of 1KB blocks starting at addr 0,
                        // i.e. 640 (640 KiB)
2124                        guest_set_ax(g, 640);
2125                        break;
2126                    default:
2127                        printf("Unhandled method on INT 0x12\n");
2128                        return handle_vmexit_unhandeled(g);
2129                }
2130                break;
2131            case 0x13:
2132                // Bootable CD-ROM - GET STATUS
2133                if (guest_get_ax(g) == 0x4b01) {
2134                    // no cdrom support
2135#ifdef CONFIG_SVM
2136                    amd_vmcb_rflags_cf_wrf(&g->vmcb, 1);
2137#else
2138		    vmx_vmcs_rflags_cf_wrf(g, 1);
2139#endif
2140                }
2141                // DISK RESET
2142                else if (guest_get_ah(g) == 0) {
2143                    for (int i = 0; i < g->hdd_count; i++) {
2144                        hdd_reset(g->hdds[i]);
2145                    }
2146                }
2147                // DISK - GET DRIVE PARAMETERS (PC,XT286,CONV,PS,ESDI,SCSI)
2148                else if (guest_get_ah(g) == 0x08) {
2149                    uint8_t dl = guest_get_dl(g);
2150
2151                    // only respond to installed hard disks
2152                    if ((dl >> 7) && ((dl & 0x7f) < g->hdd_count)) {
2153                        uint16_t c;
2154                        uint8_t h, s;
2155
2156                        r = hdd_get_geometry_chs(g->hdds[dl & 0x7f], &c, &h, &s);
2157                        assert(r == 0);
2158
2159                        // set some return values for success
2160                        guest_set_ah(g, 0);
2161#ifdef CONFIG_SVM
2162                        amd_vmcb_rflags_cf_wrf(&g->vmcb, 0);
2163#else
2164			vmx_vmcs_rflags_cf_wrf(g, 0);
2165#endif
2166                        guest_set_bl(g, 0);
2167                        // store the geometry into the correct registers
2168                        guest_set_cx(g, c << 6 | (s & 0x3f));
2169                        guest_set_dh(g, h);
2170                        guest_set_dl(g, g->hdd_count);
2171                    } else {
2172#ifdef CONFIG_SVM
2173                        amd_vmcb_rflags_cf_wrf(&g->vmcb, 1);
2174#else
2175			vmx_vmcs_rflags_cf_wrf(g, 1);
2176#endif
                        // it is not entirely clear what AH should contain
                        // when the drive is not present, so set it to 1
                        guest_set_ah(g, 1);
2180                    }
2181                }
2182                // INT 13 Extensions - INSTALLATION CHECK
2183                else if (guest_get_ah(g) == 0x41 && guest_get_bx(g) == 0x55aa) {
2184#ifdef CONFIG_SVM
2185                    amd_vmcb_rflags_cf_wrf(&g->vmcb, 0);
2186#else
2187		    vmx_vmcs_rflags_cf_wrf(g, 0);
2188#endif
2189                    guest_set_bx(g, 0xaa55);
2190                    guest_set_ah(g, 0x01); // Drive extensions 1.x
2191                    guest_set_al(g, 0);
2192                    guest_set_cx(g, 0x5);
2193                }
2194                // IBM/MS INT 13 Extensions - EXTENDED READ
2195                else if (guest_get_ah(g) == 0x42) {
2196                    uint8_t dl = guest_get_dl(g);
2197
2198                    // only respond to installed hard disks
2199                    if ((dl >> 7) && ((dl & 0x7f) < g->hdd_count)) {
2200#ifdef CONFIG_SVM
2201                        amd_vmcb_rflags_cf_wrf(&g->vmcb, 0);
2202#else
2203			vmx_vmcs_rflags_cf_wrf(g, 0);
2204#endif
2205                        guest_set_ah(g, 0);
2206
2207                        struct disk_access_block {
2208                            uint8_t     size;
2209                            uint8_t     reserved;
2210                            uint16_t    count;
                            // pointer to the data buffer, formatted as
                            // SEGMENT:OFFSET
2213                            uint32_t    transfer_buffer;
2214                            uint64_t    abs_block_number;
2215                        } __attribute__ ((packed));
2216
2217                        // memory location of the disk access block
2218#ifdef CONFIG_SVM
2219                        uintptr_t mem = guest_to_host(g->mem_low_va) +
2220                                        amd_vmcb_ds_base_rd(&g->vmcb) +
2221                                        guest_get_si(g);
2222#else
2223			err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_DS_BASE, &guest_ds_base);
2224                        uintptr_t mem = guest_to_host(g->mem_low_va) +
2225                                        guest_ds_base + guest_get_si(g);
2226#endif
2227
2228                        struct disk_access_block *dap = (void *)mem;
2229
2230                        if (dap->size < 0x10) {
2231#ifdef CONFIG_SVM
2232                            amd_vmcb_rflags_cf_wrf(&g->vmcb, 1);
2233#else
2234			    vmx_vmcs_rflags_cf_wrf(g, 1);
2235#endif
2236                            guest_set_ah(g, 1);
2237                        } else {
2238                            // dap->transfer buffer points to a real-mode segment
2239                            // resolve it according to that rules
2240                            mem = guest_to_host(g->mem_low_va) +
2241                                  ((dap->transfer_buffer >> 16) << 4) +
2242                                  (dap->transfer_buffer & 0xffff);
2243
2244                            size_t count = dap->count;
2245                            r = hdd_read_blocks(g->hdds[dl & 0x7f],
2246                                                dap->abs_block_number,
2247                                                &count, mem);
2248                            dap->count = count;
2249
2250                            if (r != HANDLER_ERR_OK) {
2251#ifdef CONFIG_SVM
2252                                amd_vmcb_rflags_cf_wrf(&g->vmcb, 1);
2253#else
2254				vmx_vmcs_rflags_cf_wrf(g, 1);
2255#endif
2256                                guest_set_ah(g, 1);
2257                            }
2258                        }
2259                    } else {
2260#ifdef CONFIG_SVM
2261                        amd_vmcb_rflags_cf_wrf(&g->vmcb, 1);
2262#else
2263			vmx_vmcs_rflags_cf_wrf(g, 1);
2264#endif
                        // it is not entirely clear what AH should contain
                        // when the drive is not present, so set it to 1
                        guest_set_ah(g, 1);
2268                    }
2269                }
2270                // IBM/MS INT 13 Extensions - GET DRIVE PARAMETERS
2271                else if (guest_get_ah(g) == 0x48) {
2272                    uint8_t dl = guest_get_dl(g);
2273
2274                    // only respond to installed hard disks
2275                    if ((dl >> 7) && ((dl & 0x7f) < g->hdd_count)) {
2276                        // structure to hold drive info
2277                        struct drive_params {
2278                            uint16_t size;
2279                            uint16_t flags;
2280                            uint32_t cylinders;
2281                            uint32_t heads;
2282                            uint32_t sectors;
2283                            uint64_t total_sectors;
2284                            uint16_t bytes_per_sector;
2285                        } __attribute__ ((packed));
2286
2287                        // memory where the drive info shall be stored
2288#ifdef CONFIG_SVM
2289                        uintptr_t mem = guest_to_host(g->mem_low_va) +
2290                                        amd_vmcb_ds_base_rd(&g->vmcb) +
2291                                        guest_get_si(g);
2292#else
2293			err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_DS_BASE, &guest_ds_base);
2294                        uintptr_t mem = guest_to_host(g->mem_low_va) +
2295                                        guest_ds_base + guest_get_si(g);
2296#endif
2297
2298                        struct drive_params *drp = (void *)mem;
2299
2300                        // sanity check
2301                        if (drp->size < sizeof(struct drive_params)) {
2302#ifdef CONFIG_SVM
2303                            amd_vmcb_rflags_cf_wrf(&g->vmcb, 1);
2304#else
2305			    vmx_vmcs_rflags_cf_wrf(g, 1);
2306#endif
2307                        } else {
2308#ifdef CONFIG_SVM
2309                            amd_vmcb_rflags_cf_wrf(&g->vmcb, 0);
2310#else
2311			    vmx_vmcs_rflags_cf_wrf(g, 0);
2312#endif
2313                            guest_set_ah(g, 0);
2314
2315                            drp->size = sizeof(struct drive_params);
2316                            // CHS invalid, no removable drive, etc
2317                            drp->flags = 0;
2318                            drp->cylinders = 0;
2319                            drp->heads = 0;
2320                            drp->sectors = 0;
2321                            drp->total_sectors = hdd_get_blocks_count(
2322                                                    g->hdds[dl & 0x7f]);
2323                            drp->bytes_per_sector = 512; // FIXME: Hardcoded
2324                        }
2325                    } else {
2326#ifdef CONFIG_SVM
2327                        amd_vmcb_rflags_cf_wrf(&g->vmcb, 1);
2328#else
2329			vmx_vmcs_rflags_cf_wrf(g, 1);
2330#endif
                        // it is not entirely clear what AH should contain
                        // when the drive is not present, so set it to 1
                        guest_set_ah(g, 0x1);
2334                    }
2335                } else {
2336                    printf("Unhandeled method on INT 0x13\n");
2337                    return handle_vmexit_unhandeled(g);
2338                }
2339                break;
2340            case 0x15:
2341                // ENABLE A20 GATE
2342                if (guest_get_ax(g) == 0x2401) {
2343                    g->a20_gate_enabled = true;
2344#ifdef CONFIG_SVM
2345                    amd_vmcb_rflags_cf_wrf(&g->vmcb, 0);
2346#else
2347		    vmx_vmcs_rflags_cf_wrf(g, 0);
2348#endif
2349                    guest_set_ah(g, 0);
2350                }
2351                // APM INSTALLATION CHECK
2352                else if (guest_get_ax(g) == 0x5300) {
2353                    // we do not support APM - set carry flag to indicate error
2354#ifdef CONFIG_SVM
2355                    amd_vmcb_rflags_cf_wrf(&g->vmcb, 1);
2356#else
2357		    vmx_vmcs_rflags_cf_wrf(g, 1);
2358#endif
2359                }
2360                // APM DISCONNECT
2361                else if (guest_get_ax(g) == 0x5304) {
2362                    // we do not support APM - set carry flag to indicate error
2363#ifdef CONFIG_SVM
2364                    amd_vmcb_rflags_cf_wrf(&g->vmcb, 1);
2365#else
2366		    vmx_vmcs_rflags_cf_wrf(g, 1);
2367#endif
2368                }
2369                // GET MEMORY SIZE FOR >64M CONFIGURATIONS
2370                else if (guest_get_ax(g) == 0xe801) {
2371                    // we do not support this BIOS call
2372                    // both grub and linux may also use the 0xe820 call
2373#ifdef CONFIG_SVM
2374                    amd_vmcb_rflags_cf_wrf(&g->vmcb, 1);
2375#else
2376		    vmx_vmcs_rflags_cf_wrf(g, 1);
2377#endif
2378                }
2379                // GET SYSTEM MEMORY MAP
2380                // EDX has to contain 0x534d4150 (== 'SMAP')
2381                else if (guest_get_ax(g) == 0xe820 &&
2382                         guest_get_edx(g) == 0x534d4150) {
2383                    // for now we return only one entry containing the real mem
2384                    if (guest_get_ebx(g) > 1 || guest_get_ecx(g) < 20) {
2385                        // wrong input params -> report error
2386#ifdef CONFIG_SVM
2387                        amd_vmcb_rflags_cf_wrf(&g->vmcb, 1);
2388#else
2389			vmx_vmcs_rflags_cf_wrf(g, 1);
2390#endif
2391                    } else {
2392                        // taken from http://www.ctyme.com/intr/rb-1741.htm
2393#ifdef CONFIG_SVM
2394                        uintptr_t addr = guest_to_host(g->mem_low_va) +
2395                                         amd_vmcb_es_base_rd(&g->vmcb) +
2396                                         guest_get_di(g);
2397#else
2398			err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_ES_BASE, &es_guest_base);
2399                        uintptr_t addr = guest_to_host(g->mem_low_va) +
2400                                         es_guest_base + guest_get_di(g);
2401#endif
2402                        // set EAX to 'SMAP'
2403                        guest_set_eax(g, 0x534D4150);
2404                        // returned bytes (always 20)
2405                        guest_set_ecx(g, 20);
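
                        // each E820 entry written to ES:DI is 20 bytes: a
                        // 64-bit base address at offset 0, a 64-bit length at
                        // offset 8 and a 32-bit type at offset 16 (type 1 =
                        // RAM available to the OS)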
2406
2407                        switch (guest_get_ebx(g)) {
2408                        case 0x0:
2409                            // base memory
2410                            assert(g->mem_low_va == 0);
2411                            // base address
2412                            *(uint64_t *)addr = 0;
2413                            // size of the memory block
2414                            *(uint64_t *)(addr + 8) = 0xa0000; // 640 KiB
2415                            // mem type, 1 == "memory, available to the OS"
2416                            *(uint32_t *)(addr + 16) = 1;
2417                            // indicate that there is more data
2418                            guest_set_ebx(g, 1);
2419                            break;
2420                        case 0x1:
2421                            // extended memory
2422                            assert(g->mem_high_va > 0x100000);
2423                            // base address
2424                            *(uint64_t *)addr = 0x100000;   // 1 MiB
2425                            // size of the memory block
2426                            *(uint64_t *)(addr + 8) = g->mem_high_va - 0x100000;
2427                            // mem type, 1 == "memory, available to the OS"
2428                            *(uint32_t *)(addr + 16) = 1;
2429                            // indicate that there is no more data
2430                            guest_set_ebx(g, 0);
2431                            break;
2432                        default:
2433                            assert(!"not reached");
2434                            break;
2435                        }
2436
2437                        // mark success
2438#ifdef CONFIG_SVM
2439                        amd_vmcb_rflags_cf_wrf(&g->vmcb, 0);
2440#else
2441			vmx_vmcs_rflags_cf_wrf(g, 0);
2442#endif
2443                    }
2444                }
2445                // SYSTEM - Get Intel SpeedStep (IST) information
2446                else if (guest_get_ax(g) == 0xe980) {
                    // not supported yet
2448#ifdef CONFIG_SVM
2449                    amd_vmcb_rflags_cf_wrf(&g->vmcb, 1);
2450#else
2451		    vmx_vmcs_rflags_cf_wrf(g, 1);
2452#endif
2453                }
2454                // SYSTEM - GET CONFIGURATION (XT >1986/1/10,AT mdl 3x9,
2455                // CONV,XT286,PS)
2456                // GRUB BUG: it puts 0xc0 into AX instead of AH
2457                else if (guest_get_ax(g) == 0xc0) {
2458                    // we do not support this
2459#ifdef CONFIG_SVM
2460                    amd_vmcb_rflags_cf_wrf(&g->vmcb, 1);
2461#else
2462		    vmx_vmcs_rflags_cf_wrf(g, 1);
2463#endif
2464                    guest_set_ah(g, 0x80);
2465                }
2466                // GET EXTENDED MEMORY SIZE
2467                else if (guest_get_ah(g) == 0x88) {
2468                    // calculate number of 1KB chunks starting from 1MB but not
2469                    // beyond 16MB
2470                    assert(((g->mem_high_va - g->mem_low_va) & 0x3ff) == 0);
2471                    guest_set_ax(g, MIN(0x3c00 /* 16MB */,
2472                                 (g->mem_high_va - g->mem_low_va) / 1024));
                    // indicate that no error occurred
2474#ifdef CONFIG_SVM
2475                    amd_vmcb_rflags_cf_wrf(&g->vmcb, 0);
2476#else
2477		    vmx_vmcs_rflags_cf_wrf(g, 0);
2478#endif
2479                }
2480                // SYSTEM - GET CONFIGURATION (XT >1986/1/10,AT mdl 3x9,
2481                // CONV,XT286,PS)
2482                else if (guest_get_ah(g) == 0xc0) {
2483                    // we do not support this
2484#ifdef CONFIG_SVM
2485                    amd_vmcb_rflags_cf_wrf(&g->vmcb, 1);
2486#else
2487		    vmx_vmcs_rflags_cf_wrf(g, 1);
2488#endif
2489                    guest_set_ah(g, 0x80);
2490                // SYSTEM - SET BIOS MODE
2491                } else if (guest_get_ah(g) == 0xec) {
                    // the exact purpose of this BIOS call is unclear, and
                    // Linux expects no action whatsoever
2494                } else {
2495                    printf("Unhandeled method on INT 0x15\n");
2496                    return handle_vmexit_unhandeled(g);
2497                }
2498                break;
2499            case 0x16:
2500                // KEYBOARD - SET TYPEMATIC RATE AND DELAY
2501                if (guest_get_ah(g) == 0x3) {
2502                    // ignore this
2503                } else {
2504                    printf("Unhandeled method on INT 0x16\n");
2505                    return handle_vmexit_unhandeled(g);
2506                }
2507                break;
2508            case 0x1a:
2509                // TIME - GET REAL-TIME CLOCK TIME (AT,XT286,PS)
2510                if (guest_get_ah(g) == 0x2) {
2511                    uint8_t h, m, s;
2512                    lpc_rtc_get_time_bcd(g->lpc, &h, &m, &s);
2513                    guest_set_ch(g, h);
2514                    guest_set_cl(g, m);
2515                    guest_set_dh(g, s);
2516                    guest_set_dl(g, 0);
2517                    // mark success
2518#ifdef CONFIG_SVM
2519                    amd_vmcb_rflags_cf_wrf(&g->vmcb, 0);
2520#else
2521		    vmx_vmcs_rflags_cf_wrf(g, 0);
2522#endif
2523                } else {
2524                    printf("Unhandeled method on INT 0x1a\n");
2525                    return handle_vmexit_unhandeled(g);
2526                }
2527                break;
2528            default:
2529                printf("handle_vmexit_swint: Unhandeled real-mode interrupt "
2530                       "0x%x (%d).\n", int_num, int_num);
2531                return handle_vmexit_unhandeled(g);
2532        }
2533    } else {
2534        printf("vmkitmon: encountered INT instruction outside real mode\n");
2535        return handle_vmexit_unhandeled(g);
2536    }
2537
2538    // advance the rip beyond the instruction
2539#ifdef CONFIG_SVM
2540    amd_vmcb_rip_wr(&g->vmcb, amd_vmcb_rip_rd(&g->vmcb) + 2);
2541#else
2542    err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_RIP, &guest_rip);
2543    err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_GUEST_RIP, guest_rip + 2);
2544    assert(err_is_ok(err));
2545#endif
2546    return HANDLER_ERR_OK;
2547}
2548
2549static inline enum opsize
2550io_access_size_to_opsize (enum x86_io_access io)
2551{
2552    if (io & X86_IO_ACCESS_SZ8) {
2553        return OPSIZE_8;
2554    } else if (io & X86_IO_ACCESS_SZ16) {
2555        return OPSIZE_16;
2556    } else if (io & X86_IO_ACCESS_SZ32) {
2557        return OPSIZE_32;
2558    } else {
2559        assert(!"NYI");
2560        return 0;
2561    }
2562}
2563
2564static int
2565handle_vmexit_ioio (struct guest *g)
2566{
2567    int r;
2568#ifdef CONFIG_SVM
2569    uint64_t info1 = amd_vmcb_exitinfo1_rd(&g->vmcb);
2570    enum x86_io_access io;
2571    uint16_t port = info1 >> 16;
2572#else
2573    errval_t err = 0;
2574    if (!g->emulated_before_exit) {
2575        err += invoke_dispatcher_vmread(g->dcb_cap, VMX_EXIT_QUAL, &saved_exit_qual);
2576	uint64_t instr_len, guest_rip;
2577	err += invoke_dispatcher_vmread(g->dcb_cap, VMX_EXIT_INSTR_LEN, &instr_len);
2578	err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_RIP, &guest_rip);
2579	saved_rip = guest_rip + instr_len;
2580    }
2581    uint16_t port = (saved_exit_qual >> 16) & 0xffff;
2582#endif
2583    bool write;
2584    enum opsize size;
2585    uint32_t val;
2586    bool newapi = false; // needed as a transition
2587
2588#ifdef CONFIG_SVM
2589    // copy the access flags
2590    // FIXME: this severely exploits the way the x86_io_access flags are set up
2591    io = (info1 >> 1);
2592    io |= info1 & SVM_IOIO_TYPE_MASK;
2593
2594    // gather some params for the io access
2595    write = (io & X86_IO_ACCESS_TYPE) == 0;
2596    size = OPSIZE_8; // make gcc happy
2597    if (io & X86_IO_ACCESS_SZ8) {
2598        size = OPSIZE_8;
2599    } else if (io & X86_IO_ACCESS_SZ16) {
2600        size = OPSIZE_16;
2601    } else if (io & X86_IO_ACCESS_SZ32) {
2602        size = OPSIZE_32;
2603    }
2604#else
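    // the VMX exit qualification for I/O instructions encodes the access size
    // in bits 2:0 (0 = 1 byte, 1 = 2 bytes, 3 = 4 bytes), the direction in
    // bit 3 (0 = OUT/write, 1 = IN/read) and the port number in bits 31:16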
2605    write = ((saved_exit_qual >> 3) & 0x1) == 0;
2606    size = OPSIZE_8;
2607    if ((saved_exit_qual & 0x7) == 0) {
2608        size = OPSIZE_8;
2609    } else if ((saved_exit_qual & 0x7) == 1) {
2610        size = OPSIZE_16;
2611    } else if ((saved_exit_qual & 0x7) == 3) {
2612        size = OPSIZE_32;
2613    } else {
2614        assert(!"Invalid size of access value");
2615    }
2616#endif
    // fetch the source value if necessary
2618    if (write) {
2619        switch (size) {
2620        case OPSIZE_8:
2621            val = guest_get_al(g);
2622            break;
2623        case OPSIZE_16:
2624            val = guest_get_ax(g);
2625            break;
2626        case OPSIZE_32:
2627            val = guest_get_eax(g);
2628            break;
2629        default:
2630            assert(!"not reached");
2631            break;
2632        }
2633    }
2634
2635    // assign the request to the corresponding subsystem
2636    switch (port) {
2637        // LPC devices
2638        case 0x20:  // primary PIC
2639        case 0x21:  // primary PIC
2640        case 0x40:  // Timer
2641        case 0x41:  // Timer
2642        case 0x42:  // Timer
2643        case 0x43:  // Timer
2644        case 0x61:  // NMI Controller
2645        case 0x70:  // RTC
2646        case 0x71:  // RTC
2647        case 0x72:  // RTC
2648        case 0x73:  // RTC
2649        case 0x74:  // RTC
2650        case 0x75:  // RTC
2651        case 0x76:  // RTC
2652        case 0x77:  // RTC
2653        case 0xa0:  // secondary PIC
2654        case 0xa1:  // secondary PIC
2655            if (write) {
2656                r = lpc_handle_pio_write(g->lpc, port, size, val);
2657                guest_assert(g, r == 0);
2658            } else {
2659                r = lpc_handle_pio_read(g->lpc, port, size, &val);
2660                assert(r == 0);
2661            }
2662            newapi = true;
2663            break;
2664        // Keyboard
2665        case 0x60:
2666        case 0x64:
2667            // we currently do not support a keyboard
2668            if (!write) {
2669                val = ~0;
2670            }
2671            newapi = true;
2672            break;
2673        case 0x80:
            // some apps write to this port as a method to delay execution,
            // so we just do nothing
2676            break;
2677        // Coprocessor
2678        case 0xf0:
2679        case 0xf1:
2680            // coprocessor IGNNE# - do nothing for now
2681            break;
2682
2683        // serial COM1 port
2684        // FIXME: this should not be hardcoded !
2685        case 0x3f8:
2686        case 0x3f9:
2687        case 0x3fa:
2688        case 0x3fb:
2689        case 0x3fc:
2690        case 0x3fd:
2691        case 0x3fe:
2692        case 0x3ff:
2693        // COM2
2694        case 0x2f8:
2695        case 0x2f9:
2696        case 0x2fa:
2697        case 0x2fb:
2698        case 0x2fc:
2699        case 0x2fd:
2700        case 0x2fe:
2701        case 0x2ff:
2702        // COM3
2703        case 0x3e8:
2704        case 0x3e9:
2705        case 0x3ea:
2706        case 0x3eb:
2707        case 0x3ec:
2708        case 0x3ed:
2709        case 0x3ee:
2710        case 0x3ef:
2711        // COM4
2712        case 0x2e8:
2713        case 0x2e9:
2714        case 0x2ea:
2715        case 0x2eb:
2716        case 0x2ec:
2717        case 0x2ed:
2718        case 0x2ee:
2719        case 0x2ef: {
2720            int com;
2721
2722            com = (port & 0xf0) == 0xf0 ? !(port & 0x100) : !(port & 0x100) + 2;
2723            assert(com >= 0 && com < 4);
2724            if (write) {
2725                r = pc16550d_handle_pio_write(g->serial_ports[com], port,
2726                                              size, val);
2727                assert(r == 0);
2728            } else {
2729                r = pc16550d_handle_pio_read(g->serial_ports[com], port,
2730                                             size, &val);
2731                assert(r == 0);
2732            }
2733            newapi = true;
2734            break;
2735        }
2736
2737            // PCI config space (address)
2738    case 0xcf8:
2739    case 0xcf9:
2740    case 0xcfa:
2741    case 0xcfb:
2742            // PCI config space (data)
2743    case 0xcfc:
2744    case 0xcfd:
2745    case 0xcfe:
2746    case 0xcff:
2747        if(write) {
2748            r = pci_handle_pio_write(g->pci, port, size, val);
2749        } else {
2750            r = pci_handle_pio_read(g->pci, port, size, &val);
2751        }
2752        assert(r == 0);
2753        newapi = true;
2754        break;
2755
2756        default:
            // by default return all ones (0xff in every byte) and ignore writes
2758            if (!write) {
2759                val = 0xffffffff;
2760            }
2761            newapi = true;
2762    };
2763
    // set the destination when necessary
2765    if (newapi && !write) {
2766        switch (size) {
2767        case OPSIZE_8:
2768            guest_set_al(g, val);
2769            break;
2770        case OPSIZE_16:
2771            guest_set_ax(g, val);
2772            break;
2773        case OPSIZE_32:
2774            guest_set_eax(g, val);
2775            break;
2776        default:
2777            assert(!"not reached");
2778            break;
2779        }
2780    }
2781
    // the guest resumes at the instruction following the I/O access: on SVM
    // that address is stored in the exitinfo2 field, on VMX it was computed
    // into saved_rip above
2783#ifdef CONFIG_SVM
2784    amd_vmcb_rip_wr(&g->vmcb, amd_vmcb_exitinfo2_rd(&g->vmcb));
2785#else
2786    err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_GUEST_RIP, saved_rip);
2787    assert(err_is_ok(err));
2788#endif
2789    return HANDLER_ERR_OK;
2790}
2791
2792static int
2793handle_vmexit_msr (struct guest *g) {
2794#ifdef CONFIG_SVM
2795    bool write = amd_vmcb_exitinfo1_rd(&g->vmcb) == 1;
2796#else
2797    int msr_index;
2798    errval_t err = 0;
2799    bool write = (saved_exit_reason == VMX_EXIT_REASON_WRMSR);
2800    struct msr_entry *guest_msr_area = (struct msr_entry *)g->msr_area_va;
2801#endif
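    // RDMSR/WRMSR pass the MSR index in ECX and the 64-bit value in EDX:EAX
    // (EDX holds the upper 32 bits, EAX the lower 32 bits)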
2802    uint32_t msr = guest_get_ecx(g);
2803    uint64_t val;
2804
2805    // there may be writes or reads to MSRs
2806    if (write) {
2807        // fetch the value to write from EDX:EAX
2808        val = ((uint64_t)guest_get_edx(g) << 32) | guest_get_eax(g);
2809
2810        // store the read value into the corresponding location
2811        switch (msr) {
2812        case X86_MSR_SYSENTER_CS:
2813#ifdef CONFIG_SVM
2814            amd_vmcb_sysenter_cs_wr(&g->vmcb, val);
2815#else
2816	    err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_GUEST_SYSENTER_CS, val);
2817#endif
2818            break;
2819        case X86_MSR_SYSENTER_ESP:
2820#ifdef CONFIG_SVM
2821            amd_vmcb_sysenter_esp_wr(&g->vmcb, val);
2822#else
2823	    err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_GUEST_SYSENTER_ESP, val);
2824#endif
2825            break;
2826        case X86_MSR_SYSENTER_EIP:
2827#ifdef CONFIG_SVM
2828            amd_vmcb_sysenter_eip_wr(&g->vmcb, val);
2829#else
2830	    err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_GUEST_SYSENTER_EIP, val);
2831#endif
2832            break;
2833        case X86_MSR_EFER:
2834#ifdef CONFIG_SVM
2835            amd_vmcb_efer_wr_raw(&g->vmcb, val);
2836#else
2837	    err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_GUEST_EFER_F, val);
2838#endif
2839            break;
2840        case X86_MSR_FS_BASE:
2841#ifdef CONFIG_SVM
2842            amd_vmcb_fs_base_wr(&g->vmcb, val);
2843#else
2844	    err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_GUEST_FS_BASE, val);
2845#endif
2846            break;
2847        case X86_MSR_GS_BASE:
2848#ifdef CONFIG_SVM
2849            amd_vmcb_gs_base_wr(&g->vmcb, val);
2850#else
2851	    err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_GUEST_GS_BASE, val);
2852#endif
2853            break;
2854#ifdef CONFIG_SVM
2855        case X86_MSR_KERNEL_GS_BASE:
2856            amd_vmcb_kernel_gs_base_wr(&g->vmcb, val);
2857            break;
2858        case X86_MSR_STAR:
2859            amd_vmcb_star_wr(&g->vmcb, val);
2860            break;
2861        case X86_MSR_LSTAR:
2862            amd_vmcb_lstar_wr(&g->vmcb, val);
2863            break;
2864        case X86_MSR_CSTAR:
2865            amd_vmcb_cstar_wr(&g->vmcb, val);
2866            break;
2867        case X86_MSR_SFMASK:
2868            amd_vmcb_sfmask_wr(&g->vmcb, val);
2869            break;
2870        default:
2871            printf("MSR: unhandeled MSR write access to %x\n", msr);
2872            return handle_vmexit_unhandeled(g);
2873#else
2874	default:
2875	    msr_index = vmx_guest_msr_index(msr);
2876	    if (msr_index == -1) {
	        printf("MSR: unhandled MSR write access to %x\n", msr);
2878		return handle_vmexit_unhandeled(g);
2879	    }
2880	    guest_msr_area[msr_index].val = val;
2881	    break;
2882#endif
2883        }
2884    } else {
2885        // read the value from the corresponding location
2886        switch (msr) {
2887        case X86_MSR_SYSENTER_CS:
2888#ifdef CONFIG_SVM
2889            val = amd_vmcb_sysenter_cs_rd(&g->vmcb);
2890#else
2891	    err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_SYSENTER_CS, &val);
2892#endif
2893            break;
2894        case X86_MSR_SYSENTER_ESP:
2895#ifdef CONFIG_SVM
2896            val = amd_vmcb_sysenter_esp_rd(&g->vmcb);
2897#else
2898	    err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_SYSENTER_ESP, &val);
2899#endif
2900            break;
2901        case X86_MSR_SYSENTER_EIP:
2902#ifdef CONFIG_SVM
2903            val = amd_vmcb_sysenter_eip_rd(&g->vmcb);
2904#else
2905	    err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_SYSENTER_EIP, &val);
2906#endif
2907            break;
2908        case X86_MSR_EFER:
2909#ifdef CONFIG_SVM
2910            val = amd_vmcb_efer_rd_raw(&g->vmcb);
2911#else
2912	    err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_EFER_F, &val);
2913#endif
2914            break;
2915        case X86_MSR_FS_BASE:
2916#ifdef CONFIG_SVM
2917            val = amd_vmcb_fs_base_rd(&g->vmcb);
2918#else
2919	    err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_FS_BASE, &val);
2920#endif
2921            break;
2922        case X86_MSR_GS_BASE:
2923#ifdef CONFIG_SVM
2924            val = amd_vmcb_gs_base_rd(&g->vmcb);
2925#else
	    err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_GS_BASE, &val);
2927#endif
2928            break;
2929#ifdef CONFIG_SVM
2930        case X86_MSR_KERNEL_GS_BASE:
2931            val = amd_vmcb_kernel_gs_base_rd(&g->vmcb);
2932            break;
2933        case X86_MSR_STAR:
2934            val = amd_vmcb_star_rd(&g->vmcb);
2935            break;
2936        case X86_MSR_LSTAR:
2937            val = amd_vmcb_lstar_rd(&g->vmcb);
2938            break;
2939        case X86_MSR_CSTAR:
2940            val = amd_vmcb_cstar_rd(&g->vmcb);
2941            break;
2942        case X86_MSR_SFMASK:
2943            val = amd_vmcb_sfmask_rd(&g->vmcb);
2944            break;
2945        default:
2946            printf("MSR: unhandeled MSR read access to %x\n", msr);
2947            return handle_vmexit_unhandeled(g);
2948#else
2949	default:
2950	    msr_index = vmx_guest_msr_index(msr);
2951	    if (msr_index == -1) {
	      printf("MSR: unhandled MSR read access to %x\n", msr);
2953	      return handle_vmexit_unhandeled(g);
2954	    }
2955	    val = guest_msr_area[msr_index].val;
2956	    break;
2957#endif
2958        }
2959
2960        // store the value in EDX:EAX
2961        guest_set_eax(g, val);
2962        guest_set_edx(g, val >> 32);
2963    }
2964
2965    // advance the rip beyond the current instruction
2966#ifdef CONFIG_SVM
2967    amd_vmcb_rip_wr(&g->vmcb, amd_vmcb_rip_rd(&g->vmcb) + 2);
2968#else
2969    uint64_t guest_rip;
2970    err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_RIP, &guest_rip);
2971    err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_GUEST_RIP, guest_rip + 2);
2972    assert(err_is_ok(err));
2973#endif
2974    return HANDLER_ERR_OK;
2975}
2976
2977static int
2978handle_vmexit_cpuid (struct guest *g) {
2979    uint32_t eax, ebx, ecx, edx;
2980    uint32_t func = guest_get_eax(g);
2981
2982    switch (func) {
2983    // Processor Vendor and Largest Standard Function Number
2984    case 0:
2985    case 0x80000000:
        // largest supported standard / extended function number
2987        eax = func == 0 ? 0x1 : 0x80000000;
        // vendor string "AuthenticAMD": EBX = "Auth", EDX = "enti", ECX = "cAMD"
        ebx = 0x68747541;
        ecx = 0x444d4163;
        edx = 0x69746e65;
2992    break;
2993
2994    // Family, Model, Stepping Identifiers
2995    case 1:
        // we simulate an AMD K6-3D
2997        // Family 5, Model 8, Stepping 12
2998        eax = 0x58c;
        // no brand, CLFLUSH size 16, no multiprocessing, no local APIC
3000        ebx = 0x0f00;
        // support the POPCNT instruction
3002        ecx = 0x800000;
3003        // support some basic features
3004        edx = 0x89a91b;
3005    break;
3006
3007    default:
3008        // use the answer of the host if there is any other request
3009        // FIXME: this is probably not a good idea ;)
3010        cpuid(func, &eax, &ebx, &ecx, &edx);
3011        printf("handle_vmexit_cpuid: CPUID: func %x, host reports: eax %x, "
3012                "ebx %x, ecx %x, edx %x\n", func, eax, ebx, ecx, edx);
3013        break;
3014    }
3015
3016    guest_set_eax(g, eax);
3017    guest_set_ebx(g, ebx);
3018    guest_set_ecx(g, ecx);
3019    guest_set_edx(g, edx);
3020
3021    // advance the rip beyond the instruction
3022#ifdef CONFIG_SVM
3023    amd_vmcb_rip_wr(&g->vmcb, amd_vmcb_rip_rd(&g->vmcb) + 2);
3024#else
3025    uint64_t guest_rip;
3026    errval_t err = invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_RIP, &guest_rip);
3027    err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_GUEST_RIP, guest_rip + 2);
3028    assert(err_is_ok(err));
3029#endif
3030    return HANDLER_ERR_OK;
3031}
3032
3033static int
3034handle_vmexit_vmmcall (struct guest *g) {
3035    printf("VMMCALL: tsc %lu, exits with mon invocation %lu, exits w/o mon "
3036           "invocation %lu\n", rdtsc(),
3037           g->ctrl->num_vm_exits_with_monitor_invocation,
3038           g->ctrl->num_vm_exits_without_monitor_invocation);
3039
3040    // advance the rip beyond the instruction
3041#ifdef CONFIG_SVM
3042    amd_vmcb_rip_wr(&g->vmcb, amd_vmcb_rip_rd(&g->vmcb) + 3);
3043#else
3044    uint64_t guest_rip;
3045    errval_t err = invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_RIP, &guest_rip);
3046    err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_GUEST_RIP, guest_rip + 3);
3047    assert(err_is_ok(err));
3048#endif
3049    return HANDLER_ERR_OK;
3050}
3051
3052static int
3053handle_vmexit_hlt (struct guest *g) {
    // the guest has nothing to do - poll our IRQ sources for pending IRQs;
    // if they do not assert a virtual IRQ then we will do nothing
3056    lpc_pic_process_irqs(g->lpc);
3057
3058    // advance the rip beyond the instruction
3059#ifdef CONFIG_SVM
3060    amd_vmcb_rip_wr(&g->vmcb, amd_vmcb_rip_rd(&g->vmcb) + 1);
3061#else
3062    uint64_t guest_rip;
3063    errval_t err = invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_RIP, &guest_rip);
3064    err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_GUEST_RIP, guest_rip + 1);
3065#endif
3066
3067    // running HLT with IRQs masked does not make any sense
3068    // FIXME: this assert silly, shutting down the VM would be the right way
3069#ifdef CONFIG_SVM
3070    guest_assert(g, amd_vmcb_rflags_rd(&g->vmcb).intrf == 1);
3071#else
3072    uint64_t guest_rflags;
3073    err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_RFLAGS, &guest_rflags);
3074    assert(err_is_ok(err));
3075    guest_assert(g, guest_rflags & RFLAGS_IF);
3076#endif
3077    if (virq_pending(g, NULL, NULL)) {
3078        // there is an IRQ pending, proceed as normal, the CPU will take it
3079    } else {
3080        // there is really nothing to do - stop the VM and wait
3081        g->runnable = false;
3082    }
3083
3084    return HANDLER_ERR_OK;
3085}
3086
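// Compute the length in bytes of a MOV between a register and memory
// (opcodes 0x89 and 0x8b): one opcode byte plus a ModR/M byte, plus an
// optional REX prefix, an 8- or 32-bit displacement depending on ModR/M.mod,
// and a SIB byte when ModR/M.rm == 100b outside register-direct mode.
// Example: 89 50 04 (mov [rax+4], edx) decodes to a length of 3 bytes.
// Note: the RIP-relative form (mod == 00b, rm == 101b, disp32) is not handled.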
static inline int
decode_mov_instr_length (struct guest *g, uint8_t *code)
{
    int len;

    // we only support long mode for now
    //assert(amd_vmcb_efer_rd(&g->vmcb).lma == 1);

    // all non-special MOV instructions use one byte as opcode and at least a
    // ModR/M byte
    len = 2;
    // check for the REX prefix
    if ((code[0] >> 4) == 0x4) {
        len++;
        code++;
    }
    // precaution because not all variants of MOV have been checked; at least
    // these two variants are supported
    assert(code[0] == 0x89 || code[0] == 0x8b);

    union x86_modrm modrm = { .raw = code[1] };
    // check for displacements
    if (modrm.u.mod == 0x1) {
        // 1-byte displacement
        len++;
    } else if (modrm.u.mod == 0x2) {
        // 4-byte displacement
        len += 4;
    }

    // check for the SIB byte
    if (modrm.u.rm == 0x4 && modrm.u.mod != 0x3) {
        len++;
    }

    return len;
}

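// A MOV between a register and memory encodes its direction in the opcode:
// 0x89 stores the register to memory, 0x8b loads the register from memory.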
// finds out whether a move instruction is a read or a write with respect to
// memory
static inline bool
decode_mov_is_write (struct guest *g, uint8_t *code)
{
    // check for the REX prefix
    if ((code[0] >> 4) == 0x4) {
        code++;
    }

    // we only support one move variant (in each direction) for now
    assert(code[0] == 0x89 || code[0] == 0x8b);

    union x86_modrm modrm = { .raw = code[1] };
    // not defined for reg to reg moves
    assert(modrm.u.mod != 3);

    return code[0] == 0x89; // 0x89 ==> MOV reg -> mem
}

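// Determine the operand size of the trapped MOV. In long mode the default
// operand size is 32 bits; a REX prefix with the W bit set (0x48-0x4f)
// selects a 64-bit operand. Example: 48 89 02 is mov [rdx], rax (64-bit),
// while 89 02 is mov [rdx], eax (32-bit).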
static inline enum opsize
decode_mov_op_size (struct guest *g, uint8_t *code)
{
    /*
    printf("EFER: 0x%lx\n", amd_vmcb_efer_rd_raw(&g->vmcb));
    printf("Code: 0x%lx\n", *((uint64_t *)code));
    printf("Code[0]: 0x%x, Code[1]: 0x%x, Code[2]: 0x%x, Code[3]: 0x%x\n", code[0],code[1],code[2],code[3]);
    printf("Guest EAX: 0x%x\n", guest_get_eax(g));
    printf("Guest EBX: 0x%x\n", guest_get_ebx(g));
    printf("Guest ECX: 0x%x\n", guest_get_ecx(g));
    printf("Guest EDX: 0x%x\n", guest_get_edx(g));
    printf("Guest RDI: 0x%lx\n", guest_get_rdi(g));
    printf("Guest RSI: 0x%lx\n", guest_get_rsi(g));
    printf("Guest RSP: 0x%lx\n", guest_get_rsp(g));
    printf("Guest RBP: 0x%lx\n", guest_get_rbp(g));
    */

    // we only support long mode for now
    //assert(amd_vmcb_efer_rd(&g->vmcb).lma == 1);

    // check for a REX prefix with the W bit set (64-bit operand size)
    if ((code[0] >> 4) == 0x4 && (code[0] & 0x08)) {
        return OPSIZE_64;
    }
    return OPSIZE_32;
}

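// Extract the value to be stored by a trapped MOV reg -> mem (opcode 0x89):
// the source register is named by the reg field of the ModR/M byte.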
static inline uint64_t
decode_mov_src_val (struct guest *g, uint8_t *code) {
    // we only support long mode for now
    //assert(amd_vmcb_efer_rd(&g->vmcb).lma == 1);

    // check for the REX prefix
    if ((code[0] >> 4) == 0x4) {
        code++;
    }

    // we only support one variant for now
    assert(code[0] == 0x89);

    union x86_modrm modrm = { .raw = code[1] };
    return get_reg_val_by_reg_num(g, modrm.u.regop);
}

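// Complete a trapped MOV mem -> reg (opcode 0x8b): write the value read from
// the emulated device into the destination register named by ModR/M.reg.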
static inline void
decode_mov_dest_val (struct guest *g, uint8_t *code, uint64_t val)
{
    // we only support long mode for now
    //assert(amd_vmcb_efer_rd(&g->vmcb).lma == 1);

    // check for the REX prefix
    if ((code[0] >> 4) == 0x4) {
        code++;
    }

    // we only support one variant for now
    assert(code[0] == 0x8b);

    union x86_modrm modrm = { .raw = code[1] };
    set_reg_val_by_reg_num(g, modrm.u.regop, val);
}

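// Handle a nested page fault (SVM NPF / VMX EPT violation). Faults inside the
// guest RAM window would be backed lazily with fresh memory; faults on the
// virtual APIC page are emulated as MMIO by decoding the trapped MOV. All
// other addresses are reported as unhandled.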
static int
handle_vmexit_npf (struct guest *g) {
    int r;
#ifdef CONFIG_SVM
    uint64_t fault_addr = amd_vmcb_exitinfo2_rd(&g->vmcb);
    uint64_t guest_rip  = amd_vmcb_rip_rd(&g->vmcb);
#else
    uint64_t fault_addr, guest_rip;
    errval_t err = invoke_dispatcher_vmread(g->dcb_cap, VMX_GPADDR_F, &fault_addr);
    err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_RIP, &guest_rip);
    assert(err_is_ok(err));
#endif
    invoke_dispatcher_dump_ptables(g->dcb_cap, 0);
    debug_printf("handling guest page fault on 0x%lx, IP 0x%lx\n",
            fault_addr, guest_rip);
    uint8_t *code = NULL;

    if (vspace_get_region(g->vspace, (void*)fault_addr) != NULL) {
        USER_PANIC("NPF vmexit on address that's mapped in EPT\n");
    }

    USER_PANIC("NPF handling NYI for Arrakis guest!\n");
    // check for a fault inside the guest physical memory region
    if (fault_addr >= g->mem_low_va && fault_addr < g->mem_high_va) {
        // allocate the missing memory
        alloc_guest_mem(g, fault_addr & ~BASE_PAGE_MASK, BASE_PAGE_SIZE);
        // do not advance the RIP, it is safe (and necessary) to
        // replay the faulting instruction
        return HANDLER_ERR_OK;
    }

    // fetch the location of the code
    r = get_instr_arr(g, &code);
    assert (r == 0);

    // virtual devices
    switch (fault_addr & ~BASE_PAGE_MASK) {
    case APIC_BASE: {
        uint64_t val;
        enum opsize size;

        assert(g->apic != NULL);
        size = decode_mov_op_size(g, code);
        if (decode_mov_is_write(g, code)) {
            val = decode_mov_src_val(g, code);
            r = apic_handle_mmio_write(g->apic, fault_addr, size, val);
            assert(r == 0);
        } else {
            r = apic_handle_mmio_read(g->apic, fault_addr, size, &val);
            assert(r == 0);
            decode_mov_dest_val(g, code, val);
        }

        // advance the rip beyond the instruction
#ifdef CONFIG_SVM
        amd_vmcb_rip_wr(&g->vmcb, guest_rip +
                        decode_mov_instr_length(g, code));
#else
        err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_GUEST_RIP, guest_rip +
                                         decode_mov_instr_length(g, code));
        assert(err_is_ok(err));
#endif
        return HANDLER_ERR_OK;
    }
    }

    printf("arrakismon: access to an unknown memory location: %lx\n", fault_addr);
    return handle_vmexit_unhandeled(g);
}

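// Dispatch table mapping VM-exit codes to their handlers. On SVM the index
// is the VMCB exit code, on VMX it is the basic exit reason; entries left
// NULL are routed to handle_vmexit_unhandeled by guest_handle_vmexit.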
typedef int (*vmexit_handler)(struct guest *g);

#ifdef CONFIG_SVM
static vmexit_handler vmexit_handlers[0x8c] = {
    [SVM_VMEXIT_CR0_READ] = handle_vmexit_cr_access,
    [SVM_VMEXIT_CR0_WRITE] = handle_vmexit_cr_access,
    [SVM_VMEXIT_CR0_SEL_WRITE] = handle_vmexit_cr_access,
    [SVM_VMEXIT_SWINT] = handle_vmexit_swint,
    [SVM_VMEXIT_IDTR_WRITE] = handle_vmexit_ldt,
    [SVM_VMEXIT_GDTR_WRITE] = handle_vmexit_ldt,
    [SVM_VMEXIT_IOIO] = handle_vmexit_ioio,
    [SVM_VMEXIT_MSR] = handle_vmexit_msr,
    [SVM_VMEXIT_CPUID] = handle_vmexit_cpuid,
    [SVM_VMEXIT_VMMCALL] = handle_vmexit_vmmcall,
    [SVM_VMEXIT_HLT] = handle_vmexit_hlt
};
#else
static vmexit_handler vmexit_handlers[0x8c] = {
    [VMX_EXIT_REASON_CPUID] = handle_vmexit_cpuid,
    [VMX_EXIT_REASON_HLT] = handle_vmexit_hlt,
    [VMX_EXIT_REASON_VMCALL] = handle_vmexit_vmmcall,
    [VMX_EXIT_REASON_CR_ACCESS] = handle_vmexit_cr_access,
    [VMX_EXIT_REASON_INOUT] = handle_vmexit_ioio,
    [VMX_EXIT_REASON_RDMSR] = handle_vmexit_msr,
    [VMX_EXIT_REASON_WRMSR] = handle_vmexit_msr,
    [VMX_EXIT_REASON_GDTR_IDTR] = handle_vmexit_ldt,
    [VMX_EXIT_REASON_EPT_FAULT] = handle_vmexit_npf,
    [VMX_EXIT_REASON_SWINT] = handle_vmexit_swint
};
#endif

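// Top-level VM-exit dispatcher: look up the handler for the exit code (on
// VMX the exit reason is read from the VMCS unless it was already saved
// while emulating), run it, and resume the guest if the handler succeeded
// and the guest is still runnable.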
void
guest_handle_vmexit (struct guest *g) {
    vmexit_handler handler;
#ifdef CONFIG_SVM
    uint64_t exitcode = amd_vmcb_exitcode_rd(&g->vmcb);
    if (exitcode == SVM_VMEXIT_NPF) {
        handler = handle_vmexit_npf;
    } else if (LIKELY(vmexit_handlers[exitcode] != NULL)) {
        handler = vmexit_handlers[exitcode];
    } else {
        handle_vmexit_unhandeled(g);
        return;
    }
#else
    if (!g->emulated_before_exit) {
        // read the exit reason into a full 64-bit value before truncating it
        // to the saved 16-bit basic exit reason
        uint64_t exit_reason;
        errval_t err = invoke_dispatcher_vmread(g->dcb_cap, VMX_EXIT_REASON,
                                                &exit_reason);
        if (err_is_fail(err)) {
            DEBUG_ERR(err, "vmread exit_reason");
        }
        assert(err_is_ok(err));
        saved_exit_reason = (uint16_t)exit_reason;
    }

    if (LIKELY(vmexit_handlers[saved_exit_reason] != NULL)) {
        handler = vmexit_handlers[saved_exit_reason];
    } else {
        handle_vmexit_unhandeled(g);
        return;
    }
#endif
    int r = handler(g);
    if (LIKELY(r == HANDLER_ERR_OK)) {
        if (g->runnable) {
            guest_make_runnable(g, true);
        }
    }
}