1
2/**
3 * \file
4 */
5
6/*
7 * Copyright (c) 2009, 2010, ETH Zurich.
8 * All rights reserved.
9 *
10 * This file is distributed under the terms in the attached LICENSE file.
11 * If you do not find this file, copies can be found by writing to:
12 * ETH Zurich D-INFK, Universitaetstrasse 6, CH-8092 Zurich. Attn: Systems Group.
13 */
14
15#include <stdlib.h>
16#include <string.h>
17#include "vmkitmon.h"
18#include <barrelfish/lmp_endpoints.h>
19#include "x86.h"
20#ifdef CONFIG_SVM
21#include "svm.h"
22#endif
23#include "realmode.h"
24#include "hdd.h"
25#include "console.h"
26#include "pc16550d.h"
27#include "apic.h"
28#include "lpc.h"
29#include "pci.h"
30#include "pci_host.h"
31#include "pci_devices.h"
32#include "pci_ethernet.h"
33#include <driverkit/hwmodel.h>
34#include <driverkit/iommu.h>
35#include <skb/skb.h>
36
37#define VMCB_SIZE       0x1000      // 4KB
38
39#ifdef CONFIG_SVM
40#define IOPM_SIZE       0x3000      // 12KB
41#define MSRPM_SIZE      0x2000      // 8KB
42#else
43#define IOBMP_A_SIZE    0x1000      // 4KB
44#define IOBMP_B_SIZE    0x1000      // 4KB
45#define MSRPM_SIZE      0x1000      // 4KB
46#endif
47
48#define RM_MEM_SIZE     (0x100000 + BASE_PAGE_SIZE)    // 1MB + A20 gate space
49
50#define APIC_BASE       0xfee00000
51
52#define SERIAL_DRIVER   "serial0.raw"
53
54#ifndef CONFIG_SVM
55extern uint16_t saved_exit_reason;
56extern uint64_t saved_exit_qual, saved_rip;
57
58// List of MSRs that are saved on VM-exit and loaded on VM-entry.
59static uint32_t msr_list[VMX_MSR_COUNT] =
60    {X86_MSR_KERNEL_GS_BASE, X86_MSR_STAR, X86_MSR_LSTAR, X86_MSR_CSTAR, X86_MSR_SFMASK};
61
// Saved priority of the most recently asserted irq.
63uint8_t interrupt_priority = 0;
64#endif
65
66#ifndef CONFIG_SVM
67static inline int vmx_guest_msr_index(uint32_t msr_index)
68{
69    for (int i = 0; i < VMX_MSR_COUNT; i++) {
70        if (msr_list[i] == msr_index) {
71            return i;
72	}
73    }
74    return -1;
75}
76
77static void initialize_guest_msr_area(struct guest *g)
78{
79    struct msr_entry *guest_msr_area = (struct msr_entry *)g->msr_area_va;
80
81    // The values of the MSRs in the guest MSR area are all set to 0.
82    for (int i = 0; i < VMX_MSR_COUNT; i++) {
83        guest_msr_area[i].index = msr_list[i];
84	guest_msr_area[i].val = 0x0;
85    }
86
87    errval_t err = invoke_dispatcher_vmwrite(g->dcb_cap, VMX_EXIT_MSR_STORE_F, g->msr_area_pa);
88    err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_EXIT_MSR_STORE_CNT, VMX_MSR_COUNT);
89    err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_ENTRY_MSR_LOAD_F, g->msr_area_pa);
90    err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_ENTRY_MSR_LOAD_CNT, VMX_MSR_COUNT);
91    assert(err_is_ok(err));
92}
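/*
 * For reference, the guest MSR area filled above is laid out as an array of
 * 16-byte entries in the format the VMX MSR store/load machinery expects;
 * struct msr_entry (defined elsewhere) presumably mirrors it:
 *
 *     struct msr_entry {
 *         uint32_t index;     // MSR number, e.g. X86_MSR_STAR
 *         uint32_t reserved;  // must be zero
 *         uint64_t val;       // stored on VM-exit, loaded on VM-entry
 *     };
 *
 * so the backing frame must cover at least VMX_MSR_COUNT * 16 bytes.
 */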
93#endif
94
95lvaddr_t guest_offset = 0;
96static struct guest __guest;
97static struct guest *__guestp = NULL;
98
99#ifdef CONFIG_SVM
100/// stores the last used guest ASID
101static uint32_t last_guest_asid = 0;
102#endif
103
// FIXME: this is somewhat broken by design... we should emit proper exceptions
//        to the guest as opposed to just halting the VM
106#define guest_assert(g, e) \
107    ((e) ? (void)0 : (handle_vmexit_unhandeled(g), assert(e)))
108
109static errval_t
110guest_slot_alloc(struct guest *g, struct capref *ret)
111{
112    return g->slot_alloc.a.alloc(&g->slot_alloc.a, ret);
113}
114
115errval_t guest_vspace_map_wrapper(struct vspace *vspace, lvaddr_t vaddr,
116                                         struct capref frame,  size_t size)
117{
118    errval_t err;
119    struct vregion *vregion = NULL;
120    struct memobj_one_frame *memobj = NULL;
121
122    // Allocate space
123    vregion = malloc(sizeof(struct vregion));
124    if (!vregion) {
125        err = LIB_ERR_MALLOC_FAIL;
126        goto error;
127    }
128    memobj = malloc(sizeof(struct memobj_one_frame));
129    if (!memobj) {
130        err = LIB_ERR_MALLOC_FAIL;
131        goto error;
132    }
133
134    // Create the objects
135    err = memobj_create_one_frame(memobj, size, 0);
136    if (err_is_fail(err)) {
137        err = err_push(err, LIB_ERR_MEMOBJ_CREATE_ANON);
138        goto error;
139    }
140    err = memobj->m.f.fill(&memobj->m, 0, frame, size);
141    if (err_is_fail(err)) {
142        err = err_push(err, LIB_ERR_MEMOBJ_FILL);
143        goto error;
144    }
145    err = vregion_map_fixed(vregion, vspace, &memobj->m, 0, size, vaddr,
146                            VREGION_FLAGS_READ | VREGION_FLAGS_WRITE | VREGION_FLAGS_EXECUTE);
147    if (err_is_fail(err)) {
148        err = LIB_ERR_VSPACE_MAP;
149        goto error;
150    }
151    err = memobj->m.f.pagefault(&memobj->m, vregion, 0, 0);
152    if (err_is_fail(err)) {
153        err = err_push(err, LIB_ERR_MEMOBJ_PAGEFAULT_HANDLER);
154        goto error;
155    }
156
157    return SYS_ERR_OK;
158
159 error: // XXX: proper cleanup
160    if (vregion) {
161        free(vregion);
162    }
163    if (memobj) {
164        free(memobj);
165    }
166    return err;
167}
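/*
 * Usage sketch (hypothetical caller; error handling and sizes elided): map a
 * frame at guest-physical address 0 of a guest's address space.
 *
 *     struct capref frame;
 *     // ... frame_create(frame, size, NULL) ...
 *     err = guest_vspace_map_wrapper(&g->vspace, 0x0, frame, size);
 *
 * alloc_guest_mem() below is the caller actually used in this file.
 */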
168
169#ifdef DISABLE_MODEL
170#define GUEST_VSPACE_SIZE 1073741824UL // 1GB
171#else
#define GUEST_VSPACE_SIZE (1ul<<32) // 4GB
173#endif
174static errval_t vspace_map_wrapper(lvaddr_t vaddr, struct capref frame,
175                                   size_t size)
176{
177    errval_t err;
178    static struct memobj_anon *memobj = NULL;
179    static struct vregion *vregion = NULL;
180    static bool initialized = false;
181
182    if (!initialized) {
183        // Allocate space
184        memobj = malloc(sizeof(struct memobj_anon));
185        if (!memobj) {
186            return LIB_ERR_MALLOC_FAIL;
187        }
188        vregion = malloc(sizeof(struct vregion));
189        if (!vregion) {
190            return LIB_ERR_MALLOC_FAIL;
191        }
192
193        // Create a memobj and vregion
194        err = memobj_create_anon(memobj, GUEST_VSPACE_SIZE, 0);
195        if (err_is_fail(err)) {
196            return err_push(err, LIB_ERR_MEMOBJ_CREATE_ANON);
197        }
198        err = vregion_map(vregion, get_current_vspace(), &memobj->m, 0,
199                          GUEST_VSPACE_SIZE, VREGION_FLAGS_READ_WRITE);
200        if (err_is_fail(err)) {
201            return err_push(err, LIB_ERR_VREGION_MAP);
202        }
203
204        guest_offset = vregion_get_base_addr(vregion);
205        initialized = true;
206    }
207
208    // Create mapping
209    err = memobj->m.f.fill(&memobj->m, vaddr, frame, size);
210    if (err_is_fail(err)) {
211        return err_push(err, LIB_ERR_MEMOBJ_FILL);
212    }
213    err = memobj->m.f.pagefault(&memobj->m, vregion, vaddr, 0);
214    if (err_is_fail(err)) {
215        return err_push(err, LIB_ERR_MEMOBJ_PAGEFAULT_HANDLER);
216    }
217
218    return SYS_ERR_OK;
219}
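/*
 * The wrapper above lazily creates one contiguous GUEST_VSPACE_SIZE window in
 * the monitor's own vspace, starting at guest_offset. guest_to_host() is
 * therefore assumed to be a simple linear translation:
 *
 *     host_va = guest_offset + guest_paddr
 *
 * e.g. the BIOS data area word written at guest physical 0x400 further down
 * in this file lives at monitor address guest_offset + 0x400.
 */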
220// allocates some bytes of memory for the guest starting at a specific addr
221// also performs the mapping into the vspace of the monitor
222errval_t
223alloc_guest_mem(struct guest *g, lvaddr_t guest_paddr, size_t bytes)
224{
225    errval_t err;
226
227    // only allow multiple of page sizes to be allocated
228    assert(bytes > 0 && (bytes & BASE_PAGE_MASK) == 0);
229    // do not allow allocation outside of the guests physical memory
230    assert(guest_paddr + bytes <= g->mem_high_va);
231
232    // Allocate frame
233    struct capref cap;
234
235#ifdef DISABLE_MODEL
236    int32_t node_id_self = driverkit_hwmodel_get_my_node_id();
237    int32_t node_id_ram = driverkit_hwmodel_lookup_dram_node_id();
238    int32_t nodes_data[] = {node_id_self, 0};
239
240    err = driverkit_hwmodel_frame_alloc(&cap, bytes, node_id_ram, nodes_data);
241    if (err_is_fail(err)) {
242        return err;
243    }
244
#else
    // allocate a capability slot for the frame first; the LIB_ERR_SLOT_ALLOC
    // push below otherwise tests an uninitialized err
    err = guest_slot_alloc(g, &cap);
    if (err_is_fail(err)) {
        return err_push(err, LIB_ERR_SLOT_ALLOC);
    }
249    err = frame_create(cap, bytes, NULL);
250    if (err_is_fail(err)) {
251        return err_push(err, LIB_ERR_FRAME_CREATE);
252    }
253#endif
254
255    // Map into the guest vspace
256    err = guest_vspace_map_wrapper(&g->vspace, guest_paddr, cap, bytes);
257    if (err_is_fail(err)) {
258        return err;
259    }
260
261    // Create a copy of the capability to map in our vspace
262    struct capref host_cap;
263    err = slot_alloc(&host_cap);
264    if (err_is_fail(err)) {
265        return err;
266    }
267    err = cap_copy(host_cap, cap);
268    if (err_is_fail(err)) {
269        return err;
270    }
271
272    // Map into my vspace
273    err = vspace_map_wrapper(guest_to_host(guest_paddr), host_cap, bytes);
274    if (err_is_fail(err)) {
275        return err;
276    }
277
278	struct frame_identity frameid = { .base = 0, .bytes = 0 };
279	errval_t r = frame_identify(cap, &frameid);
280	assert(err_is_ok(r));
281	VMKIT_PCI_DEBUG("alloc_guest_mem: frameid.base: 0x%lx, frameid.bytes: %zd, "
282                "g->mem_low_va: 0x%lx, g->mem_high_va: 0x%lx\n",
283                frameid.base, frameid.bytes, g->mem_low_va, g->mem_high_va);
284
285    return SYS_ERR_OK;
286}
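/*
 * Example (sketch): instead of the single large allocation done in
 * spawn_guest_domain(), guest memory could also be populated piecemeal, e.g.
 *
 *     err = alloc_guest_mem(g, 0x0, RM_MEM_SIZE);   // low 1MB + A20 page
 *
 * as long as the range is page-aligned and stays below g->mem_high_va.
 */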
287
288static void
289initialize_iopm (struct guest *self) {
290    // intercept all IO port accesses (for now)
291#ifdef CONFIG_SVM
292    memset((void*)self->iopm_va, 0xFF, IOPM_SIZE);
293#else
294    memset((void*)self->iobmp_a_va, 0xFF, IOBMP_A_SIZE);
295    memset((void*)self->iobmp_b_va, 0xFF, IOBMP_B_SIZE);
296#endif
297}
298
// access_mode: 0 allow all accesses, 1 intercept reads, 2 intercept writes,
//              3 intercept all accesses
300static inline void
301set_msr_access (struct guest *g, uint32_t msr, int access_mode)
302{
303    assert(access_mode >= 0 && access_mode <= 3);
304
    // a 2KB region represents the access bits of 8K MSRs; each MSR therefore
    // takes two bits (one for rdmsr and one for wrmsr)
307    uintptr_t byte_offset = (msr & 0xffff) / 4;
308    int bit_offset = ((msr & 0xffff) % 4) * 2;
309
310    if (msr < 0x2000) {
311        // do nothing
312    } else if (msr >= 0xc0000000 && msr < 0xc0002000) {
313        byte_offset += 0x800;
314    } else if (msr >= 0xc0010000 && msr < 0xc0012000) {
315        byte_offset += 0x1000;
316    } else {
317        assert(!"not reached");
318    }
319
320    assert(byte_offset < MSRPM_SIZE);
321
322    // read the byte holding the relevant bits
323    uint8_t val = *(uint8_t *)(g->msrpm_va + byte_offset);
324    // set the access params according to the arguments
325    val = (val & ~(0x3 << bit_offset)) | (access_mode << bit_offset);
326    // store the modified value back in the map
327    *(uint8_t *)(g->msrpm_va + byte_offset) = val;
328
329    //printf("MSR: msr %x, byte_offset %lx, bit_offset %x, val %x\n", msr, byte_offset, bit_offset, val);
330}
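/*
 * Worked example for the offset computation above, assuming the usual SVM
 * MSR permission map layout: for MSR 0xc0000080 (EFER)
 *
 *     msr & 0xffff = 0x80
 *     byte_offset  = 0x80 / 4 + 0x800 = 0x820   (0xc0000000 range)
 *     bit_offset   = (0x80 % 4) * 2   = 0
 *
 * so bit 0 of byte 0x820 controls rdmsr interception for EFER and bit 1
 * controls wrmsr interception.
 */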
331
332static void
333initialize_msrpm (struct guest *g) {
334    // intercept all MSR accesses (for now)
335    memset((void*)g->msrpm_va, 0xff, MSRPM_SIZE);
336#if 0
    // allow access to the performance counter and event MSRs
338    set_msr_access (g, 0xc0010000, 0);
339    set_msr_access (g, 0xc0010001, 0);
340    set_msr_access (g, 0xc0010002, 0);
341    set_msr_access (g, 0xc0010003, 0);
342    set_msr_access (g, 0xc0010004, 0);
343    set_msr_access (g, 0xc0010005, 0);
344    set_msr_access (g, 0xc0010006, 0);
345    set_msr_access (g, 0xc0010007, 0);
346#endif
347}
348
349#define INIT_DATA_SEGREG(vmcb,x)                 \
350do {                                             \
351    amd_vmcb_seg_attrib_t __sa = {               \
352        .segtype = 3,                            \
353        .p = 1,                                  \
354        .s = 1                                   \
355    };                                           \
356    amd_vmcb_##x## _attrib_wr((vmcb), __sa);     \
357    amd_vmcb_##x## _selector_wr((vmcb), 0x0);    \
358    amd_vmcb_##x## _base_wr((vmcb), 0x0);        \
359    amd_vmcb_##x## _limit_wr((vmcb), 0xffff);    \
360} while (0)
361
362#define INIT_CODE_SEGREG(vmcb,x)                 \
363do {                                             \
364    amd_vmcb_seg_attrib_t __sa = {               \
365        .segtype = 11,                           \
366        .p = 1,                                  \
367        .s = 1                                   \
368    };                                           \
369    amd_vmcb_##x## _attrib_wr((vmcb), __sa);     \
370    amd_vmcb_##x## _selector_wr((vmcb), 0xf000); \
371    amd_vmcb_##x## _base_wr((vmcb), 0xffff0000); \
372    amd_vmcb_##x## _limit_wr((vmcb), 0xffff);    \
373} while (0)
374
375#define INIT_SYS_SEGREG(vmcb,x)                  \
376do {                                             \
377    amd_vmcb_seg_attrib_t __sa = {               \
378        .segtype = 2,                            \
379        .p = 1                                   \
380    };                                           \
381    amd_vmcb_##x## _attrib_wr((vmcb), __sa);     \
382    amd_vmcb_##x## _selector_wr((vmcb), 0x0);    \
383    amd_vmcb_##x## _base_wr((vmcb), 0x0);        \
384    amd_vmcb_##x## _limit_wr((vmcb), 0xffff);    \
385} while (0)
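/*
 * For illustration, INIT_CODE_SEGREG(&self->vmcb, cs) expands to roughly:
 *
 *     amd_vmcb_seg_attrib_t __sa = { .segtype = 11, .p = 1, .s = 1 };
 *     amd_vmcb_cs_attrib_wr(&self->vmcb, __sa);
 *     amd_vmcb_cs_selector_wr(&self->vmcb, 0xf000);
 *     amd_vmcb_cs_base_wr(&self->vmcb, 0xffff0000);
 *     amd_vmcb_cs_limit_wr(&self->vmcb, 0xffff);
 *
 * i.e. the architectural reset state: CS.base 0xffff0000 combined with the
 * RIP of 0xfff0 set below points at the reset vector 0xfffffff0.
 */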
386
/* This method initializes a new VMCB memory region and sets the initial
 * machine state as defined by the AMD64 architecture specification */
389#ifdef CONFIG_SVM
390static void
391initialize_vmcb (struct guest *self) {
392    amd_vmcb_initialize(&self->vmcb, (mackerel_addr_t)self->vmcb_va);
393
394    // 1. Initialize intercepts
395
396    /* For now we intercept just everything */
397
398    amd_vmcb_cr_access_wr_raw(&self->vmcb, ~0u);
399    amd_vmcb_cr_access_rdcr2_wrf(&self->vmcb, 0);
400    amd_vmcb_cr_access_wrcr2_wrf(&self->vmcb, 0);
401    amd_vmcb_cr_access_rdcr4_wrf(&self->vmcb, 0);
402    amd_vmcb_cr_access_wrcr4_wrf(&self->vmcb, 0);
403
404    // FIXME: ignoring DR accesses may be insecure
405    //amd_vmcb_dr_access_wr_raw(&self->vmcb, ~0u);
406    amd_vmcb_exceptions_wr_raw(&self->vmcb, ~0u);
407    amd_vmcb_exceptions_vector7_wrf(&self->vmcb, 0);
408    amd_vmcb_exceptions_vector14_wrf(&self->vmcb, 0);
409
410    amd_vmcb_intercepts_wr_raw(&self->vmcb, 0x1fffffffffff);
411    amd_vmcb_intercepts_pushf_wrf(&self->vmcb, 0);
412    amd_vmcb_intercepts_popf_wrf(&self->vmcb, 0);
413    amd_vmcb_intercepts_invlpg_wrf(&self->vmcb, 0);
414    amd_vmcb_intercepts_rdtsc_wrf(&self->vmcb, 0);
415    amd_vmcb_intercepts_rdtscp_wrf(&self->vmcb, 0);
416    amd_vmcb_intercepts_iret_wrf(&self->vmcb, 0);
417    amd_vmcb_intercepts_wbinvd_wrf(&self->vmcb, 0);
418    amd_vmcb_intercepts_pause_wrf(&self->vmcb, 0);
419    amd_vmcb_intercepts_vintr_wrf(&self->vmcb, 0);
420
421    // 2. Setup some config fields
422
    // physical addresses of the IOPM and the MSRPM
424    amd_vmcb_iopm_base_pa_wr(&self->vmcb, self->iopm_pa);
425    amd_vmcb_msrpm_base_pa_wr(&self->vmcb, self->msrpm_pa);
426    // assign guest ASID
427    // FIXME: use real asid allocator. BF does not know about tagged TLBs atm
428    amd_vmcb_tlb_guest_asid_wrf(&self->vmcb, ++last_guest_asid);
429    // enable virtual intr masking
430    amd_vmcb_vintr_vintr_masking_wrf(&self->vmcb, 1);
431    // enable nested paging
432    amd_vmcb_np_enable_wrf(&self->vmcb, 1);
433
    /* 3. Guest state initialization
     * according to Intel's Manual 3A, Table 9-1. */
436
    // Bit 1 of RFLAGS is reserved and must be set; we also set the ID flag
    // (bit 21) to indicate that the CPUID instruction is supported.
439    amd_vmcb_rflags_wr_raw(&self->vmcb, 0x00200002);
440    amd_vmcb_rip_wr(&self->vmcb, 0x0000fff0);
441    amd_vmcb_cr0_wr_raw(&self->vmcb, 0x60000010);
442
443    INIT_CODE_SEGREG(&self->vmcb, cs);
444    INIT_DATA_SEGREG(&self->vmcb, ss);
445    INIT_DATA_SEGREG(&self->vmcb, ds);
446    INIT_DATA_SEGREG(&self->vmcb, es);
447    INIT_DATA_SEGREG(&self->vmcb, fs);
448    INIT_DATA_SEGREG(&self->vmcb, gs);
449
450    INIT_SYS_SEGREG(&self->vmcb, gdtr);
451    INIT_SYS_SEGREG(&self->vmcb, idtr);
452    INIT_SYS_SEGREG(&self->vmcb, ldtr);
453    INIT_SYS_SEGREG(&self->vmcb, tr);
454
455    amd_vmcb_dr6_wr(&self->vmcb, 0xffff0ff0);
456    amd_vmcb_dr7_wr(&self->vmcb, 0x00000400);
457
    // taken from the Linux SVM source
459    amd_vmcb_gpat_wr(&self->vmcb, 0x0007040600070406ul);
460
461    // svm requires guest EFER.SVME to be set
462    amd_vmcb_efer_svme_wrf(&self->vmcb, 1);
463}
464#endif
465
466static void
467idc_handler(void *arg)
468{
469    struct guest *g = arg;
470    errval_t err;
471
472    // consume message
473    struct lmp_recv_buf buf = { .buflen = 0 };
474    err = lmp_endpoint_recv(g->monitor_ep, &buf, NULL);
475    assert(err_is_ok(err));
476
477    // run real handler
478    guest_handle_vmexit(g);
479
480    // re-register
481    struct event_closure cl = {
482        .handler = idc_handler,
483        .arg = arg,
484    };
485    err = lmp_endpoint_register(g->monitor_ep, get_default_waitset(), cl);
486    assert(err_is_ok(err));
487}
488
489/* This method duplicates some code from spawndomain since we need to spawn very
490 * special domains */
491static void
492spawn_guest_domain (struct guest *self) {
493    errval_t err;
494
495    // create the guest virtual address space
496    struct capref vnode_cap;
497    err = guest_slot_alloc(self, &vnode_cap);
498    assert(err_is_ok(err));
499    err = vnode_create(vnode_cap, ObjType_VNode_x86_64_pml4);
500    assert(err_is_ok(err));
501
502    struct pmap *pmap = malloc(sizeof(struct pmap_x86));
503    assert(pmap);
504    err = pmap_x86_64_init(pmap, &self->vspace, vnode_cap, NULL);
505    assert(err_is_ok(err));
506    err = vspace_init(&self->vspace, pmap);
507    assert(err_is_ok(err));
508
509    // create DCB
510    err = guest_slot_alloc(self, &self->dcb_cap);
511    assert(err_is_ok(err));
512    err = dispatcher_create(self->dcb_cap);
513    assert(err_is_ok(err));
514
515    // create end point
516    struct capref ep_cap;
517
518    // use minimum-sized endpoint, because we don't need to buffer >1 vmexit
519    err = endpoint_create(LMP_RECV_LENGTH, &ep_cap, &self->monitor_ep);
520    assert(err_is_ok(err));
521
522    // register to receive on this endpoint
523    struct event_closure cl = {
524        .handler = idc_handler,
525        .arg = self,
526    };
527    err = lmp_endpoint_register(self->monitor_ep, get_default_waitset(), cl);
528    assert(err_is_ok(err));
529
530    // setup the DCB
531    err = invoke_dispatcher_setup_guest(self->dcb_cap, ep_cap, vnode_cap,
532                                        self->vmcb_cap, self->ctrl_cap);
533    assert(err_is_ok(err));
534
535#ifndef CONFIG_SVM
536    initialize_guest_msr_area(self);
537
538    err = 0;
539    err += invoke_dispatcher_vmwrite(self->dcb_cap, VMX_IOBMP_A_F, self->iobmp_a_pa);
540    err += invoke_dispatcher_vmwrite(self->dcb_cap, VMX_IOBMP_B_F, self->iobmp_b_pa);
541    err += invoke_dispatcher_vmwrite(self->dcb_cap, VMX_MSRBMP_F, self->msrpm_pa);
542    assert(err_is_ok(err));
543#endif
544    // set up the guests physical address space
545    self->mem_low_va = 0;
546    // FIXME: Hardcoded guest memory size
547    // allocate the memory used for real mode
    // This is not strictly necessary since one could also catch the page faults.
    // If we allocate the whole memory at once we use fewer caps and reduce
    // the risk of running out of CSpace.
551#ifdef DISABLE_MODEL
552    self->mem_high_va = 0x80000000;
553    err = alloc_guest_mem(self, 0x0, 0x80000000);
554#else
555    self->mem_high_va = GUEST_VSPACE_SIZE;
556    err = alloc_guest_mem(self, 0x0, GUEST_VSPACE_SIZE);
557#endif
558    assert_err(err, "alloc_guest_mem");
559}
560
561static void
562install_grub_stage2 (struct guest *g, void *img, size_t img_size)
563{
564    assert(img != NULL);
565
566    /* the grub image goes to 0x8000 according to
567     * http://www.gnu.org/software/grub/manual/html_node/Memory-map.html */
568    memcpy((void *)(guest_to_host(g->mem_low_va + 0x8000)), img, img_size);
569    // according to grub stage2 source its entry point is at 0x8200
570#ifdef CONFIG_SVM
571    amd_vmcb_rip_wr(&g->vmcb, 0x8200);
572    // switch to the first segment
573    amd_vmcb_cs_selector_wr(&g->vmcb, 0x0);
574    amd_vmcb_cs_base_wr(&g->vmcb, 0x0);
575    amd_vmcb_cs_limit_wr(&g->vmcb, 0xffff);
576#else
577    errval_t err = invoke_dispatcher_vmwrite(g->dcb_cap, VMX_GUEST_RIP, 0x8200);
578    err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_GUEST_CS_SEL, 0x0);
579    err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_GUEST_CS_BASE, 0x0);
580    err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_GUEST_CS_LIM, 0xffff);
581    assert(err_is_ok(err));
582#endif
583
584}
585
586#if 0
587static void
588install_debug_app (struct guest *g)
589{
590    //static uint8_t app[] = { 0xcd, 0x20 };
591    static uint8_t app[] = { 0xcd, 0x20, 0x90, 0x90, 0x90, 0x90, 0x90 };
592    memcpy((void *)g->rm_mem_va, app, sizeof(app));
593    amd_vmcb_rip_wr(&g->vmcb, 0x0);
    // disable nested paging in real mode
595    amd_vmcb_np_enable_wrf(&g->vmcb, 0);
596    // enable paged real mode
597    //amd_vmcb_cr0_pg_wrf(&g->vmcb, 0x1);
598    //g->save_area->cr0 |= X86_CR0_PE_MASK;
599    amd_vmcb_rsp_wr(&g->vmcb, 0x1000);
600    amd_vmcb_cs_selector_wr(&g->vmcb, 0x0);
601    amd_vmcb_cs_base_wr(&g->vmcb, 0x0);
602    amd_vmcb_cs_limit_wr(&g->vmcb, 0xffff);
603    //g->save_area->cs.selector = 0x1000;
604    //g->save_area->cs.base = 0x10000;
605    //g->save_area->cs.base = 0x1ffff;
606}
607#endif
608
609static bool
610virq_pending (void *ud, uint8_t *irq, uint8_t *irq_prio)
611{
612    assert(ud != NULL);
613
614    struct guest *g = ud;
615#ifdef CONFIG_SVM
616    if (amd_vmcb_vintr_rd(&g->vmcb).virq == 1) {
617#else
618    uint64_t info;
619    errval_t err = invoke_dispatcher_vmread(g->dcb_cap, VMX_ENTRY_INTR_INFO, &info);
620    assert(err_is_ok(err));
621    if (!!(info & (1UL << 31))) {
622#endif
623        if (irq != NULL) {
624#ifdef CONFIG_SVM
625            *irq = amd_vmcb_vintr_rd(&g->vmcb).vintr_vector;
626#else
627	    *irq = info & 0xff;
628#endif
629        }
630        if (irq_prio != NULL) {
631#ifdef CONFIG_SVM
632            *irq_prio = amd_vmcb_vintr_rd(&g->vmcb).vintr_prio;
633#else
634	    *irq_prio = interrupt_priority;
635#endif
636        }
637        return true;
638    } else {
639        return false;
640    }
641}
642
643#ifndef CONFIG_SVM
644static bool
645virq_accepting (void *ud)
646{
647    assert(ud != NULL);
648
649    struct guest *g = ud;
650
651    uint64_t guest_rflags;
652    errval_t err = invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_RFLAGS, &guest_rflags);
653    assert(err_is_ok(err));
654    return (guest_rflags & (1UL << 9));
655}
656#endif
657
658static void
659virq_handler (void *ud, uint8_t irq, uint8_t irq_prio)
660{
661    assert(ud != NULL);
662
663    struct guest *g = ud;
664
665    // tell the hw extensions that there is a virtual IRQ pending
666#ifdef CONFIG_SVM
667    amd_vmcb_vintr_virq_wrf(&g->vmcb, 1);
668    amd_vmcb_vintr_vintr_prio_wrf(&g->vmcb, irq_prio);
669    amd_vmcb_vintr_vintr_vector_wrf(&g->vmcb, irq);
670    amd_vmcb_vintr_v_ign_tpr_wrf(&g->vmcb, 1);
671#else
672    uint64_t guest_rflags;
673    errval_t err = invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_RFLAGS, &guest_rflags);
674    assert(guest_rflags & (1UL << 9));
675
676    uint64_t info = (0 << 8 /*HWINTR*/) | (1UL << 31 /*INTR VALID*/) | irq;
677    err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_ENTRY_INTR_INFO, info);
678
679    err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_GUEST_ACTIV_STATE, 0x0);
680    err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_GUEST_INTR_STATE, 0x0);
681    assert(err_is_ok(err));
682
683    interrupt_priority = irq_prio;
684#endif
685    // if the guest is currently waiting then we have to restart it to make
686    // forward progress
687    if (!g->runnable) {
688        g->runnable = true;
689        guest_make_runnable(g, true);
690    }
691}
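/*
 * Sketch of the VM-entry interruption-information format assumed by the
 * VMX (#else) branch above:
 *
 *     bits  7:0   vector            (irq)
 *     bits 10:8   interruption type (0 = external/hardware interrupt)
 *     bit  31     valid
 *
 * so info = (1UL << 31) | irq injects irq as an external interrupt on the
 * next VM entry, provided the guest's RFLAGS.IF is set (asserted above).
 */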
692
693static void
694guest_setup (struct guest *g)
695{
696    errval_t err;
697
698    // initialize the guests slot_allocator
699    err = two_level_slot_alloc_init(&g->slot_alloc);
700    assert_err(err, "two_level_slot_alloc_init");
701
702    struct frame_identity fi;
703
704    // allocate memory for the vmcb
705    err = guest_slot_alloc(g, &g->vmcb_cap);
    assert_err(err, "guest_slot_alloc");
707    err = frame_create(g->vmcb_cap, VMCB_SIZE, NULL);
708    assert_err(err, "frame_create");
709    err = frame_identify(g->vmcb_cap, &fi);
710    assert_err(err, "frame_identify");
711    g->vmcb_pa = fi.base;
712    err = vspace_map_one_frame_attr((void**)&g->vmcb_va, VMCB_SIZE, g->vmcb_cap,
713                                    VREGION_FLAGS_READ_WRITE_NOCACHE,
714                                    NULL, NULL);
715    if (err_is_fail(err)) {
716        DEBUG_ERR(err, "vspace_map_one_frame_attr failed");
717    }
718
719    // guest control
720    err = frame_alloc(&g->ctrl_cap, sizeof(struct guest_control), NULL);
721    assert_err(err, "frame_alloc");
722    size_t size = ROUND_UP(sizeof(struct guest_control), BASE_PAGE_SIZE);
723    err = vspace_map_one_frame_attr((void**)&g->ctrl, size, g->ctrl_cap,
724                                    VREGION_FLAGS_READ_WRITE_NOCACHE,
725                                    NULL, NULL);
726    if (err_is_fail(err)) {
727        DEBUG_ERR(err, "vspace_map_one_frame_attr failed");
728    }
729    g->ctrl->num_vm_exits_with_monitor_invocation = 0;
730    g->ctrl->num_vm_exits_without_monitor_invocation = 0;
731#ifdef CONFIG_SVM
732    // allocate memory for the iopm
733    err = frame_alloc(&g->iopm_cap, IOPM_SIZE, NULL);
734    assert_err(err, "frame_alloc");
735    err = frame_identify(g->iopm_cap, &fi);
736    assert_err(err, "frame_identify");
737    g->iopm_pa = fi.base;
738    err = vspace_map_one_frame_attr((void**)&g->iopm_va, IOPM_SIZE, g->iopm_cap,
739                                    VREGION_FLAGS_READ_WRITE_NOCACHE,
740                                    NULL, NULL);
741    if (err_is_fail(err)) {
742        DEBUG_ERR(err, "vspace_map_one_frame_attr failed");
743    }
744#else
745    // allocate memory for I/O bitmap A
746    err = frame_alloc(&g->iobmp_a_cap, IOBMP_A_SIZE, NULL);
747    assert_err(err, "frame_alloc");
748    err = frame_identify(g->iobmp_a_cap, &fi);
749    assert_err(err, "frame_identify");
750    g->iobmp_a_pa = fi.base;
751    err = vspace_map_one_frame_attr((void**)&g->iobmp_a_va, IOBMP_A_SIZE, g->iobmp_a_cap,
752                                    VREGION_FLAGS_READ_WRITE_NOCACHE,
753                                    NULL, NULL);
754    if (err_is_fail(err)) {
755        DEBUG_ERR(err, "vspace_map_one_frame_attr failed");
756    }
757
758    // allocate memory for I/O bitmap B
759    err = frame_alloc(&g->iobmp_b_cap, IOBMP_B_SIZE, NULL);
760    assert_err(err, "frame_alloc");
761    err = frame_identify(g->iobmp_b_cap, &fi);
762    assert_err(err, "frame_identify");
763    g->iobmp_b_pa = fi.base;
764    err = vspace_map_one_frame_attr((void**)&g->iobmp_b_va, IOBMP_B_SIZE, g->iobmp_b_cap,
765                                    VREGION_FLAGS_READ_WRITE_NOCACHE,
766                                    NULL, NULL);
767    if (err_is_fail(err)) {
768        DEBUG_ERR(err, "vspace_map_one_frame_attr failed");
769    }
770
771    // allocate memory for the guest MSR store/load area
772    err = frame_alloc(&g->msr_area_cap, VMX_MSR_AREA_SIZE, NULL);
773    assert_err(err, "frame_alloc");
774    err = frame_identify(g->msr_area_cap, &fi);
775    assert_err(err, "frame_identify");
776    g->msr_area_pa = fi.base;
777    err = vspace_map_one_frame_attr((void**)&g->msr_area_va, VMX_MSR_AREA_SIZE,
778                                    g->msr_area_cap,
779                                    VREGION_FLAGS_READ_WRITE_NOCACHE,
780                                    NULL, NULL);
781    if (err_is_fail(err)) {
782        DEBUG_ERR(err, "vspace_map_one_frame_attr failed");
783    }
784#endif
785    // allocate memory for the msrpm
786    err = frame_alloc(&g->msrpm_cap, MSRPM_SIZE, NULL);
787    assert_err(err, "frame_alloc");
788    err = frame_identify(g->msrpm_cap, &fi);
789    assert_err(err, "frame_identify");
790    g->msrpm_pa = fi.base;
791    err = vspace_map_one_frame_attr((void**)&g->msrpm_va, MSRPM_SIZE,
792                                    g->msrpm_cap,
793                                    VREGION_FLAGS_READ_WRITE_NOCACHE,
794                                    NULL, NULL);
795    if (err_is_fail(err)) {
796        DEBUG_ERR(err, "vspace_map_one_frame_attr failed");
797    }
798
799    // initialize the allocated structures
800    initialize_iopm(g);
801    initialize_msrpm(g);
802#ifdef CONFIG_SVM
803    initialize_vmcb(g);
804#endif
805    // spawn the guest domain
806    spawn_guest_domain(g);
807    assert (grub_image != NULL);
808    install_grub_stage2(g, grub_image, grub_image_size);
809    //install_debug_app(g);
810
811    // add virtual hardware
812    g->apic = apic_new(APIC_BASE);
813    g->lpc = lpc_new(virq_handler, virq_pending,
814#ifndef CONFIG_SVM
815		     virq_accepting,
816#endif
817		     g, g->apic);
818    if (hdd0_image != NULL) {
819        g->hdds[0] = hdd_new_from_memory(hdd0_image, hdd0_image_size);
820        g->hdd_count++;
821    }
822    g->console = console_new();
823    g->serial_ports[0] = pc16550d_new(0x3f8, 4, g->lpc);
824
825    // FIXME: Which virtual uart port is connected to which host port
826    //        should be adjustable from the command line or a configuration
827    //        file.
828    pc16550d_attach_to_host_uart(g->serial_ports[0], SERIAL_DRIVER);
829    g->serial_ports[1] = pc16550d_new(0x2f8, 3, g->lpc);
830    g->serial_ports[2] = pc16550d_new(0x3e8, 4, g->lpc);
831    g->serial_ports[3] = pc16550d_new(0x2e8, 3, g->lpc);
832    g->serial_port_count = 4;
833
834    g->pci = pci_new();
835    init_host_devices(g->pci);
836
837//    struct pci_device *ethernet = pci_ethernet_new(g->lpc, g);
838//    int r = pci_attach_device(g->pci, 0, 2, ethernet);
839//	assert(r == 0);
840//
841//	struct pci_device *vmkitmon_eth = pci_vmkitmon_eth_new(g->lpc, g);
842//	r = pci_attach_device(g->pci, 0, 3, vmkitmon_eth);
843//	assert(r==0);
844
845    // set up bios memory
846    // FIXME: find a modular way to do this
847    *(uint16_t *)guest_to_host(g->mem_low_va + 0x400) = 0x3f8;  // COM1
848    *(uint16_t *)guest_to_host(g->mem_low_va + 0x402) = 0x2f8;  // COM2
849
850    g->runnable = true;
851}
852
853/**
854 * \brief Create a new guest.
855 *
856 * This function creates a new guest. It will do everything necessary to make
857 * the guest accept images to run. It will create a new domain and assign some
858 * memory to that domain. Afterwards it will load a bios into the memory and
859 * set the guest initial IP to the POST entry of the bios.
860 *
861 * \return The pointer to the newly created structure describing the guest.
862 */
863struct guest *
864guest_create (void)
865{
866    // support the allocation of one guest for now
867    assert(__guestp == NULL);
868    __guestp = &__guest;
869    memset(__guestp, 0, sizeof(struct guest));
870    guest_setup(__guestp);
871    return __guestp;
872}
873
874static int
875run_realmode (struct guest *g)
876{
877    int r;
878
879    realmode_switch_to(g);
880    r = realmode_exec();
881    assert(r == REALMODE_ERR_OK);
882    realmode_switch_from(g);
883
884    guest_handle_vmexit(g);
885
    return 0;
}
888
889#ifndef CONFIG_SVM
890// Return true if the "Enable EPT" Secondary Processor-based control is
891// set in the VMCS, else false.
892static inline bool vmx_ept_enabled(struct guest *g)
893{
894    uint64_t sp_controls;
895    errval_t err = invoke_dispatcher_vmread(g->dcb_cap, VMX_EXEC_SEC_PROC, &sp_controls);
896    assert(err_is_ok(err));
897    return ((sp_controls & SP_CLTS_ENABLE_EPT) != 0);
898}
899
900// Set or clear the "Descriptor-table exiting" Secondary Processor-based
901// control if val is 1 or 0, respectively.
902static inline void vmx_intercept_desc_table_wrf(struct guest *g, int val)
903{
904    assert(val == 0 || val == 1);
905
906    uint64_t sec_proc_ctrls;
907    errval_t err = invoke_dispatcher_vmread(g->dcb_cap, VMX_EXEC_SEC_PROC, &sec_proc_ctrls);
908    if (val) {
909        uint64_t prim_proc_ctrls;
910	err += invoke_dispatcher_vmread(g->dcb_cap, VMX_EXEC_PRIM_PROC, &prim_proc_ctrls);
911	assert(prim_proc_ctrls & PP_CLTS_SEC_CTLS);
912	err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_EXEC_SEC_PROC,
913					 sec_proc_ctrls | SP_CLTS_DESC_TABLE);
914    } else {
915        err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_EXEC_SEC_PROC,
916					 sec_proc_ctrls & ~SP_CLTS_DESC_TABLE);
917    }
918    assert(err_is_ok(err));
919}
920
921
922// Before entering the guest, synchronize the CR0 shadow with the guest
923// CR0 value that is potentially changed in the real-mode emulator.
924static inline void vmx_set_cr0_shadow(struct guest *g)
925{
926    uint64_t cr0_shadow;
927    errval_t err = invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_CR0, &cr0_shadow);
928    err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_CR0_RD_SHADOW, cr0_shadow);
929    assert(err_is_ok(err));
930}
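/*
 * Background sketch: for every CR0 bit that is set in the CR0 guest/host
 * mask, VMX makes guest reads return the read-shadow bit instead of the real
 * guest CR0 bit, roughly
 *
 *     guest-visible bit = (mask & bit) ? read-shadow bit : guest CR0 bit
 *
 * hence the shadow must be refreshed here after the real-mode emulator may
 * have rewritten CR0 behind the VMCS's back.
 */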
931#endif
932
933/**
934 * \brief Marks a guest as runnable.
935 *
936 * A call to this method will update the guest's runnable state and, if made
937 * runnable, yield the remaining time slice to the guest domain.
938 *
939 * \return Zero on success, non-zero on error
940 */
941errval_t
942guest_make_runnable (struct guest *g, bool run)
943{
944    assert(g->runnable);
945
946    errval_t err;
947
    /* If the guest is currently in real mode (CR0.PE flag clear) then we do
     * not schedule the domain to run under hardware virtualization, but run
     * the real-mode emulator instead */
951#ifdef CONFIG_SVM
952    if (UNLIKELY(run && amd_vmcb_cr0_rd(&g->vmcb).pe == 0)) {
953        if (!g->emulated_before_exit) {
954            // do the inverse of the code below
955            amd_vmcb_intercepts_rdgdtr_wrf(&g->vmcb, 1);
956            amd_vmcb_intercepts_wrgdtr_wrf(&g->vmcb, 1);
957            amd_vmcb_intercepts_rdldtr_wrf(&g->vmcb, 1);
958            amd_vmcb_intercepts_wrldtr_wrf(&g->vmcb, 1);
959            amd_vmcb_intercepts_rdidtr_wrf(&g->vmcb, 1);
960            amd_vmcb_intercepts_wridtr_wrf(&g->vmcb, 1);
961            amd_vmcb_intercepts_rdtr_wrf(&g->vmcb, 1);
962            amd_vmcb_intercepts_wrtr_wrf(&g->vmcb, 1);
963            amd_vmcb_cr_access_rdcr0_wrf(&g->vmcb, 1);
964            amd_vmcb_cr_access_wrcr0_wrf(&g->vmcb, 1);
965            amd_vmcb_cr_access_rdcr3_wrf(&g->vmcb, 1);
966            amd_vmcb_cr_access_wrcr3_wrf(&g->vmcb, 1);
967            amd_vmcb_intercepts_intn_wrf(&g->vmcb, 1);
968
969            // mark guest as emulated
970            g->emulated_before_exit = true;
971        }
972#else
973    uint64_t guest_cr0;
974    err = invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_CR0, &guest_cr0);
975    assert(err_is_ok(err));
976    if (UNLIKELY(run && (guest_cr0 & CR0_PE) == 0)) {
977        if (!g->emulated_before_exit) {
978	    vmx_intercept_desc_table_wrf(g, 1);
979	    g->emulated_before_exit = true;
980	}
981#endif
982#if 0 /* why create a thread for this? it seems fine without! -AB */
983        struct thread *t = thread_create((thread_func_t)run_realmode, g);
984        assert(t != NULL);
985        err = thread_detach(t);
986        assert(err_is_ok(err));
987#else
988        run_realmode(g);
989#endif
990        return SYS_ERR_OK;
991    }
992
    /* every time we move the machine from emulated to virtualized execution
     * we need to adjust some intercepts */
995    if (UNLIKELY(run && g->emulated_before_exit)) {
996#ifdef CONFIG_SVM
997        // we enforce NP to be enabled (no shadow paging support)
998        assert(amd_vmcb_np_rd(&g->vmcb).enable == 1);
999
1000        // disable GDTR intercept
1001        amd_vmcb_intercepts_rdgdtr_wrf(&g->vmcb, 0);
1002        amd_vmcb_intercepts_wrgdtr_wrf(&g->vmcb, 0);
        // disable LDTR intercept
1004        amd_vmcb_intercepts_rdldtr_wrf(&g->vmcb, 0);
1005        amd_vmcb_intercepts_wrldtr_wrf(&g->vmcb, 0);
1006        // disable IDTR intercept
1007        amd_vmcb_intercepts_rdidtr_wrf(&g->vmcb, 0);
1008        amd_vmcb_intercepts_wridtr_wrf(&g->vmcb, 0);
1009        // disable TR intercept
1010        amd_vmcb_intercepts_rdtr_wrf(&g->vmcb, 0);
1011        amd_vmcb_intercepts_wrtr_wrf(&g->vmcb, 0);
        // disable non-essential CR0 access intercepts
1013        amd_vmcb_cr_access_rdcr0_wrf(&g->vmcb, 0);
1014        amd_vmcb_cr_access_wrcr0_wrf(&g->vmcb, 0);
1015        // disable CR3 access intercepts
1016        assert(amd_vmcb_np_rd(&g->vmcb).enable != 0);
1017        amd_vmcb_cr_access_rdcr3_wrf(&g->vmcb, 0);
1018        amd_vmcb_cr_access_wrcr3_wrf(&g->vmcb, 0);
1019        // disable INTn intercept
1020        // we have to be outside of real mode for this to work
1021        assert(amd_vmcb_cr0_rd(&g->vmcb).pe != 0);
1022        amd_vmcb_intercepts_intn_wrf(&g->vmcb, 0);
1023#else
1024        bool ept_enabled = vmx_ept_enabled(g);
1025	assert(ept_enabled);
1026	vmx_intercept_desc_table_wrf(g, 0);
1027	assert(guest_cr0 & CR0_PE);
1028	vmx_set_cr0_shadow(g);
1029#endif
1030        // mark guest as not emulated
1031        g->emulated_before_exit = false;
1032    }
1033
    // update the guest domain's runnable state
1035    err = invoke_dispatcher(g->dcb_cap, NULL_CAP, NULL_CAP, NULL_CAP, NULL_CAP, run);
1036    assert_err(err, "dispatcher_make_runnable");
1037    // yield the dispatcher
1038    if (run) {
1039        thread_yield_dispatcher(NULL_CAP);
1040    }
1041
1042    return SYS_ERR_OK;
1043}
1044
/* VMEXIT handlers */
1046
1047#define HANDLER_ERR_OK          (0)
1048#define HANDLER_ERR_FATAL       (-1)
1049
1050#ifdef CONFIG_SVM
1051static int
1052handle_vmexit_unhandeled (struct guest *g)
1053{
    printf("Unhandled guest vmexit:\n");
1055    printf(" code:\t  %lx\n", amd_vmcb_exitcode_rd(&g->vmcb));
1056    printf(" info1:\t  %lx\n", amd_vmcb_exitinfo1_rd(&g->vmcb));
1057    printf(" info2:\t  %lx\n", amd_vmcb_exitinfo2_rd(&g->vmcb));
1058    printf(" intinfo: %lx\n", amd_vmcb_exitintinfo_rd(&g->vmcb));
1059
1060    printf("VMCB save area:\n");
1061    printf(" cr0:\t%lx\n", amd_vmcb_cr0_rd_raw(&g->vmcb));
1062    printf(" cr2:\t%lx\n", amd_vmcb_cr2_rd_raw(&g->vmcb));
1063    printf(" cr3:\t%lx\n", amd_vmcb_cr3_rd_raw(&g->vmcb));
1064    printf(" cr4:\t%lx\n", amd_vmcb_cr4_rd_raw(&g->vmcb));
1065    printf(" efer:\t%lx\n", amd_vmcb_efer_rd_raw(&g->vmcb));
1066    printf(" rip:\t%lx\n", amd_vmcb_rip_rd_raw(&g->vmcb));
1067    printf(" cs:\tselector %x, base %lx, limit %x, attrib %x\n",
1068           amd_vmcb_cs_selector_rd(&g->vmcb), amd_vmcb_cs_base_rd(&g->vmcb),
1069           amd_vmcb_cs_limit_rd(&g->vmcb), amd_vmcb_cs_attrib_rd_raw(&g->vmcb));
1070    printf(" ds:\tselector %x, base %lx, limit %x, attrib %x\n",
1071           amd_vmcb_ds_selector_rd(&g->vmcb), amd_vmcb_ds_base_rd(&g->vmcb),
1072           amd_vmcb_ds_limit_rd(&g->vmcb), amd_vmcb_ds_attrib_rd_raw(&g->vmcb));
1073    printf(" es:\tselector %x, base %lx, limit %x, attrib %x\n",
1074           amd_vmcb_es_selector_rd(&g->vmcb), amd_vmcb_es_base_rd(&g->vmcb),
1075           amd_vmcb_es_limit_rd(&g->vmcb), amd_vmcb_es_attrib_rd_raw(&g->vmcb));
1076    printf(" ss:\tselector %x, base %lx, limit %x, attrib %x\n",
1077           amd_vmcb_ss_selector_rd(&g->vmcb), amd_vmcb_ss_base_rd(&g->vmcb),
1078           amd_vmcb_ss_limit_rd(&g->vmcb), amd_vmcb_ss_attrib_rd_raw(&g->vmcb));
1079    printf(" rax:\t%lx\n", amd_vmcb_rax_rd_raw(&g->vmcb));
1080    printf(" rbx:\t%lx\n", g->ctrl->regs.rbx);
1081    printf(" rcx:\t%lx\n", g->ctrl->regs.rcx);
1082    printf(" rdx:\t%lx\n", g->ctrl->regs.rdx);
1083    printf(" rsi:\t%lx\n", g->ctrl->regs.rsi);
1084    printf(" rdi:\t%lx\n", g->ctrl->regs.rdi);
1085
1086    return HANDLER_ERR_FATAL;
1087}
1088#else
1089static int
1090handle_vmexit_unhandeled (struct guest *g)
1091{
    printf("Unhandled guest vmexit:\n");
1093    printf(" exit reason:\t %"PRIu16"\n", saved_exit_reason);
1094    printf(" exit qualification:\t %"PRIx64"\n", saved_exit_qual);
1095    printf(" next rip (I/O instruction):\t %"PRIx64"\n", saved_rip);
1096
1097    uint64_t gpaddr;
1098    errval_t err = invoke_dispatcher_vmread(g->dcb_cap, VMX_GPADDR_F, &gpaddr);
1099    printf(" guest physical-address:\t %"PRIx64"\n", gpaddr);
1100
1101    uint64_t guest_cr0, guest_cr3, guest_cr4;
1102    err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_CR0, &guest_cr0);
1103    err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_CR3, &guest_cr3);
1104    err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_CR4, &guest_cr4);
1105
1106    uint64_t guest_efer, guest_rip;
1107    err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_EFER_F, &guest_efer);
1108    err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_RIP, &guest_rip);
1109
1110    uint64_t guest_cs_sel, guest_cs_base, guest_cs_lim,
1111        guest_cs_access;
1112    err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_CS_SEL, &guest_cs_sel);
1113    err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_CS_BASE, &guest_cs_base);
1114    err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_CS_LIM, &guest_cs_lim);
1115    err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_CS_ACCESS, &guest_cs_access);
1116
1117    uint64_t guest_ds_sel, guest_ds_base, guest_ds_lim,
1118        guest_ds_access;
1119    err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_DS_SEL, &guest_ds_sel);
1120    err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_DS_BASE, &guest_ds_base);
1121    err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_DS_LIM, &guest_ds_lim);
1122    err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_DS_ACCESS, &guest_ds_access);
1123
1124    uint64_t guest_es_sel, guest_es_base, guest_es_lim,
1125        guest_es_access;
1126    err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_ES_SEL, &guest_es_sel);
1127    err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_ES_BASE, &guest_es_base);
1128    err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_ES_LIM, &guest_es_lim);
1129    err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_ES_ACCESS, &guest_es_access);
1130
1131    uint64_t guest_ss_sel, guest_ss_base, guest_ss_lim,
1132        guest_ss_access;
1133    err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_SS_SEL, &guest_ss_sel);
1134    err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_SS_BASE, &guest_ss_base);
1135    err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_SS_LIM, &guest_ss_lim);
1136    err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_SS_ACCESS, &guest_ss_access);
1137    assert(err_is_ok(err));
1138
1139    printf("VMCS save area:\n");
1140    printf(" cr0:\t%lx\n", guest_cr0);
1141    printf(" cr3:\t%lx\n", guest_cr3);
1142    printf(" cr4:\t%lx\n", guest_cr4);
1143    printf(" efer:\t%lx\n", guest_efer);
1144    printf(" rip:\t%lx\n", guest_rip);
1145    printf(" cs:\tselector %lx, base %lx, limit %lx, access %lx\n",
1146           guest_cs_sel, guest_cs_base, guest_cs_lim, guest_cs_access);
1147    printf(" ds:\tselector %lx, base %lx, limit %lx, access %lx\n",
1148           guest_ds_sel, guest_ds_base, guest_ds_lim, guest_ds_access);
1149    printf(" es:\tselector %lx, base %lx, limit %lx, access %lx\n",
1150           guest_es_sel, guest_es_base, guest_es_lim, guest_es_access);
1151    printf(" ss:\tselector %lx, base %lx, limit %lx, access %lx\n",
1152           guest_ss_sel, guest_ss_base, guest_ss_lim, guest_ss_access);
1153    printf(" rax:\t%lx\n", g->ctrl->regs.rax);
1154    printf(" rbx:\t%lx\n", g->ctrl->regs.rbx);
1155    printf(" rcx:\t%lx\n", g->ctrl->regs.rcx);
1156    printf(" rdx:\t%lx\n", g->ctrl->regs.rdx);
1157    printf(" rsi:\t%lx\n", g->ctrl->regs.rsi);
1158    printf(" rdi:\t%lx\n", g->ctrl->regs.rdi);
1159
1160    return HANDLER_ERR_FATAL;
1161}
1162#endif
1163
1164static inline uint64_t
1165lookup_paddr_long_mode (struct guest *g, uint64_t vaddr)
1166{
1167    union x86_lm_va va = { .raw = vaddr };
1168    uint64_t *page_table;
1169
1170    // get a pointer to the pml4 table
1171#ifdef CONFIG_SVM
1172    page_table = (uint64_t *)guest_to_host(amd_vmcb_cr3_rd(&g->vmcb));
1173#else
1174    uint64_t guest_cr3;
1175    errval_t err = invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_CR3, &guest_cr3);
1176    assert(err_is_ok(err));
1177    page_table = (uint64_t *)guest_to_host(guest_cr3);
1178#endif
1179    // get pml4 entry
1180    union x86_lm_pml4_entry pml4e = { .raw = page_table[va.u.pml4_idx] };
1181    assert (pml4e.u.p == 1);
1182
1183    // get a pointer to the pdp table
1184    page_table = (uint64_t *)guest_to_host(pml4e.u.pdp_base_pa << 12);
1185    // get pdp entry
1186    union x86_lm_pdp_entry pdpe = { .raw = page_table[va.u.pdp_idx] };
1187    assert(pdpe.u.p == 1);
1188    // check for 1GB page (PS bit set)
1189    if (pdpe.u.ps == 1) {
1190        return (pdpe.u1gb.base_pa << 30) | va.u1gb.pa_offset;
1191    }
1192
1193    // get a pointer to the pd table
1194    page_table = (uint64_t *)guest_to_host(pdpe.u.pd_base_pa << 12);
1195    // get pd entry
1196    union x86_lm_pd_entry pde = { .raw = page_table[va.u.pd_idx] };
1197    if (pde.u.p == 0) {
1198        printf("g2h %lx, pml4e %p %lx, pdpe %p %lx, pde %p %lx\n",
1199	       guest_to_host(0), &pml4e, pml4e.raw, &pdpe, pdpe.raw, &pde, pde.raw);
1200    }
1201    assert(pde.u.p == 1);
1202    // check for 2MB page (PS bit set)
1203    if (pde.u.ps == 1) {
1204        return (pde.u2mb.base_pa << 21) | va.u2mb.pa_offset;
1205    }
1206
1207    // get a pointer to the page table
1208    page_table = (uint64_t *)guest_to_host(pde.u.pt_base_pa << 12);
1209    // get the page table entry
1210    union x86_lm_pt_entry pte = { .raw = page_table[va.u.pt_idx] };
1211    assert(pte.u.p == 1);
1212
1213    return (pte.u.base_pa << 12) | va.u.pa_offset;
1214}
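/*
 * For illustration, union x86_lm_va above is assumed to decompose a long-mode
 * virtual address into the standard 4-level indices, e.g. for
 * vaddr = 0xffffffff81000000:
 *
 *     pml4_idx  = (vaddr >> 39) & 0x1ff = 0x1ff
 *     pdp_idx   = (vaddr >> 30) & 0x1ff = 0x1fe
 *     pd_idx    = (vaddr >> 21) & 0x1ff = 0x008
 *     pt_idx    = (vaddr >> 12) & 0x1ff = 0x000
 *     pa_offset =  vaddr        & 0xfff = 0x000
 *
 * with 1GB and 2MB pages cutting the walk short at the PDPE or PDE level as
 * handled above.
 */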
1215
1216static inline uint32_t
1217lookup_paddr_legacy_mode (struct guest *g, uint32_t vaddr)
1218{
1219//	printf("lookup_paddr_legacy_mode enter\n");
1220    // PAE not supported
1221#ifdef CONFIG_SVM
1222    guest_assert(g, amd_vmcb_cr4_rd(&g->vmcb).pae == 0);
1223#else
1224    uint64_t guest_cr4;
1225    errval_t err = invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_CR4, &guest_cr4);
1226    guest_assert(g, (guest_cr4 & CR4_PAE) == 0);
1227#endif
1228    union x86_legm_va va = { .raw = vaddr };
1229    uint32_t *page_table;
1230
1231    // get a pointer to the pd table
1232#ifdef CONFIG_SVM
1233    page_table = (uint32_t *)guest_to_host(amd_vmcb_cr3_rd(&g->vmcb));
1234#else
1235    uint64_t guest_cr3;
1236    err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_CR3, &guest_cr3);
1237    assert(err_is_ok(err));
1238    page_table = (uint32_t *)guest_to_host(guest_cr3);
1239#endif
1240
1241    // get pd entry
1242    union x86_legm_pd_entry pde = { .raw = page_table[va.u.pd_idx] };
1243    assert (pde.u.p == 1);
1244    // check for 4MB page (PS bit set)
1245    if (pde.u.ps == 1) {
1246        return (pde.u4mb.base_pa << 22) | va.u4mb.pa_offset;
1247    }
1248
1249    // get a pointer to the page table
1250    page_table = (uint32_t *)guest_to_host(pde.u.pt_base_pa << 12);
1251    // get the page table entry
1252    union x86_legm_pt_entry pte = { .raw = page_table[va.u.pt_idx] };
1253    assert(pte.u.p == 1);
1254
1255    return (pte.u.base_pa << 12) | va.u.pa_offset;
1256}
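/*
 * The analogous decomposition for the legacy (non-PAE) two-level walk is
 *
 *     pd_idx    = (vaddr >> 22) & 0x3ff
 *     pt_idx    = (vaddr >> 12) & 0x3ff
 *     pa_offset =  vaddr        & 0xfff
 *
 * and a PDE with the PS bit set short-circuits the walk to a 4MB page,
 * exactly as handled above.
 */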
1257
// returns a pointer to a byte array starting at the current instruction
1259static inline int
1260get_instr_arr (struct guest *g, uint8_t **arr)
1261{
1262#ifdef CONFIG_SVM
1263    if (UNLIKELY(amd_vmcb_cr0_rd(&g->vmcb).pg == 0)) {
1264#else
1265    uint64_t guest_cr0;
1266    errval_t err = invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_CR0, &guest_cr0);
1267    if (UNLIKELY((guest_cr0 & CR0_PG) == 0)) {
1268#endif
1269    	//printf("Segmentation active!\n");
1270        // without paging
1271        // take segmentation into account
1272#ifdef CONFIG_SVM
1273        *arr = (uint8_t *)(guest_to_host(g->mem_low_va) +
1274               amd_vmcb_cs_base_rd(&g->vmcb) +
1275               amd_vmcb_rip_rd(&g->vmcb));
1276#else
1277	uint64_t guest_cs_base, guest_rip;
1278	err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_CS_BASE, &guest_cs_base);
1279	err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_RIP, &guest_rip);
1280        *arr = (uint8_t *)(guest_to_host(g->mem_low_va) +
1281			   guest_cs_base + guest_rip);
1282#endif
1283    } else {
1284        // with paging
1285#ifdef CONFIG_SVM
1286        if (amd_vmcb_efer_rd(&g->vmcb).lma == 1) {
1287#else
1288	uint64_t guest_efer;
1289	err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_EFER_F, &guest_efer);
1290	if (guest_efer & EFER_LMA) {
1291#endif
1292            // long mode
1293#ifdef CONFIG_SVM
1294            if (amd_vmcb_cs_attrib_rd(&g->vmcb).l == 1) {
1295                // 64-bit mode
1296                *arr = (uint8_t *)guest_to_host(lookup_paddr_long_mode(g,
1297                                                amd_vmcb_rip_rd(&g->vmcb)));
1298#else
1299	    uint64_t cs_access_rights, guest_rip;
1300	    err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_CS_ACCESS, &cs_access_rights);
1301	    err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_RIP, &guest_rip);
1302	    if (cs_access_rights & ACCESS_RIGHTS_LONG_MODE) {
1303                *arr = (uint8_t *)guest_to_host(lookup_paddr_long_mode(g,
1304                                                guest_rip));
1305#endif
1306            } else {
                // compatibility mode
                guest_assert(g, !"compatibility mode not supported yet");
1309            }
1310        } else {
1311            // Legacy (aka. Paged Protected) Mode
1312#ifdef CONFIG_SVM
1313            assert(amd_vmcb_cr0_rd(&g->vmcb).pe == 1);
1314
1315            *arr = (uint8_t *)guest_to_host(lookup_paddr_legacy_mode(g,
1316                                            amd_vmcb_rip_rd(&g->vmcb)));
1317#else
1318	    assert(guest_cr0 & CR0_PE);
1319
1320	    uint64_t guest_rip;
1321	    err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_RIP, &guest_rip);
1322            *arr = (uint8_t *)guest_to_host(lookup_paddr_legacy_mode(g,
1323                                            guest_rip));
1324#endif
1325        }
1326    }
1327#ifndef CONFIG_SVM
1328    assert(err_is_ok(err));
1329#endif
1330    return HANDLER_ERR_OK;
1331}
1332
1333static inline uint64_t
1334get_reg_val_by_reg_num (struct guest *g, uint8_t regnum) {
1335    switch (regnum) {
1336    case 0:
1337        return guest_get_rax(g);
1338    case 1:
1339        return guest_get_rcx(g);
1340    case 2:
1341        return guest_get_rdx(g);
1342    case 3:
1343        return guest_get_rbx(g);
1344    case 4:
1345        return guest_get_rsp(g);
1346    case 5:
1347        return guest_get_rbp(g);
1348    case 6:
1349        return guest_get_rsi(g);
1350    case 7:
1351        return guest_get_rdi(g);
1352    default:
1353        assert(!"not reached");
1354        return 0;
1355    }
1356}
1357
1358static inline void
1359set_reg_val_by_reg_num (struct guest *g, uint8_t regnum, uint64_t val) {
1360    switch (regnum) {
1361    case 0:
1362        guest_set_rax(g, val);
1363        break;
1364    case 1:
1365        guest_set_rcx(g, val);
1366        break;
1367    case 2:
1368        guest_set_rdx(g, val);
1369        break;
1370    case 3:
1371        guest_set_rbx(g, val);
1372        break;
1373    case 4:
1374        guest_set_rsp(g, val);
1375        break;
1376    case 5:
1377        guest_set_rbp(g, val);
1378        break;
1379    case 6:
1380        guest_set_rsi(g, val);
1381        break;
1382    case 7:
1383        guest_set_rdi(g, val);
1384        break;
1385    default:
1386        assert(!"not reached");
1387        break;
1388    }
1389}
1390
1391static int
1392handle_vmexit_cr_access (struct guest *g)
1393{
1394    int r;
1395    uint8_t *code = NULL;
1396#ifndef CONFIG_SVM
1397    errval_t err = 0;
1398    if (g->emulated_before_exit) {
1399        assert(saved_exit_reason == VMX_EXIT_REASON_CR_ACCESS);
1400        assert(((saved_exit_qual >> 0) & 0xf) == 0);
1401    }
1402#endif
1403    // fetch the location to the code
1404    r = get_instr_arr(g, &code);
1405    if (r != HANDLER_ERR_OK) {
1406        return r;
1407    }
1408    assert(code != NULL);
1409
1410    assert(code[0] == 0x0f && (code[1] == 0x20 || code[1] == 0x22));
1411
1412    uint64_t val;
1413    bool read = (code[1] == 0x20);
1414    union x86_modrm mod;
1415    mod.raw = code[2];
1416
1417    // FIXME: use proper exception
1418    assert(mod.u.mod == 3);
1419
1420    // source
1421    if (read) {
1422        // read from CR
1423        switch (mod.u.regop) {
1424        case 0:
1425#ifdef CONFIG_SVM
1426            val = amd_vmcb_cr0_rd_raw(&g->vmcb);
1427#else
1428	    err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_CR0, &val);
1429#endif
1430            break;
1431        default:
1432            printf("CR access: unknown CR source register\n");
1433            return handle_vmexit_unhandeled(g);
1434        }
1435    } else {
1436        // read from GPR
1437        val = get_reg_val_by_reg_num(g, mod.u.rm);
1438    }
1439
1440    // destination
1441    if (read) {
1442        // write to GPR
1443        switch (mod.u.rm) {
1444        case 0:
1445            guest_set_rax(g, val);
1446            break;
1447        case 1:
1448            guest_set_rcx(g, val);
1449            break;
1450        case 2:
1451            guest_set_rdx(g, val);
1452            break;
1453        case 3:
1454            guest_set_rbx(g, val);
1455            break;
1456        default:
1457            printf("CR access: unknown GPR destination register\n");
1458            return handle_vmexit_unhandeled(g);
1459        }
1460    } else {
1461        // write to CR
1462        switch (mod.u.regop) {
1463        case 0:
1464#ifdef CONFIG_SVM
1465            amd_vmcb_cr0_wr_raw(&g->vmcb, val);
1466#else
1467	    err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_GUEST_CR0, val);
1468#endif
1469            break;
1470
1471        case 4:
            // allow writes to CR4 by simply ignoring them (no-op)
1474            break;
1475        default:
1476            printf("CR access: unknown CR destination register\n");
1477            return handle_vmexit_unhandeled(g);
1478        }
1479    }
1480
1481    // advance the rip beyond the instruction
1482#ifdef CONFIG_SVM
1483    amd_vmcb_rip_wr(&g->vmcb, amd_vmcb_rip_rd(&g->vmcb) + 3);
1484#else
1485    uint64_t guest_rip;
1486    err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_RIP, &guest_rip);
1487    err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_GUEST_RIP, guest_rip + 3);
1488    assert(err_is_ok(err));
1489#endif
1490    return HANDLER_ERR_OK;
1491}
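/*
 * Decoding example for the handler above: a guest executing
 *
 *     0f 22 c0        mov cr0, eax
 *
 * yields code[1] == 0x22 (write to a control register) and ModRM 0xc0, which
 * decodes as mod = 3, regop = 0 (CR0), rm = 0 (RAX); the new CR0 value is
 * therefore taken from the guest's RAX and RIP is advanced by 3 bytes.
 */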
1492
1493static int
1494handle_vmexit_ldt (struct guest *g)
1495{
1496    int r;
1497    uint8_t *code = NULL;
1498    uint8_t *mem;
1499
1500    // this handler supports only real-mode
1501#ifdef CONFIG_SVM
1502    assert(amd_vmcb_cr0_rd(&g->vmcb).pe == 0);
1503#else
1504    uint64_t guest_cr0;
1505    errval_t err = invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_CR0, &guest_cr0);
1506    assert((guest_cr0 & CR0_PE) == 0);
1507#endif
1508    // fetch the location to the code
1509    r = get_instr_arr(g, &code);
1510    if (r != HANDLER_ERR_OK) {
1511        return r;
1512    }
1513    mem = (uint8_t *)guest_to_host(g->mem_low_va);
1514    assert(code != NULL);
1515
1516    assert (code[0] == 0x0f && code[1] == 0x01);
1517
1518    // check for relevant instruction prefixes
1519    bool addr32 = code[-2] == 0x67 || code[-1] == 0x67;
1520    bool op32 = code[-2] == 0x66 || code[-1] == 0x66;
1521    // fetch modrm
1522    union x86_modrm modrm = { .raw = code[2] };
1523
1524    assert(modrm.u.regop == 2 || modrm.u.regop == 3);
1525    guest_assert(g, op32);
1526
1527    uint32_t addr;
1528    if (addr32) {
        // bytes 3-6 hold a 32-bit address of a mem location where the first
        // word holds the limit and the following dword holds the base
1531        addr = *(uint32_t *)&code[3];
1532    } else {
        // bytes 3-4 hold a 16-bit address of a mem location where the first
        // word holds the limit and the following dword holds the base;
        // this address is relative to the DS base
1536#ifdef CONFIG_SVM
1537        addr = *(uint16_t *)&code[3] + amd_vmcb_ds_base_rd(&g->vmcb);
1538#else
1539	uint64_t guest_ds_base;
1540	err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_DS_BASE, &guest_ds_base);
1541	addr = *(uint16_t *)&code[3] + guest_ds_base;
1542#endif
1543    }
1544
    // sanity check on the addr
1546    // FIXME: raise a proper exception
1547    if (addr > g->mem_high_va) {
1548        printf("Memory access beyond physical address space\n");
1549        return HANDLER_ERR_FATAL;
1550    }
1551
1552    // load the actual register
1553    if (modrm.u.regop == 2) {
1554        // LGDT
1555#ifdef CONFIG_SVM
1556        amd_vmcb_gdtr_limit_wr(&g->vmcb, *(uint16_t*)(mem + addr));
1557        amd_vmcb_gdtr_base_wr(&g->vmcb, *(uint32_t*)(mem + addr + 2));
1558#else
1559	err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_GUEST_GDTR_LIM,
1560					 *(uint16_t*)(mem + addr));
1561        err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_GUEST_GDTR_BASE,
1562					 *(uint32_t*)(mem + addr + 2));
1563#endif
1564
1565    } else if (modrm.u.regop == 3) {
1566        // LIDT
1567#ifdef CONFIG_SVM
1568        amd_vmcb_idtr_limit_wr(&g->vmcb, *(uint16_t*)(mem + addr));
1569        amd_vmcb_idtr_base_wr(&g->vmcb, *(uint32_t*)(mem + addr + 2));
1570#else
1571	err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_GUEST_IDTR_LIM,
1572					 *(uint16_t*)(mem + addr));
1573	err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_GUEST_IDTR_BASE,
1574					 *(uint32_t*)(mem + addr + 2));
1575#endif
1576    } else {
1577        assert(!"not reached");
1578    }
1579
1580    // advance the rip beyond the instruction
1581#ifdef CONFIG_SVM
1582    if (addr32) {
1583        amd_vmcb_rip_wr(&g->vmcb, amd_vmcb_rip_rd(&g->vmcb) + 7);
1584    } else {
1585        amd_vmcb_rip_wr(&g->vmcb, amd_vmcb_rip_rd(&g->vmcb) + 5);
1586    }
1587#else
1588    uint64_t guest_rip;
1589    err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_RIP, &guest_rip);
1590    if (addr32) {
1591        err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_GUEST_RIP, guest_rip + 7);
1592    } else {
1593        err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_GUEST_RIP, guest_rip + 5);
1594    }
1595    assert(err_is_ok(err));
1596#endif
1597    return HANDLER_ERR_OK;
1598}
1599
1600#ifndef CONFIG_SVM
1601static inline void vmx_vmcs_rflags_cf_wrf(struct guest *g, int val) {
1602    assert(val == 0 || val == 1);
1603    uint64_t guest_rflags;
1604    errval_t err = invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_RFLAGS, &guest_rflags);
1605    if (val) {
1606        err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_GUEST_RFLAGS,
1607					 guest_rflags | RFLAGS_CF);
1608    } else {
1609        err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_GUEST_RFLAGS,
1610					 guest_rflags & (~RFLAGS_CF));
1611    }
1612    assert(err_is_ok(err));
1613}
1614#endif
1615
1616static int
1617handle_vmexit_swint (struct guest *g)
1618{
1619    int r;
1620    uint8_t *code = NULL;
1621
1622    r = get_instr_arr(g, &code);
1623    if (r != HANDLER_ERR_OK) {
1624        return r;
1625    }
1626    assert (code != NULL);
1627
    // check for the correct instruction (INT imm8)
1629    assert(code[0] == 0xcd);
1630
    // the interrupt number follows the INT (0xcd) opcode
1632    uint8_t int_num = code[1];
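    // INT imm8 is two bytes long (CD ib), hence the RIP is advanced by 2 at
    // the end of this handler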
1633
1634    // check whether the guest is in real mode
1635#ifdef CONFIG_SVM
1636    if (amd_vmcb_cr0_rd(&g->vmcb).pe == 0) {
1637#else
1638    uint64_t guest_ds_base, es_guest_base;
1639    uint64_t guest_cr0, guest_rip;
1640    errval_t err = invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_CR0, &guest_cr0);
1641    if ((guest_cr0 & CR0_PE) == 0) {
1642#endif
        // in real mode, software interrupts starting at 0x10 are BIOS services
        // examine the sw interrupt
1645        switch (int_num) {
1646            case 0x10:
1647                r = console_handle_int10(g->console, g);
1648                if (r != HANDLER_ERR_OK) {
                    printf("Unhandled method on INT 0x10\n");
1650                    return handle_vmexit_unhandeled(g);
1651                }
1652                break;
1653            case 0x12:
1654                switch (guest_get_ax(g)) {
1655                    case 0: // GET MEMORY SIZE
                        // our VM always has 1 MiB of base memory; AX returns
                        // the number of contiguous 1 KiB blocks starting at
                        // address 0, i.e. 640 (640 KiB)
1659                        guest_set_ax(g, 640);
1660                        break;
1661                    default:
                        printf("Unhandled method on INT 0x12\n");
1663                        return handle_vmexit_unhandeled(g);
1664                }
1665                break;
1666            case 0x13:
1667                // Bootable CD-ROM - GET STATUS
1668                if (guest_get_ax(g) == 0x4b01) {
1669                    // no cdrom support
1670#ifdef CONFIG_SVM
1671                    amd_vmcb_rflags_cf_wrf(&g->vmcb, 1);
1672#else
1673		    vmx_vmcs_rflags_cf_wrf(g, 1);
1674#endif
1675                }
1676                // DISK RESET
1677                else if (guest_get_ah(g) == 0) {
1678                    for (int i = 0; i < g->hdd_count; i++) {
1679                        hdd_reset(g->hdds[i]);
1680                    }
1681                }
1682                // DISK - GET DRIVE PARAMETERS (PC,XT286,CONV,PS,ESDI,SCSI)
1683                else if (guest_get_ah(g) == 0x08) {
1684                    uint8_t dl = guest_get_dl(g);
1685
1686                    // only respond to installed hard disks
1687                    if ((dl >> 7) && ((dl & 0x7f) < g->hdd_count)) {
1688                        uint16_t c;
1689                        uint8_t h, s;
1690
1691                        r = hdd_get_geometry_chs(g->hdds[dl & 0x7f], &c, &h, &s);
1692                        assert(r == 0);
1693
1694                        // set some return values for success
1695                        guest_set_ah(g, 0);
1696#ifdef CONFIG_SVM
1697                        amd_vmcb_rflags_cf_wrf(&g->vmcb, 0);
1698#else
1699			vmx_vmcs_rflags_cf_wrf(g, 0);
1700#endif
1701                        guest_set_bl(g, 0);
1702                        // store the geometry into the correct registers
1703                        guest_set_cx(g, c << 6 | (s & 0x3f));
1704                        guest_set_dh(g, h);
1705                        guest_set_dl(g, g->hdd_count);
1706                    } else {
1707#ifdef CONFIG_SVM
1708                        amd_vmcb_rflags_cf_wrf(&g->vmcb, 1);
1709#else
1710			vmx_vmcs_rflags_cf_wrf(g, 1);
1711#endif
                        // it is not really clear what AH should contain when
                        // the drive is not present, so return a generic error
                        guest_set_ah(g, 1);
1715                    }
1716                }
1717                // INT 13 Extensions - INSTALLATION CHECK
1718                else if (guest_get_ah(g) == 0x41 && guest_get_bx(g) == 0x55aa) {
1719#ifdef CONFIG_SVM
1720                    amd_vmcb_rflags_cf_wrf(&g->vmcb, 0);
1721#else
1722		    vmx_vmcs_rflags_cf_wrf(g, 0);
1723#endif
1724                    guest_set_bx(g, 0xaa55);
1725                    guest_set_ah(g, 0x01); // Drive extensions 1.x
1726                    guest_set_al(g, 0);
1727                    guest_set_cx(g, 0x5);
1728                }
1729                // IBM/MS INT 13 Extensions - EXTENDED READ
1730                else if (guest_get_ah(g) == 0x42) {
1731                    uint8_t dl = guest_get_dl(g);
1732
1733                    // only respond to installed hard disks
1734                    if ((dl >> 7) && ((dl & 0x7f) < g->hdd_count)) {
1735#ifdef CONFIG_SVM
1736                        amd_vmcb_rflags_cf_wrf(&g->vmcb, 0);
1737#else
1738			vmx_vmcs_rflags_cf_wrf(g, 0);
1739#endif
1740                        guest_set_ah(g, 0);
1741
1742                        struct disk_access_block {
1743                            uint8_t     size;
1744                            uint8_t     reserved;
1745                            uint16_t    count;
                            // pointer to the data buffer formatted as
                            // SEGMENT:OFFSET
1748                            uint32_t    transfer_buffer;
1749                            uint64_t    abs_block_number;
1750                        } __attribute__ ((packed));
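                        // this is the 16-byte disk address packet defined by
                        // the INT 13h extensions (EDD)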
1751
1752                        // memory location of the disk access block
1753#ifdef CONFIG_SVM
1754                        uintptr_t mem = guest_to_host(g->mem_low_va) +
1755                                        amd_vmcb_ds_base_rd(&g->vmcb) +
1756                                        guest_get_si(g);
1757#else
1758			err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_DS_BASE, &guest_ds_base);
1759                        uintptr_t mem = guest_to_host(g->mem_low_va) +
1760                                        guest_ds_base + guest_get_si(g);
1761#endif
1762
1763                        struct disk_access_block *dap = (void *)mem;
1764
1765                        if (dap->size < 0x10) {
1766#ifdef CONFIG_SVM
1767                            amd_vmcb_rflags_cf_wrf(&g->vmcb, 1);
1768#else
1769			    vmx_vmcs_rflags_cf_wrf(g, 1);
1770#endif
1771                            guest_set_ah(g, 1);
1772                        } else {
                            // dap->transfer_buffer holds a real-mode
                            // SEGMENT:OFFSET pointer; resolve it accordingly
1775                            mem = guest_to_host(g->mem_low_va) +
1776                                  ((dap->transfer_buffer >> 16) << 4) +
1777                                  (dap->transfer_buffer & 0xffff);
1778
1779                            size_t count = dap->count;
1780                            r = hdd_read_blocks(g->hdds[dl & 0x7f],
1781                                                dap->abs_block_number,
1782                                                &count, mem);
1783                            dap->count = count;
1784
1785                            if (r != HANDLER_ERR_OK) {
1786#ifdef CONFIG_SVM
1787                                amd_vmcb_rflags_cf_wrf(&g->vmcb, 1);
1788#else
1789				vmx_vmcs_rflags_cf_wrf(g, 1);
1790#endif
1791                                guest_set_ah(g, 1);
1792                            }
1793                        }
1794                    } else {
1795#ifdef CONFIG_SVM
1796                        amd_vmcb_rflags_cf_wrf(&g->vmcb, 1);
1797#else
1798			vmx_vmcs_rflags_cf_wrf(g, 1);
1799#endif
                        // it is not really clear what AH should contain when
                        // the drive is not present, so return a generic error
                        guest_set_ah(g, 1);
1803                    }
1804                }
1805                // IBM/MS INT 13 Extensions - GET DRIVE PARAMETERS
1806                else if (guest_get_ah(g) == 0x48) {
1807                    uint8_t dl = guest_get_dl(g);
1808
1809                    // only respond to installed hard disks
1810                    if ((dl >> 7) && ((dl & 0x7f) < g->hdd_count)) {
1811                        // structure to hold drive info
1812                        struct drive_params {
1813                            uint16_t size;
1814                            uint16_t flags;
1815                            uint32_t cylinders;
1816                            uint32_t heads;
1817                            uint32_t sectors;
1818                            uint64_t total_sectors;
1819                            uint16_t bytes_per_sector;
1820                        } __attribute__ ((packed));
1821
1822                        // memory where the drive info shall be stored
1823#ifdef CONFIG_SVM
1824                        uintptr_t mem = guest_to_host(g->mem_low_va) +
1825                                        amd_vmcb_ds_base_rd(&g->vmcb) +
1826                                        guest_get_si(g);
1827#else
1828			err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_DS_BASE, &guest_ds_base);
1829                        uintptr_t mem = guest_to_host(g->mem_low_va) +
1830                                        guest_ds_base + guest_get_si(g);
1831#endif
1832
1833                        struct drive_params *drp = (void *)mem;
1834
1835                        // sanity check
1836                        if (drp->size < sizeof(struct drive_params)) {
1837#ifdef CONFIG_SVM
1838                            amd_vmcb_rflags_cf_wrf(&g->vmcb, 1);
1839#else
1840			    vmx_vmcs_rflags_cf_wrf(g, 1);
1841#endif
1842                        } else {
1843#ifdef CONFIG_SVM
1844                            amd_vmcb_rflags_cf_wrf(&g->vmcb, 0);
1845#else
1846			    vmx_vmcs_rflags_cf_wrf(g, 0);
1847#endif
1848                            guest_set_ah(g, 0);
1849
1850                            drp->size = sizeof(struct drive_params);
1851                            // CHS invalid, no removable drive, etc
1852                            drp->flags = 0;
1853                            drp->cylinders = 0;
1854                            drp->heads = 0;
1855                            drp->sectors = 0;
1856                            drp->total_sectors = hdd_get_blocks_count(
1857                                                    g->hdds[dl & 0x7f]);
1858                            drp->bytes_per_sector = 512; // FIXME: Hardcoded
1859                        }
1860                    } else {
1861#ifdef CONFIG_SVM
1862                        amd_vmcb_rflags_cf_wrf(&g->vmcb, 1);
1863#else
1864			vmx_vmcs_rflags_cf_wrf(g, 1);
1865#endif
                        // it is not really clear what AH should contain when
                        // the drive is not present, so return a generic error
                        guest_set_ah(g, 0x1);
1869                    }
1870                } else {
                    printf("Unhandled method on INT 0x13\n");
1872                    return handle_vmexit_unhandeled(g);
1873                }
1874                break;
1875            case 0x15:
1876                // ENABLE A20 GATE
1877                if (guest_get_ax(g) == 0x2401) {
1878                    g->a20_gate_enabled = true;
1879#ifdef CONFIG_SVM
1880                    amd_vmcb_rflags_cf_wrf(&g->vmcb, 0);
1881#else
1882		    vmx_vmcs_rflags_cf_wrf(g, 0);
1883#endif
1884                    guest_set_ah(g, 0);
1885                }
1886                // APM INSTALLATION CHECK
1887                else if (guest_get_ax(g) == 0x5300) {
1888                    // we do not support APM - set carry flag to indicate error
1889#ifdef CONFIG_SVM
1890                    amd_vmcb_rflags_cf_wrf(&g->vmcb, 1);
1891#else
1892		    vmx_vmcs_rflags_cf_wrf(g, 1);
1893#endif
1894                }
1895                // APM DISCONNECT
1896                else if (guest_get_ax(g) == 0x5304) {
1897                    // we do not support APM - set carry flag to indicate error
1898#ifdef CONFIG_SVM
1899                    amd_vmcb_rflags_cf_wrf(&g->vmcb, 1);
1900#else
1901		    vmx_vmcs_rflags_cf_wrf(g, 1);
1902#endif
1903                }
1904                // GET MEMORY SIZE FOR >64M CONFIGURATIONS
1905                else if (guest_get_ax(g) == 0xe801) {
1906                    // we do not support this BIOS call
1907                    // both grub and linux may also use the 0xe820 call
1908#ifdef CONFIG_SVM
1909                    amd_vmcb_rflags_cf_wrf(&g->vmcb, 1);
1910#else
1911		    vmx_vmcs_rflags_cf_wrf(g, 1);
1912#endif
1913                }
1914                // GET SYSTEM MEMORY MAP
1915                // EDX has to contain 0x534d4150 (== 'SMAP')
1916                else if (guest_get_ax(g) == 0xe820 &&
1917                         guest_get_edx(g) == 0x534d4150) {
                    // for now we return only two entries: base and extended memory
1919                    if (guest_get_ebx(g) > 1 || guest_get_ecx(g) < 20) {
1920                        // wrong input params -> report error
1921#ifdef CONFIG_SVM
1922                        amd_vmcb_rflags_cf_wrf(&g->vmcb, 1);
1923#else
1924			vmx_vmcs_rflags_cf_wrf(g, 1);
1925#endif
1926                    } else {
1927                        // taken from http://www.ctyme.com/intr/rb-1741.htm
1928#ifdef CONFIG_SVM
1929                        uintptr_t addr = guest_to_host(g->mem_low_va) +
1930                                         amd_vmcb_es_base_rd(&g->vmcb) +
1931                                         guest_get_di(g);
1932#else
1933			err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_ES_BASE, &es_guest_base);
1934                        uintptr_t addr = guest_to_host(g->mem_low_va) +
1935                                         es_guest_base + guest_get_di(g);
1936#endif
1937                        // set EAX to 'SMAP'
1938                        guest_set_eax(g, 0x534D4150);
1939                        // returned bytes (always 20)
1940                        guest_set_ecx(g, 20);
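                        // each E820 entry is 20 bytes: a 64-bit base address,
                        // a 64-bit length and a 32-bit type (1 = usable RAM)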
1941
1942                        switch (guest_get_ebx(g)) {
1943                        case 0x0:
1944                            // base memory
1945                            assert(g->mem_low_va == 0);
1946                            // base address
1947                            *(uint64_t *)addr = 0;
1948                            // size of the memory block
1949                            *(uint64_t *)(addr + 8) = 0xa0000; // 640 KiB
1950                            // mem type, 1 == "memory, available to the OS"
1951                            *(uint32_t *)(addr + 16) = 1;
1952                            // indicate that there is more data
1953                            guest_set_ebx(g, 1);
1954                            break;
1955                        case 0x1:
1956                            // extended memory
1957                            assert(g->mem_high_va > 0x100000);
1958                            // base address
1959                            *(uint64_t *)addr = 0x100000;   // 1 MiB
1960                            // size of the memory block
1961                            *(uint64_t *)(addr + 8) = g->mem_high_va - 0x100000;
1962                            // mem type, 1 == "memory, available to the OS"
1963                            *(uint32_t *)(addr + 16) = 1;
1964                            // indicate that there is no more data
1965                            guest_set_ebx(g, 0);
1966                            break;
1967                        default:
1968                            assert(!"not reached");
1969                            break;
1970                        }
1971
1972                        // mark success
1973#ifdef CONFIG_SVM
1974                        amd_vmcb_rflags_cf_wrf(&g->vmcb, 0);
1975#else
1976			vmx_vmcs_rflags_cf_wrf(g, 0);
1977#endif
1978                    }
1979                }
1980                // SYSTEM - Get Intel SpeedStep (IST) information
1981                else if (guest_get_ax(g) == 0xe980) {
                    // not supported yet
1983#ifdef CONFIG_SVM
1984                    amd_vmcb_rflags_cf_wrf(&g->vmcb, 1);
1985#else
1986		    vmx_vmcs_rflags_cf_wrf(g, 1);
1987#endif
1988                }
1989                // SYSTEM - GET CONFIGURATION (XT >1986/1/10,AT mdl 3x9,
1990                // CONV,XT286,PS)
1991                // GRUB BUG: it puts 0xc0 into AX instead of AH
1992                else if (guest_get_ax(g) == 0xc0) {
1993                    // we do not support this
1994#ifdef CONFIG_SVM
1995                    amd_vmcb_rflags_cf_wrf(&g->vmcb, 1);
1996#else
1997		    vmx_vmcs_rflags_cf_wrf(g, 1);
1998#endif
1999                    guest_set_ah(g, 0x80);
2000                }
2001                // GET EXTENDED MEMORY SIZE
2002                else if (guest_get_ah(g) == 0x88) {
2003                    // calculate number of 1KB chunks starting from 1MB but not
2004                    // beyond 16MB
2005                    assert(((g->mem_high_va - g->mem_low_va) & 0x3ff) == 0);
2006                    guest_set_ax(g, MIN(0x3c00 /* 16MB */,
2007                                 (g->mem_high_va - g->mem_low_va) / 1024));
                    // indicate that no error occurred
2009#ifdef CONFIG_SVM
2010                    amd_vmcb_rflags_cf_wrf(&g->vmcb, 0);
2011#else
2012		    vmx_vmcs_rflags_cf_wrf(g, 0);
2013#endif
2014                }
2015                // SYSTEM - GET CONFIGURATION (XT >1986/1/10,AT mdl 3x9,
2016                // CONV,XT286,PS)
2017                else if (guest_get_ah(g) == 0xc0) {
2018                    // we do not support this
2019#ifdef CONFIG_SVM
2020                    amd_vmcb_rflags_cf_wrf(&g->vmcb, 1);
2021#else
2022		    vmx_vmcs_rflags_cf_wrf(g, 1);
2023#endif
2024                    guest_set_ah(g, 0x80);
2025                // SYSTEM - SET BIOS MODE
2026                } else if (guest_get_ah(g) == 0xec) {
                    // the exact purpose of this BIOS call is unclear; Linux
                    // expects no action whatsoever
2029                } else {
                    printf("Unhandled method on INT 0x15\n");
2031                    return handle_vmexit_unhandeled(g);
2032                }
2033                break;
2034            case 0x16:
2035                // KEYBOARD - SET TYPEMATIC RATE AND DELAY
2036                if (guest_get_ah(g) == 0x3) {
2037                    // ignore this
2038                } else if (guest_get_ah(g) == 0x2) {
2039                    // Return keyboard flags
2040                    guest_set_al(g, 0x0);
2041                } else {
                    printf("Unhandled method on INT 0x16\n");
2043                    return handle_vmexit_unhandeled(g);
2044                }
2045                break;
2046            case 0x1a:
2047                // TIME - GET REAL-TIME CLOCK TIME (AT,XT286,PS)
2048                if (guest_get_ah(g) == 0x2) {
2049                    uint8_t h, m, s;
2050                    lpc_rtc_get_time_bcd(g->lpc, &h, &m, &s);
2051                    guest_set_ch(g, h);
2052                    guest_set_cl(g, m);
2053                    guest_set_dh(g, s);
2054                    guest_set_dl(g, 0);
2055                    // mark success
2056#ifdef CONFIG_SVM
2057                    amd_vmcb_rflags_cf_wrf(&g->vmcb, 0);
2058#else
2059		    vmx_vmcs_rflags_cf_wrf(g, 0);
2060#endif
2061                } else {
                    printf("Unhandled method on INT 0x1a\n");
2063                    return handle_vmexit_unhandeled(g);
2064                }
2065                break;
2066            default:
                printf("handle_vmexit_swint: Unhandled real-mode interrupt "
                       "0x%x (%d).\n", int_num, int_num);
2069                return handle_vmexit_unhandeled(g);
2070        }
2071    } else {
2072        printf("vmkitmon: encountered INT instruction outside real mode\n");
2073        return handle_vmexit_unhandeled(g);
2074    }
2075
2076    // advance the rip beyond the instruction
2077#ifdef CONFIG_SVM
2078    amd_vmcb_rip_wr(&g->vmcb, amd_vmcb_rip_rd(&g->vmcb) + 2);
2079#else
2080    err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_RIP, &guest_rip);
2081    err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_GUEST_RIP, guest_rip + 2);
2082    assert(err_is_ok(err));
2083#endif
2084    return HANDLER_ERR_OK;
2085}
2086
2087static inline enum opsize
2088io_access_size_to_opsize (enum x86_io_access io)
2089{
2090    if (io & X86_IO_ACCESS_SZ8) {
2091        return OPSIZE_8;
2092    } else if (io & X86_IO_ACCESS_SZ16) {
2093        return OPSIZE_16;
2094    } else if (io & X86_IO_ACCESS_SZ32) {
2095        return OPSIZE_32;
2096    } else {
2097        assert(!"NYI");
2098        return 0;
2099    }
2100}
2101
2102static int
2103handle_vmexit_ioio (struct guest *g)
2104{
2105    int r;
2106#ifdef CONFIG_SVM
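    // SVM IOIO EXITINFO1 layout: bit 0 = direction (1 = IN, 0 = OUT),
    // bits 4-6 = operand size flags (SZ8/SZ16/SZ32), bits 31:16 = port number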
2107    uint64_t info1 = amd_vmcb_exitinfo1_rd(&g->vmcb);
2108    enum x86_io_access io;
2109    uint16_t port = info1 >> 16;
2110#else
2111    errval_t err = 0;
2112    if (!g->emulated_before_exit) {
2113        err += invoke_dispatcher_vmread(g->dcb_cap, VMX_EXIT_QUAL, &saved_exit_qual);
2114	uint64_t instr_len, guest_rip;
2115	err += invoke_dispatcher_vmread(g->dcb_cap, VMX_EXIT_INSTR_LEN, &instr_len);
2116	err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_RIP, &guest_rip);
2117	saved_rip = guest_rip + instr_len;
2118    }
2119    uint16_t port = (saved_exit_qual >> 16) & 0xffff;
2120#endif
2121    bool write;
2122    enum opsize size;
2123    uint32_t val;
2124    bool newapi = false; // needed as a transition
2125
2126#ifdef CONFIG_SVM
2127    // copy the access flags
2128    // FIXME: this severely exploits the way the x86_io_access flags are set up
2129    io = (info1 >> 1);
2130    io |= info1 & SVM_IOIO_TYPE_MASK;
2131
2132    // gather some params for the io access
2133    write = (io & X86_IO_ACCESS_TYPE) == 0;
2134    size = OPSIZE_8; // make gcc happy
2135    if (io & X86_IO_ACCESS_SZ8) {
2136        size = OPSIZE_8;
2137    } else if (io & X86_IO_ACCESS_SZ16) {
2138        size = OPSIZE_16;
2139    } else if (io & X86_IO_ACCESS_SZ32) {
2140        size = OPSIZE_32;
2141    }
2142#else
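    // VMX I/O exit qualification: bits 2:0 = access size - 1 (0 = 1 byte,
    // 1 = 2 bytes, 3 = 4 bytes), bit 3 = direction (0 = OUT, 1 = IN),
    // bits 31:16 = port number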
2143    write = ((saved_exit_qual >> 3) & 0x1) == 0;
2144    size = OPSIZE_8;
2145    if ((saved_exit_qual & 0x7) == 0) {
2146        size = OPSIZE_8;
2147    } else if ((saved_exit_qual & 0x7) == 1) {
2148        size = OPSIZE_16;
2149    } else if ((saved_exit_qual & 0x7) == 3) {
2150        size = OPSIZE_32;
2151    } else {
2152        assert(!"Invalid size of access value");
2153    }
2154#endif
    // fetch the source value if necessary
2156    if (write) {
2157        switch (size) {
2158        case OPSIZE_8:
2159            val = guest_get_al(g);
2160            break;
2161        case OPSIZE_16:
2162            val = guest_get_ax(g);
2163            break;
2164        case OPSIZE_32:
2165            val = guest_get_eax(g);
2166            break;
2167        default:
2168            assert(!"not reached");
2169            break;
2170        }
2171    }
2172
2173    // assign the request to the corresponding subsystem
2174    switch (port) {
2175        // LPC devices
2176        case 0x20:  // primary PIC
2177        case 0x21:  // primary PIC
2178        case 0x40:  // Timer
2179        case 0x41:  // Timer
2180        case 0x42:  // Timer
2181        case 0x43:  // Timer
2182        case 0x61:  // NMI Controller
2183        case 0x70:  // RTC
2184        case 0x71:  // RTC
2185        case 0x72:  // RTC
2186        case 0x73:  // RTC
2187        case 0x74:  // RTC
2188        case 0x75:  // RTC
2189        case 0x76:  // RTC
2190        case 0x77:  // RTC
2191        case 0xa0:  // secondary PIC
2192        case 0xa1:  // secondary PIC
2193            if (write) {
2194                r = lpc_handle_pio_write(g->lpc, port, size, val);
2195                guest_assert(g, r == 0);
2196            } else {
2197                r = lpc_handle_pio_read(g->lpc, port, size, &val);
2198                assert(r == 0);
2199            }
2200            newapi = true;
2201            break;
2202        // Keyboard
2203        case 0x60:
2204        case 0x64:
2205            // we currently do not support a keyboard
2206            if (!write) {
2207                val = ~0;
2208            }
2209            newapi = true;
2210            break;
2211        case 0x80:
            // some apps use writes to this port as a method to delay execution,
            // so we just do nothing
2214            break;
2215        // Coprocessor
2216        case 0xf0:
2217        case 0xf1:
2218            // coprocessor IGNNE# - do nothing for now
2219            break;
2220
2221        // serial COM1 port
2222        // FIXME: this should not be hardcoded !
2223        case 0x3f8:
2224        case 0x3f9:
2225        case 0x3fa:
2226        case 0x3fb:
2227        case 0x3fc:
2228        case 0x3fd:
2229        case 0x3fe:
2230        case 0x3ff:
2231        // COM2
2232        case 0x2f8:
2233        case 0x2f9:
2234        case 0x2fa:
2235        case 0x2fb:
2236        case 0x2fc:
2237        case 0x2fd:
2238        case 0x2fe:
2239        case 0x2ff:
2240        // COM3
2241        case 0x3e8:
2242        case 0x3e9:
2243        case 0x3ea:
2244        case 0x3eb:
2245        case 0x3ec:
2246        case 0x3ed:
2247        case 0x3ee:
2248        case 0x3ef:
2249        // COM4
2250        case 0x2e8:
2251        case 0x2e9:
2252        case 0x2ea:
2253        case 0x2eb:
2254        case 0x2ec:
2255        case 0x2ed:
2256        case 0x2ee:
2257        case 0x2ef: {
2258            int com;
2259
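            // map the I/O port to a COM index: 0x3f8-0x3ff -> 0 (COM1),
            // 0x2f8-0x2ff -> 1 (COM2), 0x3e8-0x3ef -> 2 (COM3),
            // 0x2e8-0x2ef -> 3 (COM4)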
2260            com = (port & 0xf0) == 0xf0 ? !(port & 0x100) : !(port & 0x100) + 2;
2261            assert(com >= 0 && com < 4);
2262            if (write) {
2263                r = pc16550d_handle_pio_write(g->serial_ports[com], port,
2264                                              size, val);
2265                assert(r == 0);
2266            } else {
2267                r = pc16550d_handle_pio_read(g->serial_ports[com], port,
2268                                             size, &val);
2269                assert(r == 0);
2270            }
2271            newapi = true;
2272            break;
2273        }
2274
2275            // PCI config space (address)
2276    case 0xcf8:
2277    case 0xcf9:
2278    case 0xcfa:
2279    case 0xcfb:
2280            // PCI config space (data)
2281    case 0xcfc:
2282    case 0xcfd:
2283    case 0xcfe:
2284    case 0xcff:
2285        if(write) {
2286            r = pci_handle_pio_write(g->pci, port, size, val);
2287        } else {
2288            r = pci_handle_pio_read(g->pci, port, size, &val);
2289        }
2290        assert(r == 0);
2291        newapi = true;
2292        break;
2293
2294        default:
            // the default is to return all ones and to ignore writes
2296            if (!write) {
2297                val = 0xffffffff;
2298            }
2299            newapi = true;
2300    };
2301
    // set the destination register when necessary
2303    if (newapi && !write) {
2304        switch (size) {
2305        case OPSIZE_8:
2306            guest_set_al(g, val);
2307            break;
2308        case OPSIZE_16:
2309            guest_set_ax(g, val);
2310            break;
2311        case OPSIZE_32:
2312            guest_set_eax(g, val);
2313            break;
2314        default:
2315            assert(!"not reached");
2316            break;
2317        }
2318    }
2319
    // the IP of the following instruction is stored in the exitinfo2 field
    // (SVM) or was saved in saved_rip when the exit was taken (VMX)
2321#ifdef CONFIG_SVM
2322    amd_vmcb_rip_wr(&g->vmcb, amd_vmcb_exitinfo2_rd(&g->vmcb));
2323#else
2324    err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_GUEST_RIP, saved_rip);
2325    assert(err_is_ok(err));
2326#endif
2327    return HANDLER_ERR_OK;
2328}
2329
2330static int
2331handle_vmexit_msr (struct guest *g) {
2332#ifdef CONFIG_SVM
2333    bool write = amd_vmcb_exitinfo1_rd(&g->vmcb) == 1;
2334#else
2335    int msr_index;
2336    errval_t err = 0;
2337    bool write = (saved_exit_reason == VMX_EXIT_REASON_WRMSR);
2338    struct msr_entry *guest_msr_area = (struct msr_entry *)g->msr_area_va;
2339#endif
2340    uint32_t msr = guest_get_ecx(g);
2341    uint64_t val;
2342
2343    // there may be writes or reads to MSRs
2344    if (write) {
2345        // fetch the value to write from EDX:EAX
2346        val = ((uint64_t)guest_get_edx(g) << 32) | guest_get_eax(g);
2347
        // store the value to be written into the corresponding location
2349        switch (msr) {
2350        case X86_MSR_SYSENTER_CS:
2351#ifdef CONFIG_SVM
2352            amd_vmcb_sysenter_cs_wr(&g->vmcb, val);
2353#else
2354	    err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_GUEST_SYSENTER_CS, val);
2355#endif
2356            break;
2357        case X86_MSR_SYSENTER_ESP:
2358#ifdef CONFIG_SVM
2359            amd_vmcb_sysenter_esp_wr(&g->vmcb, val);
2360#else
2361	    err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_GUEST_SYSENTER_ESP, val);
2362#endif
2363            break;
2364        case X86_MSR_SYSENTER_EIP:
2365#ifdef CONFIG_SVM
2366            amd_vmcb_sysenter_eip_wr(&g->vmcb, val);
2367#else
2368	    err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_GUEST_SYSENTER_EIP, val);
2369#endif
2370            break;
2371        case X86_MSR_EFER:
2372#ifdef CONFIG_SVM
2373            amd_vmcb_efer_wr_raw(&g->vmcb, val);
2374#else
2375	    err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_GUEST_EFER_F, val);
2376#endif
2377            break;
2378        case X86_MSR_FS_BASE:
2379#ifdef CONFIG_SVM
2380            amd_vmcb_fs_base_wr(&g->vmcb, val);
2381#else
2382	    err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_GUEST_FS_BASE, val);
2383#endif
2384            break;
2385        case X86_MSR_GS_BASE:
2386#ifdef CONFIG_SVM
2387            amd_vmcb_gs_base_wr(&g->vmcb, val);
2388#else
2389	    err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_GUEST_GS_BASE, val);
2390#endif
2391            break;
2392#ifdef CONFIG_SVM
2393        case X86_MSR_KERNEL_GS_BASE:
2394            amd_vmcb_kernel_gs_base_wr(&g->vmcb, val);
2395            break;
2396        case X86_MSR_STAR:
2397            amd_vmcb_star_wr(&g->vmcb, val);
2398            break;
2399        case X86_MSR_LSTAR:
2400            amd_vmcb_lstar_wr(&g->vmcb, val);
2401            break;
2402        case X86_MSR_CSTAR:
2403            amd_vmcb_cstar_wr(&g->vmcb, val);
2404            break;
2405        case X86_MSR_SFMASK:
2406            amd_vmcb_sfmask_wr(&g->vmcb, val);
2407            break;
2408        default:
            printf("MSR: unhandled MSR write access to %x\n", msr);
2410            return handle_vmexit_unhandeled(g);
2411#else
2412        case X86_MSR_BIOS_SIGN_ID:
2413            break;
2414	default:
2415	    msr_index = vmx_guest_msr_index(msr);
2416	    if (msr_index == -1) {
	        printf("MSR: unhandled MSR write access to %x\n", msr);
2418		return handle_vmexit_unhandeled(g);
2419	    }
2420	    guest_msr_area[msr_index].val = val;
2421	    break;
2422#endif
2423        }
2424    } else {
2425        // read the value from the corresponding location
2426        switch (msr) {
2427        case X86_MSR_SYSENTER_CS:
2428#ifdef CONFIG_SVM
2429            val = amd_vmcb_sysenter_cs_rd(&g->vmcb);
2430#else
2431	    err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_SYSENTER_CS, &val);
2432#endif
2433            break;
2434        case X86_MSR_SYSENTER_ESP:
2435#ifdef CONFIG_SVM
2436            val = amd_vmcb_sysenter_esp_rd(&g->vmcb);
2437#else
2438	    err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_SYSENTER_ESP, &val);
2439#endif
2440            break;
2441        case X86_MSR_SYSENTER_EIP:
2442#ifdef CONFIG_SVM
2443            val = amd_vmcb_sysenter_eip_rd(&g->vmcb);
2444#else
2445	    err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_SYSENTER_EIP, &val);
2446#endif
2447            break;
2448        case X86_MSR_EFER:
2449#ifdef CONFIG_SVM
2450            val = amd_vmcb_efer_rd_raw(&g->vmcb);
2451#else
2452	    err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_EFER_F, &val);
2453#endif
2454            break;
2455        case X86_MSR_FS_BASE:
2456#ifdef CONFIG_SVM
2457            val = amd_vmcb_fs_base_rd(&g->vmcb);
2458#else
2459	    err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_FS_BASE, &val);
2460#endif
2461            break;
2462        case X86_MSR_GS_BASE:
2463#ifdef CONFIG_SVM
2464            val = amd_vmcb_gs_base_rd(&g->vmcb);
2465#else
	    err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_GS_BASE, &val);
2467#endif
2468            break;
2469#ifdef CONFIG_SVM
2470        case X86_MSR_KERNEL_GS_BASE:
2471            val = amd_vmcb_kernel_gs_base_rd(&g->vmcb);
2472            break;
2473        case X86_MSR_STAR:
2474            val = amd_vmcb_star_rd(&g->vmcb);
2475            break;
2476        case X86_MSR_LSTAR:
2477            val = amd_vmcb_lstar_rd(&g->vmcb);
2478            break;
2479        case X86_MSR_CSTAR:
2480            val = amd_vmcb_cstar_rd(&g->vmcb);
2481            break;
2482        case X86_MSR_SFMASK:
2483            val = amd_vmcb_sfmask_rd(&g->vmcb);
2484            break;
2485        default:
            printf("MSR: unhandled MSR read access to %x\n", msr);
2487            return handle_vmexit_unhandeled(g);
2488#else
2489        case X86_MSR_APIC_BASE:
2490        case X86_MSR_BIOS_SIGN_ID:
2491        case X86_MSR_MTRRCAP:
2492        case X86_MSR_MCG_CAP:
2493        case X86_MSR_MCG_STATUS:
2494        case X86_MSR_PAT:
2495        case X86_MTRR_DEF_TYPE:
2496            val = 0x0;
2497            break;
2498        case X86_MSR_MISC_ENABLE:
2499            val = 0x1; // enable fast-string instructions
2500            break;
2501	default:
2502	    msr_index = vmx_guest_msr_index(msr);
2503	    if (msr_index == -1) {
	      printf("MSR: unhandled MSR read access to %x\n", msr);
2505	      return handle_vmexit_unhandeled(g);
2506	    }
2507	    val = guest_msr_area[msr_index].val;
2508	    break;
2509#endif
2510        }
2511
2512        // store the value in EDX:EAX
2513        guest_set_eax(g, val);
2514        guest_set_edx(g, val >> 32);
2515    }
2516
2517    // advance the rip beyond the current instruction
2518#ifdef CONFIG_SVM
2519    amd_vmcb_rip_wr(&g->vmcb, amd_vmcb_rip_rd(&g->vmcb) + 2);
2520#else
2521    uint64_t guest_rip;
2522    err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_RIP, &guest_rip);
2523    err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_GUEST_RIP, guest_rip + 2);
2524    assert(err_is_ok(err));
2525#endif
2526    return HANDLER_ERR_OK;
2527}
2528
2529static int
2530handle_vmexit_cpuid (struct guest *g) {
2531    uint32_t eax, ebx, ecx, edx;
2532    uint32_t func = guest_get_eax(g);
2533
    /* the register values are copied from an emulated Pentium processor in QEMU */
2535    switch (func) {
2536#ifdef CONFIG_SVM
2537    // Processor Vendor and Largest Standard Function Number
2538    case 0:
2539    case 0x80000000:
2540        // max standard function offset
2541        eax = func == 0 ? 0x1 : 0x80000000;
2542        // string "AuthenticAMD"
2543        ebx = 0x68747541;
2544        ecx = 0x444d4163;
2545        edx = 0x69746e65;
2546    break;
2547
2548    // Family, Model, Stepping Identifiers
2549    case 1:
        // we simulate an AMD K6-3D
        // Family 5, Model 8, Stepping 12
        eax = 0x58c;
        // no brand, clflush size 16, no multiprocessing, no local apic
        ebx = 0x0f00;
        // support the popcnt instruction
2556        ecx = 0x800000;
2557        // support some basic features
2558        edx = 0x89a91b;
2559    break;
2560
2561    default:
2562        // use the answer of the host if there is any other request
2563        // FIXME: this is probably not a good idea ;)
2564        cpuid(func, &eax, &ebx, &ecx, &edx);
2565        printf("handle_vmexit_cpuid: CPUID: func %x, host reports: eax %x, "
2566                "ebx %x, ecx %x, edx %x\n", func, eax, ebx, ecx, edx);
2567        break;
2568#else
2569    case 0:
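        // max standard leaf 2; vendor string "GenuineIntel"
        // (EBX = "Genu", EDX = "ineI", ECX = "ntel")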
2570        eax = 0x2;
2571        ebx = 0x756e6547;
2572        ecx = 0x6c65746e;
2573        edx = 0x49656e69;
2574        break;
2575    case 1:
2576        eax = 0x800;
2577        ebx = 0x800;
2578        ecx = 0x80200000;
2579        edx = 0x183fbff;
2580        break;
    case 2:
        eax = 0x1;
        ebx = 0x0;
        ecx = 0x4d;
        edx = 0x2c307d;
        break;
    default:
2587        eax = 0x0;
2588        ebx = 0x0;
2589        ecx = 0x0;
2590        edx = 0x0;
2591        break;
2592#endif
2593    }
2594
2595    guest_set_eax(g, eax);
2596    guest_set_ebx(g, ebx);
2597    guest_set_ecx(g, ecx);
2598    guest_set_edx(g, edx);
2599
2600    // advance the rip beyond the instruction
2601#ifdef CONFIG_SVM
2602    amd_vmcb_rip_wr(&g->vmcb, amd_vmcb_rip_rd(&g->vmcb) + 2);
2603#else
2604    uint64_t guest_rip;
2605    errval_t err = invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_RIP, &guest_rip);
2606    err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_GUEST_RIP, guest_rip + 2);
2607    assert(err_is_ok(err));
2608#endif
2609    return HANDLER_ERR_OK;
2610}
2611
2612static int
2613handle_vmexit_vmmcall (struct guest *g) {
2614    /*printf("VMMCALL: tsc %lu, exits with mon invocation %lu, exits w/o mon "
2615           "invocation %lu\n", rdtsc(),
2616           g->ctrl->num_vm_exits_with_monitor_invocation,
2617           g->ctrl->num_vm_exits_without_monitor_invocation);*/
2618
2619    // advance the rip beyond the instruction
2620#ifdef CONFIG_SVM
2621    amd_vmcb_rip_wr(&g->vmcb, amd_vmcb_rip_rd(&g->vmcb) + 3);
2622#else
2623    uint64_t guest_rip;
2624    errval_t err = invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_RIP, &guest_rip);
2625    err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_GUEST_RIP, guest_rip + 3);
2626    assert(err_is_ok(err));
2627#endif
2628    return HANDLER_ERR_OK;
2629}
2630
2631static int
2632handle_vmexit_hlt (struct guest *g) {
    // the guest has nothing to do - poll our irq sources for pending IRQs
    // if they do not assert a virtual IRQ then we will do nothing
2635    lpc_pic_process_irqs(g->lpc);
2636
2637    // advance the rip beyond the instruction
2638#ifdef CONFIG_SVM
2639    amd_vmcb_rip_wr(&g->vmcb, amd_vmcb_rip_rd(&g->vmcb) + 1);
2640#else
2641    uint64_t guest_rip;
2642    errval_t err = invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_RIP, &guest_rip);
2643    err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_GUEST_RIP, guest_rip + 1);
2644#endif
2645
2646    // running HLT with IRQs masked does not make any sense
    // FIXME: this assert is silly; shutting down the VM would be the right way
2648#ifdef CONFIG_SVM
2649    guest_assert(g, amd_vmcb_rflags_rd(&g->vmcb).intrf == 1);
2650#else
2651    uint64_t guest_rflags;
2652    err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_RFLAGS, &guest_rflags);
2653    assert(err_is_ok(err));
2654    guest_assert(g, guest_rflags & RFLAGS_IF);
2655#endif
2656    if (virq_pending(g, NULL, NULL)) {
2657        // there is an IRQ pending, proceed as normal, the CPU will take it
2658    } else {
2659        // there is really nothing to do - stop the VM and wait
2660        g->runnable = false;
2661    }
2662
2663    return HANDLER_ERR_OK;
2664}
2665
2666static inline int
2667decode_mov_instr_length (struct guest *g, uint8_t *code)
2668{
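    // general layout of the MOV instructions decoded here:
    //   [REX prefix] opcode ModRM [SIB] [disp8/disp32]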
2669    int len;
2670
2671    // we only support long mode for now
2672    //assert(amd_vmcb_efer_rd(&g->vmcb).lma == 1);
2673
    // all non-special MOV instructions use a one-byte opcode and at least a
    // ModR/M byte
2676    len = 2;
2677    // check for the REX prefix
2678    if ((code[0] >> 4) == 0x4) {
2679        len++;
2680        code++;
2681    }
    // precaution because not all variants of MOV were checked; at least these
    // two variants are supported
2684    assert(code[0] == 0x89 || code[0] == 0x8b);
2685
2686    union x86_modrm modrm = { .raw = code[1] };
2687    // check for displacements
2688    if (modrm.u.mod == 0x1) {
2689        // 1B displacement
2690        len++;
2691    } else if (modrm.u.mod == 0x2) {
2692        // 4B displacement
2693        len += 4;
2694    }
2695
2696    // check for SIB byte
2697    if (modrm.u.rm == 0x4 && modrm.u.mod != 0x3) {
2698        len++;
2699    }
2700
2701    return len;
2702}
2703
2704// finds out whether a move instruction is a read or a write with respect to
2705// memory
2706static inline bool
2707decode_mov_is_write (struct guest *g, uint8_t *code)
2708{
2709    // check for the REX prefix
2710    if ((code[0] >> 4) == 0x4) {
2711        code++;
2712    }
2713
2714    // we only support one move variant (in each direction) for now
2715    assert(code[0] == 0x89 || code[0] == 0x8b);
2716
2717    union x86_modrm modrm = { .raw = code[1] };
2718    // not defined for reg to reg moves
2719    assert(modrm.u.mod != 3);
2720
2721    return code[0] == 0x89; // 0x89 ==> MOV reg -> mem
2722}
2723
2724static inline enum opsize
2725decode_mov_op_size (struct guest *g, uint8_t *code)
2726{
2727    /*
2728	printf("EFER: 0x%lx\n", amd_vmcb_efer_rd_raw(&g->vmcb));
2729	printf("Code: 0x%lx\n", *((uint64_t *)code));
2730	printf("Code[0]: 0x%x, Code[1]: 0x%x, Code[2]: 0x%x, Code[3]: 0x%x\n", code[0],code[1],code[2],code[3]);
2731	printf("Guest EAX: 0x%x\n", guest_get_eax(g));
2732	printf("Guest EBX: 0x%x\n", guest_get_ebx(g));
2733	printf("Guest ECX: 0x%x\n", guest_get_ecx(g));
2734
2735	printf("Guest EDX: 0x%x\n", guest_get_edx(g));
2736	printf("Guest RDI: 0x%lx\n", guest_get_rdi(g));
2737	printf("Guest RSI: 0x%lx\n", guest_get_rsi(g));
2738	printf("Guest RSP: 0x%lx\n", guest_get_rsp(g));
2739	printf("Guest RBP: 0x%lx\n", guest_get_rbp(g));
2740    */
2741
2742    // we only support long mode for now
2743    //assert(amd_vmcb_efer_rd(&g->vmcb).lma == 1);
2744
    // check for the REX prefix; note that once the high nibble equals 0x4,
    // the 0x48 mask below is always non-zero, so any REX prefix (not only
    // REX.W) is treated as a 64-bit operand here
    if ((code[0] >> 4) == 0x4 && code[0] & 0x48) {
2747        return OPSIZE_64;
2748    }
2749    return OPSIZE_32;
2750}
2751
2752
2753static inline uint64_t
2754decode_mov_src_val (struct guest *g, uint8_t *code) {
2755
2756    // we only support long mode for now
2757    //assert(amd_vmcb_efer_rd(&g->vmcb).lma == 1);
2758
2759    // check for the REX prefix
2760    if ((code[0] >> 4) == 0x4) {
2761        code++;
2762    }
2763
2764    // we only support one variant for now
2765    assert(code[0] == 0x89);
2766
2767    union x86_modrm modrm = { .raw = code[1] };
2768    return get_reg_val_by_reg_num(g, modrm.u.regop);
2769}
2770
2771
2772static inline void
2773decode_mov_dest_val (struct guest *g, uint8_t *code, uint64_t val)
2774{
2775    // we only support long mode for now
2776    //assert(amd_vmcb_efer_rd(&g->vmcb).lma == 1);
2777
2778    // check for the REX prefix
2779    if ((code[0] >> 4) == 0x4) {
2780        code++;
2781    }
2782
2783    // we only support one variant for now
2784    assert(code[0] == 0x8b);
2785
2786    union x86_modrm modrm = { .raw = code[1] };
2787    set_reg_val_by_reg_num(g, modrm.u.regop, val);
2788}
2789
2790/**** e1000
2791#define TDBAL_OFFSET 0x3800
2792#define TDBAH_OFFSET 0x3804
2793#define RDBAL_OFFSET 0x2800
2794#define RDBAH_OFFSET 0x2804
2795#define TDT_OFFSET 0x3818 //Transmit descriptor tail. Writes to this toggle transmission
2796#define TCTL_OFFSET 0x400 //Transmission Control
2797
2798#define IMS_OFFSET 0xd0 // Interrupt Mask Set/Read Register
2799#define ICS_OFFSET 0xc8 // Interrupt Cause Set Register
2800
2801static int register_needs_translation(uint64_t addr){
2802	return (
2803		addr == TDBAL_OFFSET ||
2804		addr == TDBAH_OFFSET ||
2805		addr == RDBAL_OFFSET ||
2806		addr == RDBAH_OFFSET
2807	);
2808
2809}
2810
2811**** e1000 */
2812
2813
2814
2815
#define MMIO_MASK(bytes) (~(~(bytes) + 1)) // equals (bytes - 1): offset mask for a power-of-two sized, size-aligned BAR
2817
2818static int
2819handle_vmexit_npf (struct guest *g) {
2820    int r;
2821#ifdef CONFIG_SVM
2822    uint64_t fault_addr = amd_vmcb_exitinfo2_rd(&g->vmcb);
2823#else
2824    uint64_t fault_addr, guest_rip;
2825    errval_t err = invoke_dispatcher_vmread(g->dcb_cap, VMX_GPADDR_F, &fault_addr);
2826    assert(err_is_ok(err));
2827#endif
2828    uint8_t *code = NULL;
2829
2830    // check for fault inside the guest physical memory region
2831    if (fault_addr >= g->mem_low_va && fault_addr < g->mem_high_va) {
2832        // allocate the missing memory
2833        alloc_guest_mem(g, fault_addr & ~BASE_PAGE_MASK, BASE_PAGE_SIZE);
        // do not advance the RIP, it is safe (and necessary) to
2835        // replay the faulting instruction
2836        return HANDLER_ERR_OK;
2837    }
2838
2839    // fetch the location to the code
2840    r = get_instr_arr(g, &code);
2841    assert (r == 0);
2842
2843    // virtual devices
2844    switch (fault_addr & ~BASE_PAGE_MASK) {
2845    case APIC_BASE: {
2846        uint64_t val;
2847        enum opsize size;
2848
2849        assert(g->apic != NULL);
2850        size = decode_mov_op_size(g, code);
2851        if (decode_mov_is_write(g, code)) {
2852            val = decode_mov_src_val(g, code);
2853            r = apic_handle_mmio_write(g->apic, fault_addr, size, val);
2854            assert(r == 0);
2855        } else {
2856            r = apic_handle_mmio_read(g->apic, fault_addr, size, &val);
2857            assert(r == 0);
2858            decode_mov_dest_val(g, code, val);
2859        }
2860
2861        // advance the rip beyond the instruction
2862#ifdef CONFIG_SVM
2863        amd_vmcb_rip_wr(&g->vmcb, amd_vmcb_rip_rd(&g->vmcb) +
2864                        decode_mov_instr_length(g, code));
2865#else
2866	err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_RIP, &guest_rip);
2867	err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_GUEST_RIP, guest_rip +
2868					 decode_mov_instr_length(g, code));
2869	assert(err_is_ok(err));
2870#endif
2871        return HANDLER_ERR_OK;
2872    }
2873    }
2874
    // check whether this is an access to emulated PCI device memory
2876
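    // walk every bus/device/BAR the guest PCI emulation knows about and check
    // whether one of the BARs covers the faulting guest-physical address;
    // if so, forward the access to the device model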
2877    for(int bus_i = 0; bus_i<256; bus_i++){
2878    	for(int dev_i = 0; dev_i < 32; dev_i++){
2879    		struct pci_bus *bus = g->pci->bus[bus_i];
2880			if(bus) {
2881				struct pci_device* dev = bus->device[dev_i];
2882				if(dev){
2883					for(int bar_i=0; bar_i<5; bar_i++){
2884						struct bar_info *curbar = &dev->bars[bar_i];
2885						if(curbar->paddr <= fault_addr && fault_addr < curbar->paddr + curbar->bytes){
2886							if(decode_mov_is_write(g, code)){
2887								uint64_t val = decode_mov_src_val(g, code);
2888								if(dev->mem_write) {
2889									dev->mem_write(dev, MMIO_MASK(curbar->bytes) & fault_addr, bar_i, val );
2890								} else {
2891									goto error;
2892								}
2893							} else {
2894								uint64_t val;
2895								if(dev->mem_read){
2896									dev->mem_read(dev, MMIO_MASK(curbar->bytes) & fault_addr, bar_i, (uint32_t*)&val);
2897									decode_mov_dest_val(g, code, val);
2898								} else {
2899									goto error;
2900								}
2901							}
2902#ifdef CONFIG_SVM
2903							amd_vmcb_rip_wr(&g->vmcb, amd_vmcb_rip_rd(&g->vmcb) +
2904							                        decode_mov_instr_length(g, code));
2905#else
2906							err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_RIP, &guest_rip);
2907							err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_GUEST_RIP, guest_rip +
2908											 decode_mov_instr_length(g, code));
2909							assert(err_is_ok(err));
2910#endif
2911							return HANDLER_ERR_OK;
2912						}
2913					}
2914				}
2915			}
2916    	}
2917    }
2918
2919    error:
    printf("vmkitmon: access to an unknown memory location: %lx\n", fault_addr);
2921    return handle_vmexit_unhandeled(g);
2922}
2923
2924typedef int (*vmexit_handler)(struct guest *g);
2925
2926#ifdef CONFIG_SVM
2927static vmexit_handler vmexit_handlers[0x8c] = {
2928    [SVM_VMEXIT_CR0_READ] = handle_vmexit_cr_access,
2929    [SVM_VMEXIT_CR0_WRITE] = handle_vmexit_cr_access,
2930    [SVM_VMEXIT_CR0_SEL_WRITE] = handle_vmexit_cr_access,
2931    [SVM_VMEXIT_SWINT] = handle_vmexit_swint,
2932    [SVM_VMEXIT_IDTR_WRITE] = handle_vmexit_ldt,
2933    [SVM_VMEXIT_GDTR_WRITE] = handle_vmexit_ldt,
2934    [SVM_VMEXIT_IOIO] = handle_vmexit_ioio,
2935    [SVM_VMEXIT_MSR] = handle_vmexit_msr,
2936    [SVM_VMEXIT_CPUID] = handle_vmexit_cpuid,
2937    [SVM_VMEXIT_VMMCALL] = handle_vmexit_vmmcall,
2938    [SVM_VMEXIT_HLT] = handle_vmexit_hlt
2939};
2940#else
2941static vmexit_handler vmexit_handlers[0x8c] = {
2942    [VMX_EXIT_REASON_CPUID] = handle_vmexit_cpuid,
2943    [VMX_EXIT_REASON_HLT] = handle_vmexit_hlt,
2944    [VMX_EXIT_REASON_VMCALL] = handle_vmexit_vmmcall,
2945    [VMX_EXIT_REASON_CR_ACCESS] = handle_vmexit_cr_access,
2946    [VMX_EXIT_REASON_INOUT] = handle_vmexit_ioio,
2947    [VMX_EXIT_REASON_RDMSR] = handle_vmexit_msr,
2948    [VMX_EXIT_REASON_WRMSR] = handle_vmexit_msr,
2949    [VMX_EXIT_REASON_GDTR_IDTR] = handle_vmexit_ldt,
2950    [VMX_EXIT_REASON_EPT_FAULT] = handle_vmexit_npf,
2951    [VMX_EXIT_REASON_SWINT] = handle_vmexit_swint
2952};
2953#endif
2954
2955void
2956guest_handle_vmexit (struct guest *g) {
2957	//struct pci_ethernet * eth = (struct pci_ethernet * ) g->pci->bus[0]->device[2]->state;//
2958	//printf("guest_handle_vmexit\n");
2959    vmexit_handler handler;
2960#ifdef CONFIG_SVM
2961    uint64_t exitcode = amd_vmcb_exitcode_rd(&g->vmcb);
2962    if (exitcode == SVM_VMEXIT_NPF) {
2963        handler = handle_vmexit_npf;
2964    } else if (LIKELY(vmexit_handlers[exitcode] != NULL)) {
2965        handler = vmexit_handlers[exitcode];
2966    } else {
2967        handle_vmexit_unhandeled(g);
2968        return;
2969    }
2970#else
2971    if (!g->emulated_before_exit) {
2972        errval_t err = invoke_dispatcher_vmread(g->dcb_cap, VMX_EXIT_REASON,
2973						(uint64_t *)&saved_exit_reason);
2974	assert(err_is_ok(err));
2975    }
2976
2977    if (LIKELY(vmexit_handlers[saved_exit_reason] != NULL)) {
2978        handler = vmexit_handlers[saved_exit_reason];
2979    } else {
2980        handle_vmexit_unhandeled(g);
2981	return;
2982    }
2983#endif
2984    int r = handler(g);
2985    if (LIKELY(r == HANDLER_ERR_OK)) {
2986        if (g->runnable) {
2987            guest_make_runnable(g, true);
2988        }
2989    }
2990}
2991