/**
 * \file
 * \brief Contains the VMKit kernel interface for the version using VMX extensions.
 */

/*
 * Copyright (c) 2014, University of Washington.
 * All rights reserved.
 *
 * This file is distributed under the terms in the attached LICENSE file.
 * If you do not find this file, copies can be found by writing to:
 * ETH Zurich D-INFK, CAB F.78, Universitaetstrasse 6, CH-8092 Zurich.
 * Attn: Systems Group.
 */

#include <string.h>
#include <kernel.h>
#include <paging_kernel_arch.h>
#include <vmx_vmkit.h>
#include <vmx_checks.h>
#include <x86.h>
#include <dispatch.h>
#include <exec.h>
#include <irq.h>
#include <barrelfish_kpi/vmkit.h>
#include <barrelfish_kpi/syscalls.h>

#include <dev/ia32_dev.h>

#define ARRAKIS_EPT
// Execution, entry, and exit controls that we want to use
// for each VM
#if defined(CONFIG_ARRAKISMON) && !defined(ARRAKIS_EPT)
// Arrakis w/o EPT
#define GUEST_PIN_BASE_CTLS_ENABLE \
    (PIN_CTLS_EXT_INTR | PIN_CTLS_NMI | PIN_CTLS_VIRT_NMI)

#define GUEST_PIN_BASE_CTLS_DISABLE \
    (0)

#define GUEST_PP_CTLS_ENABLE \
    (PP_CLTS_MSRBMP | PP_CLTS_IOBMP | PP_CLTS_HLT)

#define GUEST_PP_CTLS_DISABLE \
    (0)

#define GUEST_SP_CTLS_ENABLE \
    (0)

#define GUEST_SP_CTLS_DISABLE \
    (0)

#define GUEST_EXIT_CTLS_ENABLE \
    (EXIT_CLTS_HOST_SIZE | EXIT_CLTS_SAVE_EFER | EXIT_CLTS_LOAD_EFER)

#define GUEST_EXIT_CTLS_DISABLE \
    (0)

#define GUEST_ENTRY_CTLS_ENABLE \
    (ENTRY_CLTS_LOAD_EFER | ENTRY_CLTS_LOAD_DBG | ENTRY_CLTS_IA32E_MODE)

#define GUEST_ENTRY_CTLS_DISABLE \
    (0)
#elif defined(CONFIG_ARRAKISMON)
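// Arrakis with EPT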
#define GUEST_PIN_BASE_CTLS_ENABLE \
    (PIN_CTLS_EXT_INTR | PIN_CTLS_NMI | PIN_CTLS_VIRT_NMI)

#define GUEST_PIN_BASE_CTLS_DISABLE \
    (0)

#define GUEST_PP_CTLS_ENABLE \
    (PP_CLTS_MSRBMP | PP_CLTS_IOBMP | PP_CLTS_HLT | PP_CLTS_SEC_CTLS)

#define GUEST_PP_CTLS_DISABLE \
    (0)

#define GUEST_SP_CTLS_ENABLE \
    (SP_CLTS_ENABLE_EPT)

#define GUEST_SP_CTLS_DISABLE \
    (0)

#define GUEST_EXIT_CTLS_ENABLE \
    (EXIT_CLTS_HOST_SIZE | EXIT_CLTS_SAVE_EFER | EXIT_CLTS_LOAD_EFER)

#define GUEST_EXIT_CTLS_DISABLE \
    (0)

#define GUEST_ENTRY_CTLS_ENABLE \
    (ENTRY_CLTS_LOAD_EFER | ENTRY_CLTS_LOAD_DBG | ENTRY_CLTS_IA32E_MODE)

#define GUEST_ENTRY_CTLS_DISABLE \
    (0)
#else
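// Standard guest: EPT with unrestricted-guest mode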
#define GUEST_PIN_BASE_CTLS_ENABLE \
    (PIN_CTLS_EXT_INTR | PIN_CTLS_NMI | PIN_CTLS_VIRT_NMI)

#define GUEST_PIN_BASE_CTLS_DISABLE \
    (0)

#define GUEST_PP_CTLS_ENABLE \
    (PP_CLTS_MSRBMP | PP_CLTS_IOBMP | PP_CLTS_HLT | PP_CLTS_SEC_CTLS)

#define GUEST_PP_CTLS_DISABLE \
    (0)

#define GUEST_SP_CTLS_ENABLE \
    (SP_CLTS_ENABLE_EPT | SP_CLTS_UNRSTD_GUEST)

#define GUEST_SP_CTLS_DISABLE \
    (0)

#define GUEST_EXIT_CTLS_ENABLE \
    (EXIT_CLTS_HOST_SIZE | EXIT_CLTS_SAVE_EFER | EXIT_CLTS_LOAD_EFER | \
     EXIT_CLTS_SAVE_PAT  | EXIT_CLTS_LOAD_PAT)

#define GUEST_EXIT_CTLS_DISABLE \
    (0)

#define GUEST_ENTRY_CTLS_ENABLE \
    (ENTRY_CLTS_LOAD_EFER)

#define GUEST_ENTRY_CTLS_DISABLE \
    (0)
#endif

extern void *vmx_return_func;

static struct guest_control *ctrl = NULL;

static int launched = 0;

#ifndef CONFIG_ARRAKISMON
// List of MSRs that are loaded on VM-exit.
static uint32_t msr_list[VMX_MSR_COUNT] =
    {MSR_KERNEL_GS_BASE, MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SFMASK};

// VM-exit MSR-load area that contains host MSR values that are saved prior
// to VM-entry and loaded on VM-exit.
static struct msr_entry host_msr_area[VMX_MSR_COUNT]
__attribute__ ((aligned(16)));
#endif

// VMX controls that are written to the VMCS. In addition to the controls
// that were requested, these values may also have reserved bits set.
vmx_controls pin_based_ctls = 0, pp_based_ctls = 0, sp_based_ctls = 0,
    entry_ctls = 0, exit_ctls = 0;

static uint8_t vmxon_region[BASE_PAGE_SIZE]
__attribute__ ((aligned(BASE_PAGE_SIZE)));

// Returns true if extended page tables (EPT) are enabled.
static inline int ept_enabled(void)
{
    return ((GUEST_SP_CTLS_ENABLE & SP_CLTS_ENABLE_EPT) != 0);
}

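// Decodes the success/failure convention used by the VMX instructions (per
// the Intel SDM): CF set means VMfailInvalid, ZF set means VMfailValid (an
// error number is stored in the current VMCS), and neither flag set means
// the instruction succeeded.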
static inline errval_t instr_err(void)
{
    errval_t err;
    __asm volatile("jnc vmx_err_check_zf%=\n\t"
                   "mov %[VMfailInvalid], %[err]\n\t"
                   "jmp vmx_err_done%=\n\t"
                   "vmx_err_check_zf%=:\n\t"
                   "jnz vmx_err_succeed%=\n\t"
                   "mov %[VMfailValid], %[err]\n\t"
                   "jmp vmx_err_done%=\n\t"
                   "vmx_err_succeed%=:\n\t"
                   "mov %[VMsucceed], %[err]\n\t"
                   "vmx_err_done%=:\n\t"
                   : [err] "=r" (err)
                   : [VMfailInvalid] "i" (SYS_ERR_VMKIT_VMX_VMFAIL_INVALID),
                     [VMfailValid] "i" (SYS_ERR_VMKIT_VMX_VMFAIL_VALID),
                     [VMsucceed] "i" (SYS_ERR_OK)
                   : "memory");
    return err;
}

// Executes the vmptrld instruction, which makes the VMCS referenced by
// 'vmcs_base' active and current.
errval_t vmptrld(lpaddr_t vmcs_base)
{
    __asm volatile("vmptrld %[vmcs_base]\n\t"
                   :
                   : [vmcs_base] "m" (vmcs_base)
                   : "memory");
    return instr_err();
}

// Returns the physical address base of the current VMCS.
lpaddr_t vmptrst(void)
{
    lpaddr_t dest_addr;
    __asm volatile("vmptrst %[dest_addr]\n\t"
                   : [dest_addr] "=m" (dest_addr)
                   :
                   : "memory");
    return dest_addr;
}

// Executes the vmclear instruction, which makes the VMCS referenced
// by 'vmcs_base' clear and inactive. This instruction also ensures
// that the referenced VMCS data is saved.
errval_t vmclear(lpaddr_t vmcs_base)
{
    __asm volatile("vmclear %[vmcs_base]\n\t"
                   :
                   : [vmcs_base] "m" (vmcs_base)
                   : "memory");
    return instr_err();
}

// Reads the component with the specified encoding from the current VMCS
// into *dest_addr using the vmread instruction.
errval_t vmread(uintptr_t encoding, lvaddr_t *dest_addr)
{
    __asm volatile("vmread %[encoding], %[dest_addr]\n\t"
                   : [dest_addr] "=m" (*dest_addr)
                   : [encoding] "r" (encoding)
                   : "memory");
    return instr_err();
}

// Writes a component with a specified encoding and value to the current
// VMCS using the vmwrite instruction.
errval_t vmwrite(uintptr_t encoding, uintptr_t value)
{
    __asm volatile("vmwrite %[value], %[encoding]\n\t"
                   :
                   : [encoding] "r" (encoding), [value] "r" (value)
                   : "memory");
    return instr_err();
}
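
// Usage sketch (illustrative; not called in this file): write a field of
// the current VMCS and read it back.
//
//   lvaddr_t rip = 0;
//   errval_t werr = vmwrite(VMX_GUEST_RIP, 0xFFF0);
//   errval_t rerr = vmread(VMX_GUEST_RIP, &rip);
//   assert(err_is_ok(werr) && err_is_ok(rerr) && rip == 0xFFF0);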

// Using a provided VMXON region, causes the logical processor to enter
// VMX root operation by executing the vmxon instruction.
errval_t vmxon(lpaddr_t base_addr)
{
    __asm volatile("vmxon %[base_addr]\n\t"
                   :
                   : [base_addr] "m" (base_addr)
                   : "memory");
    return instr_err();
}

// Exits VMX operation by executing the vmxoff instruction.
errval_t vmxoff(void)
{
    __asm volatile("vmxoff");
    return instr_err();
}

// Reads and returns the MSR that reports the allowed settings
// for ALL of the bits of the controls indicated by 'type'.
static uint64_t msr_ctls_true(enum vmx_ctls_t type)
{
    uint64_t true_msr = 0;
    switch (type) {
    case VMX_CTLS_PIN_BASED:
        true_msr = ia32_vmx_true_pinbased_ctls_rd(NULL);
        break;
    case VMX_CTLS_PRIMARY_PROCESSOR:
        true_msr = ia32_vmx_true_ppbased_ctls_rd(NULL);
        break;
    case VMX_CTLS_SECONDARY_PROCESSOR:
        assert(!"No such MSR for secondary processor controls!\n");
        break;
    case VMX_CTLS_EXIT:
        true_msr = ia32_vmx_true_exit_ctls_rd(NULL);
        break;
    case VMX_CTLS_ENTRY:
        true_msr = ia32_vmx_true_entry_ctls_rd(NULL);
        break;
    }
    return true_msr;
}

// Reads and returns the MSR that reports the allowed settings
// for MOST of the bits of the controls indicated by 'type'.
static uint64_t msr_ctls(enum vmx_ctls_t type)
{
    uint64_t msr = 0;
    switch (type) {
    case VMX_CTLS_PIN_BASED:
        msr = ia32_vmx_pinbased_ctls_rd(NULL);
        break;
    case VMX_CTLS_PRIMARY_PROCESSOR:
        msr = ia32_vmx_ppbased_ctls_rd(NULL);
        break;
    case VMX_CTLS_SECONDARY_PROCESSOR:
        msr = ia32_vmx_spbased_ctls_rd(NULL);
        break;
    case VMX_CTLS_EXIT:
        msr = ia32_vmx_exit_ctls_rd(NULL);
        break;
    case VMX_CTLS_ENTRY:
        msr = ia32_vmx_entry_ctls_rd(NULL);
        break;
    }
    return msr;
}

// Computes the controls of the class indicated by 'type' from 'mask_1s'
// and 'mask_0s', which name the controls that must be enabled and
// disabled, respectively. The result is later written to the VMCS by
// vmx_set_exec_ctls().
static uint32_t set_vmx_controls(uint32_t mask_1s,
    uint32_t mask_0s, enum vmx_ctls_t type)
{
    uint32_t controls = 0;

    ia32_vmx_basic_t vmx_basic = ia32_vmx_basic_rd(NULL);
    bool true_ctls = !!(ia32_vmx_basic_ctls_clear_extract(vmx_basic));
    if (true_ctls && (type != VMX_CTLS_SECONDARY_PROCESSOR)) {
        uint64_t true_msr = msr_ctls_true(type);
        controls = ((DWORD_LS(true_msr) | mask_1s) & DWORD_MS(true_msr));
    } else {
        uint64_t msr = msr_ctls(type);
        controls = ((DWORD_LS(msr) | mask_1s) & DWORD_MS(msr));
    }
    assert((mask_1s & (~controls)) == 0);
    assert((mask_0s & controls) == 0);
    return controls;
}
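
// Worked example with made-up numbers: if the allowed-settings MSR for a
// control class reads 0x0000001F00000016, then DWORD_LS = 0x16 gives the
// bits that must be 1 and DWORD_MS = 0x1F the bits that may be 1;
// requesting mask_1s = 0x8 yields controls = (0x16 | 0x8) & 0x1F = 0x1E.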

/**
 * \brief Tries to enable hardware-assisted virtualization.
 *
 * Checks whether hardware-assisted virtualization is available on the
 * platform and enables this feature.
 *
 * \return SYS_ERR_OK on successful initialization of the subsystem, or
 *         SYS_ERR_VMKIT_UNAVAIL if virtualization is unavailable.
 */
errval_t vmx_enable_virtualization (void)
{
    uint32_t cpuid_ecx;
    cpuid(CPUID_VMX, NULL, NULL, &cpuid_ecx, NULL);
    if (!(cpuid_ecx & VMX_SUPPORT)) {
        return SYS_ERR_VMKIT_UNAVAIL;
    }

    // The 'lock' and 'enable VMXON outside SMX' bits of the
    // IA32_FEATURE_CONTROL MSR must be set
    ia32_feature_cntl_t feat_cntl_msr;
    feat_cntl_msr = ia32_feature_cntl_rd(NULL);
    if (!ia32_feature_cntl_lock_extract(feat_cntl_msr) ||
        !ia32_feature_cntl_vmxoutsmx_extract(feat_cntl_msr)) {
        return SYS_ERR_VMKIT_UNAVAIL;
    }

    pin_based_ctls = set_vmx_controls(
        GUEST_PIN_BASE_CTLS_ENABLE, GUEST_PIN_BASE_CTLS_DISABLE, VMX_CTLS_PIN_BASED);

    pp_based_ctls = set_vmx_controls(
        GUEST_PP_CTLS_ENABLE, GUEST_PP_CTLS_DISABLE, VMX_CTLS_PRIMARY_PROCESSOR);

    sp_based_ctls = set_vmx_controls(
        GUEST_SP_CTLS_ENABLE, GUEST_SP_CTLS_DISABLE, VMX_CTLS_SECONDARY_PROCESSOR);

    entry_ctls = set_vmx_controls(
        GUEST_ENTRY_CTLS_ENABLE, GUEST_ENTRY_CTLS_DISABLE, VMX_CTLS_ENTRY);

    exit_ctls = set_vmx_controls(
        GUEST_EXIT_CTLS_ENABLE, GUEST_EXIT_CTLS_DISABLE, VMX_CTLS_EXIT);

    // Initialize the VMXON region
    memset(vmxon_region, 0x0, BASE_PAGE_SIZE);
    ia32_vmx_basic_t vmx_basic;
    vmx_basic = ia32_vmx_basic_rd(NULL);
    uint32_t vmcs_rev_id = ia32_vmx_basic_vmcs_rev_id_extract(vmx_basic);
    memcpy(vmxon_region, &vmcs_rev_id, sizeof(uint32_t));

    // The logical processor must be in protected mode with paging enabled
    uint64_t cr0 = rdcr0();
    if ((cr0 & CR0_PE) == 0 || (cr0 & CR0_PG) == 0) {
        return SYS_ERR_VMKIT_UNAVAIL;
    }

    // The CR0 register value has to support all of the CR0 fixed bits
    if (cr0 != vmx_fixed_cr0()) {
        return SYS_ERR_VMKIT_UNAVAIL;
    }

    // Enable virtualization, if not already enabled
    if (!vmx_enabled()) {
        enable_vmx();
    }
    // The CR4 register value has to support all of the CR4 fixed bits
    if (rdcr4() != vmx_fixed_cr4()) {
        return SYS_ERR_VMKIT_UNAVAIL;
    }

    // Execute VMXON to place the processor into VMX root operation
    errval_t err = vmxon(mem_to_local_phys((lvaddr_t)vmxon_region));
    assert(err_is_ok(err));

    return SYS_ERR_OK;
}
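
/*
 * Illustrative call order (a sketch, not code from this file): the kernel
 * runs vmx_enable_virtualization() once per core before touching a VMCS,
 * then initialize_vmcs() for each new guest, and finally
 * vmx_vmkit_vmenter() to enter the guest.
 */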

static inline void vmx_set_exception_bitmap(void)
{
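    // Intercept every exception except vector 7 (#NM, device not available).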
    errval_t err = vmwrite(VMX_EXCP_BMP, ~(1UL << 7));
    assert(err_is_ok(err));
}

#ifndef CONFIG_ARRAKISMON
static uint64_t vmx_read_msr(uint32_t index) {
    uint64_t val = 0;
    switch (index) {
    case MSR_KERNEL_GS_BASE:
        val = ia32_kernel_gs_base_rd(NULL);
        break;
    case MSR_STAR:
        val = ia32_star_rd(NULL);
        break;
    case MSR_LSTAR:
        val = ia32_lstar_rd(NULL);
        break;
    case MSR_CSTAR:
        val = ia32_cstar_rd(NULL);
        break;
    case MSR_SFMASK:
        val = ia32_fmask_rd(NULL);
        break;
    default:
        assert(!"MSR index not supported");
        panic("MSR index %u not supported\n", index);
    }
    return val;
}

static void vmx_host_msr_area_init(struct msr_entry *msr_area)
{
    for (int i = 0; i < VMX_MSR_COUNT; i++) {
        msr_area[i].index = msr_list[i];
        msr_area[i].val = vmx_read_msr(msr_list[i]);
    }
}
#endif

static inline lpaddr_t mem_to_local_phys_no_assertion(lvaddr_t addr)
{
    return (lpaddr_t)(addr - (lpaddr_t)X86_64_MEMORY_OFFSET);
}

// Writes the host state, which is used after a VM-exit, to the
// current VMCS
static void vmx_set_host_state(void)
{
    // On a page-fault the processor checks whether:
    // (#PF error-code) & (#PF error-code mask) = (#PF error-code match)

    // Setting the mask to 0, the match to 0xFFFFFFFF, and bit 14 in the
    // exception bitmap results in no VM-exits on guest page-faults.
    errval_t err = vmwrite(VMX_PF_ERR_MASK, 0);
    err += vmwrite(VMX_PF_ERR_MATCH, 0xFFFFFFFF);
    err += vmwrite(VMX_CR3_TARGET_CNT, 0);

    uint64_t cr0 = rdcr0(), cr3 = rdcr3(), cr4 = rdcr4();

    uint64_t cr0_fixed0 = ia32_vmx_cr0_fixed0_rd(NULL);
    uint64_t cr0_fixed1 = ia32_vmx_cr0_fixed1_rd(NULL);
    uint64_t cr4_fixed0 = ia32_vmx_cr4_fixed0_rd(NULL);
    uint64_t cr4_fixed1 = ia32_vmx_cr4_fixed1_rd(NULL);

    assert((~cr0 & cr0_fixed0) == 0);
    assert((cr0 & ~cr0_fixed1) == 0);
    assert((~cr4 & cr4_fixed0) == 0);
    assert((cr4 & ~cr4_fixed1) == 0);

    assert(((cr0 | cr0_fixed0) & cr0_fixed1) == cr0);
    assert(((cr4 | cr4_fixed0) & cr4_fixed1) == cr4);
    assert(rdcr4() & CR4_PAE);

    err += vmwrite(VMX_HOST_CR0, cr0);
    err += vmwrite(VMX_HOST_CR3, cr3);
    err += vmwrite(VMX_HOST_CR4, cr4);

    err += vmwrite(VMX_HOST_ES_SEL, rd_es() & ~0x7);
    err += vmwrite(VMX_HOST_CS_SEL, rd_cs() & ~0x7);
    err += vmwrite(VMX_HOST_SS_SEL, rd_ss() & ~0x7);
    err += vmwrite(VMX_HOST_DS_SEL, rd_ds() & ~0x7);
    err += vmwrite(VMX_HOST_TR_SEL, rd_tr() & ~0x7);

    err += vmwrite(VMX_HOST_TR_BASE, tr_addr(rd_tr(), gdtr_addr(rd_gdtr())));
    err += vmwrite(VMX_HOST_GDTR_BASE, gdtr_addr(rd_gdtr()));
    err += vmwrite(VMX_HOST_IDTR_BASE, idtr_addr(rd_idtr()));
    err += vmwrite(VMX_HOST_SYSENTER_CS, 0);
    err += vmwrite(VMX_HOST_SYSENTER_ESP, 0);
    err += vmwrite(VMX_HOST_SYSENTER_EIP, 0);
    err += vmwrite(VMX_HOST_PAT_F, ia32_cr_pat_rd(NULL));

    ia32_efer_t efer_msr = ia32_efer_rd(NULL);
    err += vmwrite(VMX_HOST_EFER_F, efer_msr);
    assert(ia32_efer_lme_extract(efer_msr));
    assert(ia32_efer_lma_extract(efer_msr));

    err += vmwrite(VMX_HOST_GS_SEL, 0x0);
    err += vmwrite(VMX_HOST_GS_BASE, 0x0);

    err += vmwrite(VMX_HOST_FS_SEL, 0x0);
    err += vmwrite(VMX_HOST_FS_BASE, 0x0);

    err += vmwrite(VMX_HOST_RIP, (uint64_t)(&vmx_return_func));
#ifndef CONFIG_ARRAKISMON
    vmx_host_msr_area_init(host_msr_area);

    lpaddr_t msr_area_base = mem_to_local_phys_no_assertion(
            (lvaddr_t) host_msr_area);
    if (!((lvaddr_t) host_msr_area >= X86_64_MEMORY_OFFSET)) {
        printk(LOG_NOTE, "assertion failed! 0x%lx >= 0x%lx\n",
                (lvaddr_t) host_msr_area,
                X86_64_MEMORY_OFFSET);
    }

    err += vmwrite(VMX_EXIT_MSR_LOAD_F, canonical_form(msr_area_base));
    err += vmwrite(VMX_EXIT_MSR_LOAD_CNT, VMX_MSR_COUNT);
#endif
    assert(err_is_ok(err));
}

// Writes the VMX controls to the current VMCS.
void vmx_set_exec_ctls(void)
{
    // VM-execution controls
    errval_t err = vmwrite(VMX_EXEC_PIN_BASED, pin_based_ctls);
    err += vmwrite(VMX_EXEC_PRIM_PROC, pp_based_ctls);
    err += vmwrite(VMX_EXEC_SEC_PROC, sp_based_ctls);

    // VM-entry and VM-exit control fields
    err += vmwrite(VMX_EXIT_CONTROLS, exit_ctls);
    err += vmwrite(VMX_ENTRY_CONTROLS, entry_ctls);

    vmx_set_exception_bitmap();

    err += vmwrite(VMX_ENTRY_INTR_INFO, 0);
    err += vmwrite(VMX_ENTRY_EXCP_ERR, 0);
    err += vmwrite(VMX_ENTRY_INSTR_LEN, 0);
    assert(err_is_ok(err));
}

errval_t initialize_vmcs(lpaddr_t vmcs_paddr)
{
    struct vmcs *vmcs = (struct vmcs *)local_phys_to_mem(vmcs_paddr);

    ia32_vmx_basic_t vmx_basic;
    vmx_basic = ia32_vmx_basic_rd(NULL);
    uint32_t vmcs_rev_id = ia32_vmx_basic_vmcs_rev_id_extract(vmx_basic);

    memset(vmcs, 0x0, BASE_PAGE_SIZE);
    vmcs->prelude.p.revision_id = vmcs_rev_id;
    vmcs->prelude.p.shadow = 0;
    errval_t err = vmclear(vmcs_paddr);
    err += vmptrld(vmcs_paddr);

    err += vmwrite(VMX_GUEST_VMCS_LPTR_F, ~0x0);
    err += vmwrite(VMX_GUEST_VMCS_LPTR_H, ~0x0);
    err += vmwrite(VMX_GUEST_SYSENTER_CS, 0x0);
    err += vmwrite(VMX_GUEST_SYSENTER_ESP, 0x0);
    err += vmwrite(VMX_GUEST_SYSENTER_EIP, 0x0);
#ifdef CONFIG_ARRAKISMON
    err += vmwrite(VMX_GUEST_DR7, 0x0);
    err += vmwrite(VMX_GUEST_EFER_F, ia32_efer_rd(NULL) | EFER_LME | EFER_LMA);

    err += vmwrite(VMX_GUEST_ACTIV_STATE, 0x0);
    err += vmwrite(VMX_GUEST_INTR_STATE, 0x0);

    err += vmwrite(VMX_GUEST_CS_LIM, 0xFFFFFFFF);
    err += vmwrite(VMX_GUEST_DS_LIM, 0xFFFFFFFF);
    err += vmwrite(VMX_GUEST_ES_LIM, 0xFFFFFFFF);
    err += vmwrite(VMX_GUEST_SS_LIM, 0xFFFFFFFF);
    err += vmwrite(VMX_GUEST_FS_LIM, 0xFFFFFFFF);
    err += vmwrite(VMX_GUEST_GS_LIM, 0xFFFFFFFF);
    err += vmwrite(VMX_GUEST_TR_LIM, 0xFFFF);
    err += vmwrite(VMX_GUEST_LDTR_LIM, 0xFFFF);
    err += vmwrite(VMX_GUEST_GDTR_LIM, 0xFFFF);
    err += vmwrite(VMX_GUEST_IDTR_LIM, 0xFFFF);

    err += vmwrite(VMX_GUEST_CS_ACCESS, 0xA09B);
    err += vmwrite(VMX_GUEST_DS_ACCESS, 0xC093);
    err += vmwrite(VMX_GUEST_ES_ACCESS, 0xC093);
    err += vmwrite(VMX_GUEST_FS_ACCESS, 0xC093);
    err += vmwrite(VMX_GUEST_GS_ACCESS, 0xC093);
    err += vmwrite(VMX_GUEST_SS_ACCESS, 0xC093);
    err += vmwrite(VMX_GUEST_TR_ACCESS, 0x8B);
    err += vmwrite(VMX_GUEST_LDTR_ACCESS, 0x82);

    err += vmwrite(VMX_GUEST_CS_SEL, 0x8);
    err += vmwrite(VMX_GUEST_SS_SEL, 0x10);
    err += vmwrite(VMX_GUEST_DS_SEL, 0x10);
    err += vmwrite(VMX_GUEST_ES_SEL, 0x10);
    err += vmwrite(VMX_GUEST_FS_SEL, 0x10);
    err += vmwrite(VMX_GUEST_GS_SEL, 0x10);
    err += vmwrite(VMX_GUEST_TR_SEL, 0x10);
    err += vmwrite(VMX_GUEST_LDTR_SEL, 0x10);

    err += vmwrite(VMX_GUEST_CS_BASE, 0x0);
    err += vmwrite(VMX_GUEST_SS_BASE, 0x0);
    err += vmwrite(VMX_GUEST_DS_BASE, 0x0);
    err += vmwrite(VMX_GUEST_ES_BASE, 0x0);
    err += vmwrite(VMX_GUEST_FS_BASE, 0x0);
    err += vmwrite(VMX_GUEST_GS_BASE, 0x0);
    err += vmwrite(VMX_GUEST_TR_BASE, 0x0);
    err += vmwrite(VMX_GUEST_LDTR_BASE, 0x0);
    err += vmwrite(VMX_GUEST_GDTR_BASE, 0x0);
    err += vmwrite(VMX_GUEST_IDTR_BASE, 0x0);

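    // 0x60000010 is the architectural CR0 reset value (ET | CD | NW); PE
    // and PG are added since the Arrakis guest starts in paged protected
    // mode.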
    uint64_t guest_cr0 = 0x60000010 | CR0_PE | CR0_PG;
    err += vmwrite(VMX_GUEST_CR0, (uint32_t)(guest_cr0 | ia32_vmx_cr0_fixed0_rd(NULL)) &
                   ia32_vmx_cr0_fixed1_rd(NULL));

    uint64_t guest_cr4 = CR4_PAE;
    err += vmwrite(VMX_GUEST_CR4, (guest_cr4 | ia32_vmx_cr4_fixed0_rd(NULL)) &
                   ia32_vmx_cr4_fixed1_rd(NULL));

    err += vmwrite(VMX_CR0_GH_MASK, 0UL);
    err += vmwrite(VMX_CR4_GH_MASK, 0UL);
#else
    err += vmwrite(VMX_GUEST_DR7, 0x400);
    err += vmwrite(VMX_GUEST_EFER_F, 0x0);
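    // Power-on default PAT: WB, WT, UC-, UC (0x00070406 in each 32-bit half).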
    err += vmwrite(VMX_GUEST_PAT_F, 0x0007040600070406ul);

    err += vmwrite(VMX_GUEST_ACTIV_STATE, 0x0);
    err += vmwrite(VMX_GUEST_INTR_STATE, 0x0);

    err += vmwrite(VMX_GUEST_CS_LIM, 0xFFFF);
    err += vmwrite(VMX_GUEST_DS_LIM, 0xFFFF);
    err += vmwrite(VMX_GUEST_ES_LIM, 0xFFFF);
    err += vmwrite(VMX_GUEST_FS_LIM, 0xFFFF);
    err += vmwrite(VMX_GUEST_GS_LIM, 0xFFFF);
    err += vmwrite(VMX_GUEST_SS_LIM, 0xFFFF);
    err += vmwrite(VMX_GUEST_TR_LIM, 0xFFFF);
    err += vmwrite(VMX_GUEST_LDTR_LIM, 0xFFFF);
    err += vmwrite(VMX_GUEST_GDTR_LIM, 0xFFFF);
    err += vmwrite(VMX_GUEST_IDTR_LIM, 0xFFFF);

    err += vmwrite(VMX_GUEST_CS_ACCESS, 0x9B);
    err += vmwrite(VMX_GUEST_DS_ACCESS, 0x93);
    err += vmwrite(VMX_GUEST_ES_ACCESS, 0x93);
    err += vmwrite(VMX_GUEST_FS_ACCESS, 0x93);
    err += vmwrite(VMX_GUEST_GS_ACCESS, 0x93);
    err += vmwrite(VMX_GUEST_SS_ACCESS, 0x93);
    err += vmwrite(VMX_GUEST_TR_ACCESS, 0x8B);
    err += vmwrite(VMX_GUEST_LDTR_ACCESS, 0x82);

    err += vmwrite(VMX_GUEST_CS_SEL, 0x0);
    err += vmwrite(VMX_GUEST_DS_SEL, 0x0);
    err += vmwrite(VMX_GUEST_ES_SEL, 0x0);
    err += vmwrite(VMX_GUEST_FS_SEL, 0x0);
    err += vmwrite(VMX_GUEST_GS_SEL, 0x0);
    err += vmwrite(VMX_GUEST_SS_SEL, 0x0);
    err += vmwrite(VMX_GUEST_TR_SEL, 0x0);
    err += vmwrite(VMX_GUEST_LDTR_SEL, 0x0);

    err += vmwrite(VMX_GUEST_CS_BASE, 0x0);
    err += vmwrite(VMX_GUEST_DS_BASE, 0x0);
    err += vmwrite(VMX_GUEST_ES_BASE, 0x0);
    err += vmwrite(VMX_GUEST_FS_BASE, 0x0);
    err += vmwrite(VMX_GUEST_GS_BASE, 0x0);
    err += vmwrite(VMX_GUEST_SS_BASE, 0x0);
    err += vmwrite(VMX_GUEST_TR_BASE, 0x0);
    err += vmwrite(VMX_GUEST_LDTR_BASE, 0x0);
    err += vmwrite(VMX_GUEST_GDTR_BASE, 0x0);
    err += vmwrite(VMX_GUEST_IDTR_BASE, 0x0);

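    // Real-mode reset state: RIP = 0xFFF0 is the reset-vector offset, and
    // RFLAGS sets the always-one bit 1 plus the ID flag (bit 21).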
    err += vmwrite(VMX_GUEST_RFLAGS, 0x200002);
    err += vmwrite(VMX_GUEST_RIP, 0xFFF0);
    err += vmwrite(VMX_GUEST_RSP, 0x0);

    uint64_t guest_cr0 = (0x60000010 | ia32_vmx_cr0_fixed0_rd(NULL)) &
        ia32_vmx_cr0_fixed1_rd(NULL);
    err += vmwrite(VMX_GUEST_CR0, guest_cr0 & ~(CR0_PE | CR0_PG));

    uint64_t guest_cr4 = CR4_PAE;
    err += vmwrite(VMX_GUEST_CR4, (guest_cr4 | ia32_vmx_cr4_fixed0_rd(NULL)) &
            ia32_vmx_cr4_fixed1_rd(NULL));
    assert((guest_cr4 & CR4_PCIDE) == 0);

    uint64_t cr0_shadow;
    err += vmread(VMX_GUEST_CR0, &cr0_shadow);

    err += vmwrite(VMX_CR0_RD_SHADOW, cr0_shadow);
    err += vmwrite(VMX_CR0_GH_MASK, CR0_PE);
    err += vmwrite(VMX_CR4_GH_MASK, 0x20);
#endif
    assert(err_is_ok(err));

    vmx_set_exec_ctls();

    return SYS_ERR_OK;
}

static uint32_t fail = 0;

static inline void enter_guest(void)
{
    // Set the host state prior to every VM-entry in case the values
    // written to the VMCS change.
    //printf("%s:%d\n", __FUNCTION__, __LINE__);
    vmx_set_host_state();

    // This is necessary or else a #GP fault will be incurred in the
    // monitor domain.
    //printf("%s:%d\n", __FUNCTION__, __LINE__);
    uint16_t ldtr_sel = rd_ldtr();

    // Perform most of the checks that the processor itself performs
    //printf("%s:%d\n", __FUNCTION__, __LINE__);
    if (!launched) {
        check_guest_state_area();
        check_host_state_area();
        check_vmx_controls();
    }
    //printf("%s:%d\n", __FUNCTION__, __LINE__);

    __asm volatile("mov %[ctrl], %%rdi\n\t"

                   // save host state
                   "mov %%rsp, %%r8\n\t"
                   "mov %[host_rsp_encoding], %%r9\n\t"
                   "vmwrite %%r8, %%r9\n\t"

                   "mov %%rbx, (148 + 1*8)(%%rdi)\n\t"
                   "mov %%rbp, (148 + 6*8)(%%rdi)\n\t"
                   "mov %%r12, (148 + 12*8)(%%rdi)\n\t"
                   "mov %%r13, (148 + 13*8)(%%rdi)\n\t"
                   "mov %%r14, (148 + 14*8)(%%rdi)\n\t"
                   "mov %%r15, (148 + 15*8)(%%rdi)\n\t"
                   "mov %%cr2, %%rsi\n\t"
                   "mov %%rsi, 38*8(%%rdi)\n\t"

                   // load guest state
                   "mov 37*8(%%rdi), %%rsi\n\t"
                   "mov %%rsi, %%cr2\n\t"

                   "mov 0*8(%%rdi), %%rax\n\t"
                   "mov 1*8(%%rdi), %%rbx\n\t"
                   "mov 2*8(%%rdi), %%rcx\n\t"
                   "mov 3*8(%%rdi), %%rdx\n\t"
                   "mov 4*8(%%rdi), %%rsi\n\t"
                   "mov 6*8(%%rdi), %%rbp\n\t"
                   "mov 8*8(%%rdi), %%r8\n\t"
                   "mov 9*8(%%rdi), %%r9\n\t"
                   "mov 10*8(%%rdi), %%r10\n\t"
                   "mov 11*8(%%rdi), %%r11\n\t"
                   "mov 12*8(%%rdi), %%r12\n\t"
                   "mov 13*8(%%rdi), %%r13\n\t"
                   "mov 14*8(%%rdi), %%r14\n\t"
                   "mov 15*8(%%rdi), %%r15\n\t"
                   "mov 5*8(%%rdi), %%rdi\n\t"

                   // enter the guest VM
                   "cmpl $0, %[launched]\n\t"
                   "jne 1f\n\t"
                   "sti\n\t"
                   "vmlaunch\n\t"
                   "jmp 2f\n\t"
                   "1: "
                   "sti\n\t"
                   "vmresume\n\t"
                   "2: "
                   "setbe %[fail]\n\t"
                   "vmx_return_func:\n\t"
                   "cli\n\t"

                   "push %%rdi\n\t"
                   "mov %[ctrl], %%rdi\n\t"

                   // save guest state
                   "mov %%rax, 0*8(%%rdi)\n\t"
                   "mov %%rbx, 1*8(%%rdi)\n\t"
                   "mov %%rcx, 2*8(%%rdi)\n\t"
                   "mov %%rdx, 3*8(%%rdi)\n\t"
                   "mov %%rsi, 4*8(%%rdi)\n\t"
                   "mov %%rbp, 6*8(%%rdi)\n\t"
                   "mov %%r8, 8*8(%%rdi)\n\t"
                   "mov %%r9, 9*8(%%rdi)\n\t"
                   "mov %%r10, 10*8(%%rdi)\n\t"
                   "mov %%r11, 11*8(%%rdi)\n\t"
                   "mov %%r12, 12*8(%%rdi)\n\t"
                   "mov %%r13, 13*8(%%rdi)\n\t"
                   "mov %%r14, 14*8(%%rdi)\n\t"
                   "mov %%r15, 15*8(%%rdi)\n\t"

                   "mov %%cr2, %%rsi\n\t"
                   "mov %%rsi, 37*8(%%rdi)\n\t"

                   "pop %%rsi\n\t"
                   "mov %%rsi, 5*8(%%rdi)\n\t"

                   // load host state
                   "mov (148 + 1*8)(%%rdi), %%rbx\n\t"
                   "mov (148 + 6*8)(%%rdi), %%rbp\n\t"
                   "mov (148 + 12*8)(%%rdi), %%r12\n\t"
                   "mov (148 + 13*8)(%%rdi), %%r13\n\t"
                   "mov (148 + 14*8)(%%rdi), %%r14\n\t"
                   "mov (148 + 15*8)(%%rdi), %%r15\n\t"
                   "mov 38*8(%%rdi), %%rsi\n\t"
                   "mov %%rsi, %%cr2\n\t"
                   : [fail] "=m" (fail)
                   : [ctrl] "m" (ctrl), [launched] "m" (launched),
                     [host_rsp_encoding] "i" (VMX_HOST_RSP)
                   : "memory"
                   );
    assert(!fail);
    wr_ldtr(ldtr_sel);

    launched = 1;
}

static inline void print_vmcs_info(struct guest_control *g)
{
    uint64_t guest_rip, guest_rsp, guest_rflags;
    uint64_t reason, exit_qual;
    uint64_t exit_intr_info, intr_err;
    uint64_t idt_vec_info, idt_vec_err;
    uint64_t instr_len, instr_info;
    uint64_t instr_error, gpaddr, gladdr;
    uint64_t entry_intr_info, activ_state, intr_state;
    uint64_t guest_cr0, guest_cr3, guest_cr4;
    uint64_t guest_efer;

    uint64_t guest_es_sel, guest_es_base, guest_es_lim, guest_es_access;
    uint64_t guest_cs_sel, guest_cs_base, guest_cs_lim, guest_cs_access;
    uint64_t guest_ss_sel, guest_ss_base, guest_ss_lim, guest_ss_access;
    uint64_t guest_ds_sel, guest_ds_base, guest_ds_lim, guest_ds_access;
    uint64_t guest_fs_sel, guest_fs_base, guest_fs_lim, guest_fs_access;
    uint64_t guest_gs_sel, guest_gs_base, guest_gs_lim, guest_gs_access;
    uint64_t guest_tr_sel, guest_tr_base, guest_tr_lim, guest_tr_access;
    uint64_t guest_ldtr_sel, guest_ldtr_base, guest_ldtr_lim, guest_ldtr_access;
    uint64_t guest_idtr_base, guest_idtr_lim;
    uint64_t guest_gdtr_base, guest_gdtr_lim;

    errval_t err = vmread(VMX_GUEST_ES_SEL, &guest_es_sel);
    err += vmread(VMX_GUEST_ES_BASE, &guest_es_base);
    err += vmread(VMX_GUEST_ES_LIM, &guest_es_lim);
    err += vmread(VMX_GUEST_ES_ACCESS, &guest_es_access);
    err += vmread(VMX_GUEST_CS_SEL, &guest_cs_sel);
    err += vmread(VMX_GUEST_CS_BASE, &guest_cs_base);
    err += vmread(VMX_GUEST_CS_LIM, &guest_cs_lim);
    err += vmread(VMX_GUEST_CS_ACCESS, &guest_cs_access);
    err += vmread(VMX_GUEST_SS_SEL, &guest_ss_sel);
    err += vmread(VMX_GUEST_SS_BASE, &guest_ss_base);
    err += vmread(VMX_GUEST_SS_LIM, &guest_ss_lim);
    err += vmread(VMX_GUEST_SS_ACCESS, &guest_ss_access);
    err += vmread(VMX_GUEST_DS_SEL, &guest_ds_sel);
    err += vmread(VMX_GUEST_DS_BASE, &guest_ds_base);
    err += vmread(VMX_GUEST_DS_LIM, &guest_ds_lim);
    err += vmread(VMX_GUEST_DS_ACCESS, &guest_ds_access);
    err += vmread(VMX_GUEST_FS_SEL, &guest_fs_sel);
    err += vmread(VMX_GUEST_FS_BASE, &guest_fs_base);
    err += vmread(VMX_GUEST_FS_LIM, &guest_fs_lim);
    err += vmread(VMX_GUEST_FS_ACCESS, &guest_fs_access);
    err += vmread(VMX_GUEST_GS_SEL, &guest_gs_sel);
    err += vmread(VMX_GUEST_GS_BASE, &guest_gs_base);
    err += vmread(VMX_GUEST_GS_LIM, &guest_gs_lim);
    err += vmread(VMX_GUEST_GS_ACCESS, &guest_gs_access);
    err += vmread(VMX_GUEST_TR_SEL, &guest_tr_sel);
    err += vmread(VMX_GUEST_TR_BASE, &guest_tr_base);
    err += vmread(VMX_GUEST_TR_LIM, &guest_tr_lim);
    err += vmread(VMX_GUEST_TR_ACCESS, &guest_tr_access);
    err += vmread(VMX_GUEST_LDTR_SEL, &guest_ldtr_sel);
    err += vmread(VMX_GUEST_LDTR_BASE, &guest_ldtr_base);
    err += vmread(VMX_GUEST_LDTR_LIM, &guest_ldtr_lim);
    err += vmread(VMX_GUEST_LDTR_ACCESS, &guest_ldtr_access);
    err += vmread(VMX_GUEST_IDTR_BASE, &guest_idtr_base);
    err += vmread(VMX_GUEST_IDTR_LIM, &guest_idtr_lim);
    err += vmread(VMX_GUEST_GDTR_BASE, &guest_gdtr_base);
    err += vmread(VMX_GUEST_GDTR_LIM, &guest_gdtr_lim);

    err += vmread(VMX_GUEST_RIP, &guest_rip);
    err += vmread(VMX_GUEST_RSP, &guest_rsp);
    err += vmread(VMX_GUEST_RFLAGS, &guest_rflags);
    err += vmread(VMX_EXIT_REASON, &reason);
    err += vmread(VMX_EXIT_QUAL, &exit_qual);
    err += vmread(VMX_EXIT_INTR_INFO, &exit_intr_info);
    err += vmread(VMX_EXIT_INTR_ERR, &intr_err);
    err += vmread(VMX_IDT_VEC_INFO, &idt_vec_info);
    err += vmread(VMX_IDT_VEC_ERR, &idt_vec_err);
    err += vmread(VMX_INSTR_ERROR, &instr_error);
    err += vmread(VMX_GPADDR_F, &gpaddr);
    err += vmread(VMX_GL_ADDR, &gladdr);
    err += vmread(VMX_ENTRY_INTR_INFO, &entry_intr_info);
    err += vmread(VMX_GUEST_ACTIV_STATE, &activ_state);
    err += vmread(VMX_GUEST_INTR_STATE, &intr_state);
    err += vmread(VMX_EXIT_INSTR_LEN, &instr_len);
    err += vmread(VMX_EXIT_INSTR_INFO, &instr_info);
    err += vmread(VMX_GUEST_CR0, &guest_cr0);
    err += vmread(VMX_GUEST_CR3, &guest_cr3);
    err += vmread(VMX_GUEST_CR4, &guest_cr4);
    err += vmread(VMX_GUEST_EFER_F, &guest_efer);
    assert(err_is_ok(err));

    printf("VMCS info:\n");
    printf("\tvmexit reason = %d\n", (int)reason & 0xFFFF);
    printf("\texit qualification = 0x%"PRIx64"\n", exit_qual);
    printf("\tBit 31 of reason = %x\n", ((int)reason >> 31) & 1);

    printf("\tVM-exit interruption information = 0x%"PRIx64"\n", exit_intr_info);
    printf("\tVM-exit interruption error = 0x%"PRIx64"\n", intr_err);

    printf("\tVM-entry interruption info = 0x%"PRIx64"\n", entry_intr_info);

    printf("\tIDT vector information = 0x%"PRIx64"\n", idt_vec_info);
    printf("\tIDT vector error = 0x%"PRIx64"\n", idt_vec_err);

    printf("\tInstruction error = 0x%"PRIx64", gladdr = 0x%"PRIx64", gpaddr = 0x%"PRIx64"\n",
           instr_error, gladdr, gpaddr);
    printf("\tActivity state = 0x%"PRIx64", Interruptibility state = 0x%"PRIx64"\n",
           activ_state, intr_state);
    printf("\tVM-exit instruction length = 0x%"PRIx64"\n", instr_len);
    printf("\tVM-exit instruction info = 0x%"PRIx64"\n", instr_info);

    printf("\tguest_rip = 0x%"PRIx64", guest_rflags = 0x%"PRIx64"\n",
           guest_rip, guest_rflags);
    printf("\tRAX=0x%"PRIx64"    RBX=0x%"PRIx64"    RCX=0x%"PRIx64"    RDX=0x%"PRIx64"\n",
           g->regs.rax, g->regs.rbx, g->regs.rcx, g->regs.rdx);
    printf("\tRSP=0x%"PRIx64"    RBP=0x%"PRIx64"    RSI=0x%"PRIx64"    RDI=0x%"PRIx64"\n",
           guest_rsp, g->regs.rbp, g->regs.rsi, g->regs.rdi);
    printf("\tR8 =0x%"PRIx64"    R9 =0x%"PRIx64"    R10=0x%"PRIx64"    R11=0x%"PRIx64"\n",
           g->regs.r8, g->regs.r9, g->regs.r10, g->regs.r11);
    printf("\tR12=0x%"PRIx64"    R13=0x%"PRIx64"    R14=0x%"PRIx64"    R15=0x%"PRIx64"\n",
           g->regs.r12, g->regs.r13, g->regs.r14, g->regs.r15);
    printf("\tCR0=0x%"PRIx64", CR3=0x%"PRIx64", CR4=0x%"PRIx64"\n",
           guest_cr0, guest_cr3, guest_cr4);

    printf("\tES: sel=0x%"PRIx64", base=0x%"PRIx64", lim=0x%"PRIx64", access=0x%"PRIx64"\n",
           guest_es_sel, guest_es_base, guest_es_lim, guest_es_access);
    printf("\tCS: sel=0x%"PRIx64", base=0x%"PRIx64", lim=0x%"PRIx64", access=0x%"PRIx64"\n",
           guest_cs_sel, guest_cs_base, guest_cs_lim, guest_cs_access);
    printf("\tSS: sel=0x%"PRIx64", base=0x%"PRIx64", lim=0x%"PRIx64", access=0x%"PRIx64"\n",
           guest_ss_sel, guest_ss_base, guest_ss_lim, guest_ss_access);
    printf("\tDS: sel=0x%"PRIx64", base=0x%"PRIx64", lim=0x%"PRIx64", access=0x%"PRIx64"\n",
           guest_ds_sel, guest_ds_base, guest_ds_lim, guest_ds_access);
    printf("\tFS: sel=0x%"PRIx64", base=0x%"PRIx64", lim=0x%"PRIx64", access=0x%"PRIx64"\n",
           guest_fs_sel, guest_fs_base, guest_fs_lim, guest_fs_access);
    printf("\tGS: sel=0x%"PRIx64", base=0x%"PRIx64", lim=0x%"PRIx64", access=0x%"PRIx64"\n",
           guest_gs_sel, guest_gs_base, guest_gs_lim, guest_gs_access);
    printf("\tTR: sel=0x%"PRIx64", base=0x%"PRIx64", lim=0x%"PRIx64", access=0x%"PRIx64"\n",
           guest_tr_sel, guest_tr_base, guest_tr_lim, guest_tr_access);
    printf("\tLDTR: sel=0x%"PRIx64", base=0x%"PRIx64", lim=0x%"PRIx64", access=0x%"PRIx64"\n",
           guest_ldtr_sel, guest_ldtr_base, guest_ldtr_lim, guest_ldtr_access);
    printf("\tIDTR: base=0x%"PRIx64", lim=0x%"PRIx64"\n",
           guest_idtr_base, guest_idtr_lim);
    printf("\tGDTR: base=0x%"PRIx64", lim=0x%"PRIx64"\n",
           guest_gdtr_base, guest_gdtr_lim);

    printf("\tEFER = 0x%"PRIx64"\n", guest_efer);
}

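// Extracts the interruption type, bits 10:8 of a VM-exit
// interruption-information field.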
static inline uint64_t interruption_type(uint64_t intr_info) {
    return (intr_info >> 8) & 0x7;
}

static void __attribute__ ((noreturn))
call_monitor(struct dcb *dcb)
{
    ctrl->num_vm_exits_with_monitor_invocation++;
    /* the guest exited not due to an interrupt but some condition the
     * monitor has to handle, therefore notify the monitor */

    assert(dcb->is_vm_guest);

    // disable the domain
    scheduler_remove(dcb);

    // call the monitor
    errval_t err = lmp_deliver_notification(&dcb->guest_desc.monitor_ep.cap);
    if (err_is_fail(err)) {
        printk(LOG_ERR, "Unexpected error delivering VMEXIT");
    }

    // run the monitor
    dispatch(dcb->guest_desc.monitor_ep.cap.u.endpointlmp.listener);
}

__attribute__((unused))
static void dump_page_tables(lpaddr_t root_pt_phys)
{
    lvaddr_t root_pt = local_phys_to_mem(root_pt_phys);
    printk(LOG_NOTE, "dumping page tables rooted at 0x%"PRIxLPADDR"\n", root_pt_phys);

    // loop over pdpts
    union x86_64_ptable_entry *pt;
    size_t kernel_pml4e = X86_64_PML4_BASE(X86_64_MEMORY_OFFSET);
    for (int pdpt_index = 0; pdpt_index < kernel_pml4e; pdpt_index++) {
        union x86_64_pdir_entry *pdpt = (union x86_64_pdir_entry *)root_pt + pdpt_index;
        if (!pdpt->raw) { continue; }
        else {
            genpaddr_t paddr = (genpaddr_t)pdpt->d.base_addr << BASE_PAGE_BITS;
            printf("%d: 0x%"PRIxGENPADDR" (%d %d), raw=0x%"PRIx64"\n",
                    pdpt_index, paddr,
                    pdpt->d.read_write, pdpt->d.user_supervisor,
                    pdpt->raw);
        }
        genpaddr_t pdpt_gp = pdpt->d.base_addr << BASE_PAGE_BITS;
        lvaddr_t pdpt_lv = local_phys_to_mem(gen_phys_to_local_phys(pdpt_gp));

        for (int pdir_index = 0; pdir_index < X86_64_PTABLE_SIZE; pdir_index++) {
            // get pdir
            union x86_64_pdir_entry *pdir = (union x86_64_pdir_entry *)pdpt_lv + pdir_index;
            pt = (union x86_64_ptable_entry*)pdir;
            if (!pdir->raw) { continue; }
            // check if pdir or huge page
            if (pt->huge.always1) {
                // is huge page mapping
                genpaddr_t paddr = (genpaddr_t)pt->huge.base_addr << HUGE_PAGE_BITS;
                printf("%d.%d: 0x%"PRIxGENPADDR" (%d %d %d)\n", pdpt_index,
                        pdir_index, paddr, pt->huge.read_write,
                        pt->huge.dirty, pt->huge.accessed);
                // goto next pdpt entry
                continue;
            } else {
                genpaddr_t paddr = (genpaddr_t)pdir->d.base_addr << BASE_PAGE_BITS;
                printf("%d.%d: 0x%"PRIxGENPADDR" (%d %d), raw=0x%"PRIx64"\n",
                        pdpt_index, pdir_index, paddr,
                        pdir->d.read_write, pdir->d.user_supervisor,
                        pdir->raw);
            }
            genpaddr_t pdir_gp = pdir->d.base_addr << BASE_PAGE_BITS;
            lvaddr_t pdir_lv = local_phys_to_mem(gen_phys_to_local_phys(pdir_gp));

            for (int ptable_index = 0; ptable_index < X86_64_PTABLE_SIZE; ptable_index++) {
                // get ptable
                union x86_64_pdir_entry *ptable = (union x86_64_pdir_entry *)pdir_lv + ptable_index;
                pt = (union x86_64_ptable_entry *)ptable;
                if (!ptable->raw) { continue; }
                // check if ptable or large page
                if (pt->large.always1) {
                    // is large page mapping
                    genpaddr_t paddr = (genpaddr_t)pt->large.base_addr << LARGE_PAGE_BITS;
                    printf("%d.%d.%d: 0x%"PRIxGENPADDR" (%d %d %d)\n",
                            pdpt_index, pdir_index, ptable_index, paddr,
                            pt->large.read_write, pt->large.dirty, pt->large.accessed);
                    // goto next pdir entry
                    continue;
                } else {
                    genpaddr_t paddr = (genpaddr_t)ptable->d.base_addr << BASE_PAGE_BITS;
                    printf("%d.%d.%d: 0x%"PRIxGENPADDR" (%d %d), raw=0x%"PRIx64"\n",
                            pdpt_index, pdir_index, ptable_index, paddr,
                            ptable->d.read_write, ptable->d.user_supervisor,
                            ptable->raw);
                }
                genpaddr_t ptable_gp = ptable->d.base_addr << BASE_PAGE_BITS;
                lvaddr_t ptable_lv = local_phys_to_mem(gen_phys_to_local_phys(ptable_gp));

                for (int entry = 0; entry < X86_64_PTABLE_SIZE; entry++) {
                    union x86_64_ptable_entry *e =
                        (union x86_64_ptable_entry *)ptable_lv + entry;
                    genpaddr_t paddr = (genpaddr_t)e->base.base_addr << BASE_PAGE_BITS;
                    if (!paddr) {
                        continue;
                    }
                    printf("%d.%d.%d.%d: 0x%"PRIxGENPADDR" (%d %d %d), raw=0x%"PRIx64"\n",
                            pdpt_index, pdir_index, ptable_index, entry,
                            paddr, e->base.read_write, e->base.dirty, e->base.accessed,
                            e->raw);
                }
            }
        }
    }
}

struct sysret sys_vmcall(uint64_t syscall, uint64_t arg0, uint64_t arg1,
                         uint64_t *args, uint64_t rflags, uint64_t rip,
                         struct capability *root);

extern uint64_t user_stack_save;

void __attribute__ ((noreturn))
vmx_vmkit_vmenter (struct dcb *dcb)
{
    assert(dcb != NULL);
    assert(dcb->vspace != 0);
    assert(dcb->is_vm_guest);

    errval_t err;
    lpaddr_t lpaddr = gen_phys_to_local_phys(dcb->guest_desc.ctrl.cap.u.frame.base);
    ctrl = (void *)local_phys_to_mem(lpaddr);

    if (ept_enabled()) {
        uint64_t old_eptp_root, old_guest_cr3;
        err = vmread(VMX_EPTP_F, &old_eptp_root);
        err += vmread(VMX_GUEST_CR3, &old_guest_cr3);
        assert(err_is_ok(err));
        // Build the EPTP from dcb->guest_desc.vspace: mask it with the
        // width of the physical address space and clear the low 12 bits;
        // the guest's CR3 is loaded from dcb->vspace below.
        // Bits 2:0 = 6 select the write-back memory type for the EPT
        // paging structures, and bits 5:3 = 3 encode the EPT page-walk
        // length minus one (i.e. a 4-level walk).
        uint64_t eptp_root = 0x6ull | (3 << 3);
        eptp_root |= (dcb->guest_desc.vspace & pa_width_mask()) & ~BASE_PAGE_MASK;
        //printk(LOG_NOTE, "setting EPTP_F to 0x%lx\n", eptp_root);
        if (old_eptp_root != eptp_root) {
            printk(LOG_NOTE, "setting EPTP_F to 0x%lx\n", eptp_root);
            err = vmwrite(VMX_EPTP_F, eptp_root);
            assert(err_is_ok(err));
        }
        if (old_guest_cr3 != dcb->vspace) {
            printk(LOG_NOTE, "setting GUEST_CR3 to 0x%lx\n", dcb->vspace);
            err = vmwrite(VMX_GUEST_CR3, dcb->vspace);
            assert(err_is_ok(err));
        }
        /*
        printk(LOG_NOTE, "doing INVEPT\n");
        uint64_t invept_desc[2] = { 0 };
        invept_desc[0] = eptp_root;
        uint64_t mode = 1;
        __asm volatile("invept %[desc], %[mode]"
                       :
                       : [mode] "r" (mode), [desc] "m" (invept_desc)
                       : "memory");
        */
        //printf("EPT tables:\n");
        //dump_page_tables(eptp_root & ~BASE_PAGE_MASK);
        /*
        printf("GUEST tables:\n");
        dump_page_tables(dcb->guest_desc.vspace);
        */
        //print_vmcs_info(ctrl);
    } else {
        err = vmwrite(VMX_GUEST_CR3, dcb->vspace);
        assert(err_is_ok(err));
    }

vmx_vmenter_loop:

    enter_guest();

    //printk(LOG_NOTE, "VMEXIT\n");

    // vmread stores a full 64-bit field; the basic exit reason is in the
    // low 16 bits. Reading through a cast uint16_t* would clobber the stack.
    uint64_t exit_reason_full;
    err = vmread(VMX_EXIT_REASON, &exit_reason_full);
    uint16_t exit_reason = (uint16_t)exit_reason_full;

    //printk(LOG_NOTE, "vmx exit reason: %u\n", exit_reason);

    switch (exit_reason) {
        case VMX_EXIT_REASON_INVAL_VMCS:
            {
                // A condition that violates one of the processor's checks
                // may arise during execution of the guest. With the Linux
                // guest we used, the GS limit is set to 0x10ffef, which
                // causes one of the checks to fail.
                uint64_t gs_lim;
                err += vmread(VMX_GUEST_GS_LIM, &gs_lim);
                assert(gs_lim == 0x10ffef);
                err += vmwrite(VMX_GUEST_GS_LIM, 0xfffef);
                assert(err_is_ok(err));
            }
            goto vmx_vmenter_loop;

        case VMX_EXIT_REASON_EXCEPTION:
            {
                uint64_t intr_info, type;
                err += vmread(VMX_EXIT_INTR_INFO, &intr_info);
                assert(err_is_ok(err));

                type = interruption_type(intr_info);

                if (type != TYPE_NMI) {
                    //printk(LOG_NOTE, "REASON: EXCEPTION, type: %lu, vec: %lu\n",
                    //        type, intr_info & 0xF);
                    call_monitor(dcb);
                    break;
                }
            }
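            // fall through: NMIs are handled like external interrupts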
        case VMX_EXIT_REASON_EXT_INTR:
        case VMX_EXIT_REASON_SMI:
            {
                ctrl->num_vm_exits_without_monitor_invocation++;

#ifdef CONFIG_ARRAKISMON
                //printf("EXIT_REASON: INTR || SMI\n");
                uint64_t guest_rip, guest_rsp, guest_rflags;
                err += vmread(VMX_GUEST_RIP, &guest_rip);
                err += vmread(VMX_GUEST_RSP, &guest_rsp);
                err += vmread(VMX_GUEST_RFLAGS, &guest_rflags);

                uint64_t guest_fs_sel, guest_gs_sel;
                err += vmread(VMX_GUEST_FS_SEL, &guest_fs_sel);
                err += vmread(VMX_GUEST_GS_SEL, &guest_gs_sel);
                assert(err_is_ok(err));

                arch_registers_state_t *area = NULL;

                // Store user state into corresponding save area
                if (dispatcher_is_disabled_ip(dcb->disp, guest_rip)) {
                    area = dispatcher_get_disabled_save_area(dcb->disp);
                    dcb->disabled = true;
                } else {
                    area = dispatcher_get_enabled_save_area(dcb->disp);
                    dcb->disabled = false;
                }
                memcpy(area, &ctrl->regs, sizeof(arch_registers_state_t));
                area->rip = guest_rip;
                area->rax = ctrl->regs.rax;
                area->rsp = guest_rsp;
                area->eflags = guest_rflags;
                area->fs = guest_fs_sel;
                area->gs = guest_gs_sel;
#endif
                wait_for_interrupt();
            }
            break;
#ifdef CONFIG_ARRAKISMON
        case VMX_EXIT_REASON_VMCALL:
            {
                // Translate this to a SYSCALL
                struct registers_x86_64 *regs = &ctrl->regs;
                uint64_t args[10] = {
                    regs->r10, regs->r8, regs->r9, regs->r12, regs->r13, regs->r14,
                    regs->r15, regs->rax, regs->rbp, regs->rbx
                };

                //printf("VMMCALL: %lu %lx %lx\n", regs->rdi, regs->rsi, regs->rdx);

                uint64_t guest_rip, guest_rsp, guest_rflags, instr_len;
                err += vmread(VMX_GUEST_RIP, &guest_rip);
                err += vmread(VMX_GUEST_RSP, &guest_rsp);
                err += vmread(VMX_GUEST_RFLAGS, &guest_rflags);
                // Advance guest RIP to next instruction
                err += vmread(VMX_EXIT_INSTR_LEN, &instr_len);
                assert(err_is_ok(err));
                err += vmwrite(VMX_GUEST_RIP, guest_rip + instr_len);
                assert(err_is_ok(err));

                user_stack_save = guest_rsp;

                //printf("doing VMMCALL: %lu %lx %lx\n", regs->rdi, regs->rsi, regs->rdx);

                struct sysret ret = sys_vmcall(regs->rdi, regs->rsi, regs->rdx,
                        args, guest_rflags, guest_rip + instr_len, &dcb->cspace.cap);

                //printf("VMMCALL done\n");

                regs->rax = ret.error;
                regs->rdx = ret.value;
            }
            goto vmx_vmenter_loop;
#endif
        default:
            //printk(LOG_NOTE, "EXIT_REASON: %d\n", exit_reason);
#if 0
            if (exit_reason == VMX_EXIT_REASON_EPT_FAULT) {
                uint64_t fault_addr, guest_rip, exit_qual;
                err = vmread(VMX_GPADDR_F, &fault_addr);
                err += vmread(VMX_GUEST_RIP, &guest_rip);
                err += vmread(VMX_EXIT_QUAL, &exit_qual);
                //err += vmread(VMX_PF_ERR_MATCH
                assert(err_is_ok(err));
                printk(LOG_NOTE, "exit qualification: 0x%lx\n", exit_qual);
                printk(LOG_NOTE, "guest page fault on 0x%lx, IP 0x%lx\n",
                        fault_addr, guest_rip);
                paging_dump_tables(dcb);
            }
#endif
            call_monitor(dcb);
            break;
    }
}