/**
 * \file
 * \brief Contains the VMKit kernel interface for the version using VMX extensions.
 */

/*
 * Copyright (c) 2014, University of Washington.
 * All rights reserved.
 *
 * This file is distributed under the terms in the attached LICENSE file.
 * If you do not find this file, copies can be found by writing to:
 * ETH Zurich D-INFK, CAB F.78, Universitaetstr. 6, CH-8092 Zurich.
 * Attn: Systems Group.
 */

#include <string.h>
#include <kernel.h>
#include <paging_kernel_arch.h>
#include <vmx_vmkit.h>
#include <vmx_checks.h>
#include <x86.h>
#include <dispatch.h>
#include <exec.h>
#include <irq.h>
#include <barrelfish_kpi/vmkit.h>
#include <barrelfish_kpi/syscalls.h>

#include <dev/ia32_dev.h>

// Execution, entry, and exit controls that we want to use
// for each VM
#ifdef CONFIG_ARRAKISMON
#define GUEST_PIN_BASE_CTLS_ENABLE \
    (PIN_CTLS_EXT_INTR | PIN_CTLS_NMI | PIN_CTLS_VIRT_NMI)

#define GUEST_PIN_BASE_CTLS_DISABLE \
    (0)

#define GUEST_PP_CTLS_ENABLE \
    (PP_CLTS_MSRBMP | PP_CLTS_IOBMP | PP_CLTS_HLT)

#define GUEST_PP_CTLS_DISABLE \
    (0)

#define GUEST_SP_CTLS_ENABLE \
    (0)

#define GUEST_SP_CTLS_DISABLE \
    (0)

#define GUEST_EXIT_CTLS_ENABLE \
    (EXIT_CLTS_HOST_SIZE | EXIT_CLTS_SAVE_EFER | EXIT_CLTS_LOAD_EFER)

#define GUEST_EXIT_CTLS_DISABLE \
    (0)

#define GUEST_ENTRY_CTLS_ENABLE \
    (ENTRY_CLTS_LOAD_EFER | ENTRY_CLTS_LOAD_DBG | ENTRY_CLTS_IA32E_MODE)

#define GUEST_ENTRY_CTLS_DISABLE \
    (0)
#else
#define GUEST_PIN_BASE_CTLS_ENABLE \
    (PIN_CTLS_EXT_INTR | PIN_CTLS_NMI | PIN_CTLS_VIRT_NMI)

#define GUEST_PIN_BASE_CTLS_DISABLE \
    (0)

#define GUEST_PP_CTLS_ENABLE \
    (PP_CLTS_MSRBMP | PP_CLTS_IOBMP | PP_CLTS_HLT | PP_CLTS_SEC_CTLS)

#define GUEST_PP_CTLS_DISABLE \
    (0)

#define GUEST_SP_CTLS_ENABLE \
    (SP_CLTS_ENABLE_EPT | SP_CLTS_UNRSTD_GUEST)

#define GUEST_SP_CTLS_DISABLE \
    (0)

#define GUEST_EXIT_CTLS_ENABLE \
    (EXIT_CLTS_HOST_SIZE | EXIT_CLTS_SAVE_EFER | EXIT_CLTS_LOAD_EFER | \
     EXIT_CLTS_SAVE_PAT  | EXIT_CLTS_LOAD_PAT)

#define GUEST_EXIT_CTLS_DISABLE \
    (0)

#define GUEST_ENTRY_CTLS_ENABLE \
    (ENTRY_CLTS_LOAD_EFER)

#define GUEST_ENTRY_CTLS_DISABLE \
    (0)
#endif

extern void *vmx_return_func;

static struct guest_control *ctrl = NULL;

static int launched = 0;

#ifndef CONFIG_ARRAKISMON
// List of MSRs that are loaded on VM-exit.
static uint32_t msr_list[VMX_MSR_COUNT] =
    {MSR_KERNEL_GS_BASE, MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SFMASK};

// VM-exit MSR-load area that contains host MSR values that are saved prior
// to VM-entry and loaded on VM-exit.
static struct msr_entry host_msr_area[VMX_MSR_COUNT]
__attribute__ ((aligned(16)));
#endif

// VMX controls that are written to the VMCS. In addition to the controls
// that are requested, these values may also have reserved bits set.
vmx_controls pin_based_ctls = 0, pp_based_ctls = 0, sp_based_ctls = 0,
    entry_ctls = 0, exit_ctls = 0;

static uint8_t vmxon_region[BASE_PAGE_SIZE]
__attribute__ ((aligned(BASE_PAGE_SIZE)));

// Returns true if extended page tables (EPT) are enabled.
static inline int ept_enabled(void)
{
    return ((GUEST_SP_CTLS_ENABLE & SP_CLTS_ENABLE_EPT) != 0);
}

// Decodes the status of the last VMX instruction from RFLAGS: CF set means
// VMfailInvalid, ZF set means VMfailValid, and both clear means success.
static inline errval_t instr_err(void)
{
    errval_t err;
    __asm volatile("jnc vmx_err_check_zf%=\n\t"
		   "mov %[VMfailInvalid], %[err]\n\t"
		   "jmp vmx_err_done%=\n\t"
		   "vmx_err_check_zf%=:\n\t"
		   "jnz vmx_err_succeed%=\n\t"
		   "mov %[VMfailValid], %[err]\n\t"
		   "jmp vmx_err_done%=\n\t"
		   "vmx_err_succeed%=:\n\t"
		   "mov %[VMsucceed], %[err]\n\t"
		   "vmx_err_done%=:\n\t"
		   : [err] "=r" (err)
		   : [VMfailInvalid] "i" (SYS_ERR_VMKIT_VMX_VMFAIL_INVALID),
		     [VMfailValid] "i" (SYS_ERR_VMKIT_VMX_VMFAIL_VALID),
		     [VMsucceed] "i" (SYS_ERR_OK)
		   : "memory");
    return err;
}
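
/*
 * Every VMX instruction wrapper below follows the same pattern: execute the
 * instruction and decode RFLAGS with instr_err() immediately, before any
 * intervening code can clobber the flags. A minimal sketch (illustrative
 * only, mirroring vmclear() below):
 *
 *     __asm volatile("vmclear %[base]" : : [base] "m" (base) : "memory");
 *     return instr_err();
 */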

// Executes the vmptrld instruction, which makes the VMCS referenced by
// 'vmcs_base' active and current.
errval_t vmptrld(lpaddr_t vmcs_base)
{
    __asm volatile("vmptrld %[vmcs_base]\n\t"
		   :
		   : [vmcs_base] "m" (vmcs_base)
		   : "memory");
    return instr_err();
}

// Returns the physical address base of the current VMCS.
lpaddr_t vmptrst(void)
{
    lpaddr_t dest_addr;
    __asm volatile("vmptrst %[dest_addr]\n\t"
		   : [dest_addr] "=m" (dest_addr)
		   :
		   : "memory");
    return dest_addr;
}

// Executes the vmclear instruction, which makes the VMCS referenced
// by 'vmcs_base' clear and inactive. This instruction also ensures
// that the referenced VMCS data is saved.
errval_t vmclear(lpaddr_t vmcs_base)
{
    __asm volatile("vmclear %[vmcs_base]\n\t"
		   :
		   : [vmcs_base] "m" (vmcs_base)
		   : "memory");
    return instr_err();
}

// Reads a component with a specified encoding from the current VMCS
// to the address 'dest_addr' using the vmread instruction.
errval_t vmread(uintptr_t encoding, lvaddr_t *dest_addr)
{
    __asm volatile("vmread %[encoding], %[dest_addr]\n\t"
		   : [dest_addr] "=m" (*dest_addr)
		   : [encoding] "r" (encoding)
		   : "memory");
    return instr_err();
}

// Writes a component with a specified encoding and value to the current
// VMCS using the vmwrite instruction.
errval_t vmwrite(uintptr_t encoding, uintptr_t value)
{
    __asm volatile("vmwrite %[value], %[encoding]\n\t"
		   :
		   : [encoding] "r" (encoding), [value] "r" (value)
		   : "memory");
    return instr_err();
}

// Using a provided VMXON region, causes the logical processor to enter
// VMX root operation by executing the vmxon instruction.
errval_t vmxon(lpaddr_t base_addr)
{
    __asm volatile("vmxon %[base_addr]\n\t"
		   :
		   : [base_addr] "m" (base_addr)
		   : "memory");
    return instr_err();
}

// Exits VMX operation by executing the vmxoff instruction.
errval_t vmxoff(void)
{
    __asm volatile("vmxoff");
    return instr_err();
}

// Reads and returns the MSR that reports the allowed settings
// for ALL of the bits of the controls indicated by 'type'.
static uint64_t msr_ctls_true(enum vmx_ctls_t type)
{
    uint64_t true_msr = 0;
    switch(type) {
    case VMX_CTLS_PIN_BASED:
        true_msr = ia32_vmx_true_pinbased_ctls_rd(NULL);
	break;
    case VMX_CTLS_PRIMARY_PROCESSOR:
        true_msr = ia32_vmx_true_ppbased_ctls_rd(NULL);
	break;
    case VMX_CTLS_SECONDARY_PROCESSOR:
        assert(!"No such MSR for secondary processor controls!\n");
	break;
    case VMX_CTLS_EXIT:
        true_msr = ia32_vmx_true_exit_ctls_rd(NULL);
	break;
    case VMX_CTLS_ENTRY:
        true_msr = ia32_vmx_true_entry_ctls_rd(NULL);
	break;
    }
    return true_msr;
}

// Reads and returns the MSR that reports the allowed settings
// for MOST of the bits of the controls indicated by 'type'.
static uint64_t msr_ctls(enum vmx_ctls_t type)
{
    uint64_t msr = 0;
    switch(type) {
    case VMX_CTLS_PIN_BASED:
        msr = ia32_vmx_pinbased_ctls_rd(NULL);
	break;
    case VMX_CTLS_PRIMARY_PROCESSOR:
        msr = ia32_vmx_ppbased_ctls_rd(NULL);
	break;
    case VMX_CTLS_SECONDARY_PROCESSOR:
        msr = ia32_vmx_spbased_ctls_rd(NULL);
	break;
    case VMX_CTLS_EXIT:
        msr = ia32_vmx_exit_ctls_rd(NULL);
	break;
    case VMX_CTLS_ENTRY:
        msr = ia32_vmx_entry_ctls_rd(NULL);
	break;
    }
    return msr;
}

// Computes the value of the controls indicated by 'type' using 'mask_1s'
// and 'mask_0s', which correspond to the controls that should be enabled
// and disabled, respectively. In the capability MSRs, the low dword gives
// the allowed-0 settings (bits that must be 1) and the high dword gives
// the allowed-1 settings (bits that may be 1).
static uint32_t set_vmx_controls(uint32_t mask_1s,
    uint32_t mask_0s, enum vmx_ctls_t type)
{
    uint32_t controls = 0;

    ia32_vmx_basic_t vmx_basic = ia32_vmx_basic_rd(NULL);
    bool true_ctls = !!(ia32_vmx_basic_ctls_clear_extract(vmx_basic));
    if (true_ctls && (type != VMX_CTLS_SECONDARY_PROCESSOR)) {
        uint64_t true_msr = msr_ctls_true(type);
	controls = ((DWORD_LS(true_msr) | mask_1s) & DWORD_MS(true_msr));
    } else {
        uint64_t msr = msr_ctls(type);
	controls = ((DWORD_LS(msr) | mask_1s) & DWORD_MS(msr));
    }
    assert((mask_1s & (~controls)) == 0);
    assert((mask_0s & controls) == 0);
    return controls;
}
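
/*
 * Worked example with made-up MSR contents: if the relevant capability MSR
 * reads 0xFFFFFFFF00000016, then DWORD_LS = 0x16 (bits 1, 2 and 4 must be
 * set) and DWORD_MS = 0xFFFFFFFF (every bit may be set). Requesting
 * mask_1s = 0x80 yields
 *
 *     controls = ((0x16 | 0x80) & 0xFFFFFFFF) = 0x96,
 *
 * i.e. the requested bit plus the bits the hardware insists on.
 */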

/**
 * \brief Tries to enable hardware-assisted virtualization.
 *
 * Checks whether hardware-assisted virtualization is available on the platform
 * and enables this feature.
 *
 * \return SYS_ERR_OK on successful initialization of the subsystem, or
 *         SYS_ERR_VMKIT_UNAVAIL if virtualization is unavailable.
 */
errval_t vmx_enable_virtualization (void)
{
    uint32_t cpuid_ecx;
    cpuid(CPUID_VMX, NULL, NULL, &cpuid_ecx, NULL);
    if (!(cpuid_ecx & VMX_SUPPORT)) {
        return SYS_ERR_VMKIT_UNAVAIL;
    }

    // The 'lock' and 'enable VMX outside SMX operation' bits of the
    // IA32_FEATURE_CONTROL MSR must be set
    ia32_feature_cntl_t feat_cntl_msr;
    feat_cntl_msr = ia32_feature_cntl_rd(NULL);
    if (!ia32_feature_cntl_lock_extract(feat_cntl_msr) ||
	!ia32_feature_cntl_vmxoutsmx_extract(feat_cntl_msr)) {
        return SYS_ERR_VMKIT_UNAVAIL;
    }

    pin_based_ctls = set_vmx_controls(
        GUEST_PIN_BASE_CTLS_ENABLE, GUEST_PIN_BASE_CTLS_DISABLE, VMX_CTLS_PIN_BASED);

    pp_based_ctls = set_vmx_controls(
	GUEST_PP_CTLS_ENABLE, GUEST_PP_CTLS_DISABLE, VMX_CTLS_PRIMARY_PROCESSOR);

    sp_based_ctls = set_vmx_controls(
	GUEST_SP_CTLS_ENABLE, GUEST_SP_CTLS_DISABLE, VMX_CTLS_SECONDARY_PROCESSOR);

    entry_ctls = set_vmx_controls(
        GUEST_ENTRY_CTLS_ENABLE, GUEST_ENTRY_CTLS_DISABLE, VMX_CTLS_ENTRY);

    exit_ctls = set_vmx_controls(
        GUEST_EXIT_CTLS_ENABLE, GUEST_EXIT_CTLS_DISABLE, VMX_CTLS_EXIT);

    // Initialize the VMXON region
    memset(vmxon_region, 0x0, BASE_PAGE_SIZE);
    ia32_vmx_basic_t vmx_basic;
    vmx_basic = ia32_vmx_basic_rd(NULL);
    uint32_t vmcs_rev_id = ia32_vmx_basic_vmcs_rev_id_extract(vmx_basic);
    memcpy(vmxon_region, &vmcs_rev_id, sizeof(uint32_t));

    // The logical processor must be in protected mode with paging enabled
    uint64_t cr0 = rdcr0();
    if ((cr0 & CR0_PE) == 0 || (cr0 & CR0_PG) == 0) {
        return SYS_ERR_VMKIT_UNAVAIL;
    }

    // The CR0 register value has to support all of the CR0 fixed bits
    if (cr0 != vmx_fixed_cr0()) {
        return SYS_ERR_VMKIT_UNAVAIL;
    }

    // Enable virtualization, if not already enabled
    if (!vmx_enabled()) {
        enable_vmx();
    }
    // The CR4 register value has to support all of the CR4 fixed bits
    if (rdcr4() != vmx_fixed_cr4()) {
        return SYS_ERR_VMKIT_UNAVAIL;
    }

    // Execute VMXON to place the processor into VMX root operation
    errval_t err = vmxon(mem_to_local_phys((lvaddr_t)vmxon_region));
    assert(err_is_ok(err));

    return SYS_ERR_OK;
}

static inline void vmx_set_exception_bitmap(void)
{
    // Cause a VM-exit on every exception except vector 7 (#NM)
    errval_t err = vmwrite(VMX_EXCP_BMP, ~(1UL << 7));
    assert(err_is_ok(err));
}

#ifndef CONFIG_ARRAKISMON
static uint64_t vmx_read_msr(uint32_t index) {
    uint64_t val = 0;
    switch (index) {
    case MSR_KERNEL_GS_BASE:
        val = ia32_kernel_gs_base_rd(NULL);
	break;
    case MSR_STAR:
        val = ia32_star_rd(NULL);
	break;
    case MSR_LSTAR:
        val = ia32_lstar_rd(NULL);
	break;
    case MSR_CSTAR:
        val = ia32_cstar_rd(NULL);
	break;
    case MSR_SFMASK:
        val = ia32_fmask_rd(NULL);
	break;
    default:
        assert(!"MSR index not supported");
    }
    return val;
}

static void vmx_host_msr_area_init(struct msr_entry *msr_area)
{
    for (int i = 0; i < VMX_MSR_COUNT; i++) {
        msr_area[i].index = msr_list[i];
	msr_area[i].val = vmx_read_msr(msr_list[i]);
    }
}
#endif

static inline lpaddr_t mem_to_local_phys_no_assertion(lvaddr_t addr)
{
    return (lpaddr_t)(addr - (lpaddr_t)X86_64_MEMORY_OFFSET);
}

// Writes the host state, which is used after a VM-exit, to the
// current VMCS
static void vmx_set_host_state(void)
{
    // On a page-fault the processor checks whether:
    // (#PF error-code) & (#PF error-code mask) == (#PF error-code match)

    // Setting the mask to 0, the match to 0xFFFFFFFF, and bit 14 in the
    // exception bitmap results in no VM-exits on guest page-faults.
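    // Worked example: a guest #PF with error code 0x2 is checked as
    // (0x2 & 0x0) == 0x0, which differs from the match value 0xFFFFFFFF,
    // so the fault is delivered to the guest without causing a VM-exit.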
    errval_t err = vmwrite(VMX_PF_ERR_MASK, 0);
    err += vmwrite(VMX_PF_ERR_MATCH, 0xFFFFFFFF);
    err += vmwrite(VMX_CR3_TARGET_CNT, 0);

    uint64_t cr0 = rdcr0(), cr3 = rdcr3(), cr4 = rdcr4();

    uint64_t cr0_fixed0 = ia32_vmx_cr0_fixed0_rd(NULL);
    uint64_t cr0_fixed1 = ia32_vmx_cr0_fixed1_rd(NULL);
    uint64_t cr4_fixed0 = ia32_vmx_cr4_fixed0_rd(NULL);
    uint64_t cr4_fixed1 = ia32_vmx_cr4_fixed1_rd(NULL);

    assert((~cr0 & cr0_fixed0) == 0);
    assert((cr0 & ~cr0_fixed1) == 0);
    assert((~cr4 & cr4_fixed0) == 0);
    assert((cr4 & ~cr4_fixed1) == 0);

    assert(((cr0 | cr0_fixed0) & cr0_fixed1) == cr0);
    assert(((cr4 | cr4_fixed0) & cr4_fixed1) == cr4);
    assert(rdcr4() & CR4_PAE);

    err += vmwrite(VMX_HOST_CR0, cr0);
    err += vmwrite(VMX_HOST_CR3, cr3);
    err += vmwrite(VMX_HOST_CR4, cr4);

    // Host selectors must have RPL = 0 and TI = 0, hence the ~0x7 mask
    err += vmwrite(VMX_HOST_ES_SEL, rd_es() & ~0x7);
    err += vmwrite(VMX_HOST_CS_SEL, rd_cs() & ~0x7);
    err += vmwrite(VMX_HOST_SS_SEL, rd_ss() & ~0x7);
    err += vmwrite(VMX_HOST_DS_SEL, rd_ds() & ~0x7);
    err += vmwrite(VMX_HOST_TR_SEL, rd_tr() & ~0x7);

    err += vmwrite(VMX_HOST_TR_BASE, tr_addr(rd_tr(), gdtr_addr(rd_gdtr())));
    err += vmwrite(VMX_HOST_GDTR_BASE, gdtr_addr(rd_gdtr()));
    err += vmwrite(VMX_HOST_IDTR_BASE, idtr_addr(rd_idtr()));
    err += vmwrite(VMX_HOST_SYSENTER_CS, 0);
    err += vmwrite(VMX_HOST_SYSENTER_ESP, 0);
    err += vmwrite(VMX_HOST_SYSENTER_EIP, 0);
    err += vmwrite(VMX_HOST_PAT_F, ia32_cr_pat_rd(NULL));

    ia32_efer_t efer_msr = ia32_efer_rd(NULL);
    err += vmwrite(VMX_HOST_EFER_F, efer_msr);
    assert(ia32_efer_lme_extract(efer_msr));
    assert(ia32_efer_lma_extract(efer_msr));

    err += vmwrite(VMX_HOST_GS_SEL, 0x0);
    err += vmwrite(VMX_HOST_GS_BASE, 0x0);

    err += vmwrite(VMX_HOST_FS_SEL, 0x0);
    err += vmwrite(VMX_HOST_FS_BASE, 0x0);

    err += vmwrite(VMX_HOST_RIP, (uint64_t)(&vmx_return_func));
#ifndef CONFIG_ARRAKISMON
    vmx_host_msr_area_init(host_msr_area);

    lpaddr_t msr_area_base = mem_to_local_phys_no_assertion(
            (lvaddr_t) host_msr_area);
    if (!((lvaddr_t) host_msr_area >= X86_64_MEMORY_OFFSET)) {
        printk(LOG_NOTE, "assertion failed! 0x%lx >= 0x%lx\n",
                (lvaddr_t) host_msr_area,
                X86_64_MEMORY_OFFSET);
    }
    err += vmwrite(VMX_EXIT_MSR_LOAD_F, canonical_form(msr_area_base));
    err += vmwrite(VMX_EXIT_MSR_LOAD_CNT, VMX_MSR_COUNT);
#endif
    assert(err_is_ok(err));
}

// Writes the VMX controls to the current VMCS.
void vmx_set_exec_ctls(void)
{
    // VM-execution controls
    errval_t err = vmwrite(VMX_EXEC_PIN_BASED, pin_based_ctls);
    err += vmwrite(VMX_EXEC_PRIM_PROC, pp_based_ctls);
    err += vmwrite(VMX_EXEC_SEC_PROC, sp_based_ctls);

    // VM-entry and VM-exit control fields
    err += vmwrite(VMX_EXIT_CONTROLS, exit_ctls);
    err += vmwrite(VMX_ENTRY_CONTROLS, entry_ctls);

    vmx_set_exception_bitmap();

    err += vmwrite(VMX_ENTRY_INTR_INFO, 0);
    err += vmwrite(VMX_ENTRY_EXCP_ERR, 0);
    err += vmwrite(VMX_ENTRY_INSTR_LEN, 0);
    assert(err_is_ok(err));
}

errval_t initialize_vmcs(lpaddr_t vmcs_paddr)
{
    struct vmcs *vmcs = (struct vmcs *)local_phys_to_mem(vmcs_paddr);

    ia32_vmx_basic_t vmx_basic;
    vmx_basic = ia32_vmx_basic_rd(NULL);
    uint32_t vmcs_rev_id = ia32_vmx_basic_vmcs_rev_id_extract(vmx_basic);

    memset(vmcs, 0x0, BASE_PAGE_SIZE);
    vmcs->prelude.p.revision_id = vmcs_rev_id;
    vmcs->prelude.p.shadow = 0;
    errval_t err = vmclear(vmcs_paddr);
    err += vmptrld(vmcs_paddr);

    // No VMCS shadowing: the VMCS link pointer is set to all ones
    err += vmwrite(VMX_GUEST_VMCS_LPTR_F, ~0x0);
    err += vmwrite(VMX_GUEST_VMCS_LPTR_H, ~0x0);
    err += vmwrite(VMX_GUEST_SYSENTER_CS, 0x0);
    err += vmwrite(VMX_GUEST_SYSENTER_ESP, 0x0);
    err += vmwrite(VMX_GUEST_SYSENTER_EIP, 0x0);
#ifdef CONFIG_ARRAKISMON
    err += vmwrite(VMX_GUEST_DR7, 0x0);
    err += vmwrite(VMX_GUEST_EFER_F, ia32_efer_rd(NULL) | EFER_LME | EFER_LMA);

    err += vmwrite(VMX_GUEST_ACTIV_STATE, 0x0);
    err += vmwrite(VMX_GUEST_INTR_STATE, 0x0);

    err += vmwrite(VMX_GUEST_CS_LIM, 0xFFFFFFFF);
    err += vmwrite(VMX_GUEST_DS_LIM, 0xFFFFFFFF);
    err += vmwrite(VMX_GUEST_ES_LIM, 0xFFFFFFFF);
    err += vmwrite(VMX_GUEST_SS_LIM, 0xFFFFFFFF);
    err += vmwrite(VMX_GUEST_FS_LIM, 0xFFFFFFFF);
    err += vmwrite(VMX_GUEST_GS_LIM, 0xFFFFFFFF);
    err += vmwrite(VMX_GUEST_TR_LIM, 0xFFFF);
    err += vmwrite(VMX_GUEST_LDTR_LIM, 0xFFFF);
    err += vmwrite(VMX_GUEST_GDTR_LIM, 0xFFFF);
    err += vmwrite(VMX_GUEST_IDTR_LIM, 0xFFFF);

    err += vmwrite(VMX_GUEST_CS_ACCESS, 0xA09B);
    err += vmwrite(VMX_GUEST_DS_ACCESS, 0xC093);
    err += vmwrite(VMX_GUEST_ES_ACCESS, 0xC093);
    err += vmwrite(VMX_GUEST_FS_ACCESS, 0xC093);
    err += vmwrite(VMX_GUEST_GS_ACCESS, 0xC093);
    err += vmwrite(VMX_GUEST_SS_ACCESS, 0xC093);
    err += vmwrite(VMX_GUEST_TR_ACCESS, 0x8B);
    err += vmwrite(VMX_GUEST_LDTR_ACCESS, 0x82);
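    // The access-rights values above use the VMCS encoding: 0xA09B is a
    // present, accessed, execute/read code segment with the L (64-bit) and
    // G bits set; 0xC093 is a present, accessed, read/write data segment
    // with D/B and G set; 0x8B denotes a busy TSS and 0x82 an LDT.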

    err += vmwrite(VMX_GUEST_CS_SEL, 0x8);
    err += vmwrite(VMX_GUEST_SS_SEL, 0x10);
    err += vmwrite(VMX_GUEST_DS_SEL, 0x10);
    err += vmwrite(VMX_GUEST_ES_SEL, 0x10);
    err += vmwrite(VMX_GUEST_FS_SEL, 0x10);
    err += vmwrite(VMX_GUEST_GS_SEL, 0x10);
    err += vmwrite(VMX_GUEST_TR_SEL, 0x10);
    err += vmwrite(VMX_GUEST_LDTR_SEL, 0x10);

    err += vmwrite(VMX_GUEST_CS_BASE, 0x0);
    err += vmwrite(VMX_GUEST_SS_BASE, 0x0);
    err += vmwrite(VMX_GUEST_DS_BASE, 0x0);
    err += vmwrite(VMX_GUEST_ES_BASE, 0x0);
    err += vmwrite(VMX_GUEST_FS_BASE, 0x0);
    err += vmwrite(VMX_GUEST_GS_BASE, 0x0);
    err += vmwrite(VMX_GUEST_TR_BASE, 0x0);
    err += vmwrite(VMX_GUEST_LDTR_BASE, 0x0);
    err += vmwrite(VMX_GUEST_GDTR_BASE, 0x0);
    err += vmwrite(VMX_GUEST_IDTR_BASE, 0x0);

    uint64_t guest_cr0 = 0x60000010 | CR0_PE | CR0_PG;
    err += vmwrite(VMX_GUEST_CR0, (uint32_t)(guest_cr0 | ia32_vmx_cr0_fixed0_rd(NULL)) &
		   ia32_vmx_cr0_fixed1_rd(NULL));

    uint64_t guest_cr4 = CR4_PAE;
    err += vmwrite(VMX_GUEST_CR4, (guest_cr4 | ia32_vmx_cr4_fixed0_rd(NULL)) &
		   ia32_vmx_cr4_fixed1_rd(NULL));

    err += vmwrite(VMX_CR0_GH_MASK, 0UL);
    err += vmwrite(VMX_CR4_GH_MASK, 0UL);
#else
    err += vmwrite(VMX_GUEST_DR7, 0x400);
    err += vmwrite(VMX_GUEST_EFER_F, 0x0);
    err += vmwrite(VMX_GUEST_PAT_F, 0x0007040600070406ul);

    err += vmwrite(VMX_GUEST_ACTIV_STATE, 0x0);
    err += vmwrite(VMX_GUEST_INTR_STATE, 0x0);

    err += vmwrite(VMX_GUEST_CS_LIM, 0xFFFF);
    err += vmwrite(VMX_GUEST_DS_LIM, 0xFFFF);
    err += vmwrite(VMX_GUEST_ES_LIM, 0xFFFF);
    err += vmwrite(VMX_GUEST_FS_LIM, 0xFFFF);
    err += vmwrite(VMX_GUEST_GS_LIM, 0xFFFF);
    err += vmwrite(VMX_GUEST_SS_LIM, 0xFFFF);
    err += vmwrite(VMX_GUEST_TR_LIM, 0xFFFF);
    err += vmwrite(VMX_GUEST_LDTR_LIM, 0xFFFF);
    err += vmwrite(VMX_GUEST_GDTR_LIM, 0xFFFF);
    err += vmwrite(VMX_GUEST_IDTR_LIM, 0xFFFF);

    err += vmwrite(VMX_GUEST_CS_ACCESS, 0x9B);
    err += vmwrite(VMX_GUEST_DS_ACCESS, 0x93);
    err += vmwrite(VMX_GUEST_ES_ACCESS, 0x93);
    err += vmwrite(VMX_GUEST_FS_ACCESS, 0x93);
    err += vmwrite(VMX_GUEST_GS_ACCESS, 0x93);
    err += vmwrite(VMX_GUEST_SS_ACCESS, 0x93);
    err += vmwrite(VMX_GUEST_TR_ACCESS, 0x8B);
    err += vmwrite(VMX_GUEST_LDTR_ACCESS, 0x82);
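    // These access rights describe 16-bit, real-mode-style segments (D/B and
    // G clear): 0x9B is a present, accessed, execute/read code segment and
    // 0x93 a present, accessed, read/write data segment; 0x8B denotes a busy
    // TSS and 0x82 an LDT.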

    err += vmwrite(VMX_GUEST_CS_SEL, 0x0);
    err += vmwrite(VMX_GUEST_DS_SEL, 0x0);
    err += vmwrite(VMX_GUEST_ES_SEL, 0x0);
    err += vmwrite(VMX_GUEST_FS_SEL, 0x0);
    err += vmwrite(VMX_GUEST_GS_SEL, 0x0);
    err += vmwrite(VMX_GUEST_SS_SEL, 0x0);
    err += vmwrite(VMX_GUEST_TR_SEL, 0x0);
    err += vmwrite(VMX_GUEST_LDTR_SEL, 0x0);

    err += vmwrite(VMX_GUEST_CS_BASE, 0x0);
    err += vmwrite(VMX_GUEST_DS_BASE, 0x0);
    err += vmwrite(VMX_GUEST_ES_BASE, 0x0);
    err += vmwrite(VMX_GUEST_FS_BASE, 0x0);
    err += vmwrite(VMX_GUEST_GS_BASE, 0x0);
    err += vmwrite(VMX_GUEST_SS_BASE, 0x0);
    err += vmwrite(VMX_GUEST_TR_BASE, 0x0);
    err += vmwrite(VMX_GUEST_LDTR_BASE, 0x0);
    err += vmwrite(VMX_GUEST_GDTR_BASE, 0x0);
    err += vmwrite(VMX_GUEST_IDTR_BASE, 0x0);

    err += vmwrite(VMX_GUEST_RFLAGS, 0x200002);
    err += vmwrite(VMX_GUEST_RIP, 0xFFF0);
    err += vmwrite(VMX_GUEST_RSP, 0x0);

    // 0x60000010 is the power-on value of CR0 (ET, CD and NW set)
    uint64_t guest_cr0 = (0x60000010 | ia32_vmx_cr0_fixed0_rd(NULL)) &
        ia32_vmx_cr0_fixed1_rd(NULL);
    err += vmwrite(VMX_GUEST_CR0, guest_cr0 & ~(CR0_PE | CR0_PG));

    uint64_t guest_cr4 = CR4_PAE;
    err += vmwrite(VMX_GUEST_CR4, (guest_cr4 | ia32_vmx_cr4_fixed0_rd(NULL)) &
            ia32_vmx_cr4_fixed1_rd(NULL));
    assert((guest_cr4 & CR4_PCIDE) == 0);

    uint64_t cr0_shadow;
    err += vmread(VMX_GUEST_CR0, &cr0_shadow);

    err += vmwrite(VMX_CR0_RD_SHADOW, cr0_shadow);
    err += vmwrite(VMX_CR0_GH_MASK, CR0_PE);
    // The guest/host mask covers CR4.PAE (bit 5)
    err += vmwrite(VMX_CR4_GH_MASK, 0x20);
#endif
    assert(err_is_ok(err));

    vmx_set_exec_ctls();

    return SYS_ERR_OK;
}
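
/*
 * Typical bring-up sequence (a sketch; the actual call sites live in the
 * monitor/dispatch paths, and the VMCS frame shown here is assumed to be a
 * zeroed, page-aligned physical frame):
 *
 *     errval_t err = vmx_enable_virtualization();   // once per core
 *     assert(err_is_ok(err));
 *     err = initialize_vmcs(vmcs_frame_base);       // per-guest VMCS
 *     assert(err_is_ok(err));
 *     ...
 *     vmx_vmkit_vmenter(dcb);                       // does not return
 */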

static uint32_t fail = 0;

static inline void enter_guest(void)
{
    // Set the host state prior to every VM-entry in case the values
    // written to the VMCS have changed.
    vmx_set_host_state();

    // This is necessary or else a #GP fault will be incurred in the
    // monitor domain.
    uint16_t ldtr_sel = rd_ldtr();

    // Perform most of the checks that the processor performs
    if (!launched) {
        check_guest_state_area();
	check_host_state_area();
	check_vmx_controls();
    }
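
    // The literal offsets in the assembly below assume the layout of
    // struct guest_control (see barrelfish_kpi/vmkit.h): the guest register
    // array starts at offset 0 in 8-byte slots, the guest and host CR2 save
    // slots sit at offsets 37*8 and 38*8, and the host callee-saved
    // registers are stored starting at offset 148.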

    __asm volatile("mov %[ctrl], %%rdi\n\t"

		   // save host state
		   "mov %%rsp, %%r8\n\t"
		   "mov %[host_rsp_encoding], %%r9\n\t"
		   "vmwrite %%r8, %%r9\n\t"

		   "mov %%rbx, (148 + 1*8)(%%rdi)\n\t"
		   "mov %%rbp, (148 + 6*8)(%%rdi)\n\t"
		   "mov %%r12, (148 + 12*8)(%%rdi)\n\t"
		   "mov %%r13, (148 + 13*8)(%%rdi)\n\t"
		   "mov %%r14, (148 + 14*8)(%%rdi)\n\t"
		   "mov %%r15, (148 + 15*8)(%%rdi)\n\t"
		   "mov %%cr2, %%rsi\n\t"
		   "mov %%rsi, 38*8(%%rdi)\n\t"

		   // load guest state
		   "mov 37*8(%%rdi), %%rsi\n\t"
		   "mov %%rsi, %%cr2\n\t"

		   "mov 0*8(%%rdi), %%rax\n\t"
		   "mov 1*8(%%rdi), %%rbx\n\t"
		   "mov 2*8(%%rdi), %%rcx\n\t"
		   "mov 3*8(%%rdi), %%rdx\n\t"
		   "mov 4*8(%%rdi), %%rsi\n\t"
		   "mov 6*8(%%rdi), %%rbp\n\t"
		   "mov 8*8(%%rdi), %%r8\n\t"
		   "mov 9*8(%%rdi), %%r9\n\t"
		   "mov 10*8(%%rdi), %%r10\n\t"
		   "mov 11*8(%%rdi), %%r11\n\t"
		   "mov 12*8(%%rdi), %%r12\n\t"
		   "mov 13*8(%%rdi), %%r13\n\t"
		   "mov 14*8(%%rdi), %%r14\n\t"
		   "mov 15*8(%%rdi), %%r15\n\t"
		   "mov 5*8(%%rdi), %%rdi\n\t"

		   // enter the guest VM
		   "cmpl $0, %[launched]\n\t"
		   "jne 1f\n\t"
		   "sti\n\t"
		   "vmlaunch\n\t"
		   "jmp 2f\n\t"
		   "1: "
		   "sti\n\t"
		   "vmresume\n\t"
		   "2: "
		   "setbe %[fail]\n\t"
		   "vmx_return_func:\n\t"
		   "cli\n\t"

		   "push %%rdi\n\t"
		   "mov %[ctrl], %%rdi\n\t"

		   // save guest state
		   "mov %%rax, 0*8(%%rdi)\n\t"
		   "mov %%rbx, 1*8(%%rdi)\n\t"
		   "mov %%rcx, 2*8(%%rdi)\n\t"
		   "mov %%rdx, 3*8(%%rdi)\n\t"
		   "mov %%rsi, 4*8(%%rdi)\n\t"
		   "mov %%rbp, 6*8(%%rdi)\n\t"
		   "mov %%r8, 8*8(%%rdi)\n\t"
		   "mov %%r9, 9*8(%%rdi)\n\t"
		   "mov %%r10, 10*8(%%rdi)\n\t"
		   "mov %%r11, 11*8(%%rdi)\n\t"
		   "mov %%r12, 12*8(%%rdi)\n\t"
		   "mov %%r13, 13*8(%%rdi)\n\t"
		   "mov %%r14, 14*8(%%rdi)\n\t"
		   "mov %%r15, 15*8(%%rdi)\n\t"

		   "mov %%cr2, %%rsi\n\t"
		   "mov %%rsi, 37*8(%%rdi)\n\t"

		   "pop %%rsi\n\t"
		   "mov %%rsi, 5*8(%%rdi)\n\t"

		   // load host state
		   "mov (148 + 1*8)(%%rdi), %%rbx\n\t"
		   "mov (148 + 6*8)(%%rdi), %%rbp\n\t"
		   "mov (148 + 12*8)(%%rdi), %%r12\n\t"
		   "mov (148 + 13*8)(%%rdi), %%r13\n\t"
		   "mov (148 + 14*8)(%%rdi), %%r14\n\t"
		   "mov (148 + 15*8)(%%rdi), %%r15\n\t"
		   "mov 38*8(%%rdi), %%rsi\n\t"
		   "mov %%rsi, %%cr2\n\t"
		   : [fail] "=m" (fail)
		   : [ctrl] "m" (ctrl), [launched] "m" (launched),
		     [host_rsp_encoding] "i" (VMX_HOST_RSP)
		   : "memory"
		   );
    assert(!fail);
    wr_ldtr(ldtr_sel);

    launched = 1;
}

static inline void print_vmcs_info(struct guest_control *g)
{
    uint64_t guest_rip, guest_rsp, guest_rflags;
    uint64_t reason, exit_qual;
    uint64_t exit_intr_info, intr_err;
    uint64_t idt_vec_info, idt_vec_err;
    uint64_t instr_len, instr_info;
    uint64_t instr_error, gpaddr, gladdr;
    uint64_t entry_intr_info, activ_state, intr_state;
    uint64_t guest_cr0, guest_cr3, guest_cr4;
    uint64_t guest_efer;

    uint64_t guest_es_sel, guest_es_base, guest_es_lim, guest_es_access;
    uint64_t guest_cs_sel, guest_cs_base, guest_cs_lim, guest_cs_access;
    uint64_t guest_ss_sel, guest_ss_base, guest_ss_lim, guest_ss_access;
    uint64_t guest_ds_sel, guest_ds_base, guest_ds_lim, guest_ds_access;
    uint64_t guest_fs_sel, guest_fs_base, guest_fs_lim, guest_fs_access;
    uint64_t guest_gs_sel, guest_gs_base, guest_gs_lim, guest_gs_access;
    uint64_t guest_tr_sel, guest_tr_base, guest_tr_lim, guest_tr_access;
    uint64_t guest_ldtr_sel, guest_ldtr_base, guest_ldtr_lim, guest_ldtr_access;
    uint64_t guest_idtr_base, guest_idtr_lim;
    uint64_t guest_gdtr_base, guest_gdtr_lim;

    errval_t err = vmread(VMX_GUEST_ES_SEL, &guest_es_sel);
    err += vmread(VMX_GUEST_ES_BASE, &guest_es_base);
    err += vmread(VMX_GUEST_ES_LIM, &guest_es_lim);
    err += vmread(VMX_GUEST_ES_ACCESS, &guest_es_access);
    err += vmread(VMX_GUEST_CS_SEL, &guest_cs_sel);
    err += vmread(VMX_GUEST_CS_BASE, &guest_cs_base);
    err += vmread(VMX_GUEST_CS_LIM, &guest_cs_lim);
    err += vmread(VMX_GUEST_CS_ACCESS, &guest_cs_access);
    err += vmread(VMX_GUEST_SS_SEL, &guest_ss_sel);
    err += vmread(VMX_GUEST_SS_BASE, &guest_ss_base);
    err += vmread(VMX_GUEST_SS_LIM, &guest_ss_lim);
    err += vmread(VMX_GUEST_SS_ACCESS, &guest_ss_access);
    err += vmread(VMX_GUEST_DS_SEL, &guest_ds_sel);
    err += vmread(VMX_GUEST_DS_BASE, &guest_ds_base);
    err += vmread(VMX_GUEST_DS_LIM, &guest_ds_lim);
    err += vmread(VMX_GUEST_DS_ACCESS, &guest_ds_access);
    err += vmread(VMX_GUEST_FS_SEL, &guest_fs_sel);
    err += vmread(VMX_GUEST_FS_BASE, &guest_fs_base);
    err += vmread(VMX_GUEST_FS_LIM, &guest_fs_lim);
    err += vmread(VMX_GUEST_FS_ACCESS, &guest_fs_access);
    err += vmread(VMX_GUEST_GS_SEL, &guest_gs_sel);
    err += vmread(VMX_GUEST_GS_BASE, &guest_gs_base);
    err += vmread(VMX_GUEST_GS_LIM, &guest_gs_lim);
    err += vmread(VMX_GUEST_GS_ACCESS, &guest_gs_access);
    err += vmread(VMX_GUEST_TR_SEL, &guest_tr_sel);
    err += vmread(VMX_GUEST_TR_BASE, &guest_tr_base);
    err += vmread(VMX_GUEST_TR_LIM, &guest_tr_lim);
    err += vmread(VMX_GUEST_TR_ACCESS, &guest_tr_access);
    err += vmread(VMX_GUEST_LDTR_SEL, &guest_ldtr_sel);
    err += vmread(VMX_GUEST_LDTR_BASE, &guest_ldtr_base);
    err += vmread(VMX_GUEST_LDTR_LIM, &guest_ldtr_lim);
    err += vmread(VMX_GUEST_LDTR_ACCESS, &guest_ldtr_access);
    err += vmread(VMX_GUEST_IDTR_BASE, &guest_idtr_base);
    err += vmread(VMX_GUEST_IDTR_LIM, &guest_idtr_lim);
    err += vmread(VMX_GUEST_GDTR_BASE, &guest_gdtr_base);
    err += vmread(VMX_GUEST_GDTR_LIM, &guest_gdtr_lim);

    err += vmread(VMX_GUEST_RIP, &guest_rip);
    err += vmread(VMX_GUEST_RSP, &guest_rsp);
    err += vmread(VMX_GUEST_RFLAGS, &guest_rflags);
    err += vmread(VMX_EXIT_REASON, &reason);
    err += vmread(VMX_EXIT_QUAL, &exit_qual);
    err += vmread(VMX_EXIT_INTR_INFO, &exit_intr_info);
    err += vmread(VMX_EXIT_INTR_ERR, &intr_err);
    err += vmread(VMX_IDT_VEC_INFO, &idt_vec_info);
    err += vmread(VMX_IDT_VEC_ERR, &idt_vec_err);
    err += vmread(VMX_INSTR_ERROR, &instr_error);
    err += vmread(VMX_GPADDR_F, &gpaddr);
    err += vmread(VMX_GL_ADDR, &gladdr);
    err += vmread(VMX_ENTRY_INTR_INFO, &entry_intr_info);
    err += vmread(VMX_GUEST_ACTIV_STATE, &activ_state);
    err += vmread(VMX_GUEST_INTR_STATE, &intr_state);
    err += vmread(VMX_EXIT_INSTR_LEN, &instr_len);
    err += vmread(VMX_EXIT_INSTR_INFO, &instr_info);
    err += vmread(VMX_GUEST_CR0, &guest_cr0);
    err += vmread(VMX_GUEST_CR3, &guest_cr3);
    err += vmread(VMX_GUEST_CR4, &guest_cr4);
    err += vmread(VMX_GUEST_EFER_F, &guest_efer);
    assert(err_is_ok(err));

    printf("VMCS info:\n");
    printf("\tvmexit reason = %d\n", (int)reason & 0xFFFF);
    printf("\texit qualification = 0x%"PRIx64"\n", exit_qual);
    printf("\tBit 31 of reason = %x\n", (int)((reason >> 31) & 1));

    printf("\tVM-exit interruption information = 0x%"PRIx64"\n", exit_intr_info);
    printf("\tVM-exit interruption error = 0x%"PRIx64"\n", intr_err);

    printf("\tVM-entry interruption info=0x%"PRIx64"\n", entry_intr_info);

    printf("\tIDT vector information = 0x%"PRIx64"\n", idt_vec_info);
    printf("\tIDT vector error = 0x%"PRIx64"\n", idt_vec_err);

    printf("\tInstruction error = 0x%"PRIx64", gpaddr = 0x%"PRIx64", gladdr = 0x%"PRIx64"\n",
	   instr_error, gpaddr, gladdr);
    printf("\tActivity state=0x%"PRIx64", Interruptibility state=0x%"PRIx64"\n",
	   activ_state, intr_state);
    printf("\tVM-exit instruction length = 0x%"PRIx64"\n", instr_len);
    printf("\tVM-exit instruction info = 0x%"PRIx64"\n", instr_info);

    printf("\tguest_rip = 0x%"PRIx64", guest_rflags = 0x%"PRIx64"\n",
	   guest_rip, guest_rflags);
    printf("\tRAX=0x%"PRIx64"    RBX=0x%"PRIx64"    RCX=0x%"PRIx64"    RDX=0x%"PRIx64"\n",
	   g->regs.rax, g->regs.rbx, g->regs.rcx, g->regs.rdx);
    printf("\tRSP=0x%"PRIx64"    RBP=0x%"PRIx64"    RSI=0x%"PRIx64"    RDI=0x%"PRIx64"\n",
	   guest_rsp, g->regs.rbp, g->regs.rsi, g->regs.rdi);
    printf("\tR8 =0x%"PRIx64"    R9 =0x%"PRIx64"    R10=0x%"PRIx64"    R11=0x%"PRIx64"\n",
	   g->regs.r8, g->regs.r9, g->regs.r10, g->regs.r11);
    printf("\tR12=0x%"PRIx64"    R13=0x%"PRIx64"    R14=0x%"PRIx64"    R15=0x%"PRIx64"\n",
	   g->regs.r12, g->regs.r13, g->regs.r14, g->regs.r15);
    printf("\tCR0=0x%"PRIx64", CR3=0x%"PRIx64", CR4=0x%"PRIx64"\n",
	   guest_cr0, guest_cr3, guest_cr4);

    printf("\tES: sel=0x%"PRIx64", base=0x%"PRIx64", lim=0x%"PRIx64", access=0x%"PRIx64"\n",
	   guest_es_sel, guest_es_base, guest_es_lim, guest_es_access);
    printf("\tCS: sel=0x%"PRIx64", base=0x%"PRIx64", lim=0x%"PRIx64", access=0x%"PRIx64"\n",
	   guest_cs_sel, guest_cs_base, guest_cs_lim, guest_cs_access);
    printf("\tSS: sel=0x%"PRIx64", base=0x%"PRIx64", lim=0x%"PRIx64", access=0x%"PRIx64"\n",
	   guest_ss_sel, guest_ss_base, guest_ss_lim, guest_ss_access);
    printf("\tDS: sel=0x%"PRIx64", base=0x%"PRIx64", lim=0x%"PRIx64", access=0x%"PRIx64"\n",
	   guest_ds_sel, guest_ds_base, guest_ds_lim, guest_ds_access);
    printf("\tFS: sel=0x%"PRIx64", base=0x%"PRIx64", lim=0x%"PRIx64", access=0x%"PRIx64"\n",
	   guest_fs_sel, guest_fs_base, guest_fs_lim, guest_fs_access);
    printf("\tGS: sel=0x%"PRIx64", base=0x%"PRIx64", lim=0x%"PRIx64", access=0x%"PRIx64"\n",
	   guest_gs_sel, guest_gs_base, guest_gs_lim, guest_gs_access);
    printf("\tTR: sel=0x%"PRIx64", base=0x%"PRIx64", lim=0x%"PRIx64", access=0x%"PRIx64"\n",
	   guest_tr_sel, guest_tr_base, guest_tr_lim, guest_tr_access);
    printf("\tLDTR: sel=0x%"PRIx64", base=0x%"PRIx64", lim=0x%"PRIx64", access=0x%"PRIx64"\n",
	   guest_ldtr_sel, guest_ldtr_base, guest_ldtr_lim, guest_ldtr_access);
    printf("\tIDTR: base=0x%"PRIx64", lim=0x%"PRIx64"\n",
	   guest_idtr_base, guest_idtr_lim);
    printf("\tGDTR: base=0x%"PRIx64", lim=0x%"PRIx64"\n",
	   guest_gdtr_base, guest_gdtr_lim);

    printf("\tEFER = 0x%"PRIx64"\n", guest_efer);
}

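// Extracts the interruption type (bits 10:8) from a VM-exit
// interruption-information field; TYPE_NMI (2) identifies an NMI.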
static inline uint64_t interruption_type(uint64_t intr_info) {
    return (intr_info >> 8) & 0x7;
}

static void __attribute__ ((noreturn))
call_monitor(struct dcb *dcb)
{
    ctrl->num_vm_exits_with_monitor_invocation++;
    /* the guest exited not due to an interrupt but some condition the
     * monitor has to handle, therefore notify the monitor */

    assert(dcb->is_vm_guest);

    // disable the domain
    scheduler_remove(dcb);

    // call the monitor
    errval_t err = lmp_deliver_notification(&dcb->guest_desc.monitor_ep.cap);
    if (err_is_fail(err)) {
        printk(LOG_ERR, "Unexpected error delivering VMEXIT\n");
    }

    // run the monitor
    dispatch(dcb->guest_desc.monitor_ep.cap.u.endpoint.listener);
}

struct sysret sys_syscall(uint64_t syscall, uint64_t arg0, uint64_t arg1,
                          uint64_t *args, uint64_t rflags, uint64_t rip);

extern uint64_t user_stack_save;

void __attribute__ ((noreturn))
vmx_vmkit_vmenter (struct dcb *dcb)
{
    errval_t err;
    lpaddr_t lpaddr = gen_phys_to_local_phys(dcb->guest_desc.ctrl.cap.u.frame.base);
    ctrl = (void *)local_phys_to_mem(lpaddr);

    assert(dcb != NULL);
    assert(dcb->vspace != 0);
    assert(dcb->is_vm_guest);

    if (ept_enabled()) {
        // Bits 5:3 = 3 in the EPTP select a 4-level EPT page walk; bits 2:0
        // select the memory type used to access the EPT structures.
        err = vmwrite(VMX_EPTP_F, ((dcb->vspace) & pa_width_mask() & ~BASE_PAGE_MASK) | 0x18);
	assert(err_is_ok(err));
    } else {
        err = vmwrite(VMX_GUEST_CR3, dcb->vspace);
	assert(err_is_ok(err));
    }

 vmx_vmenter_loop:

    enter_guest();

    // The basic exit reason occupies bits 15:0 of the exit-reason field;
    // vmread stores a full 64-bit value, so read into a wide variable first.
    uint64_t exit_reason_full = 0;
    err = vmread(VMX_EXIT_REASON, &exit_reason_full);
    uint16_t exit_reason = exit_reason_full & 0xFFFF;

    switch(exit_reason) {
    case VMX_EXIT_REASON_INVAL_VMCS:
      {
	// One of the conditions that the processor checks may be violated
	// while the guest executes. With the Linux guest we used, the GS
	// limit is set to 0x10ffef, which causes one of the checks to fail.
	uint64_t gs_lim;
	err += vmread(VMX_GUEST_GS_LIM, &gs_lim);
	assert(gs_lim == 0x10ffef);
	err += vmwrite(VMX_GUEST_GS_LIM, 0xfffef);
	assert(err_is_ok(err));
      }
      goto vmx_vmenter_loop;

    case VMX_EXIT_REASON_EXCEPTION:
      {
        uint64_t intr_info, type;
	err += vmread(VMX_EXIT_INTR_INFO, &intr_info);
	assert(err_is_ok(err));

	type = interruption_type(intr_info);

	if (type != TYPE_NMI) {
	    call_monitor(dcb);
	    break;
	}
      }
      // fall through: NMIs are handled like external interrupts
    case VMX_EXIT_REASON_EXT_INTR:
    case VMX_EXIT_REASON_SMI:
      {
	ctrl->num_vm_exits_without_monitor_invocation++;

#ifdef CONFIG_ARRAKISMON
	uint64_t guest_rip, guest_rsp, guest_rflags;
	err += vmread(VMX_GUEST_RIP, &guest_rip);
	err += vmread(VMX_GUEST_RSP, &guest_rsp);
	err += vmread(VMX_GUEST_RFLAGS, &guest_rflags);

	uint64_t guest_fs_sel, guest_gs_sel;
	err += vmread(VMX_GUEST_FS_SEL, &guest_fs_sel);
	err += vmread(VMX_GUEST_GS_SEL, &guest_gs_sel);
	assert(err_is_ok(err));

	arch_registers_state_t *area = NULL;

	// Store user state into corresponding save area
	if(dispatcher_is_disabled_ip(dcb->disp, guest_rip)) {
	    area = dispatcher_get_disabled_save_area(dcb->disp);
	    dcb->disabled = true;
	} else {
	    area = dispatcher_get_enabled_save_area(dcb->disp);
	    dcb->disabled = false;
	}
	memcpy(area, &ctrl->regs, sizeof(arch_registers_state_t));
	area->rip = guest_rip;
	area->rax = ctrl->regs.rax;
	area->rsp = guest_rsp;
	area->eflags = guest_rflags;
	area->fs = guest_fs_sel;
	area->gs = guest_gs_sel;
#endif
	wait_for_interrupt();
      }
      break;
#ifdef CONFIG_ARRAKISMON
    case VMX_EXIT_REASON_VMCALL:
      {
	// Translate this to a SYSCALL: the guest passes the syscall number
	// in RDI, the first two arguments in RSI and RDX, and the remaining
	// arguments in the registers gathered below.
	struct registers_x86_64 *regs = &ctrl->regs;
	uint64_t args[10] = {
	    regs->r10, regs->r8, regs->r9, regs->r12, regs->r13, regs->r14,
	    regs->r15, regs->rax, regs->rbp, regs->rbx
	};

	/* printf("VMMCALL\n"); */

	uint64_t guest_rip, guest_rsp, guest_rflags;
	err += vmread(VMX_GUEST_RIP, &guest_rip);
	err += vmread(VMX_GUEST_RSP, &guest_rsp);
	err += vmread(VMX_GUEST_RFLAGS, &guest_rflags);
	// Advance the guest RIP past the 3-byte VMCALL instruction
	err += vmwrite(VMX_GUEST_RIP, guest_rip + 3);
	assert(err_is_ok(err));

	user_stack_save = guest_rsp;

	struct sysret ret = sys_syscall(regs->rdi, regs->rsi, regs->rdx,
					args, guest_rflags, guest_rip + 3);
	regs->rax = ret.error;
	regs->rdx = ret.value;
      }
      goto vmx_vmenter_loop;
#endif
    default:
        call_monitor(dcb);
	break;
    }
}
