/*
 * Copyright 2020, Data61, CSIRO (ABN 41 687 119 230)
 *
 * SPDX-License-Identifier: GPL-2.0-only
 */

#include <config.h>

#ifdef CONFIG_VTX

#include <types.h>
#include <machine/io.h>
#include <api/failures.h>
#include <api/syscall.h>
#include <kernel/thread.h>
#include <object/objecttype.h>
#include <arch/machine/cpu_registers.h>
#include <arch/model/statedata.h>
#include <arch/object/vcpu.h>
#include <arch/object/ioport.h>
#include <util.h>
#include <sel4/arch/vmenter.h>

#define VMX_EXIT_QUAL_TYPE_MOV_CR 0
#define VMX_EXIT_QUAL_TYPE_CLTS 2
#define VMX_EXIT_QUAL_TYPE_LMSW 3

#define VMXON_REGION_SIZE 4096

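/* Map the register encoding used in the MOV-CR VM-exit qualification
 * (Intel SDM Vol 3: 0 = RAX, 1 = RCX, 2 = RDX, 3 = RBX, 4 = RSP, 5 = RBP,
 * 6 = RSI, 7 = RDI) onto our vcpu_gp_register_t indices. */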
const vcpu_gp_register_t crExitRegs[] = {
    VCPU_EAX, VCPU_ECX, VCPU_EDX, VCPU_EBX, VCPU_ESP, VCPU_EBP, VCPU_ESI, VCPU_EDI
};

typedef struct msr_bitmap {
    word_t bitmap[0x2000 / sizeof(word_t) / 8];
} msr_bitmap_t;

typedef struct msr_bitmaps {
    msr_bitmap_t low_msr_read;
    msr_bitmap_t high_msr_read;
    msr_bitmap_t low_msr_write;
    msr_bitmap_t high_msr_write;
} msr_bitmaps_t;
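/* Note: the four 1 KiB bitmaps above mirror the layout the hardware expects
 * for the 4 KiB MSR-bitmap page (Intel SDM Vol 3): a read bitmap for the low
 * MSRs (0x00000000-0x00001FFF), a read bitmap for the high MSRs
 * (0xC0000000-0xC0001FFF), then the corresponding write bitmaps. Each bitmap
 * holds 0x2000 bits, one per MSR in its range. */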

static struct PACKED {
    uint32_t revision;
    char data[VMXON_REGION_SIZE - sizeof(uint32_t)];
} vmxon_region ALIGN(VMXON_REGION_SIZE);

static msr_bitmaps_t msr_bitmap_region ALIGN(BIT(seL4_PageBits));

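/* Zero-initialised region handed to the hardware by setEPTRoot() as a
 * stand-in EPT root whenever a VCPU has no valid EPT assigned. */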
static char null_ept_space[seL4_PageBits] ALIGN(BIT(seL4_PageBits));

/* Cached value of the hardware defined vmcs revision */
static uint32_t vmcs_revision;

/* Cached value of the VPID capability MSR */
static vmx_ept_vpid_cap_msr_t vpid_capability;

/* Cache the values that we calculated for bits that need to be set high
 * and low in various vmcs fields */
static uint32_t pin_control_high;
static uint32_t pin_control_low;
static uint32_t primary_control_high;
static uint32_t primary_control_low;
static uint32_t secondary_control_high;
static uint32_t secondary_control_low;
static uint32_t entry_control_high;
static uint32_t entry_control_low;
static uint32_t exit_control_high;
static uint32_t exit_control_low;
static uint32_t cr0_high;
static uint32_t cr0_low;
static uint32_t cr4_high;
static uint32_t cr4_low;

/* these flags indicate the presence of specific VT-x features. These features
 * are checked for at boot time and are constant from then on */
static bool_t vmx_feature_vpid;
static bool_t vmx_feature_load_perf_global_ctrl;
static bool_t vmx_feature_ack_on_exit;

static vcpu_t *x86KSVPIDTable[VPID_LAST + 1];
static vpid_t x86KSNextVPID = VPID_FIRST;
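/* VPIDs are handed out round-robin from x86KSNextVPID; a VCPU keeps its VPID
 * until the allocator wraps around and reclaims it in findFreeVPID() below. */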

static inline bool_t vmxon(paddr_t vmxon_region)
{
    uint8_t error;
    /* vmxon requires a 64bit memory address, so perform a
     * cast here to guarantee this on 32-bit platforms */
    uint64_t vmxonreg = vmxon_region;
    asm volatile(
        "vmxon %1; setnae %0"
        : "=q"(error)
        : "m"(vmxonreg)
        : "memory", "cc"
    );
    return !!error;
}

static void vmclear(void *vmcs_ptr)
{
    uint64_t physical_address;
    physical_address = pptr_to_paddr((void *)vmcs_ptr);
    asm volatile(
        "vmclear %0"
        :
        : "m"(physical_address)
        : "cc"
    );
}

void clearCurrentVCPU(void)
{
    vcpu_t *vcpu = ARCH_NODE_STATE(x86KSCurrentVCPU);
    if (vcpu) {
        vmclear(vcpu);
        vcpu->launched = false;
        ARCH_NODE_STATE(x86KSCurrentVCPU) = NULL;
    }
}

static void vmptrld(void *vmcs_ptr)
{
    uint64_t physical_address;
    uint8_t error;
    physical_address = pptr_to_paddr(vmcs_ptr);
    asm volatile(
        "vmptrld %1; setna %0"
        : "=q"(error)
        : "m"(physical_address)
        : "cc"
    );
    /* The usage of vmptrld should be correct by construction. As there is no
     * capacity to propagate errors where vmptrld is used we will do our best
     * to detect bugs in debug builds by asserting */
    assert(!error);
}

static void switchVCPU(vcpu_t *vcpu)
{
#ifdef ENABLE_SMP_SUPPORT
    if (vcpu->last_cpu != getCurrentCPUIndex() && ARCH_NODE_STATE_ON_CORE(x86KSCurrentVCPU, vcpu->last_cpu) == vcpu) {
        /* vcpu is currently loaded on another core, need to do vmclear on that core */
        doRemoteClearCurrentVCPU(vcpu->last_cpu);
    }
#endif
    clearCurrentVCPU();
    vmptrld(vcpu);
#ifdef ENABLE_SMP_SUPPORT
    if (vcpu->last_cpu != getCurrentCPUIndex()) {
        /* migrate host state */
        vmwrite(VMX_HOST_TR_BASE, (word_t)&x86KSGlobalState[CURRENT_CPU_INDEX()].x86KStss);
        vmwrite(VMX_HOST_GDTR_BASE, (word_t)x86KSGlobalState[CURRENT_CPU_INDEX()].x86KSgdt);
        vmwrite(VMX_HOST_IDTR_BASE, (word_t)x86KSGlobalState[CURRENT_CPU_INDEX()].x86KSidt);
        vmwrite(VMX_HOST_SYSENTER_ESP, (uint64_t)(word_t)((char *)&x86KSGlobalState[CURRENT_CPU_INDEX()].x86KStss.tss.words[0] +
                                                          4));
    }
    vcpu->last_cpu = getCurrentCPUIndex();
#endif
    ARCH_NODE_STATE(x86KSCurrentVCPU) = vcpu;
}

static void print_bits(word_t bits)
{
    bool_t first = true;
    while (bits) {
        int index = seL4_WordBits - 1 - clzl(bits);
        if (first) {
            printf("%d", index);
            first = false;
        } else {
            printf(",%d", index);
        }
        bits &= ~BIT(index);
    }
}

static bool_t check_fixed_value(word_t val, word_t low, word_t high)
{
    word_t not_high;
    word_t not_low;
    /* Check whether any bits that should be high are not.
     * high & val is the set of bits that are correctly set high; if this
     * equals high then everything is good. To detect exactly which bits
     * were not high we can invert and mask with high again: if the result
     * is 0 everything is fine, otherwise each set bit indicates a bit we
     * failed to set */
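    /* Illustrative example: with high = 0b0110 and val = 0b0100,
     * high & val = 0b0100 != high, and high & ~(high & val) = 0b0010,
     * flagging bit 1 as a bit that should have been set but was not. */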
    not_high = high & ~(high & val);
    if (not_high != 0) {
        printf("Failed to set bits: ");
        print_bits(not_high);
        return false;
    }
    /* we can do the same thing for finding things
     * that should be low by first inverting */
    not_low = ~low & ~(~low & ~val);
    if (not_low != 0) {
        printf("Incorrectly cleared bits: ");
        print_bits(not_low);
        return false;
    }
    return true;
}

static bool_t vtx_check_fixed_values(word_t cr0, word_t cr4)
{
    if (!check_fixed_value(cr0, cr0_low, cr0_high)) {
        printf(" of CR0\n");
        return false;
    }
    if (!check_fixed_value(cr4, cr4_low, cr4_high)) {
        printf(" of CR4\n");
        return false;
    }
    return true;
}

static bool_t BOOT_CODE init_vtx_fixed_values(bool_t useTrueMsrs)
{
    uint32_t pin_control_mask =
        BIT(0) |    //Extern interrupt exiting
        BIT(3) |    //NMI exiting
        BIT(5);     //virtual NMIs
    uint32_t primary_control_mask =
        BIT(25) |   //Use I/O bitmaps
        BIT(28) |   //Use MSR bitmaps
        BIT(31);    //Activate secondary controls
    uint32_t secondary_control_mask =
        BIT(1);     //Enable EPT
    uint32_t exit_control_mask =
        BIT(2)  |   //Save debug controls
        BIT(18) |   //Save guest IA32_PAT on exit
        BIT(19) |   //Load host IA32_PAT
        BIT(20) |   //Save guest IA32_EFER on exit
        BIT(21);    //Load host IA32_EFER
#ifdef CONFIG_ARCH_X86_64
    exit_control_mask |= BIT(9); //Host address-space size
#endif
    /* Read out the fixed high and low bits from the MSRs */
    uint32_t pinbased_ctls;
    uint32_t procbased_ctls;
    uint32_t exit_ctls;
    uint32_t entry_ctls;
    if (useTrueMsrs) {
        pinbased_ctls = IA32_VMX_TRUE_PINBASED_CTLS_MSR;
        procbased_ctls = IA32_VMX_TRUE_PROCBASED_CTLS_MSR;
        exit_ctls = IA32_VMX_TRUE_EXIT_CTLS_MSR;
        entry_ctls = IA32_VMX_TRUE_ENTRY_CTLS_MSR;
    } else {
        pinbased_ctls = IA32_VMX_PINBASED_CTLS_MSR;
        procbased_ctls = IA32_VMX_PROCBASED_CTLS_MSR;
        exit_ctls = IA32_VMX_EXIT_CTLS_MSR;
        entry_ctls = IA32_VMX_ENTRY_CTLS_MSR;
    }
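    /* For each of these capability MSRs the low 32 bits report the
     * "allowed 0" settings (a 1 there means the corresponding control bit
     * must be 1) and the high 32 bits report the "allowed 1" settings (a 0
     * there means the control bit must be 0); see Intel SDM Vol 3,
     * Appendix A.3. Hence the MSR low word feeds our *_high variables and
     * the MSR high word feeds our *_low variables. */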
    pin_control_high = x86_rdmsr_low(pinbased_ctls);
    pin_control_low = x86_rdmsr_high(pinbased_ctls);
    primary_control_high = x86_rdmsr_low(procbased_ctls);
    primary_control_low = x86_rdmsr_high(procbased_ctls);
    secondary_control_high = x86_rdmsr_low(IA32_VMX_PROCBASED_CTLS2_MSR);
    secondary_control_low = x86_rdmsr_high(IA32_VMX_PROCBASED_CTLS2_MSR);
    exit_control_high = x86_rdmsr_low(exit_ctls);
    exit_control_low = x86_rdmsr_high(exit_ctls);
    entry_control_high = x86_rdmsr_low(entry_ctls);
    entry_control_low = x86_rdmsr_high(entry_ctls);

    cr0_high = x86_rdmsr_low(IA32_VMX_CR0_FIXED0_MSR);
    cr0_low = x86_rdmsr_low(IA32_VMX_CR0_FIXED1_MSR);
    cr4_high = x86_rdmsr_low(IA32_VMX_CR4_FIXED0_MSR);
    cr4_low = x86_rdmsr_low(IA32_VMX_CR4_FIXED1_MSR);

    /* Check for VPID support */
    if (!(secondary_control_low & BIT(5))) {
        vmx_feature_vpid = 0;
        printf("vt-x: VPIDs are not supported. Expect performance degradation\n");
    } else {
        vmx_feature_vpid = 1;
        secondary_control_mask |= BIT(5);
    }

    /* Check for load perf global control */
    if (!(exit_control_low & BIT(12))) {
        vmx_feature_load_perf_global_ctrl = 0;
        printf("vt-x: Load IA32_PERF_GLOBAL_CONTROL not supported. Hardware debugging may not work\n");
    } else {
        vmx_feature_load_perf_global_ctrl = 1;
        exit_control_mask |= BIT(12);
    }

    /* Check for acknowledge interrupt on exit */
    if (!(exit_control_low & BIT(15))) {
        vmx_feature_ack_on_exit = 0;
        printf("vt-x: Interrupt ack on exit not supported. Expect performance degradation\n");
    } else {
        vmx_feature_ack_on_exit = 1;
        exit_control_mask |= BIT(15);
    }

    /* Check whether the hardware requires any of the bits we need to be high
     * to instead be low */
    uint32_t missing;
    missing = (~pin_control_low) & pin_control_mask;
    if (missing) {
        printf("vt-x: Unsupported pin control features %lx\n", (long)missing);
        return false;
    }
    missing = (~primary_control_low) & primary_control_mask;
    if (missing) {
        printf("vt-x: Unsupported primary control features %lx\n", (long)missing);
        return false;
    }
    missing = (~secondary_control_low) & secondary_control_mask;
    if (missing) {
        printf("vt-x: Unsupported secondary control features %lx\n", (long)missing);
        return false;
    }
    missing = (~exit_control_low) & exit_control_mask;
    if (missing) {
        printf("vt-x: Unsupported exit control features %lx\n", (long)missing);
        return false;
    }

    /* Force the bits we require to be high */
    pin_control_high |= pin_control_mask;
    primary_control_high |= primary_control_mask;
    secondary_control_high |= secondary_control_mask;
    exit_control_high |= exit_control_mask;

    return true;
}

static bool_t BOOT_CODE check_vtx_fixed_values(bool_t useTrueMsrs)
{
    uint32_t pinbased_ctls;
    uint32_t procbased_ctls;
    uint32_t exit_ctls;
    uint32_t entry_ctls;
    if (useTrueMsrs) {
        pinbased_ctls = IA32_VMX_TRUE_PINBASED_CTLS_MSR;
        procbased_ctls = IA32_VMX_TRUE_PROCBASED_CTLS_MSR;
        exit_ctls = IA32_VMX_TRUE_EXIT_CTLS_MSR;
        entry_ctls = IA32_VMX_TRUE_ENTRY_CTLS_MSR;
    } else {
        pinbased_ctls = IA32_VMX_PINBASED_CTLS_MSR;
        procbased_ctls = IA32_VMX_PROCBASED_CTLS_MSR;
        exit_ctls = IA32_VMX_EXIT_CTLS_MSR;
        entry_ctls = IA32_VMX_ENTRY_CTLS_MSR;
    }
    uint32_t local_pin_control_high = x86_rdmsr_low(pinbased_ctls);
    uint32_t local_pin_control_low = x86_rdmsr_high(pinbased_ctls);
    uint32_t local_primary_control_high = x86_rdmsr_low(procbased_ctls);
    uint32_t local_primary_control_low = x86_rdmsr_high(procbased_ctls);
    uint32_t local_secondary_control_high = x86_rdmsr_low(IA32_VMX_PROCBASED_CTLS2_MSR);
    uint32_t local_secondary_control_low = x86_rdmsr_high(IA32_VMX_PROCBASED_CTLS2_MSR);
    uint32_t local_exit_control_high = x86_rdmsr_low(exit_ctls);
    uint32_t local_exit_control_low = x86_rdmsr_high(exit_ctls);
    uint32_t local_entry_control_high = x86_rdmsr_low(entry_ctls);
    uint32_t local_entry_control_low = x86_rdmsr_high(entry_ctls);

    uint32_t local_cr0_high = x86_rdmsr_low(IA32_VMX_CR0_FIXED0_MSR);
    uint32_t local_cr0_low = x86_rdmsr_low(IA32_VMX_CR0_FIXED1_MSR);
    uint32_t local_cr4_high = x86_rdmsr_low(IA32_VMX_CR4_FIXED0_MSR);
    uint32_t local_cr4_low = x86_rdmsr_low(IA32_VMX_CR4_FIXED1_MSR);

    /* We want to check that there are no bits this core requires to be high
     * that the BSP did not also require to be high. This can be checked with
     * 'local_high & high == local_high'.
     * We also need to make sure that the BSP has not determined that any bits
     * should be high that this core requires to be low. This can be checked
     * with '~local_low & high == 0'
     */
    return
        (local_pin_control_high & pin_control_high) == local_pin_control_high &&
        (~local_pin_control_low & pin_control_high) == 0 &&
        (local_primary_control_high & primary_control_high) == local_primary_control_high &&
        (~local_primary_control_low & primary_control_high) == 0 &&
        (local_secondary_control_high & secondary_control_high) == local_secondary_control_high &&
        (~local_secondary_control_low & secondary_control_high) == 0 &&
        (local_exit_control_high & exit_control_high) == local_exit_control_high &&
        (~local_exit_control_low & exit_control_high) == 0 &&
        (local_entry_control_high & entry_control_high) == local_entry_control_high &&
        (~local_entry_control_low & entry_control_high) == 0 &&
        local_cr0_high == cr0_high &&
        local_cr0_low == cr0_low &&
        local_cr4_high == cr4_high &&
        local_cr4_low == cr4_low;
}

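/* Force the bits the hardware requires to be 1 and clear the bits it requires
 * to be 0. Worked example: applyFixedBits(0x6, 0x1, 0xb) first ORs in 0x1
 * giving 0x7, then ANDs with 0xb giving 0x3, stripping requested bit 2 which
 * the allowed-1 mask does not permit. */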
static inline uint32_t applyFixedBits(uint32_t original, uint32_t high, uint32_t low)
{
    original |= high;
    original &= low;
    return original;
}

void vcpu_init(vcpu_t *vcpu)
{
    vcpu->vcpuTCB = NULL;
    vcpu->launched = false;

    memcpy(vcpu->vmcs, &vmcs_revision, 4);

    switchVCPU(vcpu);

    vcpu->cr0 = cr0_high & cr0_low;
    vcpu->cr0_shadow = 0;
    vcpu->cr0_mask = 0;
    vcpu->exception_bitmap = 0;
    vcpu->vpid = VPID_INVALID;
#ifdef ENABLE_SMP_SUPPORT
    vcpu->last_cpu = getCurrentCPUIndex();
#endif /* ENABLE_SMP_SUPPORT */

    vmwrite(VMX_HOST_PAT, x86_rdmsr(IA32_PAT_MSR));
    vmwrite(VMX_HOST_EFER, x86_rdmsr(IA32_EFER_MSR));
    // By default we will disable performance counters when we come out
    // of a VM. When performance counters are supported this host state
    // needs to be updated on VM entry
    if (vmx_feature_load_perf_global_ctrl) {
        vmwrite(VMX_HOST_PERF_GLOBAL_CTRL, 0);
    }
    vmwrite(VMX_HOST_CR0, read_cr0());
    vmwrite(VMX_HOST_CR4, read_cr4());
    vmwrite(VMX_HOST_FS_BASE, 0);
    vmwrite(VMX_HOST_GS_BASE, 0);
    vmwrite(VMX_HOST_TR_BASE, (word_t)&x86KSGlobalState[CURRENT_CPU_INDEX()].x86KStss);
    vmwrite(VMX_HOST_GDTR_BASE, (word_t)x86KSGlobalState[CURRENT_CPU_INDEX()].x86KSgdt);
    vmwrite(VMX_HOST_IDTR_BASE, (word_t)x86KSGlobalState[CURRENT_CPU_INDEX()].x86KSidt);
    vmwrite(VMX_HOST_SYSENTER_CS, (word_t)SEL_CS_0);
    vmwrite(VMX_HOST_SYSENTER_EIP, (word_t)&handle_syscall);
    if (!config_set(CONFIG_HARDWARE_DEBUG_API)) {
        vmwrite(VMX_HOST_SYSENTER_ESP, (uint64_t)(word_t)((char *)&x86KSGlobalState[CURRENT_CPU_INDEX()].x86KStss.tss.words[0] +
                                                          4));
    }
    /* Set host SP to point just beyond the first field to be stored on exit. */
    vmwrite(VMX_HOST_RSP, (word_t)&vcpu->gp_registers[n_vcpu_gp_register]);
    vmwrite(VMX_HOST_RIP, (word_t)&handle_vmexit);
#ifdef CONFIG_KERNEL_SKIM_WINDOW
    /* if we have a skim window then our host cr3 is a constant and is always
     * the kernel address space, so we set it here instead of lazily in restoreVMCS */
    vmwrite(VMX_HOST_CR3, makeCR3(kpptr_to_paddr(x64KSKernelPML4), 0).words[0]);
#endif /* CONFIG_KERNEL_SKIM_WINDOW */

    vmwrite(VMX_HOST_ES_SELECTOR, SEL_DS_0);
    vmwrite(VMX_HOST_CS_SELECTOR, SEL_CS_0);
    vmwrite(VMX_HOST_SS_SELECTOR, SEL_DS_0);
    vmwrite(VMX_HOST_DS_SELECTOR, SEL_DS_0);
    vmwrite(VMX_HOST_FS_SELECTOR, 0);
    vmwrite(VMX_HOST_GS_SELECTOR, 0);
    vmwrite(VMX_HOST_TR_SELECTOR, SEL_TSS);

    /* Set fixed VMCS control fields. */
    vmwrite(VMX_CONTROL_PIN_EXECUTION_CONTROLS, pin_control_high & pin_control_low);
    vmwrite(VMX_CONTROL_PRIMARY_PROCESSOR_CONTROLS, primary_control_high & primary_control_low);
    vmwrite(VMX_CONTROL_SECONDARY_PROCESSOR_CONTROLS, secondary_control_high & secondary_control_low);
    vmwrite(VMX_CONTROL_EXIT_CONTROLS, exit_control_high & exit_control_low);
    vmwrite(VMX_CONTROL_ENTRY_CONTROLS, entry_control_high & entry_control_low);
    vmwrite(VMX_CONTROL_MSR_ADDRESS, (word_t)kpptr_to_paddr(&msr_bitmap_region));
    vmwrite(VMX_GUEST_CR0, vcpu->cr0);
    vmwrite(VMX_GUEST_CR4, cr4_high & cr4_low);

    vmwrite(VMX_GUEST_VMCS_LINK_POINTER, ~(word_t)0);
    vmwrite(VMX_GUEST_VMCS_LINK_POINTER_HIGH, ~(word_t)0);

    memset(vcpu->io, ~(word_t)0, VCPU_IOBITMAP_SIZE);
    vmwrite(VMX_CONTROL_IOA_ADDRESS, pptr_to_paddr(vcpu->io));
    vmwrite(VMX_CONTROL_IOB_ADDRESS, pptr_to_paddr((char *)vcpu->io + (VCPU_IOBITMAP_SIZE / 2)));
}

static void dissociateVcpuTcb(tcb_t *tcb, vcpu_t *vcpu)
{
    assert(tcb->tcbArch.tcbVCPU == vcpu);
    assert(vcpu->vcpuTCB == tcb);
    tcb->tcbArch.tcbVCPU = NULL;
    vcpu->vcpuTCB = NULL;
}

void vcpu_finalise(vcpu_t *vcpu)
{
    if (vcpu->vcpuTCB) {
        dissociateVcpuTcb(vcpu->vcpuTCB, vcpu);
    }
    if (ARCH_NODE_STATE_ON_CORE(x86KSCurrentVCPU, vcpu->last_cpu) == vcpu) {
#ifdef ENABLE_SMP_SUPPORT
        if (vcpu->last_cpu != getCurrentCPUIndex()) {
            doRemoteClearCurrentVCPU(vcpu->last_cpu);
        } else
#endif /* ENABLE_SMP_SUPPORT */
        {
            clearCurrentVCPU();
        }
    }
}

static void associateVcpuTcb(tcb_t *tcb, vcpu_t *vcpu)
{
    if (tcb->tcbArch.tcbVCPU) {
        dissociateVcpuTcb(tcb, tcb->tcbArch.tcbVCPU);
    }
    if (vcpu->vcpuTCB) {
        dissociateVcpuTcb(vcpu->vcpuTCB, vcpu);
    }
    vcpu->vcpuTCB = tcb;
    tcb->tcbArch.tcbVCPU = vcpu;
}

static exception_t invokeVCPUWriteRegisters(vcpu_t *vcpu, word_t *buffer)
{
    int i;
    for (i = 0; i < n_vcpu_gp_register; i++) {
        vcpu->gp_registers[i] = getSyscallArg(i, buffer);
    }
    setThreadState(NODE_STATE(ksCurThread), ThreadState_Restart);
    return EXCEPTION_NONE;
}

static exception_t decodeVCPUWriteRegisters(cap_t cap, word_t length, word_t *buffer)
{
    if (length < 7) {
        userError("VCPU WriteRegisters: Truncated message.");
        current_syscall_error.type = seL4_TruncatedMessage;
        return EXCEPTION_SYSCALL_ERROR;
    }
    return invokeVCPUWriteRegisters(VCPU_PTR(cap_vcpu_cap_get_capVCPUPtr(cap)), buffer);
}

static exception_t invokeEnableIOPort(vcpu_t *vcpu, cte_t *slot, cap_t cap, uint16_t low, uint16_t high)
{
    /* remove any existing io ports from this cap */
    clearVPIDIOPortMappings(cap_io_port_cap_get_capIOPortVPID(cap),
                            cap_io_port_cap_get_capIOPortFirstPort(cap),
                            cap_io_port_cap_get_capIOPortLastPort(cap));
    /* update the assigned vpid. If the vcpu does not have a valid vpid then
     * this is fine as whilst the cap will not point to the vcpu, the vcpu
     * will have its port mask cleared when it gets assigned a vpid */
    cap = cap_io_port_cap_set_capIOPortVPID(cap, vcpu->vpid);
    slot->cap = cap;
    setIOPortMask(vcpu->io, low, high, false);
    return EXCEPTION_NONE;
}

static exception_t decodeEnableIOPort(cap_t cap, word_t length, word_t *buffer, extra_caps_t excaps)
{
    vcpu_t *vcpu;
    cap_t ioCap;
    cte_t *ioSlot;
    uint16_t low, high;

    if (length < 2) {
        userError("VCPU EnableIOPort: Truncated message.");
        current_syscall_error.type = seL4_TruncatedMessage;
        return EXCEPTION_SYSCALL_ERROR;
    }
    if (excaps.excaprefs[0] == NULL) {
        userError("VCPU EnableIOPort: Truncated message.");
        current_syscall_error.type = seL4_TruncatedMessage;
        return EXCEPTION_SYSCALL_ERROR;
    }
    ioSlot = excaps.excaprefs[0];
    ioCap  = excaps.excaprefs[0]->cap;

    if (cap_get_capType(ioCap) != cap_io_port_cap) {
        userError("VCPU EnableIOPort: IOPort cap is not an IOPort cap.");
        current_syscall_error.type = seL4_IllegalOperation;
        return EXCEPTION_SYSCALL_ERROR;
    }

    low = getSyscallArg(0, buffer);
    high = getSyscallArg(1, buffer);

    if (low < cap_io_port_cap_get_capIOPortFirstPort(ioCap) || high > cap_io_port_cap_get_capIOPortLastPort(ioCap)) {
        userError("VCPU EnableIOPort: Requested range not valid for given IOPort cap");
        current_syscall_error.type = seL4_InvalidArgument;
        current_syscall_error.invalidArgumentNumber = 0;
        return EXCEPTION_SYSCALL_ERROR;
    }

    vcpu = VCPU_PTR(cap_vcpu_cap_get_capVCPUPtr(cap));

    setThreadState(NODE_STATE(ksCurThread), ThreadState_Restart);
    return invokeEnableIOPort(vcpu, ioSlot, ioCap, low, high);
}

static exception_t invokeDisableIOPort(vcpu_t *vcpu, uint16_t low, uint16_t high)
{
    setIOPortMask(vcpu->io, low, high, true);
    setThreadState(NODE_STATE(ksCurThread), ThreadState_Restart);
    return EXCEPTION_NONE;
}

static exception_t decodeDisableIOPort(cap_t cap, word_t length, word_t *buffer)
{
    vcpu_t *vcpu;
    uint16_t low, high;

    if (length < 2) {
        userError("VCPU DisableIOPort: Truncated message.");
        current_syscall_error.type = seL4_TruncatedMessage;
        return EXCEPTION_SYSCALL_ERROR;
    }

    low = getSyscallArg(0, buffer);
    high = getSyscallArg(1, buffer);

    vcpu = VCPU_PTR(cap_vcpu_cap_get_capVCPUPtr(cap));

    return invokeDisableIOPort(vcpu, low, high);
}

static exception_t invokeWriteVMCS(vcpu_t *vcpu, word_t *buffer, word_t field, word_t value)
{
    tcb_t *thread;
    thread = NODE_STATE(ksCurThread);
    if (ARCH_NODE_STATE(x86KSCurrentVCPU) != vcpu) {
        switchVCPU(vcpu);
    }
    switch (field) {
    case VMX_CONTROL_EXCEPTION_BITMAP:
        vcpu->exception_bitmap = vcpu->cached_exception_bitmap = value;
        break;
    case VMX_GUEST_CR0:
        vcpu->cr0 = vcpu->cached_cr0 = value;
        break;
    case VMX_CONTROL_CR0_MASK:
        vcpu->cr0_mask = vcpu->cached_cr0_mask = value;
        break;
    case VMX_CONTROL_CR0_READ_SHADOW:
        vcpu->cr0_shadow = vcpu->cached_cr0_shadow = value;
        break;
    }
    setMR(thread, buffer, 0, value);
    vmwrite(field, value);
    setThreadState(NODE_STATE(ksCurThread), ThreadState_Restart);
    return EXCEPTION_NONE;
}

static exception_t decodeWriteVMCS(cap_t cap, word_t length, word_t *buffer)
{
    word_t field;
    word_t value;

    if (length < 2) {
        userError("VCPU WriteVMCS: Not enough arguments.");
        current_syscall_error.type = seL4_IllegalOperation;
        return EXCEPTION_SYSCALL_ERROR;
    }

    field = getSyscallArg(0, buffer);
    value = getSyscallArg(1, buffer);
    switch (field) {
    case VMX_GUEST_RIP:
    case VMX_GUEST_RSP:
    case VMX_GUEST_ES_SELECTOR:
    case VMX_GUEST_CS_SELECTOR:
    case VMX_GUEST_SS_SELECTOR:
    case VMX_GUEST_DS_SELECTOR:
    case VMX_GUEST_FS_SELECTOR:
    case VMX_GUEST_GS_SELECTOR:
    case VMX_GUEST_LDTR_SELECTOR:
    case VMX_GUEST_TR_SELECTOR:
    case VMX_GUEST_DEBUGCTRL:
    case VMX_GUEST_PAT:
    case VMX_GUEST_EFER:
    case VMX_GUEST_PERF_GLOBAL_CTRL:
    case VMX_GUEST_PDPTE0:
    case VMX_GUEST_PDPTE1:
    case VMX_GUEST_PDPTE2:
    case VMX_GUEST_PDPTE3:
    case VMX_GUEST_ES_LIMIT:
    case VMX_GUEST_CS_LIMIT:
    case VMX_GUEST_SS_LIMIT:
    case VMX_GUEST_DS_LIMIT:
    case VMX_GUEST_FS_LIMIT:
    case VMX_GUEST_GS_LIMIT:
    case VMX_GUEST_LDTR_LIMIT:
    case VMX_GUEST_TR_LIMIT:
    case VMX_GUEST_GDTR_LIMIT:
    case VMX_GUEST_IDTR_LIMIT:
    case VMX_GUEST_ES_ACCESS_RIGHTS:
    case VMX_GUEST_CS_ACCESS_RIGHTS:
    case VMX_GUEST_SS_ACCESS_RIGHTS:
    case VMX_GUEST_DS_ACCESS_RIGHTS:
    case VMX_GUEST_FS_ACCESS_RIGHTS:
    case VMX_GUEST_GS_ACCESS_RIGHTS:
    case VMX_GUEST_LDTR_ACCESS_RIGHTS:
    case VMX_GUEST_TR_ACCESS_RIGHTS:
    case VMX_GUEST_INTERRUPTABILITY:
    case VMX_GUEST_ACTIVITY:
    case VMX_GUEST_SMBASE:
    case VMX_GUEST_SYSENTER_CS:
    case VMX_GUEST_PREEMPTION_TIMER_VALUE:
    case VMX_GUEST_ES_BASE:
    case VMX_GUEST_CS_BASE:
    case VMX_GUEST_SS_BASE:
    case VMX_GUEST_DS_BASE:
    case VMX_GUEST_FS_BASE:
    case VMX_GUEST_GS_BASE:
    case VMX_GUEST_LDTR_BASE:
    case VMX_GUEST_TR_BASE:
    case VMX_GUEST_GDTR_BASE:
    case VMX_GUEST_IDTR_BASE:
    case VMX_GUEST_DR7:
    case VMX_GUEST_RFLAGS:
    case VMX_GUEST_PENDING_DEBUG_EXCEPTIONS:
    case VMX_GUEST_SYSENTER_ESP:
    case VMX_GUEST_SYSENTER_EIP:
    case VMX_CONTROL_CR0_MASK:
    case VMX_CONTROL_CR4_MASK:
    case VMX_CONTROL_CR0_READ_SHADOW:
    case VMX_CONTROL_CR4_READ_SHADOW:
    case VMX_GUEST_CR3:
    case VMX_CONTROL_EXCEPTION_BITMAP:
    case VMX_CONTROL_ENTRY_INTERRUPTION_INFO:
    case VMX_CONTROL_ENTRY_EXCEPTION_ERROR_CODE:
        break;
    case VMX_CONTROL_PIN_EXECUTION_CONTROLS:
        value = applyFixedBits(value, pin_control_high, pin_control_low);
        break;
    case VMX_CONTROL_PRIMARY_PROCESSOR_CONTROLS:
        value = applyFixedBits(value, primary_control_high, primary_control_low);
        break;
    case VMX_CONTROL_SECONDARY_PROCESSOR_CONTROLS:
        value = applyFixedBits(value, secondary_control_high, secondary_control_low);
        break;
    case VMX_CONTROL_EXIT_CONTROLS:
        value = applyFixedBits(value, exit_control_high, exit_control_low);
        break;
    case VMX_GUEST_CR0:
        value = applyFixedBits(value, cr0_high, cr0_low);
        break;
    case VMX_GUEST_CR4:
        value = applyFixedBits(value, cr4_high, cr4_low);
        break;
    default:
        userError("VCPU WriteVMCS: Invalid field %lx.", (long)field);
        current_syscall_error.type = seL4_IllegalOperation;
        return EXCEPTION_SYSCALL_ERROR;
    }
    return invokeWriteVMCS(VCPU_PTR(cap_vcpu_cap_get_capVCPUPtr(cap)), buffer, field, value);
}

static word_t readVMCSField(vcpu_t *vcpu, word_t field)
{
    switch (field) {
    case VMX_CONTROL_EXCEPTION_BITMAP:
        return vcpu->exception_bitmap;
    case VMX_GUEST_CR0:
        return vcpu->cr0;
    case VMX_CONTROL_CR0_MASK:
        return vcpu->cr0_mask;
    case VMX_CONTROL_CR0_READ_SHADOW:
        return vcpu->cr0_shadow;
    }
    if (ARCH_NODE_STATE(x86KSCurrentVCPU) != vcpu) {
        switchVCPU(vcpu);
    }
    return vmread(field);
}

static exception_t invokeReadVMCS(vcpu_t *vcpu, word_t field, word_t *buffer)
{
    tcb_t *thread;
    thread = NODE_STATE(ksCurThread);

    setMR(thread, buffer, 0, readVMCSField(vcpu, field));
    setRegister(thread, msgInfoRegister, wordFromMessageInfo(
                    seL4_MessageInfo_new(0, 0, 0, 1)));
    setThreadState(thread, ThreadState_Restart);
    return EXCEPTION_NONE;
}

static exception_t decodeReadVMCS(cap_t cap, word_t length, word_t *buffer)
{
    if (length < 1) {
        userError("VCPU ReadVMCS: Not enough arguments.");
        current_syscall_error.type = seL4_IllegalOperation;
        return EXCEPTION_SYSCALL_ERROR;
    }
    word_t field = getSyscallArg(0, buffer);
    switch (field) {
    case VMX_GUEST_RIP:
    case VMX_GUEST_RSP:
    case VMX_GUEST_ES_SELECTOR:
    case VMX_GUEST_CS_SELECTOR:
    case VMX_GUEST_SS_SELECTOR:
    case VMX_GUEST_DS_SELECTOR:
    case VMX_GUEST_FS_SELECTOR:
    case VMX_GUEST_GS_SELECTOR:
    case VMX_GUEST_LDTR_SELECTOR:
    case VMX_GUEST_TR_SELECTOR:
    case VMX_GUEST_DEBUGCTRL:
    case VMX_GUEST_PAT:
    case VMX_GUEST_EFER:
    case VMX_GUEST_PERF_GLOBAL_CTRL:
    case VMX_GUEST_PDPTE0:
    case VMX_GUEST_PDPTE1:
    case VMX_GUEST_PDPTE2:
    case VMX_GUEST_PDPTE3:
    case VMX_GUEST_ES_LIMIT:
    case VMX_GUEST_CS_LIMIT:
    case VMX_GUEST_SS_LIMIT:
    case VMX_GUEST_DS_LIMIT:
    case VMX_GUEST_FS_LIMIT:
    case VMX_GUEST_GS_LIMIT:
    case VMX_GUEST_LDTR_LIMIT:
    case VMX_GUEST_TR_LIMIT:
    case VMX_GUEST_GDTR_LIMIT:
    case VMX_GUEST_IDTR_LIMIT:
    case VMX_GUEST_ES_ACCESS_RIGHTS:
    case VMX_GUEST_CS_ACCESS_RIGHTS:
    case VMX_GUEST_SS_ACCESS_RIGHTS:
    case VMX_GUEST_DS_ACCESS_RIGHTS:
    case VMX_GUEST_FS_ACCESS_RIGHTS:
    case VMX_GUEST_GS_ACCESS_RIGHTS:
    case VMX_GUEST_LDTR_ACCESS_RIGHTS:
    case VMX_GUEST_TR_ACCESS_RIGHTS:
    case VMX_GUEST_INTERRUPTABILITY:
    case VMX_GUEST_ACTIVITY:
    case VMX_GUEST_SMBASE:
    case VMX_GUEST_SYSENTER_CS:
    case VMX_GUEST_PREEMPTION_TIMER_VALUE:
    case VMX_GUEST_ES_BASE:
    case VMX_GUEST_CS_BASE:
    case VMX_GUEST_SS_BASE:
    case VMX_GUEST_DS_BASE:
    case VMX_GUEST_FS_BASE:
    case VMX_GUEST_GS_BASE:
    case VMX_GUEST_LDTR_BASE:
    case VMX_GUEST_TR_BASE:
    case VMX_GUEST_GDTR_BASE:
    case VMX_GUEST_IDTR_BASE:
    case VMX_GUEST_DR7:
    case VMX_GUEST_RFLAGS:
    case VMX_GUEST_PENDING_DEBUG_EXCEPTIONS:
    case VMX_GUEST_SYSENTER_ESP:
    case VMX_GUEST_SYSENTER_EIP:
    case VMX_CONTROL_CR0_MASK:
    case VMX_CONTROL_CR4_MASK:
    case VMX_CONTROL_CR0_READ_SHADOW:
    case VMX_CONTROL_CR4_READ_SHADOW:
    case VMX_DATA_INSTRUCTION_ERROR:
    case VMX_DATA_EXIT_INTERRUPT_INFO:
    case VMX_DATA_EXIT_INTERRUPT_ERROR:
    case VMX_DATA_IDT_VECTOR_INFO:
    case VMX_DATA_IDT_VECTOR_ERROR:
    case VMX_DATA_EXIT_INSTRUCTION_LENGTH:
    case VMX_DATA_EXIT_INSTRUCTION_INFO:
    case VMX_DATA_GUEST_PHYSICAL:
    case VMX_DATA_IO_RCX:
    case VMX_DATA_IO_RSI:
    case VMX_DATA_IO_RDI:
    case VMX_DATA_IO_RIP:
    case VMX_DATA_GUEST_LINEAR_ADDRESS:
    case VMX_CONTROL_ENTRY_INTERRUPTION_INFO:
    case VMX_CONTROL_PIN_EXECUTION_CONTROLS:
    case VMX_CONTROL_PRIMARY_PROCESSOR_CONTROLS:
    case VMX_CONTROL_EXCEPTION_BITMAP:
    case VMX_CONTROL_EXIT_CONTROLS:
    case VMX_GUEST_CR0:
    case VMX_GUEST_CR3:
    case VMX_GUEST_CR4:
        break;
    default:
        userError("VCPU ReadVMCS: Invalid field %lx.", (long)field);
        current_syscall_error.type = seL4_IllegalOperation;
        return EXCEPTION_SYSCALL_ERROR;
    }
    return invokeReadVMCS(VCPU_PTR(cap_vcpu_cap_get_capVCPUPtr(cap)), field, buffer);
}

static exception_t invokeSetTCB(vcpu_t *vcpu, tcb_t *tcb)
{
    associateVcpuTcb(tcb, vcpu);

    setThreadState(NODE_STATE(ksCurThread), ThreadState_Restart);
    return EXCEPTION_NONE;
}

static exception_t decodeSetTCB(cap_t cap, word_t length, word_t *buffer, extra_caps_t excaps)
{
    cap_t tcbCap;
    if (excaps.excaprefs[0] == NULL) {
        userError("VCPU SetTCB: Truncated message.");
        current_syscall_error.type = seL4_TruncatedMessage;
        return EXCEPTION_SYSCALL_ERROR;
    }
    tcbCap  = excaps.excaprefs[0]->cap;

    if (cap_get_capType(tcbCap) != cap_thread_cap) {
        userError("TCB cap is not a TCB cap.");
        current_syscall_error.type = seL4_IllegalOperation;
        return EXCEPTION_SYSCALL_ERROR;
    }

    return invokeSetTCB(VCPU_PTR(cap_vcpu_cap_get_capVCPUPtr(cap)), TCB_PTR(cap_thread_cap_get_capTCBPtr(tcbCap)));
}

void vcpu_update_state_sysvmenter(vcpu_t *vcpu)
{
    word_t *buffer;
    if (ARCH_NODE_STATE(x86KSCurrentVCPU) != vcpu) {
        switchVCPU(vcpu);
    }
    buffer = lookupIPCBuffer(false, NODE_STATE(ksCurThread));
    if (!buffer) {
        userError("No IPC buffer.");
        return;
    }
    vmwrite(VMX_GUEST_RIP, getSyscallArg(0, buffer));
    vmwrite(VMX_CONTROL_PRIMARY_PROCESSOR_CONTROLS, applyFixedBits(getSyscallArg(1, buffer), primary_control_high,
                                                                   primary_control_low));
    vmwrite(VMX_CONTROL_ENTRY_INTERRUPTION_INFO, getSyscallArg(2, buffer));
}

void vcpu_sysvmenter_reply_to_user(tcb_t *tcb)
{
    word_t *buffer;
    vcpu_t *vcpu;

    buffer = lookupIPCBuffer(true, tcb);
    vcpu = tcb->tcbArch.tcbVCPU;

    assert(vcpu);

    if (ARCH_NODE_STATE(x86KSCurrentVCPU) != vcpu) {
        switchVCPU(vcpu);
    }

    setMR(tcb, buffer, SEL4_VMENTER_CALL_EIP_MR, vmread(VMX_GUEST_RIP));
    setMR(tcb, buffer, SEL4_VMENTER_CALL_CONTROL_PPC_MR, vmread(VMX_CONTROL_PRIMARY_PROCESSOR_CONTROLS));

    setMR(tcb, buffer, SEL4_VMENTER_CALL_CONTROL_ENTRY_MR, vmread(VMX_CONTROL_ENTRY_INTERRUPTION_INFO));
    setRegister(tcb, msgInfoRegister, 0);
}

exception_t decodeX86VCPUInvocation(
    word_t invLabel,
    word_t length,
    cptr_t cptr,
    cte_t *slot,
    cap_t cap,
    extra_caps_t excaps,
    word_t *buffer
)
{
    switch (invLabel) {
    case X86VCPUSetTCB:
        return decodeSetTCB(cap, length, buffer, excaps);
    case X86VCPUReadVMCS:
        return decodeReadVMCS(cap, length, buffer);
    case X86VCPUWriteVMCS:
        return decodeWriteVMCS(cap, length, buffer);
    case X86VCPUEnableIOPort:
        return decodeEnableIOPort(cap, length, buffer, excaps);
    case X86VCPUDisableIOPort:
        return decodeDisableIOPort(cap, length, buffer);
    case X86VCPUWriteRegisters:
        return decodeVCPUWriteRegisters(cap, length, buffer);
    default:
        userError("VCPU: Illegal operation.");
        current_syscall_error.type = seL4_IllegalOperation;
        return EXCEPTION_SYSCALL_ERROR;
    }
}
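
/* User-level flow (sketch only; the exact wrapper names are whatever libsel4
 * generates for these invocations): bind a VCPU to a TCB with the SetTCB
 * invocation, initialise guest state with WriteVMCS/WriteRegisters, then have
 * the bound thread issue seL4_VMEnter. That call returns either with a
 * pending notification (see vcpu_sysvmenter_reply_to_user() above) or with a
 * fault message assembled by setMRs_vmexit() below. */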

static bool_t is_vtx_supported(void)
{
    /* check for VMX support in CPUID
     * see section 23.7 of Volume 3 of the Intel manual */
    return !!(x86_cpuid_ecx(0x1, 0) & BIT(5));
}

static inline void clear_bit(word_t *bitmap, word_t bit)
{
    int index = bit / (sizeof(word_t) * 8);
    int offset = bit % (sizeof(word_t) * 8);
    bitmap[index] &= ~BIT(offset);
}
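
/* A set bit in an MSR bitmap causes the corresponding RDMSR/WRMSR to VM exit;
 * clearing a bit, as done for the sysenter MSRs in vtx_init() below, lets the
 * guest access that MSR directly. */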

BOOT_CODE bool_t vtx_init(void)
{
    if (!is_vtx_supported()) {
        printf("vt-x: not supported\n");
        return false;
    }
    vmx_basic_msr_t vmx_basic;
    feature_control_msr_t feature_control;
    vmx_basic.words[0] = x86_rdmsr_low(IA32_VMX_BASIC_MSR);
    vmx_basic.words[1] = x86_rdmsr_high(IA32_VMX_BASIC_MSR);
    vmcs_revision = vmx_basic_msr_get_vmcs_revision(vmx_basic);
    feature_control.words[0] = x86_rdmsr_low(IA32_FEATURE_CONTROL_MSR);
    if (!feature_control_msr_get_vmx_outside_smx(feature_control)) {
        /* enable if the MSR is not locked */
        if (feature_control_msr_get_lock(feature_control)) {
            printf("vt-x: feature locked\n");
            return false;
        }
        feature_control = feature_control_msr_set_vmx_outside_smx(feature_control, 1);
        x86_wrmsr_parts(IA32_FEATURE_CONTROL_MSR, x86_rdmsr_high(IA32_FEATURE_CONTROL_MSR), feature_control.words[0]);
    }
    /* make sure the msr is locked */
    if (!feature_control_msr_get_lock(feature_control)) {
        feature_control = feature_control_msr_set_lock(feature_control, 1);
        x86_wrmsr_parts(IA32_FEATURE_CONTROL_MSR, x86_rdmsr_high(IA32_FEATURE_CONTROL_MSR), feature_control.words[0]);
    }
    /* Initialize the fixed values only on the boot core. All other cores
     * will just check that the fixed values are valid */
    if (SMP_TERNARY(getCurrentCPUIndex(), 0) == 0) {
        if (!init_vtx_fixed_values(vmx_basic_msr_get_true_msrs(vmx_basic))) {
            printf("vt-x: lack of required features\n");
            return false;
        }
    }
    if (!check_vtx_fixed_values(vmx_basic_msr_get_true_msrs(vmx_basic))) {
        printf("vt-x: cores have inconsistent features\n");
        return false;
    }
    write_cr4(read_cr4() | CR4_VMXE);
    /* we are required to set the VMCS revision in the VMXON region */
    vmxon_region.revision = vmcs_revision;
    /* Before calling vmxon, we must check that CR0 and CR4 are not set to values
     * that are unsupported by vt-x */
    if (!vtx_check_fixed_values(read_cr0(), read_cr4())) {
        return false;
    }
    if (vmxon(kpptr_to_paddr(&vmxon_region))) {
        printf("vt-x: vmxon failure\n");
        return false;
    }
    memset(&msr_bitmap_region, ~0, sizeof(msr_bitmap_region));
    /* Set sysenter MSRs to writeable and readable. These are all low msrs */
    clear_bit(msr_bitmap_region.low_msr_read.bitmap, IA32_SYSENTER_CS_MSR);
    clear_bit(msr_bitmap_region.low_msr_read.bitmap, IA32_SYSENTER_ESP_MSR);
    clear_bit(msr_bitmap_region.low_msr_read.bitmap, IA32_SYSENTER_EIP_MSR);
    clear_bit(msr_bitmap_region.low_msr_write.bitmap, IA32_SYSENTER_CS_MSR);
    clear_bit(msr_bitmap_region.low_msr_write.bitmap, IA32_SYSENTER_ESP_MSR);
    clear_bit(msr_bitmap_region.low_msr_write.bitmap, IA32_SYSENTER_EIP_MSR);

    /* The VMX_EPT_VPID_CAP MSR exists if VMX supports EPT or VPIDs. Whilst
     * VPID support is optional, EPT support is not and is already checked for,
     * so we know that this MSR is safe to read */
    vpid_capability.words[0] = x86_rdmsr_low(IA32_VMX_EPT_VPID_CAP_MSR);
    vpid_capability.words[1] = x86_rdmsr_high(IA32_VMX_EPT_VPID_CAP_MSR);

    /* check for supported EPT features */
    if (!vmx_ept_vpid_cap_msr_get_ept_wb(vpid_capability)) {
        printf("vt-x: Expected wb attribute for EPT paging structure\n");
        return false;
    }
    if (!vmx_ept_vpid_cap_msr_get_ept_2m(vpid_capability)) {
        printf("vt-x: Expected support for 2M pages\n");
        return false;
    }

    return true;
}

static void setMRs_vmexit(uint32_t reason, word_t qualification)
{
    word_t *buffer;
    int i;

    buffer = lookupIPCBuffer(true, NODE_STATE(ksCurThread));

    setMR(NODE_STATE(ksCurThread), buffer, SEL4_VMENTER_CALL_EIP_MR, vmread(VMX_GUEST_RIP));
    setMR(NODE_STATE(ksCurThread), buffer, SEL4_VMENTER_CALL_CONTROL_PPC_MR,
          vmread(VMX_CONTROL_PRIMARY_PROCESSOR_CONTROLS));
    setMR(NODE_STATE(ksCurThread), buffer, SEL4_VMENTER_CALL_CONTROL_ENTRY_MR, vmread(VMX_CONTROL_ENTRY_INTERRUPTION_INFO));
    setMR(NODE_STATE(ksCurThread), buffer, SEL4_VMENTER_FAULT_REASON_MR, reason);
    setMR(NODE_STATE(ksCurThread), buffer, SEL4_VMENTER_FAULT_QUALIFICATION_MR, qualification);

    setMR(NODE_STATE(ksCurThread), buffer, SEL4_VMENTER_FAULT_INSTRUCTION_LEN_MR, vmread(VMX_DATA_EXIT_INSTRUCTION_LENGTH));
    setMR(NODE_STATE(ksCurThread), buffer, SEL4_VMENTER_FAULT_GUEST_PHYSICAL_MR, vmread(VMX_DATA_GUEST_PHYSICAL));
    setMR(NODE_STATE(ksCurThread), buffer, SEL4_VMENTER_FAULT_RFLAGS_MR, vmread(VMX_GUEST_RFLAGS));
    setMR(NODE_STATE(ksCurThread), buffer, SEL4_VMENTER_FAULT_GUEST_INT_MR, vmread(VMX_GUEST_INTERRUPTABILITY));
    setMR(NODE_STATE(ksCurThread), buffer, SEL4_VMENTER_FAULT_CR3_MR, vmread(VMX_GUEST_CR3));

    for (i = 0; i < n_vcpu_gp_register; i++) {
        setMR(NODE_STATE(ksCurThread), buffer, SEL4_VMENTER_FAULT_EAX + i,
              NODE_STATE(ksCurThread)->tcbArch.tcbVCPU->gp_registers[i]);
    }
}

static void handleVmxFault(uint32_t reason, word_t qualification)
{
    /* Indicate that we are returning from the VMEnter with a fault */
    setRegister(NODE_STATE(ksCurThread), msgInfoRegister, SEL4_VMENTER_RESULT_FAULT);

    setMRs_vmexit(reason, qualification);

    /* Set the thread back to running */
    setThreadState(NODE_STATE(ksCurThread), ThreadState_Running);

    /* No need to schedule because this wasn't an interrupt and
     * we run at the same priority */
    activateThread();
}

static inline void finishVmexitSaving(void)
{
    vcpu_t *vcpu = ARCH_NODE_STATE(x86KSCurrentVCPU);
    assert(vcpu == NODE_STATE(ksCurThread)->tcbArch.tcbVCPU);
    vcpu->launched = true;
    /* Update our cache of what is in the vmcs. This is the only value
     * that we cache that can be modified by the guest during execution */
    vcpu->cached_cr0 = vmread(VMX_GUEST_CR0);
    if (vcpuThreadUsingFPU(NODE_STATE(ksCurThread))) {
        /* If the vcpu owns the fpu then we did not modify the active cr0 to anything different
         * to what the VCPU owner requested, so we can update it with any modifications
         * the guest may have made */
        vcpu->cr0 = vcpu->cached_cr0;
    } else {
        /* If the vcpu does not own the fpu then we will have forced the task switched flag
         * to be set in the cr0 that was put into the vmcs. Since the VCPU owner (or the guest)
         * may believe the task switched flag is not set, and the guest may have modified other
         * parts of cr0 during execution, we update the desired cr0 value to be the cr0
         * value from the vmcs (thus pulling in any modifications the guest made), but with
         * the task switched flag that we set removed and the task switched flag from the
         * desired current cr0 added back in */
        vcpu->cr0 = (vcpu->cached_cr0 & ~CR0_TASK_SWITCH) | (NODE_STATE(ksCurThread)->tcbArch.tcbVCPU->cr0 & CR0_TASK_SWITCH);
    }
}

exception_t handleVmexit(void)
{
    uint32_t interrupt;
    /* qualification is host width, reason is defined as being 32 bit */
    word_t qualification;
    uint32_t reason;
    finishVmexitSaving();
    /* the basic exit reason is the bottom 16 bits of the exit reason field */
    reason = vmread(VMX_DATA_EXIT_REASON) & MASK(16);
    if (reason == EXTERNAL_INTERRUPT) {
        if (vmx_feature_ack_on_exit) {
            interrupt = vmread(VMX_DATA_EXIT_INTERRUPT_INFO);
            ARCH_NODE_STATE(x86KScurInterrupt) = interrupt & 0xff;
            NODE_LOCK_IRQ_IF(interrupt != int_remote_call_ipi);
            handleInterruptEntry();
        } else {
            /* poll for the pending irq. We will then handle it once we return back
             * up to restore_user_context */
            receivePendingIRQ();
        }
        return EXCEPTION_NONE;
    }

    NODE_LOCK_SYS;

    if (!vcpuThreadUsingFPU(NODE_STATE(ksCurThread))) {
        /* since this vcpu does not currently own the fpu state, check if the kernel should
         * switch the fpu owner or not. We switch if the guest performed an unimplemented device
         * exception AND the owner of this vcpu has not requested that these exceptions be forwarded
         * to them (i.e. if they have not explicitly set the unimplemented device exception in the
         * exception_bitmap) */
        if (reason == EXCEPTION_OR_NMI && !(NODE_STATE(ksCurThread)->tcbArch.tcbVCPU->exception_bitmap & BIT(int_unimpl_dev))) {
            interrupt = vmread(VMX_DATA_EXIT_INTERRUPT_INFO);
            /* The exception number is the bottom 8 bits of the interrupt info */
            if ((interrupt & 0xff) == int_unimpl_dev) {
                switchLocalFpuOwner(&NODE_STATE(ksCurThread)->tcbArch.tcbVCPU->fpuState);
                return EXCEPTION_NONE;
            }
        } else if (reason == CONTROL_REGISTER && !(NODE_STATE(ksCurThread)->tcbArch.tcbVCPU->cr0_mask & CR0_TASK_SWITCH)) {
            /* we get here if the guest is attempting to write to a control register that is set (by
             * a 1 bit in the cr0 mask) as being owned by the host. If we got here then the previous check
             * on cr0_mask meant that the VCPU owner did not claim ownership of the task switch bit,
             * however we may have temporarily claimed ownership for the purposes of FPU switching.
             * At this point we could still have a false positive, as the guest could be attempting to
             * manipulate bits that are not task switch, so we still have to be careful and propagate
             * all or some of an attempted write */
            qualification = vmread(VMX_DATA_EXIT_QUALIFICATION);
            vmx_data_exit_qualification_control_regster_t qual;
            qual.words[0] = qualification;
            /* We only care about some of the exit qualification cases; we handle them here
             * and will deliver any others through to the fault handler */
            switch (vmx_data_exit_qualification_control_regster_get_access_type(qual)) {
            case VMX_EXIT_QUAL_TYPE_MOV_CR: {
                /* check for cr0 */
                if (vmx_data_exit_qualification_control_regster_get_cr(qual) == 0) {
                    vcpu_gp_register_t source = crExitRegs[vmx_data_exit_qualification_control_regster_get_reg(qual)];
                    word_t value;
                    if (source == VCPU_ESP) {
                        /* ESP is the only register that is not part of the general purpose
                         * registers that we have to save and restore ourselves, so we need to
                         * get this one from the vmcs */
                        value = vmread(VMX_GUEST_RSP);
                    } else {
                        value = NODE_STATE(ksCurThread)->tcbArch.tcbVCPU->gp_registers[source];
                    }
                    /* First unset the task switch bit in cr0 */
                    NODE_STATE(ksCurThread)->tcbArch.tcbVCPU->cr0 &= ~CR0_TASK_SWITCH;
                    /* now set it to the value we were given */
                    NODE_STATE(ksCurThread)->tcbArch.tcbVCPU->cr0 |= value & CR0_TASK_SWITCH;
                    /* check if there are any parts of the write remaining to forward. We only need
                     * to consider bits that the hardware will not have handled without faulting, which
                     * is writing any bit such that it is different to the shadow, but only considering
                     * bits that the VCPU owner has declared that they want to own (via the cr0_mask)
                     */
                    if (!((value ^ NODE_STATE(ksCurThread)->tcbArch.tcbVCPU->cr0_shadow) &
                          NODE_STATE(ksCurThread)->tcbArch.tcbVCPU->cr0_mask)) {
                        return EXCEPTION_NONE;
                    }
                }
                break;
            }
            case VMX_EXIT_QUAL_TYPE_CLTS: {
                /* Easy case. Just remove the task switch bit out of cr0 */
                NODE_STATE(ksCurThread)->tcbArch.tcbVCPU->cr0 &= ~CR0_TASK_SWITCH;
                return EXCEPTION_NONE;
            }
            case VMX_EXIT_QUAL_TYPE_LMSW: {
                uint16_t value = vmx_data_exit_qualification_control_regster_get_data(qual);
                /* First unset the task switch bit in cr0 */
                NODE_STATE(ksCurThread)->tcbArch.tcbVCPU->cr0 &= ~CR0_TASK_SWITCH;
                /* now set it to the value we were given */
                NODE_STATE(ksCurThread)->tcbArch.tcbVCPU->cr0 |= value & CR0_TASK_SWITCH;
                /* check if there are any parts of the write remaining to forward. We only need
                 * to consider bits that the hardware will not have handled without faulting, which
                 * is writing any bit such that it is different to the shadow, but only considering
                 * bits that the VCPU owner has declared that they want to own (via the cr0_mask).
                 * Additionally since LMSW only loads the bottom 4 bits of CR0 we only consider
                 * the low 4 bits
                 */
                if (!((value ^ NODE_STATE(ksCurThread)->tcbArch.tcbVCPU->cr0_shadow) &
                      NODE_STATE(ksCurThread)->tcbArch.tcbVCPU->cr0_mask & MASK(4))) {
                    return EXCEPTION_NONE;
                }
                break;
            }
            }
        }
    }
    switch (reason) {
    case EXCEPTION_OR_NMI:
    case MOV_DR:
    case TASK_SWITCH:
    case CONTROL_REGISTER:
    case IO:
    case MWAIT:
    case SIPI:
    case INVLPG:
    case INVEPT:
    case INVVPID:
    case VMCLEAR:
    case VMPTRLD:
    case VMPTRST:
    case VMREAD:
    case VMWRITE:
    case VMXON:
    case EPT_VIOLATION:
    case GDTR_OR_IDTR:
    case LDTR_OR_TR:
    case TPR_BELOW_THRESHOLD:
    case APIC_ACCESS:
        qualification = vmread(VMX_DATA_EXIT_QUALIFICATION);
        break;
    default:
        qualification = 0;
    }

    handleVmxFault(reason, qualification);

    return EXCEPTION_NONE;
}

exception_t handleVmEntryFail(void)
{
    handleVmxFault(-1, -1);

    return EXCEPTION_NONE;
}

#ifdef ENABLE_SMP_SUPPORT
void VMCheckBoundNotification(tcb_t *tcb)
{
    /* We want to check if the VM we are currently running has received
     * a message on its bound notification object. This check is done
     * in c_traps when we first perform a SysVMEnter, but we could presently
     * be running a VM and another core may have placed a message on the
     * bound notification object
     */
    assert(tcb->tcbAffinity == getCurrentCPUIndex());
    notification_t *ntfnPtr = tcb->tcbBoundNotification;
    if (thread_state_ptr_get_tsType(&tcb->tcbState) == ThreadState_RunningVM
        && ntfnPtr && notification_ptr_get_state(ntfnPtr) == NtfnState_Active) {

        word_t badge = notification_ptr_get_ntfnMsgIdentifier(ntfnPtr);
        notification_ptr_set_state(ntfnPtr, NtfnState_Idle);
        setThreadState(tcb, ThreadState_Running);
        setRegister(tcb, badgeRegister, badge);
        Arch_leaveVMAsyncTransfer(tcb);
        /* In the process of performing Arch_leaveVMAsyncTransfer we will have
         * had to switch the active VMCS. As a result we might as well try and
         * run this tcb if it is permitted instead of switching VMCS contexts
         * back and forth */
        if (tcb != NODE_STATE(ksCurThread)) {
            possibleSwitchTo(tcb);
        }
    }
}
#endif /* ENABLE_SMP_SUPPORT */

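/* Issue an INVVPID of type 1 (single-context invalidation): the 128-bit
 * in-memory descriptor carries the VPID in its low 16 bits (the linear
 * address field is ignored for this type) and the invalidation type is passed
 * in the register operand; see the Intel SDM description of INVVPID. */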
static void invvpid_context(uint16_t vpid)
{
    struct {
        uint64_t vpid : 16;
        uint64_t rsvd : 48;
        uint64_t address;
    } PACKED operand = {vpid, 0, 0};
    asm volatile("invvpid %0, %1" :: "m"(operand), "r"((word_t)1) : "cc");
}

static void setEPTRoot(cap_t vmxSpace, vcpu_t *vcpu)
{
    paddr_t ept_root;
    if (cap_get_capType(vmxSpace) != cap_ept_pml4_cap ||
        !cap_ept_pml4_cap_get_capPML4IsMapped(vmxSpace)) {
        ept_root = kpptr_to_paddr(null_ept_space);
    } else {
        findEPTForASID_ret_t find_ret;
        ept_pml4e_t *pml4;

        pml4 = (ept_pml4e_t *)cap_ept_pml4_cap_get_capPML4BasePtr(vmxSpace);
        find_ret = findEPTForASID(cap_ept_pml4_cap_get_capPML4MappedASID(vmxSpace));
        if (find_ret.status != EXCEPTION_NONE || find_ret.ept != pml4) {
            ept_root = kpptr_to_paddr(null_ept_space);
        } else {
            ept_root = pptr_to_paddr(pml4);
        }
    }
    if (ept_root != vcpu->last_ept_root) {
        vcpu->last_ept_root = ept_root;
        vmx_eptp_t eptp = vmx_eptp_new(
                              ept_root,       /* paddr of ept */
                              0,              /* do not use accessed and dirty flags */
                              3,              /* depth (4) minus 1 of desired table walking */
                              6               /* write back memory type */
                          );
        vmwrite(VMX_CONTROL_EPT_POINTER, eptp.words[0]);
        assert(vcpu->vpid != VPID_INVALID);
        if (vmx_feature_vpid) {
            invvpid_context(vcpu->vpid);
        }
    }
}

static void handleLazyFpu(void)
{
    vcpu_t *vcpu = NODE_STATE(ksCurThread)->tcbArch.tcbVCPU;
    word_t cr0 = vcpu->cr0;
    word_t exception_bitmap = vcpu->exception_bitmap;
    word_t cr0_mask = vcpu->cr0_mask;
    word_t cr0_shadow = vcpu->cr0_shadow;
    /* if the vcpu actually owns the fpu then we do not need to change any bits
     * and so we will put into the vmcs precisely what the VCPU owner has requested */
    if (!vcpuThreadUsingFPU(NODE_STATE(ksCurThread))) {
        /* when the vcpu doesn't own the fpu we need to force the task switched flag
         * so that we can take an exception and perform lazy fpu switching */
        cr0 |= CR0_TASK_SWITCH;
        /* once we have forced the task switched flag we also need to handle
         * unimplemented device exceptions so we force these to trap. In the case
         * where the VCPU owner had already set these to trap then this will be
         * checked in handleVmexit and the exception will get forwarded */
        exception_bitmap |= BIT(int_unimpl_dev);
        /* we need to claim ownership of the task switch bit so that any modifications
         * of it cause an exception and we must do this regardless of whether the
         * VCPU owner also wants to claim ownership. In the scenario where the VCPU
         * owner had also claimed ownership then in handleVmexit we won't actually
         * do any fpu switching and will forward the fault */
        cr0_mask |= CR0_TASK_SWITCH;
        /* since we have forced a value in the cr0 mask we need to set an appropriate value
         * in the cr0 read shadow. If the VCPU owner is also masking this bit then
         * we should use the value they have put in the cr0 read shadow. If they aren't
         * then the read shadow will contain garbage and we should instead set the
         * read shadow to the actual desired cr0 value */
        if (!(vcpu->cr0_mask & CR0_TASK_SWITCH)) {
            cr0_shadow &= ~CR0_TASK_SWITCH;
            cr0_shadow |= vcpu->cr0 & CR0_TASK_SWITCH;
        }
    }
    if (cr0 != vcpu->cached_cr0) {
        vmwrite(VMX_GUEST_CR0, cr0);
        vcpu->cached_cr0 = cr0;
    }
    if (exception_bitmap != vcpu->cached_exception_bitmap) {
        vmwrite(VMX_CONTROL_EXCEPTION_BITMAP, exception_bitmap);
        vcpu->cached_exception_bitmap = exception_bitmap;
    }
    if (cr0_mask != vcpu->cached_cr0_mask) {
        vmwrite(VMX_CONTROL_CR0_MASK, cr0_mask);
        vcpu->cached_cr0_mask = cr0_mask;
    }
    if (cr0_shadow != vcpu->cached_cr0_shadow) {
        vmwrite(VMX_CONTROL_CR0_READ_SHADOW, cr0_shadow);
        vcpu->cached_cr0_shadow = cr0_shadow;
    }
}

void clearVPIDIOPortMappings(vpid_t vpid, uint16_t first, uint16_t last)
{
    if (vpid == VPID_INVALID) {
        return;
    }
    vcpu_t *vcpu = x86KSVPIDTable[vpid];
    if (vcpu == NULL) {
        return;
    }
    assert(vcpu->vpid == vpid);
    setIOPortMask(vcpu->io, first, last, true);
}

static inline vpid_t nextVPID(vpid_t vpid)
{
    if (vpid == VPID_LAST) {
        return VPID_FIRST;
    } else {
        return vpid + 1;
    }
}

static void invalidateVPID(vpid_t vpid)
{
    vcpu_t *vcpu = x86KSVPIDTable[vpid];
    /* clear the IO bitmap as when we sever the VPID assignment we lose
     * the ability for the references in IO port capabilities to invalidate it */
    memset(vcpu->io, ~0, sizeof(vcpu->io));
    /* invalidate the VPID context */
    if (vmx_feature_vpid) {
        invvpid_context(vpid);
    }
}

static vpid_t findFreeVPID(void)
{
    vpid_t vpid;

    vpid = x86KSNextVPID;
    do {
        if (x86KSVPIDTable[vpid] == NULL) {
            return vpid;
        }
        vpid = nextVPID(vpid);
    } while (vpid != x86KSNextVPID);

    /* Forcibly take the next VPID */
    vpid = x86KSNextVPID;
    invalidateVPID(vpid);

    x86KSVPIDTable[vpid]->vpid = VPID_INVALID;
    x86KSVPIDTable[vpid] = NULL;

    x86KSNextVPID = nextVPID(x86KSNextVPID);
    return vpid;
}

static void storeVPID(vcpu_t *vcpu, vpid_t vpid)
{
    assert(x86KSVPIDTable[vpid] == NULL);
    assert(vcpu->vpid == VPID_INVALID);
    x86KSVPIDTable[vpid] = vcpu;
    vcpu->vpid = vpid;
}

void restoreVMCS(void)
{
    vcpu_t *expected_vmcs = NODE_STATE(ksCurThread)->tcbArch.tcbVCPU;

    /* Check that the right VMCS is active and current. */
    if (ARCH_NODE_STATE(x86KSCurrentVCPU) != expected_vmcs) {
        switchVCPU(expected_vmcs);
    }

#ifndef CONFIG_KERNEL_SKIM_WINDOW
    if (getCurrentCR3().words[0] != expected_vmcs->last_host_cr3) {
        expected_vmcs->last_host_cr3 = getCurrentCR3().words[0];
        vmwrite(VMX_HOST_CR3, getCurrentCR3().words[0]);
    }
#endif
    if (expected_vmcs->vpid == VPID_INVALID) {
        vpid_t vpid = findFreeVPID();
        storeVPID(expected_vmcs, vpid);
        if (vmx_feature_vpid) {
            vmwrite(VMX_CONTROL_VPID, vpid);
        }
    }
    setEPTRoot(TCB_PTR_CTE_PTR(NODE_STATE(ksCurThread), tcbArchEPTRoot)->cap, expected_vmcs);
    handleLazyFpu();
}

void invept(ept_pml4e_t *ept_pml4)
{
    if (vmx_ept_vpid_cap_msr_get_invept(vpid_capability)) {
        struct {
            uint64_t parts[2];
        } address;
        word_t type;
        if (vmx_ept_vpid_cap_msr_get_invept_single_context(vpid_capability)) {
            type = 1;
        } else if (vmx_ept_vpid_cap_msr_get_invept_all_context(vpid_capability)) {
            type = 2;
        } else {
            /* hardware claims to support invept yet provides us with no actual
             * invept mechanism. This is probably impossible, but just silently
             * ignore if it happens */
            userError("Hardware claimed to support invept, yet provided no mechanism");
            return;
        }

        address.parts[0] = pptr_to_paddr((void *)ept_pml4);
        address.parts[1] = 0;
        asm volatile(
            "invept %0, %1"
            :
            : "m"(address),  "r"(type)
            : "memory"
        );
    }
}

#endif