1/*-
2 * Copyright (c) 2011 NetApp, Inc.
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 *
26 * $FreeBSD: head/sys/amd64/vmm/intel/vmx.c 266626 2014-05-24 19:13:25Z neel $
27 */
28
29#include <sys/cdefs.h>
30__FBSDID("$FreeBSD: head/sys/amd64/vmm/intel/vmx.c 266626 2014-05-24 19:13:25Z neel $");
31
32#include <sys/param.h>
33#include <sys/systm.h>
34#include <sys/smp.h>
35#include <sys/kernel.h>
36#include <sys/malloc.h>
37#include <sys/pcpu.h>
38#include <sys/proc.h>
39#include <sys/sysctl.h>
40
41#include <vm/vm.h>
42#include <vm/pmap.h>
43
44#include <machine/psl.h>
45#include <machine/cpufunc.h>
46#include <machine/md_var.h>
47#include <machine/segments.h>
48#include <machine/smp.h>
49#include <machine/specialreg.h>
50#include <machine/vmparam.h>
51
52#include <machine/vmm.h>
53#include <machine/vmm_dev.h>
54#include "vmm_host.h"
55#include "vmm_ioport.h"
56#include "vmm_ipi.h"
57#include "vmm_msr.h"
58#include "vmm_ktr.h"
59#include "vmm_stat.h"
60#include "vatpic.h"
61#include "vlapic.h"
62#include "vlapic_priv.h"
63
64#include "vmx_msr.h"
65#include "ept.h"
66#include "vmx_cpufunc.h"
67#include "vmx.h"
68#include "x86.h"
69#include "vmx_controls.h"
70
71#define	PINBASED_CTLS_ONE_SETTING					\
72	(PINBASED_EXTINT_EXITING	|				\
73	 PINBASED_NMI_EXITING		|				\
74	 PINBASED_VIRTUAL_NMI)
75#define	PINBASED_CTLS_ZERO_SETTING	0
76
77#define PROCBASED_CTLS_WINDOW_SETTING					\
78	(PROCBASED_INT_WINDOW_EXITING	|				\
79	 PROCBASED_NMI_WINDOW_EXITING)
80
81#define	PROCBASED_CTLS_ONE_SETTING 					\
82	(PROCBASED_SECONDARY_CONTROLS	|				\
83	 PROCBASED_IO_EXITING		|				\
84	 PROCBASED_MSR_BITMAPS		|				\
85	 PROCBASED_CTLS_WINDOW_SETTING)
86#define	PROCBASED_CTLS_ZERO_SETTING	\
87	(PROCBASED_CR3_LOAD_EXITING |	\
88	PROCBASED_CR3_STORE_EXITING |	\
89	PROCBASED_IO_BITMAPS)
90
91#define	PROCBASED_CTLS2_ONE_SETTING	PROCBASED2_ENABLE_EPT
92#define	PROCBASED_CTLS2_ZERO_SETTING	0
93
94#define VM_EXIT_CTLS_ONE_SETTING_NO_PAT					\
95	(VM_EXIT_HOST_LMA			|			\
96	VM_EXIT_SAVE_EFER			|			\
97	VM_EXIT_LOAD_EFER)
98
99#define	VM_EXIT_CTLS_ONE_SETTING					\
100	(VM_EXIT_CTLS_ONE_SETTING_NO_PAT       	|			\
101	VM_EXIT_ACKNOWLEDGE_INTERRUPT		|			\
102	VM_EXIT_SAVE_PAT			|			\
103	VM_EXIT_LOAD_PAT)
104#define	VM_EXIT_CTLS_ZERO_SETTING	VM_EXIT_SAVE_DEBUG_CONTROLS
105
106#define	VM_ENTRY_CTLS_ONE_SETTING_NO_PAT	VM_ENTRY_LOAD_EFER
107
108#define	VM_ENTRY_CTLS_ONE_SETTING					\
109	(VM_ENTRY_CTLS_ONE_SETTING_NO_PAT     	|			\
110	VM_ENTRY_LOAD_PAT)
111#define	VM_ENTRY_CTLS_ZERO_SETTING					\
112	(VM_ENTRY_LOAD_DEBUG_CONTROLS		|			\
113	VM_ENTRY_INTO_SMM			|			\
114	VM_ENTRY_DEACTIVATE_DUAL_MONITOR)
115
116#define	guest_msr_rw(vmx, msr) \
117	msr_bitmap_change_access((vmx)->msr_bitmap, (msr), MSR_BITMAP_ACCESS_RW)
118
119#define	guest_msr_ro(vmx, msr) \
120    msr_bitmap_change_access((vmx)->msr_bitmap, (msr), MSR_BITMAP_ACCESS_READ)
121
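/*
 * Exit handlers return HANDLED when the VM-exit has been fully serviced
 * in the kernel and the guest can simply be resumed, and UNHANDLED when
 * the exit must be propagated further, typically to userland via the
 * vm_exit structure.
 */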
122#define	HANDLED		1
123#define	UNHANDLED	0
124
125static MALLOC_DEFINE(M_VMX, "vmx", "vmx");
126static MALLOC_DEFINE(M_VLAPIC, "vlapic", "vlapic");
127
128SYSCTL_DECL(_hw_vmm);
129SYSCTL_NODE(_hw_vmm, OID_AUTO, vmx, CTLFLAG_RW, NULL, NULL);
130
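/*
 * vmxon_enabled[] records which host cpus have successfully executed
 * VMXON. vmxon_region[] provides one page-aligned VMXON region per cpu;
 * its first 32 bits must hold the VMCS revision identifier before VMXON
 * is executed (see vmx_enable()).
 */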
131int vmxon_enabled[MAXCPU];
132static char vmxon_region[MAXCPU][PAGE_SIZE] __aligned(PAGE_SIZE);
133
134static uint32_t pinbased_ctls, procbased_ctls, procbased_ctls2;
135static uint32_t exit_ctls, entry_ctls;
136
137static uint64_t cr0_ones_mask, cr0_zeros_mask;
138SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr0_ones_mask, CTLFLAG_RD,
139	     &cr0_ones_mask, 0, NULL);
140SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr0_zeros_mask, CTLFLAG_RD,
141	     &cr0_zeros_mask, 0, NULL);
142
143static uint64_t cr4_ones_mask, cr4_zeros_mask;
144SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr4_ones_mask, CTLFLAG_RD,
145	     &cr4_ones_mask, 0, NULL);
146SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr4_zeros_mask, CTLFLAG_RD,
147	     &cr4_zeros_mask, 0, NULL);
148
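/*
 * Set to 1 when the processor cannot save/load MSR_PAT via the
 * VM-exit/VM-entry controls. In that case guest PAT accesses are not
 * passed through and will trap instead (see vmx_init() and vmx_vminit()).
 */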
149static int vmx_no_patmsr;
150
151static int vmx_initialized;
152SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, initialized, CTLFLAG_RD,
153	   &vmx_initialized, 0, "Intel VMX initialized");
154
155/*
156 * Optional capabilities
157 */
158static int cap_halt_exit;
159static int cap_pause_exit;
160static int cap_unrestricted_guest;
161static int cap_monitor_trap;
162static int cap_invpcid;
163
164static int virtual_interrupt_delivery;
165SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, virtual_interrupt_delivery, CTLFLAG_RD,
166    &virtual_interrupt_delivery, 0, "APICv virtual interrupt delivery support");
167
168static int posted_interrupts;
169SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, posted_interrupts, CTLFLAG_RD,
170    &posted_interrupts, 0, "APICv posted interrupt support");
171
172static int pirvec;
173SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, posted_interrupt_vector, CTLFLAG_RD,
174    &pirvec, 0, "APICv posted interrupt vector");
175
176static struct unrhdr *vpid_unr;
177static u_int vpid_alloc_failed;
178SYSCTL_UINT(_hw_vmm_vmx, OID_AUTO, vpid_alloc_failed, CTLFLAG_RD,
179	    &vpid_alloc_failed, 0, NULL);
180
181/*
182 * Use the last page below 4GB as the APIC access address. This address is
183 * occupied by the boot firmware so it is guaranteed that it will not conflict
184 * with a page in system memory.
185 */
186#define	APIC_ACCESS_ADDRESS	0xFFFFF000
187
188static int vmx_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc);
189static int vmx_getreg(void *arg, int vcpu, int reg, uint64_t *retval);
190static void vmx_inject_pir(struct vlapic *vlapic);
191
192#ifdef KTR
193static const char *
194exit_reason_to_str(int reason)
195{
196	static char reasonbuf[32];
197
198	switch (reason) {
199	case EXIT_REASON_EXCEPTION:
200		return "exception";
201	case EXIT_REASON_EXT_INTR:
202		return "extint";
203	case EXIT_REASON_TRIPLE_FAULT:
204		return "triplefault";
205	case EXIT_REASON_INIT:
206		return "init";
207	case EXIT_REASON_SIPI:
208		return "sipi";
209	case EXIT_REASON_IO_SMI:
210		return "iosmi";
211	case EXIT_REASON_SMI:
212		return "smi";
213	case EXIT_REASON_INTR_WINDOW:
214		return "intrwindow";
215	case EXIT_REASON_NMI_WINDOW:
216		return "nmiwindow";
217	case EXIT_REASON_TASK_SWITCH:
218		return "taskswitch";
219	case EXIT_REASON_CPUID:
220		return "cpuid";
221	case EXIT_REASON_GETSEC:
222		return "getsec";
223	case EXIT_REASON_HLT:
224		return "hlt";
225	case EXIT_REASON_INVD:
226		return "invd";
227	case EXIT_REASON_INVLPG:
228		return "invlpg";
229	case EXIT_REASON_RDPMC:
230		return "rdpmc";
231	case EXIT_REASON_RDTSC:
232		return "rdtsc";
233	case EXIT_REASON_RSM:
234		return "rsm";
235	case EXIT_REASON_VMCALL:
236		return "vmcall";
237	case EXIT_REASON_VMCLEAR:
238		return "vmclear";
239	case EXIT_REASON_VMLAUNCH:
240		return "vmlaunch";
241	case EXIT_REASON_VMPTRLD:
242		return "vmptrld";
243	case EXIT_REASON_VMPTRST:
244		return "vmptrst";
245	case EXIT_REASON_VMREAD:
246		return "vmread";
247	case EXIT_REASON_VMRESUME:
248		return "vmresume";
249	case EXIT_REASON_VMWRITE:
250		return "vmwrite";
251	case EXIT_REASON_VMXOFF:
252		return "vmxoff";
253	case EXIT_REASON_VMXON:
254		return "vmxon";
255	case EXIT_REASON_CR_ACCESS:
256		return "craccess";
257	case EXIT_REASON_DR_ACCESS:
258		return "draccess";
259	case EXIT_REASON_INOUT:
260		return "inout";
261	case EXIT_REASON_RDMSR:
262		return "rdmsr";
263	case EXIT_REASON_WRMSR:
264		return "wrmsr";
265	case EXIT_REASON_INVAL_VMCS:
266		return "invalvmcs";
267	case EXIT_REASON_INVAL_MSR:
268		return "invalmsr";
269	case EXIT_REASON_MWAIT:
270		return "mwait";
271	case EXIT_REASON_MTF:
272		return "mtf";
273	case EXIT_REASON_MONITOR:
274		return "monitor";
275	case EXIT_REASON_PAUSE:
276		return "pause";
277	case EXIT_REASON_MCE:
278		return "mce";
279	case EXIT_REASON_TPR:
280		return "tpr";
281	case EXIT_REASON_APIC_ACCESS:
282		return "apic-access";
283	case EXIT_REASON_GDTR_IDTR:
284		return "gdtridtr";
285	case EXIT_REASON_LDTR_TR:
286		return "ldtrtr";
287	case EXIT_REASON_EPT_FAULT:
288		return "eptfault";
289	case EXIT_REASON_EPT_MISCONFIG:
290		return "eptmisconfig";
291	case EXIT_REASON_INVEPT:
292		return "invept";
293	case EXIT_REASON_RDTSCP:
294		return "rdtscp";
295	case EXIT_REASON_VMX_PREEMPT:
296		return "vmxpreempt";
297	case EXIT_REASON_INVVPID:
298		return "invvpid";
299	case EXIT_REASON_WBINVD:
300		return "wbinvd";
301	case EXIT_REASON_XSETBV:
302		return "xsetbv";
303	case EXIT_REASON_APIC_WRITE:
304		return "apic-write";
305	default:
306		snprintf(reasonbuf, sizeof(reasonbuf), "%d", reason);
307		return (reasonbuf);
308	}
309}
310#endif	/* KTR */
311
312static int
313vmx_allow_x2apic_msrs(struct vmx *vmx)
314{
315	int i, error;
316
317	error = 0;
318
319	/*
320	 * Allow readonly access to the following x2APIC MSRs from the guest.
321	 */
322	error += guest_msr_ro(vmx, MSR_APIC_ID);
323	error += guest_msr_ro(vmx, MSR_APIC_VERSION);
324	error += guest_msr_ro(vmx, MSR_APIC_LDR);
325	error += guest_msr_ro(vmx, MSR_APIC_SVR);
326
327	for (i = 0; i < 8; i++)
328		error += guest_msr_ro(vmx, MSR_APIC_ISR0 + i);
329
330	for (i = 0; i < 8; i++)
331		error += guest_msr_ro(vmx, MSR_APIC_TMR0 + i);
332
333	for (i = 0; i < 8; i++)
334		error += guest_msr_ro(vmx, MSR_APIC_IRR0 + i);
335
336	error += guest_msr_ro(vmx, MSR_APIC_ESR);
337	error += guest_msr_ro(vmx, MSR_APIC_LVT_TIMER);
338	error += guest_msr_ro(vmx, MSR_APIC_LVT_THERMAL);
339	error += guest_msr_ro(vmx, MSR_APIC_LVT_PCINT);
340	error += guest_msr_ro(vmx, MSR_APIC_LVT_LINT0);
341	error += guest_msr_ro(vmx, MSR_APIC_LVT_LINT1);
342	error += guest_msr_ro(vmx, MSR_APIC_LVT_ERROR);
343	error += guest_msr_ro(vmx, MSR_APIC_ICR_TIMER);
344	error += guest_msr_ro(vmx, MSR_APIC_DCR_TIMER);
345	error += guest_msr_ro(vmx, MSR_APIC_ICR);
346
347	/*
348	 * Allow TPR, EOI and SELF_IPI MSRs to be read and written by the guest.
349	 *
350	 * These registers get special treatment described in the section
351	 * "Virtualizing MSR-Based APIC Accesses".
352	 */
353	error += guest_msr_rw(vmx, MSR_APIC_TPR);
354	error += guest_msr_rw(vmx, MSR_APIC_EOI);
355	error += guest_msr_rw(vmx, MSR_APIC_SELF_IPI);
356
357	return (error);
358}
359
360u_long
361vmx_fix_cr0(u_long cr0)
362{
363
364	return ((cr0 | cr0_ones_mask) & ~cr0_zeros_mask);
365}
366
367u_long
368vmx_fix_cr4(u_long cr4)
369{
370
371	return ((cr4 | cr4_ones_mask) & ~cr4_zeros_mask);
372}
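
/*
 * Illustrative example of the fixed-bit masks (computed in vmx_init();
 * the values here are typical, not guaranteed): if MSR_VMX_CR0_FIXED0
 * reads 0x80000021 and MSR_VMX_CR0_FIXED1 reads 0xffffffff then
 * cr0_ones_mask = fixed0 & fixed1 = 0x80000021 (PG, NE and PE must be 1)
 * and cr0_zeros_mask = ~fixed0 & ~fixed1 = 0, so vmx_fix_cr0(0) returns
 * 0x80000021.
 */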
373
374static void
375vpid_free(int vpid)
376{
377	if (vpid < 0 || vpid > 0xffff)
378		panic("vpid_free: invalid vpid %d", vpid);
379
380	/*
381	 * VPIDs [0,VM_MAXCPU] are special and are not allocated from
382	 * the unit number allocator.
383	 */
384
385	if (vpid > VM_MAXCPU)
386		free_unr(vpid_unr, vpid);
387}
388
389static void
390vpid_alloc(uint16_t *vpid, int num)
391{
392	int i, x;
393
394	if (num <= 0 || num > VM_MAXCPU)
395		panic("invalid number of vpids requested: %d", num);
396
397	/*
398	 * If the "enable vpid" execution control is not enabled then the
399	 * VPID is required to be 0 for all vcpus.
400	 */
401	if ((procbased_ctls2 & PROCBASED2_ENABLE_VPID) == 0) {
402		for (i = 0; i < num; i++)
403			vpid[i] = 0;
404		return;
405	}
406
407	/*
408	 * Allocate a unique VPID for each vcpu from the unit number allocator.
409	 */
410	for (i = 0; i < num; i++) {
411		x = alloc_unr(vpid_unr);
412		if (x == -1)
413			break;
414		else
415			vpid[i] = x;
416	}
417
418	if (i < num) {
419		atomic_add_int(&vpid_alloc_failed, 1);
420
421		/*
422		 * If the unit number allocator does not have enough unique
423		 * VPIDs then we need to allocate from the [1,VM_MAXCPU] range.
424		 *
425		 * These VPIDs are not unique across VMs but this does not
426		 * affect correctness because the combined mappings are also
427		 * tagged with the EP4TA which is unique for each VM.
428		 *
429		 * It is still sub-optimal because the invvpid will invalidate
430		 * combined mappings for a particular VPID across all EP4TAs.
431		 */
432		while (i-- > 0)
433			vpid_free(vpid[i]);
434
435		for (i = 0; i < num; i++)
436			vpid[i] = i + 1;
437	}
438}
439
440static void
441vpid_init(void)
442{
443	/*
444	 * VPID 0 is required when the "enable VPID" execution control is
445	 * disabled.
446	 *
447	 * VPIDs [1,VM_MAXCPU] are used as the "overflow namespace" when the
448	 * unit number allocator does not have sufficient unique VPIDs to
449	 * satisfy the allocation.
450	 *
451	 * The remaining VPIDs are managed by the unit number allocator.
452	 */
453	vpid_unr = new_unrhdr(VM_MAXCPU + 1, 0xffff, NULL);
454}
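
/*
 * For example, assuming VM_MAXCPU is 16, vpid_init() configures the unit
 * number allocator to hand out VPIDs in the range [17,65535] and
 * vpid_alloc() reuses VPIDs [1,16] only when that allocator is exhausted.
 */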
455
456static void
457msr_save_area_init(struct msr_entry *g_area, int *g_count)
458{
459	int cnt;
460
461	static struct msr_entry guest_msrs[] = {
462		{ MSR_KGSBASE, 0, 0 },
463	};
464
465	cnt = sizeof(guest_msrs) / sizeof(guest_msrs[0]);
466	if (cnt > GUEST_MSR_MAX_ENTRIES)
467		panic("guest msr save area overrun");
468	bcopy(guest_msrs, g_area, sizeof(guest_msrs));
469	*g_count = cnt;
470}
471
472static void
473vmx_disable(void *arg __unused)
474{
475	struct invvpid_desc invvpid_desc = { 0 };
476	struct invept_desc invept_desc = { 0 };
477
478	if (vmxon_enabled[curcpu]) {
479		/*
480		 * See sections 25.3.3.3 and 25.3.3.4 in Intel Vol 3b.
481		 *
482		 * VMXON or VMXOFF are not required to invalidate any TLB
483		 * caching structures, so invalidate all contexts explicitly to
484		 * avoid retaining stale translations across VMX episodes.
485		 */
486		invvpid(INVVPID_TYPE_ALL_CONTEXTS, invvpid_desc);
487		invept(INVEPT_TYPE_ALL_CONTEXTS, invept_desc);
488		vmxoff();
489	}
490	load_cr4(rcr4() & ~CR4_VMXE);
491}
492
493static int
494vmx_cleanup(void)
495{
496
497	if (pirvec != 0)
498		vmm_ipi_free(pirvec);
499
500	if (vpid_unr != NULL) {
501		delete_unrhdr(vpid_unr);
502		vpid_unr = NULL;
503	}
504
505	smp_rendezvous(NULL, vmx_disable, NULL, NULL);
506
507	return (0);
508}
509
510static void
511vmx_enable(void *arg __unused)
512{
513	int error;
514
515	load_cr4(rcr4() | CR4_VMXE);
516
517	*(uint32_t *)vmxon_region[curcpu] = vmx_revision();
518	error = vmxon(vmxon_region[curcpu]);
519	if (error == 0)
520		vmxon_enabled[curcpu] = 1;
521}
522
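/*
 * Re-enter VMX operation on a cpu whose previous VMXON state may have
 * been lost, e.g. when the host resumes from a suspend.
 */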
523static void
524vmx_restore(void)
525{
526
527	if (vmxon_enabled[curcpu])
528		vmxon(vmxon_region[curcpu]);
529}
530
531static int
532vmx_init(int ipinum)
533{
534	int error, use_tpr_shadow;
535	uint64_t basic, fixed0, fixed1, feature_control;
536	uint32_t tmp, procbased2_vid_bits;
537
538	/* CPUID.1:ECX[bit 5] must be 1 for the processor to support VMX */
539	if (!(cpu_feature2 & CPUID2_VMX)) {
540		printf("vmx_init: processor does not support VMX operation\n");
541		return (ENXIO);
542	}
543
544	/*
545	 * Verify that MSR_IA32_FEATURE_CONTROL lock and VMXON enable bits
546	 * are set (bits 0 and 2 respectively).
547	 */
548	feature_control = rdmsr(MSR_IA32_FEATURE_CONTROL);
549	if ((feature_control & IA32_FEATURE_CONTROL_LOCK) == 0 ||
550	    (feature_control & IA32_FEATURE_CONTROL_VMX_EN) == 0) {
551		printf("vmx_init: VMX operation disabled by BIOS\n");
552		return (ENXIO);
553	}
554
555	/*
556	 * Verify capabilities MSR_VMX_BASIC:
557	 * - bit 54 indicates support for INS/OUTS decoding
558	 */
559	basic = rdmsr(MSR_VMX_BASIC);
560	if ((basic & (1UL << 54)) == 0) {
561		printf("vmx_init: processor does not support desired basic "
562		    "capabilities\n");
563		return (EINVAL);
564	}
565
566	/* Check support for primary processor-based VM-execution controls */
567	error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
568			       MSR_VMX_TRUE_PROCBASED_CTLS,
569			       PROCBASED_CTLS_ONE_SETTING,
570			       PROCBASED_CTLS_ZERO_SETTING, &procbased_ctls);
571	if (error) {
572		printf("vmx_init: processor does not support desired primary "
573		       "processor-based controls\n");
574		return (error);
575	}
576
577	/* Clear the processor-based ctl bits that are set on demand */
578	procbased_ctls &= ~PROCBASED_CTLS_WINDOW_SETTING;
579
580	/* Check support for secondary processor-based VM-execution controls */
581	error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
582			       MSR_VMX_PROCBASED_CTLS2,
583			       PROCBASED_CTLS2_ONE_SETTING,
584			       PROCBASED_CTLS2_ZERO_SETTING, &procbased_ctls2);
585	if (error) {
586		printf("vmx_init: processor does not support desired secondary "
587		       "processor-based controls\n");
588		return (error);
589	}
590
591	/* Check support for VPID */
592	error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2,
593			       PROCBASED2_ENABLE_VPID, 0, &tmp);
594	if (error == 0)
595		procbased_ctls2 |= PROCBASED2_ENABLE_VPID;
596
597	/* Check support for pin-based VM-execution controls */
598	error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS,
599			       MSR_VMX_TRUE_PINBASED_CTLS,
600			       PINBASED_CTLS_ONE_SETTING,
601			       PINBASED_CTLS_ZERO_SETTING, &pinbased_ctls);
602	if (error) {
603		printf("vmx_init: processor does not support desired "
604		       "pin-based controls\n");
605		return (error);
606	}
607
608	/* Check support for VM-exit controls */
609	error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS, MSR_VMX_TRUE_EXIT_CTLS,
610			       VM_EXIT_CTLS_ONE_SETTING,
611			       VM_EXIT_CTLS_ZERO_SETTING,
612			       &exit_ctls);
613	if (error) {
614		/* Try again without the PAT MSR bits */
615		error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS,
616				       MSR_VMX_TRUE_EXIT_CTLS,
617				       VM_EXIT_CTLS_ONE_SETTING_NO_PAT,
618				       VM_EXIT_CTLS_ZERO_SETTING,
619				       &exit_ctls);
620		if (error) {
621			printf("vmx_init: processor does not support desired "
622			       "exit controls\n");
623			return (error);
624		} else {
625			if (bootverbose)
626				printf("vmm: PAT MSR access not supported\n");
627			guest_msr_valid(MSR_PAT);
628			vmx_no_patmsr = 1;
629		}
630	}
631
632	/* Check support for VM-entry controls */
633	if (!vmx_no_patmsr) {
634		error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS,
635				       MSR_VMX_TRUE_ENTRY_CTLS,
636				       VM_ENTRY_CTLS_ONE_SETTING,
637				       VM_ENTRY_CTLS_ZERO_SETTING,
638				       &entry_ctls);
639	} else {
640		error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS,
641				       MSR_VMX_TRUE_ENTRY_CTLS,
642				       VM_ENTRY_CTLS_ONE_SETTING_NO_PAT,
643				       VM_ENTRY_CTLS_ZERO_SETTING,
644				       &entry_ctls);
645	}
646
647	if (error) {
648		printf("vmx_init: processor does not support desired "
649		       "entry controls\n");
650		return (error);
651	}
652
653	/*
654	 * Check support for optional features by testing them
655	 * as individual bits
656	 */
657	cap_halt_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
658					MSR_VMX_TRUE_PROCBASED_CTLS,
659					PROCBASED_HLT_EXITING, 0,
660					&tmp) == 0);
661
662	cap_monitor_trap = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
663					MSR_VMX_PROCBASED_CTLS,
664					PROCBASED_MTF, 0,
665					&tmp) == 0);
666
667	cap_pause_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
668					 MSR_VMX_TRUE_PROCBASED_CTLS,
669					 PROCBASED_PAUSE_EXITING, 0,
670					 &tmp) == 0);
671
672	cap_unrestricted_guest = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
673					MSR_VMX_PROCBASED_CTLS2,
674					PROCBASED2_UNRESTRICTED_GUEST, 0,
675				        &tmp) == 0);
676
677	cap_invpcid = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
678	    MSR_VMX_PROCBASED_CTLS2, PROCBASED2_ENABLE_INVPCID, 0,
679	    &tmp) == 0);
680
681	/*
682	 * Check support for virtual interrupt delivery.
683	 */
684	procbased2_vid_bits = (PROCBASED2_VIRTUALIZE_APIC_ACCESSES |
685	    PROCBASED2_VIRTUALIZE_X2APIC_MODE |
686	    PROCBASED2_APIC_REGISTER_VIRTUALIZATION |
687	    PROCBASED2_VIRTUAL_INTERRUPT_DELIVERY);
688
689	use_tpr_shadow = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
690	    MSR_VMX_TRUE_PROCBASED_CTLS, PROCBASED_USE_TPR_SHADOW, 0,
691	    &tmp) == 0);
692
693	error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2,
694	    procbased2_vid_bits, 0, &tmp);
695	if (error == 0 && use_tpr_shadow) {
696		virtual_interrupt_delivery = 1;
697		TUNABLE_INT_FETCH("hw.vmm.vmx.use_apic_vid",
698		    &virtual_interrupt_delivery);
699	}
700
701	if (virtual_interrupt_delivery) {
702		procbased_ctls |= PROCBASED_USE_TPR_SHADOW;
703		procbased_ctls2 |= procbased2_vid_bits;
704		procbased_ctls2 &= ~PROCBASED2_VIRTUALIZE_X2APIC_MODE;
705
706		/*
707		 * Check for Posted Interrupts only if Virtual Interrupt
708		 * Delivery is enabled.
709		 */
710		error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS,
711		    MSR_VMX_TRUE_PINBASED_CTLS, PINBASED_POSTED_INTERRUPT, 0,
712		    &tmp);
713		if (error == 0) {
714			pirvec = vmm_ipi_alloc();
715			if (pirvec == 0) {
716				if (bootverbose) {
717					printf("vmx_init: unable to allocate "
718					    "posted interrupt vector\n");
719				}
720			} else {
721				posted_interrupts = 1;
722				TUNABLE_INT_FETCH("hw.vmm.vmx.use_apic_pir",
723				    &posted_interrupts);
724			}
725		}
726	}
727
728	if (posted_interrupts)
729		pinbased_ctls |= PINBASED_POSTED_INTERRUPT;
730
731	/* Initialize EPT */
732	error = ept_init(ipinum);
733	if (error) {
734		printf("vmx_init: ept initialization failed (%d)\n", error);
735		return (error);
736	}
737
738	/*
739	 * Stash the cr0 and cr4 bits that must be fixed to 0 or 1
740	 */
741	fixed0 = rdmsr(MSR_VMX_CR0_FIXED0);
742	fixed1 = rdmsr(MSR_VMX_CR0_FIXED1);
743	cr0_ones_mask = fixed0 & fixed1;
744	cr0_zeros_mask = ~fixed0 & ~fixed1;
745
746	/*
747	 * CR0_PE and CR0_PG can be set to zero in VMX non-root operation
748	 * if unrestricted guest execution is allowed.
749	 */
750	if (cap_unrestricted_guest)
751		cr0_ones_mask &= ~(CR0_PG | CR0_PE);
752
753	/*
754	 * Do not allow the guest to set CR0_NW or CR0_CD.
755	 */
756	cr0_zeros_mask |= (CR0_NW | CR0_CD);
757
758	fixed0 = rdmsr(MSR_VMX_CR4_FIXED0);
759	fixed1 = rdmsr(MSR_VMX_CR4_FIXED1);
760	cr4_ones_mask = fixed0 & fixed1;
761	cr4_zeros_mask = ~fixed0 & ~fixed1;
762
763	vpid_init();
764
765	/* enable VMX operation */
766	smp_rendezvous(NULL, vmx_enable, NULL, NULL);
767
768	vmx_initialized = 1;
769
770	return (0);
771}
772
773static void
774vmx_trigger_hostintr(int vector)
775{
776	uintptr_t func;
777	struct gate_descriptor *gd;
778
779	gd = &idt[vector];
780
781	KASSERT(vector >= 32 && vector <= 255, ("vmx_trigger_hostintr: "
782	    "invalid vector %d", vector));
783	KASSERT(gd->gd_p == 1, ("gate descriptor for vector %d not present",
784	    vector));
785	KASSERT(gd->gd_type == SDT_SYSIGT, ("gate descriptor for vector %d "
786	    "has invalid type %d", vector, gd->gd_type));
787	KASSERT(gd->gd_dpl == SEL_KPL, ("gate descriptor for vector %d "
788	    "has invalid dpl %d", vector, gd->gd_dpl));
789	KASSERT(gd->gd_selector == GSEL(GCODE_SEL, SEL_KPL), ("gate descriptor "
790	    "for vector %d has invalid selector %d", vector, gd->gd_selector));
791	KASSERT(gd->gd_ist == 0, ("gate descriptor for vector %d has invalid "
792	    "IST %d", vector, gd->gd_ist));
793
794	func = ((long)gd->gd_hioffset << 16 | gd->gd_looffset);
795	vmx_call_isr(func);
796}
797
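/*
 * Program the CR0/CR4 guest/host mask and read shadow. Bits set in the
 * mask are "owned" by the hypervisor: guest reads of those bits return
 * the shadow value and guest writes that would change them from the
 * shadow value cause a VM-exit (see "Guest/Host Masks and Read Shadows
 * for CR0 and CR4" in the Intel SDM).
 */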
798static int
799vmx_setup_cr_shadow(int which, struct vmcs *vmcs, uint32_t initial)
800{
801	int error, mask_ident, shadow_ident;
802	uint64_t mask_value;
803
804	if (which != 0 && which != 4)
805		panic("vmx_setup_cr_shadow: unknown cr%d", which);
806
807	if (which == 0) {
808		mask_ident = VMCS_CR0_MASK;
809		mask_value = cr0_ones_mask | cr0_zeros_mask;
810		shadow_ident = VMCS_CR0_SHADOW;
811	} else {
812		mask_ident = VMCS_CR4_MASK;
813		mask_value = cr4_ones_mask | cr4_zeros_mask;
814		shadow_ident = VMCS_CR4_SHADOW;
815	}
816
817	error = vmcs_setreg(vmcs, 0, VMCS_IDENT(mask_ident), mask_value);
818	if (error)
819		return (error);
820
821	error = vmcs_setreg(vmcs, 0, VMCS_IDENT(shadow_ident), initial);
822	if (error)
823		return (error);
824
825	return (0);
826}
827#define	vmx_setup_cr0_shadow(vmcs,init)	vmx_setup_cr_shadow(0, (vmcs), (init))
828#define	vmx_setup_cr4_shadow(vmcs,init)	vmx_setup_cr_shadow(4, (vmcs), (init))
829
830static void *
831vmx_vminit(struct vm *vm, pmap_t pmap)
832{
833	uint16_t vpid[VM_MAXCPU];
834	int i, error, guest_msr_count;
835	struct vmx *vmx;
836	struct vmcs *vmcs;
837
838	vmx = malloc(sizeof(struct vmx), M_VMX, M_WAITOK | M_ZERO);
839	if ((uintptr_t)vmx & PAGE_MASK) {
840		panic("malloc of struct vmx not aligned on %d byte boundary",
841		      PAGE_SIZE);
842	}
843	vmx->vm = vm;
844
845	vmx->eptp = eptp(vtophys((vm_offset_t)pmap->pm_pml4));
846
847	/*
848	 * Clean up EPTP-tagged guest physical and combined mappings
849	 *
850	 * VMX transitions are not required to invalidate any guest physical
851	 * mappings. So, it may be possible for stale guest physical mappings
852	 * to be present in the processor TLBs.
853	 *
854	 * Combined mappings for this EP4TA are also invalidated for all VPIDs.
855	 */
856	ept_invalidate_mappings(vmx->eptp);
857
858	msr_bitmap_initialize(vmx->msr_bitmap);
859
860	/*
861	 * It is safe to allow direct access to MSR_GSBASE and MSR_FSBASE.
862	 * The guest FSBASE and GSBASE are saved and restored during
863	 * vm-exit and vm-entry respectively. The host FSBASE and GSBASE are
864	 * always restored from the vmcs host state area on vm-exit.
865	 *
866	 * The SYSENTER_CS/ESP/EIP MSRs are identical to FS/GSBASE in
867	 * how they are saved/restored so can be directly accessed by the
868	 * guest.
869	 *
870	 * Guest KGSBASE is saved and restored in the guest MSR save area.
871	 * Host KGSBASE is restored before returning to userland from the pcb.
872	 * There will be a window of time when we are executing in the host
873	 * kernel context with a value of KGSBASE from the guest. This is ok
874	 * because the value of KGSBASE is inconsequential in kernel context.
875	 *
876	 * MSR_EFER is saved and restored in the guest VMCS area on a
877	 * VM exit and entry respectively. It is also restored from the
878	 * host VMCS area on a VM exit.
879	 *
880	 * The TSC MSR is exposed read-only. Writes are disallowed as that
881	 * will impact the host TSC.
882	 * XXX Writes would be implemented with a wrmsr trap, and
883	 * then modifying the TSC offset in the VMCS.
884	 */
885	if (guest_msr_rw(vmx, MSR_GSBASE) ||
886	    guest_msr_rw(vmx, MSR_FSBASE) ||
887	    guest_msr_rw(vmx, MSR_SYSENTER_CS_MSR) ||
888	    guest_msr_rw(vmx, MSR_SYSENTER_ESP_MSR) ||
889	    guest_msr_rw(vmx, MSR_SYSENTER_EIP_MSR) ||
890	    guest_msr_rw(vmx, MSR_KGSBASE) ||
891	    guest_msr_rw(vmx, MSR_EFER) ||
892	    guest_msr_ro(vmx, MSR_TSC))
893		panic("vmx_vminit: error setting guest msr access");
894
895	/*
896	 * MSR_PAT is saved and restored in the guest VMCS area on a VM exit
897	 * and entry respectively. It is also restored from the host VMCS
898	 * area on a VM exit. However, if running on a system with no
899	 * MSR_PAT save/restore support, leave access disabled so accesses
900	 * will be trapped.
901	 */
902	if (!vmx_no_patmsr && guest_msr_rw(vmx, MSR_PAT))
903		panic("vmx_vminit: error setting guest pat msr access");
904
905	vpid_alloc(vpid, VM_MAXCPU);
906
907	if (virtual_interrupt_delivery) {
908		error = vm_map_mmio(vm, DEFAULT_APIC_BASE, PAGE_SIZE,
909		    APIC_ACCESS_ADDRESS);
910		/* XXX this should really return an error to the caller */
911		KASSERT(error == 0, ("vm_map_mmio(apicbase) error %d", error));
912	}
913
914	for (i = 0; i < VM_MAXCPU; i++) {
915		vmcs = &vmx->vmcs[i];
916		vmcs->identifier = vmx_revision();
917		error = vmclear(vmcs);
918		if (error != 0) {
919			panic("vmx_vminit: vmclear error %d on vcpu %d\n",
920			      error, i);
921		}
922
923		error = vmcs_init(vmcs);
924		KASSERT(error == 0, ("vmcs_init error %d", error));
925
926		VMPTRLD(vmcs);
927		error = 0;
928		error += vmwrite(VMCS_HOST_RSP, (u_long)&vmx->ctx[i]);
929		error += vmwrite(VMCS_EPTP, vmx->eptp);
930		error += vmwrite(VMCS_PIN_BASED_CTLS, pinbased_ctls);
931		error += vmwrite(VMCS_PRI_PROC_BASED_CTLS, procbased_ctls);
932		error += vmwrite(VMCS_SEC_PROC_BASED_CTLS, procbased_ctls2);
933		error += vmwrite(VMCS_EXIT_CTLS, exit_ctls);
934		error += vmwrite(VMCS_ENTRY_CTLS, entry_ctls);
935		error += vmwrite(VMCS_MSR_BITMAP, vtophys(vmx->msr_bitmap));
936		error += vmwrite(VMCS_VPID, vpid[i]);
937		if (virtual_interrupt_delivery) {
938			error += vmwrite(VMCS_APIC_ACCESS, APIC_ACCESS_ADDRESS);
939			error += vmwrite(VMCS_VIRTUAL_APIC,
940			    vtophys(&vmx->apic_page[i]));
941			error += vmwrite(VMCS_EOI_EXIT0, 0);
942			error += vmwrite(VMCS_EOI_EXIT1, 0);
943			error += vmwrite(VMCS_EOI_EXIT2, 0);
944			error += vmwrite(VMCS_EOI_EXIT3, 0);
945		}
946		if (posted_interrupts) {
947			error += vmwrite(VMCS_PIR_VECTOR, pirvec);
948			error += vmwrite(VMCS_PIR_DESC,
949			    vtophys(&vmx->pir_desc[i]));
950		}
951		VMCLEAR(vmcs);
952		KASSERT(error == 0, ("vmx_vminit: error customizing the vmcs"));
953
954		vmx->cap[i].set = 0;
955		vmx->cap[i].proc_ctls = procbased_ctls;
956		vmx->cap[i].proc_ctls2 = procbased_ctls2;
957
958		vmx->state[i].lastcpu = -1;
959		vmx->state[i].vpid = vpid[i];
960
961		msr_save_area_init(vmx->guest_msrs[i], &guest_msr_count);
962
963		error = vmcs_set_msr_save(vmcs, vtophys(vmx->guest_msrs[i]),
964		    guest_msr_count);
965		if (error != 0)
966			panic("vmcs_set_msr_save error %d", error);
967
968		/*
969		 * Set up the CR0/4 shadows, and init the read shadow
970		 * to the power-on register value from the Intel Sys Arch.
971		 *  CR0 - 0x60000010
972		 *  CR4 - 0
973		 */
974		error = vmx_setup_cr0_shadow(vmcs, 0x60000010);
975		if (error != 0)
976			panic("vmx_setup_cr0_shadow %d", error);
977
978		error = vmx_setup_cr4_shadow(vmcs, 0);
979		if (error != 0)
980			panic("vmx_setup_cr4_shadow %d", error);
981
982		vmx->ctx[i].pmap = pmap;
983	}
984
985	return (vmx);
986}
987
988static int
989vmx_handle_cpuid(struct vm *vm, int vcpu, struct vmxctx *vmxctx)
990{
991	int handled, func;
992
993	func = vmxctx->guest_rax;
994
995	handled = x86_emulate_cpuid(vm, vcpu,
996				    (uint32_t*)(&vmxctx->guest_rax),
997				    (uint32_t*)(&vmxctx->guest_rbx),
998				    (uint32_t*)(&vmxctx->guest_rcx),
999				    (uint32_t*)(&vmxctx->guest_rdx));
1000	return (handled);
1001}
1002
1003static __inline void
1004vmx_run_trace(struct vmx *vmx, int vcpu)
1005{
1006#ifdef KTR
1007	VCPU_CTR1(vmx->vm, vcpu, "Resume execution at %#lx", vmcs_guest_rip());
1008#endif
1009}
1010
1011static __inline void
1012vmx_exit_trace(struct vmx *vmx, int vcpu, uint64_t rip, uint32_t exit_reason,
1013	       int handled)
1014{
1015#ifdef KTR
1016	VCPU_CTR3(vmx->vm, vcpu, "%s %s vmexit at 0x%0lx",
1017		 handled ? "handled" : "unhandled",
1018		 exit_reason_to_str(exit_reason), rip);
1019#endif
1020}
1021
1022static __inline void
1023vmx_astpending_trace(struct vmx *vmx, int vcpu, uint64_t rip)
1024{
1025#ifdef KTR
1026	VCPU_CTR1(vmx->vm, vcpu, "astpending vmexit at 0x%0lx", rip);
1027#endif
1028}
1029
1030static VMM_STAT_INTEL(VCPU_INVVPID_SAVED, "Number of vpid invalidations saved");
1031
1032static void
1033vmx_set_pcpu_defaults(struct vmx *vmx, int vcpu, pmap_t pmap)
1034{
1035	struct vmxstate *vmxstate;
1036	struct invvpid_desc invvpid_desc;
1037
1038	vmxstate = &vmx->state[vcpu];
1039	if (vmxstate->lastcpu == curcpu)
1040		return;
1041
1042	vmxstate->lastcpu = curcpu;
1043
1044	vmm_stat_incr(vmx->vm, vcpu, VCPU_MIGRATIONS, 1);
1045
1046	vmcs_write(VMCS_HOST_TR_BASE, vmm_get_host_trbase());
1047	vmcs_write(VMCS_HOST_GDTR_BASE, vmm_get_host_gdtrbase());
1048	vmcs_write(VMCS_HOST_GS_BASE, vmm_get_host_gsbase());
1049
1050	/*
1051	 * If we are using VPIDs then invalidate all mappings tagged with 'vpid'
1052	 *
1053	 * We do this because this vcpu was executing on a different host
1054	 * cpu when it last ran. We do not track whether it invalidated
1055	 * mappings associated with its 'vpid' during that run. So we must
1056	 * assume that the mappings associated with 'vpid' on 'curcpu' are
1057	 * stale and invalidate them.
1058	 *
1059	 * Note that we incur this penalty only when the scheduler chooses to
1060	 * move the thread associated with this vcpu between host cpus.
1061	 *
1062	 * Note also that this will invalidate mappings tagged with 'vpid'
1063	 * for "all" EP4TAs.
1064	 */
1065	if (vmxstate->vpid != 0) {
1066		if (pmap->pm_eptgen == vmx->eptgen[curcpu]) {
1067			invvpid_desc._res1 = 0;
1068			invvpid_desc._res2 = 0;
1069			invvpid_desc.vpid = vmxstate->vpid;
1070			invvpid_desc.linear_addr = 0;
1071			invvpid(INVVPID_TYPE_SINGLE_CONTEXT, invvpid_desc);
1072		} else {
1073			/*
1074			 * The invvpid can be skipped if an invept is going to
1075			 * be performed before entering the guest. The invept
1076			 * will invalidate combined mappings tagged with
1077			 * 'vmx->eptp' for all vpids.
1078			 */
1079			vmm_stat_incr(vmx->vm, vcpu, VCPU_INVVPID_SAVED, 1);
1080		}
1081	}
1082}
1083
1084/*
1085 * We depend on 'procbased_ctls' to have the Interrupt Window Exiting bit set.
1086 */
1087CTASSERT((PROCBASED_CTLS_ONE_SETTING & PROCBASED_INT_WINDOW_EXITING) != 0);
1088
1089static void __inline
1090vmx_set_int_window_exiting(struct vmx *vmx, int vcpu)
1091{
1092
1093	if ((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) == 0) {
1094		vmx->cap[vcpu].proc_ctls |= PROCBASED_INT_WINDOW_EXITING;
1095		vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
1096		VCPU_CTR0(vmx->vm, vcpu, "Enabling interrupt window exiting");
1097	}
1098}
1099
1100static void __inline
1101vmx_clear_int_window_exiting(struct vmx *vmx, int vcpu)
1102{
1103
1104	KASSERT((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) != 0,
1105	    ("intr_window_exiting not set: %#x", vmx->cap[vcpu].proc_ctls));
1106	vmx->cap[vcpu].proc_ctls &= ~PROCBASED_INT_WINDOW_EXITING;
1107	vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
1108	VCPU_CTR0(vmx->vm, vcpu, "Disabling interrupt window exiting");
1109}
1110
1111static void __inline
1112vmx_set_nmi_window_exiting(struct vmx *vmx, int vcpu)
1113{
1114
1115	if ((vmx->cap[vcpu].proc_ctls & PROCBASED_NMI_WINDOW_EXITING) == 0) {
1116		vmx->cap[vcpu].proc_ctls |= PROCBASED_NMI_WINDOW_EXITING;
1117		vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
1118		VCPU_CTR0(vmx->vm, vcpu, "Enabling NMI window exiting");
1119	}
1120}
1121
1122static void __inline
1123vmx_clear_nmi_window_exiting(struct vmx *vmx, int vcpu)
1124{
1125
1126	KASSERT((vmx->cap[vcpu].proc_ctls & PROCBASED_NMI_WINDOW_EXITING) != 0,
1127	    ("nmi_window_exiting not set %#x", vmx->cap[vcpu].proc_ctls));
1128	vmx->cap[vcpu].proc_ctls &= ~PROCBASED_NMI_WINDOW_EXITING;
1129	vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
1130	VCPU_CTR0(vmx->vm, vcpu, "Disabling NMI window exiting");
1131}
1132
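/*
 * Guest interruptibility-state bits that block the injection of an NMI
 * or a hardware interrupt, respectively.
 */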
1133#define	NMI_BLOCKING	(VMCS_INTERRUPTIBILITY_NMI_BLOCKING |		\
1134			 VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING)
1135#define	HWINTR_BLOCKING	(VMCS_INTERRUPTIBILITY_STI_BLOCKING |		\
1136			 VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING)
1137
1138static void
1139vmx_inject_nmi(struct vmx *vmx, int vcpu)
1140{
1141	uint32_t gi, info;
1142
1143	gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
1144	KASSERT((gi & NMI_BLOCKING) == 0, ("vmx_inject_nmi: invalid guest "
1145	    "interruptibility-state %#x", gi));
1146
1147	info = vmcs_read(VMCS_ENTRY_INTR_INFO);
1148	KASSERT((info & VMCS_INTR_VALID) == 0, ("vmx_inject_nmi: invalid "
1149	    "VM-entry interruption information %#x", info));
1150
1151	/*
1152	 * Inject the virtual NMI. The vector must be the NMI IDT entry
1153	 * or the VMCS entry check will fail.
1154	 */
1155	info = IDT_NMI | VMCS_INTR_T_NMI | VMCS_INTR_VALID;
1156	vmcs_write(VMCS_ENTRY_INTR_INFO, info);
1157
1158	VCPU_CTR0(vmx->vm, vcpu, "Injecting vNMI");
1159
1160	/* Clear the request */
1161	vm_nmi_clear(vmx->vm, vcpu);
1162}
1163
1164static void
1165vmx_inject_interrupts(struct vmx *vmx, int vcpu, struct vlapic *vlapic)
1166{
1167	struct vm_exception exc;
1168	int vector, need_nmi_exiting, extint_pending;
1169	uint64_t rflags;
1170	uint32_t gi, info;
1171
1172	if (vm_exception_pending(vmx->vm, vcpu, &exc)) {
1173		KASSERT(exc.vector >= 0 && exc.vector < 32,
1174		    ("%s: invalid exception vector %d", __func__, exc.vector));
1175
1176		info = vmcs_read(VMCS_ENTRY_INTR_INFO);
1177		KASSERT((info & VMCS_INTR_VALID) == 0, ("%s: cannot inject "
1178		     "pending exception %d: %#x", __func__, exc.vector, info));
1179
1180		info = exc.vector | VMCS_INTR_T_HWEXCEPTION | VMCS_INTR_VALID;
1181		if (exc.error_code_valid) {
1182			info |= VMCS_INTR_DEL_ERRCODE;
1183			vmcs_write(VMCS_ENTRY_EXCEPTION_ERROR, exc.error_code);
1184		}
1185		vmcs_write(VMCS_ENTRY_INTR_INFO, info);
1186	}
1187
1188	if (vm_nmi_pending(vmx->vm, vcpu)) {
1189		/*
1190		 * If there are no conditions blocking NMI injection then
1191		 * inject it directly here otherwise enable "NMI window
1192		 * exiting" to inject it as soon as we can.
1193		 *
1194		 * We also check for STI_BLOCKING because some implementations
1195		 * don't allow NMI injection in this case. If we are running
1196		 * on a processor that doesn't have this restriction it will
1197		 * immediately exit and the NMI will be injected in the
1198		 * "NMI window exiting" handler.
1199		 */
1200		need_nmi_exiting = 1;
1201		gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
1202		if ((gi & (HWINTR_BLOCKING | NMI_BLOCKING)) == 0) {
1203			info = vmcs_read(VMCS_ENTRY_INTR_INFO);
1204			if ((info & VMCS_INTR_VALID) == 0) {
1205				vmx_inject_nmi(vmx, vcpu);
1206				need_nmi_exiting = 0;
1207			} else {
1208				VCPU_CTR1(vmx->vm, vcpu, "Cannot inject NMI "
1209				    "due to VM-entry intr info %#x", info);
1210			}
1211		} else {
1212			VCPU_CTR1(vmx->vm, vcpu, "Cannot inject NMI due to "
1213			    "Guest Interruptibility-state %#x", gi);
1214		}
1215
1216		if (need_nmi_exiting)
1217			vmx_set_nmi_window_exiting(vmx, vcpu);
1218	}
1219
1220	extint_pending = vm_extint_pending(vmx->vm, vcpu);
1221
1222	if (!extint_pending && virtual_interrupt_delivery) {
1223		vmx_inject_pir(vlapic);
1224		return;
1225	}
1226
1227	/*
1228	 * If interrupt-window exiting is already in effect then don't bother
1229	 * checking for pending interrupts. This is just an optimization and
1230	 * not needed for correctness.
1231	 */
1232	if ((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) != 0) {
1233		VCPU_CTR0(vmx->vm, vcpu, "Skip interrupt injection due to "
1234		    "pending int_window_exiting");
1235		return;
1236	}
1237
1238	if (!extint_pending) {
1239		/* Ask the local apic for a vector to inject */
1240		if (!vlapic_pending_intr(vlapic, &vector))
1241			return;
1242	} else {
1243		/* Ask the legacy pic for a vector to inject */
1244		vatpic_pending_intr(vmx->vm, &vector);
1245	}
1246
1247	KASSERT(vector >= 32 && vector <= 255, ("invalid vector %d", vector));
1248
1249	/* Check RFLAGS.IF and the interruptibility state of the guest */
1250	rflags = vmcs_read(VMCS_GUEST_RFLAGS);
1251	if ((rflags & PSL_I) == 0) {
1252		VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to "
1253		    "rflags %#lx", vector, rflags);
1254		goto cantinject;
1255	}
1256
1257	gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
1258	if (gi & HWINTR_BLOCKING) {
1259		VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to "
1260		    "Guest Interruptibility-state %#x", vector, gi);
1261		goto cantinject;
1262	}
1263
1264	info = vmcs_read(VMCS_ENTRY_INTR_INFO);
1265	if (info & VMCS_INTR_VALID) {
1266		/*
1267		 * This is expected and could happen for multiple reasons:
1268		 * - A vectoring VM-entry was aborted due to astpending
1269		 * - A VM-exit happened during event injection.
1270		 * - An exception was injected above.
1271		 * - An NMI was injected above or after "NMI window exiting"
1272		 */
1273		VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to "
1274		    "VM-entry intr info %#x", vector, info);
1275		goto cantinject;
1276	}
1277
1278	/* Inject the interrupt */
1279	info = VMCS_INTR_T_HWINTR | VMCS_INTR_VALID;
1280	info |= vector;
1281	vmcs_write(VMCS_ENTRY_INTR_INFO, info);
1282
1283	if (!extint_pending) {
1284		/* Update the Local APIC ISR */
1285		vlapic_intr_accepted(vlapic, vector);
1286	} else {
1287		vm_extint_clear(vmx->vm, vcpu);
1288		vatpic_intr_accepted(vmx->vm, vector);
1289
1290		/*
1291		 * After we accepted the current ExtINT the PIC may
1292		 * have posted another one.  If that is the case, set
1293		 * the Interrupt Window Exiting execution control so
1294		 * we can inject that one too.
1295		 */
1296		if (vm_extint_pending(vmx->vm, vcpu))
1297			vmx_set_int_window_exiting(vmx, vcpu);
1298	}
1299
1300	VCPU_CTR1(vmx->vm, vcpu, "Injecting hwintr at vector %d", vector);
1301
1302	return;
1303
1304cantinject:
1305	/*
1306	 * Set the Interrupt Window Exiting execution control so we can inject
1307	 * the interrupt as soon as the blocking condition goes away.
1308	 */
1309	vmx_set_int_window_exiting(vmx, vcpu);
1310}
1311
1312/*
1313 * If the Virtual NMIs execution control is '1' then the logical processor
1314 * tracks virtual-NMI blocking in the Guest Interruptibility-state field of
1315 * the VMCS. An IRET instruction in VMX non-root operation will remove any
1316 * virtual-NMI blocking.
1317 *
1318 * This unblocking occurs even if the IRET causes a fault. In this case the
1319 * hypervisor needs to restore virtual-NMI blocking before resuming the guest.
1320 */
1321static void
1322vmx_restore_nmi_blocking(struct vmx *vmx, int vcpuid)
1323{
1324	uint32_t gi;
1325
1326	VCPU_CTR0(vmx->vm, vcpuid, "Restore Virtual-NMI blocking");
1327	gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
1328	gi |= VMCS_INTERRUPTIBILITY_NMI_BLOCKING;
1329	vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi);
1330}
1331
1332static void
1333vmx_clear_nmi_blocking(struct vmx *vmx, int vcpuid)
1334{
1335	uint32_t gi;
1336
1337	VCPU_CTR0(vmx->vm, vcpuid, "Clear Virtual-NMI blocking");
1338	gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
1339	gi &= ~VMCS_INTERRUPTIBILITY_NMI_BLOCKING;
1340	vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi);
1341}
1342
1343static int
1344vmx_emulate_xsetbv(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
1345{
1346	struct vmxctx *vmxctx;
1347	uint64_t xcrval;
1348	const struct xsave_limits *limits;
1349
1350	vmxctx = &vmx->ctx[vcpu];
1351	limits = vmm_get_xsave_limits();
1352
1353	/*
1354	 * Note that the processor raises a GP# fault on its own if
1355	 * xsetbv is executed for CPL != 0, so we do not have to
1356	 * emulate that fault here.
1357	 */
1358
1359	/* Only xcr0 is supported. */
1360	if (vmxctx->guest_rcx != 0) {
1361		vm_inject_gp(vmx->vm, vcpu);
1362		return (HANDLED);
1363	}
1364
1365	/* We only handle xcr0 if both the host and guest have XSAVE enabled. */
1366	if (!limits->xsave_enabled || !(vmcs_read(VMCS_GUEST_CR4) & CR4_XSAVE)) {
1367		vm_inject_ud(vmx->vm, vcpu);
1368		return (HANDLED);
1369	}
1370
1371	xcrval = vmxctx->guest_rdx << 32 | (vmxctx->guest_rax & 0xffffffff);
1372	if ((xcrval & ~limits->xcr0_allowed) != 0) {
1373		vm_inject_gp(vmx->vm, vcpu);
1374		return (HANDLED);
1375	}
1376
1377	if (!(xcrval & XFEATURE_ENABLED_X87)) {
1378		vm_inject_gp(vmx->vm, vcpu);
1379		return (HANDLED);
1380	}
1381
1382	if ((xcrval & (XFEATURE_ENABLED_AVX | XFEATURE_ENABLED_SSE)) ==
1383	    XFEATURE_ENABLED_AVX) {
1384		vm_inject_gp(vmx->vm, vcpu);
1385		return (HANDLED);
1386	}
1387
1388	/*
1389	 * This runs "inside" vmrun() with the guest's FPU state, so
1390	 * modifying xcr0 directly modifies the guest's xcr0, not the
1391	 * host's.
1392	 */
1393	load_xcr(0, xcrval);
1394	return (HANDLED);
1395}
1396
1397static int
1398vmx_emulate_cr_access(struct vmx *vmx, int vcpu, uint64_t exitqual)
1399{
1400	int cr, vmcs_guest_cr, vmcs_shadow_cr;
1401	uint64_t crval, regval, ones_mask, zeros_mask;
1402	const struct vmxctx *vmxctx;
1403
1404	/* We only handle mov to %cr0 or %cr4 at this time */
1405	if ((exitqual & 0xf0) != 0x00)
1406		return (UNHANDLED);
1407
1408	cr = exitqual & 0xf;
1409	if (cr != 0 && cr != 4)
1410		return (UNHANDLED);
1411
1412	regval = 0; /* silence gcc */
1413	vmxctx = &vmx->ctx[vcpu];
1414
1415	/*
1416	 * We must use vmcs_write() directly here because vmcs_setreg() will
1417	 * call vmclear(vmcs) as a side-effect which we certainly don't want.
1418	 */
1419	switch ((exitqual >> 8) & 0xf) {
1420	case 0:
1421		regval = vmxctx->guest_rax;
1422		break;
1423	case 1:
1424		regval = vmxctx->guest_rcx;
1425		break;
1426	case 2:
1427		regval = vmxctx->guest_rdx;
1428		break;
1429	case 3:
1430		regval = vmxctx->guest_rbx;
1431		break;
1432	case 4:
1433		regval = vmcs_read(VMCS_GUEST_RSP);
1434		break;
1435	case 5:
1436		regval = vmxctx->guest_rbp;
1437		break;
1438	case 6:
1439		regval = vmxctx->guest_rsi;
1440		break;
1441	case 7:
1442		regval = vmxctx->guest_rdi;
1443		break;
1444	case 8:
1445		regval = vmxctx->guest_r8;
1446		break;
1447	case 9:
1448		regval = vmxctx->guest_r9;
1449		break;
1450	case 10:
1451		regval = vmxctx->guest_r10;
1452		break;
1453	case 11:
1454		regval = vmxctx->guest_r11;
1455		break;
1456	case 12:
1457		regval = vmxctx->guest_r12;
1458		break;
1459	case 13:
1460		regval = vmxctx->guest_r13;
1461		break;
1462	case 14:
1463		regval = vmxctx->guest_r14;
1464		break;
1465	case 15:
1466		regval = vmxctx->guest_r15;
1467		break;
1468	}
1469
1470	if (cr == 0) {
1471		ones_mask = cr0_ones_mask;
1472		zeros_mask = cr0_zeros_mask;
1473		vmcs_guest_cr = VMCS_GUEST_CR0;
1474		vmcs_shadow_cr = VMCS_CR0_SHADOW;
1475	} else {
1476		ones_mask = cr4_ones_mask;
1477		zeros_mask = cr4_zeros_mask;
1478		vmcs_guest_cr = VMCS_GUEST_CR4;
1479		vmcs_shadow_cr = VMCS_CR4_SHADOW;
1480	}
1481	vmcs_write(vmcs_shadow_cr, regval);
1482
1483	crval = regval | ones_mask;
1484	crval &= ~zeros_mask;
1485	vmcs_write(vmcs_guest_cr, crval);
1486
1487	if (cr == 0 && regval & CR0_PG) {
1488		uint64_t efer, entry_ctls;
1489
1490		/*
1491		 * If CR0.PG is 1 and EFER.LME is 1 then EFER.LMA and
1492		 * the "IA-32e mode guest" bit in VM-entry control must be
1493		 * equal.
1494		 */
1495		efer = vmcs_read(VMCS_GUEST_IA32_EFER);
1496		if (efer & EFER_LME) {
1497			efer |= EFER_LMA;
1498			vmcs_write(VMCS_GUEST_IA32_EFER, efer);
1499			entry_ctls = vmcs_read(VMCS_ENTRY_CTLS);
1500			entry_ctls |= VM_ENTRY_GUEST_LMA;
1501			vmcs_write(VMCS_ENTRY_CTLS, entry_ctls);
1502		}
1503	}
1504
1505	return (HANDLED);
1506}
1507
1508/*
1509 * From section "Guest Register State" in the Intel SDM: CPL = SS.DPL
1510 */
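/* The DPL is encoded in bits 5:6 of the segment access-rights field. */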
1511static int
1512vmx_cpl(void)
1513{
1514	uint32_t ssar;
1515
1516	ssar = vmcs_read(VMCS_GUEST_SS_ACCESS_RIGHTS);
1517	return ((ssar >> 5) & 0x3);
1518}
1519
1520static enum vie_cpu_mode
1521vmx_cpu_mode(void)
1522{
1523
1524	if (vmcs_read(VMCS_GUEST_IA32_EFER) & EFER_LMA)
1525		return (CPU_MODE_64BIT);
1526	else
1527		return (CPU_MODE_COMPATIBILITY);
1528}
1529
1530static enum vie_paging_mode
1531vmx_paging_mode(void)
1532{
1533
1534	if (!(vmcs_read(VMCS_GUEST_CR0) & CR0_PG))
1535		return (PAGING_MODE_FLAT);
1536	if (!(vmcs_read(VMCS_GUEST_CR4) & CR4_PAE))
1537		return (PAGING_MODE_32);
1538	if (vmcs_read(VMCS_GUEST_IA32_EFER) & EFER_LME)
1539		return (PAGING_MODE_64);
1540	else
1541		return (PAGING_MODE_PAE);
1542}
1543
1544static uint64_t
1545inout_str_index(struct vmx *vmx, int vcpuid, int in)
1546{
1547	uint64_t val;
1548	int error;
1549	enum vm_reg_name reg;
1550
1551	reg = in ? VM_REG_GUEST_RDI : VM_REG_GUEST_RSI;
1552	error = vmx_getreg(vmx, vcpuid, reg, &val);
1553	KASSERT(error == 0, ("%s: vmx_getreg error %d", __func__, error));
1554	return (val);
1555}
1556
1557static uint64_t
1558inout_str_count(struct vmx *vmx, int vcpuid, int rep)
1559{
1560	uint64_t val;
1561	int error;
1562
1563	if (rep) {
1564		error = vmx_getreg(vmx, vcpuid, VM_REG_GUEST_RCX, &val);
1565		KASSERT(!error, ("%s: vmx_getreg error %d", __func__, error));
1566	} else {
1567		val = 1;
1568	}
1569	return (val);
1570}
1571
1572static int
1573inout_str_addrsize(uint32_t inst_info)
1574{
1575	uint32_t size;
1576
1577	size = (inst_info >> 7) & 0x7;
1578	switch (size) {
1579	case 0:
1580		return (2);	/* 16 bit */
1581	case 1:
1582		return (4);	/* 32 bit */
1583	case 2:
1584		return (8);	/* 64 bit */
1585	default:
1586		panic("%s: invalid size encoding %d", __func__, size);
1587	}
1588}
1589
1590static void
1591inout_str_seginfo(struct vmx *vmx, int vcpuid, uint32_t inst_info, int in,
1592    struct vm_inout_str *vis)
1593{
1594	int error, s;
1595
1596	if (in) {
1597		vis->seg_name = VM_REG_GUEST_ES;
1598	} else {
1599		s = (inst_info >> 15) & 0x7;
1600		vis->seg_name = vm_segment_name(s);
1601	}
1602
1603	error = vmx_getdesc(vmx, vcpuid, vis->seg_name, &vis->seg_desc);
1604	KASSERT(error == 0, ("%s: vmx_getdesc error %d", __func__, error));
1605
1606	/* XXX modify svm.c to update bit 16 of seg_desc.access (unusable) */
1607}
1608
1609static void
1610vmexit_inst_emul(struct vm_exit *vmexit, uint64_t gpa, uint64_t gla)
1611{
1612	vmexit->exitcode = VM_EXITCODE_INST_EMUL;
1613	vmexit->u.inst_emul.gpa = gpa;
1614	vmexit->u.inst_emul.gla = gla;
1615	vmexit->u.inst_emul.cr3 = vmcs_guest_cr3();
1616	vmexit->u.inst_emul.cpu_mode = vmx_cpu_mode();
1617	vmexit->u.inst_emul.paging_mode = vmx_paging_mode();
1618	vmexit->u.inst_emul.cpl = vmx_cpl();
1619}
1620
1621static int
1622ept_fault_type(uint64_t ept_qual)
1623{
1624	int fault_type;
1625
1626	if (ept_qual & EPT_VIOLATION_DATA_WRITE)
1627		fault_type = VM_PROT_WRITE;
1628	else if (ept_qual & EPT_VIOLATION_INST_FETCH)
1629		fault_type = VM_PROT_EXECUTE;
1630	else
1631		fault_type = VM_PROT_READ;
1632
1633	return (fault_type);
1634}
1635
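/*
 * Return TRUE if the EPT violation looks like an ordinary data access to
 * a guest-physical address translated from a guest-linear address (the
 * MMIO case handled by instruction emulation), FALSE otherwise.
 */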
1636static boolean_t
1637ept_emulation_fault(uint64_t ept_qual)
1638{
1639	int read, write;
1640
1641	/* EPT fault on an instruction fetch doesn't make sense here */
1642	if (ept_qual & EPT_VIOLATION_INST_FETCH)
1643		return (FALSE);
1644
1645	/* EPT fault must be a read fault or a write fault */
1646	read = ept_qual & EPT_VIOLATION_DATA_READ ? 1 : 0;
1647	write = ept_qual & EPT_VIOLATION_DATA_WRITE ? 1 : 0;
1648	if ((read | write) == 0)
1649		return (FALSE);
1650
1651	/*
1652	 * The EPT violation must have been caused by accessing a
1653	 * guest-physical address that is a translation of a guest-linear
1654	 * address.
1655	 */
1656	if ((ept_qual & EPT_VIOLATION_GLA_VALID) == 0 ||
1657	    (ept_qual & EPT_VIOLATION_XLAT_VALID) == 0) {
1658		return (FALSE);
1659	}
1660
1661	return (TRUE);
1662}
1663
1664static __inline int
1665apic_access_virtualization(struct vmx *vmx, int vcpuid)
1666{
1667	uint32_t proc_ctls2;
1668
1669	proc_ctls2 = vmx->cap[vcpuid].proc_ctls2;
1670	return ((proc_ctls2 & PROCBASED2_VIRTUALIZE_APIC_ACCESSES) ? 1 : 0);
1671}
1672
1673static __inline int
1674x2apic_virtualization(struct vmx *vmx, int vcpuid)
1675{
1676	uint32_t proc_ctls2;
1677
1678	proc_ctls2 = vmx->cap[vcpuid].proc_ctls2;
1679	return ((proc_ctls2 & PROCBASED2_VIRTUALIZE_X2APIC_MODE) ? 1 : 0);
1680}
1681
1682static int
1683vmx_handle_apic_write(struct vmx *vmx, int vcpuid, struct vlapic *vlapic,
1684    uint64_t qual)
1685{
1686	int error, handled, offset;
1687	uint32_t *apic_regs, vector;
1688	bool retu;
1689
1690	handled = HANDLED;
1691	offset = APIC_WRITE_OFFSET(qual);
1692
1693	if (!apic_access_virtualization(vmx, vcpuid)) {
1694		/*
1695		 * In general there should not be any APIC write VM-exits
1696		 * unless APIC-access virtualization is enabled.
1697		 *
1698		 * However self-IPI virtualization can legitimately trigger
1699		 * an APIC-write VM-exit so treat it specially.
1700		 */
1701		if (x2apic_virtualization(vmx, vcpuid) &&
1702		    offset == APIC_OFFSET_SELF_IPI) {
1703			apic_regs = (uint32_t *)(vlapic->apic_page);
1704			vector = apic_regs[APIC_OFFSET_SELF_IPI / 4];
1705			vlapic_self_ipi_handler(vlapic, vector);
1706			return (HANDLED);
1707		} else
1708			return (UNHANDLED);
1709	}
1710
1711	switch (offset) {
1712	case APIC_OFFSET_ID:
1713		vlapic_id_write_handler(vlapic);
1714		break;
1715	case APIC_OFFSET_LDR:
1716		vlapic_ldr_write_handler(vlapic);
1717		break;
1718	case APIC_OFFSET_DFR:
1719		vlapic_dfr_write_handler(vlapic);
1720		break;
1721	case APIC_OFFSET_SVR:
1722		vlapic_svr_write_handler(vlapic);
1723		break;
1724	case APIC_OFFSET_ESR:
1725		vlapic_esr_write_handler(vlapic);
1726		break;
1727	case APIC_OFFSET_ICR_LOW:
1728		retu = false;
1729		error = vlapic_icrlo_write_handler(vlapic, &retu);
1730		if (error != 0 || retu)
1731			handled = UNHANDLED;
1732		break;
1733	case APIC_OFFSET_CMCI_LVT:
1734	case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT:
1735		vlapic_lvt_write_handler(vlapic, offset);
1736		break;
1737	case APIC_OFFSET_TIMER_ICR:
1738		vlapic_icrtmr_write_handler(vlapic);
1739		break;
1740	case APIC_OFFSET_TIMER_DCR:
1741		vlapic_dcr_write_handler(vlapic);
1742		break;
1743	default:
1744		handled = UNHANDLED;
1745		break;
1746	}
1747	return (handled);
1748}
1749
1750static bool
1751apic_access_fault(struct vmx *vmx, int vcpuid, uint64_t gpa)
1752{
1753
1754	if (apic_access_virtualization(vmx, vcpuid) &&
1755	    (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE))
1756		return (true);
1757	else
1758		return (false);
1759}
1760
1761static int
1762vmx_handle_apic_access(struct vmx *vmx, int vcpuid, struct vm_exit *vmexit)
1763{
1764	uint64_t qual;
1765	int access_type, offset, allowed;
1766
1767	if (!apic_access_virtualization(vmx, vcpuid))
1768		return (UNHANDLED);
1769
1770	qual = vmexit->u.vmx.exit_qualification;
1771	access_type = APIC_ACCESS_TYPE(qual);
1772	offset = APIC_ACCESS_OFFSET(qual);
1773
1774	allowed = 0;
1775	if (access_type == 0) {
1776		/*
1777		 * Read data access to the following registers is expected.
1778		 */
1779		switch (offset) {
1780		case APIC_OFFSET_APR:
1781		case APIC_OFFSET_PPR:
1782		case APIC_OFFSET_RRR:
1783		case APIC_OFFSET_CMCI_LVT:
1784		case APIC_OFFSET_TIMER_CCR:
1785			allowed = 1;
1786			break;
1787		default:
1788			break;
1789		}
1790	} else if (access_type == 1) {
1791		/*
1792		 * Write data access to the following registers is expected.
1793		 */
1794		switch (offset) {
1795		case APIC_OFFSET_VER:
1796		case APIC_OFFSET_APR:
1797		case APIC_OFFSET_PPR:
1798		case APIC_OFFSET_RRR:
1799		case APIC_OFFSET_ISR0 ... APIC_OFFSET_ISR7:
1800		case APIC_OFFSET_TMR0 ... APIC_OFFSET_TMR7:
1801		case APIC_OFFSET_IRR0 ... APIC_OFFSET_IRR7:
1802		case APIC_OFFSET_CMCI_LVT:
1803		case APIC_OFFSET_TIMER_CCR:
1804			allowed = 1;
1805			break;
1806		default:
1807			break;
1808		}
1809	}
1810
1811	if (allowed) {
1812		vmexit_inst_emul(vmexit, DEFAULT_APIC_BASE + offset,
1813		    VIE_INVALID_GLA);
1814	}
1815
1816	/*
1817	 * Regardless of whether the APIC-access is allowed this handler
1818	 * always returns UNHANDLED:
1819	 * - if the access is allowed then it is handled by emulating the
1820	 *   instruction that caused the VM-exit (outside the critical section)
1821	 * - if the access is not allowed then it will be converted to an
1822	 *   exitcode of VM_EXITCODE_VMX and will be dealt with in userland.
1823	 */
1824	return (UNHANDLED);
1825}
1826
1827static int
1828vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
1829{
1830	int error, handled, in;
1831	struct vmxctx *vmxctx;
1832	struct vlapic *vlapic;
1833	struct vm_inout_str *vis;
1834	uint32_t eax, ecx, edx, idtvec_info, idtvec_err, intr_info, inst_info;
1835	uint32_t reason;
1836	uint64_t qual, gpa;
1837	bool retu;
1838
1839	CTASSERT((PINBASED_CTLS_ONE_SETTING & PINBASED_VIRTUAL_NMI) != 0);
1840	CTASSERT((PINBASED_CTLS_ONE_SETTING & PINBASED_NMI_EXITING) != 0);
1841
1842	handled = UNHANDLED;
1843	vmxctx = &vmx->ctx[vcpu];
1844
1845	qual = vmexit->u.vmx.exit_qualification;
1846	reason = vmexit->u.vmx.exit_reason;
1847	vmexit->exitcode = VM_EXITCODE_BOGUS;
1848
1849	vmm_stat_incr(vmx->vm, vcpu, VMEXIT_COUNT, 1);
1850
1851	/*
1852	 * VM exits that could be triggered during event injection on the
1853	 * previous VM entry need to be handled specially by re-injecting
1854	 * the event.
1855	 *
1856	 * See "Information for VM Exits During Event Delivery" in Intel SDM
1857	 * for details.
1858	 */
1859	switch (reason) {
1860	case EXIT_REASON_EPT_FAULT:
1861	case EXIT_REASON_EPT_MISCONFIG:
1862	case EXIT_REASON_APIC_ACCESS:
1863	case EXIT_REASON_TASK_SWITCH:
1864	case EXIT_REASON_EXCEPTION:
1865		idtvec_info = vmcs_idt_vectoring_info();
1866		if (idtvec_info & VMCS_IDT_VEC_VALID) {
1867			idtvec_info &= ~(1 << 12); /* clear undefined bit */
1868			vmcs_write(VMCS_ENTRY_INTR_INFO, idtvec_info);
1869			if (idtvec_info & VMCS_IDT_VEC_ERRCODE_VALID) {
1870				idtvec_err = vmcs_idt_vectoring_err();
1871				vmcs_write(VMCS_ENTRY_EXCEPTION_ERROR,
1872				    idtvec_err);
1873			}
1874			/*
1875			 * If 'virtual NMIs' are being used and the VM-exit
1876			 * happened while injecting an NMI during the previous
1877			 * VM-entry, then clear "blocking by NMI" in the Guest
1878			 * Interruptibility-state.
1879			 */
1880			if ((idtvec_info & VMCS_INTR_T_MASK) ==
1881			    VMCS_INTR_T_NMI) {
1882				 vmx_clear_nmi_blocking(vmx, vcpu);
1883			}
1884			vmcs_write(VMCS_ENTRY_INST_LENGTH, vmexit->inst_length);
1885		}
1886	default:
1887		idtvec_info = 0;
1888		break;
1889	}
1890
1891	switch (reason) {
1892	case EXIT_REASON_CR_ACCESS:
1893		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CR_ACCESS, 1);
1894		handled = vmx_emulate_cr_access(vmx, vcpu, qual);
1895		break;
1896	case EXIT_REASON_RDMSR:
1897		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_RDMSR, 1);
1898		retu = false;
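		/*
		 * If emulation succeeds with 'retu' set then the MSR access
		 * must be completed in userspace and the emulation path is
		 * expected to have filled in the exitcode already (see the
		 * KASSERT below).
		 */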
1899		ecx = vmxctx->guest_rcx;
1900		VCPU_CTR1(vmx->vm, vcpu, "rdmsr 0x%08x", ecx);
1901		error = emulate_rdmsr(vmx->vm, vcpu, ecx, &retu);
1902		if (error) {
1903			vmexit->exitcode = VM_EXITCODE_RDMSR;
1904			vmexit->u.msr.code = ecx;
1905		} else if (!retu) {
1906			handled = HANDLED;
1907		} else {
1908			/* Return to userspace with a valid exitcode */
1909			KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS,
1910			    ("emulate_wrmsr retu with bogus exitcode"));
1911		}
1912		break;
1913	case EXIT_REASON_WRMSR:
1914		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_WRMSR, 1);
1915		retu = false;
1916		eax = vmxctx->guest_rax;
1917		ecx = vmxctx->guest_rcx;
1918		edx = vmxctx->guest_rdx;
1919		VCPU_CTR2(vmx->vm, vcpu, "wrmsr 0x%08x value 0x%016lx",
1920		    ecx, (uint64_t)edx << 32 | eax);
1921		error = emulate_wrmsr(vmx->vm, vcpu, ecx,
1922		    (uint64_t)edx << 32 | eax, &retu);
1923		if (error) {
1924			vmexit->exitcode = VM_EXITCODE_WRMSR;
1925			vmexit->u.msr.code = ecx;
1926			vmexit->u.msr.wval = (uint64_t)edx << 32 | eax;
1927		} else if (!retu) {
1928			handled = HANDLED;
1929		} else {
1930			/* Return to userspace with a valid exitcode */
1931			KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS,
1932			    ("emulate_wrmsr retu with bogus exitcode"));
1933		}
1934		break;
1935	case EXIT_REASON_HLT:
1936		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_HLT, 1);
1937		vmexit->exitcode = VM_EXITCODE_HLT;
1938		vmexit->u.hlt.rflags = vmcs_read(VMCS_GUEST_RFLAGS);
1939		break;
1940	case EXIT_REASON_MTF:
1941		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_MTRAP, 1);
1942		vmexit->exitcode = VM_EXITCODE_MTRAP;
1943		break;
1944	case EXIT_REASON_PAUSE:
1945		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_PAUSE, 1);
1946		vmexit->exitcode = VM_EXITCODE_PAUSE;
1947		break;
1948	case EXIT_REASON_INTR_WINDOW:
1949		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INTR_WINDOW, 1);
1950		vmx_clear_int_window_exiting(vmx, vcpu);
1951		return (1);
1952	case EXIT_REASON_EXT_INTR:
1953		/*
1954		 * External interrupts serve only to cause VM exits and allow
1955		 * the host interrupt handler to run.
1956		 *
1957		 * If this external interrupt triggers a virtual interrupt
1958		 * to a VM, then that state will be recorded by the
1959		 * host interrupt handler in the VM's softc. We will inject
1960		 * this virtual interrupt during the subsequent VM enter.
1961		 */
1962		intr_info = vmcs_read(VMCS_EXIT_INTR_INFO);
1963
1964		/*
1965		 * XXX: Ignore this exit if VMCS_INTR_VALID is not set.
1966		 * This appears to be a bug in VMware Fusion?
1967		 */
1968		if (!(intr_info & VMCS_INTR_VALID))
1969			return (1);
1970		KASSERT((intr_info & VMCS_INTR_VALID) != 0 &&
1971		    (intr_info & VMCS_INTR_T_MASK) == VMCS_INTR_T_HWINTR,
1972		    ("VM exit interruption info invalid: %#x", intr_info));
1973		vmx_trigger_hostintr(intr_info & 0xff);
1974
1975		/*
1976		 * This is special. We want to treat this as a 'handled'
1977		 * VM-exit but not increment the instruction pointer.
1978		 */
1979		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EXTINT, 1);
1980		return (1);
1981	case EXIT_REASON_NMI_WINDOW:
1982		/* Exit to allow the pending virtual NMI to be injected */
1983		if (vm_nmi_pending(vmx->vm, vcpu))
1984			vmx_inject_nmi(vmx, vcpu);
1985		vmx_clear_nmi_window_exiting(vmx, vcpu);
1986		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_NMI_WINDOW, 1);
1987		return (1);
1988	case EXIT_REASON_INOUT:
1989		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INOUT, 1);
1990		vmexit->exitcode = VM_EXITCODE_INOUT;
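		/*
		 * Decode the exit qualification for I/O instructions as
		 * described in the Intel SDM: bits 2:0 hold the access size
		 * minus one, bit 3 the direction (1 = in), bit 4 the string
		 * flag, bit 5 the REP prefix and bits 31:16 the port number.
		 */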
1991		vmexit->u.inout.bytes = (qual & 0x7) + 1;
1992		vmexit->u.inout.in = in = (qual & 0x8) ? 1 : 0;
1993		vmexit->u.inout.string = (qual & 0x10) ? 1 : 0;
1994		vmexit->u.inout.rep = (qual & 0x20) ? 1 : 0;
1995		vmexit->u.inout.port = (uint16_t)(qual >> 16);
1996		vmexit->u.inout.eax = (uint32_t)(vmxctx->guest_rax);
1997		if (vmexit->u.inout.string) {
1998			inst_info = vmcs_read(VMCS_EXIT_INSTRUCTION_INFO);
1999			vmexit->exitcode = VM_EXITCODE_INOUT_STR;
2000			vis = &vmexit->u.inout_str;
2001			vis->cpu_mode = vmx_cpu_mode();
2002			vis->paging_mode = vmx_paging_mode();
2003			vis->rflags = vmcs_read(VMCS_GUEST_RFLAGS);
2004			vis->cr0 = vmcs_read(VMCS_GUEST_CR0);
2005			vis->cr3 = vmcs_read(VMCS_GUEST_CR3);
2006			vis->cpl = vmx_cpl();
2007			vis->index = inout_str_index(vmx, vcpu, in);
2008			vis->count = inout_str_count(vmx, vcpu, vis->inout.rep);
2009			vis->addrsize = inout_str_addrsize(inst_info);
2010			inout_str_seginfo(vmx, vcpu, inst_info, in, vis);
2011			vis->gla = vmcs_gla();
2012		}
2013		break;
2014	case EXIT_REASON_CPUID:
2015		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CPUID, 1);
2016		handled = vmx_handle_cpuid(vmx->vm, vcpu, vmxctx);
2017		break;
2018	case EXIT_REASON_EXCEPTION:
2019		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EXCEPTION, 1);
2020		intr_info = vmcs_read(VMCS_EXIT_INTR_INFO);
2021		KASSERT((intr_info & VMCS_INTR_VALID) != 0,
2022		    ("VM exit interruption info invalid: %#x", intr_info));
2023
2024		/*
2025		 * If Virtual NMIs control is 1 and the VM-exit is due to a
2026		 * fault encountered during the execution of IRET then we must
2027		 * restore the state of "virtual-NMI blocking" before resuming
2028		 * the guest.
2029		 *
2030		 * See "Resuming Guest Software after Handling an Exception".
2031		 */
2032		if ((idtvec_info & VMCS_IDT_VEC_VALID) == 0 &&
2033		    (intr_info & 0xff) != IDT_DF &&
2034		    (intr_info & EXIT_QUAL_NMIUDTI) != 0)
2035			vmx_restore_nmi_blocking(vmx, vcpu);
2036
2037		/*
2038		 * The NMI has already been handled in vmx_exit_handle_nmi().
2039		 */
2040		if ((intr_info & VMCS_INTR_T_MASK) == VMCS_INTR_T_NMI)
2041			return (1);
2042		break;
2043	case EXIT_REASON_EPT_FAULT:
2044		/*
2045		 * If 'gpa' lies within the address space allocated to
2046		 * guest memory then this must be a nested page fault;
2047		 * otherwise it must be an instruction that accesses MMIO space.
2048		 */
2049		gpa = vmcs_gpa();
2050		if (vm_mem_allocated(vmx->vm, gpa) ||
2051		    apic_access_fault(vmx, vcpu, gpa)) {
2052			vmexit->exitcode = VM_EXITCODE_PAGING;
2053			vmexit->u.paging.gpa = gpa;
2054			vmexit->u.paging.fault_type = ept_fault_type(qual);
2055			vmm_stat_incr(vmx->vm, vcpu, VMEXIT_NESTED_FAULT, 1);
2056		} else if (ept_emulation_fault(qual)) {
2057			vmexit_inst_emul(vmexit, gpa, vmcs_gla());
2058			vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INST_EMUL, 1);
2059		}
2060		/*
2061		 * If Virtual NMIs control is 1 and the VM-exit is due to an
2062		 * EPT fault during the execution of IRET then we must restore
2063		 * the state of "virtual-NMI blocking" before resuming.
2064		 *
2065		 * See description of "NMI unblocking due to IRET" in
2066		 * "Exit Qualification for EPT Violations".
2067		 */
2068		if ((idtvec_info & VMCS_IDT_VEC_VALID) == 0 &&
2069		    (qual & EXIT_QUAL_NMIUDTI) != 0)
2070			vmx_restore_nmi_blocking(vmx, vcpu);
2071		break;
2072	case EXIT_REASON_VIRTUALIZED_EOI:
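		/*
		 * The low byte of the exit qualification holds the vector
		 * whose EOI was virtualized; hand it back via the IOAPIC_EOI
		 * exitcode so the EOI can be propagated to the virtual
		 * I/O APIC.
		 */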
2073		vmexit->exitcode = VM_EXITCODE_IOAPIC_EOI;
2074		vmexit->u.ioapic_eoi.vector = qual & 0xFF;
2075		vmexit->inst_length = 0;	/* trap-like */
2076		break;
2077	case EXIT_REASON_APIC_ACCESS:
2078		handled = vmx_handle_apic_access(vmx, vcpu, vmexit);
2079		break;
2080	case EXIT_REASON_APIC_WRITE:
2081		/*
2082		 * APIC-write VM exit is trap-like so the %rip is already
2083		 * pointing to the next instruction.
2084		 */
2085		vmexit->inst_length = 0;
2086		vlapic = vm_lapic(vmx->vm, vcpu);
2087		handled = vmx_handle_apic_write(vmx, vcpu, vlapic, qual);
2088		break;
2089	case EXIT_REASON_XSETBV:
2090		handled = vmx_emulate_xsetbv(vmx, vcpu, vmexit);
2091		break;
2092	default:
2093		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_UNKNOWN, 1);
2094		break;
2095	}
2096
2097	if (handled) {
2098		/*
2099		 * It is possible that control is returned to userland
2100		 * even though we were able to handle the VM exit in the
2101		 * kernel.
2102		 *
2103		 * In such a case we want to make sure that the userland
2104		 * restarts guest execution at the instruction *after*
2105		 * the one we just processed. Therefore we update the
2106		 * guest rip in the VMCS and in 'vmexit'.
2107		 */
2108		vmexit->rip += vmexit->inst_length;
2109		vmexit->inst_length = 0;
2110		vmcs_write(VMCS_GUEST_RIP, vmexit->rip);
2111	} else {
2112		if (vmexit->exitcode == VM_EXITCODE_BOGUS) {
2113			/*
2114			 * If this VM exit was not claimed by anybody then
2115			 * treat it as a generic VMX exit.
2116			 */
2117			vmexit->exitcode = VM_EXITCODE_VMX;
2118			vmexit->u.vmx.status = VM_SUCCESS;
2119			vmexit->u.vmx.inst_type = 0;
2120			vmexit->u.vmx.inst_error = 0;
2121		} else {
2122			/*
2123			 * The exitcode and collateral have been populated.
2124			 * The VM exit will be processed further in userland.
2125			 */
2126		}
2127	}
2128	return (handled);
2129}
2130
2131static __inline int
2132vmx_exit_astpending(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
2133{
2134
2135	vmexit->rip = vmcs_guest_rip();
2136	vmexit->inst_length = 0;
2137	vmexit->exitcode = VM_EXITCODE_BOGUS;
2138	vmx_astpending_trace(vmx, vcpu, vmexit->rip);
2139	vmm_stat_incr(vmx->vm, vcpu, VMEXIT_ASTPENDING, 1);
2140
2141	return (HANDLED);
2142}
2143
2144static __inline int
2145vmx_exit_rendezvous(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
2146{
2147
2148	vmexit->rip = vmcs_guest_rip();
2149	vmexit->inst_length = 0;
2150	vmexit->exitcode = VM_EXITCODE_RENDEZVOUS;
2151	vmm_stat_incr(vmx->vm, vcpu, VMEXIT_RENDEZVOUS, 1);
2152
2153	return (UNHANDLED);
2154}
2155
2156static __inline int
2157vmx_exit_inst_error(struct vmxctx *vmxctx, int rc, struct vm_exit *vmexit)
2158{
2159
2160	KASSERT(vmxctx->inst_fail_status != VM_SUCCESS,
2161	    ("vmx_exit_inst_error: invalid inst_fail_status %d",
2162	    vmxctx->inst_fail_status));
2163
2164	vmexit->inst_length = 0;
2165	vmexit->exitcode = VM_EXITCODE_VMX;
2166	vmexit->u.vmx.status = vmxctx->inst_fail_status;
2167	vmexit->u.vmx.inst_error = vmcs_instruction_error();
2168	vmexit->u.vmx.exit_reason = ~0;
2169	vmexit->u.vmx.exit_qualification = ~0;
2170
2171	switch (rc) {
2172	case VMX_VMRESUME_ERROR:
2173	case VMX_VMLAUNCH_ERROR:
2174	case VMX_INVEPT_ERROR:
2175		vmexit->u.vmx.inst_type = rc;
2176		break;
2177	default:
2178		panic("vm_exit_inst_error: vmx_enter_guest returned %d", rc);
2179	}
2180
2181	return (UNHANDLED);
2182}
2183
2184/*
2185 * If the NMI-exiting VM execution control is set to '1' then an NMI in
2186 * non-root operation causes a VM-exit. NMI blocking is in effect so it is
2187 * sufficient to simply vector to the NMI handler via a software interrupt.
2188 * However, this must be done before maskable interrupts are enabled;
2189 * otherwise the "iret" issued by an interrupt handler will incorrectly
2190 * clear NMI blocking.
2191 */
2192static __inline void
2193vmx_exit_handle_nmi(struct vmx *vmx, int vcpuid, struct vm_exit *vmexit)
2194{
2195	uint32_t intr_info;
2196
2197	KASSERT((read_rflags() & PSL_I) == 0, ("interrupts enabled"));
2198
2199	if (vmexit->u.vmx.exit_reason != EXIT_REASON_EXCEPTION)
2200		return;
2201
2202	intr_info = vmcs_read(VMCS_EXIT_INTR_INFO);
2203	KASSERT((intr_info & VMCS_INTR_VALID) != 0,
2204	    ("VM exit interruption info invalid: %#x", intr_info));
2205
2206	if ((intr_info & VMCS_INTR_T_MASK) == VMCS_INTR_T_NMI) {
2207		KASSERT((intr_info & 0xff) == IDT_NMI, ("VM exit due "
2208		    "to NMI has invalid vector: %#x", intr_info));
2209		VCPU_CTR0(vmx->vm, vcpuid, "Vectoring to NMI handler");
2210		__asm __volatile("int $2");
2211	}
2212}
2213
2214static int
2215vmx_run(void *arg, int vcpu, register_t startrip, pmap_t pmap,
2216    void *rendezvous_cookie, void *suspend_cookie)
2217{
2218	int rc, handled, launched;
2219	struct vmx *vmx;
2220	struct vm *vm;
2221	struct vmxctx *vmxctx;
2222	struct vmcs *vmcs;
2223	struct vm_exit *vmexit;
2224	struct vlapic *vlapic;
2225	uint64_t rip;
2226	uint32_t exit_reason;
2227
2228	vmx = arg;
2229	vm = vmx->vm;
2230	vmcs = &vmx->vmcs[vcpu];
2231	vmxctx = &vmx->ctx[vcpu];
2232	vlapic = vm_lapic(vm, vcpu);
2233	vmexit = vm_exitinfo(vm, vcpu);
2234	launched = 0;
2235
2236	KASSERT(vmxctx->pmap == pmap,
2237	    ("pmap %p different than ctx pmap %p", pmap, vmxctx->pmap));
2238
2239	VMPTRLD(vmcs);
2240
2241	/*
2242	 * XXX
2243	 * We do this every time because we may set up the virtual machine
2244	 * from a different process than the one that actually runs it.
2245	 *
2246	 * If the life of a virtual machine was spent entirely in the context
2247	 * of a single process we could do this once in vmx_vminit().
2248	 */
2249	vmcs_write(VMCS_HOST_CR3, rcr3());
2250
2251	vmcs_write(VMCS_GUEST_RIP, startrip);
2252	vmx_set_pcpu_defaults(vmx, vcpu, pmap);
2253	do {
2254		/*
2255		 * Interrupts are disabled from this point on until the
2256		 * guest starts executing. This is done for the following
2257		 * reasons:
2258		 *
2259		 * If an AST is asserted on this thread after the check below,
2260		 * then the IPI_AST notification will not be lost, because it
2261		 * will cause a VM exit due to external interrupt as soon as
2262		 * the guest state is loaded.
2263		 *
2264		 * A posted interrupt after 'vmx_inject_interrupts()' will
2265		 * not be "lost" because it will be held pending in the host
2266		 * APIC because interrupts are disabled. The pending interrupt
2267		 * will be recognized as soon as the guest state is loaded.
2268		 *
2269		 * The same reasoning applies to the IPI generated by
2270		 * pmap_invalidate_ept().
2271		 */
2272		disable_intr();
2273		if (vcpu_suspended(suspend_cookie)) {
2274			enable_intr();
2275			vm_exit_suspended(vmx->vm, vcpu, vmcs_guest_rip());
2276			handled = UNHANDLED;
2277			break;
2278		}
2279
2280		if (vcpu_rendezvous_pending(rendezvous_cookie)) {
2281			enable_intr();
2282			handled = vmx_exit_rendezvous(vmx, vcpu, vmexit);
2283			break;
2284		}
2285
2286		if (curthread->td_flags & (TDF_ASTPENDING | TDF_NEEDRESCHED)) {
2287			enable_intr();
2288			handled = vmx_exit_astpending(vmx, vcpu, vmexit);
2289			break;
2290		}
2291
2292		vmx_inject_interrupts(vmx, vcpu, vlapic);
2293		vmx_run_trace(vmx, vcpu);
2294		rc = vmx_enter_guest(vmxctx, vmx, launched);
2295
2296		/* Collect some information for VM exit processing */
2297		vmexit->rip = rip = vmcs_guest_rip();
2298		vmexit->inst_length = vmexit_instruction_length();
2299		vmexit->u.vmx.exit_reason = exit_reason = vmcs_exit_reason();
2300		vmexit->u.vmx.exit_qualification = vmcs_exit_qualification();
2301
2302		if (rc == VMX_GUEST_VMEXIT) {
2303			vmx_exit_handle_nmi(vmx, vcpu, vmexit);
2304			enable_intr();
2305			handled = vmx_exit_process(vmx, vcpu, vmexit);
2306		} else {
2307			enable_intr();
2308			handled = vmx_exit_inst_error(vmxctx, rc, vmexit);
2309		}
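		/*
		 * After the first successful entry the VMCS launch state is
		 * 'launched', so subsequent entries must use VMRESUME rather
		 * than VMLAUNCH; 'launched' lets vmx_enter_guest() choose
		 * the right instruction.
		 */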
2310		launched = 1;
2311		vmx_exit_trace(vmx, vcpu, rip, exit_reason, handled);
2312	} while (handled);
2313
2314	/*
2315	 * If a VM exit has been handled then the exitcode must be BOGUS;
2316	 * if a VM exit has not been handled then the exitcode must not be BOGUS.
2317	 */
2318	if ((handled && vmexit->exitcode != VM_EXITCODE_BOGUS) ||
2319	    (!handled && vmexit->exitcode == VM_EXITCODE_BOGUS)) {
2320		panic("Mismatch between handled (%d) and exitcode (%d)",
2321		      handled, vmexit->exitcode);
2322	}
2323
2324	if (!handled)
2325		vmm_stat_incr(vm, vcpu, VMEXIT_USERSPACE, 1);
2326
2327	VCPU_CTR1(vm, vcpu, "returning from vmx_run: exitcode %d",
2328	    vmexit->exitcode);
2329
2330	VMCLEAR(vmcs);
2331	return (0);
2332}
2333
2334static void
2335vmx_vmcleanup(void *arg)
2336{
2337	int i;
2338	struct vmx *vmx = arg;
2339
2340	if (apic_access_virtualization(vmx, 0))
2341		vm_unmap_mmio(vmx->vm, DEFAULT_APIC_BASE, PAGE_SIZE);
2342
2343	for (i = 0; i < VM_MAXCPU; i++)
2344		vpid_free(vmx->state[i].vpid);
2345
2346	free(vmx, M_VMX);
2347
2348	return;
2349}
2350
2351static register_t *
2352vmxctx_regptr(struct vmxctx *vmxctx, int reg)
2353{
2354
2355	switch (reg) {
2356	case VM_REG_GUEST_RAX:
2357		return (&vmxctx->guest_rax);
2358	case VM_REG_GUEST_RBX:
2359		return (&vmxctx->guest_rbx);
2360	case VM_REG_GUEST_RCX:
2361		return (&vmxctx->guest_rcx);
2362	case VM_REG_GUEST_RDX:
2363		return (&vmxctx->guest_rdx);
2364	case VM_REG_GUEST_RSI:
2365		return (&vmxctx->guest_rsi);
2366	case VM_REG_GUEST_RDI:
2367		return (&vmxctx->guest_rdi);
2368	case VM_REG_GUEST_RBP:
2369		return (&vmxctx->guest_rbp);
2370	case VM_REG_GUEST_R8:
2371		return (&vmxctx->guest_r8);
2372	case VM_REG_GUEST_R9:
2373		return (&vmxctx->guest_r9);
2374	case VM_REG_GUEST_R10:
2375		return (&vmxctx->guest_r10);
2376	case VM_REG_GUEST_R11:
2377		return (&vmxctx->guest_r11);
2378	case VM_REG_GUEST_R12:
2379		return (&vmxctx->guest_r12);
2380	case VM_REG_GUEST_R13:
2381		return (&vmxctx->guest_r13);
2382	case VM_REG_GUEST_R14:
2383		return (&vmxctx->guest_r14);
2384	case VM_REG_GUEST_R15:
2385		return (&vmxctx->guest_r15);
2386	case VM_REG_GUEST_CR2:
2387		return (&vmxctx->guest_cr2);
2388	default:
2389		break;
2390	}
2391	return (NULL);
2392}
2393
2394static int
2395vmxctx_getreg(struct vmxctx *vmxctx, int reg, uint64_t *retval)
2396{
2397	register_t *regp;
2398
2399	if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) {
2400		*retval = *regp;
2401		return (0);
2402	} else
2403		return (EINVAL);
2404}
2405
2406static int
2407vmxctx_setreg(struct vmxctx *vmxctx, int reg, uint64_t val)
2408{
2409	register_t *regp;
2410
2411	if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) {
2412		*regp = val;
2413		return (0);
2414	} else
2415		return (EINVAL);
2416}
2417
2418static int
2419vmx_shadow_reg(int reg)
2420{
2421	int shreg;
2422
2423	shreg = -1;
2424
2425	switch (reg) {
2426	case VM_REG_GUEST_CR0:
2427		shreg = VMCS_CR0_SHADOW;
2428		break;
2429	case VM_REG_GUEST_CR4:
2430		shreg = VMCS_CR4_SHADOW;
2431		break;
2432	default:
2433		break;
2434	}
2435
2436	return (shreg);
2437}
2438
2439static int
2440vmx_getreg(void *arg, int vcpu, int reg, uint64_t *retval)
2441{
2442	int running, hostcpu;
2443	struct vmx *vmx = arg;
2444
2445	running = vcpu_is_running(vmx->vm, vcpu, &hostcpu);
2446	if (running && hostcpu != curcpu)
2447		panic("vmx_getreg: %s%d is running", vm_name(vmx->vm), vcpu);
2448
2449	if (vmxctx_getreg(&vmx->ctx[vcpu], reg, retval) == 0)
2450		return (0);
2451
2452	return (vmcs_getreg(&vmx->vmcs[vcpu], running, reg, retval));
2453}
2454
2455static int
2456vmx_setreg(void *arg, int vcpu, int reg, uint64_t val)
2457{
2458	int error, hostcpu, running, shadow;
2459	uint64_t ctls;
2460	struct vmx *vmx = arg;
2461
2462	running = vcpu_is_running(vmx->vm, vcpu, &hostcpu);
2463	if (running && hostcpu != curcpu)
2464		panic("vmx_setreg: %s%d is running", vm_name(vmx->vm), vcpu);
2465
2466	if (vmxctx_setreg(&vmx->ctx[vcpu], reg, val) == 0)
2467		return (0);
2468
2469	error = vmcs_setreg(&vmx->vmcs[vcpu], running, reg, val);
2470
2471	if (error == 0) {
2472		/*
2473		 * If the "load EFER" VM-entry control is 1 then the
2474		 * value of EFER.LMA must be identical to "IA-32e mode guest"
2475		 * bit in the VM-entry control.
2476		 */
2477		if ((entry_ctls & VM_ENTRY_LOAD_EFER) != 0 &&
2478		    (reg == VM_REG_GUEST_EFER)) {
2479			vmcs_getreg(&vmx->vmcs[vcpu], running,
2480				    VMCS_IDENT(VMCS_ENTRY_CTLS), &ctls);
2481			if (val & EFER_LMA)
2482				ctls |= VM_ENTRY_GUEST_LMA;
2483			else
2484				ctls &= ~VM_ENTRY_GUEST_LMA;
2485			vmcs_setreg(&vmx->vmcs[vcpu], running,
2486				    VMCS_IDENT(VMCS_ENTRY_CTLS), ctls);
2487		}
2488
2489		shadow = vmx_shadow_reg(reg);
2490		if (shadow > 0) {
2491			/*
2492			 * Store the unmodified value in the shadow
2493			 */
2494			error = vmcs_setreg(&vmx->vmcs[vcpu], running,
2495				    VMCS_IDENT(shadow), val);
2496		}
2497	}
2498
2499	return (error);
2500}
2501
2502static int
2503vmx_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc)
2504{
2505	int hostcpu, running;
2506	struct vmx *vmx = arg;
2507
2508	running = vcpu_is_running(vmx->vm, vcpu, &hostcpu);
2509	if (running && hostcpu != curcpu)
2510		panic("vmx_getdesc: %s%d is running", vm_name(vmx->vm), vcpu);
2511
2512	return (vmcs_getdesc(&vmx->vmcs[vcpu], running, reg, desc));
2513}
2514
2515static int
2516vmx_setdesc(void *arg, int vcpu, int reg, struct seg_desc *desc)
2517{
2518	int hostcpu, running;
2519	struct vmx *vmx = arg;
2520
2521	running = vcpu_is_running(vmx->vm, vcpu, &hostcpu);
2522	if (running && hostcpu != curcpu)
2523		panic("vmx_setdesc: %s%d is running", vm_name(vmx->vm), vcpu);
2524
2525	return (vmcs_setdesc(&vmx->vmcs[vcpu], running, reg, desc));
2526}
2527
2528static int
2529vmx_getcap(void *arg, int vcpu, int type, int *retval)
2530{
2531	struct vmx *vmx = arg;
2532	int vcap;
2533	int ret;
2534
2535	ret = ENOENT;
2536
2537	vcap = vmx->cap[vcpu].set;
2538
2539	switch (type) {
2540	case VM_CAP_HALT_EXIT:
2541		if (cap_halt_exit)
2542			ret = 0;
2543		break;
2544	case VM_CAP_PAUSE_EXIT:
2545		if (cap_pause_exit)
2546			ret = 0;
2547		break;
2548	case VM_CAP_MTRAP_EXIT:
2549		if (cap_monitor_trap)
2550			ret = 0;
2551		break;
2552	case VM_CAP_UNRESTRICTED_GUEST:
2553		if (cap_unrestricted_guest)
2554			ret = 0;
2555		break;
2556	case VM_CAP_ENABLE_INVPCID:
2557		if (cap_invpcid)
2558			ret = 0;
2559		break;
2560	default:
2561		break;
2562	}
2563
2564	if (ret == 0)
2565		*retval = (vcap & (1 << type)) ? 1 : 0;
2566
2567	return (ret);
2568}
2569
2570static int
2571vmx_setcap(void *arg, int vcpu, int type, int val)
2572{
2573	struct vmx *vmx = arg;
2574	struct vmcs *vmcs = &vmx->vmcs[vcpu];
2575	uint32_t baseval;
2576	uint32_t *pptr;
2577	int error;
2578	int flag;
2579	int reg;
2580	int retval;
2581
2582	retval = ENOENT;
2583	pptr = NULL;
2584
2585	switch (type) {
2586	case VM_CAP_HALT_EXIT:
2587		if (cap_halt_exit) {
2588			retval = 0;
2589			pptr = &vmx->cap[vcpu].proc_ctls;
2590			baseval = *pptr;
2591			flag = PROCBASED_HLT_EXITING;
2592			reg = VMCS_PRI_PROC_BASED_CTLS;
2593		}
2594		break;
2595	case VM_CAP_MTRAP_EXIT:
2596		if (cap_monitor_trap) {
2597			retval = 0;
2598			pptr = &vmx->cap[vcpu].proc_ctls;
2599			baseval = *pptr;
2600			flag = PROCBASED_MTF;
2601			reg = VMCS_PRI_PROC_BASED_CTLS;
2602		}
2603		break;
2604	case VM_CAP_PAUSE_EXIT:
2605		if (cap_pause_exit) {
2606			retval = 0;
2607			pptr = &vmx->cap[vcpu].proc_ctls;
2608			baseval = *pptr;
2609			flag = PROCBASED_PAUSE_EXITING;
2610			reg = VMCS_PRI_PROC_BASED_CTLS;
2611		}
2612		break;
2613	case VM_CAP_UNRESTRICTED_GUEST:
2614		if (cap_unrestricted_guest) {
2615			retval = 0;
2616			pptr = &vmx->cap[vcpu].proc_ctls2;
2617			baseval = *pptr;
2618			flag = PROCBASED2_UNRESTRICTED_GUEST;
2619			reg = VMCS_SEC_PROC_BASED_CTLS;
2620		}
2621		break;
2622	case VM_CAP_ENABLE_INVPCID:
2623		if (cap_invpcid) {
2624			retval = 0;
2625			pptr = &vmx->cap[vcpu].proc_ctls2;
2626			baseval = *pptr;
2627			flag = PROCBASED2_ENABLE_INVPCID;
2628			reg = VMCS_SEC_PROC_BASED_CTLS;
2629		}
2630		break;
2631	default:
2632		break;
2633	}
2634
2635	if (retval == 0) {
2636		if (val) {
2637			baseval |= flag;
2638		} else {
2639			baseval &= ~flag;
2640		}
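		/*
		 * vmwrite() operates on the current VMCS, so make this
		 * vcpu's VMCS current for the update and release it again
		 * afterwards since the vcpu is not running here.
		 */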
2641		VMPTRLD(vmcs);
2642		error = vmwrite(reg, baseval);
2643		VMCLEAR(vmcs);
2644
2645		if (error) {
2646			retval = error;
2647		} else {
2648			/*
2649			 * Update the optional cached control value and
2650			 * record the new capability setting.
2651			 */
2652			if (pptr != NULL) {
2653				*pptr = baseval;
2654			}
2655
2656			if (val) {
2657				vmx->cap[vcpu].set |= (1 << type);
2658			} else {
2659				vmx->cap[vcpu].set &= ~(1 << type);
2660			}
2661		}
2662	}
2663
2664	return (retval);
2665}
2666
2667struct vlapic_vtx {
2668	struct vlapic	vlapic;
2669	struct pir_desc	*pir_desc;
2670	struct vmx	*vmx;
2671};
2672
2673#define	VMX_CTR_PIR(vm, vcpuid, pir_desc, notify, vector, level, msg)	\
2674do {									\
2675	VCPU_CTR2(vm, vcpuid, msg " assert %s-triggered vector %d",	\
2676	    level ? "level" : "edge", vector);				\
2677	VCPU_CTR1(vm, vcpuid, msg " pir0 0x%016lx", pir_desc->pir[0]);	\
2678	VCPU_CTR1(vm, vcpuid, msg " pir1 0x%016lx", pir_desc->pir[1]);	\
2679	VCPU_CTR1(vm, vcpuid, msg " pir2 0x%016lx", pir_desc->pir[2]);	\
2680	VCPU_CTR1(vm, vcpuid, msg " pir3 0x%016lx", pir_desc->pir[3]);	\
2681	VCPU_CTR1(vm, vcpuid, msg " notify: %s", notify ? "yes" : "no");\
2682} while (0)
2683
2684/*
2685 * vlapic->ops handlers that utilize the APICv hardware assist described in
2686 * Chapter 29 of the Intel SDM.
2687 */
2688static int
2689vmx_set_intr_ready(struct vlapic *vlapic, int vector, bool level)
2690{
2691	struct vlapic_vtx *vlapic_vtx;
2692	struct pir_desc *pir_desc;
2693	uint64_t mask;
2694	int idx, notify;
2695
2696	vlapic_vtx = (struct vlapic_vtx *)vlapic;
2697	pir_desc = vlapic_vtx->pir_desc;
2698
2699	/*
2700	 * Keep track of interrupt requests in the PIR descriptor. This is
2701	 * because the virtual APIC page pointed to by the VMCS cannot be
2702	 * modified if the vcpu is running.
2703	 */
2704	idx = vector / 64;
2705	mask = 1UL << (vector % 64);
2706	atomic_set_long(&pir_desc->pir[idx], mask);
2707	notify = atomic_cmpset_long(&pir_desc->pending, 0, 1);
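	/*
	 * A notification is needed only when 'pending' transitions from 0
	 * to 1; if it was already set the vcpu has been, or is about to
	 * be, notified and will pick up this interrupt as well.
	 */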
2708
2709	VMX_CTR_PIR(vlapic->vm, vlapic->vcpuid, pir_desc, notify, vector,
2710	    level, "vmx_set_intr_ready");
2711	return (notify);
2712}
2713
2714static int
2715vmx_pending_intr(struct vlapic *vlapic, int *vecptr)
2716{
2717	struct vlapic_vtx *vlapic_vtx;
2718	struct pir_desc *pir_desc;
2719	struct LAPIC *lapic;
2720	uint64_t pending, pirval;
2721	uint32_t ppr, vpr;
2722	int i;
2723
2724	/*
2725	 * This function is only expected to be called from the 'HLT' exit
2726	 * handler which does not care about the vector that is pending.
2727	 */
2728	KASSERT(vecptr == NULL, ("vmx_pending_intr: vecptr must be NULL"));
2729
2730	vlapic_vtx = (struct vlapic_vtx *)vlapic;
2731	pir_desc = vlapic_vtx->pir_desc;
2732
2733	pending = atomic_load_acq_long(&pir_desc->pending);
2734	if (!pending)
2735		return (0);	/* common case */
2736
2737	/*
2738	 * If there is an interrupt pending then it will be recognized only
2739	 * if its priority is greater than the processor priority.
2740	 *
2741	 * Special case: if the processor priority is zero then any pending
2742	 * interrupt will be recognized.
2743	 */
2744	lapic = vlapic->apic_page;
2745	ppr = lapic->ppr & 0xf0;
2746	if (ppr == 0)
2747		return (1);
2748
2749	VCPU_CTR1(vlapic->vm, vlapic->vcpuid, "HLT with non-zero PPR %d",
2750	    lapic->ppr);
2751
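	/*
	 * Find the highest vector pending in the PIR and compare its
	 * priority class (the upper nibble of the vector) against the PPR.
	 */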
2752	for (i = 3; i >= 0; i--) {
2753		pirval = pir_desc->pir[i];
2754		if (pirval != 0) {
2755			vpr = (i * 64 + flsl(pirval) - 1) & 0xf0;
2756			return (vpr > ppr);
2757		}
2758	}
2759	return (0);
2760}
2761
2762static void
2763vmx_intr_accepted(struct vlapic *vlapic, int vector)
2764{
2765
2766	panic("vmx_intr_accepted: not expected to be called");
2767}
2768
2769static void
2770vmx_set_tmr(struct vlapic *vlapic, int vector, bool level)
2771{
2772	struct vlapic_vtx *vlapic_vtx;
2773	struct vmx *vmx;
2774	struct vmcs *vmcs;
2775	uint64_t mask, val;
2776
2777	KASSERT(vector >= 0 && vector <= 255, ("invalid vector %d", vector));
2778	KASSERT(!vcpu_is_running(vlapic->vm, vlapic->vcpuid, NULL),
2779	    ("vmx_set_tmr: vcpu cannot be running"));
2780
2781	vlapic_vtx = (struct vlapic_vtx *)vlapic;
2782	vmx = vlapic_vtx->vmx;
2783	vmcs = &vmx->vmcs[vlapic->vcpuid];
2784	mask = 1UL << (vector % 64);
2785
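	/*
	 * Update the EOI-exit bitmap in the VMCS: a set bit marks the
	 * vector as level triggered so that a guest EOI for it causes a
	 * virtualized-EOI exit and can be propagated to the virtual
	 * I/O APIC.
	 */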
2786	VMPTRLD(vmcs);
2787	val = vmcs_read(VMCS_EOI_EXIT(vector));
2788	if (level)
2789		val |= mask;
2790	else
2791		val &= ~mask;
2792	vmcs_write(VMCS_EOI_EXIT(vector), val);
2793	VMCLEAR(vmcs);
2794}
2795
2796static void
2797vmx_enable_x2apic_mode(struct vlapic *vlapic)
2798{
2799	struct vmx *vmx;
2800	struct vmcs *vmcs;
2801	uint32_t proc_ctls2;
2802	int vcpuid, error;
2803
2804	vcpuid = vlapic->vcpuid;
2805	vmx = ((struct vlapic_vtx *)vlapic)->vmx;
2806	vmcs = &vmx->vmcs[vcpuid];
2807
2808	proc_ctls2 = vmx->cap[vcpuid].proc_ctls2;
2809	KASSERT((proc_ctls2 & PROCBASED2_VIRTUALIZE_APIC_ACCESSES) != 0,
2810	    ("%s: invalid proc_ctls2 %#x", __func__, proc_ctls2));
2811
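
	/*
	 * In x2APIC mode the guest accesses the local APIC through MSRs
	 * rather than the MMIO page, so APIC-access virtualization is
	 * replaced with virtualization of the x2APIC MSR range.
	 */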
2812	proc_ctls2 &= ~PROCBASED2_VIRTUALIZE_APIC_ACCESSES;
2813	proc_ctls2 |= PROCBASED2_VIRTUALIZE_X2APIC_MODE;
2814	vmx->cap[vcpuid].proc_ctls2 = proc_ctls2;
2815
2816	VMPTRLD(vmcs);
2817	vmcs_write(VMCS_SEC_PROC_BASED_CTLS, proc_ctls2);
2818	VMCLEAR(vmcs);
2819
2820	if (vlapic->vcpuid == 0) {
2821		/*
2822		 * The nested page table mappings are shared by all vcpus
2823		 * so unmap the APIC access page just once.
2824		 */
2825		error = vm_unmap_mmio(vmx->vm, DEFAULT_APIC_BASE, PAGE_SIZE);
2826		KASSERT(error == 0, ("%s: vm_unmap_mmio error %d",
2827		    __func__, error));
2828
2829		/*
2830		 * The MSR bitmap is shared by all vcpus so modify it only
2831		 * once in the context of vcpu 0.
2832		 */
2833		error = vmx_allow_x2apic_msrs(vmx);
2834		KASSERT(error == 0, ("%s: vmx_allow_x2apic_msrs error %d",
2835		    __func__, error));
2836	}
2837}
2838
2839static void
2840vmx_post_intr(struct vlapic *vlapic, int hostcpu)
2841{
2842
2843	ipi_cpu(hostcpu, pirvec);
2844}
2845
2846/*
2847 * Transfer the pending interrupts in the PIR descriptor to the IRR
2848 * in the virtual APIC page.
2849 */
2850static void
2851vmx_inject_pir(struct vlapic *vlapic)
2852{
2853	struct vlapic_vtx *vlapic_vtx;
2854	struct pir_desc *pir_desc;
2855	struct LAPIC *lapic;
2856	uint64_t val, pirval;
2857	int rvi, pirbase = -1;
2858	uint16_t intr_status_old, intr_status_new;
2859
2860	vlapic_vtx = (struct vlapic_vtx *)vlapic;
2861	pir_desc = vlapic_vtx->pir_desc;
2862	if (atomic_cmpset_long(&pir_desc->pending, 1, 0) == 0) {
2863		VCPU_CTR0(vlapic->vm, vlapic->vcpuid, "vmx_inject_pir: "
2864		    "no posted interrupt pending");
2865		return;
2866	}
2867
2868	pirval = 0;
2869	pirbase = -1;
2870	lapic = vlapic->apic_page;
2871
2872	val = atomic_readandclear_long(&pir_desc->pir[0]);
2873	if (val != 0) {
2874		lapic->irr0 |= val;
2875		lapic->irr1 |= val >> 32;
2876		pirbase = 0;
2877		pirval = val;
2878	}
2879
2880	val = atomic_readandclear_long(&pir_desc->pir[1]);
2881	if (val != 0) {
2882		lapic->irr2 |= val;
2883		lapic->irr3 |= val >> 32;
2884		pirbase = 64;
2885		pirval = val;
2886	}
2887
2888	val = atomic_readandclear_long(&pir_desc->pir[2]);
2889	if (val != 0) {
2890		lapic->irr4 |= val;
2891		lapic->irr5 |= val >> 32;
2892		pirbase = 128;
2893		pirval = val;
2894	}
2895
2896	val = atomic_readandclear_long(&pir_desc->pir[3]);
2897	if (val != 0) {
2898		lapic->irr6 |= val;
2899		lapic->irr7 |= val >> 32;
2900		pirbase = 192;
2901		pirval = val;
2902	}
2903
2904	VLAPIC_CTR_IRR(vlapic, "vmx_inject_pir");
2905
2906	/*
2907	 * Update RVI so the processor can evaluate pending virtual
2908	 * interrupts on VM-entry.
2909	 *
2910	 * It is possible for pirval to be 0 here, even though the
2911	 * pending bit has been set. The scenario is:
2912	 * CPU-Y is sending a posted interrupt to CPU-X, which
2913	 * is running a guest and processing posted interrupts in h/w.
2914	 * CPU-X will eventually exit and the state seen in s/w is
2915	 * the pending bit set, but no PIR bits set.
2916	 *
2917	 *      CPU-X                      CPU-Y
2918	 *   (vm running)                (host running)
2919	 *   rx posted interrupt
2920	 *   CLEAR pending bit
2921	 *				 SET PIR bit
2922	 *   READ/CLEAR PIR bits
2923	 *				 SET pending bit
2924	 *   (vm exit)
2925	 *   pending bit set, PIR 0
2926	 */
2927	if (pirval != 0) {
2928		rvi = pirbase + flsl(pirval) - 1;
2929		intr_status_old = vmcs_read(VMCS_GUEST_INTR_STATUS);
2930		intr_status_new = (intr_status_old & 0xFF00) | rvi;
2931		if (intr_status_new > intr_status_old) {
2932			vmcs_write(VMCS_GUEST_INTR_STATUS, intr_status_new);
2933			VCPU_CTR2(vlapic->vm, vlapic->vcpuid, "vmx_inject_pir: "
2934			    "guest_intr_status changed from 0x%04x to 0x%04x",
2935			    intr_status_old, intr_status_new);
2936		}
2937	}
2938}
2939
2940static struct vlapic *
2941vmx_vlapic_init(void *arg, int vcpuid)
2942{
2943	struct vmx *vmx;
2944	struct vlapic *vlapic;
2945	struct vlapic_vtx *vlapic_vtx;
2946
2947	vmx = arg;
2948
2949	vlapic = malloc(sizeof(struct vlapic_vtx), M_VLAPIC, M_WAITOK | M_ZERO);
2950	vlapic->vm = vmx->vm;
2951	vlapic->vcpuid = vcpuid;
2952	vlapic->apic_page = (struct LAPIC *)&vmx->apic_page[vcpuid];
2953
2954	vlapic_vtx = (struct vlapic_vtx *)vlapic;
2955	vlapic_vtx->pir_desc = &vmx->pir_desc[vcpuid];
2956	vlapic_vtx->vmx = vmx;
2957
2958	if (virtual_interrupt_delivery) {
2959		vlapic->ops.set_intr_ready = vmx_set_intr_ready;
2960		vlapic->ops.pending_intr = vmx_pending_intr;
2961		vlapic->ops.intr_accepted = vmx_intr_accepted;
2962		vlapic->ops.set_tmr = vmx_set_tmr;
2963		vlapic->ops.enable_x2apic_mode = vmx_enable_x2apic_mode;
2964	}
2965
2966	if (posted_interrupts)
2967		vlapic->ops.post_intr = vmx_post_intr;
2968
2969	vlapic_init(vlapic);
2970
2971	return (vlapic);
2972}
2973
2974static void
2975vmx_vlapic_cleanup(void *arg, struct vlapic *vlapic)
2976{
2977
2978	vlapic_cleanup(vlapic);
2979	free(vlapic, M_VLAPIC);
2980}
2981
2982struct vmm_ops vmm_ops_intel = {
2983	vmx_init,
2984	vmx_cleanup,
2985	vmx_restore,
2986	vmx_vminit,
2987	vmx_run,
2988	vmx_vmcleanup,
2989	vmx_getreg,
2990	vmx_setreg,
2991	vmx_getdesc,
2992	vmx_setdesc,
2993	vmx_getcap,
2994	vmx_setcap,
2995	ept_vmspace_alloc,
2996	ept_vmspace_free,
2997	vmx_vlapic_init,
2998	vmx_vlapic_cleanup,
2999};
3000