vmx.c revision 262144
1/*-
2 * Copyright (c) 2011 NetApp, Inc.
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 *
26 * $FreeBSD: head/sys/amd64/vmm/intel/vmx.c 262144 2014-02-18 03:07:36Z jhb $
27 */
28
29#include <sys/cdefs.h>
30__FBSDID("$FreeBSD: head/sys/amd64/vmm/intel/vmx.c 262144 2014-02-18 03:07:36Z jhb $");
31
32#include <sys/param.h>
33#include <sys/systm.h>
34#include <sys/smp.h>
35#include <sys/kernel.h>
36#include <sys/malloc.h>
37#include <sys/pcpu.h>
38#include <sys/proc.h>
39#include <sys/sysctl.h>
40
41#include <vm/vm.h>
42#include <vm/pmap.h>
43
44#include <machine/psl.h>
45#include <machine/cpufunc.h>
46#include <machine/md_var.h>
47#include <machine/segments.h>
48#include <machine/smp.h>
49#include <machine/specialreg.h>
50#include <machine/vmparam.h>
51
52#include <machine/vmm.h>
53#include "vmm_host.h"
54#include "vmm_ipi.h"
55#include "vmm_msr.h"
56#include "vmm_ktr.h"
57#include "vmm_stat.h"
58#include "vlapic.h"
59#include "vlapic_priv.h"
60
61#include "vmx_msr.h"
62#include "ept.h"
63#include "vmx_cpufunc.h"
64#include "vmx.h"
65#include "x86.h"
66#include "vmx_controls.h"
67
68#define	PINBASED_CTLS_ONE_SETTING					\
69	(PINBASED_EXTINT_EXITING	|				\
70	 PINBASED_NMI_EXITING		|				\
71	 PINBASED_VIRTUAL_NMI)
72#define	PINBASED_CTLS_ZERO_SETTING	0
73
74#define PROCBASED_CTLS_WINDOW_SETTING					\
75	(PROCBASED_INT_WINDOW_EXITING	|				\
76	 PROCBASED_NMI_WINDOW_EXITING)
77
78#define	PROCBASED_CTLS_ONE_SETTING 					\
79	(PROCBASED_SECONDARY_CONTROLS	|				\
80	 PROCBASED_IO_EXITING		|				\
81	 PROCBASED_MSR_BITMAPS		|				\
82	 PROCBASED_CTLS_WINDOW_SETTING)
83#define	PROCBASED_CTLS_ZERO_SETTING	\
84	(PROCBASED_CR3_LOAD_EXITING |	\
85	PROCBASED_CR3_STORE_EXITING |	\
86	PROCBASED_IO_BITMAPS)
87
88#define	PROCBASED_CTLS2_ONE_SETTING	PROCBASED2_ENABLE_EPT
89#define	PROCBASED_CTLS2_ZERO_SETTING	0
90
91#define VM_EXIT_CTLS_ONE_SETTING_NO_PAT					\
92	(VM_EXIT_HOST_LMA			|			\
93	VM_EXIT_SAVE_EFER			|			\
94	VM_EXIT_LOAD_EFER)
95
96#define	VM_EXIT_CTLS_ONE_SETTING					\
97	(VM_EXIT_CTLS_ONE_SETTING_NO_PAT       	|			\
98	VM_EXIT_ACKNOWLEDGE_INTERRUPT		|			\
99	VM_EXIT_SAVE_PAT			|			\
100	VM_EXIT_LOAD_PAT)
101#define	VM_EXIT_CTLS_ZERO_SETTING	VM_EXIT_SAVE_DEBUG_CONTROLS
102
103#define	VM_ENTRY_CTLS_ONE_SETTING_NO_PAT	VM_ENTRY_LOAD_EFER
104
105#define	VM_ENTRY_CTLS_ONE_SETTING					\
106	(VM_ENTRY_CTLS_ONE_SETTING_NO_PAT     	|			\
107	VM_ENTRY_LOAD_PAT)
108#define	VM_ENTRY_CTLS_ZERO_SETTING					\
109	(VM_ENTRY_LOAD_DEBUG_CONTROLS		|			\
110	VM_ENTRY_INTO_SMM			|			\
111	VM_ENTRY_DEACTIVATE_DUAL_MONITOR)
112
113#define	guest_msr_rw(vmx, msr) \
114	msr_bitmap_change_access((vmx)->msr_bitmap, (msr), MSR_BITMAP_ACCESS_RW)
115
116#define	HANDLED		1
117#define	UNHANDLED	0
118
119static MALLOC_DEFINE(M_VMX, "vmx", "vmx");
120static MALLOC_DEFINE(M_VLAPIC, "vlapic", "vlapic");
121
122SYSCTL_DECL(_hw_vmm);
123SYSCTL_NODE(_hw_vmm, OID_AUTO, vmx, CTLFLAG_RW, NULL, NULL);
124
125int vmxon_enabled[MAXCPU];
126static char vmxon_region[MAXCPU][PAGE_SIZE] __aligned(PAGE_SIZE);
127
128static uint32_t pinbased_ctls, procbased_ctls, procbased_ctls2;
129static uint32_t exit_ctls, entry_ctls;
130
131static uint64_t cr0_ones_mask, cr0_zeros_mask;
132SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr0_ones_mask, CTLFLAG_RD,
133	     &cr0_ones_mask, 0, NULL);
134SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr0_zeros_mask, CTLFLAG_RD,
135	     &cr0_zeros_mask, 0, NULL);
136
137static uint64_t cr4_ones_mask, cr4_zeros_mask;
138SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr4_ones_mask, CTLFLAG_RD,
139	     &cr4_ones_mask, 0, NULL);
140SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr4_zeros_mask, CTLFLAG_RD,
141	     &cr4_zeros_mask, 0, NULL);
142
143static int vmx_no_patmsr;
144
145static int vmx_initialized;
146SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, initialized, CTLFLAG_RD,
147	   &vmx_initialized, 0, "Intel VMX initialized");
148
149/*
150 * Optional capabilities
151 */
152static int cap_halt_exit;
153static int cap_pause_exit;
154static int cap_unrestricted_guest;
155static int cap_monitor_trap;
156static int cap_invpcid;
157
158static int virtual_interrupt_delivery;
159SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, virtual_interrupt_delivery, CTLFLAG_RD,
160    &virtual_interrupt_delivery, 0, "APICv virtual interrupt delivery support");
161
162static int posted_interrupts;
163SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, posted_interrupts, CTLFLAG_RD,
164    &posted_interrupts, 0, "APICv posted interrupt support");
165
166static int pirvec;
167SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, posted_interrupt_vector, CTLFLAG_RD,
168    &pirvec, 0, "APICv posted interrupt vector");
169
170static struct unrhdr *vpid_unr;
171static u_int vpid_alloc_failed;
172SYSCTL_UINT(_hw_vmm_vmx, OID_AUTO, vpid_alloc_failed, CTLFLAG_RD,
173	    &vpid_alloc_failed, 0, NULL);
174
175/*
176 * Use the last page below 4GB as the APIC access address. This address is
177 * occupied by the boot firmware so it is guaranteed that it will not conflict
178 * with a page in system memory.
179 */
180#define	APIC_ACCESS_ADDRESS	0xFFFFF000
181
182static void vmx_inject_pir(struct vlapic *vlapic);
183
184#ifdef KTR
185static const char *
186exit_reason_to_str(int reason)
187{
188	static char reasonbuf[32];
189
190	switch (reason) {
191	case EXIT_REASON_EXCEPTION:
192		return "exception";
193	case EXIT_REASON_EXT_INTR:
194		return "extint";
195	case EXIT_REASON_TRIPLE_FAULT:
196		return "triplefault";
197	case EXIT_REASON_INIT:
198		return "init";
199	case EXIT_REASON_SIPI:
200		return "sipi";
201	case EXIT_REASON_IO_SMI:
202		return "iosmi";
203	case EXIT_REASON_SMI:
204		return "smi";
205	case EXIT_REASON_INTR_WINDOW:
206		return "intrwindow";
207	case EXIT_REASON_NMI_WINDOW:
208		return "nmiwindow";
209	case EXIT_REASON_TASK_SWITCH:
210		return "taskswitch";
211	case EXIT_REASON_CPUID:
212		return "cpuid";
213	case EXIT_REASON_GETSEC:
214		return "getsec";
215	case EXIT_REASON_HLT:
216		return "hlt";
217	case EXIT_REASON_INVD:
218		return "invd";
219	case EXIT_REASON_INVLPG:
220		return "invlpg";
221	case EXIT_REASON_RDPMC:
222		return "rdpmc";
223	case EXIT_REASON_RDTSC:
224		return "rdtsc";
225	case EXIT_REASON_RSM:
226		return "rsm";
227	case EXIT_REASON_VMCALL:
228		return "vmcall";
229	case EXIT_REASON_VMCLEAR:
230		return "vmclear";
231	case EXIT_REASON_VMLAUNCH:
232		return "vmlaunch";
233	case EXIT_REASON_VMPTRLD:
234		return "vmptrld";
235	case EXIT_REASON_VMPTRST:
236		return "vmptrst";
237	case EXIT_REASON_VMREAD:
238		return "vmread";
239	case EXIT_REASON_VMRESUME:
240		return "vmresume";
241	case EXIT_REASON_VMWRITE:
242		return "vmwrite";
243	case EXIT_REASON_VMXOFF:
244		return "vmxoff";
245	case EXIT_REASON_VMXON:
246		return "vmxon";
247	case EXIT_REASON_CR_ACCESS:
248		return "craccess";
249	case EXIT_REASON_DR_ACCESS:
250		return "draccess";
251	case EXIT_REASON_INOUT:
252		return "inout";
253	case EXIT_REASON_RDMSR:
254		return "rdmsr";
255	case EXIT_REASON_WRMSR:
256		return "wrmsr";
257	case EXIT_REASON_INVAL_VMCS:
258		return "invalvmcs";
259	case EXIT_REASON_INVAL_MSR:
260		return "invalmsr";
261	case EXIT_REASON_MWAIT:
262		return "mwait";
263	case EXIT_REASON_MTF:
264		return "mtf";
265	case EXIT_REASON_MONITOR:
266		return "monitor";
267	case EXIT_REASON_PAUSE:
268		return "pause";
269	case EXIT_REASON_MCE:
270		return "mce";
271	case EXIT_REASON_TPR:
272		return "tpr";
273	case EXIT_REASON_APIC_ACCESS:
274		return "apic-access";
275	case EXIT_REASON_GDTR_IDTR:
276		return "gdtridtr";
277	case EXIT_REASON_LDTR_TR:
278		return "ldtrtr";
279	case EXIT_REASON_EPT_FAULT:
280		return "eptfault";
281	case EXIT_REASON_EPT_MISCONFIG:
282		return "eptmisconfig";
283	case EXIT_REASON_INVEPT:
284		return "invept";
285	case EXIT_REASON_RDTSCP:
286		return "rdtscp";
287	case EXIT_REASON_VMX_PREEMPT:
288		return "vmxpreempt";
289	case EXIT_REASON_INVVPID:
290		return "invvpid";
291	case EXIT_REASON_WBINVD:
292		return "wbinvd";
293	case EXIT_REASON_XSETBV:
294		return "xsetbv";
295	case EXIT_REASON_APIC_WRITE:
296		return "apic-write";
297	default:
298		snprintf(reasonbuf, sizeof(reasonbuf), "%d", reason);
299		return (reasonbuf);
300	}
301}
302#endif	/* KTR */
303
304u_long
305vmx_fix_cr0(u_long cr0)
306{
307
308	return ((cr0 | cr0_ones_mask) & ~cr0_zeros_mask);
309}
310
311u_long
312vmx_fix_cr4(u_long cr4)
313{
314
315	return ((cr4 | cr4_ones_mask) & ~cr4_zeros_mask);
316}
317
318static void
319vpid_free(int vpid)
320{
321	if (vpid < 0 || vpid > 0xffff)
322		panic("vpid_free: invalid vpid %d", vpid);
323
324	/*
325	 * VPIDs [0,VM_MAXCPU] are special and are not allocated from
326	 * the unit number allocator.
327	 */
328
329	if (vpid > VM_MAXCPU)
330		free_unr(vpid_unr, vpid);
331}
332
333static void
334vpid_alloc(uint16_t *vpid, int num)
335{
336	int i, x;
337
338	if (num <= 0 || num > VM_MAXCPU)
339		panic("invalid number of vpids requested: %d", num);
340
341	/*
342	 * If the "enable vpid" execution control is not enabled then the
343	 * VPID is required to be 0 for all vcpus.
344	 */
345	if ((procbased_ctls2 & PROCBASED2_ENABLE_VPID) == 0) {
346		for (i = 0; i < num; i++)
347			vpid[i] = 0;
348		return;
349	}
350
351	/*
352	 * Allocate a unique VPID for each vcpu from the unit number allocator.
353	 */
354	for (i = 0; i < num; i++) {
355		x = alloc_unr(vpid_unr);
356		if (x == -1)
357			break;
358		else
359			vpid[i] = x;
360	}
361
362	if (i < num) {
363		atomic_add_int(&vpid_alloc_failed, 1);
364
365		/*
366		 * If the unit number allocator does not have enough unique
367		 * VPIDs then we need to allocate from the [1,VM_MAXCPU] range.
368		 *
369		 * These VPIDs are not unique across VMs but this does not
370		 * affect correctness because the combined mappings are also
371		 * tagged with the EP4TA which is unique for each VM.
372		 *
373		 * It is still sub-optimal because the invvpid will invalidate
374		 * combined mappings for a particular VPID across all EP4TAs.
375		 */
376		while (i-- > 0)
377			vpid_free(vpid[i]);
378
379		for (i = 0; i < num; i++)
380			vpid[i] = i + 1;
381	}
382}
383
384static void
385vpid_init(void)
386{
387	/*
388	 * VPID 0 is required when the "enable VPID" execution control is
389	 * disabled.
390	 *
391	 * VPIDs [1,VM_MAXCPU] are used as the "overflow namespace" when the
392	 * unit number allocator does not have sufficient unique VPIDs to
393	 * satisfy the allocation.
394	 *
395	 * The remaining VPIDs are managed by the unit number allocator.
396	 */
397	vpid_unr = new_unrhdr(VM_MAXCPU + 1, 0xffff, NULL);
398}
399
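/*
 * Populate the guest MSR save area that is handed to vmcs_set_msr_save()
 * in vmx_vminit(). MSR_KGSBASE is currently the only MSR saved and
 * restored via this area.
 */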
400static void
401msr_save_area_init(struct msr_entry *g_area, int *g_count)
402{
403	int cnt;
404
405	static struct msr_entry guest_msrs[] = {
406		{ MSR_KGSBASE, 0, 0 },
407	};
408
409	cnt = sizeof(guest_msrs) / sizeof(guest_msrs[0]);
410	if (cnt > GUEST_MSR_MAX_ENTRIES)
411		panic("guest msr save area overrun");
412	bcopy(guest_msrs, g_area, sizeof(guest_msrs));
413	*g_count = cnt;
414}
415
416static void
417vmx_disable(void *arg __unused)
418{
419	struct invvpid_desc invvpid_desc = { 0 };
420	struct invept_desc invept_desc = { 0 };
421
422	if (vmxon_enabled[curcpu]) {
423		/*
424		 * See sections 25.3.3.3 and 25.3.3.4 in Intel Vol 3b.
425		 *
426		 * VMXON or VMXOFF are not required to invalidate any TLB
427		 * caching structures, so flush them explicitly here to avoid
428		 * retaining stale translations across distinct VMX episodes.
429		 */
430		invvpid(INVVPID_TYPE_ALL_CONTEXTS, invvpid_desc);
431		invept(INVEPT_TYPE_ALL_CONTEXTS, invept_desc);
432		vmxoff();
433	}
434	load_cr4(rcr4() & ~CR4_VMXE);
435}
436
437static int
438vmx_cleanup(void)
439{
440
441	if (pirvec != 0)
442		vmm_ipi_free(pirvec);
443
444	if (vpid_unr != NULL) {
445		delete_unrhdr(vpid_unr);
446		vpid_unr = NULL;
447	}
448
449	smp_rendezvous(NULL, vmx_disable, NULL, NULL);
450
451	return (0);
452}
453
454static void
455vmx_enable(void *arg __unused)
456{
457	int error;
458
459	load_cr4(rcr4() | CR4_VMXE);
460
461	*(uint32_t *)vmxon_region[curcpu] = vmx_revision();
462	error = vmxon(vmxon_region[curcpu]);
463	if (error == 0)
464		vmxon_enabled[curcpu] = 1;
465}
466
467static void
468vmx_restore(void)
469{
470
471	if (vmxon_enabled[curcpu])
472		vmxon(vmxon_region[curcpu]);
473}
474
475static int
476vmx_init(int ipinum)
477{
478	int error, use_tpr_shadow;
479	uint64_t fixed0, fixed1, feature_control;
480	uint32_t tmp, procbased2_vid_bits;
481
482	/* CPUID.1:ECX[bit 5] must be 1 for processor to support VMX */
483	if (!(cpu_feature2 & CPUID2_VMX)) {
484		printf("vmx_init: processor does not support VMX operation\n");
485		return (ENXIO);
486	}
487
488	/*
489	 * Verify that MSR_IA32_FEATURE_CONTROL lock and VMXON enable bits
490	 * are set (bits 0 and 2 respectively).
491	 */
492	feature_control = rdmsr(MSR_IA32_FEATURE_CONTROL);
493	if ((feature_control & IA32_FEATURE_CONTROL_LOCK) == 0 ||
494	    (feature_control & IA32_FEATURE_CONTROL_VMX_EN) == 0) {
495		printf("vmx_init: VMX operation disabled by BIOS\n");
496		return (ENXIO);
497	}
498
499	/* Check support for primary processor-based VM-execution controls */
500	error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
501			       MSR_VMX_TRUE_PROCBASED_CTLS,
502			       PROCBASED_CTLS_ONE_SETTING,
503			       PROCBASED_CTLS_ZERO_SETTING, &procbased_ctls);
504	if (error) {
505		printf("vmx_init: processor does not support desired primary "
506		       "processor-based controls\n");
507		return (error);
508	}
509
510	/* Clear the processor-based ctl bits that are set on demand */
511	procbased_ctls &= ~PROCBASED_CTLS_WINDOW_SETTING;
512
513	/* Check support for secondary processor-based VM-execution controls */
514	error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
515			       MSR_VMX_PROCBASED_CTLS2,
516			       PROCBASED_CTLS2_ONE_SETTING,
517			       PROCBASED_CTLS2_ZERO_SETTING, &procbased_ctls2);
518	if (error) {
519		printf("vmx_init: processor does not support desired secondary "
520		       "processor-based controls\n");
521		return (error);
522	}
523
524	/* Check support for VPID */
525	error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2,
526			       PROCBASED2_ENABLE_VPID, 0, &tmp);
527	if (error == 0)
528		procbased_ctls2 |= PROCBASED2_ENABLE_VPID;
529
530	/* Check support for pin-based VM-execution controls */
531	error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS,
532			       MSR_VMX_TRUE_PINBASED_CTLS,
533			       PINBASED_CTLS_ONE_SETTING,
534			       PINBASED_CTLS_ZERO_SETTING, &pinbased_ctls);
535	if (error) {
536		printf("vmx_init: processor does not support desired "
537		       "pin-based controls\n");
538		return (error);
539	}
540
541	/* Check support for VM-exit controls */
542	error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS, MSR_VMX_TRUE_EXIT_CTLS,
543			       VM_EXIT_CTLS_ONE_SETTING,
544			       VM_EXIT_CTLS_ZERO_SETTING,
545			       &exit_ctls);
546	if (error) {
547		/* Try again without the PAT MSR bits */
548		error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS,
549				       MSR_VMX_TRUE_EXIT_CTLS,
550				       VM_EXIT_CTLS_ONE_SETTING_NO_PAT,
551				       VM_EXIT_CTLS_ZERO_SETTING,
552				       &exit_ctls);
553		if (error) {
554			printf("vmx_init: processor does not support desired "
555			       "exit controls\n");
556			return (error);
557		} else {
558			if (bootverbose)
559				printf("vmm: PAT MSR access not supported\n");
560			guest_msr_valid(MSR_PAT);
561			vmx_no_patmsr = 1;
562		}
563	}
564
565	/* Check support for VM-entry controls */
566	if (!vmx_no_patmsr) {
567		error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS,
568				       MSR_VMX_TRUE_ENTRY_CTLS,
569				       VM_ENTRY_CTLS_ONE_SETTING,
570				       VM_ENTRY_CTLS_ZERO_SETTING,
571				       &entry_ctls);
572	} else {
573		error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS,
574				       MSR_VMX_TRUE_ENTRY_CTLS,
575				       VM_ENTRY_CTLS_ONE_SETTING_NO_PAT,
576				       VM_ENTRY_CTLS_ZERO_SETTING,
577				       &entry_ctls);
578	}
579
580	if (error) {
581		printf("vmx_init: processor does not support desired "
582		       "entry controls\n");
583		return (error);
584	}
585
586	/*
587	 * Check support for optional features by testing them
588	 * as individual bits
589	 */
590	cap_halt_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
591					MSR_VMX_TRUE_PROCBASED_CTLS,
592					PROCBASED_HLT_EXITING, 0,
593					&tmp) == 0);
594
595	cap_monitor_trap = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
596					MSR_VMX_PROCBASED_CTLS,
597					PROCBASED_MTF, 0,
598					&tmp) == 0);
599
600	cap_pause_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
601					 MSR_VMX_TRUE_PROCBASED_CTLS,
602					 PROCBASED_PAUSE_EXITING, 0,
603					 &tmp) == 0);
604
605	cap_unrestricted_guest = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
606					MSR_VMX_PROCBASED_CTLS2,
607					PROCBASED2_UNRESTRICTED_GUEST, 0,
608				        &tmp) == 0);
609
610	cap_invpcid = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
611	    MSR_VMX_PROCBASED_CTLS2, PROCBASED2_ENABLE_INVPCID, 0,
612	    &tmp) == 0);
613
614	/*
615	 * Check support for virtual interrupt delivery.
616	 */
617	procbased2_vid_bits = (PROCBASED2_VIRTUALIZE_APIC_ACCESSES |
618	    PROCBASED2_VIRTUALIZE_X2APIC_MODE |
619	    PROCBASED2_APIC_REGISTER_VIRTUALIZATION |
620	    PROCBASED2_VIRTUAL_INTERRUPT_DELIVERY);
621
622	use_tpr_shadow = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
623	    MSR_VMX_TRUE_PROCBASED_CTLS, PROCBASED_USE_TPR_SHADOW, 0,
624	    &tmp) == 0);
625
626	error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2,
627	    procbased2_vid_bits, 0, &tmp);
628	if (error == 0 && use_tpr_shadow) {
629		virtual_interrupt_delivery = 1;
630		TUNABLE_INT_FETCH("hw.vmm.vmx.use_apic_vid",
631		    &virtual_interrupt_delivery);
632	}
633
634	if (virtual_interrupt_delivery) {
635		procbased_ctls |= PROCBASED_USE_TPR_SHADOW;
636		procbased_ctls2 |= procbased2_vid_bits;
637		procbased_ctls2 &= ~PROCBASED2_VIRTUALIZE_X2APIC_MODE;
638
639		/*
640		 * Check for Posted Interrupts only if Virtual Interrupt
641		 * Delivery is enabled.
642		 */
643		error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS,
644		    MSR_VMX_TRUE_PINBASED_CTLS, PINBASED_POSTED_INTERRUPT, 0,
645		    &tmp);
646		if (error == 0) {
647			pirvec = vmm_ipi_alloc();
648			if (pirvec == 0) {
649				if (bootverbose) {
650					printf("vmx_init: unable to allocate "
651					    "posted interrupt vector\n");
652				}
653			} else {
654				posted_interrupts = 1;
655				TUNABLE_INT_FETCH("hw.vmm.vmx.use_apic_pir",
656				    &posted_interrupts);
657			}
658		}
659	}
660
661	if (posted_interrupts)
662		pinbased_ctls |= PINBASED_POSTED_INTERRUPT;
663
664	/* Initialize EPT */
665	error = ept_init(ipinum);
666	if (error) {
667		printf("vmx_init: ept initialization failed (%d)\n", error);
668		return (error);
669	}
670
671	/*
672	 * Stash the cr0 and cr4 bits that must be fixed to 0 or 1
673	 */
674	fixed0 = rdmsr(MSR_VMX_CR0_FIXED0);
675	fixed1 = rdmsr(MSR_VMX_CR0_FIXED1);
676	cr0_ones_mask = fixed0 & fixed1;
677	cr0_zeros_mask = ~fixed0 & ~fixed1;
678
679	/*
680	 * CR0_PE and CR0_PG can be set to zero in VMX non-root operation
681	 * if unrestricted guest execution is allowed.
682	 */
683	if (cap_unrestricted_guest)
684		cr0_ones_mask &= ~(CR0_PG | CR0_PE);
685
686	/*
687	 * Do not allow the guest to set CR0_NW or CR0_CD.
688	 */
689	cr0_zeros_mask |= (CR0_NW | CR0_CD);
690
691	fixed0 = rdmsr(MSR_VMX_CR4_FIXED0);
692	fixed1 = rdmsr(MSR_VMX_CR4_FIXED1);
693	cr4_ones_mask = fixed0 & fixed1;
694	cr4_zeros_mask = ~fixed0 & ~fixed1;
695
696	vpid_init();
697
698	/* enable VMX operation */
699	smp_rendezvous(NULL, vmx_enable, NULL, NULL);
700
701	vmx_initialized = 1;
702
703	return (0);
704}
705
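/*
 * Manually invoke the host's interrupt handler for 'vector': look up the
 * gate descriptor in the host IDT and call the handler directly. This is
 * needed because the "acknowledge interrupt on exit" VM-exit control is
 * set, so an external interrupt that causes a VM exit is acknowledged by
 * the processor but is not dispatched through the IDT.
 */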
706static void
707vmx_trigger_hostintr(int vector)
708{
709	uintptr_t func;
710	struct gate_descriptor *gd;
711
712	gd = &idt[vector];
713
714	KASSERT(vector >= 32 && vector <= 255, ("vmx_trigger_hostintr: "
715	    "invalid vector %d", vector));
716	KASSERT(gd->gd_p == 1, ("gate descriptor for vector %d not present",
717	    vector));
718	KASSERT(gd->gd_type == SDT_SYSIGT, ("gate descriptor for vector %d "
719	    "has invalid type %d", vector, gd->gd_type));
720	KASSERT(gd->gd_dpl == SEL_KPL, ("gate descriptor for vector %d "
721	    "has invalid dpl %d", vector, gd->gd_dpl));
722	KASSERT(gd->gd_selector == GSEL(GCODE_SEL, SEL_KPL), ("gate descriptor "
723	    "for vector %d has invalid selector %d", vector, gd->gd_selector));
724	KASSERT(gd->gd_ist == 0, ("gate descriptor for vector %d has invalid "
725	    "IST %d", vector, gd->gd_ist));
726
727	func = ((long)gd->gd_hioffset << 16 | gd->gd_looffset);
728	vmx_call_isr(func);
729}
730
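/*
 * Program the CR0 or CR4 guest/host mask and read shadow in the VMCS.
 * Guest reads of bits set in the mask return the shadow value, and guest
 * writes that would change a masked bit from its shadow value cause a
 * VM exit. 'initial' is the value the guest initially observes.
 */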
731static int
732vmx_setup_cr_shadow(int which, struct vmcs *vmcs, uint32_t initial)
733{
734	int error, mask_ident, shadow_ident;
735	uint64_t mask_value;
736
737	if (which != 0 && which != 4)
738		panic("vmx_setup_cr_shadow: unknown cr%d", which);
739
740	if (which == 0) {
741		mask_ident = VMCS_CR0_MASK;
742		mask_value = cr0_ones_mask | cr0_zeros_mask;
743		shadow_ident = VMCS_CR0_SHADOW;
744	} else {
745		mask_ident = VMCS_CR4_MASK;
746		mask_value = cr4_ones_mask | cr4_zeros_mask;
747		shadow_ident = VMCS_CR4_SHADOW;
748	}
749
750	error = vmcs_setreg(vmcs, 0, VMCS_IDENT(mask_ident), mask_value);
751	if (error)
752		return (error);
753
754	error = vmcs_setreg(vmcs, 0, VMCS_IDENT(shadow_ident), initial);
755	if (error)
756		return (error);
757
758	return (0);
759}
760#define	vmx_setup_cr0_shadow(vmcs,init)	vmx_setup_cr_shadow(0, (vmcs), (init))
761#define	vmx_setup_cr4_shadow(vmcs,init)	vmx_setup_cr_shadow(4, (vmcs), (init))
762
763static void *
764vmx_vminit(struct vm *vm, pmap_t pmap)
765{
766	uint16_t vpid[VM_MAXCPU];
767	int i, error, guest_msr_count;
768	struct vmx *vmx;
769	struct vmcs *vmcs;
770
771	vmx = malloc(sizeof(struct vmx), M_VMX, M_WAITOK | M_ZERO);
772	if ((uintptr_t)vmx & PAGE_MASK) {
773		panic("malloc of struct vmx not aligned on %d byte boundary",
774		      PAGE_SIZE);
775	}
776	vmx->vm = vm;
777
778	vmx->eptp = eptp(vtophys((vm_offset_t)pmap->pm_pml4));
779
780	/*
781	 * Clean up EPTP-tagged guest physical and combined mappings
782	 *
783	 * VMX transitions are not required to invalidate any guest physical
784	 * mappings. So, it may be possible for stale guest physical mappings
785	 * to be present in the processor TLBs.
786	 *
787	 * Combined mappings for this EP4TA are also invalidated for all VPIDs.
788	 */
789	ept_invalidate_mappings(vmx->eptp);
790
791	msr_bitmap_initialize(vmx->msr_bitmap);
792
793	/*
794	 * It is safe to allow direct access to MSR_GSBASE and MSR_FSBASE.
795	 * The guest FSBASE and GSBASE are saved and restored during
796	 * vm-exit and vm-entry respectively. The host FSBASE and GSBASE are
797	 * always restored from the vmcs host state area on vm-exit.
798	 *
799	 * The SYSENTER_CS/ESP/EIP MSRs are identical to FS/GSBASE in
800	 * how they are saved/restored so they can be directly accessed by the
801	 * guest.
802	 *
803	 * Guest KGSBASE is saved and restored in the guest MSR save area.
804	 * Host KGSBASE is restored before returning to userland from the pcb.
805	 * There will be a window of time when we are executing in the host
806	 * kernel context with a value of KGSBASE from the guest. This is ok
807	 * because the value of KGSBASE is inconsequential in kernel context.
808	 *
809	 * MSR_EFER is saved and restored in the guest VMCS area on a
810	 * VM exit and entry respectively. It is also restored from the
811	 * host VMCS area on a VM exit.
812	 */
813	if (guest_msr_rw(vmx, MSR_GSBASE) ||
814	    guest_msr_rw(vmx, MSR_FSBASE) ||
815	    guest_msr_rw(vmx, MSR_SYSENTER_CS_MSR) ||
816	    guest_msr_rw(vmx, MSR_SYSENTER_ESP_MSR) ||
817	    guest_msr_rw(vmx, MSR_SYSENTER_EIP_MSR) ||
818	    guest_msr_rw(vmx, MSR_KGSBASE) ||
819	    guest_msr_rw(vmx, MSR_EFER))
820		panic("vmx_vminit: error setting guest msr access");
821
822	/*
823	 * MSR_PAT is saved and restored in the guest VMCS area on a VM exit
824	 * and entry respectively. It is also restored from the host VMCS
825	 * area on a VM exit. However, if running on a system with no
826	 * MSR_PAT save/restore support, leave access disabled so accesses
827	 * will be trapped.
828	 */
829	if (!vmx_no_patmsr && guest_msr_rw(vmx, MSR_PAT))
830		panic("vmx_vminit: error setting guest pat msr access");
831
832	vpid_alloc(vpid, VM_MAXCPU);
833
834	if (virtual_interrupt_delivery) {
835		error = vm_map_mmio(vm, DEFAULT_APIC_BASE, PAGE_SIZE,
836		    APIC_ACCESS_ADDRESS);
837		/* XXX this should really return an error to the caller */
838		KASSERT(error == 0, ("vm_map_mmio(apicbase) error %d", error));
839	}
840
841	for (i = 0; i < VM_MAXCPU; i++) {
842		vmcs = &vmx->vmcs[i];
843		vmcs->identifier = vmx_revision();
844		error = vmclear(vmcs);
845		if (error != 0) {
846			panic("vmx_vminit: vmclear error %d on vcpu %d\n",
847			      error, i);
848		}
849
850		error = vmcs_init(vmcs);
851		KASSERT(error == 0, ("vmcs_init error %d", error));
852
853		VMPTRLD(vmcs);
854		error = 0;
855		error += vmwrite(VMCS_HOST_RSP, (u_long)&vmx->ctx[i]);
856		error += vmwrite(VMCS_EPTP, vmx->eptp);
857		error += vmwrite(VMCS_PIN_BASED_CTLS, pinbased_ctls);
858		error += vmwrite(VMCS_PRI_PROC_BASED_CTLS, procbased_ctls);
859		error += vmwrite(VMCS_SEC_PROC_BASED_CTLS, procbased_ctls2);
860		error += vmwrite(VMCS_EXIT_CTLS, exit_ctls);
861		error += vmwrite(VMCS_ENTRY_CTLS, entry_ctls);
862		error += vmwrite(VMCS_MSR_BITMAP, vtophys(vmx->msr_bitmap));
863		error += vmwrite(VMCS_VPID, vpid[i]);
864		if (virtual_interrupt_delivery) {
865			error += vmwrite(VMCS_APIC_ACCESS, APIC_ACCESS_ADDRESS);
866			error += vmwrite(VMCS_VIRTUAL_APIC,
867			    vtophys(&vmx->apic_page[i]));
868			error += vmwrite(VMCS_EOI_EXIT0, 0);
869			error += vmwrite(VMCS_EOI_EXIT1, 0);
870			error += vmwrite(VMCS_EOI_EXIT2, 0);
871			error += vmwrite(VMCS_EOI_EXIT3, 0);
872		}
873		if (posted_interrupts) {
874			error += vmwrite(VMCS_PIR_VECTOR, pirvec);
875			error += vmwrite(VMCS_PIR_DESC,
876			    vtophys(&vmx->pir_desc[i]));
877		}
878		VMCLEAR(vmcs);
879		KASSERT(error == 0, ("vmx_vminit: error customizing the vmcs"));
880
881		vmx->cap[i].set = 0;
882		vmx->cap[i].proc_ctls = procbased_ctls;
883		vmx->cap[i].proc_ctls2 = procbased_ctls2;
884
885		vmx->state[i].lastcpu = -1;
886		vmx->state[i].vpid = vpid[i];
887		vmx->state[i].user_event.intr_info = 0;
888
889		msr_save_area_init(vmx->guest_msrs[i], &guest_msr_count);
890
891		error = vmcs_set_msr_save(vmcs, vtophys(vmx->guest_msrs[i]),
892		    guest_msr_count);
893		if (error != 0)
894			panic("vmcs_set_msr_save error %d", error);
895
896		/*
897		 * Set up the CR0/4 shadows, and init the read shadow
898		 * to the power-on register value from the Intel Sys Arch.
899		 *  CR0 - 0x60000010
900		 *  CR4 - 0
901		 */
902		error = vmx_setup_cr0_shadow(vmcs, 0x60000010);
903		if (error != 0)
904			panic("vmx_setup_cr0_shadow %d", error);
905
906		error = vmx_setup_cr4_shadow(vmcs, 0);
907		if (error != 0)
908			panic("vmx_setup_cr4_shadow %d", error);
909
910		vmx->ctx[i].pmap = pmap;
911	}
912
913	return (vmx);
914}
915
916static int
917vmx_handle_cpuid(struct vm *vm, int vcpu, struct vmxctx *vmxctx)
918{
919	int handled, func;
920
921	func = vmxctx->guest_rax;
922
923	handled = x86_emulate_cpuid(vm, vcpu,
924				    (uint32_t*)(&vmxctx->guest_rax),
925				    (uint32_t*)(&vmxctx->guest_rbx),
926				    (uint32_t*)(&vmxctx->guest_rcx),
927				    (uint32_t*)(&vmxctx->guest_rdx));
928	return (handled);
929}
930
931static __inline void
932vmx_run_trace(struct vmx *vmx, int vcpu)
933{
934#ifdef KTR
935	VCPU_CTR1(vmx->vm, vcpu, "Resume execution at %#lx", vmcs_guest_rip());
936#endif
937}
938
939static __inline void
940vmx_exit_trace(struct vmx *vmx, int vcpu, uint64_t rip, uint32_t exit_reason,
941	       int handled)
942{
943#ifdef KTR
944	VCPU_CTR3(vmx->vm, vcpu, "%s %s vmexit at 0x%0lx",
945		 handled ? "handled" : "unhandled",
946		 exit_reason_to_str(exit_reason), rip);
947#endif
948}
949
950static __inline void
951vmx_astpending_trace(struct vmx *vmx, int vcpu, uint64_t rip)
952{
953#ifdef KTR
954	VCPU_CTR1(vmx->vm, vcpu, "astpending vmexit at 0x%0lx", rip);
955#endif
956}
957
958static VMM_STAT_INTEL(VCPU_INVVPID_SAVED, "Number of vpid invalidations saved");
959
960static void
961vmx_set_pcpu_defaults(struct vmx *vmx, int vcpu, pmap_t pmap)
962{
963	struct vmxstate *vmxstate;
964	struct invvpid_desc invvpid_desc;
965
966	vmxstate = &vmx->state[vcpu];
967	if (vmxstate->lastcpu == curcpu)
968		return;
969
970	vmxstate->lastcpu = curcpu;
971
972	vmm_stat_incr(vmx->vm, vcpu, VCPU_MIGRATIONS, 1);
973
974	vmcs_write(VMCS_HOST_TR_BASE, vmm_get_host_trbase());
975	vmcs_write(VMCS_HOST_GDTR_BASE, vmm_get_host_gdtrbase());
976	vmcs_write(VMCS_HOST_GS_BASE, vmm_get_host_gsbase());
977
978	/*
979	 * If we are using VPIDs then invalidate all mappings tagged with 'vpid'
980	 *
981	 * We do this because this vcpu was executing on a different host
982	 * cpu when it last ran. We do not track whether it invalidated
983	 * mappings associated with its 'vpid' during that run. So we must
984	 * assume that the mappings associated with 'vpid' on 'curcpu' are
985	 * stale and invalidate them.
986	 *
987	 * Note that we incur this penalty only when the scheduler chooses to
988	 * move the thread associated with this vcpu between host cpus.
989	 *
990	 * Note also that this will invalidate mappings tagged with 'vpid'
991	 * for "all" EP4TAs.
992	 */
993	if (vmxstate->vpid != 0) {
994		if (pmap->pm_eptgen == vmx->eptgen[curcpu]) {
995			invvpid_desc._res1 = 0;
996			invvpid_desc._res2 = 0;
997			invvpid_desc.vpid = vmxstate->vpid;
998			invvpid(INVVPID_TYPE_SINGLE_CONTEXT, invvpid_desc);
999		} else {
1000			/*
1001			 * The invvpid can be skipped if an invept is going to
1002			 * be performed before entering the guest. The invept
1003			 * will invalidate combined mappings tagged with
1004			 * 'vmx->eptp' for all vpids.
1005			 */
1006			vmm_stat_incr(vmx->vm, vcpu, VCPU_INVVPID_SAVED, 1);
1007		}
1008	}
1009}
1010
1011/*
1012 * We depend on 'procbased_ctls' to have the Interrupt Window Exiting bit set.
1013 */
1014CTASSERT((PROCBASED_CTLS_ONE_SETTING & PROCBASED_INT_WINDOW_EXITING) != 0);
1015
1016static void __inline
1017vmx_set_int_window_exiting(struct vmx *vmx, int vcpu)
1018{
1019
1020	if ((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) == 0) {
1021		vmx->cap[vcpu].proc_ctls |= PROCBASED_INT_WINDOW_EXITING;
1022		vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
1023		VCPU_CTR0(vmx->vm, vcpu, "Enabling interrupt window exiting");
1024	}
1025}
1026
1027static void __inline
1028vmx_clear_int_window_exiting(struct vmx *vmx, int vcpu)
1029{
1030
1031	KASSERT((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) != 0,
1032	    ("intr_window_exiting not set: %#x", vmx->cap[vcpu].proc_ctls));
1033	vmx->cap[vcpu].proc_ctls &= ~PROCBASED_INT_WINDOW_EXITING;
1034	vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
1035	VCPU_CTR0(vmx->vm, vcpu, "Disabling interrupt window exiting");
1036}
1037
1038static void __inline
1039vmx_set_nmi_window_exiting(struct vmx *vmx, int vcpu)
1040{
1041
1042	if ((vmx->cap[vcpu].proc_ctls & PROCBASED_NMI_WINDOW_EXITING) == 0) {
1043		vmx->cap[vcpu].proc_ctls |= PROCBASED_NMI_WINDOW_EXITING;
1044		vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
1045		VCPU_CTR0(vmx->vm, vcpu, "Enabling NMI window exiting");
1046	}
1047}
1048
1049static void __inline
1050vmx_clear_nmi_window_exiting(struct vmx *vmx, int vcpu)
1051{
1052
1053	KASSERT((vmx->cap[vcpu].proc_ctls & PROCBASED_NMI_WINDOW_EXITING) != 0,
1054	    ("nmi_window_exiting not set %#x", vmx->cap[vcpu].proc_ctls));
1055	vmx->cap[vcpu].proc_ctls &= ~PROCBASED_NMI_WINDOW_EXITING;
1056	vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
1057	VCPU_CTR0(vmx->vm, vcpu, "Disabling NMI window exiting");
1058}
1059
1060#define	NMI_BLOCKING	(VMCS_INTERRUPTIBILITY_NMI_BLOCKING |		\
1061			 VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING)
1062#define	HWINTR_BLOCKING	(VMCS_INTERRUPTIBILITY_STI_BLOCKING |		\
1063			 VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING)
1064
1065static void
1066vmx_inject_user_event(struct vmx *vmx, int vcpu)
1067{
1068	struct vmxevent *user_event;
1069	uint32_t info;
1070
1071	user_event = &vmx->state[vcpu].user_event;
1072
1073	info = vmcs_read(VMCS_ENTRY_INTR_INFO);
1074	KASSERT((info & VMCS_INTR_VALID) == 0, ("vmx_inject_user_event: invalid "
1075	    "VM-entry interruption information %#x", info));
1076
1077	vmcs_write(VMCS_ENTRY_INTR_INFO, user_event->intr_info);
1078	if (user_event->intr_info & VMCS_INTR_DEL_ERRCODE)
1079		vmcs_write(VMCS_ENTRY_EXCEPTION_ERROR, user_event->error_code);
1080	user_event->intr_info = 0;
1081}
1082
1083static void
1084vmx_inject_exception(struct vmx *vmx, int vcpu, struct vm_exit *vmexit,
1085    int fault, int errvalid, int errcode)
1086{
1087	uint32_t info;
1088
1089	info = vmcs_read(VMCS_ENTRY_INTR_INFO);
1090	KASSERT((info & VMCS_INTR_VALID) == 0, ("vmx_inject_exception: invalid "
1091	    "VM-entry interruption information %#x", info));
1092
1093	/*
1094	 * Although INTR_T_HWEXCEPTION does not advance %rip, vmx_run()
1095	 * always advances it, so we clear the instruction length to zero
1096	 * explicitly.
1097	 */
1098	vmexit->inst_length = 0;
1099	info = fault | VMCS_INTR_T_HWEXCEPTION | VMCS_INTR_VALID;
1100	if (errvalid) {
1101		info |= VMCS_INTR_DEL_ERRCODE;
1102		vmcs_write(VMCS_ENTRY_EXCEPTION_ERROR, errcode);
1103	}
1104	vmcs_write(VMCS_ENTRY_INTR_INFO, info);
1105
1106	VCPU_CTR2(vmx->vm, vcpu, "Injecting fault %d (errcode %d)", fault,
1107	    errcode);
1108}
1109
1110/* All GP# faults VMM injects use an error code of 0. */
1111static void
1112vmx_inject_gp(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
1113{
1114
1115	vmx_inject_exception(vmx, vcpu, vmexit, IDT_GP, 1, 0);
1116}
1117
1118static void
1119vmx_inject_ud(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
1120{
1121
1122	vmx_inject_exception(vmx, vcpu, vmexit, IDT_UD, 0, 0);
1123}
1124
1125static void
1126vmx_inject_nmi(struct vmx *vmx, int vcpu)
1127{
1128	uint32_t gi, info;
1129
1130	gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
1131	KASSERT((gi & NMI_BLOCKING) == 0, ("vmx_inject_nmi: invalid guest "
1132	    "interruptibility-state %#x", gi));
1133
1134	info = vmcs_read(VMCS_ENTRY_INTR_INFO);
1135	KASSERT((info & VMCS_INTR_VALID) == 0, ("vmx_inject_nmi: invalid "
1136	    "VM-entry interruption information %#x", info));
1137
1138	/*
1139	 * Inject the virtual NMI. The vector must be the NMI IDT entry
1140	 * or the VMCS entry check will fail.
1141	 */
1142	info = IDT_NMI | VMCS_INTR_T_NMI | VMCS_INTR_VALID;
1143	vmcs_write(VMCS_ENTRY_INTR_INFO, info);
1144
1145	VCPU_CTR0(vmx->vm, vcpu, "Injecting vNMI");
1146
1147	/* Clear the request */
1148	vm_nmi_clear(vmx->vm, vcpu);
1149}
1150
1151static void
1152vmx_inject_interrupts(struct vmx *vmx, int vcpu, struct vlapic *vlapic)
1153{
1154	int vector, need_nmi_exiting;
1155	uint64_t rflags;
1156	uint32_t gi, info;
1157
1158	if (vm_nmi_pending(vmx->vm, vcpu)) {
1159		/*
1160		 * If there are no conditions blocking NMI injection then
1161		 * inject it directly here. Otherwise enable "NMI window
1162		 * exiting" to inject it as soon as we can.
1163		 *
1164		 * We also check for STI_BLOCKING because some implementations
1165		 * don't allow NMI injection in this case. If we are running
1166		 * on a processor that doesn't have this restriction it will
1167		 * immediately exit and the NMI will be injected in the
1168		 * "NMI window exiting" handler.
1169		 */
1170		need_nmi_exiting = 1;
1171		gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
1172		if ((gi & (HWINTR_BLOCKING | NMI_BLOCKING)) == 0) {
1173			info = vmcs_read(VMCS_ENTRY_INTR_INFO);
1174			if ((info & VMCS_INTR_VALID) == 0) {
1175				vmx_inject_nmi(vmx, vcpu);
1176				need_nmi_exiting = 0;
1177			} else {
1178				VCPU_CTR1(vmx->vm, vcpu, "Cannot inject NMI "
1179				    "due to VM-entry intr info %#x", info);
1180			}
1181		} else {
1182			VCPU_CTR1(vmx->vm, vcpu, "Cannot inject NMI due to "
1183			    "Guest Interruptibility-state %#x", gi);
1184		}
1185
1186		if (need_nmi_exiting)
1187			vmx_set_nmi_window_exiting(vmx, vcpu);
1188	}
1189
1190	/*
1191	 * If there is a user injection event pending and there isn't
1192	 * an interrupt queued already, inject the user event.
1193	 */
1194	if (vmx->state[vcpu].user_event.intr_info & VMCS_INTR_VALID) {
1195		info = vmcs_read(VMCS_ENTRY_INTR_INFO);
1196		if ((info & VMCS_INTR_VALID) == 0) {
1197			vmx_inject_user_event(vmx, vcpu);
1198		} else {
1199			/*
1200			 * XXX: Do we need to force an exit so this can
1201			 * be injected?
1202			 */
1203			VCPU_CTR1(vmx->vm, vcpu, "Cannot inject user event "
1204			    "due to VM-entry intr info %#x", info);
1205		}
1206	}
1207
1208	if (virtual_interrupt_delivery) {
1209		vmx_inject_pir(vlapic);
1210		return;
1211	}
1212
1213	/*
1214	 * If interrupt-window exiting is already in effect then don't bother
1215	 * checking for pending interrupts. This is just an optimization and
1216	 * not needed for correctness.
1217	 */
1218	if ((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) != 0) {
1219		VCPU_CTR0(vmx->vm, vcpu, "Skip interrupt injection due to "
1220		    "pending int_window_exiting");
1221		return;
1222	}
1223
1224	/* Ask the local apic for a vector to inject */
1225	if (!vlapic_pending_intr(vlapic, &vector))
1226		return;
1227
1228	KASSERT(vector >= 32 && vector <= 255, ("invalid vector %d", vector));
1229
1230	/* Check RFLAGS.IF and the interruptibility state of the guest */
1231	rflags = vmcs_read(VMCS_GUEST_RFLAGS);
1232	if ((rflags & PSL_I) == 0) {
1233		VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to "
1234		    "rflags %#lx", vector, rflags);
1235		goto cantinject;
1236	}
1237
1238	gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
1239	if (gi & HWINTR_BLOCKING) {
1240		VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to "
1241		    "Guest Interruptibility-state %#x", vector, gi);
1242		goto cantinject;
1243	}
1244
1245	info = vmcs_read(VMCS_ENTRY_INTR_INFO);
1246	if (info & VMCS_INTR_VALID) {
1247		/*
1248		 * This is expected and could happen for multiple reasons:
1249		 * - A vectoring VM-entry was aborted due to astpending
1250		 * - A VM-exit happened during event injection.
1251		 * - An NMI was injected above or after "NMI window exiting"
1252		 */
1253		VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to "
1254		    "VM-entry intr info %#x", vector, info);
1255		goto cantinject;
1256	}
1257
1258	/* Inject the interrupt */
1259	info = VMCS_INTR_T_HWINTR | VMCS_INTR_VALID;
1260	info |= vector;
1261	vmcs_write(VMCS_ENTRY_INTR_INFO, info);
1262
1263	/* Update the Local APIC ISR */
1264	vlapic_intr_accepted(vlapic, vector);
1265
1266	VCPU_CTR1(vmx->vm, vcpu, "Injecting hwintr at vector %d", vector);
1267
1268	return;
1269
1270cantinject:
1271	/*
1272	 * Set the Interrupt Window Exiting execution control so we can inject
1273	 * the interrupt as soon as the blocking condition goes away.
1274	 */
1275	vmx_set_int_window_exiting(vmx, vcpu);
1276}
1277
1278/*
1279 * If the Virtual NMIs execution control is '1' then the logical processor
1280 * tracks virtual-NMI blocking in the Guest Interruptibility-state field of
1281 * the VMCS. An IRET instruction in VMX non-root operation will remove any
1282 * virtual-NMI blocking.
1283 *
1284 * This unblocking occurs even if the IRET causes a fault. In this case the
1285 * hypervisor needs to restore virtual-NMI blocking before resuming the guest.
1286 */
1287static void
1288vmx_restore_nmi_blocking(struct vmx *vmx, int vcpuid)
1289{
1290	uint32_t gi;
1291
1292	VCPU_CTR0(vmx->vm, vcpuid, "Restore Virtual-NMI blocking");
1293	gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
1294	gi |= VMCS_INTERRUPTIBILITY_NMI_BLOCKING;
1295	vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi);
1296}
1297
1298static void
1299vmx_clear_nmi_blocking(struct vmx *vmx, int vcpuid)
1300{
1301	uint32_t gi;
1302
1303	VCPU_CTR0(vmx->vm, vcpuid, "Clear Virtual-NMI blocking");
1304	gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
1305	gi &= ~VMCS_INTERRUPTIBILITY_NMI_BLOCKING;
1306	vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi);
1307}
1308
1309static int
1310vmx_emulate_xsetbv(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
1311{
1312	struct vmxctx *vmxctx;
1313	uint64_t xcrval;
1314	const struct xsave_limits *limits;
1315
1316	vmxctx = &vmx->ctx[vcpu];
1317	limits = vmm_get_xsave_limits();
1318
1319	/*
1320	 * Note that the processor raises a GP# fault on its own if
1321	 * xsetbv is executed for CPL != 0, so we do not have to
1322	 * emulate that fault here.
1323	 */
1324
1325	/* Only xcr0 is supported. */
1326	if (vmxctx->guest_rcx != 0) {
1327		vmx_inject_gp(vmx, vcpu, vmexit);
1328		return (HANDLED);
1329	}
1330
1331	/* We only handle xcr0 if both the host and guest have XSAVE enabled. */
1332	if (!limits->xsave_enabled || !(vmcs_read(VMCS_GUEST_CR4) & CR4_XSAVE)) {
1333		vmx_inject_ud(vmx, vcpu, vmexit);
1334		return (HANDLED);
1335	}
1336
1337	xcrval = vmxctx->guest_rdx << 32 | (vmxctx->guest_rax & 0xffffffff);
1338	if ((xcrval & ~limits->xcr0_allowed) != 0) {
1339		vmx_inject_gp(vmx, vcpu, vmexit);
1340		return (HANDLED);
1341	}
1342
1343	if (!(xcrval & XFEATURE_ENABLED_X87)) {
1344		vmx_inject_gp(vmx, vcpu, vmexit);
1345		return (HANDLED);
1346	}
1347
1348	if ((xcrval & (XFEATURE_ENABLED_AVX | XFEATURE_ENABLED_SSE)) ==
1349	    XFEATURE_ENABLED_AVX) {
1350		vmx_inject_gp(vmx, vcpu, vmexit);
1351		return (HANDLED);
1352	}
1353
1354	/*
1355	 * This runs "inside" vmrun() with the guest's FPU state, so
1356	 * modifying xcr0 directly modifies the guest's xcr0, not the
1357	 * host's.
1358	 */
1359	load_xcr(0, xcrval);
1360	return (HANDLED);
1361}
1362
1363static int
1364vmx_emulate_cr_access(struct vmx *vmx, int vcpu, uint64_t exitqual)
1365{
1366	int cr, vmcs_guest_cr, vmcs_shadow_cr;
1367	uint64_t crval, regval, ones_mask, zeros_mask;
1368	const struct vmxctx *vmxctx;
1369
1370	/* We only handle mov to %cr0 or %cr4 at this time */
1371	if ((exitqual & 0xf0) != 0x00)
1372		return (UNHANDLED);
1373
1374	cr = exitqual & 0xf;
1375	if (cr != 0 && cr != 4)
1376		return (UNHANDLED);
1377
1378	regval = 0; /* silence gcc */
1379	vmxctx = &vmx->ctx[vcpu];
1380
1381	/*
1382	 * We must use vmcs_write() directly here because vmcs_setreg() will
1383	 * call vmclear(vmcs) as a side-effect which we certainly don't want.
1384	 */
1385	switch ((exitqual >> 8) & 0xf) {
1386	case 0:
1387		regval = vmxctx->guest_rax;
1388		break;
1389	case 1:
1390		regval = vmxctx->guest_rcx;
1391		break;
1392	case 2:
1393		regval = vmxctx->guest_rdx;
1394		break;
1395	case 3:
1396		regval = vmxctx->guest_rbx;
1397		break;
1398	case 4:
1399		regval = vmcs_read(VMCS_GUEST_RSP);
1400		break;
1401	case 5:
1402		regval = vmxctx->guest_rbp;
1403		break;
1404	case 6:
1405		regval = vmxctx->guest_rsi;
1406		break;
1407	case 7:
1408		regval = vmxctx->guest_rdi;
1409		break;
1410	case 8:
1411		regval = vmxctx->guest_r8;
1412		break;
1413	case 9:
1414		regval = vmxctx->guest_r9;
1415		break;
1416	case 10:
1417		regval = vmxctx->guest_r10;
1418		break;
1419	case 11:
1420		regval = vmxctx->guest_r11;
1421		break;
1422	case 12:
1423		regval = vmxctx->guest_r12;
1424		break;
1425	case 13:
1426		regval = vmxctx->guest_r13;
1427		break;
1428	case 14:
1429		regval = vmxctx->guest_r14;
1430		break;
1431	case 15:
1432		regval = vmxctx->guest_r15;
1433		break;
1434	}
1435
1436	if (cr == 0) {
1437		ones_mask = cr0_ones_mask;
1438		zeros_mask = cr0_zeros_mask;
1439		vmcs_guest_cr = VMCS_GUEST_CR0;
1440		vmcs_shadow_cr = VMCS_CR0_SHADOW;
1441	} else {
1442		ones_mask = cr4_ones_mask;
1443		zeros_mask = cr4_zeros_mask;
1444		vmcs_guest_cr = VMCS_GUEST_CR4;
1445		vmcs_shadow_cr = VMCS_CR4_SHADOW;
1446	}
1447	vmcs_write(vmcs_shadow_cr, regval);
1448
1449	crval = regval | ones_mask;
1450	crval &= ~zeros_mask;
1451	vmcs_write(vmcs_guest_cr, crval);
1452
1453	if (cr == 0 && regval & CR0_PG) {
1454		uint64_t efer, entry_ctls;
1455
1456		/*
1457		 * If CR0.PG is 1 and EFER.LME is 1 then EFER.LMA and
1458		 * the "IA-32e mode guest" bit in VM-entry control must be
1459		 * equal.
1460		 */
1461		efer = vmcs_read(VMCS_GUEST_IA32_EFER);
1462		if (efer & EFER_LME) {
1463			efer |= EFER_LMA;
1464			vmcs_write(VMCS_GUEST_IA32_EFER, efer);
1465			entry_ctls = vmcs_read(VMCS_ENTRY_CTLS);
1466			entry_ctls |= VM_ENTRY_GUEST_LMA;
1467			vmcs_write(VMCS_ENTRY_CTLS, entry_ctls);
1468		}
1469	}
1470
1471	return (HANDLED);
1472}
1473
1474static enum vie_cpu_mode
1475vmx_cpu_mode(void)
1476{
1477
1478	if (vmcs_read(VMCS_GUEST_IA32_EFER) & EFER_LMA)
1479		return (CPU_MODE_64BIT);
1480	else
1481		return (CPU_MODE_COMPATIBILITY);
1482}
1483
1484static enum vie_paging_mode
1485vmx_paging_mode(void)
1486{
1487
1488	if (!(vmcs_read(VMCS_GUEST_CR0) & CR0_PG))
1489		return (PAGING_MODE_FLAT);
1490	if (!(vmcs_read(VMCS_GUEST_CR4) & CR4_PAE))
1491		return (PAGING_MODE_32);
1492	if (vmcs_read(VMCS_GUEST_IA32_EFER) & EFER_LME)
1493		return (PAGING_MODE_64);
1494	else
1495		return (PAGING_MODE_PAE);
1496}
1497
1498static int
1499ept_fault_type(uint64_t ept_qual)
1500{
1501	int fault_type;
1502
1503	if (ept_qual & EPT_VIOLATION_DATA_WRITE)
1504		fault_type = VM_PROT_WRITE;
1505	else if (ept_qual & EPT_VIOLATION_INST_FETCH)
1506		fault_type = VM_PROT_EXECUTE;
1507	else
1508		fault_type = VM_PROT_READ;
1509
1510	return (fault_type);
1511}
1512
1513static boolean_t
1514ept_emulation_fault(uint64_t ept_qual)
1515{
1516	int read, write;
1517
1518	/* EPT fault on an instruction fetch doesn't make sense here */
1519	if (ept_qual & EPT_VIOLATION_INST_FETCH)
1520		return (FALSE);
1521
1522	/* EPT fault must be a read fault or a write fault */
1523	read = ept_qual & EPT_VIOLATION_DATA_READ ? 1 : 0;
1524	write = ept_qual & EPT_VIOLATION_DATA_WRITE ? 1 : 0;
1525	if ((read | write) == 0)
1526		return (FALSE);
1527
1528	/*
1529	 * The EPT violation must have been caused by accessing a
1530	 * guest-physical address that is a translation of a guest-linear
1531	 * address.
1532	 */
1533	if ((ept_qual & EPT_VIOLATION_GLA_VALID) == 0 ||
1534	    (ept_qual & EPT_VIOLATION_XLAT_VALID) == 0) {
1535		return (FALSE);
1536	}
1537
1538	return (TRUE);
1539}
1540
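/*
 * Handle an APIC-write VM exit by forwarding the write to the vlapic
 * handler for the register identified by the exit qualification. Writes
 * that cannot be completed here are returned as UNHANDLED.
 */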
1541static int
1542vmx_handle_apic_write(struct vlapic *vlapic, uint64_t qual)
1543{
1544	int error, handled, offset;
1545	bool retu;
1546
1547	if (!virtual_interrupt_delivery)
1548		return (UNHANDLED);
1549
1550	handled = HANDLED;
1551	offset = APIC_WRITE_OFFSET(qual);
1552	switch (offset) {
1553	case APIC_OFFSET_ID:
1554		vlapic_id_write_handler(vlapic);
1555		break;
1556	case APIC_OFFSET_LDR:
1557		vlapic_ldr_write_handler(vlapic);
1558		break;
1559	case APIC_OFFSET_DFR:
1560		vlapic_dfr_write_handler(vlapic);
1561		break;
1562	case APIC_OFFSET_SVR:
1563		vlapic_svr_write_handler(vlapic);
1564		break;
1565	case APIC_OFFSET_ESR:
1566		vlapic_esr_write_handler(vlapic);
1567		break;
1568	case APIC_OFFSET_ICR_LOW:
1569		retu = false;
1570		error = vlapic_icrlo_write_handler(vlapic, &retu);
1571		if (error != 0 || retu)
1572			handled = UNHANDLED;
1573		break;
1574	case APIC_OFFSET_CMCI_LVT:
1575	case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT:
1576		vlapic_lvt_write_handler(vlapic, offset);
1577		break;
1578	case APIC_OFFSET_TIMER_ICR:
1579		vlapic_icrtmr_write_handler(vlapic);
1580		break;
1581	case APIC_OFFSET_TIMER_DCR:
1582		vlapic_dcr_write_handler(vlapic);
1583		break;
1584	default:
1585		handled = UNHANDLED;
1586		break;
1587	}
1588	return (handled);
1589}
1590
1591static bool
1592apic_access_fault(uint64_t gpa)
1593{
1594
1595	if (virtual_interrupt_delivery &&
1596	    (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE))
1597		return (true);
1598	else
1599		return (false);
1600}
1601
1602static int
1603vmx_handle_apic_access(struct vmx *vmx, int vcpuid, struct vm_exit *vmexit)
1604{
1605	uint64_t qual;
1606	int access_type, offset, allowed;
1607
1608	if (!virtual_interrupt_delivery)
1609		return (UNHANDLED);
1610
1611	qual = vmexit->u.vmx.exit_qualification;
1612	access_type = APIC_ACCESS_TYPE(qual);
1613	offset = APIC_ACCESS_OFFSET(qual);
1614
1615	allowed = 0;
1616	if (access_type == 0) {
1617		/*
1618		 * Read data access to the following registers is expected.
1619		 */
1620		switch (offset) {
1621		case APIC_OFFSET_APR:
1622		case APIC_OFFSET_PPR:
1623		case APIC_OFFSET_RRR:
1624		case APIC_OFFSET_CMCI_LVT:
1625		case APIC_OFFSET_TIMER_CCR:
1626			allowed = 1;
1627			break;
1628		default:
1629			break;
1630		}
1631	} else if (access_type == 1) {
1632		/*
1633		 * Write data access to the following registers is expected.
1634		 */
1635		switch (offset) {
1636		case APIC_OFFSET_VER:
1637		case APIC_OFFSET_APR:
1638		case APIC_OFFSET_PPR:
1639		case APIC_OFFSET_RRR:
1640		case APIC_OFFSET_ISR0 ... APIC_OFFSET_ISR7:
1641		case APIC_OFFSET_TMR0 ... APIC_OFFSET_TMR7:
1642		case APIC_OFFSET_IRR0 ... APIC_OFFSET_IRR7:
1643		case APIC_OFFSET_CMCI_LVT:
1644		case APIC_OFFSET_TIMER_CCR:
1645			allowed = 1;
1646			break;
1647		default:
1648			break;
1649		}
1650	}
1651
1652	if (allowed) {
1653		vmexit->exitcode = VM_EXITCODE_INST_EMUL;
1654		vmexit->u.inst_emul.gpa = DEFAULT_APIC_BASE + offset;
1655		vmexit->u.inst_emul.gla = VIE_INVALID_GLA;
1656		vmexit->u.inst_emul.cr3 = vmcs_guest_cr3();
1657		vmexit->u.inst_emul.cpu_mode = vmx_cpu_mode();
1658		vmexit->u.inst_emul.paging_mode = vmx_paging_mode();
1659	}
1660
1661	/*
1662	 * Regardless of whether the APIC-access is allowed this handler
1663	 * always returns UNHANDLED:
1664	 * - if the access is allowed then it is handled by emulating the
1665	 *   instruction that caused the VM-exit (outside the critical section)
1666	 * - if the access is not allowed then it will be converted to an
1667	 *   exitcode of VM_EXITCODE_VMX and will be dealt with in userland.
1668	 */
1669	return (UNHANDLED);
1670}
1671
1672static int
1673vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
1674{
1675	int error, handled;
1676	struct vmxctx *vmxctx;
1677	struct vlapic *vlapic;
1678	uint32_t eax, ecx, edx, idtvec_info, idtvec_err, intr_info, reason;
1679	uint64_t qual, gpa;
1680	bool retu;
1681
1682	CTASSERT((PINBASED_CTLS_ONE_SETTING & PINBASED_VIRTUAL_NMI) != 0);
1683	CTASSERT((PINBASED_CTLS_ONE_SETTING & PINBASED_NMI_EXITING) != 0);
1684
1685	handled = UNHANDLED;
1686	vmxctx = &vmx->ctx[vcpu];
1687
1688	qual = vmexit->u.vmx.exit_qualification;
1689	reason = vmexit->u.vmx.exit_reason;
1690	vmexit->exitcode = VM_EXITCODE_BOGUS;
1691
1692	vmm_stat_incr(vmx->vm, vcpu, VMEXIT_COUNT, 1);
1693
1694	/*
1695	 * VM exits that could be triggered during event injection on the
1696	 * previous VM entry need to be handled specially by re-injecting
1697	 * the event.
1698	 *
1699	 * See "Information for VM Exits During Event Delivery" in Intel SDM
1700	 * for details.
1701	 */
1702	switch (reason) {
1703	case EXIT_REASON_EPT_FAULT:
1704	case EXIT_REASON_EPT_MISCONFIG:
1705	case EXIT_REASON_APIC_ACCESS:
1706	case EXIT_REASON_TASK_SWITCH:
1707	case EXIT_REASON_EXCEPTION:
1708		idtvec_info = vmcs_idt_vectoring_info();
1709		if (idtvec_info & VMCS_IDT_VEC_VALID) {
1710			idtvec_info &= ~(1 << 12); /* clear undefined bit */
1711			vmcs_write(VMCS_ENTRY_INTR_INFO, idtvec_info);
1712			if (idtvec_info & VMCS_IDT_VEC_ERRCODE_VALID) {
1713				idtvec_err = vmcs_idt_vectoring_err();
1714				vmcs_write(VMCS_ENTRY_EXCEPTION_ERROR,
1715				    idtvec_err);
1716			}
1717			/*
1718			 * If 'virtual NMIs' are being used and the VM-exit
1719			 * happened while injecting an NMI during the previous
1720			 * VM-entry, then clear "blocking by NMI" in the Guest
1721			 * Interruptibility-state.
1722			 */
1723			if ((idtvec_info & VMCS_INTR_T_MASK) ==
1724			    VMCS_INTR_T_NMI) {
1725				 vmx_clear_nmi_blocking(vmx, vcpu);
1726			}
1727			vmcs_write(VMCS_ENTRY_INST_LENGTH, vmexit->inst_length);
1728		}
1729	default:
1730		idtvec_info = 0;
1731		break;
1732	}
1733
1734	switch (reason) {
1735	case EXIT_REASON_CR_ACCESS:
1736		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CR_ACCESS, 1);
1737		handled = vmx_emulate_cr_access(vmx, vcpu, qual);
1738		break;
1739	case EXIT_REASON_RDMSR:
1740		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_RDMSR, 1);
1741		retu = false;
1742		ecx = vmxctx->guest_rcx;
1743		error = emulate_rdmsr(vmx->vm, vcpu, ecx, &retu);
1744		if (error) {
1745			vmexit->exitcode = VM_EXITCODE_RDMSR;
1746			vmexit->u.msr.code = ecx;
1747		} else if (!retu) {
1748			handled = HANDLED;
1749		} else {
1750			/* Return to userspace with a valid exitcode */
1751			KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS,
1752			    ("emulate_rdmsr retu with bogus exitcode"));
1753		}
1754		break;
1755	case EXIT_REASON_WRMSR:
1756		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_WRMSR, 1);
1757		retu = false;
1758		eax = vmxctx->guest_rax;
1759		ecx = vmxctx->guest_rcx;
1760		edx = vmxctx->guest_rdx;
1761		error = emulate_wrmsr(vmx->vm, vcpu, ecx,
1762		    (uint64_t)edx << 32 | eax, &retu);
1763		if (error) {
1764			vmexit->exitcode = VM_EXITCODE_WRMSR;
1765			vmexit->u.msr.code = ecx;
1766			vmexit->u.msr.wval = (uint64_t)edx << 32 | eax;
1767		} else if (!retu) {
1768			handled = HANDLED;
1769		} else {
1770			/* Return to userspace with a valid exitcode */
1771			KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS,
1772			    ("emulate_wrmsr retu with bogus exitcode"));
1773		}
1774		break;
1775	case EXIT_REASON_HLT:
1776		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_HLT, 1);
1777		vmexit->exitcode = VM_EXITCODE_HLT;
1778		vmexit->u.hlt.rflags = vmcs_read(VMCS_GUEST_RFLAGS);
1779		break;
1780	case EXIT_REASON_MTF:
1781		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_MTRAP, 1);
1782		vmexit->exitcode = VM_EXITCODE_MTRAP;
1783		break;
1784	case EXIT_REASON_PAUSE:
1785		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_PAUSE, 1);
1786		vmexit->exitcode = VM_EXITCODE_PAUSE;
1787		break;
1788	case EXIT_REASON_INTR_WINDOW:
1789		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INTR_WINDOW, 1);
1790		vmx_clear_int_window_exiting(vmx, vcpu);
1791		return (1);
1792	case EXIT_REASON_EXT_INTR:
1793		/*
1794		 * External interrupts serve only to cause VM exits and allow
1795		 * the host interrupt handler to run.
1796		 *
1797		 * If this external interrupt triggers a virtual interrupt
1798		 * to a VM, then that state will be recorded by the
1799		 * host interrupt handler in the VM's softc. We will inject
1800		 * this virtual interrupt during the subsequent VM enter.
1801		 */
1802		intr_info = vmcs_read(VMCS_EXIT_INTR_INFO);
1803		KASSERT((intr_info & VMCS_INTR_VALID) != 0 &&
1804		    (intr_info & VMCS_INTR_T_MASK) == VMCS_INTR_T_HWINTR,
1805		    ("VM exit interruption info invalid: %#x", intr_info));
1806		vmx_trigger_hostintr(intr_info & 0xff);
1807
1808		/*
1809		 * This is special. We want to treat this as a 'handled'
1810		 * VM-exit but not increment the instruction pointer.
1811		 */
1812		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EXTINT, 1);
1813		return (1);
1814	case EXIT_REASON_NMI_WINDOW:
1815		/* Exit to allow the pending virtual NMI to be injected */
1816		if (vm_nmi_pending(vmx->vm, vcpu))
1817			vmx_inject_nmi(vmx, vcpu);
1818		vmx_clear_nmi_window_exiting(vmx, vcpu);
1819		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_NMI_WINDOW, 1);
1820		return (1);
1821	case EXIT_REASON_INOUT:
1822		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INOUT, 1);
1823		vmexit->exitcode = VM_EXITCODE_INOUT;
1824		vmexit->u.inout.bytes = (qual & 0x7) + 1;
1825		vmexit->u.inout.in = (qual & 0x8) ? 1 : 0;
1826		vmexit->u.inout.string = (qual & 0x10) ? 1 : 0;
1827		vmexit->u.inout.rep = (qual & 0x20) ? 1 : 0;
1828		vmexit->u.inout.port = (uint16_t)(qual >> 16);
1829		vmexit->u.inout.eax = (uint32_t)(vmxctx->guest_rax);
1830		break;
1831	case EXIT_REASON_CPUID:
1832		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CPUID, 1);
1833		handled = vmx_handle_cpuid(vmx->vm, vcpu, vmxctx);
1834		break;
1835	case EXIT_REASON_EXCEPTION:
1836		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EXCEPTION, 1);
1837		intr_info = vmcs_read(VMCS_EXIT_INTR_INFO);
1838		KASSERT((intr_info & VMCS_INTR_VALID) != 0,
1839		    ("VM exit interruption info invalid: %#x", intr_info));
1840
1841		/*
1842		 * If Virtual NMIs control is 1 and the VM-exit is due to a
1843		 * fault encountered during the execution of IRET then we must
1844		 * restore the state of "virtual-NMI blocking" before resuming
1845		 * the guest.
1846		 *
1847		 * See "Resuming Guest Software after Handling an Exception".
1848		 */
1849		if ((idtvec_info & VMCS_IDT_VEC_VALID) == 0 &&
1850		    (intr_info & 0xff) != IDT_DF &&
1851		    (intr_info & EXIT_QUAL_NMIUDTI) != 0)
1852			vmx_restore_nmi_blocking(vmx, vcpu);
1853
1854		/*
1855		 * The NMI has already been handled in vmx_exit_handle_nmi().
1856		 */
1857		if ((intr_info & VMCS_INTR_T_MASK) == VMCS_INTR_T_NMI)
1858			return (1);
1859		break;
1860	case EXIT_REASON_EPT_FAULT:
1861		/*
1862		 * If 'gpa' lies within the address space allocated to
1863		 * memory then this must be a nested page fault; otherwise
1864		 * this must be an instruction that accesses MMIO space.
1865		 */
1866		gpa = vmcs_gpa();
1867		if (vm_mem_allocated(vmx->vm, gpa) || apic_access_fault(gpa)) {
1868			vmexit->exitcode = VM_EXITCODE_PAGING;
1869			vmexit->u.paging.gpa = gpa;
1870			vmexit->u.paging.fault_type = ept_fault_type(qual);
1871			vmm_stat_incr(vmx->vm, vcpu, VMEXIT_NESTED_FAULT, 1);
1872		} else if (ept_emulation_fault(qual)) {
1873			vmexit->exitcode = VM_EXITCODE_INST_EMUL;
1874			vmexit->u.inst_emul.gpa = gpa;
1875			vmexit->u.inst_emul.gla = vmcs_gla();
1876			vmexit->u.inst_emul.cr3 = vmcs_guest_cr3();
1877			vmexit->u.inst_emul.cpu_mode = vmx_cpu_mode();
1878			vmexit->u.inst_emul.paging_mode = vmx_paging_mode();
1879			vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INST_EMUL, 1);
1880		}
1881		/*
1882		 * If Virtual NMIs control is 1 and the VM-exit is due to an
1883		 * EPT fault during the execution of IRET then we must restore
1884		 * the state of "virtual-NMI blocking" before resuming.
1885		 *
1886		 * See description of "NMI unblocking due to IRET" in
1887		 * "Exit Qualification for EPT Violations".
1888		 */
1889		if ((idtvec_info & VMCS_IDT_VEC_VALID) == 0 &&
1890		    (qual & EXIT_QUAL_NMIUDTI) != 0)
1891			vmx_restore_nmi_blocking(vmx, vcpu);
1892		break;
1893	case EXIT_REASON_VIRTUALIZED_EOI:
1894		vmexit->exitcode = VM_EXITCODE_IOAPIC_EOI;
1895		vmexit->u.ioapic_eoi.vector = qual & 0xFF;
1896		vmexit->inst_length = 0;	/* trap-like */
1897		break;
1898	case EXIT_REASON_APIC_ACCESS:
1899		handled = vmx_handle_apic_access(vmx, vcpu, vmexit);
1900		break;
1901	case EXIT_REASON_APIC_WRITE:
1902		/*
1903		 * APIC-write VM exit is trap-like so the %rip is already
1904		 * pointing to the next instruction.
1905		 */
1906		vmexit->inst_length = 0;
1907		vlapic = vm_lapic(vmx->vm, vcpu);
1908		handled = vmx_handle_apic_write(vlapic, qual);
1909		break;
1910	case EXIT_REASON_XSETBV:
1911		handled = vmx_emulate_xsetbv(vmx, vcpu, vmexit);
1912		break;
1913	default:
1914		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_UNKNOWN, 1);
1915		break;
1916	}
1917
1918	if (handled) {
1919		/*
1920		 * It is possible that control is returned to userland
1921		 * even though we were able to handle the VM exit in the
1922		 * kernel.
1923		 *
1924		 * In such a case we want to make sure that the userland
1925		 * restarts guest execution at the instruction *after*
1926		 * the one we just processed. Therefore we update the
1927		 * guest rip in the VMCS and in 'vmexit'.
1928		 */
1929		vmexit->rip += vmexit->inst_length;
1930		vmexit->inst_length = 0;
1931		vmcs_write(VMCS_GUEST_RIP, vmexit->rip);
1932	} else {
1933		if (vmexit->exitcode == VM_EXITCODE_BOGUS) {
1934			/*
1935			 * If this VM exit was not claimed by anybody then
1936			 * treat it as a generic VMX exit.
1937			 */
1938			vmexit->exitcode = VM_EXITCODE_VMX;
1939			vmexit->u.vmx.status = VM_SUCCESS;
1940			vmexit->u.vmx.inst_type = 0;
1941			vmexit->u.vmx.inst_error = 0;
1942		} else {
1943			/*
1944			 * The exitcode and collateral have been populated.
1945			 * The VM exit will be processed further in userland.
1946			 */
1947		}
1948	}
1949	return (handled);
1950}
1951
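/*
 * A host AST is pending on this thread. Fabricate a BOGUS exit so that the
 * vcpu thread breaks out of the run loop and the AST can be serviced before
 * the guest is re-entered.
 */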
1952static __inline int
1953vmx_exit_astpending(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
1954{
1955
1956	vmexit->rip = vmcs_guest_rip();
1957	vmexit->inst_length = 0;
1958	vmexit->exitcode = VM_EXITCODE_BOGUS;
1959	vmx_astpending_trace(vmx, vcpu, vmexit->rip);
1960	vmm_stat_incr(vmx->vm, vcpu, VMEXIT_ASTPENDING, 1);
1961
1962	return (HANDLED);
1963}
1964
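/*
 * A rendezvous is pending for this vcpu. Return an unhandled RENDEZVOUS
 * exit so that the vmm layer can complete the rendezvous before guest
 * execution is resumed.
 */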
1965static __inline int
1966vmx_exit_rendezvous(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
1967{
1968
1969	vmexit->rip = vmcs_guest_rip();
1970	vmexit->inst_length = 0;
1971	vmexit->exitcode = VM_EXITCODE_RENDEZVOUS;
1972	vmm_stat_incr(vmx->vm, vcpu, VMEXIT_RENDEZVOUS, 1);
1973
1974	return (UNHANDLED);
1975}
1976
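/*
 * vmx_enter_guest() failed to launch or resume the guest. Convert the
 * VM-instruction error into a generic VMX exit that is reported to
 * userspace.
 */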
1977static __inline int
1978vmx_exit_inst_error(struct vmxctx *vmxctx, int rc, struct vm_exit *vmexit)
1979{
1980
1981	KASSERT(vmxctx->inst_fail_status != VM_SUCCESS,
1982	    ("vmx_exit_inst_error: invalid inst_fail_status %d",
1983	    vmxctx->inst_fail_status));
1984
1985	vmexit->inst_length = 0;
1986	vmexit->exitcode = VM_EXITCODE_VMX;
1987	vmexit->u.vmx.status = vmxctx->inst_fail_status;
1988	vmexit->u.vmx.inst_error = vmcs_instruction_error();
1989	vmexit->u.vmx.exit_reason = ~0;
1990	vmexit->u.vmx.exit_qualification = ~0;
1991
1992	switch (rc) {
1993	case VMX_VMRESUME_ERROR:
1994	case VMX_VMLAUNCH_ERROR:
1995	case VMX_INVEPT_ERROR:
1996		vmexit->u.vmx.inst_type = rc;
1997		break;
1998	default:
1999		panic("vm_exit_inst_error: vmx_enter_guest returned %d", rc);
2000	}
2001
2002	return (UNHANDLED);
2003}
2004
2005/*
2006 * If the NMI-exiting VM execution control is set to '1' then an NMI in
2007 * non-root operation causes a VM-exit. NMI blocking is in effect so it is
2008 * sufficient to simply vector to the NMI handler via a software interrupt.
2009 * However, this must be done before maskable interrupts are enabled;
2010 * otherwise the "iret" issued by an interrupt handler will incorrectly
2011 * clear NMI blocking.
2012 */
2013static __inline void
2014vmx_exit_handle_nmi(struct vmx *vmx, int vcpuid, struct vm_exit *vmexit)
2015{
2016	uint32_t intr_info;
2017
2018	KASSERT((read_rflags() & PSL_I) == 0, ("interrupts enabled"));
2019
2020	if (vmexit->u.vmx.exit_reason != EXIT_REASON_EXCEPTION)
2021		return;
2022
2023	intr_info = vmcs_read(VMCS_EXIT_INTR_INFO);
2024	KASSERT((intr_info & VMCS_INTR_VALID) != 0,
2025	    ("VM exit interruption info invalid: %#x", intr_info));
2026
2027	if ((intr_info & VMCS_INTR_T_MASK) == VMCS_INTR_T_NMI) {
2028		KASSERT((intr_info & 0xff) == IDT_NMI, ("VM exit due "
2029		    "to NMI has invalid vector: %#x", intr_info));
2030		VCPU_CTR0(vmx->vm, vcpuid, "Vectoring to NMI handler");
2031		__asm __volatile("int $2");
2032	}
2033}
2034
2035static int
2036vmx_run(void *arg, int vcpu, register_t startrip, pmap_t pmap,
2037    void *rendezvous_cookie)
2038{
2039	int rc, handled, launched;
2040	struct vmx *vmx;
2041	struct vm *vm;
2042	struct vmxctx *vmxctx;
2043	struct vmcs *vmcs;
2044	struct vm_exit *vmexit;
2045	struct vlapic *vlapic;
2046	uint64_t rip;
2047	uint32_t exit_reason;
2048
2049	vmx = arg;
2050	vm = vmx->vm;
2051	vmcs = &vmx->vmcs[vcpu];
2052	vmxctx = &vmx->ctx[vcpu];
2053	vlapic = vm_lapic(vm, vcpu);
2054	vmexit = vm_exitinfo(vm, vcpu);
2055	launched = 0;
2056
2057	KASSERT(vmxctx->pmap == pmap,
2058	    ("pmap %p different than ctx pmap %p", pmap, vmxctx->pmap));
2059
2060	VMPTRLD(vmcs);
2061
2062	/*
2063	 * XXX
2064	 * We do this every time because we may set up the virtual machine
2065	 * from a different process than the one that actually runs it.
2066	 *
2067	 * If the life of a virtual machine was spent entirely in the context
2068	 * of a single process we could do this once in vmx_vminit().
2069	 */
2070	vmcs_write(VMCS_HOST_CR3, rcr3());
2071
2072	vmcs_write(VMCS_GUEST_RIP, startrip);
2073	vmx_set_pcpu_defaults(vmx, vcpu, pmap);
2074	do {
2075		/*
2076		 * Interrupts are disabled from this point on until the
2077		 * guest starts executing. This is done for the following
2078		 * reasons:
2079		 *
2080		 * If an AST is asserted on this thread after the check below,
2081		 * then the IPI_AST notification will not be lost, because it
2082		 * will cause a VM exit due to external interrupt as soon as
2083		 * the guest state is loaded.
2084		 *
2085		 * A posted interrupt after 'vmx_inject_interrupts()' will
2086		 * not be "lost" because it will be held pending in the host
2087		 * APIC because interrupts are disabled. The pending interrupt
2088		 * will be recognized as soon as the guest state is loaded.
2089		 *
2090		 * The same reasoning applies to the IPI generated by
2091		 * pmap_invalidate_ept().
2092		 */
2093		disable_intr();
2094		if (curthread->td_flags & (TDF_ASTPENDING | TDF_NEEDRESCHED)) {
2095			enable_intr();
2096			handled = vmx_exit_astpending(vmx, vcpu, vmexit);
2097			break;
2098		}
2099
2100		if (vcpu_rendezvous_pending(rendezvous_cookie)) {
2101			enable_intr();
2102			handled = vmx_exit_rendezvous(vmx, vcpu, vmexit);
2103			break;
2104		}
2105
2106		vmx_inject_interrupts(vmx, vcpu, vlapic);
2107		vmx_run_trace(vmx, vcpu);
2108		rc = vmx_enter_guest(vmxctx, vmx, launched);
2109
2110		/* Collect some information for VM exit processing */
2111		vmexit->rip = rip = vmcs_guest_rip();
2112		vmexit->inst_length = vmexit_instruction_length();
2113		vmexit->u.vmx.exit_reason = exit_reason = vmcs_exit_reason();
2114		vmexit->u.vmx.exit_qualification = vmcs_exit_qualification();
2115
2116		if (rc == VMX_GUEST_VMEXIT) {
2117			vmx_exit_handle_nmi(vmx, vcpu, vmexit);
2118			enable_intr();
2119			handled = vmx_exit_process(vmx, vcpu, vmexit);
2120		} else {
2121			enable_intr();
2122			handled = vmx_exit_inst_error(vmxctx, rc, vmexit);
2123		}
2124		launched = 1;
2125		vmx_exit_trace(vmx, vcpu, rip, exit_reason, handled);
2126	} while (handled);
2127
2128	/*
2129	 * If a VM exit has been handled then the exitcode must be BOGUS.
2130	 * If a VM exit is not handled then the exitcode must not be BOGUS.
2131	 */
2132	if ((handled && vmexit->exitcode != VM_EXITCODE_BOGUS) ||
2133	    (!handled && vmexit->exitcode == VM_EXITCODE_BOGUS)) {
2134		panic("Mismatch between handled (%d) and exitcode (%d)",
2135		      handled, vmexit->exitcode);
2136	}
2137
2138	if (!handled)
2139		vmm_stat_incr(vm, vcpu, VMEXIT_USERSPACE, 1);
2140
2141	VCPU_CTR1(vm, vcpu, "returning from vmx_run: exitcode %d",
2142	    vmexit->exitcode);
2143
2144	VMCLEAR(vmcs);
2145	return (0);
2146}
2147
2148static void
2149vmx_vmcleanup(void *arg)
2150{
2151	int i, error;
2152	struct vmx *vmx = arg;
2153
2154	if (virtual_interrupt_delivery)
2155		vm_unmap_mmio(vmx->vm, DEFAULT_APIC_BASE, PAGE_SIZE);
2156
2157	for (i = 0; i < VM_MAXCPU; i++)
2158		vpid_free(vmx->state[i].vpid);
2159
2160	/*
2161	 * XXXSMP we also need to clear the VMCS active on the other vcpus.
2162	 */
2163	error = vmclear(&vmx->vmcs[0]);
2164	if (error != 0)
2165		panic("vmx_vmcleanup: vmclear error %d on vcpu 0", error);
2166
2167	free(vmx, M_VMX);
2168
2169	return;
2170}
2171
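/*
 * Return a pointer to the software-saved copy of a guest register in
 * 'vmxctx', or NULL if the register is not saved there. Registers that
 * live in the VMCS (e.g. %rip, %rsp, %cr0) are accessed through
 * vmcs_getreg() and vmcs_setreg() instead.
 */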
2172static register_t *
2173vmxctx_regptr(struct vmxctx *vmxctx, int reg)
2174{
2175
2176	switch (reg) {
2177	case VM_REG_GUEST_RAX:
2178		return (&vmxctx->guest_rax);
2179	case VM_REG_GUEST_RBX:
2180		return (&vmxctx->guest_rbx);
2181	case VM_REG_GUEST_RCX:
2182		return (&vmxctx->guest_rcx);
2183	case VM_REG_GUEST_RDX:
2184		return (&vmxctx->guest_rdx);
2185	case VM_REG_GUEST_RSI:
2186		return (&vmxctx->guest_rsi);
2187	case VM_REG_GUEST_RDI:
2188		return (&vmxctx->guest_rdi);
2189	case VM_REG_GUEST_RBP:
2190		return (&vmxctx->guest_rbp);
2191	case VM_REG_GUEST_R8:
2192		return (&vmxctx->guest_r8);
2193	case VM_REG_GUEST_R9:
2194		return (&vmxctx->guest_r9);
2195	case VM_REG_GUEST_R10:
2196		return (&vmxctx->guest_r10);
2197	case VM_REG_GUEST_R11:
2198		return (&vmxctx->guest_r11);
2199	case VM_REG_GUEST_R12:
2200		return (&vmxctx->guest_r12);
2201	case VM_REG_GUEST_R13:
2202		return (&vmxctx->guest_r13);
2203	case VM_REG_GUEST_R14:
2204		return (&vmxctx->guest_r14);
2205	case VM_REG_GUEST_R15:
2206		return (&vmxctx->guest_r15);
2207	default:
2208		break;
2209	}
2210	return (NULL);
2211}
2212
2213static int
2214vmxctx_getreg(struct vmxctx *vmxctx, int reg, uint64_t *retval)
2215{
2216	register_t *regp;
2217
2218	if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) {
2219		*retval = *regp;
2220		return (0);
2221	} else
2222		return (EINVAL);
2223}
2224
2225static int
2226vmxctx_setreg(struct vmxctx *vmxctx, int reg, uint64_t val)
2227{
2228	register_t *regp;
2229
2230	if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) {
2231		*regp = val;
2232		return (0);
2233	} else
2234		return (EINVAL);
2235}
2236
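/*
 * %cr0 and %cr4 are virtualized using read shadows. Return the VMCS
 * shadow field corresponding to 'reg', or -1 if the register has no
 * shadow.
 */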
2237static int
2238vmx_shadow_reg(int reg)
2239{
2240	int shreg;
2241
2242	shreg = -1;
2243
2244	switch (reg) {
2245	case VM_REG_GUEST_CR0:
2246		shreg = VMCS_CR0_SHADOW;
2247		break;
2248	case VM_REG_GUEST_CR4:
2249		shreg = VMCS_CR4_SHADOW;
2250		break;
2251	default:
2252		break;
2253	}
2254
2255	return (shreg);
2256}
2257
2258static int
2259vmx_getreg(void *arg, int vcpu, int reg, uint64_t *retval)
2260{
2261	int running, hostcpu;
2262	struct vmx *vmx = arg;
2263
2264	running = vcpu_is_running(vmx->vm, vcpu, &hostcpu);
2265	if (running && hostcpu != curcpu)
2266		panic("vmx_getreg: %s%d is running", vm_name(vmx->vm), vcpu);
2267
2268	if (vmxctx_getreg(&vmx->ctx[vcpu], reg, retval) == 0)
2269		return (0);
2270
2271	return (vmcs_getreg(&vmx->vmcs[vcpu], running, reg, retval));
2272}
2273
2274static int
2275vmx_setreg(void *arg, int vcpu, int reg, uint64_t val)
2276{
2277	int error, hostcpu, running, shadow;
2278	uint64_t ctls;
2279	struct vmx *vmx = arg;
2280
2281	running = vcpu_is_running(vmx->vm, vcpu, &hostcpu);
2282	if (running && hostcpu != curcpu)
2283		panic("vmx_setreg: %s%d is running", vm_name(vmx->vm), vcpu);
2284
2285	if (vmxctx_setreg(&vmx->ctx[vcpu], reg, val) == 0)
2286		return (0);
2287
2288	error = vmcs_setreg(&vmx->vmcs[vcpu], running, reg, val);
2289
2290	if (error == 0) {
2291		/*
2292		 * If the "load EFER" VM-entry control is 1 then the
2293		 * value of EFER.LMA must be identical to the "IA-32e mode
2294		 * guest" bit in the VM-entry controls.
2295		 */
2296		if ((entry_ctls & VM_ENTRY_LOAD_EFER) != 0 &&
2297		    (reg == VM_REG_GUEST_EFER)) {
2298			vmcs_getreg(&vmx->vmcs[vcpu], running,
2299				    VMCS_IDENT(VMCS_ENTRY_CTLS), &ctls);
2300			if (val & EFER_LMA)
2301				ctls |= VM_ENTRY_GUEST_LMA;
2302			else
2303				ctls &= ~VM_ENTRY_GUEST_LMA;
2304			vmcs_setreg(&vmx->vmcs[vcpu], running,
2305				    VMCS_IDENT(VMCS_ENTRY_CTLS), ctls);
2306		}
2307
2308		shadow = vmx_shadow_reg(reg);
2309		if (shadow > 0) {
2310			/*
2311			 * Store the unmodified value in the shadow register.
2312			 */
2313			error = vmcs_setreg(&vmx->vmcs[vcpu], running,
2314				    VMCS_IDENT(shadow), val);
2315		}
2316	}
2317
2318	return (error);
2319}
2320
2321static int
2322vmx_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc)
2323{
2324	struct vmx *vmx = arg;
2325
2326	return (vmcs_getdesc(&vmx->vmcs[vcpu], reg, desc));
2327}
2328
2329static int
2330vmx_setdesc(void *arg, int vcpu, int reg, struct seg_desc *desc)
2331{
2332	struct vmx *vmx = arg;
2333
2334	return (vmcs_setdesc(&vmx->vmcs[vcpu], reg, desc));
2335}
2336
2337static int
2338vmx_inject(void *arg, int vcpu, int type, int vector, uint32_t code,
2339	   int code_valid)
2340{
2341	struct vmx *vmx = arg;
2342	struct vmxevent *user_event = &vmx->state[vcpu].user_event;
2343
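	/*
	 * Map VM_EVENT_* types to the "interruption type" encoding used
	 * in the VM-entry interruption-information field (Intel SDM).
	 */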
2344	static uint32_t type_map[VM_EVENT_MAX] = {
2345		0x1,		/* VM_EVENT_NONE */
2346		0x0,		/* VM_HW_INTR */
2347		0x2,		/* VM_NMI */
2348		0x3,		/* VM_HW_EXCEPTION */
2349		0x4,		/* VM_SW_INTR */
2350		0x5,		/* VM_PRIV_SW_EXCEPTION */
2351		0x6,		/* VM_SW_EXCEPTION */
2352	};
2353
2354	/*
2355	 * If there is already an exception pending to be delivered to the
2356	 * vcpu then just return.
2357	 */
2358	if (user_event->intr_info & VMCS_INTR_VALID)
2359		return (EAGAIN);
2360
2361	user_event->intr_info = vector | (type_map[type] << 8) | VMCS_INTR_VALID;
2362	if (code_valid) {
2363		user_event->intr_info |= VMCS_INTR_DEL_ERRCODE;
2364		user_event->error_code = code;
2365	}
2366	return (0);
2367}
2368
2369static int
2370vmx_getcap(void *arg, int vcpu, int type, int *retval)
2371{
2372	struct vmx *vmx = arg;
2373	int vcap;
2374	int ret;
2375
2376	ret = ENOENT;
2377
2378	vcap = vmx->cap[vcpu].set;
2379
2380	switch (type) {
2381	case VM_CAP_HALT_EXIT:
2382		if (cap_halt_exit)
2383			ret = 0;
2384		break;
2385	case VM_CAP_PAUSE_EXIT:
2386		if (cap_pause_exit)
2387			ret = 0;
2388		break;
2389	case VM_CAP_MTRAP_EXIT:
2390		if (cap_monitor_trap)
2391			ret = 0;
2392		break;
2393	case VM_CAP_UNRESTRICTED_GUEST:
2394		if (cap_unrestricted_guest)
2395			ret = 0;
2396		break;
2397	case VM_CAP_ENABLE_INVPCID:
2398		if (cap_invpcid)
2399			ret = 0;
2400		break;
2401	default:
2402		break;
2403	}
2404
2405	if (ret == 0)
2406		*retval = (vcap & (1 << type)) ? 1 : 0;
2407
2408	return (ret);
2409}
2410
2411static int
2412vmx_setcap(void *arg, int vcpu, int type, int val)
2413{
2414	struct vmx *vmx = arg;
2415	struct vmcs *vmcs = &vmx->vmcs[vcpu];
2416	uint32_t baseval;
2417	uint32_t *pptr;
2418	int error;
2419	int flag;
2420	int reg;
2421	int retval;
2422
2423	retval = ENOENT;
2424	pptr = NULL;
2425
2426	switch (type) {
2427	case VM_CAP_HALT_EXIT:
2428		if (cap_halt_exit) {
2429			retval = 0;
2430			pptr = &vmx->cap[vcpu].proc_ctls;
2431			baseval = *pptr;
2432			flag = PROCBASED_HLT_EXITING;
2433			reg = VMCS_PRI_PROC_BASED_CTLS;
2434		}
2435		break;
2436	case VM_CAP_MTRAP_EXIT:
2437		if (cap_monitor_trap) {
2438			retval = 0;
2439			pptr = &vmx->cap[vcpu].proc_ctls;
2440			baseval = *pptr;
2441			flag = PROCBASED_MTF;
2442			reg = VMCS_PRI_PROC_BASED_CTLS;
2443		}
2444		break;
2445	case VM_CAP_PAUSE_EXIT:
2446		if (cap_pause_exit) {
2447			retval = 0;
2448			pptr = &vmx->cap[vcpu].proc_ctls;
2449			baseval = *pptr;
2450			flag = PROCBASED_PAUSE_EXITING;
2451			reg = VMCS_PRI_PROC_BASED_CTLS;
2452		}
2453		break;
2454	case VM_CAP_UNRESTRICTED_GUEST:
2455		if (cap_unrestricted_guest) {
2456			retval = 0;
2457			pptr = &vmx->cap[vcpu].proc_ctls2;
2458			baseval = *pptr;
2459			flag = PROCBASED2_UNRESTRICTED_GUEST;
2460			reg = VMCS_SEC_PROC_BASED_CTLS;
2461		}
2462		break;
2463	case VM_CAP_ENABLE_INVPCID:
2464		if (cap_invpcid) {
2465			retval = 0;
2466			pptr = &vmx->cap[vcpu].proc_ctls2;
2467			baseval = *pptr;
2468			flag = PROCBASED2_ENABLE_INVPCID;
2469			reg = VMCS_SEC_PROC_BASED_CTLS;
2470		}
2471		break;
2472	default:
2473		break;
2474	}
2475
2476	if (retval == 0) {
2477		if (val) {
2478			baseval |= flag;
2479		} else {
2480			baseval &= ~flag;
2481		}
2482		VMPTRLD(vmcs);
2483		error = vmwrite(reg, baseval);
2484		VMCLEAR(vmcs);
2485
2486		if (error) {
2487			retval = error;
2488		} else {
2489			/*
2490			 * Update the cached control flags and record the
2491			 * new capability setting.
2492			 */
2493			if (pptr != NULL) {
2494				*pptr = baseval;
2495			}
2496
2497			if (val) {
2498				vmx->cap[vcpu].set |= (1 << type);
2499			} else {
2500				vmx->cap[vcpu].set &= ~(1 << type);
2501			}
2502		}
2503	}
2504
2505	return (retval);
2506}
2507
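/*
 * Per-vcpu vlapic state augmented with the posted interrupt descriptor
 * and a back pointer to the VMX softc.
 */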
2508struct vlapic_vtx {
2509	struct vlapic	vlapic;
2510	struct pir_desc	*pir_desc;
2511	struct vmx	*vmx;
2512};
2513
2514#define	VMX_CTR_PIR(vm, vcpuid, pir_desc, notify, vector, level, msg)	\
2515do {									\
2516	VCPU_CTR2(vm, vcpuid, msg " assert %s-triggered vector %d",	\
2517	    level ? "level" : "edge", vector);				\
2518	VCPU_CTR1(vm, vcpuid, msg " pir0 0x%016lx", pir_desc->pir[0]);	\
2519	VCPU_CTR1(vm, vcpuid, msg " pir1 0x%016lx", pir_desc->pir[1]);	\
2520	VCPU_CTR1(vm, vcpuid, msg " pir2 0x%016lx", pir_desc->pir[2]);	\
2521	VCPU_CTR1(vm, vcpuid, msg " pir3 0x%016lx", pir_desc->pir[3]);	\
2522	VCPU_CTR1(vm, vcpuid, msg " notify: %s", notify ? "yes" : "no");\
2523} while (0)
2524
2525/*
2526 * vlapic->ops handlers that utilize the APICv hardware assist described in
2527 * Chapter 29 of the Intel SDM.
2528 */
2529static int
2530vmx_set_intr_ready(struct vlapic *vlapic, int vector, bool level)
2531{
2532	struct vlapic_vtx *vlapic_vtx;
2533	struct pir_desc *pir_desc;
2534	uint64_t mask;
2535	int idx, notify;
2536
2537	vlapic_vtx = (struct vlapic_vtx *)vlapic;
2538	pir_desc = vlapic_vtx->pir_desc;
2539
2540	/*
2541	 * Keep track of interrupt requests in the PIR descriptor. This is
2542	 * because the virtual APIC page pointed to by the VMCS cannot be
2543	 * modified if the vcpu is running.
2544	 */
2545	idx = vector / 64;
2546	mask = 1UL << (vector % 64);
2547	atomic_set_long(&pir_desc->pir[idx], mask);
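	/*
	 * The caller needs to notify the vcpu only on a 0->1 transition of
	 * 'pending'; if it was already set then the bit recorded above will
	 * be picked up the next time the PIR is processed.
	 */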
2548	notify = atomic_cmpset_long(&pir_desc->pending, 0, 1);
2549
2550	VMX_CTR_PIR(vlapic->vm, vlapic->vcpuid, pir_desc, notify, vector,
2551	    level, "vmx_set_intr_ready");
2552	return (notify);
2553}
2554
2555static int
2556vmx_pending_intr(struct vlapic *vlapic, int *vecptr)
2557{
2558	struct vlapic_vtx *vlapic_vtx;
2559	struct pir_desc *pir_desc;
2560	struct LAPIC *lapic;
2561	uint64_t pending, pirval;
2562	uint32_t ppr, vpr;
2563	int i;
2564
2565	/*
2566	 * This function is only expected to be called from the 'HLT' exit
2567	 * handler which does not care about the vector that is pending.
2568	 */
2569	KASSERT(vecptr == NULL, ("vmx_pending_intr: vecptr must be NULL"));
2570
2571	vlapic_vtx = (struct vlapic_vtx *)vlapic;
2572	pir_desc = vlapic_vtx->pir_desc;
2573
2574	pending = atomic_load_acq_long(&pir_desc->pending);
2575	if (!pending)
2576		return (0);	/* common case */
2577
2578	/*
2579	 * If there is an interrupt pending then it will be recognized only
2580	 * if its priority is greater than the processor priority.
2581	 *
2582	 * Special case: if the processor priority is zero then any pending
2583	 * interrupt will be recognized.
2584	 */
2585	lapic = vlapic->apic_page;
2586	ppr = lapic->ppr & 0xf0;
2587	if (ppr == 0)
2588		return (1);
2589
2590	VCPU_CTR1(vlapic->vm, vlapic->vcpuid, "HLT with non-zero PPR %d",
2591	    lapic->ppr);
2592
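	/*
	 * Find the highest vector pending in the PIR and compare its
	 * priority class (bits 7:4 of the vector) against the PPR.
	 */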
2593	for (i = 3; i >= 0; i--) {
2594		pirval = pir_desc->pir[i];
2595		if (pirval != 0) {
2596			vpr = (i * 64 + flsl(pirval) - 1) & 0xf0;
2597			return (vpr > ppr);
2598		}
2599	}
2600	return (0);
2601}
2602
2603static void
2604vmx_intr_accepted(struct vlapic *vlapic, int vector)
2605{
2606
2607	panic("vmx_intr_accepted: not expected to be called");
2608}
2609
2610static void
2611vmx_set_tmr(struct vlapic *vlapic, int vector, bool level)
2612{
2613	struct vlapic_vtx *vlapic_vtx;
2614	struct vmx *vmx;
2615	struct vmcs *vmcs;
2616	uint64_t mask, val;
2617
2618	KASSERT(vector >= 0 && vector <= 255, ("invalid vector %d", vector));
2619	KASSERT(!vcpu_is_running(vlapic->vm, vlapic->vcpuid, NULL),
2620	    ("vmx_set_tmr: vcpu cannot be running"));
2621
2622	vlapic_vtx = (struct vlapic_vtx *)vlapic;
2623	vmx = vlapic_vtx->vmx;
2624	vmcs = &vmx->vmcs[vlapic->vcpuid];
2625	mask = 1UL << (vector % 64);
2626
2627	VMPTRLD(vmcs);
2628	val = vmcs_read(VMCS_EOI_EXIT(vector));
2629	if (level)
2630		val |= mask;
2631	else
2632		val &= ~mask;
2633	vmcs_write(VMCS_EOI_EXIT(vector), val);
2634	VMCLEAR(vmcs);
2635}
2636
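/*
 * Send the posted interrupt notification vector to the host cpu that the
 * vcpu is running on. If the vcpu is in guest mode at that point the
 * interrupt is delivered via the posted interrupt descriptor without
 * causing a VM exit.
 */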
2637static void
2638vmx_post_intr(struct vlapic *vlapic, int hostcpu)
2639{
2640
2641	ipi_cpu(hostcpu, pirvec);
2642}
2643
2644/*
2645 * Transfer the pending interrupts in the PIR descriptor to the IRR
2646 * in the virtual APIC page.
2647 */
2648static void
2649vmx_inject_pir(struct vlapic *vlapic)
2650{
2651	struct vlapic_vtx *vlapic_vtx;
2652	struct pir_desc *pir_desc;
2653	struct LAPIC *lapic;
2654	uint64_t val, pirval;
2655	int rvi, pirbase;
2656	uint16_t intr_status_old, intr_status_new;
2657
2658	vlapic_vtx = (struct vlapic_vtx *)vlapic;
2659	pir_desc = vlapic_vtx->pir_desc;
2660	if (atomic_cmpset_long(&pir_desc->pending, 1, 0) == 0) {
2661		VCPU_CTR0(vlapic->vm, vlapic->vcpuid, "vmx_inject_pir: "
2662		    "no posted interrupt pending");
2663		return;
2664	}
2665
2666	pirval = 0;
2667	lapic = vlapic->apic_page;
2668
2669	val = atomic_readandclear_long(&pir_desc->pir[0]);
2670	if (val != 0) {
2671		lapic->irr0 |= val;
2672		lapic->irr1 |= val >> 32;
2673		pirbase = 0;
2674		pirval = val;
2675	}
2676
2677	val = atomic_readandclear_long(&pir_desc->pir[1]);
2678	if (val != 0) {
2679		lapic->irr2 |= val;
2680		lapic->irr3 |= val >> 32;
2681		pirbase = 64;
2682		pirval = val;
2683	}
2684
2685	val = atomic_readandclear_long(&pir_desc->pir[2]);
2686	if (val != 0) {
2687		lapic->irr4 |= val;
2688		lapic->irr5 |= val >> 32;
2689		pirbase = 128;
2690		pirval = val;
2691	}
2692
2693	val = atomic_readandclear_long(&pir_desc->pir[3]);
2694	if (val != 0) {
2695		lapic->irr6 |= val;
2696		lapic->irr7 |= val >> 32;
2697		pirbase = 192;
2698		pirval = val;
2699	}
2700	VLAPIC_CTR_IRR(vlapic, "vmx_inject_pir");
2701
2702	/*
2703	 * Update RVI so the processor can evaluate pending virtual
2704	 * interrupts on VM-entry.
2705	 */
2706	if (pirval != 0) {
2707		rvi = pirbase + flsl(pirval) - 1;
2708		intr_status_old = vmcs_read(VMCS_GUEST_INTR_STATUS);
2709		intr_status_new = (intr_status_old & 0xFF00) | rvi;
2710		if (intr_status_new > intr_status_old) {
2711			vmcs_write(VMCS_GUEST_INTR_STATUS, intr_status_new);
2712			VCPU_CTR2(vlapic->vm, vlapic->vcpuid, "vmx_inject_pir: "
2713			    "guest_intr_status changed from 0x%04x to 0x%04x",
2714			    intr_status_old, intr_status_new);
2715		}
2716	}
2717}
2718
2719static struct vlapic *
2720vmx_vlapic_init(void *arg, int vcpuid)
2721{
2722	struct vmx *vmx;
2723	struct vlapic *vlapic;
2724	struct vlapic_vtx *vlapic_vtx;
2725
2726	vmx = arg;
2727
2728	vlapic = malloc(sizeof(struct vlapic_vtx), M_VLAPIC, M_WAITOK | M_ZERO);
2729	vlapic->vm = vmx->vm;
2730	vlapic->vcpuid = vcpuid;
2731	vlapic->apic_page = (struct LAPIC *)&vmx->apic_page[vcpuid];
2732
2733	vlapic_vtx = (struct vlapic_vtx *)vlapic;
2734	vlapic_vtx->pir_desc = &vmx->pir_desc[vcpuid];
2735	vlapic_vtx->vmx = vmx;
2736
2737	if (virtual_interrupt_delivery) {
2738		vlapic->ops.set_intr_ready = vmx_set_intr_ready;
2739		vlapic->ops.pending_intr = vmx_pending_intr;
2740		vlapic->ops.intr_accepted = vmx_intr_accepted;
2741		vlapic->ops.set_tmr = vmx_set_tmr;
2742	}
2743
2744	if (posted_interrupts)
2745		vlapic->ops.post_intr = vmx_post_intr;
2746
2747	vlapic_init(vlapic);
2748
2749	return (vlapic);
2750}
2751
2752static void
2753vmx_vlapic_cleanup(void *arg, struct vlapic *vlapic)
2754{
2755
2756	vlapic_cleanup(vlapic);
2757	free(vlapic, M_VLAPIC);
2758}
2759
2760struct vmm_ops vmm_ops_intel = {
2761	vmx_init,
2762	vmx_cleanup,
2763	vmx_restore,
2764	vmx_vminit,
2765	vmx_run,
2766	vmx_vmcleanup,
2767	vmx_getreg,
2768	vmx_setreg,
2769	vmx_getdesc,
2770	vmx_setdesc,
2771	vmx_inject,
2772	vmx_getcap,
2773	vmx_setcap,
2774	ept_vmspace_alloc,
2775	ept_vmspace_free,
2776	vmx_vlapic_init,
2777	vmx_vlapic_cleanup,
2778};
2779