/*-
 * Copyright (c) 2013, Anish Gupta (akgupt3@gmail.com)
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: stable/10/sys/amd64/vmm/amd/svm.c 330069 2018-02-27 14:47:56Z avg $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/smp.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/pcpu.h>
#include <sys/proc.h>
#include <sys/sysctl.h>

#include <vm/vm.h>
#include <vm/pmap.h>

#include <machine/cpufunc.h>
#include <machine/psl.h>
#include <machine/pmap.h>
#include <machine/md_var.h>
#include <machine/specialreg.h>
#include <machine/smp.h>
#include <machine/vmm.h>
#include <machine/vmm_dev.h>
#include <machine/vmm_instruction_emul.h>

#include "vmm_lapic.h"
#include "vmm_stat.h"
#include "vmm_ktr.h"
#include "vmm_ioport.h"
#include "vatpic.h"
#include "vlapic.h"
#include "vlapic_priv.h"

#include "x86.h"
#include "vmcb.h"
#include "svm.h"
#include "svm_softc.h"
#include "svm_msr.h"
#include "npt.h"

SYSCTL_DECL(_hw_vmm);
SYSCTL_NODE(_hw_vmm, OID_AUTO, svm, CTLFLAG_RW, NULL, NULL);

/*
 * SVM CPUID function 0x8000_000A, edx bit decoding.
 */
#define AMD_CPUID_SVM_NP		BIT(0)  /* Nested paging or RVI */
#define AMD_CPUID_SVM_LBR		BIT(1)  /* Last branch virtualization */
#define AMD_CPUID_SVM_SVML		BIT(2)  /* SVM lock */
#define AMD_CPUID_SVM_NRIP_SAVE		BIT(3)  /* Next RIP is saved */
#define AMD_CPUID_SVM_TSC_RATE		BIT(4)  /* TSC rate control. */
#define AMD_CPUID_SVM_VMCB_CLEAN	BIT(5)  /* VMCB state caching */
#define AMD_CPUID_SVM_FLUSH_BY_ASID	BIT(6)  /* Flush by ASID */
#define AMD_CPUID_SVM_DECODE_ASSIST	BIT(7)  /* Decode assist */
#define AMD_CPUID_SVM_PAUSE_INC		BIT(10) /* Pause intercept filter. */
#define AMD_CPUID_SVM_PAUSE_FTH		BIT(12) /* Pause filter threshold */
#define	AMD_CPUID_SVM_AVIC		BIT(13)	/* AVIC present */

#define	VMCB_CACHE_DEFAULT	(VMCB_CACHE_ASID	|	\
				VMCB_CACHE_IOPM		|	\
				VMCB_CACHE_I		|	\
				VMCB_CACHE_TPR		|	\
				VMCB_CACHE_CR2		|	\
				VMCB_CACHE_CR		|	\
				VMCB_CACHE_DT		|	\
				VMCB_CACHE_SEG		|	\
				VMCB_CACHE_NP)

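/*
 * Mask of VMCB state-caching ("clean") bits that the driver is willing to
 * rely on; it can be restricted via the hw.vmm.svm.vmcb_clean tunable.
 */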
static uint32_t vmcb_clean = VMCB_CACHE_DEFAULT;
SYSCTL_INT(_hw_vmm_svm, OID_AUTO, vmcb_clean, CTLFLAG_RDTUN, &vmcb_clean,
    0, NULL);

static MALLOC_DEFINE(M_SVM, "svm", "svm");
static MALLOC_DEFINE(M_SVM_VLAPIC, "svm-vlapic", "svm-vlapic");

/* Per-CPU context area. */
extern struct pcpu __pcpu[];

static uint32_t svm_feature = ~0U;	/* AMD SVM features. */
SYSCTL_UINT(_hw_vmm_svm, OID_AUTO, features, CTLFLAG_RDTUN, &svm_feature, 0,
    "SVM features advertised by CPUID.8000000AH:EDX");

static int disable_npf_assist;
SYSCTL_INT(_hw_vmm_svm, OID_AUTO, disable_npf_assist, CTLFLAG_RWTUN,
    &disable_npf_assist, 0, NULL);

/* Maximum ASIDs supported by the processor */
static uint32_t nasid;
SYSCTL_UINT(_hw_vmm_svm, OID_AUTO, num_asids, CTLFLAG_RDTUN, &nasid, 0,
    "Number of ASIDs supported by this processor");

/* Current ASID generation for each host cpu */
static struct asid asid[MAXCPU];

/*
 * SVM host state save area, 4KB for each CPU.
 */
static uint8_t hsave[MAXCPU][PAGE_SIZE] __aligned(PAGE_SIZE);

static VMM_STAT_AMD(VCPU_EXITINTINFO, "VM exits during event delivery");
static VMM_STAT_AMD(VCPU_INTINFO_INJECTED, "Events pending at VM entry");
static VMM_STAT_AMD(VMEXIT_VINTR, "VM exits due to interrupt window");

static int svm_setreg(void *arg, int vcpu, int ident, uint64_t val);

static __inline int
flush_by_asid(void)
{

	return (svm_feature & AMD_CPUID_SVM_FLUSH_BY_ASID);
}

static __inline int
decode_assist(void)
{

	return (svm_feature & AMD_CPUID_SVM_DECODE_ASSIST);
}

static void
svm_disable(void *arg __unused)
{
	uint64_t efer;

	efer = rdmsr(MSR_EFER);
	efer &= ~EFER_SVM;
	wrmsr(MSR_EFER, efer);
}

/*
 * Disable SVM on all CPUs.
 */
static int
svm_cleanup(void)
{

	smp_rendezvous(NULL, svm_disable, NULL, NULL);
	return (0);
}

/*
 * Verify that all the features required by bhyve are available.
 */
static int
check_svm_features(void)
{
	u_int regs[4];

	/* CPUID Fn8000_000A is for SVM */
	do_cpuid(0x8000000A, regs);
	svm_feature &= regs[3];

	/*
	 * The number of ASIDs can be configured to be less than what is
	 * supported by the hardware but not more.
	 */
	if (nasid == 0 || nasid > regs[1])
		nasid = regs[1];
	KASSERT(nasid > 1, ("Insufficient ASIDs for guests: %#x", nasid));

	/* bhyve requires the Nested Paging feature */
	if (!(svm_feature & AMD_CPUID_SVM_NP)) {
		printf("SVM: Nested Paging feature not available.\n");
		return (ENXIO);
	}

	/* bhyve requires the NRIP Save feature */
	if (!(svm_feature & AMD_CPUID_SVM_NRIP_SAVE)) {
		printf("SVM: NRIP Save feature not available.\n");
		return (ENXIO);
	}

	return (0);
}

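/*
 * Enable SVM in EFER on the current CPU and point MSR_VM_HSAVE_PA at the
 * per-CPU host state save area used by VMRUN.
 */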
static void
svm_enable(void *arg __unused)
{
	uint64_t efer;

	efer = rdmsr(MSR_EFER);
	efer |= EFER_SVM;
	wrmsr(MSR_EFER, efer);

	wrmsr(MSR_VM_HSAVE_PA, vtophys(hsave[curcpu]));
}

/*
 * Return 1 if SVM is available on this processor and 0 otherwise.
 */
static int
svm_available(void)
{
	uint64_t msr;

	/* Section 15.4 Enabling SVM from APM2. */
	if ((amd_feature2 & AMDID2_SVM) == 0) {
		printf("SVM: not available.\n");
		return (0);
	}

	msr = rdmsr(MSR_VM_CR);
	if ((msr & VM_CR_SVMDIS) != 0) {
		printf("SVM: disabled by BIOS.\n");
		return (0);
	}

	return (1);
}

static int
svm_init(int ipinum)
{
	int error, cpu;

	if (!svm_available())
		return (ENXIO);

	error = check_svm_features();
	if (error)
		return (error);

	vmcb_clean &= VMCB_CACHE_DEFAULT;

	for (cpu = 0; cpu < MAXCPU; cpu++) {
		/*
		 * Initialize the host ASIDs to their "highest" valid values.
		 *
		 * The next ASID allocation will rollover both 'gen' and 'num'
		 * and start off the sequence at {1,1}.
		 */
		asid[cpu].gen = ~0UL;
		asid[cpu].num = nasid - 1;
	}

	svm_msr_init();
	svm_npt_init(ipinum);

	/* Enable SVM on all CPUs */
	smp_rendezvous(NULL, svm_enable, NULL, NULL);

	return (0);
}

static void
svm_restore(void)
{

	svm_enable(NULL);
}

/* Pentium compatible MSRs */
#define MSR_PENTIUM_START 	0
#define MSR_PENTIUM_END 	0x1FFF
/* AMD 6th generation and Intel compatible MSRs */
#define MSR_AMD6TH_START 	0xC0000000UL
#define MSR_AMD6TH_END 		0xC0001FFFUL
/* AMD 7th and 8th generation compatible MSRs */
#define MSR_AMD7TH_START 	0xC0010000UL
#define MSR_AMD7TH_END 		0xC0011FFFUL

/*
 * Get the index and bit position for an MSR in the permission bitmap.
 * Two bits are used for each MSR: the lower bit for read and the higher bit
 * for write.
 */
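/*
 * For example, MSR_EFER (0xC0000080) falls in the AMD 6th-generation range:
 * off = 0x80 and base = 0x2000, so *index = (0x80 + 0x2000) / 4 = 0x820 and
 * *bit = 0, i.e. bit 0 of byte 0x820 controls reads and bit 1 controls writes.
 */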
static int
svm_msr_index(uint64_t msr, int *index, int *bit)
{
	uint32_t base, off;

	*index = -1;
	*bit = (msr % 4) * 2;
	base = 0;

	if (msr >= MSR_PENTIUM_START && msr <= MSR_PENTIUM_END) {
		*index = msr / 4;
		return (0);
	}

	base += (MSR_PENTIUM_END - MSR_PENTIUM_START + 1);
	if (msr >= MSR_AMD6TH_START && msr <= MSR_AMD6TH_END) {
		off = (msr - MSR_AMD6TH_START);
		*index = (off + base) / 4;
		return (0);
	}

	base += (MSR_AMD6TH_END - MSR_AMD6TH_START + 1);
	if (msr >= MSR_AMD7TH_START && msr <= MSR_AMD7TH_END) {
		off = (msr - MSR_AMD7TH_START);
		*index = (off + base) / 4;
		return (0);
	}

	return (EINVAL);
}

/*
 * Allow vcpu to read or write the 'msr' without trapping into the hypervisor.
 */
static void
svm_msr_perm(uint8_t *perm_bitmap, uint64_t msr, bool read, bool write)
{
	int index, bit, error;

	error = svm_msr_index(msr, &index, &bit);
	KASSERT(error == 0, ("%s: invalid msr %#lx", __func__, msr));
	KASSERT(index >= 0 && index < SVM_MSR_BITMAP_SIZE,
	    ("%s: invalid index %d for msr %#lx", __func__, index, msr));
	KASSERT(bit >= 0 && bit <= 6, ("%s: invalid bit position %d "
	    "msr %#lx", __func__, bit, msr));

	if (read)
		perm_bitmap[index] &= ~(1UL << bit);

	if (write)
		perm_bitmap[index] &= ~(2UL << bit);
}

static void
svm_msr_rw_ok(uint8_t *perm_bitmap, uint64_t msr)
{

	svm_msr_perm(perm_bitmap, msr, true, true);
}

static void
svm_msr_rd_ok(uint8_t *perm_bitmap, uint64_t msr)
{

	svm_msr_perm(perm_bitmap, msr, true, false);
}

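/*
 * Helpers to query and toggle individual bits in the five 32-bit intercept
 * vectors of the VMCB control area. Any change marks the intercept state
 * dirty so that the hardware re-reads it on the next VMRUN.
 */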
static __inline int
svm_get_intercept(struct svm_softc *sc, int vcpu, int idx, uint32_t bitmask)
{
	struct vmcb_ctrl *ctrl;

	KASSERT(idx >= 0 && idx < 5, ("invalid intercept index %d", idx));

	ctrl = svm_get_vmcb_ctrl(sc, vcpu);
	return (ctrl->intercept[idx] & bitmask ? 1 : 0);
}

static __inline void
svm_set_intercept(struct svm_softc *sc, int vcpu, int idx, uint32_t bitmask,
    int enabled)
{
	struct vmcb_ctrl *ctrl;
	uint32_t oldval;

	KASSERT(idx >= 0 && idx < 5, ("invalid intercept index %d", idx));

	ctrl = svm_get_vmcb_ctrl(sc, vcpu);
	oldval = ctrl->intercept[idx];

	if (enabled)
		ctrl->intercept[idx] |= bitmask;
	else
		ctrl->intercept[idx] &= ~bitmask;

	if (ctrl->intercept[idx] != oldval) {
		svm_set_dirty(sc, vcpu, VMCB_CACHE_I);
		VCPU_CTR3(sc->vm, vcpu, "intercept[%d] modified "
		    "from %#x to %#x", idx, oldval, ctrl->intercept[idx]);
	}
}

static __inline void
svm_disable_intercept(struct svm_softc *sc, int vcpu, int off, uint32_t bitmask)
{

	svm_set_intercept(sc, vcpu, off, bitmask, 0);
}

static __inline void
svm_enable_intercept(struct svm_softc *sc, int vcpu, int off, uint32_t bitmask)
{

	svm_set_intercept(sc, vcpu, off, bitmask, 1);
}

static void
vmcb_init(struct svm_softc *sc, int vcpu, uint64_t iopm_base_pa,
    uint64_t msrpm_base_pa, uint64_t np_pml4)
{
	struct vmcb_ctrl *ctrl;
	struct vmcb_state *state;
	uint32_t mask;
	int n;

	ctrl = svm_get_vmcb_ctrl(sc, vcpu);
	state = svm_get_vmcb_state(sc, vcpu);

	ctrl->iopm_base_pa = iopm_base_pa;
	ctrl->msrpm_base_pa = msrpm_base_pa;

	/* Enable nested paging */
	ctrl->np_enable = 1;
	ctrl->n_cr3 = np_pml4;

	/*
	 * Intercept accesses to the control registers that are not shadowed
	 * in the VMCB - i.e. all except cr0, cr2, cr3, cr4 and cr8.
	 */
	for (n = 0; n < 16; n++) {
		mask = (BIT(n) << 16) | BIT(n);
		if (n == 0 || n == 2 || n == 3 || n == 4 || n == 8)
			svm_disable_intercept(sc, vcpu, VMCB_CR_INTCPT, mask);
		else
			svm_enable_intercept(sc, vcpu, VMCB_CR_INTCPT, mask);
	}

	/*
	 * Intercept everything when tracing guest exceptions; otherwise,
	 * intercept only the machine check exception.
	 */
	if (vcpu_trace_exceptions(sc->vm, vcpu)) {
		for (n = 0; n < 32; n++) {
			/*
			 * Skip unimplemented vectors in the exception bitmap.
			 */
			if (n == 2 || n == 9) {
				continue;
			}
			svm_enable_intercept(sc, vcpu, VMCB_EXC_INTCPT, BIT(n));
		}
	} else {
		svm_enable_intercept(sc, vcpu, VMCB_EXC_INTCPT, BIT(IDT_MC));
	}

	/* Intercept various events (e.g. I/O, MSR and CPUID accesses) */
	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IO);
	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_MSR);
	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_CPUID);
	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_INTR);
	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_INIT);
	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_NMI);
	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_SMI);
	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_SHUTDOWN);
	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
	    VMCB_INTCPT_FERR_FREEZE);

	svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_MONITOR);
	svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_MWAIT);

	/*
	 * From the section "Canonicalization and Consistency Checks" in APMv2,
	 * the VMRUN intercept bit must be set to pass the consistency check.
	 */
	svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_VMRUN);

	/*
	 * The ASID will be set to a non-zero value just before VMRUN.
	 */
	ctrl->asid = 0;

	/*
	 * Section 15.21.1, Interrupt Masking in EFLAGS
	 * Section 15.21.2, Virtualizing APIC.TPR
	 *
	 * This must be set for %rflags and %cr8 isolation of guest and host.
	 */
	ctrl->v_intr_masking = 1;

	/* Enable Last Branch Record aka LBR for debugging */
	ctrl->lbr_virt_en = 1;
	state->dbgctl = BIT(0);

	/* EFER_SVM must always be set when the guest is executing */
	state->efer = EFER_SVM;

	/* Set up the PAT to power-on state */
	state->g_pat = PAT_VALUE(0, PAT_WRITE_BACK)	|
	    PAT_VALUE(1, PAT_WRITE_THROUGH)	|
	    PAT_VALUE(2, PAT_UNCACHED)		|
	    PAT_VALUE(3, PAT_UNCACHEABLE)	|
	    PAT_VALUE(4, PAT_WRITE_BACK)	|
	    PAT_VALUE(5, PAT_WRITE_THROUGH)	|
	    PAT_VALUE(6, PAT_UNCACHED)		|
	    PAT_VALUE(7, PAT_UNCACHEABLE);
}

/*
 * Initialize a virtual machine.
 */
static void *
svm_vminit(struct vm *vm, pmap_t pmap)
{
	struct svm_softc *svm_sc;
	struct svm_vcpu *vcpu;
	vm_paddr_t msrpm_pa, iopm_pa, pml4_pa;
	int i;

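	/*
	 * The MSR and I/O permission bitmaps are referenced from the VMCB by
	 * physical address, so they are allocated physically contiguous and
	 * page aligned. The softc must be page aligned as well because it
	 * embeds the per-vcpu VMCBs.
	 */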
	svm_sc = malloc(sizeof (*svm_sc), M_SVM, M_WAITOK | M_ZERO);
	if (((uintptr_t)svm_sc & PAGE_MASK) != 0)
		panic("malloc of svm_softc not aligned on page boundary");

	svm_sc->msr_bitmap = contigmalloc(SVM_MSR_BITMAP_SIZE, M_SVM,
	    M_WAITOK, 0, ~(vm_paddr_t)0, PAGE_SIZE, 0);
	if (svm_sc->msr_bitmap == NULL)
		panic("contigmalloc of SVM MSR bitmap failed");
	svm_sc->iopm_bitmap = contigmalloc(SVM_IO_BITMAP_SIZE, M_SVM,
	    M_WAITOK, 0, ~(vm_paddr_t)0, PAGE_SIZE, 0);
	if (svm_sc->iopm_bitmap == NULL)
		panic("contigmalloc of SVM IO bitmap failed");

	svm_sc->vm = vm;
	svm_sc->nptp = (vm_offset_t)vtophys(pmap->pm_pml4);

	/*
	 * Intercept read and write accesses to all MSRs.
	 */
	memset(svm_sc->msr_bitmap, 0xFF, SVM_MSR_BITMAP_SIZE);

	/*
	 * Access to the following MSRs is redirected to the VMCB when the
	 * guest is executing. Therefore it is safe to allow the guest to
	 * read/write these MSRs directly without hypervisor involvement.
	 */
	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_GSBASE);
	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_FSBASE);
	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_KGSBASE);

	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_STAR);
	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_LSTAR);
	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_CSTAR);
	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SF_MASK);
	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SYSENTER_CS_MSR);
	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SYSENTER_ESP_MSR);
	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SYSENTER_EIP_MSR);
	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_PAT);

	svm_msr_rd_ok(svm_sc->msr_bitmap, MSR_TSC);

	/*
	 * Intercept writes to make sure that the EFER_SVM bit is not cleared.
	 */
	svm_msr_rd_ok(svm_sc->msr_bitmap, MSR_EFER);

	/* Intercept access to all I/O ports. */
	memset(svm_sc->iopm_bitmap, 0xFF, SVM_IO_BITMAP_SIZE);

	iopm_pa = vtophys(svm_sc->iopm_bitmap);
	msrpm_pa = vtophys(svm_sc->msr_bitmap);
	pml4_pa = svm_sc->nptp;
	for (i = 0; i < VM_MAXCPU; i++) {
		vcpu = svm_get_vcpu(svm_sc, i);
		vcpu->nextrip = ~0;
		vcpu->lastcpu = NOCPU;
		vcpu->vmcb_pa = vtophys(&vcpu->vmcb);
		vmcb_init(svm_sc, i, iopm_pa, msrpm_pa, pml4_pa);
		svm_msr_guest_init(svm_sc, i);
	}
	return (svm_sc);
}

/*
 * Collateral for a generic SVM VM-exit.
 */
static void
vm_exit_svm(struct vm_exit *vme, uint64_t code, uint64_t info1, uint64_t info2)
{

	vme->exitcode = VM_EXITCODE_SVM;
	vme->u.svm.exitcode = code;
	vme->u.svm.exitinfo1 = info1;
	vme->u.svm.exitinfo2 = info2;
}

static int
svm_cpl(struct vmcb_state *state)
{

	/*
	 * From APMv2:
	 *   "Retrieve the CPL from the CPL field in the VMCB, not
	 *    from any segment DPL"
	 */
	return (state->cpl);
}

static enum vm_cpu_mode
svm_vcpu_mode(struct vmcb *vmcb)
{
	struct vmcb_segment seg;
	struct vmcb_state *state;
	int error;

	state = &vmcb->state;

	if (state->efer & EFER_LMA) {
		error = vmcb_seg(vmcb, VM_REG_GUEST_CS, &seg);
		KASSERT(error == 0, ("%s: vmcb_seg(cs) error %d", __func__,
		    error));

		/*
		 * Section 4.8.1 of APM2: check if the code segment has the
		 * long (L) attribute set in its descriptor.
		 */
		if (seg.attrib & VMCB_CS_ATTRIB_L)
			return (CPU_MODE_64BIT);
		else
			return (CPU_MODE_COMPATIBILITY);
	} else if (state->cr0 & CR0_PE) {
		return (CPU_MODE_PROTECTED);
	} else {
		return (CPU_MODE_REAL);
	}
}

static enum vm_paging_mode
svm_paging_mode(uint64_t cr0, uint64_t cr4, uint64_t efer)
{

	if ((cr0 & CR0_PG) == 0)
		return (PAGING_MODE_FLAT);
	if ((cr4 & CR4_PAE) == 0)
		return (PAGING_MODE_32);
	if (efer & EFER_LME)
		return (PAGING_MODE_64);
	else
		return (PAGING_MODE_PAE);
}

/*
 * ins/outs utility routines
 */
static uint64_t
svm_inout_str_index(struct svm_regctx *regs, int in)
{
	uint64_t val;

	val = in ? regs->sctx_rdi : regs->sctx_rsi;

	return (val);
}

static uint64_t
svm_inout_str_count(struct svm_regctx *regs, int rep)
{
	uint64_t val;

	val = rep ? regs->sctx_rcx : 1;

	return (val);
}

static void
svm_inout_str_seginfo(struct svm_softc *svm_sc, int vcpu, int64_t info1,
    int in, struct vm_inout_str *vis)
{
	int error, s;

	if (in) {
		vis->seg_name = VM_REG_GUEST_ES;
	} else {
		/* The segment field has standard encoding */
		s = (info1 >> 10) & 0x7;
		vis->seg_name = vm_segment_name(s);
	}

	error = vmcb_getdesc(svm_sc, vcpu, vis->seg_name, &vis->seg_desc);
	KASSERT(error == 0, ("%s: vmcb_getdesc error %d", __func__, error));
}

static int
svm_inout_str_addrsize(uint64_t info1)
{
	uint32_t size;

	size = (info1 >> 7) & 0x7;
	switch (size) {
	case 1:
		return (2);	/* 16 bit */
	case 2:
		return (4);	/* 32 bit */
	case 4:
		return (8);	/* 64 bit */
	default:
		panic("%s: invalid size encoding %d", __func__, size);
	}
}

static void
svm_paging_info(struct vmcb *vmcb, struct vm_guest_paging *paging)
{
	struct vmcb_state *state;

	state = &vmcb->state;
	paging->cr3 = state->cr3;
	paging->cpl = svm_cpl(state);
	paging->cpu_mode = svm_vcpu_mode(vmcb);
	paging->paging_mode = svm_paging_mode(state->cr0, state->cr4,
	    state->efer);
}

#define	UNHANDLED 0

/*
 * Handle guest I/O intercept.
 */
static int
svm_handle_io(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit)
{
	struct vmcb_ctrl *ctrl;
	struct vmcb_state *state;
	struct svm_regctx *regs;
	struct vm_inout_str *vis;
	uint64_t info1;
	int inout_string;

	state = svm_get_vmcb_state(svm_sc, vcpu);
	ctrl  = svm_get_vmcb_ctrl(svm_sc, vcpu);
	regs  = svm_get_guest_regctx(svm_sc, vcpu);

	info1 = ctrl->exitinfo1;
	inout_string = info1 & BIT(2) ? 1 : 0;

	/*
	 * The effective segment number in EXITINFO1[12:10] is populated
	 * only if the processor has the DecodeAssist capability.
	 *
	 * XXX this is not specified explicitly in APMv2 but can be verified
	 * empirically.
	 */
	if (inout_string && !decode_assist())
		return (UNHANDLED);

	vmexit->exitcode 	= VM_EXITCODE_INOUT;
	vmexit->u.inout.in 	= (info1 & BIT(0)) ? 1 : 0;
	vmexit->u.inout.string 	= inout_string;
	vmexit->u.inout.rep 	= (info1 & BIT(3)) ? 1 : 0;
	vmexit->u.inout.bytes 	= (info1 >> 4) & 0x7;
	vmexit->u.inout.port 	= (uint16_t)(info1 >> 16);
	vmexit->u.inout.eax 	= (uint32_t)(state->rax);

	if (inout_string) {
		vmexit->exitcode = VM_EXITCODE_INOUT_STR;
		vis = &vmexit->u.inout_str;
		svm_paging_info(svm_get_vmcb(svm_sc, vcpu), &vis->paging);
		vis->rflags = state->rflags;
		vis->cr0 = state->cr0;
		vis->index = svm_inout_str_index(regs, vmexit->u.inout.in);
		vis->count = svm_inout_str_count(regs, vmexit->u.inout.rep);
		vis->addrsize = svm_inout_str_addrsize(info1);
		svm_inout_str_seginfo(svm_sc, vcpu, info1,
		    vmexit->u.inout.in, vis);
	}

	return (UNHANDLED);
}

static int
npf_fault_type(uint64_t exitinfo1)
{

	if (exitinfo1 & VMCB_NPF_INFO1_W)
		return (VM_PROT_WRITE);
	else if (exitinfo1 & VMCB_NPF_INFO1_ID)
		return (VM_PROT_EXECUTE);
	else
		return (VM_PROT_READ);
}

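/*
 * A nested page fault is forwarded to the instruction emulation path only
 * when it is a data access (not an instruction fetch), did not occur during
 * a guest page table walk, and the VMCB reports a valid guest physical
 * address (VMCB_NPF_INFO1_GPA).
 */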
static bool
svm_npf_emul_fault(uint64_t exitinfo1)
{

	if (exitinfo1 & VMCB_NPF_INFO1_ID) {
		return (false);
	}

	if (exitinfo1 & VMCB_NPF_INFO1_GPT) {
		return (false);
	}

	if ((exitinfo1 & VMCB_NPF_INFO1_GPA) == 0) {
		return (false);
	}

	return (true);
}

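/*
 * Prepare an instruction emulation exit: record the faulting GPA, capture
 * the guest paging state and %cs attributes, and hand any instruction bytes
 * provided by the decode assist capability to the instruction decoder.
 */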
static void
svm_handle_inst_emul(struct vmcb *vmcb, uint64_t gpa, struct vm_exit *vmexit)
{
	struct vm_guest_paging *paging;
	struct vmcb_segment seg;
	struct vmcb_ctrl *ctrl;
	char *inst_bytes;
	int error, inst_len;

	ctrl = &vmcb->ctrl;
	paging = &vmexit->u.inst_emul.paging;

	vmexit->exitcode = VM_EXITCODE_INST_EMUL;
	vmexit->u.inst_emul.gpa = gpa;
	vmexit->u.inst_emul.gla = VIE_INVALID_GLA;
	svm_paging_info(vmcb, paging);

	error = vmcb_seg(vmcb, VM_REG_GUEST_CS, &seg);
	KASSERT(error == 0, ("%s: vmcb_seg(CS) error %d", __func__, error));

	switch(paging->cpu_mode) {
	case CPU_MODE_REAL:
		vmexit->u.inst_emul.cs_base = seg.base;
		vmexit->u.inst_emul.cs_d = 0;
		break;
	case CPU_MODE_PROTECTED:
	case CPU_MODE_COMPATIBILITY:
		vmexit->u.inst_emul.cs_base = seg.base;

		/*
		 * Section 4.8.1 of APM2, Default Operand Size or D bit.
		 */
		vmexit->u.inst_emul.cs_d = (seg.attrib & VMCB_CS_ATTRIB_D) ?
		    1 : 0;
		break;
	default:
		vmexit->u.inst_emul.cs_base = 0;
		vmexit->u.inst_emul.cs_d = 0;
		break;
	}

	/*
	 * Copy the instruction bytes into 'vie' if available.
	 */
	if (decode_assist() && !disable_npf_assist) {
		inst_len = ctrl->inst_len;
		inst_bytes = ctrl->inst_bytes;
	} else {
		inst_len = 0;
		inst_bytes = NULL;
	}
	vie_init(&vmexit->u.inst_emul.vie, inst_bytes, inst_len);
}

#ifdef KTR
static const char *
intrtype_to_str(int intr_type)
{
	switch (intr_type) {
	case VMCB_EVENTINJ_TYPE_INTR:
		return ("hwintr");
	case VMCB_EVENTINJ_TYPE_NMI:
		return ("nmi");
	case VMCB_EVENTINJ_TYPE_INTn:
		return ("swintr");
	case VMCB_EVENTINJ_TYPE_EXCEPTION:
		return ("exception");
	default:
		panic("%s: unknown intr_type %d", __func__, intr_type);
	}
}
#endif

/*
 * Inject an event to vcpu as described in section 15.20, "Event injection".
 */
static void
svm_eventinject(struct svm_softc *sc, int vcpu, int intr_type, int vector,
		 uint32_t error, bool ec_valid)
{
	struct vmcb_ctrl *ctrl;

	ctrl = svm_get_vmcb_ctrl(sc, vcpu);

	KASSERT((ctrl->eventinj & VMCB_EVENTINJ_VALID) == 0,
	    ("%s: event already pending %#lx", __func__, ctrl->eventinj));

	KASSERT(vector >= 0 && vector <= 255, ("%s: invalid vector %d",
	    __func__, vector));

	switch (intr_type) {
	case VMCB_EVENTINJ_TYPE_INTR:
	case VMCB_EVENTINJ_TYPE_NMI:
	case VMCB_EVENTINJ_TYPE_INTn:
		break;
	case VMCB_EVENTINJ_TYPE_EXCEPTION:
		if (vector >= 0 && vector <= 31 && vector != 2)
			break;
		/* FALLTHROUGH */
	default:
		panic("%s: invalid intr_type/vector: %d/%d", __func__,
		    intr_type, vector);
	}
	ctrl->eventinj = vector | (intr_type << 8) | VMCB_EVENTINJ_VALID;
	if (ec_valid) {
		ctrl->eventinj |= VMCB_EVENTINJ_EC_VALID;
		ctrl->eventinj |= (uint64_t)error << 32;
		VCPU_CTR3(sc->vm, vcpu, "Injecting %s at vector %d errcode %#x",
		    intrtype_to_str(intr_type), vector, error);
	} else {
		VCPU_CTR2(sc->vm, vcpu, "Injecting %s at vector %d",
		    intrtype_to_str(intr_type), vector);
	}
}

static void
svm_update_virqinfo(struct svm_softc *sc, int vcpu)
{
	struct vm *vm;
	struct vlapic *vlapic;
	struct vmcb_ctrl *ctrl;

	vm = sc->vm;
	vlapic = vm_lapic(vm, vcpu);
	ctrl = svm_get_vmcb_ctrl(sc, vcpu);

	/* Update %cr8 in the emulated vlapic */
	vlapic_set_cr8(vlapic, ctrl->v_tpr);

	/* Virtual interrupt injection is not used. */
	KASSERT(ctrl->v_intr_vector == 0, ("%s: invalid "
	    "v_intr_vector %d", __func__, ctrl->v_intr_vector));
}

static void
svm_save_intinfo(struct svm_softc *svm_sc, int vcpu)
{
	struct vmcb_ctrl *ctrl;
	uint64_t intinfo;

	ctrl  = svm_get_vmcb_ctrl(svm_sc, vcpu);
	intinfo = ctrl->exitintinfo;
	if (!VMCB_EXITINTINFO_VALID(intinfo))
		return;

	/*
	 * From APMv2, Section "Intercepts during IDT interrupt delivery"
	 *
	 * If a #VMEXIT happened during event delivery then record the event
	 * that was being delivered.
	 */
	VCPU_CTR2(svm_sc->vm, vcpu, "SVM:Pending INTINFO(0x%lx), vector=%d.\n",
		intinfo, VMCB_EXITINTINFO_VECTOR(intinfo));
	vmm_stat_incr(svm_sc->vm, vcpu, VCPU_EXITINTINFO, 1);
	vm_exit_intinfo(svm_sc->vm, vcpu, intinfo);
}

#ifdef INVARIANTS
static __inline int
vintr_intercept_enabled(struct svm_softc *sc, int vcpu)
{

	return (svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
	    VMCB_INTCPT_VINTR));
}
#endif

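/*
 * Request an interrupt window: program a dummy virtual interrupt (vector 0)
 * and intercept #VMEXIT(VINTR) so that the hypervisor regains control as
 * soon as the guest is able to accept interrupts.
 */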
static __inline void
enable_intr_window_exiting(struct svm_softc *sc, int vcpu)
{
	struct vmcb_ctrl *ctrl;

	ctrl = svm_get_vmcb_ctrl(sc, vcpu);

	if (ctrl->v_irq && ctrl->v_intr_vector == 0) {
		KASSERT(ctrl->v_ign_tpr, ("%s: invalid v_ign_tpr", __func__));
		KASSERT(vintr_intercept_enabled(sc, vcpu),
		    ("%s: vintr intercept should be enabled", __func__));
		return;
	}

	VCPU_CTR0(sc->vm, vcpu, "Enable intr window exiting");
	ctrl->v_irq = 1;
	ctrl->v_ign_tpr = 1;
	ctrl->v_intr_vector = 0;
	svm_set_dirty(sc, vcpu, VMCB_CACHE_TPR);
	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_VINTR);
}

static __inline void
disable_intr_window_exiting(struct svm_softc *sc, int vcpu)
{
	struct vmcb_ctrl *ctrl;

	ctrl = svm_get_vmcb_ctrl(sc, vcpu);

	if (!ctrl->v_irq && ctrl->v_intr_vector == 0) {
		KASSERT(!vintr_intercept_enabled(sc, vcpu),
		    ("%s: vintr intercept should be disabled", __func__));
		return;
	}

	VCPU_CTR0(sc->vm, vcpu, "Disable intr window exiting");
	ctrl->v_irq = 0;
	ctrl->v_intr_vector = 0;
	svm_set_dirty(sc, vcpu, VMCB_CACHE_TPR);
	svm_disable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_VINTR);
}

static int
svm_modify_intr_shadow(struct svm_softc *sc, int vcpu, uint64_t val)
{
	struct vmcb_ctrl *ctrl;
	int oldval, newval;

	ctrl = svm_get_vmcb_ctrl(sc, vcpu);
	oldval = ctrl->intr_shadow;
	newval = val ? 1 : 0;
	if (newval != oldval) {
		ctrl->intr_shadow = newval;
		VCPU_CTR1(sc->vm, vcpu, "Setting intr_shadow to %d", newval);
	}
	return (0);
}

static int
svm_get_intr_shadow(struct svm_softc *sc, int vcpu, uint64_t *val)
{
	struct vmcb_ctrl *ctrl;

	ctrl = svm_get_vmcb_ctrl(sc, vcpu);
	*val = ctrl->intr_shadow;
	return (0);
}

/*
 * Once an NMI is injected it blocks delivery of further NMIs until the handler
 * executes an IRET. The IRET intercept is enabled when an NMI is injected
 * to track when the vcpu is done handling the NMI.
 */
static int
nmi_blocked(struct svm_softc *sc, int vcpu)
{
	int blocked;

	blocked = svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
	    VMCB_INTCPT_IRET);
	return (blocked);
}

static void
enable_nmi_blocking(struct svm_softc *sc, int vcpu)
{

	KASSERT(!nmi_blocked(sc, vcpu), ("vNMI already blocked"));
	VCPU_CTR0(sc->vm, vcpu, "vNMI blocking enabled");
	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IRET);
}

static void
clear_nmi_blocking(struct svm_softc *sc, int vcpu)
{
	int error;

	KASSERT(nmi_blocked(sc, vcpu), ("vNMI already unblocked"));
	VCPU_CTR0(sc->vm, vcpu, "vNMI blocking cleared");
	/*
	 * When the IRET intercept is cleared the vcpu will attempt to execute
	 * the "iret" when it runs next. However, it is possible to inject
	 * another NMI into the vcpu before the "iret" has actually executed.
	 *
	 * For example, if the "iret" encounters a #NPF when accessing the stack
	 * it will trap back into the hypervisor. If an NMI is pending for
	 * the vcpu it will be injected into the guest.
	 *
	 * XXX this needs to be fixed
	 */
	svm_disable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IRET);

	/*
	 * Set 'intr_shadow' to prevent an NMI from being injected on the
	 * immediate VMRUN.
	 */
	error = svm_modify_intr_shadow(sc, vcpu, 1);
	KASSERT(!error, ("%s: error %d setting intr_shadow", __func__, error));
}

#define	EFER_MBZ_BITS	0xFFFFFFFFFFFF0200UL

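/*
 * Emulate a guest write to EFER, enforcing the reserved-bit and long mode
 * consistency checks from APMv2 before committing the new value. An invalid
 * write results in a #GP being injected into the guest.
 */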
static int
svm_write_efer(struct svm_softc *sc, int vcpu, uint64_t newval, bool *retu)
{
	struct vm_exit *vme;
	struct vmcb_state *state;
	uint64_t changed, lma, oldval;
	int error;

	state = svm_get_vmcb_state(sc, vcpu);

	oldval = state->efer;
	VCPU_CTR2(sc->vm, vcpu, "wrmsr(efer) %#lx/%#lx", oldval, newval);

	newval &= ~0xFE;		/* clear the Read-As-Zero (RAZ) bits */
	changed = oldval ^ newval;

	if (newval & EFER_MBZ_BITS)
		goto gpf;

	/* APMv2 Table 14-5 "Long-Mode Consistency Checks" */
	if (changed & EFER_LME) {
		if (state->cr0 & CR0_PG)
			goto gpf;
	}

	/* EFER.LMA = EFER.LME & CR0.PG */
	if ((newval & EFER_LME) != 0 && (state->cr0 & CR0_PG) != 0)
		lma = EFER_LMA;
	else
		lma = 0;

	if ((newval & EFER_LMA) != lma)
		goto gpf;

	if (newval & EFER_NXE) {
		if (!vm_cpuid_capability(sc->vm, vcpu, VCC_NO_EXECUTE))
			goto gpf;
	}

	/*
	 * XXX bhyve does not enforce segment limits in 64-bit mode. Until
	 * this is fixed, flag a guest attempt to set EFER_LMSLE as an error.
	 */
	if (newval & EFER_LMSLE) {
		vme = vm_exitinfo(sc->vm, vcpu);
		vm_exit_svm(vme, VMCB_EXIT_MSR, 1, 0);
		*retu = true;
		return (0);
	}

	if (newval & EFER_FFXSR) {
		if (!vm_cpuid_capability(sc->vm, vcpu, VCC_FFXSR))
			goto gpf;
	}

	if (newval & EFER_TCE) {
		if (!vm_cpuid_capability(sc->vm, vcpu, VCC_TCE))
			goto gpf;
	}

	error = svm_setreg(sc, vcpu, VM_REG_GUEST_EFER, newval);
	KASSERT(error == 0, ("%s: error %d updating efer", __func__, error));
	return (0);
gpf:
	vm_inject_gp(sc->vm, vcpu);
	return (0);
}

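/*
 * Dispatch MSR accesses: local APIC MSRs go to the vlapic emulation, EFER
 * writes get the extra consistency checks above, and everything else is
 * handled by the generic SVM MSR emulation.
 */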
1168284900Sneelstatic int
1169271912Sneelemulate_wrmsr(struct svm_softc *sc, int vcpu, u_int num, uint64_t val,
1170271912Sneel    bool *retu)
1171271912Sneel{
1172271912Sneel	int error;
1173271912Sneel
1174271912Sneel	if (lapic_msr(num))
1175271912Sneel		error = lapic_wrmsr(sc->vm, vcpu, num, val, retu);
1176271912Sneel	else if (num == MSR_EFER)
1177284900Sneel		error = svm_write_efer(sc, vcpu, val, retu);
1178271912Sneel	else
1179271912Sneel		error = svm_wrmsr(sc, vcpu, num, val, retu);
1180271912Sneel
1181271912Sneel	return (error);
1182271912Sneel}
1183271912Sneel
1184271912Sneelstatic int
1185271912Sneelemulate_rdmsr(struct svm_softc *sc, int vcpu, u_int num, bool *retu)
1186271912Sneel{
1187271912Sneel	struct vmcb_state *state;
1188271912Sneel	struct svm_regctx *ctx;
1189271912Sneel	uint64_t result;
1190271912Sneel	int error;
1191271912Sneel
1192271912Sneel	if (lapic_msr(num))
1193271912Sneel		error = lapic_rdmsr(sc->vm, vcpu, num, &result, retu);
1194271912Sneel	else
1195271912Sneel		error = svm_rdmsr(sc, vcpu, num, &result, retu);
1196271912Sneel
1197271912Sneel	if (error == 0) {
1198271912Sneel		state = svm_get_vmcb_state(sc, vcpu);
1199271912Sneel		ctx = svm_get_guest_regctx(sc, vcpu);
1200271912Sneel		state->rax = result & 0xffffffff;
1201272195Sneel		ctx->sctx_rdx = result >> 32;
1202271912Sneel	}
1203271912Sneel
1204271912Sneel	return (error);
1205271912Sneel}
1206271912Sneel
1207271343Sneel#ifdef KTR
1208271343Sneelstatic const char *
1209271343Sneelexit_reason_to_str(uint64_t reason)
1210271343Sneel{
1211271343Sneel	static char reasonbuf[32];
1212271343Sneel
1213271343Sneel	switch (reason) {
1214271343Sneel	case VMCB_EXIT_INVALID:
1215271343Sneel		return ("invalvmcb");
1216271343Sneel	case VMCB_EXIT_SHUTDOWN:
1217271343Sneel		return ("shutdown");
1218271343Sneel	case VMCB_EXIT_NPF:
1219271343Sneel		return ("nptfault");
1220271343Sneel	case VMCB_EXIT_PAUSE:
1221271343Sneel		return ("pause");
1222271343Sneel	case VMCB_EXIT_HLT:
1223271343Sneel		return ("hlt");
1224271343Sneel	case VMCB_EXIT_CPUID:
1225271343Sneel		return ("cpuid");
1226271343Sneel	case VMCB_EXIT_IO:
1227271343Sneel		return ("inout");
1228271343Sneel	case VMCB_EXIT_MC:
1229271343Sneel		return ("mchk");
1230271343Sneel	case VMCB_EXIT_INTR:
1231271343Sneel		return ("extintr");
1232271559Sneel	case VMCB_EXIT_NMI:
1233271559Sneel		return ("nmi");
1234271343Sneel	case VMCB_EXIT_VINTR:
1235271343Sneel		return ("vintr");
1236271343Sneel	case VMCB_EXIT_MSR:
1237271343Sneel		return ("msr");
1238271694Sneel	case VMCB_EXIT_IRET:
1239271694Sneel		return ("iret");
1240276403Sneel	case VMCB_EXIT_MONITOR:
1241276403Sneel		return ("monitor");
1242276403Sneel	case VMCB_EXIT_MWAIT:
1243276403Sneel		return ("mwait");
1244271343Sneel	default:
1245271343Sneel		snprintf(reasonbuf, sizeof(reasonbuf), "%#lx", reason);
1246271343Sneel		return (reasonbuf);
1247271343Sneel	}
1248271343Sneel}
1249271343Sneel#endif	/* KTR */
1250271343Sneel
1251254677Sgrehan/*
1252271570Sneel * From section "State Saved on Exit" in APMv2: nRIP is saved for all #VMEXITs
1253271570Sneel * that are due to instruction intercepts as well as MSR and IOIO intercepts
1254271570Sneel * and exceptions caused by INT3, INTO and BOUND instructions.
1255271570Sneel *
1256271570Sneel * Return 1 if the nRIP is valid and 0 otherwise.
1257254677Sgrehan */
1258271570Sneelstatic int
1259271570Sneelnrip_valid(uint64_t exitcode)
1260271570Sneel{
1261271570Sneel	switch (exitcode) {
1262271570Sneel	case 0x00 ... 0x0F:	/* read of CR0 through CR15 */
1263271570Sneel	case 0x10 ... 0x1F:	/* write of CR0 through CR15 */
1264271570Sneel	case 0x20 ... 0x2F:	/* read of DR0 through DR15 */
1265271570Sneel	case 0x30 ... 0x3F:	/* write of DR0 through DR15 */
1266271570Sneel	case 0x43:		/* INT3 */
1267271570Sneel	case 0x44:		/* INTO */
1268271570Sneel	case 0x45:		/* BOUND */
1269271570Sneel	case 0x65 ... 0x7C:	/* VMEXIT_CR0_SEL_WRITE ... VMEXIT_MSR */
1270271570Sneel	case 0x80 ... 0x8D:	/* VMEXIT_VMRUN ... VMEXIT_XSETBV */
1271271570Sneel		return (1);
1272271570Sneel	default:
1273271570Sneel		return (0);
1274271570Sneel	}
1275271570Sneel}
1276271570Sneel
1277271570Sneelstatic int
1278254677Sgrehansvm_vmexit(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit)
1279254677Sgrehan{
1280271570Sneel	struct vmcb *vmcb;
1281254677Sgrehan	struct vmcb_state *state;
1282254677Sgrehan	struct vmcb_ctrl *ctrl;
1283254677Sgrehan	struct svm_regctx *ctx;
1284254677Sgrehan	uint64_t code, info1, info2, val;
1285254677Sgrehan	uint32_t eax, ecx, edx;
1286276403Sneel	int error, errcode_valid, handled, idtvec, reflect;
1287271570Sneel	bool retu;
1288254677Sgrehan
1289271570Sneel	ctx = svm_get_guest_regctx(svm_sc, vcpu);
1290271570Sneel	vmcb = svm_get_vmcb(svm_sc, vcpu);
1291271570Sneel	state = &vmcb->state;
1292271570Sneel	ctrl = &vmcb->ctrl;
1293254677Sgrehan
1294271570Sneel	handled = 0;
1295271570Sneel	code = ctrl->exitcode;
1296254677Sgrehan	info1 = ctrl->exitinfo1;
1297254677Sgrehan	info2 = ctrl->exitinfo2;
1298254677Sgrehan
1299271570Sneel	vmexit->exitcode = VM_EXITCODE_BOGUS;
1300271570Sneel	vmexit->rip = state->rip;
1301271570Sneel	vmexit->inst_length = nrip_valid(code) ? ctrl->nrip - state->rip : 0;
1302254677Sgrehan
1303271343Sneel	vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_COUNT, 1);
1304271343Sneel
1305271570Sneel	/*
1306271570Sneel	 * #VMEXIT(INVALID) needs to be handled early because the VMCB is
1307271570Sneel	 * in an inconsistent state and can trigger assertions that would
1308271570Sneel	 * never happen otherwise.
1309271570Sneel	 */
1310271570Sneel	if (code == VMCB_EXIT_INVALID) {
1311271570Sneel		vm_exit_svm(vmexit, code, info1, info2);
1312271570Sneel		return (0);
1313271570Sneel	}
1314271570Sneel
1315270511Sneel	KASSERT((ctrl->eventinj & VMCB_EVENTINJ_VALID) == 0, ("%s: event "
1316270511Sneel	    "injection valid bit is set %#lx", __func__, ctrl->eventinj));
1317270511Sneel
1318271570Sneel	KASSERT(vmexit->inst_length >= 0 && vmexit->inst_length <= 15,
1319271570Sneel	    ("invalid inst_length %d: code (%#lx), info1 (%#lx), info2 (%#lx)",
1320271570Sneel	    vmexit->inst_length, code, info1, info2));
1321271570Sneel
1322271661Sneel	svm_update_virqinfo(svm_sc, vcpu);
1323270962Sneel	svm_save_intinfo(svm_sc, vcpu);
1324270962Sneel
1325254677Sgrehan	switch (code) {
1326271694Sneel	case VMCB_EXIT_IRET:
1327271694Sneel		/*
1328271694Sneel		 * Restart execution at "iret" but with the intercept cleared.
1329271694Sneel		 */
1330271694Sneel		vmexit->inst_length = 0;
1331271715Sneel		clear_nmi_blocking(svm_sc, vcpu);
1332271694Sneel		handled = 1;
1333271694Sneel		break;
1334271570Sneel	case VMCB_EXIT_VINTR:	/* interrupt window exiting */
1335271419Sneel		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_VINTR, 1);
1336271570Sneel		handled = 1;
1337271419Sneel		break;
1338271570Sneel	case VMCB_EXIT_INTR:	/* external interrupt */
1339271570Sneel		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_EXTINT, 1);
1340271570Sneel		handled = 1;
1341271419Sneel		break;
1342271570Sneel	case VMCB_EXIT_NMI:	/* external NMI */
1343271570Sneel		handled = 1;
1344271570Sneel		break;
1345276403Sneel	case 0x40 ... 0x5F:
1346271570Sneel		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_EXCEPTION, 1);
1347276403Sneel		reflect = 1;
1348276403Sneel		idtvec = code - 0x40;
1349276403Sneel		switch (idtvec) {
1350276403Sneel		case IDT_MC:
1351276403Sneel			/*
1352276403Sneel			 * Call the machine check handler by hand. Also don't
1353276403Sneel			 * reflect the machine check back into the guest.
1354276403Sneel			 */
1355276403Sneel			reflect = 0;
1356276403Sneel			VCPU_CTR0(svm_sc->vm, vcpu, "Vectoring to MCE handler");
1357276403Sneel			__asm __volatile("int $18");
1358276403Sneel			break;
1359276403Sneel		case IDT_PF:
1360276403Sneel			error = svm_setreg(svm_sc, vcpu, VM_REG_GUEST_CR2,
1361276403Sneel			    info2);
1362276403Sneel			KASSERT(error == 0, ("%s: error %d updating cr2",
1363276403Sneel			    __func__, error));
1364276403Sneel			/* fallthru */
1365276403Sneel		case IDT_NP:
1366276403Sneel		case IDT_SS:
1367276403Sneel		case IDT_GP:
1368276403Sneel		case IDT_AC:
1369276403Sneel		case IDT_TS:
1370276403Sneel			errcode_valid = 1;
1371276403Sneel			break;
1372276403Sneel
1373276403Sneel		case IDT_DF:
1374276403Sneel			errcode_valid = 1;
1375276403Sneel			info1 = 0;
1376276403Sneel			break;
1377276403Sneel
1378276403Sneel		case IDT_BP:
1379276403Sneel		case IDT_OF:
1380276403Sneel		case IDT_BR:
1381276403Sneel			/*
1382276403Sneel			 * The 'nrip' field is populated for INT3, INTO and
1383276403Sneel			 * BOUND exceptions and this also implies that
1384276403Sneel			 * 'inst_length' is non-zero.
1385276403Sneel			 *
1386276403Sneel			 * Reset 'inst_length' to zero so the guest %rip at
1387276403Sneel			 * event injection is identical to what it was when
1388276403Sneel			 * the exception originally happened.
1389276403Sneel			 */
1390276403Sneel			VCPU_CTR2(svm_sc->vm, vcpu, "Reset inst_length from %d "
1391276403Sneel			    "to zero before injecting exception %d",
1392276403Sneel			    vmexit->inst_length, idtvec);
1393276403Sneel			vmexit->inst_length = 0;
1394276403Sneel			/* fallthru */
1395276403Sneel		default:
1396276403Sneel			errcode_valid = 0;
1397284894Sneel			info1 = 0;
1398276403Sneel			break;
1399276403Sneel		}
1400276403Sneel		KASSERT(vmexit->inst_length == 0, ("invalid inst_length (%d) "
1401276403Sneel		    "when reflecting exception %d into guest",
1402276403Sneel		    vmexit->inst_length, idtvec));
1403276403Sneel
1404276403Sneel		if (reflect) {
1405276403Sneel			/* Reflect the exception back into the guest */
1406276403Sneel			VCPU_CTR2(svm_sc->vm, vcpu, "Reflecting exception "
1407284894Sneel			    "%d/%#x into the guest", idtvec, (int)info1);
1408284894Sneel			error = vm_inject_exception(svm_sc->vm, vcpu, idtvec,
1409284894Sneel			    errcode_valid, info1, 0);
1410276403Sneel			KASSERT(error == 0, ("%s: vm_inject_exception error %d",
1411276403Sneel			    __func__, error));
1412276403Sneel		}
1413276403Sneel		handled = 1;
1414271570Sneel		break;
1415271419Sneel	case VMCB_EXIT_MSR:	/* MSR access. */
1416271419Sneel		eax = state->rax;
1417271419Sneel		ecx = ctx->sctx_rcx;
1418272195Sneel		edx = ctx->sctx_rdx;
1419271570Sneel		retu = false;
1420271419Sneel
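		/* EXITINFO1 is 1 for a WRMSR intercept and 0 for RDMSR. */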
1421271419Sneel		if (info1) {
1422271419Sneel			vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_WRMSR, 1);
1423271419Sneel			val = (uint64_t)edx << 32 | eax;
1424271570Sneel			VCPU_CTR2(svm_sc->vm, vcpu, "wrmsr %#x val %#lx",
1425271570Sneel			    ecx, val);
1426271912Sneel			if (emulate_wrmsr(svm_sc, vcpu, ecx, val, &retu)) {
1427271570Sneel				vmexit->exitcode = VM_EXITCODE_WRMSR;
1428271570Sneel				vmexit->u.msr.code = ecx;
1429271419Sneel				vmexit->u.msr.wval = val;
1430271570Sneel			} else if (!retu) {
1431271570Sneel				handled = 1;
1432271570Sneel			} else {
1433271570Sneel				KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS,
1434271570Sneel				    ("emulate_wrmsr retu with bogus exitcode"));
1435271570Sneel			}
1436271419Sneel		} else {
1437271570Sneel			VCPU_CTR1(svm_sc->vm, vcpu, "rdmsr %#x", ecx);
1438271419Sneel			vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_RDMSR, 1);
1439271912Sneel			if (emulate_rdmsr(svm_sc, vcpu, ecx, &retu)) {
1440271570Sneel				vmexit->exitcode = VM_EXITCODE_RDMSR;
1441271570Sneel				vmexit->u.msr.code = ecx;
1442271570Sneel			} else if (!retu) {
1443271570Sneel				handled = 1;
1444271570Sneel			} else {
1445271570Sneel				KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS,
1446271570Sneel				    ("emulate_rdmsr retu with bogus exitcode"));
1447271570Sneel			}
1448271419Sneel		}
1449271419Sneel		break;
1450271419Sneel	case VMCB_EXIT_IO:
1451271570Sneel		handled = svm_handle_io(svm_sc, vcpu, vmexit);
1452271419Sneel		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_INOUT, 1);
1453271419Sneel		break;
1454271419Sneel	case VMCB_EXIT_CPUID:
1455271419Sneel		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_CPUID, 1);
1456271570Sneel		handled = x86_emulate_cpuid(svm_sc->vm, vcpu,
1457271419Sneel		    (uint32_t *)&state->rax,
1458271419Sneel		    (uint32_t *)&ctx->sctx_rbx,
1459271419Sneel		    (uint32_t *)&ctx->sctx_rcx,
1460272195Sneel		    (uint32_t *)&ctx->sctx_rdx);
1461271419Sneel		break;
1462271419Sneel	case VMCB_EXIT_HLT:
1463271419Sneel		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_HLT, 1);
1464271419Sneel		vmexit->exitcode = VM_EXITCODE_HLT;
1465271419Sneel		vmexit->u.hlt.rflags = state->rflags;
1466271419Sneel		break;
1467271419Sneel	case VMCB_EXIT_PAUSE:
1468271419Sneel		vmexit->exitcode = VM_EXITCODE_PAUSE;
1469271419Sneel		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_PAUSE, 1);
1470271419Sneel		break;
1471271419Sneel	case VMCB_EXIT_NPF:
1472271570Sneel		/* EXITINFO2 contains the faulting guest physical address */
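		/* EXITINFO1 holds a #PF-style error code describing the fault */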
1473271419Sneel		if (info1 & VMCB_NPF_INFO1_RSV) {
1474271419Sneel			VCPU_CTR2(svm_sc->vm, vcpu, "nested page fault with "
1475271419Sneel			    "reserved bits set: info1(%#lx) info2(%#lx)",
1476271419Sneel			    info1, info2);
1477295124Sgrehan		} else if (vm_mem_allocated(svm_sc->vm, vcpu, info2)) {
1478271419Sneel			vmexit->exitcode = VM_EXITCODE_PAGING;
1479271419Sneel			vmexit->u.paging.gpa = info2;
1480272929Sneel			vmexit->u.paging.fault_type = npf_fault_type(info1);
1481271419Sneel			vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_NESTED_FAULT, 1);
1482271419Sneel			VCPU_CTR3(svm_sc->vm, vcpu, "nested page fault "
1483271419Sneel			    "on gpa %#lx/%#lx at rip %#lx",
1484271419Sneel			    info2, info1, state->rip);
1485271419Sneel		} else if (svm_npf_emul_fault(info1)) {
1486271570Sneel			svm_handle_inst_emul(vmcb, info2, vmexit);
1487271419Sneel			vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_INST_EMUL, 1);
1488271419Sneel			VCPU_CTR3(svm_sc->vm, vcpu, "inst_emul fault "
1489271419Sneel			    "for gpa %#lx/%#lx at rip %#lx",
1490271419Sneel			    info2, info1, state->rip);
1491271419Sneel		}
1492271419Sneel		break;
1493276403Sneel	case VMCB_EXIT_MONITOR:
1494276403Sneel		vmexit->exitcode = VM_EXITCODE_MONITOR;
1495276403Sneel		break;
1496276403Sneel	case VMCB_EXIT_MWAIT:
1497276403Sneel		vmexit->exitcode = VM_EXITCODE_MWAIT;
1498276403Sneel		break;
1499271419Sneel	default:
1500271419Sneel		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_UNKNOWN, 1);
1501271419Sneel		break;
1502254677Sgrehan	}
1503254677Sgrehan
1504271570Sneel	VCPU_CTR4(svm_sc->vm, vcpu, "%s %s vmexit at %#lx/%d",
1505271570Sneel	    handled ? "handled" : "unhandled", exit_reason_to_str(code),
1506271570Sneel	    vmexit->rip, vmexit->inst_length);
1507271343Sneel
1508271570Sneel	if (handled) {
1509271570Sneel		vmexit->rip += vmexit->inst_length;
1510271570Sneel		vmexit->inst_length = 0;
1511259579Sgrehan		state->rip = vmexit->rip;
1512271570Sneel	} else {
1513271570Sneel		if (vmexit->exitcode == VM_EXITCODE_BOGUS) {
1514271570Sneel			/*
1515271570Sneel			 * If this VM exit was not claimed by anybody then
1516271570Sneel			 * treat it as a generic SVM exit.
1517271570Sneel			 */
1518271570Sneel			vm_exit_svm(vmexit, code, info1, info2);
1519271570Sneel		} else {
1520271570Sneel			/*
1521271570Sneel			 * The exitcode and collateral have been populated.
1522271570Sneel			 * The VM exit will be processed further in userland.
1523271570Sneel			 */
1524271570Sneel		}
1525254677Sgrehan	}
1526271570Sneel	return (handled);
1527254677Sgrehan}
1528254677Sgrehan
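/*
 * Inject an event that was pending at VM entry: either one whose delivery
 * was cut short by the previous #VMEXIT (saved in EXITINTINFO) or one that
 * was queued by the hypervisor (e.g. an emulated exception).
 */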
1529270962Sneelstatic void
1530270962Sneelsvm_inj_intinfo(struct svm_softc *svm_sc, int vcpu)
1531270962Sneel{
1532270962Sneel	uint64_t intinfo;
1533270962Sneel
1534270962Sneel	if (!vm_entry_intinfo(svm_sc->vm, vcpu, &intinfo))
1535270962Sneel		return;
1536270962Sneel
1537270962Sneel	KASSERT(VMCB_EXITINTINFO_VALID(intinfo), ("%s: entry intinfo is not "
1538270962Sneel	    "valid: %#lx", __func__, intinfo));
1539270962Sneel
1540271345Sneel	svm_eventinject(svm_sc, vcpu, VMCB_EXITINTINFO_TYPE(intinfo),
1541270962Sneel		VMCB_EXITINTINFO_VECTOR(intinfo),
1542270962Sneel		VMCB_EXITINTINFO_EC(intinfo),
1543270962Sneel		VMCB_EXITINTINFO_EC_VALID(intinfo));
1544270962Sneel	vmm_stat_incr(svm_sc->vm, vcpu, VCPU_INTINFO_INJECTED, 1);
1545270962Sneel	VCPU_CTR1(svm_sc->vm, vcpu, "Injected entry intinfo: %#lx", intinfo);
1546270962Sneel}
1547270962Sneel
1548254677Sgrehan/*
1549254677Sgrehan * Inject pending events and interrupts into the virtual cpu.
1550254677Sgrehan */
1551254677Sgrehanstatic void
1552271415Sneelsvm_inj_interrupts(struct svm_softc *sc, int vcpu, struct vlapic *vlapic)
1553254677Sgrehan{
1554254677Sgrehan	struct vmcb_ctrl *ctrl;
1555254677Sgrehan	struct vmcb_state *state;
1556284894Sneel	struct svm_vcpu *vcpustate;
1557271661Sneel	uint8_t v_tpr;
1558329321Savg	int vector, need_intr_window;
1559329321Savg	int extint_pending;
1560254677Sgrehan
1561271415Sneel	state = svm_get_vmcb_state(sc, vcpu);
1562271415Sneel	ctrl  = svm_get_vmcb_ctrl(sc, vcpu);
1563284894Sneel	vcpustate = svm_get_vcpu(sc, vcpu);
1564254677Sgrehan
1565271415Sneel	need_intr_window = 0;
1566270511Sneel
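	/*
	 * The interrupt shadow recorded in the VMCB applies to the guest RIP
	 * saved at the last #VMEXIT. If the RIP has changed since then the
	 * shadow is stale, so clear it.
	 */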
1567284894Sneel	if (vcpustate->nextrip != state->rip) {
1568284894Sneel		ctrl->intr_shadow = 0;
1569284894Sneel		VCPU_CTR2(sc->vm, vcpu, "Guest interrupt blocking "
1570284894Sneel		    "cleared due to rip change: %#lx/%#lx",
1571284894Sneel		    vcpustate->nextrip, state->rip);
1572284894Sneel	}
1573284894Sneel
1574271415Sneel	/*
1575271415Sneel	 * Inject pending events or exceptions for this vcpu.
1576271415Sneel	 *
1577271415Sneel	 * An event might be pending because the previous #VMEXIT happened
1578271415Sneel	 * during event delivery (i.e. ctrl->exitintinfo).
1579271415Sneel	 *
1580271415Sneel	 * An event might also be pending because an exception was injected
1581271415Sneel	 * by the hypervisor (e.g. #PF during instruction emulation).
1582271415Sneel	 */
1583271415Sneel	svm_inj_intinfo(sc, vcpu);
1584254677Sgrehan
1585271415Sneel	/* NMI event has priority over interrupts. */
1586271415Sneel	if (vm_nmi_pending(sc->vm, vcpu)) {
1587271415Sneel		if (nmi_blocked(sc, vcpu)) {
1588271415Sneel			/*
1589271415Sneel			 * Can't inject another NMI if the guest has not
1590271415Sneel			 * yet executed an "iret" after the last NMI.
1591271415Sneel			 */
1592271415Sneel			VCPU_CTR0(sc->vm, vcpu, "Cannot inject NMI due "
1593271415Sneel			    "to NMI-blocking");
1594271694Sneel		} else if (ctrl->intr_shadow) {
1595271694Sneel			/*
1596271694Sneel			 * Can't inject an NMI if the vcpu is in an intr_shadow.
1597271694Sneel			 */
1598271694Sneel			VCPU_CTR0(sc->vm, vcpu, "Cannot inject NMI due to "
1599271694Sneel			    "interrupt shadow");
1600271694Sneel			need_intr_window = 1;
1601271694Sneel			goto done;
1602271415Sneel		} else if (ctrl->eventinj & VMCB_EVENTINJ_VALID) {
1603271415Sneel			/*
1604271415Sneel			 * If there is already an exception/interrupt pending
1605271415Sneel			 * then defer the NMI until after that.
1606271415Sneel			 */
1607271415Sneel			VCPU_CTR1(sc->vm, vcpu, "Cannot inject NMI due to "
1608271415Sneel			    "eventinj %#lx", ctrl->eventinj);
1609270511Sneel
1610271415Sneel			/*
1611271415Sneel			 * Use self-IPI to trigger a VM-exit as soon as
1612271415Sneel			 * possible after the event injection is completed.
1613271415Sneel			 *
1614271415Sneel			 * This works only if the external interrupt exiting
1615271415Sneel			 * is at a lower priority than the event injection.
1616271415Sneel			 *
1617271415Sneel			 * Although not explicitly specified in APMv2, the
1618271415Sneel			 * relative priorities were verified empirically.
1619271415Sneel			 */
1620271415Sneel			ipi_cpu(curcpu, IPI_AST);	/* XXX vmm_ipinum? */
1621271415Sneel		} else {
1622271415Sneel			vm_nmi_clear(sc->vm, vcpu);
1623271415Sneel
1624271415Sneel			/* Inject NMI, vector number is not used */
1625271415Sneel			svm_eventinject(sc, vcpu, VMCB_EVENTINJ_TYPE_NMI,
1626271415Sneel			    IDT_NMI, 0, false);
1627271415Sneel
1628271415Sneel			/* virtual NMI blocking is now in effect */
1629271415Sneel			enable_nmi_blocking(sc, vcpu);
1630271415Sneel
1631271415Sneel			VCPU_CTR0(sc->vm, vcpu, "Injecting vNMI");
1632271415Sneel		}
1633254677Sgrehan	}
1634254677Sgrehan
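	/*
	 * NMIs have been dealt with; now look for a pending maskable
	 * interrupt, taking a legacy PIC ExtInt if one is pending and
	 * otherwise asking the local APIC.
	 */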
1635329321Savg	extint_pending = vm_extint_pending(sc->vm, vcpu);
1636329321Savg	if (!extint_pending) {
1637329321Savg		if (!vlapic_pending_intr(vlapic, &vector))
1638329321Savg			goto done;
1639329321Savg		KASSERT(vector >= 16 && vector <= 255,
1640329321Savg		    ("invalid vector %d from local APIC", vector));
1641329321Savg	} else {
1642329321Savg		/* Ask the legacy pic for a vector to inject */
1643329321Savg		vatpic_pending_intr(sc->vm, &vector);
1644329321Savg		KASSERT(vector >= 0 && vector <= 255,
1645329321Savg		    ("invalid vector %d from INTR", vector));
1646267305Sgrehan	}
1647267305Sgrehan
1648271415Sneel	/*
1649271415Sneel	 * If the guest has disabled interrupts or is in an interrupt shadow
1650271415Sneel	 * then we cannot inject the pending interrupt.
1651271415Sneel	 */
1652271415Sneel	if ((state->rflags & PSL_I) == 0) {
1653271415Sneel		VCPU_CTR2(sc->vm, vcpu, "Cannot inject vector %d due to "
1654271415Sneel		    "rflags %#lx", vector, state->rflags);
1655271415Sneel		need_intr_window = 1;
1656271415Sneel		goto done;
1657254677Sgrehan	}
1658254677Sgrehan
1659271415Sneel	if (ctrl->intr_shadow) {
1660271415Sneel		VCPU_CTR1(sc->vm, vcpu, "Cannot inject vector %d due to "
1661271415Sneel		    "interrupt shadow", vector);
1662271415Sneel		need_intr_window = 1;
1663271415Sneel		goto done;
1664254677Sgrehan	}
1665254677Sgrehan
1666271415Sneel	if (ctrl->eventinj & VMCB_EVENTINJ_VALID) {
1667271415Sneel		VCPU_CTR2(sc->vm, vcpu, "Cannot inject vector %d due to "
1668271415Sneel		    "eventinj %#lx", vector, ctrl->eventinj);
1669271415Sneel		need_intr_window = 1;
1670271415Sneel		goto done;
1671271415Sneel	}
1672254677Sgrehan
1673271415Sneel	svm_eventinject(sc, vcpu, VMCB_EVENTINJ_TYPE_INTR, vector, 0, false);
1674271415Sneel
1675329321Savg	if (!extint_pending) {
1676329321Savg		vlapic_intr_accepted(vlapic, vector);
1677329321Savg	} else {
1678329321Savg		vm_extint_clear(sc->vm, vcpu);
1679329321Savg		vatpic_intr_accepted(sc->vm, vector);
1680329321Savg	}
1681271661Sneel
1682271661Sneel	/*
1683271661Sneel	 * Force a VM-exit as soon as the vcpu is ready to accept another
1684271661Sneel	 * interrupt. This is done because the PIC might have another vector
1685271661Sneel	 * that it wants to inject. Also, if the APIC has a pending interrupt
1686271661Sneel	 * that was preempted by the ExtInt, this lets us inject the APIC
1687271661Sneel	 * vector as soon as possible.
1688271661Sneel	 */
1689271661Sneel	need_intr_window = 1;
1690271661Sneeldone:
1691271661Sneel	/*
1692271661Sneel	 * The guest can modify the TPR by writing to %CR8. In guest mode
1693271661Sneel	 * the processor reflects this write to V_TPR without hypervisor
1694271661Sneel	 * intervention.
1695271661Sneel	 *
1696271661Sneel	 * The guest can also modify the TPR by writing to it via the memory
1697271661Sneel	 * mapped APIC page. In this case, the write will be emulated by the
1698271661Sneel	 * hypervisor. For this reason V_TPR must be updated before every
1699271661Sneel	 * VMRUN.
1700271661Sneel	 */
1701271661Sneel	v_tpr = vlapic_get_cr8(vlapic);
1702284894Sneel	KASSERT(v_tpr <= 15, ("invalid v_tpr %#x", v_tpr));
1703271661Sneel	if (ctrl->v_tpr != v_tpr) {
1704271661Sneel		VCPU_CTR2(sc->vm, vcpu, "VMCB V_TPR changed from %#x to %#x",
1705271661Sneel		    ctrl->v_tpr, v_tpr);
1706271661Sneel		ctrl->v_tpr = v_tpr;
1707271939Sneel		svm_set_dirty(sc, vcpu, VMCB_CACHE_TPR);
1708271661Sneel	}
1709271661Sneel
1710329321Savg	if (need_intr_window) {
1711271415Sneel		/*
1712271415Sneel		 * We use V_IRQ in conjunction with the VINTR intercept to
1713271415Sneel		 * trap into the hypervisor as soon as a virtual interrupt
1714271415Sneel		 * can be delivered.
1715271415Sneel		 *
1716271415Sneel		 * Since injected events are not subject to intercept checks
1717271415Sneel		 * we need to ensure that the V_IRQ is not actually going to
1718271415Sneel		 * be delivered on VM entry. The KASSERT below enforces this.
1719271415Sneel		 */
1720271415Sneel		KASSERT((ctrl->eventinj & VMCB_EVENTINJ_VALID) != 0 ||
1721271415Sneel		    (state->rflags & PSL_I) == 0 || ctrl->intr_shadow,
1722271415Sneel		    ("Bogus intr_window_exiting: eventinj (%#lx), "
1723271415Sneel		    "intr_shadow (%u), rflags (%#lx)",
1724271415Sneel		    ctrl->eventinj, ctrl->intr_shadow, state->rflags));
1725271415Sneel		enable_intr_window_exiting(sc, vcpu);
1726271415Sneel	} else {
1727271415Sneel		disable_intr_window_exiting(sc, vcpu);
1728271415Sneel	}
1729254677Sgrehan}
1730254677Sgrehan
1731271203Sneelstatic __inline void
1732271086Sneelrestore_host_tss(void)
1733254677Sgrehan{
1734271086Sneel	struct system_segment_descriptor *tss_sd;
1735254677Sgrehan
1736254677Sgrehan	/*
1737271086Sneel	 * The TSS descriptor was in use prior to launching the guest, so it
1738271086Sneel	 * has been marked busy.
1739271086Sneel	 *
1740271086Sneel	 * 'ltr' requires the descriptor to be marked available so change the
1741271086Sneel	 * type to "64-bit available TSS".
1742254677Sgrehan	 */
1743271086Sneel	tss_sd = PCPU_GET(tss);
1744271086Sneel	tss_sd->sd_type = SDT_SYSTSS;
1745271086Sneel	ltr(GSEL(GPROC0_SEL, SEL_KPL));
1746254677Sgrehan}
1747254677Sgrehan
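/*
 * Decide whether the vcpu needs a fresh ASID and/or a TLB flush before the
 * next VMRUN so that stale guest translations are never used.
 */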
1748271203Sneelstatic void
1749271203Sneelcheck_asid(struct svm_softc *sc, int vcpuid, pmap_t pmap, u_int thiscpu)
1750271203Sneel{
1751271203Sneel	struct svm_vcpu *vcpustate;
1752271203Sneel	struct vmcb_ctrl *ctrl;
1753271203Sneel	long eptgen;
1754271203Sneel	bool alloc_asid;
1755271203Sneel
1756271203Sneel	KASSERT(CPU_ISSET(thiscpu, &pmap->pm_active), ("%s: nested pmap not "
1757271203Sneel	    "active on cpu %u", __func__, thiscpu));
1758271203Sneel
1759271203Sneel	vcpustate = svm_get_vcpu(sc, vcpuid);
1760271203Sneel	ctrl = svm_get_vmcb_ctrl(sc, vcpuid);
1761271203Sneel
1762271203Sneel	/*
1763271203Sneel	 * The TLB entries associated with the vcpu's ASID are not valid
1764271203Sneel	 * if either of the following conditions is true:
1765271203Sneel	 *
1766271203Sneel	 * 1. The vcpu's ASID generation is different than the host cpu's
1767271203Sneel	 *    ASID generation. This happens when the vcpu migrates to a new
1768271203Sneel	 *    host cpu. It can also happen when the number of vcpus executing
1769271203Sneel	 *    on a host cpu is greater than the number of ASIDs available.
1770271203Sneel	 *
1771271203Sneel	 * 2. The pmap generation number is different than the value cached in
1772271203Sneel	 *    the 'vcpustate'. This happens when the host invalidates pages
1773271203Sneel	 *    belonging to the guest.
1774271203Sneel	 *
1775271203Sneel	 *	asidgen		eptgen	      Action
1776271203Sneel	 *	mismatch	mismatch
1777271203Sneel	 *	   0		   0		(a)
1778271203Sneel	 *	   0		   1		(b1) or (b2)
1779271203Sneel	 *	   1		   0		(c)
1780271203Sneel	 *	   1		   1		(d)
1781271203Sneel	 *
1782271203Sneel	 * (a) There is no mismatch in eptgen or ASID generation and therefore
1783271203Sneel	 *     no further action is needed.
1784271203Sneel	 *
1785271203Sneel	 * (b1) If the cpu supports FlushByAsid then the vcpu's ASID is
1786271203Sneel	 *      retained and the TLB entries associated with this ASID
1787271203Sneel	 *      are flushed by VMRUN.
1788271203Sneel	 *
1789271203Sneel	 * (b2) If the cpu does not support FlushByAsid then a new ASID is
1790271203Sneel	 *      allocated.
1791271203Sneel	 *
1792271203Sneel	 * (c) A new ASID is allocated.
1793271203Sneel	 *
1794271203Sneel	 * (d) A new ASID is allocated.
1795271203Sneel	 */
1796271203Sneel
1797271203Sneel	alloc_asid = false;
1798271203Sneel	eptgen = pmap->pm_eptgen;
1799271203Sneel	ctrl->tlb_ctrl = VMCB_TLB_FLUSH_NOTHING;
1800271203Sneel
1801271203Sneel	if (vcpustate->asid.gen != asid[thiscpu].gen) {
1802271203Sneel		alloc_asid = true;	/* (c) and (d) */
1803271203Sneel	} else if (vcpustate->eptgen != eptgen) {
1804271203Sneel		if (flush_by_asid())
1805271203Sneel			ctrl->tlb_ctrl = VMCB_TLB_FLUSH_GUEST;	/* (b1) */
1806271203Sneel		else
1807271203Sneel			alloc_asid = true;			/* (b2) */
1808271203Sneel	} else {
1809271203Sneel		/*
1810271203Sneel		 * This is the common case (a).
1811271203Sneel		 */
1812271203Sneel		KASSERT(!alloc_asid, ("ASID allocation not necessary"));
1813271203Sneel		KASSERT(ctrl->tlb_ctrl == VMCB_TLB_FLUSH_NOTHING,
1814271203Sneel		    ("Invalid VMCB tlb_ctrl: %#x", ctrl->tlb_ctrl));
1815271203Sneel	}
1816271203Sneel
1817271203Sneel	if (alloc_asid) {
1818271203Sneel		if (++asid[thiscpu].num >= nasid) {
1819271203Sneel			asid[thiscpu].num = 1;
1820271203Sneel			if (++asid[thiscpu].gen == 0)
1821271203Sneel				asid[thiscpu].gen = 1;
1822271203Sneel			/*
1823271203Sneel			 * If this cpu does not support "flush-by-asid"
1824271203Sneel			 * then flush the entire TLB on a generation
1825271203Sneel			 * bump. Subsequent ASID allocation in this
1826271203Sneel			 * generation can be done without a TLB flush.
1827271203Sneel			 */
1828271203Sneel			if (!flush_by_asid())
1829271203Sneel				ctrl->tlb_ctrl = VMCB_TLB_FLUSH_ALL;
1830271203Sneel		}
1831271203Sneel		vcpustate->asid.gen = asid[thiscpu].gen;
1832271203Sneel		vcpustate->asid.num = asid[thiscpu].num;
1833271203Sneel
1834271203Sneel		ctrl->asid = vcpustate->asid.num;
1835271939Sneel		svm_set_dirty(sc, vcpuid, VMCB_CACHE_ASID);
1836271203Sneel		/*
1837271203Sneel		 * If this cpu supports "flush-by-asid" then the TLB
1838271203Sneel		 * was not flushed after the generation bump. The TLB
1839271203Sneel		 * is flushed selectively after every new ASID allocation.
1840271203Sneel		 */
1841271203Sneel		if (flush_by_asid())
1842271203Sneel			ctrl->tlb_ctrl = VMCB_TLB_FLUSH_GUEST;
1843271203Sneel	}
1844271203Sneel	vcpustate->eptgen = eptgen;
1845271203Sneel
1846271203Sneel	KASSERT(ctrl->asid != 0, ("Guest ASID must be non-zero"));
1847271203Sneel	KASSERT(ctrl->asid == vcpustate->asid.num,
1848271203Sneel	    ("ASID mismatch: %u/%u", ctrl->asid, vcpustate->asid.num));
1849271203Sneel}
1850271203Sneel
1851272929Sneelstatic __inline void
1852272929Sneeldisable_gintr(void)
1853272929Sneel{
1854272929Sneel
1855284894Sneel	__asm __volatile("clgi");
1856272929Sneel}
1857272929Sneel
1858272929Sneelstatic __inline void
1859272929Sneelenable_gintr(void)
1860272929Sneel{
1861272929Sneel
1862284894Sneel	__asm __volatile("stgi");
1863272929Sneel}
1864272929Sneel
1865254677Sgrehan/*
1866254677Sgrehan * Start vcpu with specified RIP.
1867254677Sgrehan */
1868254677Sgrehanstatic int
1869267003Sgrehansvm_vmrun(void *arg, int vcpu, register_t rip, pmap_t pmap,
1870284900Sneel	struct vm_eventinfo *evinfo)
1871254677Sgrehan{
1872272195Sneel	struct svm_regctx *gctx;
1873254677Sgrehan	struct svm_softc *svm_sc;
1874254677Sgrehan	struct svm_vcpu *vcpustate;
1875254677Sgrehan	struct vmcb_state *state;
1876254677Sgrehan	struct vmcb_ctrl *ctrl;
1877254677Sgrehan	struct vm_exit *vmexit;
1878267003Sgrehan	struct vlapic *vlapic;
1879267003Sgrehan	struct vm *vm;
1880254677Sgrehan	uint64_t vmcb_pa;
1881271570Sneel	int handled;
1882254677Sgrehan
1883254677Sgrehan	svm_sc = arg;
1884267003Sgrehan	vm = svm_sc->vm;
1885267003Sgrehan
1886254677Sgrehan	vcpustate = svm_get_vcpu(svm_sc, vcpu);
1887259579Sgrehan	state = svm_get_vmcb_state(svm_sc, vcpu);
1888259579Sgrehan	ctrl = svm_get_vmcb_ctrl(svm_sc, vcpu);
1889267003Sgrehan	vmexit = vm_exitinfo(vm, vcpu);
1890267003Sgrehan	vlapic = vm_lapic(vm, vcpu);
1891254677Sgrehan
1892254677Sgrehan	gctx = svm_get_guest_regctx(svm_sc, vcpu);
1893254677Sgrehan	vmcb_pa = svm_sc->vcpu[vcpu].vmcb_pa;
1894254677Sgrehan
1895285015Sneel	if (vcpustate->lastcpu != curcpu) {
1896271203Sneel		/*
1897271203Sneel		 * Force new ASID allocation by invalidating the generation.
1898271203Sneel		 */
1899271203Sneel		vcpustate->asid.gen = 0;
1900254677Sgrehan
1901254677Sgrehan		/*
1902271203Sneel		 * Invalidate the VMCB state cache by marking all fields dirty.
1903254677Sgrehan		 */
1904271939Sneel		svm_set_dirty(svm_sc, vcpu, 0xffffffff);
1905254677Sgrehan
1906256867Sneel		/*
1907271203Sneel		 * XXX
1908271203Sneel		 * Setting 'vcpustate->lastcpu' here is a bit premature because
1909271203Sneel		 * we may return from this function without actually executing
1910271203Sneel		 * the VMRUN instruction. This could happen if a rendezvous
1911271203Sneel		 * or an AST is pending on the first time through the loop.
1912254677Sgrehan		 *
1913271203Sneel		 * This works for now but any new side-effects of vcpu
1914271203Sneel		 * migration should take this case into account.
1915254677Sgrehan		 */
1916285015Sneel		vcpustate->lastcpu = curcpu;
1917271203Sneel		vmm_stat_incr(vm, vcpu, VCPU_MIGRATIONS, 1);
1918254677Sgrehan	}
1919254677Sgrehan
1920271912Sneel	svm_msr_guest_enter(svm_sc, vcpu);
1921271912Sneel
1922254677Sgrehan	/* Update Guest RIP */
1923254677Sgrehan	state->rip = rip;
1924271203Sneel
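	/*
	 * Run the vcpu, handling #VMEXITs in the kernel, until an exit
	 * requires a return to userland or the vcpu must stop (suspend,
	 * rendezvous, reqidle or a pending AST).
	 */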
1925254677Sgrehan	do {
1926267367Sneel		/*
1927267367Sneel		 * Disable global interrupts to guarantee atomicity during
1928267367Sneel		 * loading of guest state. This includes not only the state
1929267367Sneel		 * loaded by the "vmrun" instruction but also software state
1930267367Sneel		 * maintained by the hypervisor: suspended and rendezvous
1931267367Sneel		 * state, NPT generation number, vlapic interrupts etc.
1932267367Sneel		 */
1933267367Sneel		disable_gintr();
1934267367Sneel
1935284900Sneel		if (vcpu_suspended(evinfo)) {
1936267367Sneel			enable_gintr();
1937267032Sgrehan			vm_exit_suspended(vm, vcpu, state->rip);
1938267003Sgrehan			break;
1939267003Sgrehan		}
1940267003Sgrehan
1941284900Sneel		if (vcpu_rendezvous_pending(evinfo)) {
1942267367Sneel			enable_gintr();
1943271343Sneel			vm_exit_rendezvous(vm, vcpu, state->rip);
1944267003Sgrehan			break;
1945267003Sgrehan		}
1946267003Sgrehan
1947284900Sneel		if (vcpu_reqidle(evinfo)) {
1948284900Sneel			enable_gintr();
1949284900Sneel			vm_exit_reqidle(vm, vcpu, state->rip);
1950284900Sneel			break;
1951284900Sneel		}
1952284900Sneel
1953267367Sneel		/* The scheduler has asked us to yield the cpu. */
1954284899Sneel		if (vcpu_should_yield(vm, vcpu)) {
1955267367Sneel			enable_gintr();
1956271343Sneel			vm_exit_astpending(vm, vcpu, state->rip);
1957267367Sneel			break;
1958267367Sneel		}
1959267367Sneel
1960270511Sneel		svm_inj_interrupts(svm_sc, vcpu, vlapic);
1961267367Sneel
1962285015Sneel		/* Activate the nested pmap on 'curcpu' */
1963285015Sneel		CPU_SET_ATOMIC_ACQ(curcpu, &pmap->pm_active);
1964271203Sneel
1965271203Sneel		/*
1966271203Sneel		 * Check the pmap generation and the ASID generation to
1967271203Sneel		 * ensure that the vcpu does not use stale TLB mappings.
1968271203Sneel		 */
1969285015Sneel		check_asid(svm_sc, vcpu, pmap, curcpu);
1970271203Sneel
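		/*
		 * Advertise which VMCB fields are unchanged since the last
		 * VMRUN on this cpu; dirty fields must have their clean bits
		 * cleared so the processor reloads them from memory.
		 */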
1971271939Sneel		ctrl->vmcb_clean = vmcb_clean & ~vcpustate->dirty;
1972271203Sneel		vcpustate->dirty = 0;
1973271343Sneel		VCPU_CTR1(vm, vcpu, "vmcb clean %#x", ctrl->vmcb_clean);
1974271203Sneel
1975254677Sgrehan		/* Launch Virtual Machine. */
1976271343Sneel		VCPU_CTR1(vm, vcpu, "Resume execution at %#lx", state->rip);
1977285015Sneel		svm_launch(vmcb_pa, gctx, &__pcpu[curcpu]);
1978267367Sneel
1979285015Sneel		CPU_CLR_ATOMIC(curcpu, &pmap->pm_active);
1980271203Sneel
1981254677Sgrehan		/*
1982271086Sneel		 * The host GDTR and IDTR are saved by VMRUN and restored
1983271086Sneel		 * automatically on #VMEXIT. However, the host TSS needs
1984271086Sneel		 * to be restored explicitly.
1985254677Sgrehan		 */
1986271086Sneel		restore_host_tss();
1987267367Sneel
1988267367Sneel		/* #VMEXIT disables interrupts so re-enable them here. */
1989254677Sgrehan		enable_gintr();
1990267367Sneel
1991284894Sneel		/* Update 'nextrip' */
1992284894Sneel		vcpustate->nextrip = state->rip;
1993284894Sneel
1994254677Sgrehan		/* Handle #VMEXIT and if required return to user space. */
1995271570Sneel		handled = svm_vmexit(svm_sc, vcpu, vmexit);
1996271570Sneel	} while (handled);
1997271343Sneel
1998271912Sneel	svm_msr_guest_exit(svm_sc, vcpu);
1999271912Sneel
2000254677Sgrehan	return (0);
2001254677Sgrehan}
2002254677Sgrehan
2003254677Sgrehanstatic void
2004254677Sgrehansvm_vmcleanup(void *arg)
2005254677Sgrehan{
2006272929Sneel	struct svm_softc *sc = arg;
2007254677Sgrehan
2008328842Savg	contigfree(sc->iopm_bitmap, SVM_IO_BITMAP_SIZE, M_SVM);
2009328842Savg	contigfree(sc->msr_bitmap, SVM_MSR_BITMAP_SIZE, M_SVM);
2010328842Savg	free(sc, M_SVM);
2011254677Sgrehan}
2012254677Sgrehan
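/*
 * Return a pointer to the given guest register in the software-saved
 * context, or NULL if the register is not kept there (e.g. it lives in
 * the VMCB instead).
 */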
2013254677Sgrehanstatic register_t *
2014254677Sgrehanswctx_regptr(struct svm_regctx *regctx, int reg)
2015254677Sgrehan{
2016254677Sgrehan
2017254677Sgrehan	switch (reg) {
2018272929Sneel	case VM_REG_GUEST_RBX:
2019272929Sneel		return (&regctx->sctx_rbx);
2020272929Sneel	case VM_REG_GUEST_RCX:
2021272929Sneel		return (&regctx->sctx_rcx);
2022272929Sneel	case VM_REG_GUEST_RDX:
2023272929Sneel		return (&regctx->sctx_rdx);
2024272929Sneel	case VM_REG_GUEST_RDI:
2025272929Sneel		return (&regctx->sctx_rdi);
2026272929Sneel	case VM_REG_GUEST_RSI:
2027272929Sneel		return (&regctx->sctx_rsi);
2028272929Sneel	case VM_REG_GUEST_RBP:
2029272929Sneel		return (&regctx->sctx_rbp);
2030272929Sneel	case VM_REG_GUEST_R8:
2031272929Sneel		return (&regctx->sctx_r8);
2032272929Sneel	case VM_REG_GUEST_R9:
2033272929Sneel		return (&regctx->sctx_r9);
2034272929Sneel	case VM_REG_GUEST_R10:
2035272929Sneel		return (&regctx->sctx_r10);
2036272929Sneel	case VM_REG_GUEST_R11:
2037272929Sneel		return (&regctx->sctx_r11);
2038272929Sneel	case VM_REG_GUEST_R12:
2039272929Sneel		return (&regctx->sctx_r12);
2040272929Sneel	case VM_REG_GUEST_R13:
2041272929Sneel		return (&regctx->sctx_r13);
2042272929Sneel	case VM_REG_GUEST_R14:
2043272929Sneel		return (&regctx->sctx_r14);
2044272929Sneel	case VM_REG_GUEST_R15:
2045272929Sneel		return (&regctx->sctx_r15);
2046272929Sneel	default:
2047272929Sneel		return (NULL);
2048254677Sgrehan	}
2049254677Sgrehan}
2050254677Sgrehan
2051254677Sgrehanstatic int
2052254677Sgrehansvm_getreg(void *arg, int vcpu, int ident, uint64_t *val)
2053254677Sgrehan{
2054254677Sgrehan	struct svm_softc *svm_sc;
2055254677Sgrehan	register_t *reg;
2056271715Sneel
2057254677Sgrehan	svm_sc = arg;
2058254677Sgrehan
2059271715Sneel	if (ident == VM_REG_GUEST_INTR_SHADOW) {
2060271715Sneel		return (svm_get_intr_shadow(svm_sc, vcpu, val));
2061271715Sneel	}
2062271715Sneel
2063271939Sneel	if (vmcb_read(svm_sc, vcpu, ident, val) == 0) {
2064254677Sgrehan		return (0);
2065254677Sgrehan	}
2066254677Sgrehan
2067254677Sgrehan	reg = swctx_regptr(svm_get_guest_regctx(svm_sc, vcpu), ident);
2068259579Sgrehan
2069254677Sgrehan	if (reg != NULL) {
2070254677Sgrehan		*val = *reg;
2071254677Sgrehan		return (0);
2072254677Sgrehan	}
2073254677Sgrehan
2074272929Sneel	VCPU_CTR1(svm_sc->vm, vcpu, "svm_getreg: unknown register %#x", ident);
2075254677Sgrehan	return (EINVAL);
2076254677Sgrehan}
2077254677Sgrehan
2078254677Sgrehanstatic int
2079254677Sgrehansvm_setreg(void *arg, int vcpu, int ident, uint64_t val)
2080254677Sgrehan{
2081254677Sgrehan	struct svm_softc *svm_sc;
2082254677Sgrehan	register_t *reg;
2083254677Sgrehan
2084254677Sgrehan	svm_sc = arg;
2085271715Sneel
2086271715Sneel	if (ident == VM_REG_GUEST_INTR_SHADOW) {
2087271715Sneel		return (svm_modify_intr_shadow(svm_sc, vcpu, val));
2088271715Sneel	}
2089271715Sneel
2090271939Sneel	if (vmcb_write(svm_sc, vcpu, ident, val) == 0) {
2091254677Sgrehan		return (0);
2092254677Sgrehan	}
2093254677Sgrehan
2094254677Sgrehan	reg = swctx_regptr(svm_get_guest_regctx(svm_sc, vcpu), ident);
2095271203Sneel
2096254677Sgrehan	if (reg != NULL) {
2097254677Sgrehan		*reg = val;
2098254677Sgrehan		return (0);
2099254677Sgrehan	}
2100254677Sgrehan
2101271203Sneel	/*
2102271203Sneel	 * XXX deal with CR3 and invalidate TLB entries tagged with the
2103271203Sneel	 * vcpu's ASID. This needs to be treated differently depending on
2104271203Sneel	 * whether 'running' is true/false.
2105271203Sneel	 */
2106271203Sneel
2107272929Sneel	VCPU_CTR1(svm_sc->vm, vcpu, "svm_setreg: unknown register %#x", ident);
2108254677Sgrehan	return (EINVAL);
2109254677Sgrehan}
2110254677Sgrehan
2111254677Sgrehanstatic int
2112254677Sgrehansvm_setcap(void *arg, int vcpu, int type, int val)
2113254677Sgrehan{
2114271348Sneel	struct svm_softc *sc;
2115271348Sneel	int error;
2116254677Sgrehan
2117271348Sneel	sc = arg;
2118271348Sneel	error = 0;
2119254677Sgrehan	switch (type) {
2120271348Sneel	case VM_CAP_HALT_EXIT:
2121271348Sneel		svm_set_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
2122271348Sneel		    VMCB_INTCPT_HLT, val);
2123271348Sneel		break;
2124271348Sneel	case VM_CAP_PAUSE_EXIT:
2125271348Sneel		svm_set_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
2126271348Sneel		    VMCB_INTCPT_PAUSE, val);
2127271348Sneel		break;
2128271348Sneel	case VM_CAP_UNRESTRICTED_GUEST:
2129271348Sneel		/* Unrestricted guest execution cannot be disabled in SVM */
2130271348Sneel		if (val == 0)
2131271348Sneel			error = EINVAL;
2132271348Sneel		break;
2133271348Sneel	default:
2134271348Sneel		error = ENOENT;
2135271348Sneel		break;
2136271348Sneel	}
2137271348Sneel	return (error);
2138254677Sgrehan}
2139254677Sgrehan
2140254677Sgrehanstatic int
2141254677Sgrehansvm_getcap(void *arg, int vcpu, int type, int *retval)
2142254677Sgrehan{
2143271348Sneel	struct svm_softc *sc;
2144271348Sneel	int error;
2145254677Sgrehan
2146271348Sneel	sc = arg;
2147271348Sneel	error = 0;
2148254677Sgrehan
2149254677Sgrehan	switch (type) {
2150271348Sneel	case VM_CAP_HALT_EXIT:
2151271348Sneel		*retval = svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
2152271348Sneel		    VMCB_INTCPT_HLT);
2153254677Sgrehan		break;
2154271348Sneel	case VM_CAP_PAUSE_EXIT:
2155271348Sneel		*retval = svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
2156271348Sneel		    VMCB_INTCPT_PAUSE);
2157254677Sgrehan		break;
2158254677Sgrehan	case VM_CAP_UNRESTRICTED_GUEST:
2159271348Sneel		*retval = 1;	/* unrestricted guest is always enabled */
2160254677Sgrehan		break;
2161271348Sneel	default:
2162271348Sneel		error = ENOENT;
2163254677Sgrehan		break;
2164254677Sgrehan	}
2165271348Sneel	return (error);
2166254677Sgrehan}
2167254677Sgrehan
2168267003Sgrehanstatic struct vlapic *
2169267003Sgrehansvm_vlapic_init(void *arg, int vcpuid)
2170267003Sgrehan{
2171267003Sgrehan	struct svm_softc *svm_sc;
2172267003Sgrehan	struct vlapic *vlapic;
2173267003Sgrehan
2174267003Sgrehan	svm_sc = arg;
2175267003Sgrehan	vlapic = malloc(sizeof(struct vlapic), M_SVM_VLAPIC, M_WAITOK | M_ZERO);
2176267003Sgrehan	vlapic->vm = svm_sc->vm;
2177267003Sgrehan	vlapic->vcpuid = vcpuid;
2178267003Sgrehan	vlapic->apic_page = (struct LAPIC *)&svm_sc->apic_page[vcpuid];
2179267003Sgrehan
2180267003Sgrehan	vlapic_init(vlapic);
2181272929Sneel
2182267003Sgrehan	return (vlapic);
2183267003Sgrehan}
2184267003Sgrehan
2185267003Sgrehanstatic void
2186267003Sgrehansvm_vlapic_cleanup(void *arg, struct vlapic *vlapic)
2187267003Sgrehan{
2188267003Sgrehan
2189267003Sgrehan	vlapic_cleanup(vlapic);
2190267003Sgrehan	free(vlapic, M_SVM_VLAPIC);
2191267003Sgrehan}
2192267003Sgrehan
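/*
 * SVM entry points used by the machine-independent vmm code. The
 * initializer is positional, so the order must match the declaration
 * of 'struct vmm_ops'.
 */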
2193254677Sgrehanstruct vmm_ops vmm_ops_amd = {
2194254677Sgrehan	svm_init,
2195254677Sgrehan	svm_cleanup,
2196267003Sgrehan	svm_restore,
2197254677Sgrehan	svm_vminit,
2198254677Sgrehan	svm_vmrun,
2199254677Sgrehan	svm_vmcleanup,
2200254677Sgrehan	svm_getreg,
2201254677Sgrehan	svm_setreg,
2202271939Sneel	vmcb_getdesc,
2203271939Sneel	vmcb_setdesc,
2204254677Sgrehan	svm_getcap,
2205259579Sgrehan	svm_setcap,
2206259579Sgrehan	svm_npt_alloc,
2207267003Sgrehan	svm_npt_free,
2208267003Sgrehan	svm_vlapic_init,
2209267003Sgrehan	svm_vlapic_cleanup
2210254677Sgrehan};
2211