/*-
 * Copyright (c) 2013, Anish Gupta (akgupt3@gmail.com)
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: projects/bhyve_svm/sys/amd64/vmm/amd/svm.c 271570 2014-09-14 04:39:04Z neel $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/smp.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/pcpu.h>
#include <sys/proc.h>
#include <sys/sysctl.h>

#include <vm/vm.h>
#include <vm/pmap.h>

#include <machine/cpufunc.h>
#include <machine/psl.h>
#include <machine/pmap.h>
#include <machine/md_var.h>
#include <machine/vmparam.h>
#include <machine/specialreg.h>
#include <machine/segments.h>
#include <machine/smp.h>
#include <machine/vmm.h>
#include <machine/vmm_dev.h>
#include <machine/vmm_instruction_emul.h>

#include <x86/apicreg.h>

#include "vmm_lapic.h"
#include "vmm_msr.h"
#include "vmm_stat.h"
#include "vmm_ktr.h"
#include "vmm_ioport.h"
#include "vatpic.h"
#include "vlapic.h"
#include "vlapic_priv.h"

#include "x86.h"
#include "vmcb.h"
#include "svm.h"
#include "svm_softc.h"
#include "npt.h"

SYSCTL_DECL(_hw_vmm);
SYSCTL_NODE(_hw_vmm, OID_AUTO, svm, CTLFLAG_RW, NULL, NULL);

/*
 * SVM CPUID function 0x8000_000A, edx bit decoding.
 */
#define AMD_CPUID_SVM_NP		BIT(0)  /* Nested paging or RVI */
#define AMD_CPUID_SVM_LBR		BIT(1)  /* Last branch virtualization */
#define AMD_CPUID_SVM_SVML		BIT(2)  /* SVM lock */
#define AMD_CPUID_SVM_NRIP_SAVE		BIT(3)  /* Next RIP is saved */
#define AMD_CPUID_SVM_TSC_RATE		BIT(4)  /* TSC rate control. */
#define AMD_CPUID_SVM_VMCB_CLEAN	BIT(5)  /* VMCB state caching */
#define AMD_CPUID_SVM_FLUSH_BY_ASID	BIT(6)  /* Flush by ASID */
#define AMD_CPUID_SVM_DECODE_ASSIST	BIT(7)  /* Decode assist */
#define AMD_CPUID_SVM_PAUSE_INC		BIT(10) /* Pause intercept filter. */
#define AMD_CPUID_SVM_PAUSE_FTH		BIT(12) /* Pause filter threshold */

#define	VMCB_CACHE_DEFAULT	(VMCB_CACHE_ASID 	|	\
				VMCB_CACHE_IOPM		|	\
				VMCB_CACHE_I		|	\
				VMCB_CACHE_TPR		|	\
				VMCB_CACHE_NP)

MALLOC_DEFINE(M_SVM, "svm", "svm");
MALLOC_DEFINE(M_SVM_VLAPIC, "svm-vlapic", "svm-vlapic");

/* Per-CPU context area. */
extern struct pcpu __pcpu[];

static int svm_getdesc(void *arg, int vcpu, int type, struct seg_desc *desc);

static uint32_t svm_feature;	/* AMD SVM features. */
SYSCTL_UINT(_hw_vmm_svm, OID_AUTO, features, CTLFLAG_RD, &svm_feature, 0,
    "SVM features advertised by CPUID.8000000AH:EDX");

static int disable_npf_assist;
SYSCTL_INT(_hw_vmm_svm, OID_AUTO, disable_npf_assist, CTLFLAG_RWTUN,
    &disable_npf_assist, 0, NULL);

/* Maximum ASIDs supported by the processor */
static uint32_t nasid;
SYSCTL_UINT(_hw_vmm_svm, OID_AUTO, num_asids, CTLFLAG_RD, &nasid, 0,
    "Number of ASIDs supported by this processor");

/* Current ASID generation for each host cpu */
static struct asid asid[MAXCPU];

/*
 * SVM host state saved area of size 4KB for each core.
 */
static uint8_t hsave[MAXCPU][PAGE_SIZE] __aligned(PAGE_SIZE);

/*
 * S/w saved host context.
 */
static struct svm_regctx host_ctx[MAXCPU];

static VMM_STAT_AMD(VCPU_EXITINTINFO, "VM exits during event delivery");
static VMM_STAT_AMD(VCPU_INTINFO_INJECTED, "Events pending at VM entry");
static VMM_STAT_AMD(VMEXIT_VINTR, "VM exits due to interrupt window");

/*
 * Common function to enable or disable SVM for a CPU.
 */
static int
cpu_svm_enable_disable(boolean_t enable)
{
	uint64_t efer_msr;

	efer_msr = rdmsr(MSR_EFER);

	if (enable)
		efer_msr |= EFER_SVM;
	else
		efer_msr &= ~EFER_SVM;

	wrmsr(MSR_EFER, efer_msr);

	return(0);
}

/*
 * Disable SVM on a CPU.
 */
static void
svm_disable(void *arg __unused)
{

	(void)cpu_svm_enable_disable(FALSE);
}

/*
 * Disable SVM for all CPUs.
 */
static int
svm_cleanup(void)
{

	smp_rendezvous(NULL, svm_disable, NULL, NULL);
	return (0);
}

/*
 * Check for required bhyve SVM features in a CPU.
 */
static int
svm_cpuid_features(void)
{
	u_int regs[4];

	/* CPUID Fn8000_000A is for SVM */
	do_cpuid(0x8000000A, regs);
	svm_feature = regs[3];

	printf("SVM rev: 0x%x NASID:0x%x\n", regs[0] & 0xFF, regs[1]);
	nasid = regs[1];
	KASSERT(nasid > 1, ("Insufficient ASIDs for guests: %#x", nasid));

	printf("SVM Features:0x%b\n", svm_feature,
		"\020"
		"\001NP"		/* Nested paging */
		"\002LbrVirt"		/* LBR virtualization */
		"\003SVML"		/* SVM lock */
		"\004NRIPS"		/* NRIP save */
		"\005TscRateMsr"	/* MSR based TSC rate control */
		"\006VmcbClean"		/* VMCB clean bits */
		"\007FlushByAsid"	/* Flush by ASID */
		"\010DecodeAssist"	/* Decode assist */
		"\011<b20>"
		"\012<b20>"
		"\013PauseFilter"
		"\014<b20>"
		"\015PauseFilterThreshold"
		"\016AVIC"
		);

	/* SVM Lock */
	if (!(svm_feature & AMD_CPUID_SVM_SVML)) {
		printf("SVM is disabled by BIOS, please enable in BIOS.\n");
		return (ENXIO);
	}

	/*
	 * bhyve needs RVI to work.
	 */
	if (!(svm_feature & AMD_CPUID_SVM_NP)) {
		printf("Missing Nested paging or RVI SVM support in processor.\n");
		return (EIO);
	}

	if (svm_feature & AMD_CPUID_SVM_NRIP_SAVE)
		return (0);

	return (EIO);
}

static __inline int
flush_by_asid(void)
{

	return (svm_feature & AMD_CPUID_SVM_FLUSH_BY_ASID);
}

static __inline int
decode_assist(void)
{

	return (svm_feature & AMD_CPUID_SVM_DECODE_ASSIST);
}

/*
 * Enable SVM for a CPU.
 */
static void
svm_enable(void *arg __unused)
{
	uint64_t hsave_pa;

	(void)cpu_svm_enable_disable(TRUE);

	hsave_pa = vtophys(hsave[curcpu]);
	wrmsr(MSR_VM_HSAVE_PA, hsave_pa);

	if (rdmsr(MSR_VM_HSAVE_PA) != hsave_pa) {
		panic("VM_HSAVE_PA is wrong on CPU%d\n", curcpu);
	}
}

/*
 * Check if a processor supports SVM.
 */
static int
is_svm_enabled(void)
{
	uint64_t msr;

	 /* Section 15.4 Enabling SVM from APM2. */
	if ((amd_feature2 & AMDID2_SVM) == 0) {
		printf("SVM is not supported on this processor.\n");
		return (ENXIO);
	}

	msr = rdmsr(MSR_VM_CR);
	/* Make sure SVM is not disabled by BIOS. */
	if ((msr & VM_CR_SVMDIS) == 0) {
		return svm_cpuid_features();
	}

	printf("SVM disabled by Key, consult TPM/BIOS manual.\n");
	return (ENXIO);
}

/*
 * Enable SVM on CPU and initialize nested page table h/w.
 */
static int
svm_init(int ipinum)
{
	int err, cpu;

	err = is_svm_enabled();
	if (err)
		return (err);

	for (cpu = 0; cpu < MAXCPU; cpu++) {
		/*
		 * Initialize the host ASIDs to their "highest" valid values.
		 *
		 * The next ASID allocation will rollover both 'gen' and 'num'
		 * and start off the sequence at {1,1}.
		 */
		asid[cpu].gen = ~0UL;
		asid[cpu].num = nasid - 1;
	}

	svm_npt_init(ipinum);

	/* Start SVM on all CPUs */
	smp_rendezvous(NULL, svm_enable, NULL, NULL);

	return (0);
}

static void
svm_restore(void)
{
	svm_enable(NULL);
}

/*
 * Get the index and bit position for an MSR in the MSR permission
 * bitmap. Two bits are used for each MSR: the lower bit is for read
 * and the higher bit is for write.
 */
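/*
 * For example, MSR_EFER (0xc0000080) lies in the AMD 6th generation range
 * handled below: 'base' works out to 0x2000 and 'off' to 0x80, giving
 * *index = 0x2080 / 4 = 0x820 and *bit = 0.  Byte 0x820 of the permission
 * bitmap therefore holds the read (bit 0) and write (bit 1) intercept
 * controls for EFER.
 */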
static int
svm_msr_index(uint64_t msr, int *index, int *bit)
{
	uint32_t base, off;

/* Pentium compatible MSRs */
#define MSR_PENTIUM_START 	0
#define MSR_PENTIUM_END 	0x1FFF
/* AMD 6th generation and Intel compatible MSRs */
#define MSR_AMD6TH_START 	0xC0000000UL
#define MSR_AMD6TH_END 		0xC0001FFFUL
/* AMD 7th and 8th generation compatible MSRs */
#define MSR_AMD7TH_START 	0xC0010000UL
#define MSR_AMD7TH_END 		0xC0011FFFUL

	*index = -1;
	*bit = (msr % 4) * 2;
	base = 0;

	if (msr >= MSR_PENTIUM_START && msr <= MSR_PENTIUM_END) {
		*index = msr / 4;
		return (0);
	}

	base += (MSR_PENTIUM_END - MSR_PENTIUM_START + 1);
	if (msr >= MSR_AMD6TH_START && msr <= MSR_AMD6TH_END) {
		off = (msr - MSR_AMD6TH_START);
		*index = (off + base) / 4;
		return (0);
	}

	base += (MSR_AMD6TH_END - MSR_AMD6TH_START + 1);
	if (msr >= MSR_AMD7TH_START && msr <= MSR_AMD7TH_END) {
		off = (msr - MSR_AMD7TH_START);
		*index = (off + base) / 4;
		return (0);
	}

	return (EIO);
}

/*
 * Allow the virtual cpu complete access to an MSR (read & write).
 */
static int
svm_msr_perm(uint8_t *perm_bitmap, uint64_t msr, bool read, bool write)
{
	int index, bit, err;

	err = svm_msr_index(msr, &index, &bit);
	if (err) {
		ERR("MSR 0x%lx is not writeable by guest.\n", msr);
		return (err);
	}

	if (index < 0 || index > (SVM_MSR_BITMAP_SIZE)) {
		ERR("MSR 0x%lx index out of range(%d).\n", msr, index);
		return (EINVAL);
	}
	if (bit < 0 || bit > 8) {
		ERR("MSR 0x%lx bit out of range(%d).\n", msr, bit);
		return (EINVAL);
	}

	/* Disable intercept for read and write. */
	if (read)
		perm_bitmap[index] &= ~(1UL << bit);
	if (write)
		perm_bitmap[index] &= ~(2UL << bit);
	CTR2(KTR_VMM, "Guest has control:0x%x on SVM:MSR(0x%lx).\n",
		(perm_bitmap[index] >> bit) & 0x3, msr);

	return (0);
}

static int
svm_msr_rw_ok(uint8_t *perm_bitmap, uint64_t msr)
{
	return svm_msr_perm(perm_bitmap, msr, true, true);
}

static int
svm_msr_rd_ok(uint8_t *perm_bitmap, uint64_t msr)
{
	return svm_msr_perm(perm_bitmap, msr, true, false);
}

static __inline void
vcpu_set_dirty(struct svm_softc *sc, int vcpu, uint32_t dirtybits)
{
	struct svm_vcpu *vcpustate;

	vcpustate = svm_get_vcpu(sc, vcpu);

	vcpustate->dirty |= dirtybits;
}

static __inline int
svm_get_intercept(struct svm_softc *sc, int vcpu, int idx, uint32_t bitmask)
{
	struct vmcb_ctrl *ctrl;

	KASSERT(idx >=0 && idx < 5, ("invalid intercept index %d", idx));

	ctrl = svm_get_vmcb_ctrl(sc, vcpu);
	return (ctrl->intercept[idx] & bitmask ? 1 : 0);
}

static __inline void
svm_set_intercept(struct svm_softc *sc, int vcpu, int idx, uint32_t bitmask,
    int enabled)
{
	struct vmcb_ctrl *ctrl;
	uint32_t oldval;

	KASSERT(idx >=0 && idx < 5, ("invalid intercept index %d", idx));

	ctrl = svm_get_vmcb_ctrl(sc, vcpu);
	oldval = ctrl->intercept[idx];

	if (enabled)
		ctrl->intercept[idx] |= bitmask;
	else
		ctrl->intercept[idx] &= ~bitmask;

	if (ctrl->intercept[idx] != oldval) {
		vcpu_set_dirty(sc, vcpu, VMCB_CACHE_I);
		VCPU_CTR3(sc->vm, vcpu, "intercept[%d] modified "
		    "from %#x to %#x", idx, oldval, ctrl->intercept[idx]);
	}
}

static __inline void
svm_disable_intercept(struct svm_softc *sc, int vcpu, int off, uint32_t bitmask)
{
	svm_set_intercept(sc, vcpu, off, bitmask, 0);
}

static __inline void
svm_enable_intercept(struct svm_softc *sc, int vcpu, int off, uint32_t bitmask)
{
	svm_set_intercept(sc, vcpu, off, bitmask, 1);
}

static void
vmcb_init(struct svm_softc *sc, int vcpu, uint64_t iopm_base_pa,
    uint64_t msrpm_base_pa, uint64_t np_pml4)
{
	struct vmcb_ctrl *ctrl;
	struct vmcb_state *state;
	uint32_t mask;
	int n;

	ctrl = svm_get_vmcb_ctrl(sc, vcpu);
	state = svm_get_vmcb_state(sc, vcpu);

	ctrl->iopm_base_pa = iopm_base_pa;
	ctrl->msrpm_base_pa = msrpm_base_pa;

	/* Enable nested paging */
	ctrl->np_enable = 1;
	ctrl->n_cr3 = np_pml4;

	/*
	 * Intercept accesses to the control registers that are not shadowed
	 * in the VMCB - i.e. all except cr0, cr2, cr3, cr4 and cr8.
	 */
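	/*
	 * The low word of 'mask' selects the CRn read intercept and the
	 * high word the CRn write intercept; e.g. for %cr1 (n = 1) the mask
	 * is 0x00020002, so both intercepts are toggled together.
	 */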
	for (n = 0; n < 16; n++) {
		mask = (BIT(n) << 16) | BIT(n);
		if (n == 0 || n == 2 || n == 3 || n == 4 || n == 8)
			svm_disable_intercept(sc, vcpu, VMCB_CR_INTCPT, mask);
		else
			svm_enable_intercept(sc, vcpu, VMCB_CR_INTCPT, mask);
	}

	/* Intercept Machine Check exceptions. */
	svm_enable_intercept(sc, vcpu, VMCB_EXC_INTCPT, BIT(IDT_MC));

	/* Intercept various events (for e.g. I/O, MSR and CPUID accesses) */
	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IO);
	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_MSR);
	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_CPUID);
	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_INTR);
	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_INIT);
	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_NMI);
	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_SMI);
	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_SHUTDOWN);
	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
	    VMCB_INTCPT_FERR_FREEZE);

	/*
	 * From section "Canonicalization and Consistency Checks" in APMv2
	 * the VMRUN intercept bit must be set to pass the consistency check.
	 */
	svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_VMRUN);

	/*
	 * The ASID will be set to a non-zero value just before VMRUN.
	 */
	ctrl->asid = 0;

	/*
	 * Section 15.21.1, Interrupt Masking in EFLAGS
	 * Section 15.21.2, Virtualizing APIC.TPR
	 *
	 * This must be set for %rflag and %cr8 isolation of guest and host.
	 */
	ctrl->v_intr_masking = 1;

	/* Enable Last Branch Record aka LBR for debugging */
	ctrl->lbr_virt_en = 1;
	state->dbgctl = BIT(0);

	/* EFER_SVM must always be set when the guest is executing */
	state->efer = EFER_SVM;

	/* Set up the PAT to power-on state */
	state->g_pat = PAT_VALUE(0, PAT_WRITE_BACK)	|
	    PAT_VALUE(1, PAT_WRITE_THROUGH)	|
	    PAT_VALUE(2, PAT_UNCACHED)		|
	    PAT_VALUE(3, PAT_UNCACHEABLE)	|
	    PAT_VALUE(4, PAT_WRITE_BACK)	|
	    PAT_VALUE(5, PAT_WRITE_THROUGH)	|
	    PAT_VALUE(6, PAT_UNCACHED)		|
	    PAT_VALUE(7, PAT_UNCACHEABLE);
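	/*
	 * With the PAT_* encodings above this evaluates to
	 * 0x0007040600070406, the architectural power-on value of the PAT.
	 */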
}

/*
 * Initialise a virtual machine.
 */
static void *
svm_vminit(struct vm *vm, pmap_t pmap)
{
	struct svm_softc *svm_sc;
	struct svm_vcpu *vcpu;
	vm_paddr_t msrpm_pa, iopm_pa, pml4_pa;
	int i;

	svm_sc = (struct svm_softc *)malloc(sizeof (struct svm_softc),
			M_SVM, M_WAITOK | M_ZERO);

	svm_sc->vm = vm;
	svm_sc->svm_feature = svm_feature;
	svm_sc->vcpu_cnt = VM_MAXCPU;
	svm_sc->nptp = (vm_offset_t)vtophys(pmap->pm_pml4);

	/*
	 * Intercept MSR access to all MSRs except GSBASE, FSBASE,... etc.
	 */
	 memset(svm_sc->msr_bitmap, 0xFF, sizeof(svm_sc->msr_bitmap));

	/*
	 * The following MSRs can be completely controlled by the virtual
	 * machine since accesses to them are translated to accesses to the
	 * VMCB.
	 */
	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_GSBASE);
	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_FSBASE);
	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_KGSBASE);

	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_STAR);
	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_LSTAR);
	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_CSTAR);
	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SF_MASK);
	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SYSENTER_CS_MSR);
	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SYSENTER_ESP_MSR);
	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SYSENTER_EIP_MSR);

	/* For Nested Paging/RVI only. */
	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_PAT);

	svm_msr_rd_ok(svm_sc->msr_bitmap, MSR_TSC);
	svm_msr_rd_ok(svm_sc->msr_bitmap, MSR_EFER);

	 /* Intercept access to all I/O ports. */
	memset(svm_sc->iopm_bitmap, 0xFF, sizeof(svm_sc->iopm_bitmap));

	/* Cache physical address for multiple vcpus. */
	iopm_pa = vtophys(svm_sc->iopm_bitmap);
	msrpm_pa = vtophys(svm_sc->msr_bitmap);
	pml4_pa = svm_sc->nptp;

	for (i = 0; i < svm_sc->vcpu_cnt; i++) {
		vcpu = svm_get_vcpu(svm_sc, i);
		vcpu->lastcpu = NOCPU;
		vcpu->vmcb_pa = vtophys(&vcpu->vmcb);
		vmcb_init(svm_sc, i, iopm_pa, msrpm_pa, pml4_pa);
	}
	return (svm_sc);
}

static int
svm_cpl(struct vmcb_state *state)
{

	/*
	 * From APMv2:
	 *   "Retrieve the CPL from the CPL field in the VMCB, not
	 *    from any segment DPL"
	 */
	return (state->cpl);
}

static enum vm_cpu_mode
svm_vcpu_mode(struct vmcb *vmcb)
{
	struct vmcb_segment *seg;
	struct vmcb_state *state;

	state = &vmcb->state;

	if (state->efer & EFER_LMA) {
		seg = vmcb_seg(vmcb, VM_REG_GUEST_CS);
		/*
		 * Section 4.8.1 of APM2: check if the code segment has the
		 * long attribute set in its descriptor.
		 */
		if (seg->attrib & VMCB_CS_ATTRIB_L)
			return (CPU_MODE_64BIT);
		else
			return (CPU_MODE_COMPATIBILITY);
	} else if (state->cr0 & CR0_PE) {
		return (CPU_MODE_PROTECTED);
	} else {
		return (CPU_MODE_REAL);
	}
}

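/*
 * Map the guest's CR0/CR4/EFER state to a paging mode.  For example, a
 * guest running with CR0.PG and CR4.PAE set but EFER.LME clear is
 * reported as PAGING_MODE_PAE.
 */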
static enum vm_paging_mode
svm_paging_mode(uint64_t cr0, uint64_t cr4, uint64_t efer)
{

	if ((cr0 & CR0_PG) == 0)
		return (PAGING_MODE_FLAT);
	if ((cr4 & CR4_PAE) == 0)
		return (PAGING_MODE_32);
	if (efer & EFER_LME)
		return (PAGING_MODE_64);
	else
		return (PAGING_MODE_PAE);
}

/*
 * ins/outs utility routines
 */
static uint64_t
svm_inout_str_index(struct svm_regctx *regs, int in)
{
	uint64_t val;

	val = in ? regs->e.g.sctx_rdi : regs->e.g.sctx_rsi;

	return (val);
}

static uint64_t
svm_inout_str_count(struct svm_regctx *regs, int rep)
{
	uint64_t val;

	val = rep ? regs->sctx_rcx : 1;

	return (val);
}

static void
svm_inout_str_seginfo(struct svm_softc *svm_sc, int vcpu, int64_t info1,
    int in, struct vm_inout_str *vis)
{
	int error, s;

	if (in) {
		vis->seg_name = VM_REG_GUEST_ES;
	} else {
		/* The segment field has standard encoding */
		s = (info1 >> 10) & 0x7;
		vis->seg_name = vm_segment_name(s);
	}

	error = svm_getdesc(svm_sc, vcpu, vis->seg_name, &vis->seg_desc);
	KASSERT(error == 0, ("%s: svm_getdesc error %d", __func__, error));
}

static int
svm_inout_str_addrsize(uint64_t info1)
{
        uint32_t size;

        size = (info1 >> 7) & 0x7;
        switch (size) {
        case 1:
                return (2);     /* 16 bit */
        case 2:
                return (4);     /* 32 bit */
        case 4:
                return (8);     /* 64 bit */
        default:
                panic("%s: invalid size encoding %d", __func__, size);
        }
}

static void
svm_paging_info(struct vmcb *vmcb, struct vm_guest_paging *paging)
{
	struct vmcb_state *state;

	state = &vmcb->state;
	paging->cr3 = state->cr3;
	paging->cpl = svm_cpl(state);
	paging->cpu_mode = svm_vcpu_mode(vmcb);
	paging->paging_mode = svm_paging_mode(state->cr0, state->cr4,
	    state->efer);
}

#define	UNHANDLED 0

/*
 * Handle guest I/O intercept.
 */
static int
svm_handle_io(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit)
{
	struct vmcb_ctrl *ctrl;
	struct vmcb_state *state;
	struct svm_regctx *regs;
	struct vm_inout_str *vis;
	uint64_t info1;
	int inout_string;

	state = svm_get_vmcb_state(svm_sc, vcpu);
	ctrl  = svm_get_vmcb_ctrl(svm_sc, vcpu);
	regs  = svm_get_guest_regctx(svm_sc, vcpu);

	info1 = ctrl->exitinfo1;
	inout_string = info1 & BIT(2) ? 1 : 0;

	/*
	 * The effective segment number in EXITINFO1[12:10] is populated
	 * only if the processor has the DecodeAssist capability.
	 *
	 * XXX this is not specified explicitly in APMv2 but can be verified
	 * empirically.
	 */
	if (inout_string && !decode_assist())
		return (UNHANDLED);

	vmexit->exitcode 	= VM_EXITCODE_INOUT;
	vmexit->u.inout.in 	= (info1 & BIT(0)) ? 1 : 0;
	vmexit->u.inout.string 	= inout_string;
	vmexit->u.inout.rep 	= (info1 & BIT(3)) ? 1 : 0;
	vmexit->u.inout.bytes 	= (info1 >> 4) & 0x7;
	vmexit->u.inout.port 	= (uint16_t)(info1 >> 16);
	vmexit->u.inout.eax 	= (uint32_t)(state->rax);
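	/*
	 * For example, a "rep outsw" to port 0x3f8 is decoded above with
	 * bit 0 of EXITINFO1 clear (an output), bits 2 and 3 set (string,
	 * REP), a size field of 2 bytes and the port number in bits 31:16.
	 */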

	if (inout_string) {
		vmexit->exitcode = VM_EXITCODE_INOUT_STR;
		vis = &vmexit->u.inout_str;
		svm_paging_info(svm_get_vmcb(svm_sc, vcpu), &vis->paging);
		vis->rflags = state->rflags;
		vis->cr0 = state->cr0;
		vis->index = svm_inout_str_index(regs, vmexit->u.inout.in);
		vis->count = svm_inout_str_count(regs, vmexit->u.inout.rep);
		vis->addrsize = svm_inout_str_addrsize(info1);
		svm_inout_str_seginfo(svm_sc, vcpu, info1,
		    vmexit->u.inout.in, vis);
	}

	return (UNHANDLED);
}

static int
svm_npf_paging(uint64_t exitinfo1)
{

	if (exitinfo1 & VMCB_NPF_INFO1_W)
		return (VM_PROT_WRITE);

	return (VM_PROT_READ);
}

static bool
svm_npf_emul_fault(uint64_t exitinfo1)
{

	if (exitinfo1 & VMCB_NPF_INFO1_ID) {
		return (false);
	}

	if (exitinfo1 & VMCB_NPF_INFO1_GPT) {
		return (false);
	}

	if ((exitinfo1 & VMCB_NPF_INFO1_GPA) == 0) {
		return (false);
	}

	return (true);
}

static void
svm_handle_inst_emul(struct vmcb *vmcb, uint64_t gpa, struct vm_exit *vmexit)
{
	struct vm_guest_paging *paging;
	struct vmcb_segment *seg;
	struct vmcb_ctrl *ctrl;
	char *inst_bytes;
	int inst_len;

	ctrl = &vmcb->ctrl;
	paging = &vmexit->u.inst_emul.paging;

	vmexit->exitcode = VM_EXITCODE_INST_EMUL;
	vmexit->u.inst_emul.gpa = gpa;
	vmexit->u.inst_emul.gla = VIE_INVALID_GLA;
	svm_paging_info(vmcb, paging);

	seg = vmcb_seg(vmcb, VM_REG_GUEST_CS);
	switch(paging->cpu_mode) {
	case CPU_MODE_PROTECTED:
	case CPU_MODE_COMPATIBILITY:
		/*
		 * Section 4.8.1 of APM2, Default Operand Size or D bit.
		 */
		vmexit->u.inst_emul.cs_d = (seg->attrib & VMCB_CS_ATTRIB_D) ?
		    1 : 0;
		break;
	default:
		vmexit->u.inst_emul.cs_d = 0;
		break;
	}

	/*
	 * Copy the instruction bytes into 'vie' if available.
	 */
	if (decode_assist() && !disable_npf_assist) {
		inst_len = ctrl->inst_len;
		inst_bytes = ctrl->inst_bytes;
	} else {
		inst_len = 0;
		inst_bytes = NULL;
	}
	vie_init(&vmexit->u.inst_emul.vie, inst_bytes, inst_len);
}

/*
 * Intercept access to MSR_EFER to prevent the guest from clearing the
 * SVM enable bit.
 */
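/*
 * For example, if the guest writes 0x500 (EFER.LMA | EFER.LME) the value
 * kept in the VMCB becomes 0x1500 because EFER_SVM (bit 12) is forced
 * back on below.
 */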
static void
svm_write_efer(struct svm_softc *sc, int vcpu, uint32_t edx, uint32_t eax)
{
	struct vmcb_state *state;
	uint64_t oldval;

	state = svm_get_vmcb_state(sc, vcpu);

	oldval = state->efer;
	state->efer = (uint64_t)edx << 32 | eax | EFER_SVM;
	if (state->efer != oldval) {
		VCPU_CTR2(sc->vm, vcpu, "Guest EFER changed from %#lx to %#lx",
		    oldval, state->efer);
		vcpu_set_dirty(sc, vcpu, VMCB_CACHE_CR);
	}
}

#ifdef KTR
static const char *
intrtype_to_str(int intr_type)
{
	switch (intr_type) {
	case VMCB_EVENTINJ_TYPE_INTR:
		return ("hwintr");
	case VMCB_EVENTINJ_TYPE_NMI:
		return ("nmi");
	case VMCB_EVENTINJ_TYPE_INTn:
		return ("swintr");
	case VMCB_EVENTINJ_TYPE_EXCEPTION:
		return ("exception");
	default:
		panic("%s: unknown intr_type %d", __func__, intr_type);
	}
}
#endif

/*
 * Inject an event to vcpu as described in section 15.20, "Event injection".
 */
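/*
 * For example, injecting a general protection fault with a zero error code
 * leaves 'eventinj' encoded below as
 *	IDT_GP | (VMCB_EVENTINJ_TYPE_EXCEPTION << 8) | VMCB_EVENTINJ_VALID |
 *	VMCB_EVENTINJ_EC_VALID
 * with the 32-bit error code placed in bits 63:32.
 */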
static void
svm_eventinject(struct svm_softc *sc, int vcpu, int intr_type, int vector,
		 uint32_t error, bool ec_valid)
{
	struct vmcb_ctrl *ctrl;

	ctrl = svm_get_vmcb_ctrl(sc, vcpu);

	KASSERT((ctrl->eventinj & VMCB_EVENTINJ_VALID) == 0,
	    ("%s: event already pending %#lx", __func__, ctrl->eventinj));

	KASSERT(vector >=0 && vector <= 255, ("%s: invalid vector %d",
	    __func__, vector));

	switch (intr_type) {
	case VMCB_EVENTINJ_TYPE_INTR:
	case VMCB_EVENTINJ_TYPE_NMI:
	case VMCB_EVENTINJ_TYPE_INTn:
		break;
	case VMCB_EVENTINJ_TYPE_EXCEPTION:
		if (vector >= 0 && vector <= 31 && vector != 2)
			break;
		/* FALLTHROUGH */
	default:
		panic("%s: invalid intr_type/vector: %d/%d", __func__,
		    intr_type, vector);
	}
	ctrl->eventinj = vector | (intr_type << 8) | VMCB_EVENTINJ_VALID;
	if (ec_valid) {
		ctrl->eventinj |= VMCB_EVENTINJ_EC_VALID;
		ctrl->eventinj |= (uint64_t)error << 32;
		VCPU_CTR3(sc->vm, vcpu, "Injecting %s at vector %d errcode %#x",
		    intrtype_to_str(intr_type), vector, error);
	} else {
		VCPU_CTR2(sc->vm, vcpu, "Injecting %s at vector %d",
		    intrtype_to_str(intr_type), vector);
	}
}

static void
svm_save_intinfo(struct svm_softc *svm_sc, int vcpu)
{
	struct vmcb_ctrl *ctrl;
	uint64_t intinfo;

	ctrl  = svm_get_vmcb_ctrl(svm_sc, vcpu);
	intinfo = ctrl->exitintinfo;
	if (!VMCB_EXITINTINFO_VALID(intinfo))
		return;

	/*
	 * From APMv2, Section "Intercepts during IDT interrupt delivery"
	 *
	 * If a #VMEXIT happened during event delivery then record the event
	 * that was being delivered.
	 */
	VCPU_CTR2(svm_sc->vm, vcpu, "SVM:Pending INTINFO(0x%lx), vector=%d.\n",
		intinfo, VMCB_EXITINTINFO_VECTOR(intinfo));
	vmm_stat_incr(svm_sc->vm, vcpu, VCPU_EXITINTINFO, 1);
	vm_exit_intinfo(svm_sc->vm, vcpu, intinfo);
}

static __inline void
enable_intr_window_exiting(struct svm_softc *sc, int vcpu)
{
	struct vmcb_ctrl *ctrl;

	ctrl = svm_get_vmcb_ctrl(sc, vcpu);

	if (ctrl->v_irq == 0) {
		VCPU_CTR0(sc->vm, vcpu, "Enable intr window exiting");
		ctrl->v_irq = 1;
		ctrl->v_ign_tpr = 1;
		vcpu_set_dirty(sc, vcpu, VMCB_CACHE_TPR);
		svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
		    VMCB_INTCPT_VINTR);
	}
}

static __inline void
disable_intr_window_exiting(struct svm_softc *sc, int vcpu)
{
	struct vmcb_ctrl *ctrl;

	ctrl = svm_get_vmcb_ctrl(sc, vcpu);

	if (ctrl->v_irq) {
		VCPU_CTR0(sc->vm, vcpu, "Disable intr window exiting");
		ctrl->v_irq = 0;
		vcpu_set_dirty(sc, vcpu, VMCB_CACHE_TPR);
		svm_disable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
		    VMCB_INTCPT_VINTR);
	}
}

static int
nmi_blocked(struct svm_softc *sc, int vcpu)
{
	/* XXX need to track NMI blocking */
	return (0);
}

static void
enable_nmi_blocking(struct svm_softc *sc, int vcpu)
{
	/* XXX enable iret intercept */
}

#ifdef notyet
static void
clear_nmi_blocking(struct svm_softc *sc, int vcpu)
{
	/* XXX disable iret intercept */
}
#endif

#ifdef KTR
static const char *
exit_reason_to_str(uint64_t reason)
{
	static char reasonbuf[32];

	switch (reason) {
	case VMCB_EXIT_INVALID:
		return ("invalvmcb");
	case VMCB_EXIT_SHUTDOWN:
		return ("shutdown");
	case VMCB_EXIT_NPF:
		return ("nptfault");
	case VMCB_EXIT_PAUSE:
		return ("pause");
	case VMCB_EXIT_HLT:
		return ("hlt");
	case VMCB_EXIT_CPUID:
		return ("cpuid");
	case VMCB_EXIT_IO:
		return ("inout");
	case VMCB_EXIT_MC:
		return ("mchk");
	case VMCB_EXIT_INTR:
		return ("extintr");
	case VMCB_EXIT_NMI:
		return ("nmi");
	case VMCB_EXIT_VINTR:
		return ("vintr");
	case VMCB_EXIT_MSR:
		return ("msr");
	default:
		snprintf(reasonbuf, sizeof(reasonbuf), "%#lx", reason);
		return (reasonbuf);
	}
}
#endif	/* KTR */

/*
 * From section "State Saved on Exit" in APMv2: nRIP is saved for all #VMEXITs
 * that are due to instruction intercepts as well as MSR and IOIO intercepts
 * and exceptions caused by INT3, INTO and BOUND instructions.
 *
 * Return 1 if the nRIP is valid and 0 otherwise.
 */
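/*
 * For example, a CPUID intercept (exitcode 0x72) falls in the 0x65 ... 0x7C
 * range, so svm_vmexit() can derive the instruction length as 'nrip - rip'
 * (2 for CPUID) instead of decoding the guest instruction.
 */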
static int
nrip_valid(uint64_t exitcode)
{
	switch (exitcode) {
	case 0x00 ... 0x0F:	/* read of CR0 through CR15 */
	case 0x10 ... 0x1F:	/* write of CR0 through CR15 */
	case 0x20 ... 0x2F:	/* read of DR0 through DR15 */
	case 0x30 ... 0x3F:	/* write of DR0 through DR15 */
	case 0x43:		/* INT3 */
	case 0x44:		/* INTO */
	case 0x45:		/* BOUND */
	case 0x65 ... 0x7C:	/* VMEXIT_CR0_SEL_WRITE ... VMEXIT_MSR */
	case 0x80 ... 0x8D:	/* VMEXIT_VMRUN ... VMEXIT_XSETBV */
		return (1);
	default:
		return (0);
	}
}

/*
 * Collateral for a generic SVM VM-exit.
 */
static void
vm_exit_svm(struct vm_exit *vme, uint64_t code, uint64_t info1, uint64_t info2)
{

	vme->exitcode = VM_EXITCODE_SVM;
	vme->u.svm.exitcode = code;
	vme->u.svm.exitinfo1 = info1;
	vme->u.svm.exitinfo2 = info2;
}

static int
svm_vmexit(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit)
{
	struct vmcb *vmcb;
	struct vmcb_state *state;
	struct vmcb_ctrl *ctrl;
	struct svm_regctx *ctx;
	uint64_t code, info1, info2, val;
	uint32_t eax, ecx, edx;
	int handled;
	bool retu;

	ctx = svm_get_guest_regctx(svm_sc, vcpu);
	vmcb = svm_get_vmcb(svm_sc, vcpu);
	state = &vmcb->state;
	ctrl = &vmcb->ctrl;

	handled = 0;
	code = ctrl->exitcode;
	info1 = ctrl->exitinfo1;
	info2 = ctrl->exitinfo2;

	vmexit->exitcode = VM_EXITCODE_BOGUS;
	vmexit->rip = state->rip;
	vmexit->inst_length = nrip_valid(code) ? ctrl->nrip - state->rip : 0;

	vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_COUNT, 1);

	/*
	 * #VMEXIT(INVALID) needs to be handled early because the VMCB is
	 * in an inconsistent state and can trigger assertions that would
	 * never happen otherwise.
	 */
	if (code == VMCB_EXIT_INVALID) {
		vm_exit_svm(vmexit, code, info1, info2);
		return (0);
	}

	KASSERT((ctrl->eventinj & VMCB_EVENTINJ_VALID) == 0, ("%s: event "
	    "injection valid bit is set %#lx", __func__, ctrl->eventinj));

	KASSERT(vmexit->inst_length >= 0 && vmexit->inst_length <= 15,
	    ("invalid inst_length %d: code (%#lx), info1 (%#lx), info2 (%#lx)",
	    vmexit->inst_length, code, info1, info2));

	svm_save_intinfo(svm_sc, vcpu);

	switch (code) {
	case VMCB_EXIT_VINTR:	/* interrupt window exiting */
		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_VINTR, 1);
		handled = 1;
		break;
	case VMCB_EXIT_INTR:	/* external interrupt */
		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_EXTINT, 1);
		handled = 1;
		break;
	case VMCB_EXIT_NMI:	/* external NMI */
		handled = 1;
		break;
	case VMCB_EXIT_MC:	/* machine check */
		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_EXCEPTION, 1);
		break;
	case VMCB_EXIT_MSR:	/* MSR access. */
		eax = state->rax;
		ecx = ctx->sctx_rcx;
		edx = ctx->e.g.sctx_rdx;
		retu = false;

		if (ecx == MSR_EFER) {
			KASSERT(info1 != 0, ("rdmsr(MSR_EFER) is not emulated: "
			    "info1(%#lx) info2(%#lx)", info1, info2));
			svm_write_efer(svm_sc, vcpu, edx, eax);
			handled = 1;
			break;
		}

#define MSR_AMDK8_IPM           0xc0010055
		/*
		 * Ignore access to the "Interrupt Pending Message" MSR.
		 */
		if (ecx == MSR_AMDK8_IPM) {
			if (!info1)
				state->rax = ctx->e.g.sctx_rdx = 0;
			handled = 1;
			break;
		}

		if (info1) {
			vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_WRMSR, 1);
			val = (uint64_t)edx << 32 | eax;
			VCPU_CTR2(svm_sc->vm, vcpu, "wrmsr %#x val %#lx",
			    ecx, val);
			if (emulate_wrmsr(svm_sc->vm, vcpu, ecx, val, &retu)) {
				vmexit->exitcode = VM_EXITCODE_WRMSR;
				vmexit->u.msr.code = ecx;
				vmexit->u.msr.wval = val;
			} else if (!retu) {
				handled = 1;
			} else {
				KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS,
				    ("emulate_wrmsr retu with bogus exitcode"));
			}
		} else {
			VCPU_CTR1(svm_sc->vm, vcpu, "rdmsr %#x", ecx);
			vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_RDMSR, 1);
			if (emulate_rdmsr(svm_sc->vm, vcpu, ecx, &retu)) {
				vmexit->exitcode = VM_EXITCODE_RDMSR;
				vmexit->u.msr.code = ecx;
			} else if (!retu) {
				handled = 1;
			} else {
				KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS,
				    ("emulate_rdmsr retu with bogus exitcode"));
			}
		}
		break;
	case VMCB_EXIT_IO:
		handled = svm_handle_io(svm_sc, vcpu, vmexit);
		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_INOUT, 1);
		break;
	case VMCB_EXIT_CPUID:
		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_CPUID, 1);
		handled = x86_emulate_cpuid(svm_sc->vm, vcpu,
		    (uint32_t *)&state->rax,
		    (uint32_t *)&ctx->sctx_rbx,
		    (uint32_t *)&ctx->sctx_rcx,
		    (uint32_t *)&ctx->e.g.sctx_rdx);
		break;
	case VMCB_EXIT_HLT:
		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_HLT, 1);
		vmexit->exitcode = VM_EXITCODE_HLT;
		vmexit->u.hlt.rflags = state->rflags;
		break;
	case VMCB_EXIT_PAUSE:
		vmexit->exitcode = VM_EXITCODE_PAUSE;
		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_PAUSE, 1);
		break;
	case VMCB_EXIT_NPF:
		/* EXITINFO2 contains the faulting guest physical address */
		if (info1 & VMCB_NPF_INFO1_RSV) {
			VCPU_CTR2(svm_sc->vm, vcpu, "nested page fault with "
			    "reserved bits set: info1(%#lx) info2(%#lx)",
			    info1, info2);
		} else if (vm_mem_allocated(svm_sc->vm, info2)) {
			vmexit->exitcode = VM_EXITCODE_PAGING;
			vmexit->u.paging.gpa = info2;
			vmexit->u.paging.fault_type = svm_npf_paging(info1);
			vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_NESTED_FAULT, 1);
			VCPU_CTR3(svm_sc->vm, vcpu, "nested page fault "
			    "on gpa %#lx/%#lx at rip %#lx",
			    info2, info1, state->rip);
		} else if (svm_npf_emul_fault(info1)) {
			svm_handle_inst_emul(vmcb, info2, vmexit);
			vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_INST_EMUL, 1);
			VCPU_CTR3(svm_sc->vm, vcpu, "inst_emul fault "
			    "for gpa %#lx/%#lx at rip %#lx",
			    info2, info1, state->rip);
		}
		break;
	default:
		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_UNKNOWN, 1);
		break;
	}

	VCPU_CTR4(svm_sc->vm, vcpu, "%s %s vmexit at %#lx/%d",
	    handled ? "handled" : "unhandled", exit_reason_to_str(code),
	    vmexit->rip, vmexit->inst_length);

	if (handled) {
		vmexit->rip += vmexit->inst_length;
		vmexit->inst_length = 0;
		state->rip = vmexit->rip;
	} else {
		if (vmexit->exitcode == VM_EXITCODE_BOGUS) {
			/*
			 * If this VM exit was not claimed by anybody then
			 * treat it as a generic SVM exit.
			 */
			vm_exit_svm(vmexit, code, info1, info2);
		} else {
			/*
			 * The exitcode and collateral have been populated.
			 * The VM exit will be processed further in userland.
			 */
		}
	}
	return (handled);
}

static void
svm_inj_intinfo(struct svm_softc *svm_sc, int vcpu)
{
	uint64_t intinfo;

	if (!vm_entry_intinfo(svm_sc->vm, vcpu, &intinfo))
		return;

	KASSERT(VMCB_EXITINTINFO_VALID(intinfo), ("%s: entry intinfo is not "
	    "valid: %#lx", __func__, intinfo));

	svm_eventinject(svm_sc, vcpu, VMCB_EXITINTINFO_TYPE(intinfo),
		VMCB_EXITINTINFO_VECTOR(intinfo),
		VMCB_EXITINTINFO_EC(intinfo),
		VMCB_EXITINTINFO_EC_VALID(intinfo));
	vmm_stat_incr(svm_sc->vm, vcpu, VCPU_INTINFO_INJECTED, 1);
	VCPU_CTR1(svm_sc->vm, vcpu, "Injected entry intinfo: %#lx", intinfo);
}

/*
 * Inject event to virtual cpu.
 */
static void
svm_inj_interrupts(struct svm_softc *sc, int vcpu, struct vlapic *vlapic)
{
	struct vmcb_ctrl *ctrl;
	struct vmcb_state *state;
	int extint_pending;
	int vector, need_intr_window;

	state = svm_get_vmcb_state(sc, vcpu);
	ctrl  = svm_get_vmcb_ctrl(sc, vcpu);

	need_intr_window = 0;

	/*
	 * Inject pending events or exceptions for this vcpu.
	 *
	 * An event might be pending because the previous #VMEXIT happened
	 * during event delivery (i.e. ctrl->exitintinfo).
	 *
	 * An event might also be pending because an exception was injected
	 * by the hypervisor (e.g. #PF during instruction emulation).
	 */
	svm_inj_intinfo(sc, vcpu);

	/* NMI event has priority over interrupts. */
	if (vm_nmi_pending(sc->vm, vcpu)) {
		if (nmi_blocked(sc, vcpu)) {
			/*
			 * Can't inject another NMI if the guest has not
			 * yet executed an "iret" after the last NMI.
			 */
			VCPU_CTR0(sc->vm, vcpu, "Cannot inject NMI due "
			    "to NMI-blocking");
		} else if (ctrl->eventinj & VMCB_EVENTINJ_VALID) {
			/*
			 * If there is already an exception/interrupt pending
			 * then defer the NMI until after that.
			 */
			VCPU_CTR1(sc->vm, vcpu, "Cannot inject NMI due to "
			    "eventinj %#lx", ctrl->eventinj);

			/*
			 * Use self-IPI to trigger a VM-exit as soon as
			 * possible after the event injection is completed.
			 *
			 * This works only if the external interrupt exiting
			 * is at a lower priority than the event injection.
			 *
			 * Although not explicitly specified in APMv2 the
			 * relative priorities were verified empirically.
			 */
			ipi_cpu(curcpu, IPI_AST);	/* XXX vmm_ipinum? */
		} else {
			vm_nmi_clear(sc->vm, vcpu);

			/* Inject NMI, vector number is not used */
			svm_eventinject(sc, vcpu, VMCB_EVENTINJ_TYPE_NMI,
			    IDT_NMI, 0, false);

			/* virtual NMI blocking is now in effect */
			enable_nmi_blocking(sc, vcpu);

			VCPU_CTR0(sc->vm, vcpu, "Injecting vNMI");
		}
	}

	extint_pending = vm_extint_pending(sc->vm, vcpu);

	if (!extint_pending) {
		/* Ask the local apic for a vector to inject */
		if (!vlapic_pending_intr(vlapic, &vector)) {
			goto done;	/* nothing to inject */
		}
		KASSERT(vector >= 16 && vector <= 255,
		    ("invalid vector %d from local APIC", vector));
	} else {
                /* Ask the legacy pic for a vector to inject */
                vatpic_pending_intr(sc->vm, &vector);
		KASSERT(vector >= 0 && vector <= 255,
		    ("invalid vector %d from local APIC", vector));
	}

	/*
	 * If the guest has disabled interrupts or is in an interrupt shadow
	 * then we cannot inject the pending interrupt.
	 */
	if ((state->rflags & PSL_I) == 0) {
		VCPU_CTR2(sc->vm, vcpu, "Cannot inject vector %d due to "
		    "rflags %#lx", vector, state->rflags);
		need_intr_window = 1;
		goto done;
	}

	if (ctrl->intr_shadow) {
		VCPU_CTR1(sc->vm, vcpu, "Cannot inject vector %d due to "
		    "interrupt shadow", vector);
		need_intr_window = 1;
		goto done;
	}

	if (ctrl->eventinj & VMCB_EVENTINJ_VALID) {
		VCPU_CTR2(sc->vm, vcpu, "Cannot inject vector %d due to "
		    "eventinj %#lx", vector, ctrl->eventinj);
		need_intr_window = 1;
		goto done;
	}

	svm_eventinject(sc, vcpu, VMCB_EVENTINJ_TYPE_INTR, vector, 0, false);

        if (!extint_pending) {
                /* Update the Local APIC ISR */
                vlapic_intr_accepted(vlapic, vector);
        } else {
                vm_extint_clear(sc->vm, vcpu);
                vatpic_intr_accepted(sc->vm, vector);
		/*
		 * Force a VM-exit as soon as the vcpu is ready to accept
		 * another interrupt. This is done because the PIC might
		 * have another vector that it wants to inject. Also, if
		 * the vlapic has a pending interrupt that was preempted
		 * by the ExtInt then it allows us to inject the APIC
		 * vector as soon as possible.
		 */
		need_intr_window = 1;
        }
done:
	if (need_intr_window) {
		/*
		 * We use V_IRQ in conjunction with the VINTR intercept to
		 * trap into the hypervisor as soon as a virtual interrupt
		 * can be delivered.
		 *
		 * Since injected events are not subject to intercept checks
		 * we need to ensure that the V_IRQ is not actually going to
		 * be delivered on VM entry. The KASSERT below enforces this.
		 */
		KASSERT((ctrl->eventinj & VMCB_EVENTINJ_VALID) != 0 ||
		    (state->rflags & PSL_I) == 0 || ctrl->intr_shadow,
		    ("Bogus intr_window_exiting: eventinj (%#lx), "
		    "intr_shadow (%u), rflags (%#lx)",
		    ctrl->eventinj, ctrl->intr_shadow, state->rflags));
		enable_intr_window_exiting(sc, vcpu);
	} else {
		disable_intr_window_exiting(sc, vcpu);
	}
}

static __inline void
restore_host_tss(void)
{
	struct system_segment_descriptor *tss_sd;

	/*
	 * The TSS descriptor was in use prior to launching the guest so it
	 * has been marked busy.
	 *
	 * 'ltr' requires the descriptor to be marked available so change the
	 * type to "64-bit available TSS".
	 */
	tss_sd = PCPU_GET(tss);
	tss_sd->sd_type = SDT_SYSTSS;
	ltr(GSEL(GPROC0_SEL, SEL_KPL));
}

static void
check_asid(struct svm_softc *sc, int vcpuid, pmap_t pmap, u_int thiscpu)
{
	struct svm_vcpu *vcpustate;
	struct vmcb_ctrl *ctrl;
	long eptgen;
	bool alloc_asid;

	KASSERT(CPU_ISSET(thiscpu, &pmap->pm_active), ("%s: nested pmap not "
	    "active on cpu %u", __func__, thiscpu));

	vcpustate = svm_get_vcpu(sc, vcpuid);
	ctrl = svm_get_vmcb_ctrl(sc, vcpuid);

	/*
	 * The TLB entries associated with the vcpu's ASID are not valid
	 * if either of the following conditions is true:
	 *
	 * 1. The vcpu's ASID generation is different than the host cpu's
	 *    ASID generation. This happens when the vcpu migrates to a new
	 *    host cpu. It can also happen when the number of vcpus executing
	 *    on a host cpu is greater than the number of ASIDs available.
	 *
	 * 2. The pmap generation number is different than the value cached in
	 *    the 'vcpustate'. This happens when the host invalidates pages
	 *    belonging to the guest.
	 *
	 *	asidgen		eptgen	      Action
	 *	mismatch	mismatch
	 *	   0		   0		(a)
	 *	   0		   1		(b1) or (b2)
	 *	   1		   0		(c)
	 *	   1		   1		(d)
	 *
	 * (a) There is no mismatch in eptgen or ASID generation and therefore
	 *     no further action is needed.
	 *
	 * (b1) If the cpu supports FlushByAsid then the vcpu's ASID is
	 *      retained and the TLB entries associated with this ASID
	 *      are flushed by VMRUN.
	 *
	 * (b2) If the cpu does not support FlushByAsid then a new ASID is
	 *      allocated.
	 *
	 * (c) A new ASID is allocated.
	 *
	 * (d) A new ASID is allocated.
	 */
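	/*
	 * For example, with nasid = 8 a host cpu hands out ASIDs 1 through 7;
	 * the allocation after that wraps 'num' back to 1 and bumps 'gen',
	 * which invalidates the ASID cached by every vcpu that last ran on
	 * this cpu and, without flush-by-asid, forces a full TLB flush.
	 */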
1527
1528	alloc_asid = false;
1529	eptgen = pmap->pm_eptgen;
1530	ctrl->tlb_ctrl = VMCB_TLB_FLUSH_NOTHING;
1531
1532	if (vcpustate->asid.gen != asid[thiscpu].gen) {
1533		alloc_asid = true;	/* (c) and (d) */
1534	} else if (vcpustate->eptgen != eptgen) {
1535		if (flush_by_asid())
1536			ctrl->tlb_ctrl = VMCB_TLB_FLUSH_GUEST;	/* (b1) */
1537		else
1538			alloc_asid = true;			/* (b2) */
1539	} else {
1540		/*
1541		 * This is the common case (a).
1542		 */
1543		KASSERT(!alloc_asid, ("ASID allocation not necessary"));
1544		KASSERT(ctrl->tlb_ctrl == VMCB_TLB_FLUSH_NOTHING,
1545		    ("Invalid VMCB tlb_ctrl: %#x", ctrl->tlb_ctrl));
1546	}
1547
1548	if (alloc_asid) {
1549		if (++asid[thiscpu].num >= nasid) {
1550			asid[thiscpu].num = 1;
1551			if (++asid[thiscpu].gen == 0)
1552				asid[thiscpu].gen = 1;
1553			/*
1554			 * If this cpu does not support "flush-by-asid"
1555			 * then flush the entire TLB on a generation
1556			 * bump. Subsequent ASID allocation in this
1557			 * generation can be done without a TLB flush.
1558			 */
1559			if (!flush_by_asid())
1560				ctrl->tlb_ctrl = VMCB_TLB_FLUSH_ALL;
1561		}
1562		vcpustate->asid.gen = asid[thiscpu].gen;
1563		vcpustate->asid.num = asid[thiscpu].num;
1564
1565		ctrl->asid = vcpustate->asid.num;
1566		vcpu_set_dirty(sc, vcpuid, VMCB_CACHE_ASID);
1567		/*
1568		 * If this cpu supports "flush-by-asid" then the TLB
1569		 * was not flushed after the generation bump. The TLB
1570		 * is flushed selectively after every new ASID allocation.
1571		 */
1572		if (flush_by_asid())
1573			ctrl->tlb_ctrl = VMCB_TLB_FLUSH_GUEST;
1574	}
1575	vcpustate->eptgen = eptgen;
1576
1577	KASSERT(ctrl->asid != 0, ("Guest ASID must be non-zero"));
1578	KASSERT(ctrl->asid == vcpustate->asid.num,
1579	    ("ASID mismatch: %u/%u", ctrl->asid, vcpustate->asid.num));
1580}
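
/*
 * For reference, a rough sketch of the per-cpu ASID bookkeeping consulted
 * by check_asid() (the real definitions live earlier in this file; the
 * exact types and the MAXCPU sizing are assumptions):
 *
 *	struct asid {
 *		uint64_t	gen;	// generation, never 0
 *		uint32_t	num;	// last ASID number handed out
 *	};
 *	static struct asid asid[MAXCPU];
 *
 * 'nasid' is the number of ASIDs supported by the hardware (reported via
 * CPUID function 0x8000_000A), so the allocator wraps back to 1 and bumps
 * the generation once the pool is exhausted.
 */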
1581
1582/*
1583 * Start vcpu with specified RIP.
1584 */
1585static int
1586svm_vmrun(void *arg, int vcpu, register_t rip, pmap_t pmap,
1587	void *rend_cookie, void *suspended_cookie)
1588{
1589	struct svm_regctx *hctx, *gctx;
1590	struct svm_softc *svm_sc;
1591	struct svm_vcpu *vcpustate;
1592	struct vmcb_state *state;
1593	struct vmcb_ctrl *ctrl;
1594	struct vm_exit *vmexit;
1595	struct vlapic *vlapic;
1596	struct vm *vm;
1597	uint64_t vmcb_pa;
1598	u_int thiscpu;
1599	int handled;
1600
1601	svm_sc = arg;
1602	vm = svm_sc->vm;
1603
1604	vcpustate = svm_get_vcpu(svm_sc, vcpu);
1605	state = svm_get_vmcb_state(svm_sc, vcpu);
1606	ctrl = svm_get_vmcb_ctrl(svm_sc, vcpu);
1607	vmexit = vm_exitinfo(vm, vcpu);
1608	vlapic = vm_lapic(vm, vcpu);
1609
1610	/*
1611	 * Stash 'curcpu' on the stack as 'thiscpu'.
1612	 *
1613	 * The per-cpu data area is not accessible until MSR_GSBASE is restored
1614	 * after the #VMEXIT. Since VMRUN is executed inside a critical section
1615	 * 'curcpu' and 'thiscpu' are guaranteed to be identical.
1616	 */
1617	thiscpu = curcpu;
1618
1619	gctx = svm_get_guest_regctx(svm_sc, vcpu);
1620	hctx = &host_ctx[thiscpu];
1621	vmcb_pa = svm_sc->vcpu[vcpu].vmcb_pa;
1622
1623	if (vcpustate->lastcpu != thiscpu) {
1624		/*
1625		 * Force new ASID allocation by invalidating the generation.
1626		 */
1627		vcpustate->asid.gen = 0;
1628
1629		/*
1630		 * Invalidate the VMCB state cache by marking all fields dirty.
1631		 */
1632		vcpu_set_dirty(svm_sc, vcpu, 0xffffffff);
1633
1634		/*
1635		 * XXX
1636		 * Setting 'vcpustate->lastcpu' here is a bit premature because
1637		 * we may return from this function without actually executing
1638		 * the VMRUN instruction. This could happen if a rendezvous
1639		 * or an AST is pending on the first time through the loop.
1640		 *
1641		 * This works for now but any new side-effects of vcpu
1642		 * migration should take this case into account.
1643		 */
1644		vcpustate->lastcpu = thiscpu;
1645		vmm_stat_incr(vm, vcpu, VCPU_MIGRATIONS, 1);
1646	}
1647
1648	/* Update Guest RIP */
1649	state->rip = rip;
1650
1651	do {
1652		/*
1653		 * Disable global interrupts to guarantee atomicity during
1654		 * loading of guest state. This includes not only the state
1655		 * loaded by the "vmrun" instruction but also software state
1656		 * maintained by the hypervisor: suspended and rendezvous
1657		 * state, NPT generation number, vlapic interrupts etc.
1658		 */
1659		disable_gintr();
1660
1661		if (vcpu_suspended(suspended_cookie)) {
1662			enable_gintr();
1663			vm_exit_suspended(vm, vcpu, state->rip);
1664			break;
1665		}
1666
1667		if (vcpu_rendezvous_pending(rend_cookie)) {
1668			enable_gintr();
1669			vm_exit_rendezvous(vm, vcpu, state->rip);
1670			break;
1671		}
1672
1673		/* The scheduler has asked this thread to yield the cpu. */
1674		if (curthread->td_flags & (TDF_ASTPENDING | TDF_NEEDRESCHED)) {
1675			enable_gintr();
1676			vm_exit_astpending(vm, vcpu, state->rip);
1677			break;
1678		}
1679
1680		svm_inj_interrupts(svm_sc, vcpu, vlapic);
1681
1682		/* Activate the nested pmap on 'thiscpu' */
1683		CPU_SET_ATOMIC_ACQ(thiscpu, &pmap->pm_active);
1684
1685		/*
1686		 * Check the pmap generation and the ASID generation to
1687		 * ensure that the vcpu does not use stale TLB mappings.
1688		 */
1689		check_asid(svm_sc, vcpu, pmap, thiscpu);
1690
1691		ctrl->vmcb_clean = VMCB_CACHE_DEFAULT & ~vcpustate->dirty;
1692		vcpustate->dirty = 0;
1693		VCPU_CTR1(vm, vcpu, "vmcb clean %#x", ctrl->vmcb_clean);
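		/*
		 * Example: if only VMCB_CACHE_ASID was marked dirty above,
		 * its clean bit is now clear, so the processor reloads the
		 * ASID field from the VMCB on VMRUN while reusing the rest
		 * of its cached control state.
		 */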
1694
1695		/* Launch Virtual Machine. */
1696		VCPU_CTR1(vm, vcpu, "Resume execution at %#lx", state->rip);
1697		svm_launch(vmcb_pa, gctx, hctx);
1698
1699		CPU_CLR_ATOMIC(thiscpu, &pmap->pm_active);
1700
1701		/*
1702		 * Restore MSR_GSBASE to point to the pcpu data area.
1703		 *
1704		 * Note that accesses done via PCPU_GET/PCPU_SET will work
1705		 * only after MSR_GSBASE is restored.
1706		 *
1707		 * Also note that we don't bother restoring MSR_KGSBASE
1708		 * since it is not used in the kernel and will be restored
1709		 * when the VMRUN ioctl returns to userspace.
1710		 */
1711		wrmsr(MSR_GSBASE, (uint64_t)&__pcpu[thiscpu]);
1712		KASSERT(curcpu == thiscpu, ("thiscpu/curcpu (%u/%u) mismatch",
1713		    thiscpu, curcpu));
1714
1715		/*
1716		 * The host GDTR and IDTR are saved by VMRUN and restored
1717		 * automatically on #VMEXIT. However, the host TSS needs
1718		 * to be restored explicitly.
1719		 */
1720		restore_host_tss();
1721
1722		/* #VMEXIT disables interrupts so re-enable them here. */
1723		enable_gintr();
1724
1725		/* Handle #VMEXIT and if required return to user space. */
1726		handled = svm_vmexit(svm_sc, vcpu, vmexit);
1727	} while (handled);
1728
1729	return (0);
1730}
1731
1732/*
1733 * Cleanup for virtual machine.
1734 */
1735static void
1736svm_vmcleanup(void *arg)
1737{
1738	struct svm_softc *svm_sc;
1739
1740	svm_sc = arg;
1741
1742	VCPU_CTR0(svm_sc->vm, 0, "SVM:cleanup\n");
1743
1744	free(svm_sc, M_SVM);
1745}
1746
1747/*
1748 * Return pointer to hypervisor saved register state.
1749 */
1750static register_t *
1751swctx_regptr(struct svm_regctx *regctx, int reg)
1752{
1753
1754	switch (reg) {
1755		case VM_REG_GUEST_RBX:
1756			return (&regctx->sctx_rbx);
1757		case VM_REG_GUEST_RCX:
1758			return (&regctx->sctx_rcx);
1759		case VM_REG_GUEST_RDX:
1760			return (&regctx->e.g.sctx_rdx);
1761		case VM_REG_GUEST_RDI:
1762			return (&regctx->e.g.sctx_rdi);
1763		case VM_REG_GUEST_RSI:
1764			return (&regctx->e.g.sctx_rsi);
1765		case VM_REG_GUEST_RBP:
1766			return (&regctx->sctx_rbp);
1767		case VM_REG_GUEST_R8:
1768			return (&regctx->sctx_r8);
1769		case VM_REG_GUEST_R9:
1770			return (&regctx->sctx_r9);
1771		case VM_REG_GUEST_R10:
1772			return (&regctx->sctx_r10);
1773		case VM_REG_GUEST_R11:
1774			return (&regctx->sctx_r11);
1775		case VM_REG_GUEST_R12:
1776			return (&regctx->sctx_r12);
1777		case VM_REG_GUEST_R13:
1778			return (&regctx->sctx_r13);
1779		case VM_REG_GUEST_R14:
1780			return (&regctx->sctx_r14);
1781		case VM_REG_GUEST_R15:
1782			return (&regctx->sctx_r15);
1783		default:
1784			ERR("Unknown register requested, reg=%d.\n", reg);
1785			break;
1786	}
1787
1788	return (NULL);
1789}
1790
1791/*
1792 * Interface to read guest registers.
1793 * This can be SVM h/w saved or hypervisor saved register.
1794 */
1795static int
1796svm_getreg(void *arg, int vcpu, int ident, uint64_t *val)
1797{
1798	struct svm_softc *svm_sc;
1799	struct vmcb *vmcb;
1800	register_t *reg;
1801
1802	svm_sc = arg;
1803	KASSERT(vcpu < svm_sc->vcpu_cnt, ("Guest doesn't have VCPU%d", vcpu));
1804
1805	vmcb = svm_get_vmcb(svm_sc, vcpu);
1806
1807	if (vmcb_read(vmcb, ident, val) == 0) {
1808		return (0);
1809	}
1810
1811	reg = swctx_regptr(svm_get_guest_regctx(svm_sc, vcpu), ident);
1812
1813	if (reg != NULL) {
1814		*val = *reg;
1815		return (0);
1816	}
1817
1818	ERR("SVM_ERR:reg type %x is not saved in VMCB.\n", ident);
1819	return (EINVAL);
1820}
1821
1822/*
1823 * Interface to write to guest registers.
1824 * This can be SVM h/w saved or hypervisor saved register.
1825 */
1826static int
1827svm_setreg(void *arg, int vcpu, int ident, uint64_t val)
1828{
1829	struct svm_softc *svm_sc;
1830	struct vmcb *vmcb;
1831	register_t *reg;
1832
1833	svm_sc = arg;
1834	KASSERT(vcpu < svm_sc->vcpu_cnt, ("Guest doesn't have VCPU%d", vcpu));
1835
1836	vmcb = svm_get_vmcb(svm_sc, vcpu);
1837	if (vmcb_write(vmcb, ident, val) == 0) {
1838		return (0);
1839	}
1840
1841	reg = swctx_regptr(svm_get_guest_regctx(svm_sc, vcpu), ident);
1842
1843	if (reg != NULL) {
1844		*reg = val;
1845		return (0);
1846	}
1847
1848	/*
1849	 * XXX deal with CR3 and invalidate TLB entries tagged with the
1850	 * vcpu's ASID. This needs to be treated differently depending on
1851	 * whether 'running' is true/false.
1852	 */
1853
1854	ERR("SVM_ERR:reg type %x is not saved in VMCB.\n", ident);
1855	return (EINVAL);
1856}
1857
1858
1859/*
1860 * Interface to set various descriptors.
1861 */
1862static int
1863svm_setdesc(void *arg, int vcpu, int type, struct seg_desc *desc)
1864{
1865	struct svm_softc *svm_sc;
1866	struct vmcb *vmcb;
1867	struct vmcb_segment *seg;
1868	uint16_t attrib;
1869
1870	svm_sc = arg;
1871	KASSERT(vcpu < svm_sc->vcpu_cnt, ("Guest doesn't have VCPU%d", vcpu));
1872
1873	vmcb = svm_get_vmcb(svm_sc, vcpu);
1874
1875	VCPU_CTR1(svm_sc->vm, vcpu, "SVM:set_desc: Type%d\n", type);
1876
1877	seg = vmcb_seg(vmcb, type);
1878	if (seg == NULL) {
1879		ERR("SVM_ERR:Unsupported segment type%d\n", type);
1880		return (EINVAL);
1881	}
1882
1883	/* Map seg_desc access to VMCB attribute format. */
1884	attrib = ((desc->access & 0xF000) >> 4) | (desc->access & 0xFF);
1885	VCPU_CTR3(svm_sc->vm, vcpu, "SVM:[type %d attribute 0x%x limit:0x%x]\n",
1886		type, desc->access, desc->limit);
1887	seg->attrib = attrib;
1888	seg->base = desc->base;
1889	seg->limit = desc->limit;
1890
1891	return (0);
1892}
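
/*
 * A worked example of the access->attrib packing in svm_setdesc(): a long
 * mode code segment with access 0xA09B (type 0xB, S=1, DPL=0, P=1, L=1,
 * G=1) becomes a VMCB attrib of 0xA9B. Bits 15:12 of 'access' are folded
 * down into bits 11:8 of 'attrib' and bits 7:0 are copied unchanged;
 * svm_getdesc() below performs the inverse mapping.
 */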
1893
1894/*
1895 * Interface to get guest descriptor.
1896 */
1897static int
1898svm_getdesc(void *arg, int vcpu, int type, struct seg_desc *desc)
1899{
1900	struct svm_softc *svm_sc;
1901	struct vmcb_segment	*seg;
1902
1903	svm_sc = arg;
1904	KASSERT(vcpu < svm_sc->vcpu_cnt, ("Guest doesn't have VCPU%d", vcpu));
1905
1906	VCPU_CTR1(svm_sc->vm, vcpu, "SVM:get_desc: Type%d\n", type);
1907
1908	seg = vmcb_seg(svm_get_vmcb(svm_sc, vcpu), type);
1909	if (seg == NULL) {
1910		ERR("SVM_ERR:Unsupported segment type%d\n", type);
1911		return (EINVAL);
1912	}
1913
1914	/* Map VMCB segment attributes to the seg_desc access format. */
1915	desc->access = ((seg->attrib & 0xF00) << 4) | (seg->attrib & 0xFF);
1916	desc->base = seg->base;
1917	desc->limit = seg->limit;
1918
1919	/*
1920	 * VT-x uses bit 16 (Unusable) to indicate a segment that has been
1921	 * loaded with a NULL segment selector. The 'desc->access' field is
1922	 * interpreted in the VT-x format by the processor-independent code.
1923	 *
1924	 * SVM uses the 'P' bit to convey the same information so convert it
1925	 * into the VT-x format. For more details refer to section
1926	 * "Segment State in the VMCB" in APMv2.
1927	 */
1928	if (type == VM_REG_GUEST_CS || type == VM_REG_GUEST_TR)
1929		desc->access |= 0x80;		/* CS and TR are always present */
1930
1931	if (!(desc->access & 0x80))
1932		desc->access |= 0x10000;	/* Unusable segment */
1933
1934	return (0);
1935}
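
/*
 * Example of the NULL-selector handling in svm_getdesc(): a data segment
 * loaded with a NULL selector has P=0 in its VMCB attributes, so bit 7 of
 * the computed access is clear and bit 16 (0x10000) is set instead, which
 * is how the VT-x style "unusable" segment is presented to the processor
 * independent code.
 */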
1936
1937static int
1938svm_setcap(void *arg, int vcpu, int type, int val)
1939{
1940	struct svm_softc *sc;
1941	int error;
1942
1943	sc = arg;
1944	error = 0;
1945	switch (type) {
1946	case VM_CAP_HALT_EXIT:
1947		svm_set_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
1948		    VMCB_INTCPT_HLT, val);
1949		break;
1950	case VM_CAP_PAUSE_EXIT:
1951		svm_set_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
1952		    VMCB_INTCPT_PAUSE, val);
1953		break;
1954	case VM_CAP_UNRESTRICTED_GUEST:
1955		/* Unrestricted guest execution cannot be disabled in SVM */
1956		if (val == 0)
1957			error = EINVAL;
1958		break;
1959	default:
1960		error = ENOENT;
1961		break;
1962	}
1963	return (error);
1964}
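
/*
 * Usage note: these capability handlers are reached through the generic
 * vmm capability interface. For example, a request to disable HLT exits
 * arrives here as type == VM_CAP_HALT_EXIT and val == 0, which is expected
 * to clear VMCB_INTCPT_HLT in the vcpu's intercept vector via
 * svm_set_intercept().
 */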
1965
1966static int
1967svm_getcap(void *arg, int vcpu, int type, int *retval)
1968{
1969	struct svm_softc *sc;
1970	int error;
1971
1972	sc = arg;
1973	error = 0;
1974
1975	switch (type) {
1976	case VM_CAP_HALT_EXIT:
1977		*retval = svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
1978		    VMCB_INTCPT_HLT);
1979		break;
1980	case VM_CAP_PAUSE_EXIT:
1981		*retval = svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
1982		    VMCB_INTCPT_PAUSE);
1983		break;
1984	case VM_CAP_UNRESTRICTED_GUEST:
1985		*retval = 1;	/* unrestricted guest is always enabled */
1986		break;
1987	default:
1988		error = ENOENT;
1989		break;
1990	}
1991	return (error);
1992}
1993
1994static struct vlapic *
1995svm_vlapic_init(void *arg, int vcpuid)
1996{
1997	struct svm_softc *svm_sc;
1998	struct vlapic *vlapic;
1999
2000	svm_sc = arg;
2001	vlapic = malloc(sizeof(struct vlapic), M_SVM_VLAPIC, M_WAITOK | M_ZERO);
2002	vlapic->vm = svm_sc->vm;
2003	vlapic->vcpuid = vcpuid;
2004	vlapic->apic_page = (struct LAPIC *)&svm_sc->apic_page[vcpuid];
2005
2006	vlapic_init(vlapic);
2007
2008	return (vlapic);
2009}
2010
2011static void
2012svm_vlapic_cleanup(void *arg, struct vlapic *vlapic)
2013{
2014
2015	vlapic_cleanup(vlapic);
2016	free(vlapic, M_SVM_VLAPIC);
2017}
2018
2019struct vmm_ops vmm_ops_amd = {
2020	svm_init,
2021	svm_cleanup,
2022	svm_restore,
2023	svm_vminit,
2024	svm_vmrun,
2025	svm_vmcleanup,
2026	svm_getreg,
2027	svm_setreg,
2028	svm_getdesc,
2029	svm_setdesc,
2030	svm_getcap,
2031	svm_setcap,
2032	svm_npt_alloc,
2033	svm_npt_free,
2034	svm_vlapic_init,
2035	svm_vlapic_cleanup
2036};
2037