/*-
 * Copyright (c) 2013, Anish Gupta (akgupt3@gmail.com)
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: projects/bhyve_svm/sys/amd64/vmm/amd/svm.c 271570 2014-09-14 04:39:04Z neel $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/smp.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/pcpu.h>
#include <sys/proc.h>
#include <sys/sysctl.h>

#include <vm/vm.h>
#include <vm/pmap.h>

#include <machine/cpufunc.h>
#include <machine/psl.h>
#include <machine/pmap.h>
#include <machine/md_var.h>
#include <machine/vmparam.h>
#include <machine/specialreg.h>
#include <machine/segments.h>
#include <machine/smp.h>
#include <machine/vmm.h>
#include <machine/vmm_dev.h>
#include <machine/vmm_instruction_emul.h>

#include <x86/apicreg.h>

#include "vmm_lapic.h"
#include "vmm_msr.h"
#include "vmm_stat.h"
#include "vmm_ktr.h"
#include "vmm_ioport.h"
#include "vatpic.h"
#include "vlapic.h"
#include "vlapic_priv.h"

#include "x86.h"
#include "vmcb.h"
#include "svm.h"
#include "svm_softc.h"
#include "npt.h"

SYSCTL_DECL(_hw_vmm);
SYSCTL_NODE(_hw_vmm, OID_AUTO, svm, CTLFLAG_RW, NULL, NULL);

/*
 * SVM CPUID function 0x8000_000A, edx bit decoding.
 */
#define	AMD_CPUID_SVM_NP		BIT(0)  /* Nested paging or RVI */
#define	AMD_CPUID_SVM_LBR		BIT(1)  /* Last branch virtualization */
#define	AMD_CPUID_SVM_SVML		BIT(2)  /* SVM lock */
#define	AMD_CPUID_SVM_NRIP_SAVE		BIT(3)  /* Next RIP is saved */
#define	AMD_CPUID_SVM_TSC_RATE		BIT(4)  /* TSC rate control. */
#define	AMD_CPUID_SVM_VMCB_CLEAN	BIT(5)  /* VMCB state caching */
#define	AMD_CPUID_SVM_FLUSH_BY_ASID	BIT(6)  /* Flush by ASID */
#define	AMD_CPUID_SVM_DECODE_ASSIST	BIT(7)  /* Decode assist */
#define	AMD_CPUID_SVM_PAUSE_INC		BIT(10) /* Pause intercept filter. */
#define	AMD_CPUID_SVM_PAUSE_FTH		BIT(12) /* Pause filter threshold */

#define	VMCB_CACHE_DEFAULT	(VMCB_CACHE_ASID	|	\
				VMCB_CACHE_IOPM		|	\
				VMCB_CACHE_I		|	\
				VMCB_CACHE_TPR		|	\
				VMCB_CACHE_NP)

MALLOC_DEFINE(M_SVM, "svm", "svm");
MALLOC_DEFINE(M_SVM_VLAPIC, "svm-vlapic", "svm-vlapic");

/* Per-CPU context area. */
extern struct pcpu __pcpu[];

static int svm_getdesc(void *arg, int vcpu, int type, struct seg_desc *desc);

static uint32_t svm_feature;	/* AMD SVM features. */
SYSCTL_UINT(_hw_vmm_svm, OID_AUTO, features, CTLFLAG_RD, &svm_feature, 0,
    "SVM features advertised by CPUID.8000000AH:EDX");

static int disable_npf_assist;
SYSCTL_INT(_hw_vmm_svm, OID_AUTO, disable_npf_assist, CTLFLAG_RWTUN,
    &disable_npf_assist, 0, NULL);

/* Maximum ASIDs supported by the processor */
static uint32_t nasid;
SYSCTL_UINT(_hw_vmm_svm, OID_AUTO, num_asids, CTLFLAG_RD, &nasid, 0,
    "Number of ASIDs supported by this processor");

/* Current ASID generation for each host cpu */
static struct asid asid[MAXCPU];

/*
 * SVM host state saved area of size 4KB for each core.
 */
static uint8_t hsave[MAXCPU][PAGE_SIZE] __aligned(PAGE_SIZE);

/*
 * S/w saved host context.
 */
static struct svm_regctx host_ctx[MAXCPU];

static VMM_STAT_AMD(VCPU_EXITINTINFO, "VM exits during event delivery");
static VMM_STAT_AMD(VCPU_INTINFO_INJECTED, "Events pending at VM entry");
static VMM_STAT_AMD(VMEXIT_VINTR, "VM exits due to interrupt window");

/*
 * Common function to enable or disable SVM for a CPU.
 */
static int
cpu_svm_enable_disable(boolean_t enable)
{
	uint64_t efer_msr;

	efer_msr = rdmsr(MSR_EFER);

	if (enable)
		efer_msr |= EFER_SVM;
	else
		efer_msr &= ~EFER_SVM;

	wrmsr(MSR_EFER, efer_msr);

	return (0);
}
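
/*
 * For example, on a cpu already running in long mode EFER might read as
 * 0x500 (LME|LMA); enabling SVM sets bit 12 (EFER_SVM == 0x1000) for a
 * final value of 0x1500.
 */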

/*
 * Disable SVM on a CPU.
 */
static void
svm_disable(void *arg __unused)
{

	(void)cpu_svm_enable_disable(FALSE);
}

/*
 * Disable SVM for all CPUs.
 */
static int
svm_cleanup(void)
{

	smp_rendezvous(NULL, svm_disable, NULL, NULL);
	return (0);
}

/*
 * Check for required BHyVe SVM features in a CPU.
 */
static int
svm_cpuid_features(void)
{
	u_int regs[4];

	/* CPUID Fn8000_000A is for SVM */
	do_cpuid(0x8000000A, regs);
	svm_feature = regs[3];

	printf("SVM rev: 0x%x NASID:0x%x\n", regs[0] & 0xFF, regs[1]);
	nasid = regs[1];
	KASSERT(nasid > 1, ("Insufficient ASIDs for guests: %#x", nasid));

	printf("SVM Features:0x%b\n", svm_feature,
		"\020"
		"\001NP"		/* Nested paging */
		"\002LbrVirt"		/* LBR virtualization */
		"\003SVML"		/* SVM lock */
		"\004NRIPS"		/* NRIP save */
		"\005TscRateMsr"	/* MSR based TSC rate control */
		"\006VmcbClean"		/* VMCB clean bits */
		"\007FlushByAsid"	/* Flush by ASID */
		"\010DecodeAssist"	/* Decode assist */
		"\011<b8>"
		"\012<b9>"
		"\013PauseFilter"
		"\014<b11>"
		"\015PauseFilterThreshold"
		"\016AVIC"
		);

	/* SVM Lock */
	if (!(svm_feature & AMD_CPUID_SVM_SVML)) {
		printf("SVM is disabled by BIOS, please enable in BIOS.\n");
		return (ENXIO);
	}

	/*
	 * bhyve needs RVI to work.
	 */
	if (!(svm_feature & AMD_CPUID_SVM_NP)) {
		printf("Missing Nested paging or RVI SVM support in processor.\n");
		return (EIO);
	}

	if (svm_feature & AMD_CPUID_SVM_NRIP_SAVE)
		return (0);

	return (EIO);
}

static __inline int
flush_by_asid(void)
{

	return (svm_feature & AMD_CPUID_SVM_FLUSH_BY_ASID);
}

static __inline int
decode_assist(void)
{

	return (svm_feature & AMD_CPUID_SVM_DECODE_ASSIST);
}

/*
 * Enable SVM for a CPU.
 */
static void
svm_enable(void *arg __unused)
{
	uint64_t hsave_pa;

	(void)cpu_svm_enable_disable(TRUE);

	hsave_pa = vtophys(hsave[curcpu]);
	wrmsr(MSR_VM_HSAVE_PA, hsave_pa);

	if (rdmsr(MSR_VM_HSAVE_PA) != hsave_pa) {
		panic("VM_HSAVE_PA is wrong on CPU%d\n", curcpu);
	}
}
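
/*
 * Note on the host save area: MSR_VM_HSAVE_PA must point to a page-aligned
 * 4KB region where the processor spills host state across VMRUN/#VMEXIT,
 * which is why 'hsave' above is declared PAGE_SIZE-aligned. The read-back
 * check in svm_enable() catches the MSR write being silently dropped.
 */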

/*
 * Check if a processor supports SVM.
 */
static int
is_svm_enabled(void)
{
	uint64_t msr;

	/* Section 15.4 Enabling SVM from APM2. */
	if ((amd_feature2 & AMDID2_SVM) == 0) {
		printf("SVM is not supported on this processor.\n");
		return (ENXIO);
	}

	msr = rdmsr(MSR_VM_CR);
	/* Make sure SVM is not disabled by BIOS. */
	if ((msr & VM_CR_SVMDIS) == 0) {
		return (svm_cpuid_features());
	}

	printf("SVM disabled by Key, consult TPM/BIOS manual.\n");
	return (ENXIO);
}

/*
 * Enable SVM on CPU and initialize nested page table h/w.
 */
static int
svm_init(int ipinum)
{
	int err, cpu;

	err = is_svm_enabled();
	if (err)
		return (err);

	for (cpu = 0; cpu < MAXCPU; cpu++) {
		/*
		 * Initialize the host ASIDs to their "highest" valid values.
		 *
		 * The next ASID allocation will rollover both 'gen' and 'num'
		 * and start off the sequence at {1,1}.
		 */
		asid[cpu].gen = ~0UL;
		asid[cpu].num = nasid - 1;
	}

	svm_npt_init(ipinum);

	/* Start SVM on all CPUs */
	smp_rendezvous(NULL, svm_enable, NULL, NULL);

	return (0);
}

static void
svm_restore(void)
{

	svm_enable(NULL);
}

/*
 * Get the index and bit position for an MSR in the MSR permission bitmap.
 * Two bits are used for each MSR: the lower bit is for read and the
 * higher bit is for write.
 */
static int
svm_msr_index(uint64_t msr, int *index, int *bit)
{
	uint32_t base, off;

/* Pentium compatible MSRs */
#define	MSR_PENTIUM_START	0
#define	MSR_PENTIUM_END		0x1FFF
/* AMD 6th generation and Intel compatible MSRs */
#define	MSR_AMD6TH_START	0xC0000000UL
#define	MSR_AMD6TH_END		0xC0001FFFUL
/* AMD 7th and 8th generation compatible MSRs */
#define	MSR_AMD7TH_START	0xC0010000UL
#define	MSR_AMD7TH_END		0xC0011FFFUL

	*index = -1;
	*bit = (msr % 4) * 2;
	base = 0;

	if (msr >= MSR_PENTIUM_START && msr <= MSR_PENTIUM_END) {
		*index = msr / 4;
		return (0);
	}

	base += (MSR_PENTIUM_END - MSR_PENTIUM_START + 1);
	if (msr >= MSR_AMD6TH_START && msr <= MSR_AMD6TH_END) {
		off = (msr - MSR_AMD6TH_START);
		*index = (off + base) / 4;
		return (0);
	}

	base += (MSR_AMD6TH_END - MSR_AMD6TH_START + 1);
	if (msr >= MSR_AMD7TH_START && msr <= MSR_AMD7TH_END) {
		off = (msr - MSR_AMD7TH_START);
		*index = (off + base) / 4;
		return (0);
	}

	return (EIO);
}
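
/*
 * A worked example of the computation above: for MSR_EFER (0xc0000080)
 * the Pentium range contributes a base of 0x2000, the offset into the
 * AMD 6th generation range is 0x80, so *index = (0x2000 + 0x80) / 4 =
 * 0x820 and *bit = (0xc0000080 % 4) * 2 = 0. The read permission for
 * EFER is therefore bit 0 of perm_bitmap[0x820] and the write
 * permission is bit 1.
 */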

/*
 * Give a virtual cpu access to an MSR (read, write, or both).
 */
static int
svm_msr_perm(uint8_t *perm_bitmap, uint64_t msr, bool read, bool write)
{
	int index, bit, err;

	err = svm_msr_index(msr, &index, &bit);
	if (err) {
		ERR("MSR 0x%lx is not writeable by guest.\n", msr);
		return (err);
	}

	if (index < 0 || index > (SVM_MSR_BITMAP_SIZE)) {
		ERR("MSR 0x%lx index out of range(%d).\n", msr, index);
		return (EINVAL);
	}
	if (bit < 0 || bit > 8) {
		ERR("MSR 0x%lx bit out of range(%d).\n", msr, bit);
		return (EINVAL);
	}

	/* Disable intercept for read and write. */
	if (read)
		perm_bitmap[index] &= ~(1UL << bit);
	if (write)
		perm_bitmap[index] &= ~(2UL << bit);
	CTR2(KTR_VMM, "Guest has control:0x%x on SVM:MSR(0x%lx).\n",
	    (perm_bitmap[index] >> bit) & 0x3, msr);

	return (0);
}

static int
svm_msr_rw_ok(uint8_t *perm_bitmap, uint64_t msr)
{

	return (svm_msr_perm(perm_bitmap, msr, true, true));
}

static int
svm_msr_rd_ok(uint8_t *perm_bitmap, uint64_t msr)
{

	return (svm_msr_perm(perm_bitmap, msr, true, false));
}

static __inline void
vcpu_set_dirty(struct svm_softc *sc, int vcpu, uint32_t dirtybits)
{
	struct svm_vcpu *vcpustate;

	vcpustate = svm_get_vcpu(sc, vcpu);

	vcpustate->dirty |= dirtybits;
}

static __inline int
svm_get_intercept(struct svm_softc *sc, int vcpu, int idx, uint32_t bitmask)
{
	struct vmcb_ctrl *ctrl;

	KASSERT(idx >= 0 && idx < 5, ("invalid intercept index %d", idx));

	ctrl = svm_get_vmcb_ctrl(sc, vcpu);
	return (ctrl->intercept[idx] & bitmask ? 1 : 0);
}

static __inline void
svm_set_intercept(struct svm_softc *sc, int vcpu, int idx, uint32_t bitmask,
    int enabled)
{
	struct vmcb_ctrl *ctrl;
	uint32_t oldval;

	KASSERT(idx >= 0 && idx < 5, ("invalid intercept index %d", idx));

	ctrl = svm_get_vmcb_ctrl(sc, vcpu);
	oldval = ctrl->intercept[idx];

	if (enabled)
		ctrl->intercept[idx] |= bitmask;
	else
		ctrl->intercept[idx] &= ~bitmask;

	if (ctrl->intercept[idx] != oldval) {
		vcpu_set_dirty(sc, vcpu, VMCB_CACHE_I);
		VCPU_CTR3(sc->vm, vcpu, "intercept[%d] modified "
		    "from %#x to %#x", idx, oldval, ctrl->intercept[idx]);
	}
}

static __inline void
svm_disable_intercept(struct svm_softc *sc, int vcpu, int off, uint32_t bitmask)
{

	svm_set_intercept(sc, vcpu, off, bitmask, 0);
}

static __inline void
svm_enable_intercept(struct svm_softc *sc, int vcpu, int off, uint32_t bitmask)
{

	svm_set_intercept(sc, vcpu, off, bitmask, 1);
}
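
/*
 * Initialize the invariant portions of a vcpu's VMCB: the physical
 * addresses of the I/O and MSR permission bitmaps, the nested page
 * table root, the default set of intercepts, virtual interrupt masking
 * and the initial guest EFER and PAT.
 */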
static void
vmcb_init(struct svm_softc *sc, int vcpu, uint64_t iopm_base_pa,
    uint64_t msrpm_base_pa, uint64_t np_pml4)
{
	struct vmcb_ctrl *ctrl;
	struct vmcb_state *state;
	uint32_t mask;
	int n;

	ctrl = svm_get_vmcb_ctrl(sc, vcpu);
	state = svm_get_vmcb_state(sc, vcpu);

	ctrl->iopm_base_pa = iopm_base_pa;
	ctrl->msrpm_base_pa = msrpm_base_pa;

	/* Enable nested paging */
	ctrl->np_enable = 1;
	ctrl->n_cr3 = np_pml4;

	/*
	 * Intercept accesses to the control registers that are not shadowed
	 * in the VMCB - i.e. all except cr0, cr2, cr3, cr4 and cr8.
	 */
	for (n = 0; n < 16; n++) {
		mask = (BIT(n) << 16) | BIT(n);
		if (n == 0 || n == 2 || n == 3 || n == 4 || n == 8)
			svm_disable_intercept(sc, vcpu, VMCB_CR_INTCPT, mask);
		else
			svm_enable_intercept(sc, vcpu, VMCB_CR_INTCPT, mask);
	}

	/* Intercept Machine Check exceptions. */
	svm_enable_intercept(sc, vcpu, VMCB_EXC_INTCPT, BIT(IDT_MC));

	/* Intercept various events (e.g. I/O, MSR and CPUID accesses) */
	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IO);
	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_MSR);
	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_CPUID);
	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_INTR);
	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_INIT);
	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_NMI);
	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_SMI);
	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_SHUTDOWN);
	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
	    VMCB_INTCPT_FERR_FREEZE);

	/*
	 * From the section "Canonicalization and Consistency Checks" in APMv2
	 * the VMRUN intercept bit must be set to pass the consistency check.
	 */
	svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_VMRUN);

	/*
	 * The ASID will be set to a non-zero value just before VMRUN.
	 */
	ctrl->asid = 0;

	/*
	 * Section 15.21.1, Interrupt Masking in EFLAGS
	 * Section 15.21.2, Virtualizing APIC.TPR
	 *
	 * This must be set for %rflag and %cr8 isolation of guest and host.
	 */
	ctrl->v_intr_masking = 1;

	/* Enable Last Branch Record aka LBR for debugging */
	ctrl->lbr_virt_en = 1;
	state->dbgctl = BIT(0);

	/* EFER_SVM must always be set when the guest is executing */
	state->efer = EFER_SVM;

	/* Set up the PAT to power-on state */
	state->g_pat = PAT_VALUE(0, PAT_WRITE_BACK)	|
	    PAT_VALUE(1, PAT_WRITE_THROUGH)	|
	    PAT_VALUE(2, PAT_UNCACHED)		|
	    PAT_VALUE(3, PAT_UNCACHEABLE)	|
	    PAT_VALUE(4, PAT_WRITE_BACK)	|
	    PAT_VALUE(5, PAT_WRITE_THROUGH)	|
	    PAT_VALUE(6, PAT_UNCACHED)		|
	    PAT_VALUE(7, PAT_UNCACHEABLE);
}
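
/*
 * A sanity check on the guest PAT assembled above: PAT_VALUE(i, v)
 * places the 3-bit memory type 'v' in byte 'i', so g_pat works out to
 * 0x0007040600070406, matching the architectural power-on PAT value.
 */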

/*
 * Initialise a virtual machine.
 */
static void *
svm_vminit(struct vm *vm, pmap_t pmap)
{
	struct svm_softc *svm_sc;
	struct svm_vcpu *vcpu;
	vm_paddr_t msrpm_pa, iopm_pa, pml4_pa;
	int i;

	svm_sc = (struct svm_softc *)malloc(sizeof (struct svm_softc),
	    M_SVM, M_WAITOK | M_ZERO);

	svm_sc->vm = vm;
	svm_sc->svm_feature = svm_feature;
	svm_sc->vcpu_cnt = VM_MAXCPU;
	svm_sc->nptp = (vm_offset_t)vtophys(pmap->pm_pml4);

	/*
	 * Intercept MSR access to all MSRs except GSBASE, FSBASE, etc.
	 */
	memset(svm_sc->msr_bitmap, 0xFF, sizeof(svm_sc->msr_bitmap));

	/*
	 * The following MSRs can be completely controlled by the virtual
	 * machine since accesses to them are translated to accesses to
	 * the VMCB.
	 */
	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_GSBASE);
	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_FSBASE);
	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_KGSBASE);

	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_STAR);
	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_LSTAR);
	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_CSTAR);
	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SF_MASK);
	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SYSENTER_CS_MSR);
	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SYSENTER_ESP_MSR);
	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SYSENTER_EIP_MSR);

	/* For Nested Paging/RVI only. */
	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_PAT);

	svm_msr_rd_ok(svm_sc->msr_bitmap, MSR_TSC);
	svm_msr_rd_ok(svm_sc->msr_bitmap, MSR_EFER);

	/* Intercept access to all I/O ports. */
	memset(svm_sc->iopm_bitmap, 0xFF, sizeof(svm_sc->iopm_bitmap));

	/* Cache physical address for multiple vcpus. */
	iopm_pa = vtophys(svm_sc->iopm_bitmap);
	msrpm_pa = vtophys(svm_sc->msr_bitmap);
	pml4_pa = svm_sc->nptp;

	for (i = 0; i < svm_sc->vcpu_cnt; i++) {
		vcpu = svm_get_vcpu(svm_sc, i);
		vcpu->lastcpu = NOCPU;
		vcpu->vmcb_pa = vtophys(&vcpu->vmcb);
		vmcb_init(svm_sc, i, iopm_pa, msrpm_pa, pml4_pa);
	}
	return (svm_sc);
}

static int
svm_cpl(struct vmcb_state *state)
{

	/*
	 * From APMv2:
	 * "Retrieve the CPL from the CPL field in the VMCB, not
	 *  from any segment DPL"
	 */
	return (state->cpl);
}

static enum vm_cpu_mode
svm_vcpu_mode(struct vmcb *vmcb)
{
	struct vmcb_segment *seg;
	struct vmcb_state *state;

	state = &vmcb->state;

	if (state->efer & EFER_LMA) {
		seg = vmcb_seg(vmcb, VM_REG_GUEST_CS);
		/*
		 * Section 4.8.1 of APM2: check if the Code Segment has
		 * the Long attribute set in its descriptor.
		 */
		if (seg->attrib & VMCB_CS_ATTRIB_L)
			return (CPU_MODE_64BIT);
		else
			return (CPU_MODE_COMPATIBILITY);
	} else if (state->cr0 & CR0_PE) {
		return (CPU_MODE_PROTECTED);
	} else {
		return (CPU_MODE_REAL);
	}
}

static enum vm_paging_mode
svm_paging_mode(uint64_t cr0, uint64_t cr4, uint64_t efer)
{

	if ((cr0 & CR0_PG) == 0)
		return (PAGING_MODE_FLAT);
	if ((cr4 & CR4_PAE) == 0)
		return (PAGING_MODE_32);
	if (efer & EFER_LME)
		return (PAGING_MODE_64);
	else
		return (PAGING_MODE_PAE);
}

/*
 * ins/outs utility routines
 */
static uint64_t
svm_inout_str_index(struct svm_regctx *regs, int in)
{
	uint64_t val;

	val = in ? regs->e.g.sctx_rdi : regs->e.g.sctx_rsi;

	return (val);
}

static uint64_t
svm_inout_str_count(struct svm_regctx *regs, int rep)
{
	uint64_t val;

	val = rep ? regs->sctx_rcx : 1;

	return (val);
}

static void
svm_inout_str_seginfo(struct svm_softc *svm_sc, int vcpu, int64_t info1,
    int in, struct vm_inout_str *vis)
{
	int error, s;

	if (in) {
		vis->seg_name = VM_REG_GUEST_ES;
	} else {
		/* The segment field has standard encoding */
		s = (info1 >> 10) & 0x7;
		vis->seg_name = vm_segment_name(s);
	}

	error = svm_getdesc(svm_sc, vcpu, vis->seg_name, &vis->seg_desc);
	KASSERT(error == 0, ("%s: svm_getdesc error %d", __func__, error));
}

static int
svm_inout_str_addrsize(uint64_t info1)
{
	uint32_t size;

	size = (info1 >> 7) & 0x7;
	switch (size) {
	case 1:
		return (2);	/* 16 bit */
	case 2:
		return (4);	/* 32 bit */
	case 4:
		return (8);	/* 64 bit */
	default:
		panic("%s: invalid size encoding %d", __func__, size);
	}
}

static void
svm_paging_info(struct vmcb *vmcb, struct vm_guest_paging *paging)
{
	struct vmcb_state *state;

	state = &vmcb->state;
	paging->cr3 = state->cr3;
	paging->cpl = svm_cpl(state);
	paging->cpu_mode = svm_vcpu_mode(vmcb);
	paging->paging_mode = svm_paging_mode(state->cr0, state->cr4,
	    state->efer);
}
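
/*
 * EXITINFO1 layout for an IOIO intercept, as decoded below: bit 0 is the
 * direction (1 = in), bit 2 the string flag, bit 3 the rep prefix, bits
 * 6:4 the access size in bytes (1, 2 or 4), bits 9:7 the address size
 * (16/32/64-bit) and bits 31:16 the port. As an illustration, an 'inb'
 * from port 0x71 with a 16-bit address size would produce
 * info1 = 0x00710091 (PORT=0x71, A16, SZ8, TYPE=in).
 */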

#define	UNHANDLED 0

/*
 * Handle guest I/O intercept.
 */
static int
svm_handle_io(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit)
{
	struct vmcb_ctrl *ctrl;
	struct vmcb_state *state;
	struct svm_regctx *regs;
	struct vm_inout_str *vis;
	uint64_t info1;
	int inout_string;

	state = svm_get_vmcb_state(svm_sc, vcpu);
	ctrl = svm_get_vmcb_ctrl(svm_sc, vcpu);
	regs = svm_get_guest_regctx(svm_sc, vcpu);

	info1 = ctrl->exitinfo1;
	inout_string = info1 & BIT(2) ? 1 : 0;

	/*
	 * The effective segment number in EXITINFO1[12:10] is populated
	 * only if the processor has the DecodeAssist capability.
	 *
	 * XXX this is not specified explicitly in APMv2 but can be verified
	 * empirically.
	 */
	if (inout_string && !decode_assist())
		return (UNHANDLED);

	vmexit->exitcode	= VM_EXITCODE_INOUT;
	vmexit->u.inout.in	= (info1 & BIT(0)) ? 1 : 0;
	vmexit->u.inout.string	= inout_string;
	vmexit->u.inout.rep	= (info1 & BIT(3)) ? 1 : 0;
	vmexit->u.inout.bytes	= (info1 >> 4) & 0x7;
	vmexit->u.inout.port	= (uint16_t)(info1 >> 16);
	vmexit->u.inout.eax	= (uint32_t)(state->rax);

	if (inout_string) {
		vmexit->exitcode = VM_EXITCODE_INOUT_STR;
		vis = &vmexit->u.inout_str;
		svm_paging_info(svm_get_vmcb(svm_sc, vcpu), &vis->paging);
		vis->rflags = state->rflags;
		vis->cr0 = state->cr0;
		vis->index = svm_inout_str_index(regs, vmexit->u.inout.in);
		vis->count = svm_inout_str_count(regs, vmexit->u.inout.rep);
		vis->addrsize = svm_inout_str_addrsize(info1);
		svm_inout_str_seginfo(svm_sc, vcpu, info1,
		    vmexit->u.inout.in, vis);
	}

	return (UNHANDLED);
}

static int
svm_npf_paging(uint64_t exitinfo1)
{

	if (exitinfo1 & VMCB_NPF_INFO1_W)
		return (VM_PROT_WRITE);

	return (VM_PROT_READ);
}

static bool
svm_npf_emul_fault(uint64_t exitinfo1)
{

	if (exitinfo1 & VMCB_NPF_INFO1_ID) {
		return (false);
	}

	if (exitinfo1 & VMCB_NPF_INFO1_GPT) {
		return (false);
	}

	if ((exitinfo1 & VMCB_NPF_INFO1_GPA) == 0) {
		return (false);
	}

	return (true);
}
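
/*
 * Set up an instruction emulation exit for a nested page fault that did
 * not hit guest RAM, the signature of emulated MMIO: record the faulting
 * guest physical address and the guest paging state, note the CS.D bit
 * for the default operand size, and hand any decode-assist instruction
 * bytes to the instruction emulation.
 */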
static void
svm_handle_inst_emul(struct vmcb *vmcb, uint64_t gpa, struct vm_exit *vmexit)
{
	struct vm_guest_paging *paging;
	struct vmcb_segment *seg;
	struct vmcb_ctrl *ctrl;
	char *inst_bytes;
	int inst_len;

	ctrl = &vmcb->ctrl;
	paging = &vmexit->u.inst_emul.paging;

	vmexit->exitcode = VM_EXITCODE_INST_EMUL;
	vmexit->u.inst_emul.gpa = gpa;
	vmexit->u.inst_emul.gla = VIE_INVALID_GLA;
	svm_paging_info(vmcb, paging);

	seg = vmcb_seg(vmcb, VM_REG_GUEST_CS);
	switch (paging->cpu_mode) {
	case CPU_MODE_PROTECTED:
	case CPU_MODE_COMPATIBILITY:
		/*
		 * Section 4.8.1 of APM2, Default Operand Size or D bit.
		 */
		vmexit->u.inst_emul.cs_d = (seg->attrib & VMCB_CS_ATTRIB_D) ?
		    1 : 0;
		break;
	default:
		vmexit->u.inst_emul.cs_d = 0;
		break;
	}

	/*
	 * Copy the instruction bytes into 'vie' if available.
	 */
	if (decode_assist() && !disable_npf_assist) {
		inst_len = ctrl->inst_len;
		inst_bytes = ctrl->inst_bytes;
	} else {
		inst_len = 0;
		inst_bytes = NULL;
	}
	vie_init(&vmexit->u.inst_emul.vie, inst_bytes, inst_len);
}

/*
 * Intercept access to MSR_EFER to prevent the guest from clearing the
 * SVM enable bit.
 */
static void
svm_write_efer(struct svm_softc *sc, int vcpu, uint32_t edx, uint32_t eax)
{
	struct vmcb_state *state;
	uint64_t oldval;

	state = svm_get_vmcb_state(sc, vcpu);

	oldval = state->efer;
	state->efer = (uint64_t)edx << 32 | eax | EFER_SVM;
	if (state->efer != oldval) {
		VCPU_CTR2(sc->vm, vcpu, "Guest EFER changed from %#lx to %#lx",
		    oldval, state->efer);
		vcpu_set_dirty(sc, vcpu, VMCB_CACHE_CR);
	}
}

#ifdef KTR
static const char *
intrtype_to_str(int intr_type)
{
	switch (intr_type) {
	case VMCB_EVENTINJ_TYPE_INTR:
		return ("hwintr");
	case VMCB_EVENTINJ_TYPE_NMI:
		return ("nmi");
	case VMCB_EVENTINJ_TYPE_INTn:
		return ("swintr");
	case VMCB_EVENTINJ_TYPE_EXCEPTION:
		return ("exception");
	default:
		panic("%s: unknown intr_type %d", __func__, intr_type);
	}
}
#endif
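
/*
 * EVENTINJ encoding used below: the vector occupies bits 7:0, the event
 * type bits 10:8, the error-code-valid flag bit 11 and the valid flag
 * bit 31, with the error code in bits 63:32. For instance, injecting a
 * #GP (vector 13) with a zero error code amounts to
 * eventinj = 13 | (VMCB_EVENTINJ_TYPE_EXCEPTION << 8) |
 *     VMCB_EVENTINJ_EC_VALID | VMCB_EVENTINJ_VALID.
 */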

/*
 * Inject an event to vcpu as described in section 15.20, "Event injection".
 */
static void
svm_eventinject(struct svm_softc *sc, int vcpu, int intr_type, int vector,
    uint32_t error, bool ec_valid)
{
	struct vmcb_ctrl *ctrl;

	ctrl = svm_get_vmcb_ctrl(sc, vcpu);

	KASSERT((ctrl->eventinj & VMCB_EVENTINJ_VALID) == 0,
	    ("%s: event already pending %#lx", __func__, ctrl->eventinj));

	KASSERT(vector >= 0 && vector <= 255, ("%s: invalid vector %d",
	    __func__, vector));

	switch (intr_type) {
	case VMCB_EVENTINJ_TYPE_INTR:
	case VMCB_EVENTINJ_TYPE_NMI:
	case VMCB_EVENTINJ_TYPE_INTn:
		break;
	case VMCB_EVENTINJ_TYPE_EXCEPTION:
		if (vector >= 0 && vector <= 31 && vector != 2)
			break;
		/* FALLTHROUGH */
	default:
		panic("%s: invalid intr_type/vector: %d/%d", __func__,
		    intr_type, vector);
	}
	ctrl->eventinj = vector | (intr_type << 8) | VMCB_EVENTINJ_VALID;
	if (ec_valid) {
		ctrl->eventinj |= VMCB_EVENTINJ_EC_VALID;
		ctrl->eventinj |= (uint64_t)error << 32;
		VCPU_CTR3(sc->vm, vcpu, "Injecting %s at vector %d errcode %#x",
		    intrtype_to_str(intr_type), vector, error);
	} else {
		VCPU_CTR2(sc->vm, vcpu, "Injecting %s at vector %d",
		    intrtype_to_str(intr_type), vector);
	}
}

static void
svm_save_intinfo(struct svm_softc *svm_sc, int vcpu)
{
	struct vmcb_ctrl *ctrl;
	uint64_t intinfo;

	ctrl = svm_get_vmcb_ctrl(svm_sc, vcpu);
	intinfo = ctrl->exitintinfo;
	if (!VMCB_EXITINTINFO_VALID(intinfo))
		return;

	/*
	 * From APMv2, Section "Intercepts during IDT interrupt delivery"
	 *
	 * If a #VMEXIT happened during event delivery then record the event
	 * that was being delivered.
	 */
	VCPU_CTR2(svm_sc->vm, vcpu, "SVM:Pending INTINFO(0x%lx), vector=%d.\n",
	    intinfo, VMCB_EXITINTINFO_VECTOR(intinfo));
	vmm_stat_incr(svm_sc->vm, vcpu, VCPU_EXITINTINFO, 1);
	vm_exit_intinfo(svm_sc->vm, vcpu, intinfo);
}

static __inline void
enable_intr_window_exiting(struct svm_softc *sc, int vcpu)
{
	struct vmcb_ctrl *ctrl;

	ctrl = svm_get_vmcb_ctrl(sc, vcpu);

	if (ctrl->v_irq == 0) {
		VCPU_CTR0(sc->vm, vcpu, "Enable intr window exiting");
		ctrl->v_irq = 1;
		ctrl->v_ign_tpr = 1;
		vcpu_set_dirty(sc, vcpu, VMCB_CACHE_TPR);
		svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
		    VMCB_INTCPT_VINTR);
	}
}

static __inline void
disable_intr_window_exiting(struct svm_softc *sc, int vcpu)
{
	struct vmcb_ctrl *ctrl;

	ctrl = svm_get_vmcb_ctrl(sc, vcpu);

	if (ctrl->v_irq) {
		VCPU_CTR0(sc->vm, vcpu, "Disable intr window exiting");
		ctrl->v_irq = 0;
		vcpu_set_dirty(sc, vcpu, VMCB_CACHE_TPR);
		svm_disable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
		    VMCB_INTCPT_VINTR);
	}
}

static int
nmi_blocked(struct svm_softc *sc, int vcpu)
{
	/* XXX need to track NMI blocking */
	return (0);
}

static void
enable_nmi_blocking(struct svm_softc *sc, int vcpu)
{
	/* XXX enable iret intercept */
}

#ifdef notyet
static void
clear_nmi_blocking(struct svm_softc *sc, int vcpu)
{
	/* XXX disable iret intercept */
}
#endif

#ifdef KTR
static const char *
exit_reason_to_str(uint64_t reason)
{
	static char reasonbuf[32];

	switch (reason) {
	case VMCB_EXIT_INVALID:
		return ("invalvmcb");
	case VMCB_EXIT_SHUTDOWN:
		return ("shutdown");
	case VMCB_EXIT_NPF:
		return ("nptfault");
	case VMCB_EXIT_PAUSE:
		return ("pause");
	case VMCB_EXIT_HLT:
		return ("hlt");
	case VMCB_EXIT_CPUID:
		return ("cpuid");
	case VMCB_EXIT_IO:
		return ("inout");
	case VMCB_EXIT_MC:
		return ("mchk");
	case VMCB_EXIT_INTR:
		return ("extintr");
	case VMCB_EXIT_NMI:
		return ("nmi");
	case VMCB_EXIT_VINTR:
		return ("vintr");
	case VMCB_EXIT_MSR:
		return ("msr");
	default:
		snprintf(reasonbuf, sizeof(reasonbuf), "%#lx", reason);
		return (reasonbuf);
	}
}
#endif	/* KTR */

/*
 * From the section "State Saved on Exit" in APMv2: nRIP is saved for all
 * #VMEXITs that are due to instruction intercepts, as well as MSR and IOIO
 * intercepts and exceptions caused by INT3, INTO and BOUND instructions.
 *
 * Return 1 if the nRIP is valid and 0 otherwise.
 */
static int
nrip_valid(uint64_t exitcode)
{
	switch (exitcode) {
	case 0x00 ... 0x0F:	/* read of CR0 through CR15 */
	case 0x10 ... 0x1F:	/* write of CR0 through CR15 */
	case 0x20 ... 0x2F:	/* read of DR0 through DR15 */
	case 0x30 ... 0x3F:	/* write of DR0 through DR15 */
	case 0x43:		/* INT3 */
	case 0x44:		/* INTO */
	case 0x45:		/* BOUND */
	case 0x65 ... 0x7C:	/* VMEXIT_CR0_SEL_WRITE ... VMEXIT_MSR */
	case 0x80 ... 0x8D:	/* VMEXIT_VMRUN ... VMEXIT_XSETBV */
		return (1);
	default:
		return (0);
	}
}

/*
 * Collateral for a generic SVM VM-exit.
 */
static void
vm_exit_svm(struct vm_exit *vme, uint64_t code, uint64_t info1, uint64_t info2)
{

	vme->exitcode = VM_EXITCODE_SVM;
	vme->u.svm.exitcode = code;
	vme->u.svm.exitinfo1 = info1;
	vme->u.svm.exitinfo2 = info2;
}
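
/*
 * Handle a #VMEXIT for 'vcpu'. Returns 1 if the exit was handled
 * entirely in the kernel and the guest can be resumed, and 0 if the
 * exit must be completed in userland, in which case 'vmexit' carries
 * the collateral.
 */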
static int
svm_vmexit(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit)
{
	struct vmcb *vmcb;
	struct vmcb_state *state;
	struct vmcb_ctrl *ctrl;
	struct svm_regctx *ctx;
	uint64_t code, info1, info2, val;
	uint32_t eax, ecx, edx;
	int handled;
	bool retu;

	ctx = svm_get_guest_regctx(svm_sc, vcpu);
	vmcb = svm_get_vmcb(svm_sc, vcpu);
	state = &vmcb->state;
	ctrl = &vmcb->ctrl;

	handled = 0;
	code = ctrl->exitcode;
	info1 = ctrl->exitinfo1;
	info2 = ctrl->exitinfo2;

	vmexit->exitcode = VM_EXITCODE_BOGUS;
	vmexit->rip = state->rip;
	vmexit->inst_length = nrip_valid(code) ? ctrl->nrip - state->rip : 0;

	vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_COUNT, 1);

	/*
	 * #VMEXIT(INVALID) needs to be handled early because the VMCB is
	 * in an inconsistent state and can trigger assertions that would
	 * never happen otherwise.
	 */
	if (code == VMCB_EXIT_INVALID) {
		vm_exit_svm(vmexit, code, info1, info2);
		return (0);
	}

	KASSERT((ctrl->eventinj & VMCB_EVENTINJ_VALID) == 0, ("%s: event "
	    "injection valid bit is set %#lx", __func__, ctrl->eventinj));

	KASSERT(vmexit->inst_length >= 0 && vmexit->inst_length <= 15,
	    ("invalid inst_length %d: code (%#lx), info1 (%#lx), info2 (%#lx)",
	    vmexit->inst_length, code, info1, info2));

	svm_save_intinfo(svm_sc, vcpu);

	switch (code) {
	case VMCB_EXIT_VINTR:	/* interrupt window exiting */
		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_VINTR, 1);
		handled = 1;
		break;
	case VMCB_EXIT_INTR:	/* external interrupt */
		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_EXTINT, 1);
		handled = 1;
		break;
	case VMCB_EXIT_NMI:	/* external NMI */
		handled = 1;
		break;
	case VMCB_EXIT_MC:	/* machine check */
		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_EXCEPTION, 1);
		break;
	case VMCB_EXIT_MSR:	/* MSR access. */
		eax = state->rax;
		ecx = ctx->sctx_rcx;
		edx = ctx->e.g.sctx_rdx;
		retu = false;

		if (ecx == MSR_EFER) {
			KASSERT(info1 != 0, ("rdmsr(MSR_EFER) is not "
			    "emulated: info1(%#lx) info2(%#lx)",
			    info1, info2));
			svm_write_efer(svm_sc, vcpu, edx, eax);
			handled = 1;
			break;
		}

#define	MSR_AMDK8_IPM	0xc0010055
		/*
		 * Ignore access to the "Interrupt Pending Message" MSR.
		 */
		if (ecx == MSR_AMDK8_IPM) {
			if (!info1)
				state->rax = ctx->e.g.sctx_rdx = 0;
			handled = 1;
			break;
		}

		if (info1) {
			vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_WRMSR, 1);
			val = (uint64_t)edx << 32 | eax;
			VCPU_CTR2(svm_sc->vm, vcpu, "wrmsr %#x val %#lx",
			    ecx, val);
			if (emulate_wrmsr(svm_sc->vm, vcpu, ecx, val, &retu)) {
				vmexit->exitcode = VM_EXITCODE_WRMSR;
				vmexit->u.msr.code = ecx;
				vmexit->u.msr.wval = val;
			} else if (!retu) {
				handled = 1;
			} else {
				KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS,
				    ("emulate_wrmsr retu with bogus exitcode"));
			}
		} else {
			VCPU_CTR1(svm_sc->vm, vcpu, "rdmsr %#x", ecx);
			vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_RDMSR, 1);
			if (emulate_rdmsr(svm_sc->vm, vcpu, ecx, &retu)) {
				vmexit->exitcode = VM_EXITCODE_RDMSR;
				vmexit->u.msr.code = ecx;
			} else if (!retu) {
				handled = 1;
			} else {
				KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS,
				    ("emulate_rdmsr retu with bogus exitcode"));
			}
		}
		break;
	case VMCB_EXIT_IO:
		handled = svm_handle_io(svm_sc, vcpu, vmexit);
		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_INOUT, 1);
		break;
	case VMCB_EXIT_CPUID:
		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_CPUID, 1);
		handled = x86_emulate_cpuid(svm_sc->vm, vcpu,
		    (uint32_t *)&state->rax,
		    (uint32_t *)&ctx->sctx_rbx,
		    (uint32_t *)&ctx->sctx_rcx,
		    (uint32_t *)&ctx->e.g.sctx_rdx);
		break;
	case VMCB_EXIT_HLT:
		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_HLT, 1);
		vmexit->exitcode = VM_EXITCODE_HLT;
		vmexit->u.hlt.rflags = state->rflags;
		break;
	case VMCB_EXIT_PAUSE:
		vmexit->exitcode = VM_EXITCODE_PAUSE;
		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_PAUSE, 1);
		break;
	case VMCB_EXIT_NPF:
		/* EXITINFO2 contains the faulting guest physical address */
		if (info1 & VMCB_NPF_INFO1_RSV) {
			VCPU_CTR2(svm_sc->vm, vcpu, "nested page fault with "
			    "reserved bits set: info1(%#lx) info2(%#lx)",
			    info1, info2);
		} else if (vm_mem_allocated(svm_sc->vm, info2)) {
			vmexit->exitcode = VM_EXITCODE_PAGING;
			vmexit->u.paging.gpa = info2;
			vmexit->u.paging.fault_type = svm_npf_paging(info1);
			vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_NESTED_FAULT, 1);
			VCPU_CTR3(svm_sc->vm, vcpu, "nested page fault "
			    "on gpa %#lx/%#lx at rip %#lx",
			    info2, info1, state->rip);
		} else if (svm_npf_emul_fault(info1)) {
			svm_handle_inst_emul(vmcb, info2, vmexit);
			vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_INST_EMUL, 1);
			VCPU_CTR3(svm_sc->vm, vcpu, "inst_emul fault "
			    "for gpa %#lx/%#lx at rip %#lx",
			    info2, info1, state->rip);
		}
		break;
	default:
		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_UNKNOWN, 1);
		break;
	}
"handled" : "unhandled", exit_reason_to_str(code), 1270 vmexit->rip, vmexit->inst_length); 1271 1272 if (handled) { 1273 vmexit->rip += vmexit->inst_length; 1274 vmexit->inst_length = 0; 1275 state->rip = vmexit->rip; 1276 } else { 1277 if (vmexit->exitcode == VM_EXITCODE_BOGUS) { 1278 /* 1279 * If this VM exit was not claimed by anybody then 1280 * treat it as a generic SVM exit. 1281 */ 1282 vm_exit_svm(vmexit, code, info1, info2); 1283 } else { 1284 /* 1285 * The exitcode and collateral have been populated. 1286 * The VM exit will be processed further in userland. 1287 */ 1288 } 1289 } 1290 return (handled); 1291} 1292 1293static void 1294svm_inj_intinfo(struct svm_softc *svm_sc, int vcpu) 1295{ 1296 uint64_t intinfo; 1297 1298 if (!vm_entry_intinfo(svm_sc->vm, vcpu, &intinfo)) 1299 return; 1300 1301 KASSERT(VMCB_EXITINTINFO_VALID(intinfo), ("%s: entry intinfo is not " 1302 "valid: %#lx", __func__, intinfo)); 1303 1304 svm_eventinject(svm_sc, vcpu, VMCB_EXITINTINFO_TYPE(intinfo), 1305 VMCB_EXITINTINFO_VECTOR(intinfo), 1306 VMCB_EXITINTINFO_EC(intinfo), 1307 VMCB_EXITINTINFO_EC_VALID(intinfo)); 1308 vmm_stat_incr(svm_sc->vm, vcpu, VCPU_INTINFO_INJECTED, 1); 1309 VCPU_CTR1(svm_sc->vm, vcpu, "Injected entry intinfo: %#lx", intinfo); 1310} 1311 1312/* 1313 * Inject event to virtual cpu. 1314 */ 1315static void 1316svm_inj_interrupts(struct svm_softc *sc, int vcpu, struct vlapic *vlapic) 1317{ 1318 struct vmcb_ctrl *ctrl; 1319 struct vmcb_state *state; 1320 int extint_pending; 1321 int vector, need_intr_window; 1322 1323 state = svm_get_vmcb_state(sc, vcpu); 1324 ctrl = svm_get_vmcb_ctrl(sc, vcpu); 1325 1326 need_intr_window = 0; 1327 1328 /* 1329 * Inject pending events or exceptions for this vcpu. 1330 * 1331 * An event might be pending because the previous #VMEXIT happened 1332 * during event delivery (i.e. ctrl->exitintinfo). 1333 * 1334 * An event might also be pending because an exception was injected 1335 * by the hypervisor (e.g. #PF during instruction emulation). 1336 */ 1337 svm_inj_intinfo(sc, vcpu); 1338 1339 /* NMI event has priority over interrupts. */ 1340 if (vm_nmi_pending(sc->vm, vcpu)) { 1341 if (nmi_blocked(sc, vcpu)) { 1342 /* 1343 * Can't inject another NMI if the guest has not 1344 * yet executed an "iret" after the last NMI. 1345 */ 1346 VCPU_CTR0(sc->vm, vcpu, "Cannot inject NMI due " 1347 "to NMI-blocking"); 1348 } else if (ctrl->eventinj & VMCB_EVENTINJ_VALID) { 1349 /* 1350 * If there is already an exception/interrupt pending 1351 * then defer the NMI until after that. 1352 */ 1353 VCPU_CTR1(sc->vm, vcpu, "Cannot inject NMI due to " 1354 "eventinj %#lx", ctrl->eventinj); 1355 1356 /* 1357 * Use self-IPI to trigger a VM-exit as soon as 1358 * possible after the event injection is completed. 1359 * 1360 * This works only if the external interrupt exiting 1361 * is at a lower priority than the event injection. 1362 * 1363 * Although not explicitly specified in APMv2 the 1364 * relative priorities were verified empirically. 1365 */ 1366 ipi_cpu(curcpu, IPI_AST); /* XXX vmm_ipinum? 

/*
 * Inject event to virtual cpu.
 */
static void
svm_inj_interrupts(struct svm_softc *sc, int vcpu, struct vlapic *vlapic)
{
	struct vmcb_ctrl *ctrl;
	struct vmcb_state *state;
	int extint_pending;
	int vector, need_intr_window;

	state = svm_get_vmcb_state(sc, vcpu);
	ctrl  = svm_get_vmcb_ctrl(sc, vcpu);

	need_intr_window = 0;

	/*
	 * Inject pending events or exceptions for this vcpu.
	 *
	 * An event might be pending because the previous #VMEXIT happened
	 * during event delivery (i.e. ctrl->exitintinfo).
	 *
	 * An event might also be pending because an exception was injected
	 * by the hypervisor (e.g. #PF during instruction emulation).
	 */
	svm_inj_intinfo(sc, vcpu);

	/* NMI event has priority over interrupts. */
	if (vm_nmi_pending(sc->vm, vcpu)) {
		if (nmi_blocked(sc, vcpu)) {
			/*
			 * Can't inject another NMI if the guest has not
			 * yet executed an "iret" after the last NMI.
			 */
			VCPU_CTR0(sc->vm, vcpu, "Cannot inject NMI due "
			    "to NMI-blocking");
		} else if (ctrl->eventinj & VMCB_EVENTINJ_VALID) {
			/*
			 * If there is already an exception/interrupt pending
			 * then defer the NMI until after that.
			 */
			VCPU_CTR1(sc->vm, vcpu, "Cannot inject NMI due to "
			    "eventinj %#lx", ctrl->eventinj);

			/*
			 * Use self-IPI to trigger a VM-exit as soon as
			 * possible after the event injection is completed.
			 *
			 * This works only if the external interrupt exiting
			 * is at a lower priority than the event injection.
			 *
			 * Although not explicitly specified in APMv2 the
			 * relative priorities were verified empirically.
			 */
			ipi_cpu(curcpu, IPI_AST);	/* XXX vmm_ipinum? */
		} else {
			vm_nmi_clear(sc->vm, vcpu);

			/* Inject NMI, vector number is not used */
			svm_eventinject(sc, vcpu, VMCB_EVENTINJ_TYPE_NMI,
			    IDT_NMI, 0, false);

			/* virtual NMI blocking is now in effect */
			enable_nmi_blocking(sc, vcpu);

			VCPU_CTR0(sc->vm, vcpu, "Injecting vNMI");
		}
	}

	extint_pending = vm_extint_pending(sc->vm, vcpu);

	if (!extint_pending) {
		/* Ask the local apic for a vector to inject */
		if (!vlapic_pending_intr(vlapic, &vector)) {
			goto done;	/* nothing to inject */
		}
		KASSERT(vector >= 16 && vector <= 255,
		    ("invalid vector %d from local APIC", vector));
	} else {
		/* Ask the legacy pic for a vector to inject */
		vatpic_pending_intr(sc->vm, &vector);
		KASSERT(vector >= 0 && vector <= 255,
		    ("invalid vector %d from atpic", vector));
	}

	/*
	 * If the guest has disabled interrupts or is in an interrupt shadow
	 * then we cannot inject the pending interrupt.
	 */
	if ((state->rflags & PSL_I) == 0) {
		VCPU_CTR2(sc->vm, vcpu, "Cannot inject vector %d due to "
		    "rflags %#lx", vector, state->rflags);
		need_intr_window = 1;
		goto done;
	}

	if (ctrl->intr_shadow) {
		VCPU_CTR1(sc->vm, vcpu, "Cannot inject vector %d due to "
		    "interrupt shadow", vector);
		need_intr_window = 1;
		goto done;
	}

	if (ctrl->eventinj & VMCB_EVENTINJ_VALID) {
		VCPU_CTR2(sc->vm, vcpu, "Cannot inject vector %d due to "
		    "eventinj %#lx", vector, ctrl->eventinj);
		need_intr_window = 1;
		goto done;
	}

	svm_eventinject(sc, vcpu, VMCB_EVENTINJ_TYPE_INTR, vector, 0, false);

	if (!extint_pending) {
		/* Update the Local APIC ISR */
		vlapic_intr_accepted(vlapic, vector);
	} else {
		vm_extint_clear(sc->vm, vcpu);
		vatpic_intr_accepted(sc->vm, vector);

		/*
		 * Force a VM-exit as soon as the vcpu is ready to accept
		 * another interrupt. This is done because the PIC might
		 * have another vector that it wants to inject. Also, if
		 * the vlapic has a pending interrupt that was preempted
		 * by the ExtInt then it allows us to inject the APIC
		 * vector as soon as possible.
		 */
		need_intr_window = 1;
	}
done:
	if (need_intr_window) {
		/*
		 * We use V_IRQ in conjunction with the VINTR intercept to
		 * trap into the hypervisor as soon as a virtual interrupt
		 * can be delivered.
		 *
		 * Since injected events are not subject to intercept checks
		 * we need to ensure that the V_IRQ is not actually going to
		 * be delivered on VM entry. The KASSERT below enforces this.
		 */
		KASSERT((ctrl->eventinj & VMCB_EVENTINJ_VALID) != 0 ||
		    (state->rflags & PSL_I) == 0 || ctrl->intr_shadow,
		    ("Bogus intr_window_exiting: eventinj (%#lx), "
		    "intr_shadow (%u), rflags (%#lx)",
		    ctrl->eventinj, ctrl->intr_shadow, state->rflags));
		enable_intr_window_exiting(sc, vcpu);
	} else {
		disable_intr_window_exiting(sc, vcpu);
	}
}
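
/*
 * Reloading the task register with 'ltr' raises #GP if the referenced
 * TSS descriptor is marked busy, hence the descriptor type fixup in
 * restore_host_tss() below.
 */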
static __inline void
restore_host_tss(void)
{
	struct system_segment_descriptor *tss_sd;

	/*
	 * The TSS descriptor was in use prior to launching the guest so it
	 * has been marked busy.
	 *
	 * 'ltr' requires the descriptor to be marked available so change the
	 * type to "64-bit available TSS".
	 */
	tss_sd = PCPU_GET(tss);
	tss_sd->sd_type = SDT_SYSTSS;
	ltr(GSEL(GPROC0_SEL, SEL_KPL));
}

static void
check_asid(struct svm_softc *sc, int vcpuid, pmap_t pmap, u_int thiscpu)
{
	struct svm_vcpu *vcpustate;
	struct vmcb_ctrl *ctrl;
	long eptgen;
	bool alloc_asid;

	KASSERT(CPU_ISSET(thiscpu, &pmap->pm_active), ("%s: nested pmap not "
	    "active on cpu %u", __func__, thiscpu));

	vcpustate = svm_get_vcpu(sc, vcpuid);
	ctrl = svm_get_vmcb_ctrl(sc, vcpuid);

	/*
	 * The TLB entries associated with the vcpu's ASID are not valid
	 * if either of the following conditions is true:
	 *
	 * 1. The vcpu's ASID generation is different than the host cpu's
	 *    ASID generation. This happens when the vcpu migrates to a new
	 *    host cpu. It can also happen when the number of vcpus executing
	 *    on a host cpu is greater than the number of ASIDs available.
	 *
	 * 2. The pmap generation number is different than the value cached in
	 *    the 'vcpustate'. This happens when the host invalidates pages
	 *    belonging to the guest.
	 *
	 *	asidgen		eptgen	      Action
	 *	mismatch	mismatch
	 *	   0		   0		(a)
	 *	   0		   1		(b1) or (b2)
	 *	   1		   0		(c)
	 *	   1		   1		(d)
	 *
	 * (a) There is no mismatch in eptgen or ASID generation and therefore
	 *     no further action is needed.
	 *
	 * (b1) If the cpu supports FlushByAsid then the vcpu's ASID is
	 *      retained and the TLB entries associated with this ASID
	 *      are flushed by VMRUN.
	 *
	 * (b2) If the cpu does not support FlushByAsid then a new ASID is
	 *      allocated.
	 *
	 * (c) A new ASID is allocated.
	 *
	 * (d) A new ASID is allocated.
	 */

	alloc_asid = false;
	eptgen = pmap->pm_eptgen;
	ctrl->tlb_ctrl = VMCB_TLB_FLUSH_NOTHING;

	if (vcpustate->asid.gen != asid[thiscpu].gen) {
		alloc_asid = true;	/* (c) and (d) */
	} else if (vcpustate->eptgen != eptgen) {
		if (flush_by_asid())
			ctrl->tlb_ctrl = VMCB_TLB_FLUSH_GUEST;	/* (b1) */
		else
			alloc_asid = true;	/* (b2) */
	} else {
		/*
		 * This is the common case (a).
		 */
		KASSERT(!alloc_asid, ("ASID allocation not necessary"));
		KASSERT(ctrl->tlb_ctrl == VMCB_TLB_FLUSH_NOTHING,
		    ("Invalid VMCB tlb_ctrl: %#x", ctrl->tlb_ctrl));
	}

	if (alloc_asid) {
		if (++asid[thiscpu].num >= nasid) {
			asid[thiscpu].num = 1;
			if (++asid[thiscpu].gen == 0)
				asid[thiscpu].gen = 1;
			/*
			 * If this cpu does not support "flush-by-asid"
			 * then flush the entire TLB on a generation
			 * bump. Subsequent ASID allocation in this
			 * generation can be done without a TLB flush.
			 */
			if (!flush_by_asid())
				ctrl->tlb_ctrl = VMCB_TLB_FLUSH_ALL;
		}
		vcpustate->asid.gen = asid[thiscpu].gen;
		vcpustate->asid.num = asid[thiscpu].num;

		ctrl->asid = vcpustate->asid.num;
		vcpu_set_dirty(sc, vcpuid, VMCB_CACHE_ASID);
		/*
		 * If this cpu supports "flush-by-asid" then the TLB
		 * was not flushed after the generation bump. The TLB
		 * is flushed selectively after every new ASID allocation.
		 */
		if (flush_by_asid())
			ctrl->tlb_ctrl = VMCB_TLB_FLUSH_GUEST;
	}
	vcpustate->eptgen = eptgen;

	KASSERT(ctrl->asid != 0, ("Guest ASID must be non-zero"));
	KASSERT(ctrl->asid == vcpustate->asid.num,
	    ("ASID mismatch: %u/%u", ctrl->asid, vcpustate->asid.num));
}
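
/*
 * A hypothetical walk through the allocator above with nasid = 4:
 * vcpus are handed ASIDs 1, 2 and 3; the fourth allocation overflows
 * (++num == nasid), so the allocator restarts at num = 1 and bumps the
 * generation, flushing the entire TLB at that point on cpus without
 * flush-by-asid support.
 */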
 */
static int
svm_vmrun(void *arg, int vcpu, register_t rip, pmap_t pmap,
	void *rend_cookie, void *suspended_cookie)
{
	struct svm_regctx *hctx, *gctx;
	struct svm_softc *svm_sc;
	struct svm_vcpu *vcpustate;
	struct vmcb_state *state;
	struct vmcb_ctrl *ctrl;
	struct vm_exit *vmexit;
	struct vlapic *vlapic;
	struct vm *vm;
	uint64_t vmcb_pa;
	u_int thiscpu;
	int handled;

	svm_sc = arg;
	vm = svm_sc->vm;

	vcpustate = svm_get_vcpu(svm_sc, vcpu);
	state = svm_get_vmcb_state(svm_sc, vcpu);
	ctrl = svm_get_vmcb_ctrl(svm_sc, vcpu);
	vmexit = vm_exitinfo(vm, vcpu);
	vlapic = vm_lapic(vm, vcpu);

	/*
	 * Stash 'curcpu' on the stack as 'thiscpu'.
	 *
	 * The per-cpu data area is not accessible until MSR_GSBASE is restored
	 * after the #VMEXIT. Since VMRUN is executed inside a critical
	 * section, 'curcpu' and 'thiscpu' are guaranteed to be identical.
	 */
	thiscpu = curcpu;

	gctx = svm_get_guest_regctx(svm_sc, vcpu);
	hctx = &host_ctx[thiscpu];
	vmcb_pa = svm_sc->vcpu[vcpu].vmcb_pa;

	if (vcpustate->lastcpu != thiscpu) {
		/*
		 * Force new ASID allocation by invalidating the generation.
		 */
		vcpustate->asid.gen = 0;

		/*
		 * Invalidate the VMCB state cache by marking all fields dirty.
		 */
		vcpu_set_dirty(svm_sc, vcpu, 0xffffffff);

		/*
		 * XXX
		 * Setting 'vcpustate->lastcpu' here is a bit premature because
		 * we may return from this function without actually executing
		 * the VMRUN instruction. This could happen if a rendezvous
		 * or an AST is pending on the first time through the loop.
		 *
		 * This works for now but any new side-effects of vcpu
		 * migration should take this case into account.
		 */
		vcpustate->lastcpu = thiscpu;
		vmm_stat_incr(vm, vcpu, VCPU_MIGRATIONS, 1);
	}

	/* Update Guest RIP */
	state->rip = rip;

	do {
		/*
		 * Disable global interrupts to guarantee atomicity during
		 * loading of guest state. This includes not only the state
		 * loaded by the "vmrun" instruction but also software state
		 * maintained by the hypervisor: suspended and rendezvous
		 * state, NPT generation number, vlapic interrupts etc.
		 */
		disable_gintr();

		if (vcpu_suspended(suspended_cookie)) {
			enable_gintr();
			vm_exit_suspended(vm, vcpu, state->rip);
			break;
		}

		if (vcpu_rendezvous_pending(rend_cookie)) {
			enable_gintr();
			vm_exit_rendezvous(vm, vcpu, state->rip);
			break;
		}

		/* The scheduler has asked us to yield the cpu. */
		if (curthread->td_flags & (TDF_ASTPENDING | TDF_NEEDRESCHED)) {
			enable_gintr();
			vm_exit_astpending(vm, vcpu, state->rip);
			break;
		}

		svm_inj_interrupts(svm_sc, vcpu, vlapic);

		/* Activate the nested pmap on 'thiscpu' */
		CPU_SET_ATOMIC_ACQ(thiscpu, &pmap->pm_active);

		/*
		 * Check the pmap generation and the ASID generation to
		 * ensure that the vcpu does not use stale TLB mappings.
		 */
		check_asid(svm_sc, vcpu, pmap, thiscpu);

		ctrl->vmcb_clean = VMCB_CACHE_DEFAULT & ~vcpustate->dirty;
		vcpustate->dirty = 0;
		VCPU_CTR1(vm, vcpu, "vmcb clean %#x", ctrl->vmcb_clean);

		/*
		 * Launch Virtual Machine.
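		 *
		 * svm_launch() saves the host register context in 'hctx',
		 * loads the guest context from 'gctx' and executes VMRUN
		 * on the VMCB at 'vmcb_pa', returning only after the next
		 * #VMEXIT.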
		 */
		VCPU_CTR1(vm, vcpu, "Resume execution at %#lx", state->rip);
		svm_launch(vmcb_pa, gctx, hctx);

		CPU_CLR_ATOMIC(thiscpu, &pmap->pm_active);

		/*
		 * Restore MSR_GSBASE to point to the pcpu data area.
		 *
		 * Note that accesses done via PCPU_GET/PCPU_SET will work
		 * only after MSR_GSBASE is restored.
		 *
		 * Also note that we don't bother restoring MSR_KGSBASE
		 * since it is not used in the kernel and will be restored
		 * when the VMRUN ioctl returns to userspace.
		 */
		wrmsr(MSR_GSBASE, (uint64_t)&__pcpu[thiscpu]);
		KASSERT(curcpu == thiscpu, ("thiscpu/curcpu (%u/%u) mismatch",
		    thiscpu, curcpu));

		/*
		 * The host GDTR and IDTR are saved by VMRUN and restored
		 * automatically on #VMEXIT. However, the host TSS needs
		 * to be restored explicitly.
		 */
		restore_host_tss();

		/* #VMEXIT disables interrupts so re-enable them here. */
		enable_gintr();

		/* Handle the #VMEXIT and, if required, return to user space. */
		handled = svm_vmexit(svm_sc, vcpu, vmexit);
	} while (handled);

	return (0);
}

/*
 * Cleanup for virtual machine.
 */
static void
svm_vmcleanup(void *arg)
{
	struct svm_softc *svm_sc;

	svm_sc = arg;

	VCPU_CTR0(svm_sc->vm, 0, "SVM:cleanup\n");

	free(svm_sc, M_SVM);
}

/*
 * Return pointer to hypervisor saved register state.
 */
static register_t *
swctx_regptr(struct svm_regctx *regctx, int reg)
{

	switch (reg) {
	case VM_REG_GUEST_RBX:
		return (&regctx->sctx_rbx);
	case VM_REG_GUEST_RCX:
		return (&regctx->sctx_rcx);
	case VM_REG_GUEST_RDX:
		return (&regctx->e.g.sctx_rdx);
	case VM_REG_GUEST_RDI:
		return (&regctx->e.g.sctx_rdi);
	case VM_REG_GUEST_RSI:
		return (&regctx->e.g.sctx_rsi);
	case VM_REG_GUEST_RBP:
		return (&regctx->sctx_rbp);
	case VM_REG_GUEST_R8:
		return (&regctx->sctx_r8);
	case VM_REG_GUEST_R9:
		return (&regctx->sctx_r9);
	case VM_REG_GUEST_R10:
		return (&regctx->sctx_r10);
	case VM_REG_GUEST_R11:
		return (&regctx->sctx_r11);
	case VM_REG_GUEST_R12:
		return (&regctx->sctx_r12);
	case VM_REG_GUEST_R13:
		return (&regctx->sctx_r13);
	case VM_REG_GUEST_R14:
		return (&regctx->sctx_r14);
	case VM_REG_GUEST_R15:
		return (&regctx->sctx_r15);
	default:
		ERR("Unknown register requested, reg=%d.\n", reg);
		break;
	}

	return (NULL);
}

/*
 * Interface to read guest registers.
 * This can be SVM h/w saved or hypervisor saved register.
 */
static int
svm_getreg(void *arg, int vcpu, int ident, uint64_t *val)
{
	struct svm_softc *svm_sc;
	struct vmcb *vmcb;
	register_t *reg;

	svm_sc = arg;
	KASSERT(vcpu < svm_sc->vcpu_cnt, ("Guest doesn't have VCPU%d", vcpu));

	vmcb = svm_get_vmcb(svm_sc, vcpu);

	if (vmcb_read(vmcb, ident, val) == 0) {
		return (0);
	}

	reg = swctx_regptr(svm_get_guest_regctx(svm_sc, vcpu), ident);

	if (reg != NULL) {
		*val = *reg;
		return (0);
	}

	ERR("SVM_ERR:reg type %x is not saved in VMCB.\n", ident);
	return (EINVAL);
}

/*
 * Interface to write to guest registers.
 * This can be SVM h/w saved or hypervisor saved register.
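 *
 * As in svm_getreg(), VMCB-resident registers are handled by
 * vmcb_write() and the remaining general purpose registers live in
 * the software-saved 'svm_regctx' located via swctx_regptr().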
 */
static int
svm_setreg(void *arg, int vcpu, int ident, uint64_t val)
{
	struct svm_softc *svm_sc;
	struct vmcb *vmcb;
	register_t *reg;

	svm_sc = arg;
	KASSERT(vcpu < svm_sc->vcpu_cnt, ("Guest doesn't have VCPU%d", vcpu));

	vmcb = svm_get_vmcb(svm_sc, vcpu);
	if (vmcb_write(vmcb, ident, val) == 0) {
		return (0);
	}

	reg = swctx_regptr(svm_get_guest_regctx(svm_sc, vcpu), ident);

	if (reg != NULL) {
		*reg = val;
		return (0);
	}

	/*
	 * XXX deal with CR3 and invalidate TLB entries tagged with the
	 * vcpu's ASID. This needs to be treated differently depending on
	 * whether 'running' is true/false.
	 */

	ERR("SVM_ERR:reg type %x is not saved in VMCB.\n", ident);
	return (EINVAL);
}

/*
 * Interface to set various descriptors.
 */
static int
svm_setdesc(void *arg, int vcpu, int type, struct seg_desc *desc)
{
	struct svm_softc *svm_sc;
	struct vmcb *vmcb;
	struct vmcb_segment *seg;
	uint16_t attrib;

	svm_sc = arg;
	KASSERT(vcpu < svm_sc->vcpu_cnt, ("Guest doesn't have VCPU%d", vcpu));

	vmcb = svm_get_vmcb(svm_sc, vcpu);

	VCPU_CTR1(svm_sc->vm, vcpu, "SVM:set_desc: Type%d\n", type);

	seg = vmcb_seg(vmcb, type);
	if (seg == NULL) {
		ERR("SVM_ERR:Unsupported segment type%d\n", type);
		return (EINVAL);
	}

	/* Map seg_desc access to VMCB attribute format. */
	attrib = ((desc->access & 0xF000) >> 4) | (desc->access & 0xFF);
	VCPU_CTR3(svm_sc->vm, vcpu, "SVM:[sel %d attribute 0x%x limit:0x%x]\n",
	    type, desc->access, desc->limit);
	seg->attrib = attrib;
	seg->base = desc->base;
	seg->limit = desc->limit;

	return (0);
}

/*
 * Interface to get guest descriptor.
 */
static int
svm_getdesc(void *arg, int vcpu, int type, struct seg_desc *desc)
{
	struct svm_softc *svm_sc;
	struct vmcb_segment *seg;

	svm_sc = arg;
	KASSERT(vcpu < svm_sc->vcpu_cnt, ("Guest doesn't have VCPU%d", vcpu));

	VCPU_CTR1(svm_sc->vm, vcpu, "SVM:get_desc: Type%d\n", type);

	seg = vmcb_seg(svm_get_vmcb(svm_sc, vcpu), type);
	if (!seg) {
		ERR("SVM_ERR:Unsupported segment type%d\n", type);
		return (EINVAL);
	}

	/* Map seg_desc access to VMCB attribute format. */
	desc->access = ((seg->attrib & 0xF00) << 4) | (seg->attrib & 0xFF);
	desc->base = seg->base;
	desc->limit = seg->limit;

	/*
	 * VT-x uses bit 16 (Unusable) to indicate a segment that has been
	 * loaded with a NULL segment selector. The 'desc->access' field is
	 * interpreted in the VT-x format by the processor-independent code.
	 *
	 * SVM uses the 'P' bit to convey the same information so convert it
	 * into the VT-x format. For more details refer to section
	 * "Segment State in the VMCB" in APMv2.
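	 *
	 * For example, a segment loaded with a NULL selector has P=0 in
	 * the VMCB 'attrib' field, so the conversion below hands it to
	 * the processor-independent code with bit 16 (0x10000) set.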
	 */
	if (type == VM_REG_GUEST_CS || type == VM_REG_GUEST_TR)
		desc->access |= 0x80;		/* CS and TR always present */

	if (!(desc->access & 0x80))
		desc->access |= 0x10000;	/* Unusable segment */

	return (0);
}

static int
svm_setcap(void *arg, int vcpu, int type, int val)
{
	struct svm_softc *sc;
	int error;

	sc = arg;
	error = 0;
	switch (type) {
	case VM_CAP_HALT_EXIT:
		svm_set_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
		    VMCB_INTCPT_HLT, val);
		break;
	case VM_CAP_PAUSE_EXIT:
		svm_set_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
		    VMCB_INTCPT_PAUSE, val);
		break;
	case VM_CAP_UNRESTRICTED_GUEST:
		/* Unrestricted guest execution cannot be disabled in SVM */
		if (val == 0)
			error = EINVAL;
		break;
	default:
		error = ENOENT;
		break;
	}
	return (error);
}

static int
svm_getcap(void *arg, int vcpu, int type, int *retval)
{
	struct svm_softc *sc;
	int error;

	sc = arg;
	error = 0;

	switch (type) {
	case VM_CAP_HALT_EXIT:
		*retval = svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
		    VMCB_INTCPT_HLT);
		break;
	case VM_CAP_PAUSE_EXIT:
		*retval = svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
		    VMCB_INTCPT_PAUSE);
		break;
	case VM_CAP_UNRESTRICTED_GUEST:
		*retval = 1;	/* unrestricted guest is always enabled */
		break;
	default:
		error = ENOENT;
		break;
	}
	return (error);
}

static struct vlapic *
svm_vlapic_init(void *arg, int vcpuid)
{
	struct svm_softc *svm_sc;
	struct vlapic *vlapic;

	svm_sc = arg;
	vlapic = malloc(sizeof(struct vlapic), M_SVM_VLAPIC, M_WAITOK | M_ZERO);
	vlapic->vm = svm_sc->vm;
	vlapic->vcpuid = vcpuid;
	vlapic->apic_page = (struct LAPIC *)&svm_sc->apic_page[vcpuid];

	vlapic_init(vlapic);

	return (vlapic);
}

static void
svm_vlapic_cleanup(void *arg, struct vlapic *vlapic)
{

	vlapic_cleanup(vlapic);
	free(vlapic, M_SVM_VLAPIC);
}

struct vmm_ops vmm_ops_amd = {
	svm_init,
	svm_cleanup,
	svm_restore,
	svm_vminit,
	svm_vmrun,
	svm_vmcleanup,
	svm_getreg,
	svm_setreg,
	svm_getdesc,
	svm_setdesc,
	svm_getcap,
	svm_setcap,
	svm_npt_alloc,
	svm_npt_free,
	svm_vlapic_init,
	svm_vlapic_cleanup
};
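
/*
 * 'vmm_ops_amd' above is the AMD entry point table consumed by the
 * processor-independent vmm layer, which selects between the Intel
 * and AMD implementations at module load time based on the host CPU
 * vendor.
 */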