/*- * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "vmm_mem.h" #include "vmm_util.h" #include #include "vlapic.h" #include "vmm_msr.h" #include "vmm_ipi.h" #include "vmm_stat.h" #include "io/ppt.h" #include "io/iommu.h" struct vlapic; struct vcpu { int flags; int pincpu; /* host cpuid this vcpu is bound to */ int hostcpu; /* host cpuid this vcpu last ran on */ uint64_t guest_msrs[VMM_MSR_NUM]; struct vlapic *vlapic; int vcpuid; struct savefpu savefpu; /* guest fpu state */ void *stats; }; #define VCPU_F_PINNED 0x0001 #define VCPU_F_RUNNING 0x0002 #define VCPU_PINCPU(vm, vcpuid) \ ((vm->vcpu[vcpuid].flags & VCPU_F_PINNED) ? vm->vcpu[vcpuid].pincpu : -1) #define VCPU_UNPIN(vm, vcpuid) (vm->vcpu[vcpuid].flags &= ~VCPU_F_PINNED) #define VCPU_PIN(vm, vcpuid, host_cpuid) \ do { \ vm->vcpu[vcpuid].flags |= VCPU_F_PINNED; \ vm->vcpu[vcpuid].pincpu = host_cpuid; \ } while(0) #define VM_MAX_MEMORY_SEGMENTS 2 struct vm { void *cookie; /* processor-specific data */ void *iommu; /* iommu-specific data */ struct vcpu vcpu[VM_MAXCPU]; int num_mem_segs; struct vm_memory_segment mem_segs[VM_MAX_MEMORY_SEGMENTS]; char name[VM_MAX_NAMELEN]; /* * Set of active vcpus. * An active vcpu is one that has been started implicitly (BSP) or * explicitly (AP) by sending it a startup ipi. */ cpuset_t active_cpus; }; static struct vmm_ops *ops; #define VMM_INIT() (ops != NULL ? (*ops->init)() : 0) #define VMM_CLEANUP() (ops != NULL ? (*ops->cleanup)() : 0) #define VMINIT(vm) (ops != NULL ? (*ops->vminit)(vm): NULL) #define VMRUN(vmi, vcpu, rip, vmexit) \ (ops != NULL ? (*ops->vmrun)(vmi, vcpu, rip, vmexit) : ENXIO) #define VMCLEANUP(vmi) (ops != NULL ? (*ops->vmcleanup)(vmi) : NULL) #define VMMMAP(vmi, gpa, hpa, len, attr, prot, spm) \ (ops != NULL ? (*ops->vmmmap)(vmi, gpa, hpa, len, attr, prot, spm) : ENXIO) #define VMGETREG(vmi, vcpu, num, retval) \ (ops != NULL ? (*ops->vmgetreg)(vmi, vcpu, num, retval) : ENXIO) #define VMSETREG(vmi, vcpu, num, val) \ (ops != NULL ? (*ops->vmsetreg)(vmi, vcpu, num, val) : ENXIO) #define VMGETDESC(vmi, vcpu, num, desc) \ (ops != NULL ? (*ops->vmgetdesc)(vmi, vcpu, num, desc) : ENXIO) #define VMSETDESC(vmi, vcpu, num, desc) \ (ops != NULL ? (*ops->vmsetdesc)(vmi, vcpu, num, desc) : ENXIO) #define VMINJECT(vmi, vcpu, type, vec, ec, ecv) \ (ops != NULL ? (*ops->vminject)(vmi, vcpu, type, vec, ec, ecv) : ENXIO) #define VMNMI(vmi, vcpu) \ (ops != NULL ? (*ops->vmnmi)(vmi, vcpu) : ENXIO) #define VMGETCAP(vmi, vcpu, num, retval) \ (ops != NULL ? (*ops->vmgetcap)(vmi, vcpu, num, retval) : ENXIO) #define VMSETCAP(vmi, vcpu, num, val) \ (ops != NULL ? (*ops->vmsetcap)(vmi, vcpu, num, val) : ENXIO) #define fxrstor(addr) __asm("fxrstor %0" : : "m" (*(addr))) #define fxsave(addr) __asm __volatile("fxsave %0" : "=m" (*(addr))) #define fpu_start_emulating() __asm("smsw %%ax; orb %0,%%al; lmsw %%ax" \ : : "n" (CR0_TS) : "ax") #define fpu_stop_emulating() __asm("clts") static MALLOC_DEFINE(M_VM, "vm", "vm"); CTASSERT(VMM_MSR_NUM <= 64); /* msr_mask can keep track of up to 64 msrs */ /* statistics */ static VMM_STAT_DEFINE(VCPU_TOTAL_RUNTIME, "vcpu total runtime"); static void vcpu_cleanup(struct vcpu *vcpu) { vlapic_cleanup(vcpu->vlapic); vmm_stat_free(vcpu->stats); } static void vcpu_init(struct vm *vm, uint32_t vcpu_id) { struct vcpu *vcpu; vcpu = &vm->vcpu[vcpu_id]; vcpu->hostcpu = -1; vcpu->vcpuid = vcpu_id; vcpu->vlapic = vlapic_init(vm, vcpu_id); fpugetregs(curthread); vcpu->savefpu = curthread->td_pcb->pcb_user_save; vcpu->stats = vmm_stat_alloc(); } static int vmm_init(void) { int error; vmm_ipi_init(); error = vmm_mem_init(); if (error) return (error); if (vmm_is_intel()) ops = &vmm_ops_intel; else if (vmm_is_amd()) ops = &vmm_ops_amd; else return (ENXIO); vmm_msr_init(); return (VMM_INIT()); } static int vmm_handler(module_t mod, int what, void *arg) { int error; switch (what) { case MOD_LOAD: vmmdev_init(); iommu_init(); error = vmm_init(); break; case MOD_UNLOAD: vmmdev_cleanup(); iommu_cleanup(); vmm_ipi_cleanup(); error = VMM_CLEANUP(); break; default: error = 0; break; } return (error); } static moduledata_t vmm_kmod = { "vmm", vmm_handler, NULL }; /* * Execute the module load handler after the pci passthru driver has had * a chance to claim devices. We need this information at the time we do * iommu initialization. */ DECLARE_MODULE(vmm, vmm_kmod, SI_SUB_CONFIGURE + 1, SI_ORDER_ANY); MODULE_VERSION(vmm, 1); SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW, NULL, NULL); struct vm * vm_create(const char *name) { int i; struct vm *vm; vm_paddr_t maxaddr; const int BSP = 0; if (name == NULL || strlen(name) >= VM_MAX_NAMELEN) return (NULL); vm = malloc(sizeof(struct vm), M_VM, M_WAITOK | M_ZERO); strcpy(vm->name, name); vm->cookie = VMINIT(vm); for (i = 0; i < VM_MAXCPU; i++) { vcpu_init(vm, i); guest_msrs_init(vm, i); } maxaddr = vmm_mem_maxaddr(); vm->iommu = iommu_create_domain(maxaddr); vm_activate_cpu(vm, BSP); return (vm); } void vm_destroy(struct vm *vm) { int i; ppt_unassign_all(vm); for (i = 0; i < vm->num_mem_segs; i++) vmm_mem_free(vm->mem_segs[i].hpa, vm->mem_segs[i].len); for (i = 0; i < VM_MAXCPU; i++) vcpu_cleanup(&vm->vcpu[i]); iommu_destroy_domain(vm->iommu); VMCLEANUP(vm->cookie); free(vm, M_VM); } const char * vm_name(struct vm *vm) { return (vm->name); } int vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa) { const boolean_t spok = TRUE; /* superpage mappings are ok */ return (VMMMAP(vm->cookie, gpa, hpa, len, VM_MEMATTR_UNCACHEABLE, VM_PROT_RW, spok)); } int vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len) { const boolean_t spok = TRUE; /* superpage mappings are ok */ return (VMMMAP(vm->cookie, gpa, 0, len, VM_MEMATTR_UNCACHEABLE, VM_PROT_NONE, spok)); } int vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t *ret_hpa) { int error; vm_paddr_t hpa; const boolean_t spok = TRUE; /* superpage mappings are ok */ /* * find the hpa if already it was already vm_malloc'd. */ hpa = vm_gpa2hpa(vm, gpa, len); if (hpa != ((vm_paddr_t)-1)) goto out; if (vm->num_mem_segs >= VM_MAX_MEMORY_SEGMENTS) return (E2BIG); hpa = vmm_mem_alloc(len); if (hpa == 0) return (ENOMEM); error = VMMMAP(vm->cookie, gpa, hpa, len, VM_MEMATTR_WRITE_BACK, VM_PROT_ALL, spok); if (error) { vmm_mem_free(hpa, len); return (error); } iommu_create_mapping(vm->iommu, gpa, hpa, len); vm->mem_segs[vm->num_mem_segs].gpa = gpa; vm->mem_segs[vm->num_mem_segs].hpa = hpa; vm->mem_segs[vm->num_mem_segs].len = len; vm->num_mem_segs++; out: *ret_hpa = hpa; return (0); } vm_paddr_t vm_gpa2hpa(struct vm *vm, vm_paddr_t gpa, size_t len) { int i; vm_paddr_t gpabase, gpalimit, hpabase; for (i = 0; i < vm->num_mem_segs; i++) { hpabase = vm->mem_segs[i].hpa; gpabase = vm->mem_segs[i].gpa; gpalimit = gpabase + vm->mem_segs[i].len; if (gpa >= gpabase && gpa + len <= gpalimit) return ((gpa - gpabase) + hpabase); } return ((vm_paddr_t)-1); } int vm_gpabase2memseg(struct vm *vm, vm_paddr_t gpabase, struct vm_memory_segment *seg) { int i; for (i = 0; i < vm->num_mem_segs; i++) { if (gpabase == vm->mem_segs[i].gpa) { *seg = vm->mem_segs[i]; return (0); } } return (-1); } int vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval) { if (vcpu < 0 || vcpu >= VM_MAXCPU) return (EINVAL); if (reg >= VM_REG_LAST) return (EINVAL); return (VMGETREG(vm->cookie, vcpu, reg, retval)); } int vm_set_register(struct vm *vm, int vcpu, int reg, uint64_t val) { if (vcpu < 0 || vcpu >= VM_MAXCPU) return (EINVAL); if (reg >= VM_REG_LAST) return (EINVAL); return (VMSETREG(vm->cookie, vcpu, reg, val)); } static boolean_t is_descriptor_table(int reg) { switch (reg) { case VM_REG_GUEST_IDTR: case VM_REG_GUEST_GDTR: return (TRUE); default: return (FALSE); } } static boolean_t is_segment_register(int reg) { switch (reg) { case VM_REG_GUEST_ES: case VM_REG_GUEST_CS: case VM_REG_GUEST_SS: case VM_REG_GUEST_DS: case VM_REG_GUEST_FS: case VM_REG_GUEST_GS: case VM_REG_GUEST_TR: case VM_REG_GUEST_LDTR: return (TRUE); default: return (FALSE); } } int vm_get_seg_desc(struct vm *vm, int vcpu, int reg, struct seg_desc *desc) { if (vcpu < 0 || vcpu >= VM_MAXCPU) return (EINVAL); if (!is_segment_register(reg) && !is_descriptor_table(reg)) return (EINVAL); return (VMGETDESC(vm->cookie, vcpu, reg, desc)); } int vm_set_seg_desc(struct vm *vm, int vcpu, int reg, struct seg_desc *desc) { if (vcpu < 0 || vcpu >= VM_MAXCPU) return (EINVAL); if (!is_segment_register(reg) && !is_descriptor_table(reg)) return (EINVAL); return (VMSETDESC(vm->cookie, vcpu, reg, desc)); } int vm_get_pinning(struct vm *vm, int vcpuid, int *cpuid) { if (vcpuid < 0 || vcpuid >= VM_MAXCPU) return (EINVAL); *cpuid = VCPU_PINCPU(vm, vcpuid); return (0); } int vm_set_pinning(struct vm *vm, int vcpuid, int host_cpuid) { struct thread *td; if (vcpuid < 0 || vcpuid >= VM_MAXCPU) return (EINVAL); td = curthread; /* XXXSMP only safe when muxing vcpus */ /* unpin */ if (host_cpuid < 0) { VCPU_UNPIN(vm, vcpuid); thread_lock(td); sched_unbind(td); thread_unlock(td); return (0); } if (CPU_ABSENT(host_cpuid)) return (EINVAL); /* * XXX we should check that 'host_cpuid' has not already been pinned * by another vm. */ thread_lock(td); sched_bind(td, host_cpuid); thread_unlock(td); VCPU_PIN(vm, vcpuid, host_cpuid); return (0); } static void restore_guest_fpustate(struct vcpu *vcpu) { register_t s; s = intr_disable(); fpu_stop_emulating(); fxrstor(&vcpu->savefpu); fpu_start_emulating(); intr_restore(s); } static void save_guest_fpustate(struct vcpu *vcpu) { register_t s; s = intr_disable(); fpu_stop_emulating(); fxsave(&vcpu->savefpu); fpu_start_emulating(); intr_restore(s); } int vm_run(struct vm *vm, struct vm_run *vmrun) { int error, vcpuid; struct vcpu *vcpu; struct pcb *pcb; uint64_t tscval; vcpuid = vmrun->cpuid; if (vcpuid < 0 || vcpuid >= VM_MAXCPU) return (EINVAL); vcpu = &vm->vcpu[vcpuid]; critical_enter(); tscval = rdtsc(); pcb = PCPU_GET(curpcb); set_pcb_flags(pcb, PCB_FULL_IRET); vcpu->hostcpu = curcpu; fpuexit(curthread); restore_guest_msrs(vm, vcpuid); restore_guest_fpustate(vcpu); error = VMRUN(vm->cookie, vcpuid, vmrun->rip, &vmrun->vm_exit); save_guest_fpustate(vcpu); restore_host_msrs(vm, vcpuid); vmm_stat_incr(vm, vcpuid, VCPU_TOTAL_RUNTIME, rdtsc() - tscval); critical_exit(); return (error); } int vm_inject_event(struct vm *vm, int vcpuid, int type, int vector, uint32_t code, int code_valid) { if (vcpuid < 0 || vcpuid >= VM_MAXCPU) return (EINVAL); if ((type > VM_EVENT_NONE && type < VM_EVENT_MAX) == 0) return (EINVAL); if (vector < 0 || vector > 255) return (EINVAL); return (VMINJECT(vm->cookie, vcpuid, type, vector, code, code_valid)); } int vm_inject_nmi(struct vm *vm, int vcpu) { int error; if (vcpu < 0 || vcpu >= VM_MAXCPU) return (EINVAL); error = VMNMI(vm->cookie, vcpu); vm_interrupt_hostcpu(vm, vcpu); return (error); } int vm_get_capability(struct vm *vm, int vcpu, int type, int *retval) { if (vcpu < 0 || vcpu >= VM_MAXCPU) return (EINVAL); if (type < 0 || type >= VM_CAP_MAX) return (EINVAL); return (VMGETCAP(vm->cookie, vcpu, type, retval)); } int vm_set_capability(struct vm *vm, int vcpu, int type, int val) { if (vcpu < 0 || vcpu >= VM_MAXCPU) return (EINVAL); if (type < 0 || type >= VM_CAP_MAX) return (EINVAL); return (VMSETCAP(vm->cookie, vcpu, type, val)); } uint64_t * vm_guest_msrs(struct vm *vm, int cpu) { return (vm->vcpu[cpu].guest_msrs); } struct vlapic * vm_lapic(struct vm *vm, int cpu) { return (vm->vcpu[cpu].vlapic); } boolean_t vmm_is_pptdev(int bus, int slot, int func) { int found, b, s, f, n; char *val, *cp, *cp2; /* * setenv pptdevs "1/2/3 4/5/6 7/8/9 10/11/12" */ found = 0; cp = val = getenv("pptdevs"); while (cp != NULL && *cp != '\0') { if ((cp2 = strchr(cp, ' ')) != NULL) *cp2 = '\0'; n = sscanf(cp, "%d/%d/%d", &b, &s, &f); if (n == 3 && bus == b && slot == s && func == f) { found = 1; break; } if (cp2 != NULL) *cp2++ = ' '; cp = cp2; } freeenv(val); return (found); } void * vm_iommu_domain(struct vm *vm) { return (vm->iommu); } void vm_set_run_state(struct vm *vm, int vcpuid, int state) { struct vcpu *vcpu; if (vcpuid < 0 || vcpuid >= VM_MAXCPU) panic("vm_set_run_state: invalid vcpuid %d", vcpuid); vcpu = &vm->vcpu[vcpuid]; if (state == VCPU_RUNNING) { if (vcpu->flags & VCPU_F_RUNNING) { panic("vm_set_run_state: %s[%d] is already running", vm_name(vm), vcpuid); } vcpu->flags |= VCPU_F_RUNNING; } else { if ((vcpu->flags & VCPU_F_RUNNING) == 0) { panic("vm_set_run_state: %s[%d] is already stopped", vm_name(vm), vcpuid); } vcpu->flags &= ~VCPU_F_RUNNING; } } int vm_get_run_state(struct vm *vm, int vcpuid, int *cpuptr) { int retval, hostcpu; struct vcpu *vcpu; if (vcpuid < 0 || vcpuid >= VM_MAXCPU) panic("vm_get_run_state: invalid vcpuid %d", vcpuid); vcpu = &vm->vcpu[vcpuid]; if (vcpu->flags & VCPU_F_RUNNING) { retval = VCPU_RUNNING; hostcpu = vcpu->hostcpu; } else { retval = VCPU_STOPPED; hostcpu = -1; } if (cpuptr) *cpuptr = hostcpu; return (retval); } void vm_activate_cpu(struct vm *vm, int vcpuid) { if (vcpuid >= 0 && vcpuid < VM_MAXCPU) CPU_SET(vcpuid, &vm->active_cpus); } cpuset_t vm_active_cpus(struct vm *vm) { return (vm->active_cpus); } void * vcpu_stats(struct vm *vm, int vcpuid) { return (vm->vcpu[vcpuid].stats); }