vmm.c revision 262350
/*-
 * Copyright (c) 2011 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD: stable/10/sys/amd64/vmm/vmm.c 262350 2014-02-23 00:46:05Z jhb $
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: stable/10/sys/amd64/vmm/vmm.c 262350 2014-02-23 00:46:05Z jhb $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/sysctl.h>
#include <sys/malloc.h>
#include <sys/pcpu.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/smp.h>

#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_extern.h>
#include <vm/vm_param.h>

#include <machine/cpu.h>
#include <machine/vm.h>
#include <machine/pcb.h>
#include <machine/smp.h>
#include <x86/psl.h>
#include <x86/apicreg.h>
#include <machine/vmparam.h>

#include <machine/vmm.h>
#include <machine/vmm_dev.h>

#include "vmm_ktr.h"
#include "vmm_host.h"
#include "vmm_mem.h"
#include "vmm_util.h"
#include "vhpet.h"
#include "vioapic.h"
#include "vlapic.h"
#include "vmm_msr.h"
#include "vmm_ipi.h"
#include "vmm_stat.h"
#include "vmm_lapic.h"

#include "io/ppt.h"
#include "io/iommu.h"

struct vlapic;

struct vcpu {
	int		flags;
	enum vcpu_state	state;
	struct mtx	mtx;
	int		hostcpu;	/* host cpuid this vcpu last ran on */
	uint64_t	guest_msrs[VMM_MSR_NUM];
	struct vlapic	*vlapic;
	int		vcpuid;
	struct savefpu	*guestfpu;	/* guest fpu state */
	void		*stats;
	struct vm_exit	exitinfo;
	enum x2apic_state x2apic_state;
	int		nmi_pending;
};

#define	vcpu_lock_init(v)	mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN)
#define	vcpu_lock(v)		mtx_lock_spin(&((v)->mtx))
#define	vcpu_unlock(v)		mtx_unlock_spin(&((v)->mtx))
#define	vcpu_assert_locked(v)	mtx_assert(&((v)->mtx), MA_OWNED)

struct mem_seg {
	vm_paddr_t	gpa;
	size_t		len;
	boolean_t	wired;
	vm_object_t	object;
};
#define	VM_MAX_MEMORY_SEGMENTS	2
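/*
 * Top-level state for a single virtual machine: the processor-specific
 * cookie, the optional iommu domain, the in-kernel device models, the
 * guest's address space, per-vcpu state and the guest memory segments.
 */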
struct vm {
	void		*cookie;	/* processor-specific data */
	void		*iommu;		/* iommu-specific data */
	struct vhpet	*vhpet;		/* virtual HPET */
	struct vioapic	*vioapic;	/* virtual ioapic */
	struct vmspace	*vmspace;	/* guest's address space */
	struct vcpu	vcpu[VM_MAXCPU];
	int		num_mem_segs;
	struct mem_seg	mem_segs[VM_MAX_MEMORY_SEGMENTS];
	char		name[VM_MAX_NAMELEN];

	/*
	 * Set of active vcpus.
	 * An active vcpu is one that has been started implicitly (BSP) or
	 * explicitly (AP) by sending it a startup ipi.
	 */
	cpuset_t	active_cpus;
};

static int vmm_initialized;

static struct vmm_ops *ops;
#define	VMM_INIT()	(ops != NULL ? (*ops->init)() : 0)
#define	VMM_CLEANUP()	(ops != NULL ? (*ops->cleanup)() : 0)
#define	VMM_RESUME()	(ops != NULL ? (*ops->resume)() : 0)

#define	VMINIT(vm, pmap) (ops != NULL ? (*ops->vminit)(vm, pmap) : NULL)
#define	VMRUN(vmi, vcpu, rip, pmap) \
	(ops != NULL ? (*ops->vmrun)(vmi, vcpu, rip, pmap) : ENXIO)
#define	VMCLEANUP(vmi)	(ops != NULL ? (*ops->vmcleanup)(vmi) : NULL)
#define	VMSPACE_ALLOC(min, max) \
	(ops != NULL ? (*ops->vmspace_alloc)(min, max) : NULL)
#define	VMSPACE_FREE(vmspace) \
	(ops != NULL ? (*ops->vmspace_free)(vmspace) : ENXIO)
#define	VMGETREG(vmi, vcpu, num, retval) \
	(ops != NULL ? (*ops->vmgetreg)(vmi, vcpu, num, retval) : ENXIO)
#define	VMSETREG(vmi, vcpu, num, val) \
	(ops != NULL ? (*ops->vmsetreg)(vmi, vcpu, num, val) : ENXIO)
#define	VMGETDESC(vmi, vcpu, num, desc) \
	(ops != NULL ? (*ops->vmgetdesc)(vmi, vcpu, num, desc) : ENXIO)
#define	VMSETDESC(vmi, vcpu, num, desc) \
	(ops != NULL ? (*ops->vmsetdesc)(vmi, vcpu, num, desc) : ENXIO)
#define	VMINJECT(vmi, vcpu, type, vec, ec, ecv) \
	(ops != NULL ? (*ops->vminject)(vmi, vcpu, type, vec, ec, ecv) : ENXIO)
#define	VMGETCAP(vmi, vcpu, num, retval) \
	(ops != NULL ? (*ops->vmgetcap)(vmi, vcpu, num, retval) : ENXIO)
#define	VMSETCAP(vmi, vcpu, num, val) \
	(ops != NULL ? (*ops->vmsetcap)(vmi, vcpu, num, val) : ENXIO)
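/*
 * CR0.TS is used to fence off the FPU: while the guest's FPU state is
 * loaded, fpu_start_emulating() sets TS so that any host access to the
 * FPU traps with a #NM; fpu_stop_emulating() clears it again via clts().
 */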
#define	fpu_start_emulating()	load_cr0(rcr0() | CR0_TS)
#define	fpu_stop_emulating()	clts()

static MALLOC_DEFINE(M_VM, "vm", "vm");
CTASSERT(VMM_MSR_NUM <= 64);	/* msr_mask can keep track of up to 64 msrs */

/* statistics */
static VMM_STAT(VCPU_TOTAL_RUNTIME, "vcpu total runtime");

static void
vcpu_cleanup(struct vcpu *vcpu)
{
	vlapic_cleanup(vcpu->vlapic);
	vmm_stat_free(vcpu->stats);
	fpu_save_area_free(vcpu->guestfpu);
}

static void
vcpu_init(struct vm *vm, uint32_t vcpu_id)
{
	struct vcpu *vcpu;

	vcpu = &vm->vcpu[vcpu_id];

	vcpu_lock_init(vcpu);
	vcpu->hostcpu = NOCPU;
	vcpu->vcpuid = vcpu_id;
	vcpu->vlapic = vlapic_init(vm, vcpu_id);
	vm_set_x2apic_state(vm, vcpu_id, X2APIC_ENABLED);
	vcpu->guestfpu = fpu_save_area_alloc();
	fpu_save_area_reset(vcpu->guestfpu);
	vcpu->stats = vmm_stat_alloc();
}

struct vm_exit *
vm_exitinfo(struct vm *vm, int cpuid)
{
	struct vcpu *vcpu;

	if (cpuid < 0 || cpuid >= VM_MAXCPU)
		panic("vm_exitinfo: invalid cpuid %d", cpuid);

	vcpu = &vm->vcpu[cpuid];

	return (&vcpu->exitinfo);
}

static void
vmm_resume(void)
{
	VMM_RESUME();
}

static int
vmm_init(void)
{
	int error;

	vmm_host_state_init();
	vmm_ipi_init();

	error = vmm_mem_init();
	if (error)
		return (error);

	if (vmm_is_intel())
		ops = &vmm_ops_intel;
	else if (vmm_is_amd())
		ops = &vmm_ops_amd;
	else
		return (ENXIO);

	vmm_msr_init();
	vmm_resume_p = vmm_resume;

	return (VMM_INIT());
}

static int
vmm_handler(module_t mod, int what, void *arg)
{
	int error;

	switch (what) {
	case MOD_LOAD:
		vmmdev_init();
		iommu_init();
		error = vmm_init();
		if (error == 0)
			vmm_initialized = 1;
		break;
	case MOD_UNLOAD:
		error = vmmdev_cleanup();
		if (error == 0) {
			vmm_resume_p = NULL;
			iommu_cleanup();
			vmm_ipi_cleanup();
			error = VMM_CLEANUP();
			/*
			 * Something bad happened - prevent new
			 * VMs from being created
			 */
			if (error)
				vmm_initialized = 0;
		}
		break;
	default:
		error = 0;
		break;
	}
	return (error);
}

static moduledata_t vmm_kmod = {
	"vmm",
	vmm_handler,
	NULL
};

/*
 * vmm initialization has the following dependencies:
 *
 * - iommu initialization must happen after the pci passthru driver has had
 *   a chance to attach to any passthru devices (after SI_SUB_CONFIGURE).
 *
 * - VT-x initialization requires smp_rendezvous() and therefore must happen
 *   after SMP is fully functional (after SI_SUB_SMP).
 */
DECLARE_MODULE(vmm, vmm_kmod, SI_SUB_SMP + 1, SI_ORDER_ANY);
MODULE_VERSION(vmm, 1);

SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW, NULL, NULL);
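/*
 * Create a virtual machine with the given name: allocate its guest address
 * space, initialize the in-kernel ioapic and HPET device models and all
 * vcpus, and activate the BSP (vcpu 0).
 */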
int
vm_create(const char *name, struct vm **retvm)
{
	int i;
	struct vm *vm;
	struct vmspace *vmspace;

	const int BSP = 0;

	/*
	 * If vmm.ko could not be successfully initialized then don't attempt
	 * to create the virtual machine.
	 */
	if (!vmm_initialized)
		return (ENXIO);

	if (name == NULL || strlen(name) >= VM_MAX_NAMELEN)
		return (EINVAL);

	vmspace = VMSPACE_ALLOC(VM_MIN_ADDRESS, VM_MAXUSER_ADDRESS);
	if (vmspace == NULL)
		return (ENOMEM);

	vm = malloc(sizeof(struct vm), M_VM, M_WAITOK | M_ZERO);
	strcpy(vm->name, name);
	vm->cookie = VMINIT(vm, vmspace_pmap(vmspace));
	vm->vioapic = vioapic_init(vm);
	vm->vhpet = vhpet_init(vm);

	for (i = 0; i < VM_MAXCPU; i++) {
		vcpu_init(vm, i);
		guest_msrs_init(vm, i);
	}

	vm_activate_cpu(vm, BSP);
	vm->vmspace = vmspace;

	*retvm = vm;
	return (0);
}

static void
vm_free_mem_seg(struct vm *vm, struct mem_seg *seg)
{

	if (seg->object != NULL)
		vmm_mem_free(vm->vmspace, seg->gpa, seg->len);

	bzero(seg, sizeof(*seg));
}

void
vm_destroy(struct vm *vm)
{
	int i;

	ppt_unassign_all(vm);

	if (vm->iommu != NULL)
		iommu_destroy_domain(vm->iommu);

	vhpet_cleanup(vm->vhpet);
	vioapic_cleanup(vm->vioapic);

	for (i = 0; i < vm->num_mem_segs; i++)
		vm_free_mem_seg(vm, &vm->mem_segs[i]);

	vm->num_mem_segs = 0;

	for (i = 0; i < VM_MAXCPU; i++)
		vcpu_cleanup(&vm->vcpu[i]);

	VMSPACE_FREE(vm->vmspace);

	VMCLEANUP(vm->cookie);

	free(vm, M_VM);
}

const char *
vm_name(struct vm *vm)
{
	return (vm->name);
}

int
vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
{
	vm_object_t obj;

	if ((obj = vmm_mmio_alloc(vm->vmspace, gpa, len, hpa)) == NULL)
		return (ENOMEM);
	else
		return (0);
}

int
vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len)
{

	vmm_mmio_free(vm->vmspace, gpa, len);
	return (0);
}

boolean_t
vm_mem_allocated(struct vm *vm, vm_paddr_t gpa)
{
	int i;
	vm_paddr_t gpabase, gpalimit;

	for (i = 0; i < vm->num_mem_segs; i++) {
		gpabase = vm->mem_segs[i].gpa;
		gpalimit = gpabase + vm->mem_segs[i].len;
		if (gpa >= gpabase && gpa < gpalimit)
			return (TRUE);		/* 'gpa' is regular memory */
	}

	if (ppt_is_mmio(vm, gpa))
		return (TRUE);			/* 'gpa' is pci passthru mmio */

	return (FALSE);
}
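/*
 * Allocate 'len' bytes of guest physical memory starting at 'gpa'.  The
 * range must be page-aligned and must either be entirely unallocated or
 * entirely allocated already; a partial overlap with an existing segment
 * is an error.
 */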
int
vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len)
{
	int available, allocated;
	struct mem_seg *seg;
	vm_object_t object;
	vm_paddr_t g;

	if ((gpa & PAGE_MASK) || (len & PAGE_MASK) || len == 0)
		return (EINVAL);

	available = allocated = 0;
	g = gpa;
	while (g < gpa + len) {
		if (vm_mem_allocated(vm, g))
			allocated++;
		else
			available++;

		g += PAGE_SIZE;
	}

	/*
	 * If there are some allocated and some available pages in the address
	 * range then it is an error.
	 */
	if (allocated && available)
		return (EINVAL);

	/*
	 * If the entire address range being requested has already been
	 * allocated then there isn't anything more to do.
	 */
	if (allocated && available == 0)
		return (0);

	if (vm->num_mem_segs >= VM_MAX_MEMORY_SEGMENTS)
		return (E2BIG);

	seg = &vm->mem_segs[vm->num_mem_segs];

	if ((object = vmm_mem_alloc(vm->vmspace, gpa, len)) == NULL)
		return (ENOMEM);

	seg->gpa = gpa;
	seg->len = len;
	seg->object = object;
	seg->wired = FALSE;

	vm->num_mem_segs++;

	return (0);
}

static void
vm_gpa_unwire(struct vm *vm)
{
	int i, rv;
	struct mem_seg *seg;

	for (i = 0; i < vm->num_mem_segs; i++) {
		seg = &vm->mem_segs[i];
		if (!seg->wired)
			continue;

		rv = vm_map_unwire(&vm->vmspace->vm_map,
		    seg->gpa, seg->gpa + seg->len,
		    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
		KASSERT(rv == KERN_SUCCESS, ("vm(%s) memory segment "
		    "%#lx/%ld could not be unwired: %d",
		    vm_name(vm), seg->gpa, seg->len, rv));

		seg->wired = FALSE;
	}
}

static int
vm_gpa_wire(struct vm *vm)
{
	int i, rv;
	struct mem_seg *seg;

	for (i = 0; i < vm->num_mem_segs; i++) {
		seg = &vm->mem_segs[i];
		if (seg->wired)
			continue;

		/* XXX rlimits? */
		rv = vm_map_wire(&vm->vmspace->vm_map,
		    seg->gpa, seg->gpa + seg->len,
		    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
		if (rv != KERN_SUCCESS)
			break;

		seg->wired = TRUE;
	}

	if (i < vm->num_mem_segs) {
		/*
		 * Undo the wiring before returning an error.
		 */
		vm_gpa_unwire(vm);
		return (EAGAIN);
	}

	return (0);
}
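/*
 * Walk every page of every wired memory segment and either establish the
 * gpa->hpa translation in the VM's iommu domain (map) or remove it and
 * return the page to the host domain's identity mapping (unmap).
 */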
static void
vm_iommu_modify(struct vm *vm, boolean_t map)
{
	int i, sz;
	vm_paddr_t gpa, hpa;
	struct mem_seg *seg;
	void *vp, *cookie, *host_domain;

	sz = PAGE_SIZE;
	host_domain = iommu_host_domain();

	for (i = 0; i < vm->num_mem_segs; i++) {
		seg = &vm->mem_segs[i];
		KASSERT(seg->wired, ("vm(%s) memory segment %#lx/%ld not wired",
		    vm_name(vm), seg->gpa, seg->len));

		gpa = seg->gpa;
		while (gpa < seg->gpa + seg->len) {
			vp = vm_gpa_hold(vm, gpa, PAGE_SIZE, VM_PROT_WRITE,
			    &cookie);
			KASSERT(vp != NULL, ("vm(%s) could not map gpa %#lx",
			    vm_name(vm), gpa));

			vm_gpa_release(cookie);

			hpa = DMAP_TO_PHYS((uintptr_t)vp);
			if (map) {
				iommu_create_mapping(vm->iommu, gpa, hpa, sz);
				iommu_remove_mapping(host_domain, hpa, sz);
			} else {
				iommu_remove_mapping(vm->iommu, gpa, sz);
				iommu_create_mapping(host_domain, hpa, hpa, sz);
			}

			gpa += PAGE_SIZE;
		}
	}

	/*
	 * Invalidate the cached translations associated with the domain
	 * from which pages were removed.
	 */
	if (map)
		iommu_invalidate_tlb(host_domain);
	else
		iommu_invalidate_tlb(vm->iommu);
}

#define	vm_iommu_unmap(vm)	vm_iommu_modify((vm), FALSE)
#define	vm_iommu_map(vm)	vm_iommu_modify((vm), TRUE)

int
vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func)
{
	int error;

	error = ppt_unassign_device(vm, bus, slot, func);
	if (error)
		return (error);

	if (ppt_num_devices(vm) == 0) {
		vm_iommu_unmap(vm);
		vm_gpa_unwire(vm);
	}
	return (0);
}

int
vm_assign_pptdev(struct vm *vm, int bus, int slot, int func)
{
	int error;
	vm_paddr_t maxaddr;

	/*
	 * Virtual machines with pci passthru devices get special treatment:
	 * - the guest physical memory is wired
	 * - the iommu is programmed to do the 'gpa' to 'hpa' translation
	 *
	 * We need to do this before the first pci passthru device is attached.
	 */
	if (ppt_num_devices(vm) == 0) {
		KASSERT(vm->iommu == NULL,
		    ("vm_assign_pptdev: iommu must be NULL"));
		maxaddr = vmm_mem_maxaddr();
		vm->iommu = iommu_create_domain(maxaddr);

		error = vm_gpa_wire(vm);
		if (error)
			return (error);

		vm_iommu_map(vm);
	}

	error = ppt_assign_device(vm, bus, slot, func);
	return (error);
}

void *
vm_gpa_hold(struct vm *vm, vm_paddr_t gpa, size_t len, int reqprot,
    void **cookie)
{
	int count, pageoff;
	vm_page_t m;

	pageoff = gpa & PAGE_MASK;
	if (len > PAGE_SIZE - pageoff)
		panic("vm_gpa_hold: invalid gpa/len: 0x%016lx/%lu", gpa, len);

	count = vm_fault_quick_hold_pages(&vm->vmspace->vm_map,
	    trunc_page(gpa), PAGE_SIZE, reqprot, &m, 1);

	if (count == 1) {
		*cookie = m;
		return ((void *)(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)) + pageoff));
	} else {
		*cookie = NULL;
		return (NULL);
	}
}

void
vm_gpa_release(void *cookie)
{
	vm_page_t m = cookie;

	vm_page_lock(m);
	vm_page_unhold(m);
	vm_page_unlock(m);
}

int
vm_gpabase2memseg(struct vm *vm, vm_paddr_t gpabase,
    struct vm_memory_segment *seg)
{
	int i;

	for (i = 0; i < vm->num_mem_segs; i++) {
		if (gpabase == vm->mem_segs[i].gpa) {
			seg->gpa = vm->mem_segs[i].gpa;
			seg->len = vm->mem_segs[i].len;
			seg->wired = vm->mem_segs[i].wired;
			return (0);
		}
	}
	return (-1);
}

int
vm_get_memobj(struct vm *vm, vm_paddr_t gpa, size_t len,
    vm_offset_t *offset, struct vm_object **object)
{
	int i;
	size_t seg_len;
	vm_paddr_t seg_gpa;
	vm_object_t seg_obj;

	for (i = 0; i < vm->num_mem_segs; i++) {
		if ((seg_obj = vm->mem_segs[i].object) == NULL)
			continue;

		seg_gpa = vm->mem_segs[i].gpa;
		seg_len = vm->mem_segs[i].len;

		if (gpa >= seg_gpa && gpa < seg_gpa + seg_len) {
			*offset = gpa - seg_gpa;
			*object = seg_obj;
			vm_object_reference(seg_obj);
			return (0);
		}
	}

	return (EINVAL);
}

int
vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval)
{

	if (vcpu < 0 || vcpu >= VM_MAXCPU)
		return (EINVAL);

	if (reg >= VM_REG_LAST)
		return (EINVAL);

	return (VMGETREG(vm->cookie, vcpu, reg, retval));
}

int
vm_set_register(struct vm *vm, int vcpu, int reg, uint64_t val)
{

	if (vcpu < 0 || vcpu >= VM_MAXCPU)
		return (EINVAL);

	if (reg >= VM_REG_LAST)
		return (EINVAL);

	return (VMSETREG(vm->cookie, vcpu, reg, val));
}

static boolean_t
is_descriptor_table(int reg)
{

	switch (reg) {
	case VM_REG_GUEST_IDTR:
	case VM_REG_GUEST_GDTR:
		return (TRUE);
	default:
		return (FALSE);
	}
}

static boolean_t
is_segment_register(int reg)
{

	switch (reg) {
	case VM_REG_GUEST_ES:
	case VM_REG_GUEST_CS:
	case VM_REG_GUEST_SS:
	case VM_REG_GUEST_DS:
	case VM_REG_GUEST_FS:
	case VM_REG_GUEST_GS:
	case VM_REG_GUEST_TR:
	case VM_REG_GUEST_LDTR:
		return (TRUE);
	default:
		return (FALSE);
	}
}

int
vm_get_seg_desc(struct vm *vm, int vcpu, int reg,
    struct seg_desc *desc)
{

	if (vcpu < 0 || vcpu >= VM_MAXCPU)
		return (EINVAL);

	if (!is_segment_register(reg) && !is_descriptor_table(reg))
		return (EINVAL);

	return (VMGETDESC(vm->cookie, vcpu, reg, desc));
}

int
vm_set_seg_desc(struct vm *vm, int vcpu, int reg,
    struct seg_desc *desc)
{
	if (vcpu < 0 || vcpu >= VM_MAXCPU)
		return (EINVAL);

	if (!is_segment_register(reg) && !is_descriptor_table(reg))
		return (EINVAL);

	return (VMSETDESC(vm->cookie, vcpu, reg, desc));
}
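/*
 * Swap the guest's FPU context in and out around VMRUN:
 * restore_guest_fpustate() loads it before entry and
 * save_guest_fpustate() saves it afterwards, with CR0.TS kept set in
 * between so that any host FPU access traps instead of clobbering it.
 */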
static void
restore_guest_fpustate(struct vcpu *vcpu)
{

	/* flush host state to the pcb */
	fpuexit(curthread);

	/* restore guest FPU state */
	fpu_stop_emulating();
	fpurestore(vcpu->guestfpu);

	/*
	 * The FPU is now "dirty" with the guest's state so turn on emulation
	 * to trap any access to the FPU by the host.
	 */
	fpu_start_emulating();
}

static void
save_guest_fpustate(struct vcpu *vcpu)
{

	if ((rcr0() & CR0_TS) == 0)
		panic("fpu emulation not enabled in host!");

	/* save guest FPU state */
	fpu_stop_emulating();
	fpusave(vcpu->guestfpu);
	fpu_start_emulating();
}

static VMM_STAT(VCPU_IDLE_TICKS, "number of ticks vcpu was idle");

static int
vcpu_set_state_locked(struct vcpu *vcpu, enum vcpu_state newstate)
{
	int error;

	vcpu_assert_locked(vcpu);

	/*
	 * The following state transitions are allowed:
	 * IDLE -> FROZEN -> IDLE
	 * FROZEN -> RUNNING -> FROZEN
	 * FROZEN -> SLEEPING -> FROZEN
	 */
	switch (vcpu->state) {
	case VCPU_IDLE:
	case VCPU_RUNNING:
	case VCPU_SLEEPING:
		error = (newstate != VCPU_FROZEN);
		break;
	case VCPU_FROZEN:
		error = (newstate == VCPU_FROZEN);
		break;
	default:
		error = 1;
		break;
	}

	if (error == 0)
		vcpu->state = newstate;
	else
		error = EBUSY;

	return (error);
}

static void
vcpu_require_state(struct vm *vm, int vcpuid, enum vcpu_state newstate)
{
	int error;

	if ((error = vcpu_set_state(vm, vcpuid, newstate)) != 0)
		panic("Error %d setting state to %d\n", error, newstate);
}

static void
vcpu_require_state_locked(struct vcpu *vcpu, enum vcpu_state newstate)
{
	int error;

	if ((error = vcpu_set_state_locked(vcpu, newstate)) != 0)
		panic("Error %d setting state to %d", error, newstate);
}

/*
 * Emulate a guest 'hlt' by sleeping until the vcpu is ready to run.
 */
static int
vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled, bool *retu)
{
	struct vm_exit *vmexit;
	struct vcpu *vcpu;
	int t, timo;

	vcpu = &vm->vcpu[vcpuid];

	vcpu_lock(vcpu);

	/*
	 * Do a final check for pending NMI or interrupts before
	 * really putting this thread to sleep.
	 *
	 * These interrupts could have happened any time after we
	 * returned from VMRUN() and before we grabbed the vcpu lock.
	 */
	if (!vm_nmi_pending(vm, vcpuid) &&
	    (intr_disabled || vlapic_pending_intr(vcpu->vlapic) < 0)) {
		t = ticks;
		vcpu_require_state_locked(vcpu, VCPU_SLEEPING);
		if (vlapic_enabled(vcpu->vlapic)) {
			/*
			 * XXX msleep_spin() is not interruptible so use the
			 * 'timo' to put an upper bound on the sleep time.
			 */
			timo = hz;
			msleep_spin(vcpu, &vcpu->mtx, "vmidle", timo);
		} else {
			/*
			 * Spindown the vcpu if the apic is disabled and it
			 * had entered the halted state.
			 */
			*retu = true;
			vmexit = vm_exitinfo(vm, vcpuid);
			vmexit->exitcode = VM_EXITCODE_SPINDOWN_CPU;
			VCPU_CTR0(vm, vcpuid, "spinning down cpu");
		}
		vcpu_require_state_locked(vcpu, VCPU_FROZEN);
		vmm_stat_incr(vm, vcpuid, VCPU_IDLE_TICKS, ticks - t);
	}
	vcpu_unlock(vcpu);

	return (0);
}
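/*
 * Handle a nested page fault taken by the guest: first give the pmap a
 * chance to emulate a pending accessed/dirty bit update, otherwise fault
 * the page into the guest's vmspace.
 */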
static int
vm_handle_paging(struct vm *vm, int vcpuid, bool *retu)
{
	int rv, ftype;
	struct vm_map *map;
	struct vcpu *vcpu;
	struct vm_exit *vme;

	vcpu = &vm->vcpu[vcpuid];
	vme = &vcpu->exitinfo;

	ftype = vme->u.paging.fault_type;
	KASSERT(ftype == VM_PROT_READ ||
	    ftype == VM_PROT_WRITE || ftype == VM_PROT_EXECUTE,
	    ("vm_handle_paging: invalid fault_type %d", ftype));

	if (ftype == VM_PROT_READ || ftype == VM_PROT_WRITE) {
		rv = pmap_emulate_accessed_dirty(vmspace_pmap(vm->vmspace),
		    vme->u.paging.gpa, ftype);
		if (rv == 0)
			goto done;
	}

	map = &vm->vmspace->vm_map;
	rv = vm_fault(map, vme->u.paging.gpa, ftype, VM_FAULT_NORMAL);

	VCPU_CTR3(vm, vcpuid, "vm_handle_paging rv = %d, gpa = %#lx, "
	    "ftype = %d", rv, vme->u.paging.gpa, ftype);

	if (rv != KERN_SUCCESS)
		return (EFAULT);
done:
	/* restart execution at the faulting instruction */
	vme->inst_length = 0;

	return (0);
}

static int
vm_handle_inst_emul(struct vm *vm, int vcpuid, bool *retu)
{
	struct vie *vie;
	struct vcpu *vcpu;
	struct vm_exit *vme;
	int error, inst_length;
	uint64_t rip, gla, gpa, cr3;
	mem_region_read_t mread;
	mem_region_write_t mwrite;

	vcpu = &vm->vcpu[vcpuid];
	vme = &vcpu->exitinfo;

	rip = vme->rip;
	inst_length = vme->inst_length;

	gla = vme->u.inst_emul.gla;
	gpa = vme->u.inst_emul.gpa;
	cr3 = vme->u.inst_emul.cr3;
	vie = &vme->u.inst_emul.vie;

	vie_init(vie);

	/* Fetch, decode and emulate the faulting instruction */
	if (vmm_fetch_instruction(vm, vcpuid, rip, inst_length, cr3, vie) != 0)
		return (EFAULT);

	if (vmm_decode_instruction(vm, vcpuid, gla, vie) != 0)
		return (EFAULT);

	/* return to userland unless this is an in-kernel emulated device */
	if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) {
		mread = lapic_mmio_read;
		mwrite = lapic_mmio_write;
	} else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) {
		mread = vioapic_mmio_read;
		mwrite = vioapic_mmio_write;
	} else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) {
		mread = vhpet_mmio_read;
		mwrite = vhpet_mmio_write;
	} else {
		*retu = true;
		return (0);
	}

	error = vmm_emulate_instruction(vm, vcpuid, gpa, vie, mread, mwrite,
	    retu);

	return (error);
}
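/*
 * Run a vcpu: enter the guest via VMRUN in a loop, handling hlt, paging
 * and instruction-emulation exits in the kernel and bouncing everything
 * else out to userland ('retu').
 */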
int
vm_run(struct vm *vm, struct vm_run *vmrun)
{
	int error, vcpuid;
	struct vcpu *vcpu;
	struct pcb *pcb;
	uint64_t tscval, rip;
	struct vm_exit *vme;
	bool retu, intr_disabled;
	pmap_t pmap;

	vcpuid = vmrun->cpuid;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		return (EINVAL);

	pmap = vmspace_pmap(vm->vmspace);
	vcpu = &vm->vcpu[vcpuid];
	vme = &vcpu->exitinfo;
	rip = vmrun->rip;
restart:
	critical_enter();

	KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active),
	    ("vm_run: absurd pm_active"));

	tscval = rdtsc();

	pcb = PCPU_GET(curpcb);
	set_pcb_flags(pcb, PCB_FULL_IRET);

	restore_guest_msrs(vm, vcpuid);
	restore_guest_fpustate(vcpu);

	vcpu_require_state(vm, vcpuid, VCPU_RUNNING);
	vcpu->hostcpu = curcpu;
	error = VMRUN(vm->cookie, vcpuid, rip, pmap);
	vcpu->hostcpu = NOCPU;
	vcpu_require_state(vm, vcpuid, VCPU_FROZEN);

	save_guest_fpustate(vcpu);
	restore_host_msrs(vm, vcpuid);

	vmm_stat_incr(vm, vcpuid, VCPU_TOTAL_RUNTIME, rdtsc() - tscval);

	critical_exit();

	if (error == 0) {
		retu = false;
		switch (vme->exitcode) {
		case VM_EXITCODE_HLT:
			intr_disabled = ((vme->u.hlt.rflags & PSL_I) == 0);
			error = vm_handle_hlt(vm, vcpuid, intr_disabled, &retu);
			break;
		case VM_EXITCODE_PAGING:
			error = vm_handle_paging(vm, vcpuid, &retu);
			break;
		case VM_EXITCODE_INST_EMUL:
			error = vm_handle_inst_emul(vm, vcpuid, &retu);
			break;
		default:
			retu = true;	/* handled in userland */
			break;
		}
	}

	if (error == 0 && retu == false) {
		rip = vme->rip + vme->inst_length;
		goto restart;
	}

	/* copy the exit information */
	bcopy(vme, &vmrun->vm_exit, sizeof(struct vm_exit));
	return (error);
}

int
vm_inject_event(struct vm *vm, int vcpuid, int type,
    int vector, uint32_t code, int code_valid)
{
	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		return (EINVAL);

	if ((type > VM_EVENT_NONE && type < VM_EVENT_MAX) == 0)
		return (EINVAL);

	if (vector < 0 || vector > 255)
		return (EINVAL);

	return (VMINJECT(vm->cookie, vcpuid, type, vector, code, code_valid));
}
static VMM_STAT(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu");

int
vm_inject_nmi(struct vm *vm, int vcpuid)
{
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		return (EINVAL);

	vcpu = &vm->vcpu[vcpuid];

	vcpu->nmi_pending = 1;
	vcpu_notify_event(vm, vcpuid);
	return (0);
}

int
vm_nmi_pending(struct vm *vm, int vcpuid)
{
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		panic("vm_nmi_pending: invalid vcpuid %d", vcpuid);

	vcpu = &vm->vcpu[vcpuid];

	return (vcpu->nmi_pending);
}

void
vm_nmi_clear(struct vm *vm, int vcpuid)
{
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		panic("vm_nmi_clear: invalid vcpuid %d", vcpuid);

	vcpu = &vm->vcpu[vcpuid];

	if (vcpu->nmi_pending == 0)
		panic("vm_nmi_clear: inconsistent nmi_pending state");

	vcpu->nmi_pending = 0;
	vmm_stat_incr(vm, vcpuid, VCPU_NMI_COUNT, 1);
}

int
vm_get_capability(struct vm *vm, int vcpu, int type, int *retval)
{
	if (vcpu < 0 || vcpu >= VM_MAXCPU)
		return (EINVAL);

	if (type < 0 || type >= VM_CAP_MAX)
		return (EINVAL);

	return (VMGETCAP(vm->cookie, vcpu, type, retval));
}

int
vm_set_capability(struct vm *vm, int vcpu, int type, int val)
{
	if (vcpu < 0 || vcpu >= VM_MAXCPU)
		return (EINVAL);

	if (type < 0 || type >= VM_CAP_MAX)
		return (EINVAL);

	return (VMSETCAP(vm->cookie, vcpu, type, val));
}

uint64_t *
vm_guest_msrs(struct vm *vm, int cpu)
{
	return (vm->vcpu[cpu].guest_msrs);
}

struct vlapic *
vm_lapic(struct vm *vm, int cpu)
{
	return (vm->vcpu[cpu].vlapic);
}

struct vioapic *
vm_ioapic(struct vm *vm)
{

	return (vm->vioapic);
}

struct vhpet *
vm_hpet(struct vm *vm)
{

	return (vm->vhpet);
}

boolean_t
vmm_is_pptdev(int bus, int slot, int func)
{
	int found, i, n;
	int b, s, f;
	char *val, *cp, *cp2;

	/*
	 * XXX
	 * The length of an environment variable is limited to 128 bytes which
	 * puts an upper limit on the number of passthru devices that may be
	 * specified using a single environment variable.
	 *
	 * Work around this by scanning multiple environment variable
	 * names instead of a single one - yuck!
	 */
	const char *names[] = { "pptdevs", "pptdevs2", "pptdevs3", NULL };

	/* set pptdevs="1/2/3 4/5/6 7/8/9 10/11/12" */
	found = 0;
	for (i = 0; names[i] != NULL && !found; i++) {
		cp = val = getenv(names[i]);
		while (cp != NULL && *cp != '\0') {
			if ((cp2 = strchr(cp, ' ')) != NULL)
				*cp2 = '\0';

			n = sscanf(cp, "%d/%d/%d", &b, &s, &f);
			if (n == 3 && bus == b && slot == s && func == f) {
				found = 1;
				break;
			}

			if (cp2 != NULL)
				*cp2++ = ' ';

			cp = cp2;
		}
		freeenv(val);
	}
	return (found);
}

void *
vm_iommu_domain(struct vm *vm)
{

	return (vm->iommu);
}

int
vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state newstate)
{
	int error;
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		panic("vcpu_set_state: invalid vcpuid %d", vcpuid);

	vcpu = &vm->vcpu[vcpuid];

	vcpu_lock(vcpu);
	error = vcpu_set_state_locked(vcpu, newstate);
	vcpu_unlock(vcpu);

	return (error);
}

enum vcpu_state
vcpu_get_state(struct vm *vm, int vcpuid, int *hostcpu)
{
	struct vcpu *vcpu;
	enum vcpu_state state;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		panic("vcpu_get_state: invalid vcpuid %d", vcpuid);

	vcpu = &vm->vcpu[vcpuid];

	vcpu_lock(vcpu);
	state = vcpu->state;
	if (hostcpu != NULL)
		*hostcpu = vcpu->hostcpu;
	vcpu_unlock(vcpu);

	return (state);
}
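/*
 * Mark a vcpu as active.  The BSP is activated when the VM is created;
 * an AP becomes active when it is started with a startup IPI.
 */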
void
vm_activate_cpu(struct vm *vm, int vcpuid)
{

	if (vcpuid >= 0 && vcpuid < VM_MAXCPU)
		CPU_SET(vcpuid, &vm->active_cpus);
}

cpuset_t
vm_active_cpus(struct vm *vm)
{

	return (vm->active_cpus);
}

void *
vcpu_stats(struct vm *vm, int vcpuid)
{

	return (vm->vcpu[vcpuid].stats);
}

int
vm_get_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state *state)
{
	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		return (EINVAL);

	*state = vm->vcpu[vcpuid].x2apic_state;

	return (0);
}

int
vm_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state)
{
	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		return (EINVAL);

	if (state >= X2APIC_STATE_LAST)
		return (EINVAL);

	vm->vcpu[vcpuid].x2apic_state = state;

	vlapic_set_x2apic_state(vm, vcpuid, state);

	return (0);
}

/*
 * This function is called to ensure that a vcpu "sees" a pending event
 * as soon as possible:
 * - If the vcpu thread is sleeping then it is woken up.
 * - If the vcpu is running on a different host_cpu then an IPI will be directed
 *   to the host_cpu to cause the vcpu to trap into the hypervisor.
 */
void
vcpu_notify_event(struct vm *vm, int vcpuid)
{
	int hostcpu;
	struct vcpu *vcpu;

	vcpu = &vm->vcpu[vcpuid];

	vcpu_lock(vcpu);
	hostcpu = vcpu->hostcpu;
	if (hostcpu == NOCPU) {
		if (vcpu->state == VCPU_SLEEPING)
			wakeup_one(vcpu);
	} else {
		if (vcpu->state != VCPU_RUNNING)
			panic("invalid vcpu state %d", vcpu->state);
		if (hostcpu != curcpu)
			ipi_cpu(hostcpu, vmm_ipinum);
	}
	vcpu_unlock(vcpu);
}

struct vmspace *
vm_get_vmspace(struct vm *vm)
{

	return (vm->vmspace);
}

int
vm_apicid2vcpuid(struct vm *vm, int apicid)
{
	/*
	 * XXX apic id is assumed to be numerically identical to vcpu id
	 */
	return (apicid);
}