/* vmm.c revision 234695 */
1/*- 2 * Copyright (c) 2011 NetApp, Inc. 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND 15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17 * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE 18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 24 * SUCH DAMAGE. 
25 * 26 * $FreeBSD$ 27 */ 28 29#include <sys/cdefs.h> 30__FBSDID("$FreeBSD$"); 31 32#include <sys/param.h> 33#include <sys/systm.h> 34#include <sys/kernel.h> 35#include <sys/module.h> 36#include <sys/sysctl.h> 37#include <sys/malloc.h> 38#include <sys/pcpu.h> 39#include <sys/lock.h> 40#include <sys/mutex.h> 41#include <sys/proc.h> 42#include <sys/sched.h> 43#include <sys/smp.h> 44#include <sys/systm.h> 45 46#include <vm/vm.h> 47 48#include <machine/vm.h> 49#include <machine/pcb.h> 50#include <x86/apicreg.h> 51 52#include <machine/vmm.h> 53#include "vmm_mem.h" 54#include "vmm_util.h" 55#include <machine/vmm_dev.h> 56#include "vlapic.h" 57#include "vmm_msr.h" 58#include "vmm_ipi.h" 59#include "vmm_stat.h" 60 61#include "io/ppt.h" 62#include "io/iommu.h" 63 64struct vlapic; 65 66struct vcpu { 67 int flags; 68 int pincpu; /* host cpuid this vcpu is bound to */ 69 int hostcpu; /* host cpuid this vcpu last ran on */ 70 uint64_t guest_msrs[VMM_MSR_NUM]; 71 struct vlapic *vlapic; 72 int vcpuid; 73 struct savefpu *guestfpu; /* guest fpu state */ 74 void *stats; 75}; 76#define VCPU_F_PINNED 0x0001 77#define VCPU_F_RUNNING 0x0002 78 79#define VCPU_PINCPU(vm, vcpuid) \ 80 ((vm->vcpu[vcpuid].flags & VCPU_F_PINNED) ? vm->vcpu[vcpuid].pincpu : -1) 81 82#define VCPU_UNPIN(vm, vcpuid) (vm->vcpu[vcpuid].flags &= ~VCPU_F_PINNED) 83 84#define VCPU_PIN(vm, vcpuid, host_cpuid) \ 85do { \ 86 vm->vcpu[vcpuid].flags |= VCPU_F_PINNED; \ 87 vm->vcpu[vcpuid].pincpu = host_cpuid; \ 88} while(0) 89 90#define VM_MAX_MEMORY_SEGMENTS 2 91 92struct vm { 93 void *cookie; /* processor-specific data */ 94 void *iommu; /* iommu-specific data */ 95 struct vcpu vcpu[VM_MAXCPU]; 96 int num_mem_segs; 97 struct vm_memory_segment mem_segs[VM_MAX_MEMORY_SEGMENTS]; 98 char name[VM_MAX_NAMELEN]; 99 100 /* 101 * Set of active vcpus. 102 * An active vcpu is one that has been started implicitly (BSP) or 103 * explicitly (AP) by sending it a startup ipi. 
104 */ 105 cpuset_t active_cpus; 106}; 107 108static struct vmm_ops *ops; 109#define VMM_INIT() (ops != NULL ? (*ops->init)() : 0) 110#define VMM_CLEANUP() (ops != NULL ? (*ops->cleanup)() : 0) 111 112#define VMINIT(vm) (ops != NULL ? (*ops->vminit)(vm): NULL) 113#define VMRUN(vmi, vcpu, rip, vmexit) \ 114 (ops != NULL ? (*ops->vmrun)(vmi, vcpu, rip, vmexit) : ENXIO) 115#define VMCLEANUP(vmi) (ops != NULL ? (*ops->vmcleanup)(vmi) : NULL) 116#define VMMMAP(vmi, gpa, hpa, len, attr, prot, spm) \ 117 (ops != NULL ? (*ops->vmmmap)(vmi, gpa, hpa, len, attr, prot, spm) : ENXIO) 118#define VMGETREG(vmi, vcpu, num, retval) \ 119 (ops != NULL ? (*ops->vmgetreg)(vmi, vcpu, num, retval) : ENXIO) 120#define VMSETREG(vmi, vcpu, num, val) \ 121 (ops != NULL ? (*ops->vmsetreg)(vmi, vcpu, num, val) : ENXIO) 122#define VMGETDESC(vmi, vcpu, num, desc) \ 123 (ops != NULL ? (*ops->vmgetdesc)(vmi, vcpu, num, desc) : ENXIO) 124#define VMSETDESC(vmi, vcpu, num, desc) \ 125 (ops != NULL ? (*ops->vmsetdesc)(vmi, vcpu, num, desc) : ENXIO) 126#define VMINJECT(vmi, vcpu, type, vec, ec, ecv) \ 127 (ops != NULL ? (*ops->vminject)(vmi, vcpu, type, vec, ec, ecv) : ENXIO) 128#define VMNMI(vmi, vcpu) \ 129 (ops != NULL ? (*ops->vmnmi)(vmi, vcpu) : ENXIO) 130#define VMGETCAP(vmi, vcpu, num, retval) \ 131 (ops != NULL ? (*ops->vmgetcap)(vmi, vcpu, num, retval) : ENXIO) 132#define VMSETCAP(vmi, vcpu, num, val) \ 133 (ops != NULL ? 
(*ops->vmsetcap)(vmi, vcpu, num, val) : ENXIO) 134 135#define fpu_start_emulating() start_emulating() 136#define fpu_stop_emulating() stop_emulating() 137 138static MALLOC_DEFINE(M_VM, "vm", "vm"); 139CTASSERT(VMM_MSR_NUM <= 64); /* msr_mask can keep track of up to 64 msrs */ 140 141/* statistics */ 142static VMM_STAT_DEFINE(VCPU_TOTAL_RUNTIME, "vcpu total runtime"); 143 144static void 145vcpu_cleanup(struct vcpu *vcpu) 146{ 147 vlapic_cleanup(vcpu->vlapic); 148 vmm_stat_free(vcpu->stats); 149 fpu_save_area_free(vcpu->guestfpu); 150} 151 152static void 153vcpu_init(struct vm *vm, uint32_t vcpu_id) 154{ 155 struct vcpu *vcpu; 156 157 vcpu = &vm->vcpu[vcpu_id]; 158 159 vcpu->hostcpu = -1; 160 vcpu->vcpuid = vcpu_id; 161 vcpu->vlapic = vlapic_init(vm, vcpu_id); 162 vcpu->guestfpu = fpu_save_area_alloc(); 163 fpu_save_area_reset(vcpu->guestfpu); 164 vcpu->stats = vmm_stat_alloc(); 165} 166 167static int 168vmm_init(void) 169{ 170 int error; 171 172 vmm_ipi_init(); 173 174 error = vmm_mem_init(); 175 if (error) 176 return (error); 177 178 if (vmm_is_intel()) 179 ops = &vmm_ops_intel; 180 else if (vmm_is_amd()) 181 ops = &vmm_ops_amd; 182 else 183 return (ENXIO); 184 185 vmm_msr_init(); 186 187 return (VMM_INIT()); 188} 189 190static int 191vmm_handler(module_t mod, int what, void *arg) 192{ 193 int error; 194 195 switch (what) { 196 case MOD_LOAD: 197 vmmdev_init(); 198 iommu_init(); 199 error = vmm_init(); 200 break; 201 case MOD_UNLOAD: 202 vmmdev_cleanup(); 203 iommu_cleanup(); 204 vmm_ipi_cleanup(); 205 error = VMM_CLEANUP(); 206 break; 207 default: 208 error = 0; 209 break; 210 } 211 return (error); 212} 213 214static moduledata_t vmm_kmod = { 215 "vmm", 216 vmm_handler, 217 NULL 218}; 219 220/* 221 * Execute the module load handler after the pci passthru driver has had 222 * a chance to claim devices. We need this information at the time we do 223 * iommu initialization. 
224 */ 225DECLARE_MODULE(vmm, vmm_kmod, SI_SUB_CONFIGURE + 1, SI_ORDER_ANY); 226MODULE_VERSION(vmm, 1); 227 228SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW, NULL, NULL); 229 230struct vm * 231vm_create(const char *name) 232{ 233 int i; 234 struct vm *vm; 235 vm_paddr_t maxaddr; 236 237 const int BSP = 0; 238 239 if (name == NULL || strlen(name) >= VM_MAX_NAMELEN) 240 return (NULL); 241 242 vm = malloc(sizeof(struct vm), M_VM, M_WAITOK | M_ZERO); 243 strcpy(vm->name, name); 244 vm->cookie = VMINIT(vm); 245 246 for (i = 0; i < VM_MAXCPU; i++) { 247 vcpu_init(vm, i); 248 guest_msrs_init(vm, i); 249 } 250 251 maxaddr = vmm_mem_maxaddr(); 252 vm->iommu = iommu_create_domain(maxaddr); 253 vm_activate_cpu(vm, BSP); 254 255 return (vm); 256} 257 258void 259vm_destroy(struct vm *vm) 260{ 261 int i; 262 263 ppt_unassign_all(vm); 264 265 for (i = 0; i < vm->num_mem_segs; i++) 266 vmm_mem_free(vm->mem_segs[i].hpa, vm->mem_segs[i].len); 267 268 for (i = 0; i < VM_MAXCPU; i++) 269 vcpu_cleanup(&vm->vcpu[i]); 270 271 iommu_destroy_domain(vm->iommu); 272 273 VMCLEANUP(vm->cookie); 274 275 free(vm, M_VM); 276} 277 278const char * 279vm_name(struct vm *vm) 280{ 281 return (vm->name); 282} 283 284int 285vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa) 286{ 287 const boolean_t spok = TRUE; /* superpage mappings are ok */ 288 289 return (VMMMAP(vm->cookie, gpa, hpa, len, VM_MEMATTR_UNCACHEABLE, 290 VM_PROT_RW, spok)); 291} 292 293int 294vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len) 295{ 296 const boolean_t spok = TRUE; /* superpage mappings are ok */ 297 298 return (VMMMAP(vm->cookie, gpa, 0, len, VM_MEMATTR_UNCACHEABLE, 299 VM_PROT_NONE, spok)); 300} 301 302int 303vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t *ret_hpa) 304{ 305 int error; 306 vm_paddr_t hpa; 307 308 const boolean_t spok = TRUE; /* superpage mappings are ok */ 309 310 /* 311 * find the hpa if already it was already vm_malloc'd. 
312 */ 313 hpa = vm_gpa2hpa(vm, gpa, len); 314 if (hpa != ((vm_paddr_t)-1)) 315 goto out; 316 317 if (vm->num_mem_segs >= VM_MAX_MEMORY_SEGMENTS) 318 return (E2BIG); 319 320 hpa = vmm_mem_alloc(len); 321 if (hpa == 0) 322 return (ENOMEM); 323 324 error = VMMMAP(vm->cookie, gpa, hpa, len, VM_MEMATTR_WRITE_BACK, 325 VM_PROT_ALL, spok); 326 if (error) { 327 vmm_mem_free(hpa, len); 328 return (error); 329 } 330 331 iommu_create_mapping(vm->iommu, gpa, hpa, len); 332 333 vm->mem_segs[vm->num_mem_segs].gpa = gpa; 334 vm->mem_segs[vm->num_mem_segs].hpa = hpa; 335 vm->mem_segs[vm->num_mem_segs].len = len; 336 vm->num_mem_segs++; 337out: 338 *ret_hpa = hpa; 339 return (0); 340} 341 342vm_paddr_t 343vm_gpa2hpa(struct vm *vm, vm_paddr_t gpa, size_t len) 344{ 345 int i; 346 vm_paddr_t gpabase, gpalimit, hpabase; 347 348 for (i = 0; i < vm->num_mem_segs; i++) { 349 hpabase = vm->mem_segs[i].hpa; 350 gpabase = vm->mem_segs[i].gpa; 351 gpalimit = gpabase + vm->mem_segs[i].len; 352 if (gpa >= gpabase && gpa + len <= gpalimit) 353 return ((gpa - gpabase) + hpabase); 354 } 355 return ((vm_paddr_t)-1); 356} 357 358int 359vm_gpabase2memseg(struct vm *vm, vm_paddr_t gpabase, 360 struct vm_memory_segment *seg) 361{ 362 int i; 363 364 for (i = 0; i < vm->num_mem_segs; i++) { 365 if (gpabase == vm->mem_segs[i].gpa) { 366 *seg = vm->mem_segs[i]; 367 return (0); 368 } 369 } 370 return (-1); 371} 372 373int 374vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval) 375{ 376 377 if (vcpu < 0 || vcpu >= VM_MAXCPU) 378 return (EINVAL); 379 380 if (reg >= VM_REG_LAST) 381 return (EINVAL); 382 383 return (VMGETREG(vm->cookie, vcpu, reg, retval)); 384} 385 386int 387vm_set_register(struct vm *vm, int vcpu, int reg, uint64_t val) 388{ 389 390 if (vcpu < 0 || vcpu >= VM_MAXCPU) 391 return (EINVAL); 392 393 if (reg >= VM_REG_LAST) 394 return (EINVAL); 395 396 return (VMSETREG(vm->cookie, vcpu, reg, val)); 397} 398 399static boolean_t 400is_descriptor_table(int reg) 401{ 402 403 switch 
(reg) { 404 case VM_REG_GUEST_IDTR: 405 case VM_REG_GUEST_GDTR: 406 return (TRUE); 407 default: 408 return (FALSE); 409 } 410} 411 412static boolean_t 413is_segment_register(int reg) 414{ 415 416 switch (reg) { 417 case VM_REG_GUEST_ES: 418 case VM_REG_GUEST_CS: 419 case VM_REG_GUEST_SS: 420 case VM_REG_GUEST_DS: 421 case VM_REG_GUEST_FS: 422 case VM_REG_GUEST_GS: 423 case VM_REG_GUEST_TR: 424 case VM_REG_GUEST_LDTR: 425 return (TRUE); 426 default: 427 return (FALSE); 428 } 429} 430 431int 432vm_get_seg_desc(struct vm *vm, int vcpu, int reg, 433 struct seg_desc *desc) 434{ 435 436 if (vcpu < 0 || vcpu >= VM_MAXCPU) 437 return (EINVAL); 438 439 if (!is_segment_register(reg) && !is_descriptor_table(reg)) 440 return (EINVAL); 441 442 return (VMGETDESC(vm->cookie, vcpu, reg, desc)); 443} 444 445int 446vm_set_seg_desc(struct vm *vm, int vcpu, int reg, 447 struct seg_desc *desc) 448{ 449 if (vcpu < 0 || vcpu >= VM_MAXCPU) 450 return (EINVAL); 451 452 if (!is_segment_register(reg) && !is_descriptor_table(reg)) 453 return (EINVAL); 454 455 return (VMSETDESC(vm->cookie, vcpu, reg, desc)); 456} 457 458int 459vm_get_pinning(struct vm *vm, int vcpuid, int *cpuid) 460{ 461 462 if (vcpuid < 0 || vcpuid >= VM_MAXCPU) 463 return (EINVAL); 464 465 *cpuid = VCPU_PINCPU(vm, vcpuid); 466 467 return (0); 468} 469 470int 471vm_set_pinning(struct vm *vm, int vcpuid, int host_cpuid) 472{ 473 struct thread *td; 474 475 if (vcpuid < 0 || vcpuid >= VM_MAXCPU) 476 return (EINVAL); 477 478 td = curthread; /* XXXSMP only safe when muxing vcpus */ 479 480 /* unpin */ 481 if (host_cpuid < 0) { 482 VCPU_UNPIN(vm, vcpuid); 483 thread_lock(td); 484 sched_unbind(td); 485 thread_unlock(td); 486 return (0); 487 } 488 489 if (CPU_ABSENT(host_cpuid)) 490 return (EINVAL); 491 492 /* 493 * XXX we should check that 'host_cpuid' has not already been pinned 494 * by another vm. 
495 */ 496 thread_lock(td); 497 sched_bind(td, host_cpuid); 498 thread_unlock(td); 499 VCPU_PIN(vm, vcpuid, host_cpuid); 500 501 return (0); 502} 503 504static void 505restore_guest_fpustate(struct vcpu *vcpu) 506{ 507 508 /* flush host state to the pcb */ 509 fpuexit(curthread); 510 fpu_stop_emulating(); 511 fpurestore(vcpu->guestfpu); 512} 513 514static void 515save_guest_fpustate(struct vcpu *vcpu) 516{ 517 518 fpusave(vcpu->guestfpu); 519 fpu_start_emulating(); 520} 521 522int 523vm_run(struct vm *vm, struct vm_run *vmrun) 524{ 525 int error, vcpuid; 526 struct vcpu *vcpu; 527 struct pcb *pcb; 528 uint64_t tscval; 529 530 vcpuid = vmrun->cpuid; 531 532 if (vcpuid < 0 || vcpuid >= VM_MAXCPU) 533 return (EINVAL); 534 535 vcpu = &vm->vcpu[vcpuid]; 536 537 critical_enter(); 538 539 tscval = rdtsc(); 540 541 pcb = PCPU_GET(curpcb); 542 set_pcb_flags(pcb, PCB_FULL_IRET); 543 544 vcpu->hostcpu = curcpu; 545 546 restore_guest_msrs(vm, vcpuid); 547 restore_guest_fpustate(vcpu); 548 error = VMRUN(vm->cookie, vcpuid, vmrun->rip, &vmrun->vm_exit); 549 save_guest_fpustate(vcpu); 550 restore_host_msrs(vm, vcpuid); 551 552 vmm_stat_incr(vm, vcpuid, VCPU_TOTAL_RUNTIME, rdtsc() - tscval); 553 554 critical_exit(); 555 556 return (error); 557} 558 559int 560vm_inject_event(struct vm *vm, int vcpuid, int type, 561 int vector, uint32_t code, int code_valid) 562{ 563 if (vcpuid < 0 || vcpuid >= VM_MAXCPU) 564 return (EINVAL); 565 566 if ((type > VM_EVENT_NONE && type < VM_EVENT_MAX) == 0) 567 return (EINVAL); 568 569 if (vector < 0 || vector > 255) 570 return (EINVAL); 571 572 return (VMINJECT(vm->cookie, vcpuid, type, vector, code, code_valid)); 573} 574 575int 576vm_inject_nmi(struct vm *vm, int vcpu) 577{ 578 int error; 579 580 if (vcpu < 0 || vcpu >= VM_MAXCPU) 581 return (EINVAL); 582 583 error = VMNMI(vm->cookie, vcpu); 584 vm_interrupt_hostcpu(vm, vcpu); 585 return (error); 586} 587 588int 589vm_get_capability(struct vm *vm, int vcpu, int type, int *retval) 590{ 591 if (vcpu 
< 0 || vcpu >= VM_MAXCPU) 592 return (EINVAL); 593 594 if (type < 0 || type >= VM_CAP_MAX) 595 return (EINVAL); 596 597 return (VMGETCAP(vm->cookie, vcpu, type, retval)); 598} 599 600int 601vm_set_capability(struct vm *vm, int vcpu, int type, int val) 602{ 603 if (vcpu < 0 || vcpu >= VM_MAXCPU) 604 return (EINVAL); 605 606 if (type < 0 || type >= VM_CAP_MAX) 607 return (EINVAL); 608 609 return (VMSETCAP(vm->cookie, vcpu, type, val)); 610} 611 612uint64_t * 613vm_guest_msrs(struct vm *vm, int cpu) 614{ 615 return (vm->vcpu[cpu].guest_msrs); 616} 617 618struct vlapic * 619vm_lapic(struct vm *vm, int cpu) 620{ 621 return (vm->vcpu[cpu].vlapic); 622} 623 624boolean_t 625vmm_is_pptdev(int bus, int slot, int func) 626{ 627 int found, b, s, f, n; 628 char *val, *cp, *cp2; 629 630 /* 631 * setenv pptdevs "1/2/3 4/5/6 7/8/9 10/11/12" 632 */ 633 found = 0; 634 cp = val = getenv("pptdevs"); 635 while (cp != NULL && *cp != '\0') { 636 if ((cp2 = strchr(cp, ' ')) != NULL) 637 *cp2 = '\0'; 638 639 n = sscanf(cp, "%d/%d/%d", &b, &s, &f); 640 if (n == 3 && bus == b && slot == s && func == f) { 641 found = 1; 642 break; 643 } 644 645 if (cp2 != NULL) 646 *cp2++ = ' '; 647 648 cp = cp2; 649 } 650 freeenv(val); 651 return (found); 652} 653 654void * 655vm_iommu_domain(struct vm *vm) 656{ 657 658 return (vm->iommu); 659} 660 661void 662vm_set_run_state(struct vm *vm, int vcpuid, int state) 663{ 664 struct vcpu *vcpu; 665 666 if (vcpuid < 0 || vcpuid >= VM_MAXCPU) 667 panic("vm_set_run_state: invalid vcpuid %d", vcpuid); 668 669 vcpu = &vm->vcpu[vcpuid]; 670 671 if (state == VCPU_RUNNING) { 672 if (vcpu->flags & VCPU_F_RUNNING) { 673 panic("vm_set_run_state: %s[%d] is already running", 674 vm_name(vm), vcpuid); 675 } 676 vcpu->flags |= VCPU_F_RUNNING; 677 } else { 678 if ((vcpu->flags & VCPU_F_RUNNING) == 0) { 679 panic("vm_set_run_state: %s[%d] is already stopped", 680 vm_name(vm), vcpuid); 681 } 682 vcpu->flags &= ~VCPU_F_RUNNING; 683 } 684} 685 686int 687vm_get_run_state(struct vm 
*vm, int vcpuid, int *cpuptr) 688{ 689 int retval, hostcpu; 690 struct vcpu *vcpu; 691 692 if (vcpuid < 0 || vcpuid >= VM_MAXCPU) 693 panic("vm_get_run_state: invalid vcpuid %d", vcpuid); 694 695 vcpu = &vm->vcpu[vcpuid]; 696 if (vcpu->flags & VCPU_F_RUNNING) { 697 retval = VCPU_RUNNING; 698 hostcpu = vcpu->hostcpu; 699 } else { 700 retval = VCPU_STOPPED; 701 hostcpu = -1; 702 } 703 704 if (cpuptr) 705 *cpuptr = hostcpu; 706 707 return (retval); 708} 709 710void 711vm_activate_cpu(struct vm *vm, int vcpuid) 712{ 713 714 if (vcpuid >= 0 && vcpuid < VM_MAXCPU) 715 CPU_SET(vcpuid, &vm->active_cpus); 716} 717 718cpuset_t 719vm_active_cpus(struct vm *vm) 720{ 721 722 return (vm->active_cpus); 723} 724 725void * 726vcpu_stats(struct vm *vm, int vcpuid) 727{ 728 729 return (vm->vcpu[vcpuid].stats); 730} 731