vmm_dev.c revision 268972
1/*- 2 * Copyright (c) 2011 NetApp, Inc. 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND 15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17 * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE 18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 24 * SUCH DAMAGE. 25 * 26 * $FreeBSD: stable/10/sys/amd64/vmm/vmm_dev.c 268972 2014-07-22 03:14:37Z jhb $ 27 */ 28 29#include <sys/cdefs.h> 30__FBSDID("$FreeBSD: stable/10/sys/amd64/vmm/vmm_dev.c 268972 2014-07-22 03:14:37Z jhb $"); 31 32#include <sys/param.h> 33#include <sys/kernel.h> 34#include <sys/queue.h> 35#include <sys/lock.h> 36#include <sys/mutex.h> 37#include <sys/malloc.h> 38#include <sys/conf.h> 39#include <sys/sysctl.h> 40#include <sys/libkern.h> 41#include <sys/ioccom.h> 42#include <sys/mman.h> 43#include <sys/uio.h> 44 45#include <vm/vm.h> 46#include <vm/pmap.h> 47#include <vm/vm_map.h> 48 49#include <machine/vmparam.h> 50#include <machine/vmm.h> 51#include <machine/vmm_dev.h> 52 53#include "vmm_lapic.h" 54#include "vmm_stat.h" 55#include "vmm_mem.h" 56#include "io/ppt.h" 57#include "io/vatpic.h" 58#include "io/vioapic.h" 59#include "io/vhpet.h" 60 61struct vmmdev_softc { 62 struct vm *vm; /* vm instance cookie */ 63 struct cdev *cdev; 64 SLIST_ENTRY(vmmdev_softc) link; 65 int flags; 66}; 67#define VSC_LINKED 0x01 68 69static SLIST_HEAD(, vmmdev_softc) head; 70 71static struct mtx vmmdev_mtx; 72 73static MALLOC_DEFINE(M_VMMDEV, "vmmdev", "vmmdev"); 74 75SYSCTL_DECL(_hw_vmm); 76 77static struct vmmdev_softc * 78vmmdev_lookup(const char *name) 79{ 80 struct vmmdev_softc *sc; 81 82#ifdef notyet /* XXX kernel is not compiled with invariants */ 83 mtx_assert(&vmmdev_mtx, MA_OWNED); 84#endif 85 86 SLIST_FOREACH(sc, &head, link) { 87 if (strcmp(name, vm_name(sc->vm)) == 0) 88 break; 89 } 90 91 return (sc); 92} 93 94static struct vmmdev_softc * 95vmmdev_lookup2(struct cdev *cdev) 96{ 97 98 return (cdev->si_drv1); 99} 100 101static int 102vmmdev_rw(struct cdev *cdev, struct uio *uio, int flags) 103{ 104 int error, off, c, prot; 105 vm_paddr_t gpa; 106 void *hpa, *cookie; 107 struct vmmdev_softc *sc; 108 109 static char zerobuf[PAGE_SIZE]; 110 111 error = 0; 112 sc = vmmdev_lookup2(cdev); 113 if (sc == NULL) 114 error = ENXIO; 115 116 prot = (uio->uio_rw == UIO_WRITE ? VM_PROT_WRITE : VM_PROT_READ); 117 while (uio->uio_resid > 0 && error == 0) { 118 gpa = uio->uio_offset; 119 off = gpa & PAGE_MASK; 120 c = min(uio->uio_resid, PAGE_SIZE - off); 121 122 /* 123 * The VM has a hole in its physical memory map. If we want to 124 * use 'dd' to inspect memory beyond the hole we need to 125 * provide bogus data for memory that lies in the hole. 126 * 127 * Since this device does not support lseek(2), dd(1) will 128 * read(2) blocks of data to simulate the lseek(2). 129 */ 130 hpa = vm_gpa_hold(sc->vm, gpa, c, prot, &cookie); 131 if (hpa == NULL) { 132 if (uio->uio_rw == UIO_READ) 133 error = uiomove(zerobuf, c, uio); 134 else 135 error = EFAULT; 136 } else { 137 error = uiomove(hpa, c, uio); 138 vm_gpa_release(cookie); 139 } 140 } 141 return (error); 142} 143 144static int 145vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag, 146 struct thread *td) 147{ 148 int error, vcpu, state_changed; 149 struct vmmdev_softc *sc; 150 struct vm_memory_segment *seg; 151 struct vm_register *vmreg; 152 struct vm_seg_desc *vmsegdesc; 153 struct vm_run *vmrun; 154 struct vm_exception *vmexc; 155 struct vm_lapic_irq *vmirq; 156 struct vm_lapic_msi *vmmsi; 157 struct vm_ioapic_irq *ioapic_irq; 158 struct vm_isa_irq *isa_irq; 159 struct vm_isa_irq_trigger *isa_irq_trigger; 160 struct vm_capability *vmcap; 161 struct vm_pptdev *pptdev; 162 struct vm_pptdev_mmio *pptmmio; 163 struct vm_pptdev_msi *pptmsi; 164 struct vm_pptdev_msix *pptmsix; 165 struct vm_nmi *vmnmi; 166 struct vm_stats *vmstats; 167 struct vm_stat_desc *statdesc; 168 struct vm_x2apic *x2apic; 169 struct vm_gpa_pte *gpapte; 170 struct vm_suspend *vmsuspend; 171 172 sc = vmmdev_lookup2(cdev); 173 if (sc == NULL) 174 return (ENXIO); 175 176 error = 0; 177 vcpu = -1; 178 state_changed = 0; 179 180 /* 181 * Some VMM ioctls can operate only on vcpus that are not running. 182 */ 183 switch (cmd) { 184 case VM_RUN: 185 case VM_GET_REGISTER: 186 case VM_SET_REGISTER: 187 case VM_GET_SEGMENT_DESCRIPTOR: 188 case VM_SET_SEGMENT_DESCRIPTOR: 189 case VM_INJECT_EXCEPTION: 190 case VM_GET_CAPABILITY: 191 case VM_SET_CAPABILITY: 192 case VM_PPTDEV_MSI: 193 case VM_PPTDEV_MSIX: 194 case VM_SET_X2APIC_STATE: 195 /* 196 * XXX fragile, handle with care 197 * Assumes that the first field of the ioctl data is the vcpu. 198 */ 199 vcpu = *(int *)data; 200 if (vcpu < 0 || vcpu >= VM_MAXCPU) { 201 error = EINVAL; 202 goto done; 203 } 204 205 error = vcpu_set_state(sc->vm, vcpu, VCPU_FROZEN, true); 206 if (error) 207 goto done; 208 209 state_changed = 1; 210 break; 211 212 case VM_MAP_PPTDEV_MMIO: 213 case VM_BIND_PPTDEV: 214 case VM_UNBIND_PPTDEV: 215 case VM_MAP_MEMORY: 216 /* 217 * ioctls that operate on the entire virtual machine must 218 * prevent all vcpus from running. 219 */ 220 error = 0; 221 for (vcpu = 0; vcpu < VM_MAXCPU; vcpu++) { 222 error = vcpu_set_state(sc->vm, vcpu, VCPU_FROZEN, true); 223 if (error) 224 break; 225 } 226 227 if (error) { 228 while (--vcpu >= 0) 229 vcpu_set_state(sc->vm, vcpu, VCPU_IDLE, false); 230 goto done; 231 } 232 233 state_changed = 2; 234 break; 235 236 default: 237 break; 238 } 239 240 switch(cmd) { 241 case VM_RUN: 242 vmrun = (struct vm_run *)data; 243 error = vm_run(sc->vm, vmrun); 244 break; 245 case VM_SUSPEND: 246 vmsuspend = (struct vm_suspend *)data; 247 error = vm_suspend(sc->vm, vmsuspend->how); 248 break; 249 case VM_STAT_DESC: { 250 statdesc = (struct vm_stat_desc *)data; 251 error = vmm_stat_desc_copy(statdesc->index, 252 statdesc->desc, sizeof(statdesc->desc)); 253 break; 254 } 255 case VM_STATS: { 256 CTASSERT(MAX_VM_STATS >= MAX_VMM_STAT_ELEMS); 257 vmstats = (struct vm_stats *)data; 258 getmicrotime(&vmstats->tv); 259 error = vmm_stat_copy(sc->vm, vmstats->cpuid, 260 &vmstats->num_entries, vmstats->statbuf); 261 break; 262 } 263 case VM_PPTDEV_MSI: 264 pptmsi = (struct vm_pptdev_msi *)data; 265 error = ppt_setup_msi(sc->vm, pptmsi->vcpu, 266 pptmsi->bus, pptmsi->slot, pptmsi->func, 267 pptmsi->addr, pptmsi->msg, 268 pptmsi->numvec); 269 break; 270 case VM_PPTDEV_MSIX: 271 pptmsix = (struct vm_pptdev_msix *)data; 272 error = ppt_setup_msix(sc->vm, pptmsix->vcpu, 273 pptmsix->bus, pptmsix->slot, 274 pptmsix->func, pptmsix->idx, 275 pptmsix->addr, pptmsix->msg, 276 pptmsix->vector_control); 277 break; 278 case VM_MAP_PPTDEV_MMIO: 279 pptmmio = (struct vm_pptdev_mmio *)data; 280 error = ppt_map_mmio(sc->vm, pptmmio->bus, pptmmio->slot, 281 pptmmio->func, pptmmio->gpa, pptmmio->len, 282 pptmmio->hpa); 283 break; 284 case VM_BIND_PPTDEV: 285 pptdev = (struct vm_pptdev *)data; 286 error = vm_assign_pptdev(sc->vm, pptdev->bus, pptdev->slot, 287 pptdev->func); 288 break; 289 case VM_UNBIND_PPTDEV: 290 pptdev = (struct vm_pptdev *)data; 291 error = vm_unassign_pptdev(sc->vm, pptdev->bus, pptdev->slot, 292 pptdev->func); 293 break; 294 case VM_INJECT_EXCEPTION: 295 vmexc = (struct vm_exception *)data; 296 error = vm_inject_exception(sc->vm, vmexc->cpuid, vmexc); 297 break; 298 case VM_INJECT_NMI: 299 vmnmi = (struct vm_nmi *)data; 300 error = vm_inject_nmi(sc->vm, vmnmi->cpuid); 301 break; 302 case VM_LAPIC_IRQ: 303 vmirq = (struct vm_lapic_irq *)data; 304 error = lapic_intr_edge(sc->vm, vmirq->cpuid, vmirq->vector); 305 break; 306 case VM_LAPIC_LOCAL_IRQ: 307 vmirq = (struct vm_lapic_irq *)data; 308 error = lapic_set_local_intr(sc->vm, vmirq->cpuid, 309 vmirq->vector); 310 break; 311 case VM_LAPIC_MSI: 312 vmmsi = (struct vm_lapic_msi *)data; 313 error = lapic_intr_msi(sc->vm, vmmsi->addr, vmmsi->msg); 314 break; 315 case VM_IOAPIC_ASSERT_IRQ: 316 ioapic_irq = (struct vm_ioapic_irq *)data; 317 error = vioapic_assert_irq(sc->vm, ioapic_irq->irq); 318 break; 319 case VM_IOAPIC_DEASSERT_IRQ: 320 ioapic_irq = (struct vm_ioapic_irq *)data; 321 error = vioapic_deassert_irq(sc->vm, ioapic_irq->irq); 322 break; 323 case VM_IOAPIC_PULSE_IRQ: 324 ioapic_irq = (struct vm_ioapic_irq *)data; 325 error = vioapic_pulse_irq(sc->vm, ioapic_irq->irq); 326 break; 327 case VM_IOAPIC_PINCOUNT: 328 *(int *)data = vioapic_pincount(sc->vm); 329 break; 330 case VM_ISA_ASSERT_IRQ: 331 isa_irq = (struct vm_isa_irq *)data; 332 error = vatpic_assert_irq(sc->vm, isa_irq->atpic_irq); 333 if (error == 0 && isa_irq->ioapic_irq != -1) 334 error = vioapic_assert_irq(sc->vm, 335 isa_irq->ioapic_irq); 336 break; 337 case VM_ISA_DEASSERT_IRQ: 338 isa_irq = (struct vm_isa_irq *)data; 339 error = vatpic_deassert_irq(sc->vm, isa_irq->atpic_irq); 340 if (error == 0 && isa_irq->ioapic_irq != -1) 341 error = vioapic_deassert_irq(sc->vm, 342 isa_irq->ioapic_irq); 343 break; 344 case VM_ISA_PULSE_IRQ: 345 isa_irq = (struct vm_isa_irq *)data; 346 error = vatpic_pulse_irq(sc->vm, isa_irq->atpic_irq); 347 if (error == 0 && isa_irq->ioapic_irq != -1) 348 error = vioapic_pulse_irq(sc->vm, isa_irq->ioapic_irq); 349 break; 350 case VM_ISA_SET_IRQ_TRIGGER: 351 isa_irq_trigger = (struct vm_isa_irq_trigger *)data; 352 error = vatpic_set_irq_trigger(sc->vm, 353 isa_irq_trigger->atpic_irq, isa_irq_trigger->trigger); 354 break; 355 case VM_MAP_MEMORY: 356 seg = (struct vm_memory_segment *)data; 357 error = vm_malloc(sc->vm, seg->gpa, seg->len); 358 break; 359 case VM_GET_MEMORY_SEG: 360 seg = (struct vm_memory_segment *)data; 361 seg->len = 0; 362 (void)vm_gpabase2memseg(sc->vm, seg->gpa, seg); 363 error = 0; 364 break; 365 case VM_GET_REGISTER: 366 vmreg = (struct vm_register *)data; 367 error = vm_get_register(sc->vm, vmreg->cpuid, vmreg->regnum, 368 &vmreg->regval); 369 break; 370 case VM_SET_REGISTER: 371 vmreg = (struct vm_register *)data; 372 error = vm_set_register(sc->vm, vmreg->cpuid, vmreg->regnum, 373 vmreg->regval); 374 break; 375 case VM_SET_SEGMENT_DESCRIPTOR: 376 vmsegdesc = (struct vm_seg_desc *)data; 377 error = vm_set_seg_desc(sc->vm, vmsegdesc->cpuid, 378 vmsegdesc->regnum, 379 &vmsegdesc->desc); 380 break; 381 case VM_GET_SEGMENT_DESCRIPTOR: 382 vmsegdesc = (struct vm_seg_desc *)data; 383 error = vm_get_seg_desc(sc->vm, vmsegdesc->cpuid, 384 vmsegdesc->regnum, 385 &vmsegdesc->desc); 386 break; 387 case VM_GET_CAPABILITY: 388 vmcap = (struct vm_capability *)data; 389 error = vm_get_capability(sc->vm, vmcap->cpuid, 390 vmcap->captype, 391 &vmcap->capval); 392 break; 393 case VM_SET_CAPABILITY: 394 vmcap = (struct vm_capability *)data; 395 error = vm_set_capability(sc->vm, vmcap->cpuid, 396 vmcap->captype, 397 vmcap->capval); 398 break; 399 case VM_SET_X2APIC_STATE: 400 x2apic = (struct vm_x2apic *)data; 401 error = vm_set_x2apic_state(sc->vm, 402 x2apic->cpuid, x2apic->state); 403 break; 404 case VM_GET_X2APIC_STATE: 405 x2apic = (struct vm_x2apic *)data; 406 error = vm_get_x2apic_state(sc->vm, 407 x2apic->cpuid, &x2apic->state); 408 break; 409 case VM_GET_GPA_PMAP: 410 gpapte = (struct vm_gpa_pte *)data; 411 pmap_get_mapping(vmspace_pmap(vm_get_vmspace(sc->vm)), 412 gpapte->gpa, gpapte->pte, &gpapte->ptenum); 413 error = 0; 414 break; 415 case VM_GET_HPET_CAPABILITIES: 416 error = vhpet_getcap((struct vm_hpet_cap *)data); 417 break; 418 default: 419 error = ENOTTY; 420 break; 421 } 422 423 if (state_changed == 1) { 424 vcpu_set_state(sc->vm, vcpu, VCPU_IDLE, false); 425 } else if (state_changed == 2) { 426 for (vcpu = 0; vcpu < VM_MAXCPU; vcpu++) 427 vcpu_set_state(sc->vm, vcpu, VCPU_IDLE, false); 428 } 429 430done: 431 /* Make sure that no handler returns a bogus value like ERESTART */ 432 KASSERT(error >= 0, ("vmmdev_ioctl: invalid error return %d", error)); 433 return (error); 434} 435 436static int 437vmmdev_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, 438 vm_size_t size, struct vm_object **object, int nprot) 439{ 440 int error; 441 struct vmmdev_softc *sc; 442 443 sc = vmmdev_lookup2(cdev); 444 if (sc != NULL && (nprot & PROT_EXEC) == 0) 445 error = vm_get_memobj(sc->vm, *offset, size, offset, object); 446 else 447 error = EINVAL; 448 449 return (error); 450} 451 452static void 453vmmdev_destroy(void *arg) 454{ 455 456 struct vmmdev_softc *sc = arg; 457 458 if (sc->cdev != NULL) 459 destroy_dev(sc->cdev); 460 461 if (sc->vm != NULL) 462 vm_destroy(sc->vm); 463 464 if ((sc->flags & VSC_LINKED) != 0) { 465 mtx_lock(&vmmdev_mtx); 466 SLIST_REMOVE(&head, sc, vmmdev_softc, link); 467 mtx_unlock(&vmmdev_mtx); 468 } 469 470 free(sc, M_VMMDEV); 471} 472 473static int 474sysctl_vmm_destroy(SYSCTL_HANDLER_ARGS) 475{ 476 int error; 477 char buf[VM_MAX_NAMELEN]; 478 struct vmmdev_softc *sc; 479 struct cdev *cdev; 480 481 strlcpy(buf, "beavis", sizeof(buf)); 482 error = sysctl_handle_string(oidp, buf, sizeof(buf), req); 483 if (error != 0 || req->newptr == NULL) 484 return (error); 485 486 mtx_lock(&vmmdev_mtx); 487 sc = vmmdev_lookup(buf); 488 if (sc == NULL || sc->cdev == NULL) { 489 mtx_unlock(&vmmdev_mtx); 490 return (EINVAL); 491 } 492 493 /* 494 * The 'cdev' will be destroyed asynchronously when 'si_threadcount' 495 * goes down to 0 so we should not do it again in the callback. 496 */ 497 cdev = sc->cdev; 498 sc->cdev = NULL; 499 mtx_unlock(&vmmdev_mtx); 500 501 /* 502 * Schedule the 'cdev' to be destroyed: 503 * 504 * - any new operations on this 'cdev' will return an error (ENXIO). 505 * 506 * - when the 'si_threadcount' dwindles down to zero the 'cdev' will 507 * be destroyed and the callback will be invoked in a taskqueue 508 * context. 509 */ 510 destroy_dev_sched_cb(cdev, vmmdev_destroy, sc); 511 512 return (0); 513} 514SYSCTL_PROC(_hw_vmm, OID_AUTO, destroy, CTLTYPE_STRING | CTLFLAG_RW, 515 NULL, 0, sysctl_vmm_destroy, "A", NULL); 516 517static struct cdevsw vmmdevsw = { 518 .d_name = "vmmdev", 519 .d_version = D_VERSION, 520 .d_ioctl = vmmdev_ioctl, 521 .d_mmap_single = vmmdev_mmap_single, 522 .d_read = vmmdev_rw, 523 .d_write = vmmdev_rw, 524}; 525 526static int 527sysctl_vmm_create(SYSCTL_HANDLER_ARGS) 528{ 529 int error; 530 struct vm *vm; 531 struct cdev *cdev; 532 struct vmmdev_softc *sc, *sc2; 533 char buf[VM_MAX_NAMELEN]; 534 535 strlcpy(buf, "beavis", sizeof(buf)); 536 error = sysctl_handle_string(oidp, buf, sizeof(buf), req); 537 if (error != 0 || req->newptr == NULL) 538 return (error); 539 540 mtx_lock(&vmmdev_mtx); 541 sc = vmmdev_lookup(buf); 542 mtx_unlock(&vmmdev_mtx); 543 if (sc != NULL) 544 return (EEXIST); 545 546 error = vm_create(buf, &vm); 547 if (error != 0) 548 return (error); 549 550 sc = malloc(sizeof(struct vmmdev_softc), M_VMMDEV, M_WAITOK | M_ZERO); 551 sc->vm = vm; 552 553 /* 554 * Lookup the name again just in case somebody sneaked in when we 555 * dropped the lock. 556 */ 557 mtx_lock(&vmmdev_mtx); 558 sc2 = vmmdev_lookup(buf); 559 if (sc2 == NULL) { 560 SLIST_INSERT_HEAD(&head, sc, link); 561 sc->flags |= VSC_LINKED; 562 } 563 mtx_unlock(&vmmdev_mtx); 564 565 if (sc2 != NULL) { 566 vmmdev_destroy(sc); 567 return (EEXIST); 568 } 569 570 error = make_dev_p(MAKEDEV_CHECKNAME, &cdev, &vmmdevsw, NULL, 571 UID_ROOT, GID_WHEEL, 0600, "vmm/%s", buf); 572 if (error != 0) { 573 vmmdev_destroy(sc); 574 return (error); 575 } 576 577 mtx_lock(&vmmdev_mtx); 578 sc->cdev = cdev; 579 sc->cdev->si_drv1 = sc; 580 mtx_unlock(&vmmdev_mtx); 581 582 return (0); 583} 584SYSCTL_PROC(_hw_vmm, OID_AUTO, create, CTLTYPE_STRING | CTLFLAG_RW, 585 NULL, 0, sysctl_vmm_create, "A", NULL); 586 587void 588vmmdev_init(void) 589{ 590 mtx_init(&vmmdev_mtx, "vmm device mutex", NULL, MTX_DEF); 591} 592 593int 594vmmdev_cleanup(void) 595{ 596 int error; 597 598 if (SLIST_EMPTY(&head)) 599 error = 0; 600 else 601 error = EBUSY; 602 603 return (error); 604} 605