vmm_dev.c revision 268891
/*-
 * Copyright (c) 2011 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD: stable/10/sys/amd64/vmm/vmm_dev.c 268891 2014-07-19 22:06:46Z jhb $
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: stable/10/sys/amd64/vmm/vmm_dev.c 268891 2014-07-19 22:06:46Z jhb $");

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/queue.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/malloc.h>
#include <sys/conf.h>
#include <sys/sysctl.h>
#include <sys/libkern.h>
#include <sys/ioccom.h>
#include <sys/mman.h>
#include <sys/uio.h>

#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>

#include <machine/vmparam.h>
#include <machine/vmm.h>
#include <machine/vmm_dev.h>

#include "vmm_lapic.h"
#include "vmm_stat.h"
#include "vmm_mem.h"
#include "io/ppt.h"
#include "io/vatpic.h"
#include "io/vioapic.h"
#include "io/vhpet.h"

struct vmmdev_softc {
	struct vm	*vm;		/* vm instance cookie */
	struct cdev	*cdev;
	SLIST_ENTRY(vmmdev_softc) link;
	int		flags;
};
#define	VSC_LINKED	0x01

static SLIST_HEAD(, vmmdev_softc) head;

static struct mtx vmmdev_mtx;

static MALLOC_DEFINE(M_VMMDEV, "vmmdev", "vmmdev");

SYSCTL_DECL(_hw_vmm);

static struct vmmdev_softc *
vmmdev_lookup(const char *name)
{
	struct vmmdev_softc *sc;

#ifdef notyet	/* XXX kernel is not compiled with invariants */
	mtx_assert(&vmmdev_mtx, MA_OWNED);
#endif

	SLIST_FOREACH(sc, &head, link) {
		if (strcmp(name, vm_name(sc->vm)) == 0)
			break;
	}

	return (sc);
}

static struct vmmdev_softc *
vmmdev_lookup2(struct cdev *cdev)
{

	return (cdev->si_drv1);
}

static int
vmmdev_rw(struct cdev *cdev, struct uio *uio, int flags)
{
	int error, off, c, prot;
	vm_paddr_t gpa;
	void *hpa, *cookie;
	struct vmmdev_softc *sc;

	static char zerobuf[PAGE_SIZE];

	error = 0;
	sc = vmmdev_lookup2(cdev);
	if (sc == NULL)
		error = ENXIO;

	prot = (uio->uio_rw == UIO_WRITE ? VM_PROT_WRITE : VM_PROT_READ);
	while (uio->uio_resid > 0 && error == 0) {
		gpa = uio->uio_offset;
		off = gpa & PAGE_MASK;
		c = min(uio->uio_resid, PAGE_SIZE - off);

		/*
		 * The VM has a hole in its physical memory map. If we want to
		 * use 'dd' to inspect memory beyond the hole we need to
		 * provide bogus data for memory that lies in the hole.
		 *
		 * Since this device does not support lseek(2), dd(1) will
		 * read(2) blocks of data to simulate the lseek(2).
		 */
		hpa = vm_gpa_hold(sc->vm, gpa, c, prot, &cookie);
		if (hpa == NULL) {
			if (uio->uio_rw == UIO_READ)
				error = uiomove(zerobuf, c, uio);
			else
				error = EFAULT;
		} else {
			error = uiomove(hpa, c, uio);
			vm_gpa_release(cookie);
		}
	}
	return (error);
}
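
/*
 * Example (illustrative, with a hypothetical VM name 'myvm'): the
 * zero-fill behavior above lets dd(1) walk the whole guest physical
 * address space sequentially, e.g. dumping one page at guest physical
 * address 1MB:
 *
 *	dd if=/dev/vmm/myvm bs=4k skip=256 count=1 | hexdump -C
 *
 * Because the device is not seekable, dd(1) implements 'skip' by
 * reading and discarding blocks, so the command works even when the
 * skipped range covers a hole in the memory map.
 */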
static int
vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
    struct thread *td)
{
	int error, vcpu, state_changed;
	struct vmmdev_softc *sc;
	struct vm_memory_segment *seg;
	struct vm_register *vmreg;
	struct vm_seg_desc *vmsegdesc;
	struct vm_run *vmrun;
	struct vm_exception *vmexc;
	struct vm_lapic_irq *vmirq;
	struct vm_lapic_msi *vmmsi;
	struct vm_ioapic_irq *ioapic_irq;
	struct vm_isa_irq *isa_irq;
	struct vm_capability *vmcap;
	struct vm_pptdev *pptdev;
	struct vm_pptdev_mmio *pptmmio;
	struct vm_pptdev_msi *pptmsi;
	struct vm_pptdev_msix *pptmsix;
	struct vm_nmi *vmnmi;
	struct vm_stats *vmstats;
	struct vm_stat_desc *statdesc;
	struct vm_x2apic *x2apic;
	struct vm_gpa_pte *gpapte;

	sc = vmmdev_lookup2(cdev);
	if (sc == NULL)
		return (ENXIO);

	error = 0;
	vcpu = -1;
	state_changed = 0;

	/*
	 * Some VMM ioctls can operate only on vcpus that are not running.
	 */
	switch (cmd) {
	case VM_RUN:
	case VM_GET_REGISTER:
	case VM_SET_REGISTER:
	case VM_GET_SEGMENT_DESCRIPTOR:
	case VM_SET_SEGMENT_DESCRIPTOR:
	case VM_INJECT_EXCEPTION:
	case VM_GET_CAPABILITY:
	case VM_SET_CAPABILITY:
	case VM_PPTDEV_MSI:
	case VM_PPTDEV_MSIX:
	case VM_SET_X2APIC_STATE:
		/*
		 * XXX fragile, handle with care
		 * Assumes that the first field of the ioctl data is the vcpu.
		 */
		vcpu = *(int *)data;
		if (vcpu < 0 || vcpu >= VM_MAXCPU) {
			error = EINVAL;
			goto done;
		}

		error = vcpu_set_state(sc->vm, vcpu, VCPU_FROZEN, true);
		if (error)
			goto done;

		state_changed = 1;
		break;

	case VM_MAP_PPTDEV_MMIO:
	case VM_BIND_PPTDEV:
	case VM_UNBIND_PPTDEV:
	case VM_MAP_MEMORY:
		/*
		 * ioctls that operate on the entire virtual machine must
		 * prevent all vcpus from running.
		 */
		error = 0;
		for (vcpu = 0; vcpu < VM_MAXCPU; vcpu++) {
			error = vcpu_set_state(sc->vm, vcpu, VCPU_FROZEN, true);
			if (error)
				break;
		}

		if (error) {
			while (--vcpu >= 0)
				vcpu_set_state(sc->vm, vcpu, VCPU_IDLE, false);
			goto done;
		}

		state_changed = 2;
		break;

	default:
		break;
	}

	switch(cmd) {
	case VM_RUN:
		vmrun = (struct vm_run *)data;
		error = vm_run(sc->vm, vmrun);
		break;
	case VM_STAT_DESC: {
		statdesc = (struct vm_stat_desc *)data;
		error = vmm_stat_desc_copy(statdesc->index,
		    statdesc->desc, sizeof(statdesc->desc));
		break;
	}
	case VM_STATS: {
		CTASSERT(MAX_VM_STATS >= MAX_VMM_STAT_ELEMS);
		vmstats = (struct vm_stats *)data;
		getmicrotime(&vmstats->tv);
		error = vmm_stat_copy(sc->vm, vmstats->cpuid,
		    &vmstats->num_entries, vmstats->statbuf);
		break;
	}
	case VM_PPTDEV_MSI:
		pptmsi = (struct vm_pptdev_msi *)data;
		error = ppt_setup_msi(sc->vm, pptmsi->vcpu,
		    pptmsi->bus, pptmsi->slot, pptmsi->func,
		    pptmsi->addr, pptmsi->msg,
		    pptmsi->numvec);
		break;
	case VM_PPTDEV_MSIX:
		pptmsix = (struct vm_pptdev_msix *)data;
		error = ppt_setup_msix(sc->vm, pptmsix->vcpu,
		    pptmsix->bus, pptmsix->slot,
		    pptmsix->func, pptmsix->idx,
		    pptmsix->addr, pptmsix->msg,
		    pptmsix->vector_control);
		break;
	case VM_MAP_PPTDEV_MMIO:
		pptmmio = (struct vm_pptdev_mmio *)data;
		error = ppt_map_mmio(sc->vm, pptmmio->bus, pptmmio->slot,
		    pptmmio->func, pptmmio->gpa, pptmmio->len,
		    pptmmio->hpa);
		break;
	case VM_BIND_PPTDEV:
		pptdev = (struct vm_pptdev *)data;
		error = vm_assign_pptdev(sc->vm, pptdev->bus, pptdev->slot,
		    pptdev->func);
		break;
	case VM_UNBIND_PPTDEV:
		pptdev = (struct vm_pptdev *)data;
		error = vm_unassign_pptdev(sc->vm, pptdev->bus, pptdev->slot,
		    pptdev->func);
		break;
	case VM_INJECT_EXCEPTION:
		vmexc = (struct vm_exception *)data;
		error = vm_inject_exception(sc->vm, vmexc->cpuid, vmexc);
		break;
	case VM_INJECT_NMI:
		vmnmi = (struct vm_nmi *)data;
		error = vm_inject_nmi(sc->vm, vmnmi->cpuid);
		break;
	case VM_LAPIC_IRQ:
		vmirq = (struct vm_lapic_irq *)data;
		error = lapic_intr_edge(sc->vm, vmirq->cpuid, vmirq->vector);
		break;
	case VM_LAPIC_LOCAL_IRQ:
		vmirq = (struct vm_lapic_irq *)data;
		error = lapic_set_local_intr(sc->vm, vmirq->cpuid,
		    vmirq->vector);
		break;
	case VM_LAPIC_MSI:
		vmmsi = (struct vm_lapic_msi *)data;
		error = lapic_intr_msi(sc->vm, vmmsi->addr, vmmsi->msg);
		break;
	case VM_IOAPIC_ASSERT_IRQ:
		ioapic_irq = (struct vm_ioapic_irq *)data;
		error = vioapic_assert_irq(sc->vm, ioapic_irq->irq);
		break;
	case VM_IOAPIC_DEASSERT_IRQ:
		ioapic_irq = (struct vm_ioapic_irq *)data;
		error = vioapic_deassert_irq(sc->vm, ioapic_irq->irq);
		break;
	case VM_IOAPIC_PULSE_IRQ:
		ioapic_irq = (struct vm_ioapic_irq *)data;
		error = vioapic_pulse_irq(sc->vm, ioapic_irq->irq);
		break;
	case VM_IOAPIC_PINCOUNT:
		*(int *)data = vioapic_pincount(sc->vm);
		break;
	case VM_ISA_ASSERT_IRQ:
		isa_irq = (struct vm_isa_irq *)data;
		error = vatpic_assert_irq(sc->vm, isa_irq->atpic_irq);
		if (error == 0 && isa_irq->ioapic_irq != -1)
			error = vioapic_assert_irq(sc->vm,
			    isa_irq->ioapic_irq);
		break;
	case VM_ISA_DEASSERT_IRQ:
		isa_irq = (struct vm_isa_irq *)data;
		error = vatpic_deassert_irq(sc->vm, isa_irq->atpic_irq);
		if (error == 0 && isa_irq->ioapic_irq != -1)
			error = vioapic_deassert_irq(sc->vm,
			    isa_irq->ioapic_irq);
		break;
	case VM_ISA_PULSE_IRQ:
		isa_irq = (struct vm_isa_irq *)data;
		error = vatpic_pulse_irq(sc->vm, isa_irq->atpic_irq);
		if (error == 0 && isa_irq->ioapic_irq != -1)
			error = vioapic_pulse_irq(sc->vm, isa_irq->ioapic_irq);
		break;
	case VM_MAP_MEMORY:
		seg = (struct vm_memory_segment *)data;
		error = vm_malloc(sc->vm, seg->gpa, seg->len);
		break;
	case VM_GET_MEMORY_SEG:
		seg = (struct vm_memory_segment *)data;
		seg->len = 0;
		(void)vm_gpabase2memseg(sc->vm, seg->gpa, seg);
		error = 0;
		break;
	case VM_GET_REGISTER:
		vmreg = (struct vm_register *)data;
		error = vm_get_register(sc->vm, vmreg->cpuid, vmreg->regnum,
		    &vmreg->regval);
		break;
	case VM_SET_REGISTER:
		vmreg = (struct vm_register *)data;
		error = vm_set_register(sc->vm, vmreg->cpuid, vmreg->regnum,
		    vmreg->regval);
		break;
	case VM_SET_SEGMENT_DESCRIPTOR:
		vmsegdesc = (struct vm_seg_desc *)data;
		error = vm_set_seg_desc(sc->vm, vmsegdesc->cpuid,
		    vmsegdesc->regnum,
		    &vmsegdesc->desc);
		break;
	case VM_GET_SEGMENT_DESCRIPTOR:
		vmsegdesc = (struct vm_seg_desc *)data;
		error = vm_get_seg_desc(sc->vm, vmsegdesc->cpuid,
		    vmsegdesc->regnum,
		    &vmsegdesc->desc);
		break;
	case VM_GET_CAPABILITY:
		vmcap = (struct vm_capability *)data;
		error = vm_get_capability(sc->vm, vmcap->cpuid,
		    vmcap->captype,
		    &vmcap->capval);
		break;
	case VM_SET_CAPABILITY:
		vmcap = (struct vm_capability *)data;
		error = vm_set_capability(sc->vm, vmcap->cpuid,
		    vmcap->captype,
		    vmcap->capval);
		break;
	case VM_SET_X2APIC_STATE:
		x2apic = (struct vm_x2apic *)data;
		error = vm_set_x2apic_state(sc->vm,
		    x2apic->cpuid, x2apic->state);
		break;
	case VM_GET_X2APIC_STATE:
		x2apic = (struct vm_x2apic *)data;
		error = vm_get_x2apic_state(sc->vm,
		    x2apic->cpuid, &x2apic->state);
		break;
	case VM_GET_GPA_PMAP:
		gpapte = (struct vm_gpa_pte *)data;
		pmap_get_mapping(vmspace_pmap(vm_get_vmspace(sc->vm)),
		    gpapte->gpa, gpapte->pte, &gpapte->ptenum);
		error = 0;
		break;
	case VM_GET_HPET_CAPABILITIES:
		error = vhpet_getcap((struct vm_hpet_cap *)data);
		break;
	default:
		error = ENOTTY;
		break;
	}

	if (state_changed == 1) {
		vcpu_set_state(sc->vm, vcpu, VCPU_IDLE, false);
	} else if (state_changed == 2) {
		for (vcpu = 0; vcpu < VM_MAXCPU; vcpu++)
			vcpu_set_state(sc->vm, vcpu, VCPU_IDLE, false);
	}

done:
	/* Make sure that no handler returns a bogus value like ERESTART */
	KASSERT(error >= 0, ("vmmdev_ioctl: invalid error return %d", error));
	return (error);
}
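
/*
 * Example (illustrative): the ioctls above are issued against an open
 * descriptor for /dev/vmm/<name>.  A minimal userspace sketch that reads
 * the %rip of vcpu 0 from a hypothetical VM named 'myvm', using the
 * definitions from <machine/vmm.h> and <machine/vmm_dev.h>:
 *
 *	struct vm_register vmreg;
 *	int fd = open("/dev/vmm/myvm", O_RDWR);
 *
 *	vmreg.cpuid = 0;		// first field must be the vcpu
 *	vmreg.regnum = VM_REG_GUEST_RIP;
 *	if (fd >= 0 && ioctl(fd, VM_GET_REGISTER, &vmreg) == 0)
 *		printf("rip = 0x%lx\n", vmreg.regval);
 *
 * VM_GET_REGISTER is one of the "frozen vcpu" ioctls, so the call fails
 * if vcpu 0 cannot be transitioned out of the running state.
 */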
static int
vmmdev_mmap_single(struct cdev *cdev, vm_ooffset_t *offset,
    vm_size_t size, struct vm_object **object, int nprot)
{
	int error;
	struct vmmdev_softc *sc;

	sc = vmmdev_lookup2(cdev);
	if (sc != NULL && (nprot & PROT_EXEC) == 0)
		error = vm_get_memobj(sc->vm, *offset, size, offset, object);
	else
		error = EINVAL;

	return (error);
}
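
/*
 * Example (illustrative): vmmdev_mmap_single() backs mmap(2) on the
 * device node.  Userspace (e.g. bhyve(8) via libvmmapi) maps guest
 * memory by using the guest physical address as the file offset; the
 * check above rejects executable mappings.  A sketch, assuming a
 * descriptor 'fd' for /dev/vmm/<name> and a segment of 'len' bytes at
 * guest physical address 0:
 *
 *	void *gva = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED,
 *	    fd, 0);
 */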
static void
vmmdev_destroy(void *arg)
{

	struct vmmdev_softc *sc = arg;

	if (sc->cdev != NULL)
		destroy_dev(sc->cdev);

	if (sc->vm != NULL)
		vm_destroy(sc->vm);

	if ((sc->flags & VSC_LINKED) != 0) {
		mtx_lock(&vmmdev_mtx);
		SLIST_REMOVE(&head, sc, vmmdev_softc, link);
		mtx_unlock(&vmmdev_mtx);
	}

	free(sc, M_VMMDEV);
}

static int
sysctl_vmm_destroy(SYSCTL_HANDLER_ARGS)
{
	int error;
	char buf[VM_MAX_NAMELEN];
	struct vmmdev_softc *sc;
	struct cdev *cdev;

	strlcpy(buf, "beavis", sizeof(buf));
	error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
	if (error != 0 || req->newptr == NULL)
		return (error);

	mtx_lock(&vmmdev_mtx);
	sc = vmmdev_lookup(buf);
	if (sc == NULL || sc->cdev == NULL) {
		mtx_unlock(&vmmdev_mtx);
		return (EINVAL);
	}

	/*
	 * The 'cdev' will be destroyed asynchronously when 'si_threadcount'
	 * goes down to 0 so we should not do it again in the callback.
	 */
	cdev = sc->cdev;
	sc->cdev = NULL;
	mtx_unlock(&vmmdev_mtx);

	/*
	 * Schedule the 'cdev' to be destroyed:
	 *
	 * - any new operations on this 'cdev' will return an error (ENXIO).
	 *
	 * - when the 'si_threadcount' dwindles down to zero the 'cdev' will
	 *   be destroyed and the callback will be invoked in a taskqueue
	 *   context.
	 */
	destroy_dev_sched_cb(cdev, vmmdev_destroy, sc);

	return (0);
}
SYSCTL_PROC(_hw_vmm, OID_AUTO, destroy, CTLTYPE_STRING | CTLFLAG_RW,
    NULL, 0, sysctl_vmm_destroy, "A", NULL);
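
/*
 * Example (illustrative): a VM is torn down from userspace by writing
 * its name to the sysctl declared above, e.g. for a hypothetical VM
 * named 'myvm':
 *
 *	sysctl hw.vmm.destroy=myvm
 *
 * Teardown is asynchronous: the sysctl returns once the cdev has been
 * scheduled for destruction, not when vmmdev_destroy() has run.
 */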
static struct cdevsw vmmdevsw = {
	.d_name		= "vmmdev",
	.d_version	= D_VERSION,
	.d_ioctl	= vmmdev_ioctl,
	.d_mmap_single	= vmmdev_mmap_single,
	.d_read		= vmmdev_rw,
	.d_write	= vmmdev_rw,
};

static int
sysctl_vmm_create(SYSCTL_HANDLER_ARGS)
{
	int error;
	struct vm *vm;
	struct cdev *cdev;
	struct vmmdev_softc *sc, *sc2;
	char buf[VM_MAX_NAMELEN];

	strlcpy(buf, "beavis", sizeof(buf));
	error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
	if (error != 0 || req->newptr == NULL)
		return (error);

	mtx_lock(&vmmdev_mtx);
	sc = vmmdev_lookup(buf);
	mtx_unlock(&vmmdev_mtx);
	if (sc != NULL)
		return (EEXIST);

	error = vm_create(buf, &vm);
	if (error != 0)
		return (error);

	sc = malloc(sizeof(struct vmmdev_softc), M_VMMDEV, M_WAITOK | M_ZERO);
	sc->vm = vm;

	/*
	 * Lookup the name again just in case somebody sneaked in when we
	 * dropped the lock.
	 */
	mtx_lock(&vmmdev_mtx);
	sc2 = vmmdev_lookup(buf);
	if (sc2 == NULL) {
		SLIST_INSERT_HEAD(&head, sc, link);
		sc->flags |= VSC_LINKED;
	}
	mtx_unlock(&vmmdev_mtx);

	if (sc2 != NULL) {
		vmmdev_destroy(sc);
		return (EEXIST);
	}

	error = make_dev_p(MAKEDEV_CHECKNAME, &cdev, &vmmdevsw, NULL,
	    UID_ROOT, GID_WHEEL, 0600, "vmm/%s", buf);
	if (error != 0) {
		vmmdev_destroy(sc);
		return (error);
	}

	mtx_lock(&vmmdev_mtx);
	sc->cdev = cdev;
	sc->cdev->si_drv1 = sc;
	mtx_unlock(&vmmdev_mtx);

	return (0);
}
SYSCTL_PROC(_hw_vmm, OID_AUTO, create, CTLTYPE_STRING | CTLFLAG_RW,
    NULL, 0, sysctl_vmm_create, "A", NULL);

void
vmmdev_init(void)
{
	mtx_init(&vmmdev_mtx, "vmm device mutex", NULL, MTX_DEF);
}

int
vmmdev_cleanup(void)
{
	int error;

	if (SLIST_EMPTY(&head))
		error = 0;
	else
		error = EBUSY;

	return (error);
}
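
/*
 * Usage sketch (illustrative) tying the pieces above together, for a
 * hypothetical VM named 'myvm':
 *
 *	sysctl hw.vmm.create=myvm	creates the VM and /dev/vmm/myvm
 *	open/ioctl/mmap/read on /dev/vmm/myvm, as in the earlier examples
 *	sysctl hw.vmm.destroy=myvm	schedules the cdev teardown
 *
 * vmmdev_init() and vmmdev_cleanup() are invoked from the vmm(4) module
 * load and unload paths; cleanup returns EBUSY while any VM softc is
 * still linked on the list.
 */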