vmm_dev.c revision 268935
155714Skris/*- 255714Skris * Copyright (c) 2011 NetApp, Inc. 355714Skris * All rights reserved. 455714Skris * 555714Skris * Redistribution and use in source and binary forms, with or without 655714Skris * modification, are permitted provided that the following conditions 755714Skris * are met: 855714Skris * 1. Redistributions of source code must retain the above copyright 955714Skris * notice, this list of conditions and the following disclaimer. 1055714Skris * 2. Redistributions in binary form must reproduce the above copyright 1155714Skris * notice, this list of conditions and the following disclaimer in the 1255714Skris * documentation and/or other materials provided with the distribution. 1355714Skris * 1455714Skris * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND 1555714Skris * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 1655714Skris * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 1755714Skris * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE 1855714Skris * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 1955714Skris * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 2055714Skris * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 2155714Skris * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 2255714Skris * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 2355714Skris * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 2455714Skris * SUCH DAMAGE. 2555714Skris * 2655714Skris * $FreeBSD: stable/10/sys/amd64/vmm/vmm_dev.c 268935 2014-07-21 02:39:17Z jhb $ 2755714Skris */ 2855714Skris 2955714Skris#include <sys/cdefs.h> 3055714Skris__FBSDID("$FreeBSD: stable/10/sys/amd64/vmm/vmm_dev.c 268935 2014-07-21 02:39:17Z jhb $"); 3155714Skris 3255714Skris#include <sys/param.h> 3355714Skris#include <sys/kernel.h> 3455714Skris#include <sys/queue.h> 3555714Skris#include <sys/lock.h> 3655714Skris#include <sys/mutex.h> 3755714Skris#include <sys/malloc.h> 3855714Skris#include <sys/conf.h> 3955714Skris#include <sys/sysctl.h> 40#include <sys/libkern.h> 41#include <sys/ioccom.h> 42#include <sys/mman.h> 43#include <sys/uio.h> 44 45#include <vm/vm.h> 46#include <vm/pmap.h> 47#include <vm/vm_map.h> 48 49#include <machine/vmparam.h> 50#include <machine/vmm.h> 51#include <machine/vmm_dev.h> 52 53#include "vmm_lapic.h" 54#include "vmm_stat.h" 55#include "vmm_mem.h" 56#include "io/ppt.h" 57#include "io/vatpic.h" 58#include "io/vioapic.h" 59#include "io/vhpet.h" 60 61struct vmmdev_softc { 62 struct vm *vm; /* vm instance cookie */ 63 struct cdev *cdev; 64 SLIST_ENTRY(vmmdev_softc) link; 65 int flags; 66}; 67#define VSC_LINKED 0x01 68 69static SLIST_HEAD(, vmmdev_softc) head; 70 71static struct mtx vmmdev_mtx; 72 73static MALLOC_DEFINE(M_VMMDEV, "vmmdev", "vmmdev"); 74 75SYSCTL_DECL(_hw_vmm); 76 77static struct vmmdev_softc * 78vmmdev_lookup(const char *name) 79{ 80 struct vmmdev_softc *sc; 81 82#ifdef notyet /* XXX kernel is not compiled with invariants */ 83 mtx_assert(&vmmdev_mtx, MA_OWNED); 84#endif 85 86 SLIST_FOREACH(sc, &head, link) { 87 if (strcmp(name, vm_name(sc->vm)) == 0) 88 break; 89 } 90 91 return (sc); 92} 93 94static struct vmmdev_softc * 95vmmdev_lookup2(struct cdev *cdev) 96{ 97 98 return (cdev->si_drv1); 99} 100 101static int 102vmmdev_rw(struct cdev *cdev, struct uio *uio, int flags) 103{ 104 int error, off, c, prot; 105 vm_paddr_t gpa; 106 void *hpa, *cookie; 107 struct vmmdev_softc *sc; 108 109 static char zerobuf[PAGE_SIZE]; 110 111 error = 0; 112 sc = vmmdev_lookup2(cdev); 113 if (sc == NULL) 114 error = ENXIO; 115 116 prot = (uio->uio_rw == UIO_WRITE ? VM_PROT_WRITE : VM_PROT_READ); 117 while (uio->uio_resid > 0 && error == 0) { 118 gpa = uio->uio_offset; 119 off = gpa & PAGE_MASK; 120 c = min(uio->uio_resid, PAGE_SIZE - off); 121 122 /* 123 * The VM has a hole in its physical memory map. If we want to 124 * use 'dd' to inspect memory beyond the hole we need to 125 * provide bogus data for memory that lies in the hole. 126 * 127 * Since this device does not support lseek(2), dd(1) will 128 * read(2) blocks of data to simulate the lseek(2). 129 */ 130 hpa = vm_gpa_hold(sc->vm, gpa, c, prot, &cookie); 131 if (hpa == NULL) { 132 if (uio->uio_rw == UIO_READ) 133 error = uiomove(zerobuf, c, uio); 134 else 135 error = EFAULT; 136 } else { 137 error = uiomove(hpa, c, uio); 138 vm_gpa_release(cookie); 139 } 140 } 141 return (error); 142} 143 144static int 145vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag, 146 struct thread *td) 147{ 148 int error, vcpu, state_changed; 149 struct vmmdev_softc *sc; 150 struct vm_memory_segment *seg; 151 struct vm_register *vmreg; 152 struct vm_seg_desc *vmsegdesc; 153 struct vm_run *vmrun; 154 struct vm_exception *vmexc; 155 struct vm_lapic_irq *vmirq; 156 struct vm_lapic_msi *vmmsi; 157 struct vm_ioapic_irq *ioapic_irq; 158 struct vm_isa_irq *isa_irq; 159 struct vm_capability *vmcap; 160 struct vm_pptdev *pptdev; 161 struct vm_pptdev_mmio *pptmmio; 162 struct vm_pptdev_msi *pptmsi; 163 struct vm_pptdev_msix *pptmsix; 164 struct vm_nmi *vmnmi; 165 struct vm_stats *vmstats; 166 struct vm_stat_desc *statdesc; 167 struct vm_x2apic *x2apic; 168 struct vm_gpa_pte *gpapte; 169 struct vm_suspend *vmsuspend; 170 171 sc = vmmdev_lookup2(cdev); 172 if (sc == NULL) 173 return (ENXIO); 174 175 error = 0; 176 vcpu = -1; 177 state_changed = 0; 178 179 /* 180 * Some VMM ioctls can operate only on vcpus that are not running. 181 */ 182 switch (cmd) { 183 case VM_RUN: 184 case VM_GET_REGISTER: 185 case VM_SET_REGISTER: 186 case VM_GET_SEGMENT_DESCRIPTOR: 187 case VM_SET_SEGMENT_DESCRIPTOR: 188 case VM_INJECT_EXCEPTION: 189 case VM_GET_CAPABILITY: 190 case VM_SET_CAPABILITY: 191 case VM_PPTDEV_MSI: 192 case VM_PPTDEV_MSIX: 193 case VM_SET_X2APIC_STATE: 194 /* 195 * XXX fragile, handle with care 196 * Assumes that the first field of the ioctl data is the vcpu. 197 */ 198 vcpu = *(int *)data; 199 if (vcpu < 0 || vcpu >= VM_MAXCPU) { 200 error = EINVAL; 201 goto done; 202 } 203 204 error = vcpu_set_state(sc->vm, vcpu, VCPU_FROZEN, true); 205 if (error) 206 goto done; 207 208 state_changed = 1; 209 break; 210 211 case VM_MAP_PPTDEV_MMIO: 212 case VM_BIND_PPTDEV: 213 case VM_UNBIND_PPTDEV: 214 case VM_MAP_MEMORY: 215 /* 216 * ioctls that operate on the entire virtual machine must 217 * prevent all vcpus from running. 218 */ 219 error = 0; 220 for (vcpu = 0; vcpu < VM_MAXCPU; vcpu++) { 221 error = vcpu_set_state(sc->vm, vcpu, VCPU_FROZEN, true); 222 if (error) 223 break; 224 } 225 226 if (error) { 227 while (--vcpu >= 0) 228 vcpu_set_state(sc->vm, vcpu, VCPU_IDLE, false); 229 goto done; 230 } 231 232 state_changed = 2; 233 break; 234 235 default: 236 break; 237 } 238 239 switch(cmd) { 240 case VM_RUN: 241 vmrun = (struct vm_run *)data; 242 error = vm_run(sc->vm, vmrun); 243 break; 244 case VM_SUSPEND: 245 vmsuspend = (struct vm_suspend *)data; 246 error = vm_suspend(sc->vm, vmsuspend->how); 247 break; 248 case VM_STAT_DESC: { 249 statdesc = (struct vm_stat_desc *)data; 250 error = vmm_stat_desc_copy(statdesc->index, 251 statdesc->desc, sizeof(statdesc->desc)); 252 break; 253 } 254 case VM_STATS: { 255 CTASSERT(MAX_VM_STATS >= MAX_VMM_STAT_ELEMS); 256 vmstats = (struct vm_stats *)data; 257 getmicrotime(&vmstats->tv); 258 error = vmm_stat_copy(sc->vm, vmstats->cpuid, 259 &vmstats->num_entries, vmstats->statbuf); 260 break; 261 } 262 case VM_PPTDEV_MSI: 263 pptmsi = (struct vm_pptdev_msi *)data; 264 error = ppt_setup_msi(sc->vm, pptmsi->vcpu, 265 pptmsi->bus, pptmsi->slot, pptmsi->func, 266 pptmsi->addr, pptmsi->msg, 267 pptmsi->numvec); 268 break; 269 case VM_PPTDEV_MSIX: 270 pptmsix = (struct vm_pptdev_msix *)data; 271 error = ppt_setup_msix(sc->vm, pptmsix->vcpu, 272 pptmsix->bus, pptmsix->slot, 273 pptmsix->func, pptmsix->idx, 274 pptmsix->addr, pptmsix->msg, 275 pptmsix->vector_control); 276 break; 277 case VM_MAP_PPTDEV_MMIO: 278 pptmmio = (struct vm_pptdev_mmio *)data; 279 error = ppt_map_mmio(sc->vm, pptmmio->bus, pptmmio->slot, 280 pptmmio->func, pptmmio->gpa, pptmmio->len, 281 pptmmio->hpa); 282 break; 283 case VM_BIND_PPTDEV: 284 pptdev = (struct vm_pptdev *)data; 285 error = vm_assign_pptdev(sc->vm, pptdev->bus, pptdev->slot, 286 pptdev->func); 287 break; 288 case VM_UNBIND_PPTDEV: 289 pptdev = (struct vm_pptdev *)data; 290 error = vm_unassign_pptdev(sc->vm, pptdev->bus, pptdev->slot, 291 pptdev->func); 292 break; 293 case VM_INJECT_EXCEPTION: 294 vmexc = (struct vm_exception *)data; 295 error = vm_inject_exception(sc->vm, vmexc->cpuid, vmexc); 296 break; 297 case VM_INJECT_NMI: 298 vmnmi = (struct vm_nmi *)data; 299 error = vm_inject_nmi(sc->vm, vmnmi->cpuid); 300 break; 301 case VM_LAPIC_IRQ: 302 vmirq = (struct vm_lapic_irq *)data; 303 error = lapic_intr_edge(sc->vm, vmirq->cpuid, vmirq->vector); 304 break; 305 case VM_LAPIC_LOCAL_IRQ: 306 vmirq = (struct vm_lapic_irq *)data; 307 error = lapic_set_local_intr(sc->vm, vmirq->cpuid, 308 vmirq->vector); 309 break; 310 case VM_LAPIC_MSI: 311 vmmsi = (struct vm_lapic_msi *)data; 312 error = lapic_intr_msi(sc->vm, vmmsi->addr, vmmsi->msg); 313 break; 314 case VM_IOAPIC_ASSERT_IRQ: 315 ioapic_irq = (struct vm_ioapic_irq *)data; 316 error = vioapic_assert_irq(sc->vm, ioapic_irq->irq); 317 break; 318 case VM_IOAPIC_DEASSERT_IRQ: 319 ioapic_irq = (struct vm_ioapic_irq *)data; 320 error = vioapic_deassert_irq(sc->vm, ioapic_irq->irq); 321 break; 322 case VM_IOAPIC_PULSE_IRQ: 323 ioapic_irq = (struct vm_ioapic_irq *)data; 324 error = vioapic_pulse_irq(sc->vm, ioapic_irq->irq); 325 break; 326 case VM_IOAPIC_PINCOUNT: 327 *(int *)data = vioapic_pincount(sc->vm); 328 break; 329 case VM_ISA_ASSERT_IRQ: 330 isa_irq = (struct vm_isa_irq *)data; 331 error = vatpic_assert_irq(sc->vm, isa_irq->atpic_irq); 332 if (error == 0 && isa_irq->ioapic_irq != -1) 333 error = vioapic_assert_irq(sc->vm, 334 isa_irq->ioapic_irq); 335 break; 336 case VM_ISA_DEASSERT_IRQ: 337 isa_irq = (struct vm_isa_irq *)data; 338 error = vatpic_deassert_irq(sc->vm, isa_irq->atpic_irq); 339 if (error == 0 && isa_irq->ioapic_irq != -1) 340 error = vioapic_deassert_irq(sc->vm, 341 isa_irq->ioapic_irq); 342 break; 343 case VM_ISA_PULSE_IRQ: 344 isa_irq = (struct vm_isa_irq *)data; 345 error = vatpic_pulse_irq(sc->vm, isa_irq->atpic_irq); 346 if (error == 0 && isa_irq->ioapic_irq != -1) 347 error = vioapic_pulse_irq(sc->vm, isa_irq->ioapic_irq); 348 break; 349 case VM_MAP_MEMORY: 350 seg = (struct vm_memory_segment *)data; 351 error = vm_malloc(sc->vm, seg->gpa, seg->len); 352 break; 353 case VM_GET_MEMORY_SEG: 354 seg = (struct vm_memory_segment *)data; 355 seg->len = 0; 356 (void)vm_gpabase2memseg(sc->vm, seg->gpa, seg); 357 error = 0; 358 break; 359 case VM_GET_REGISTER: 360 vmreg = (struct vm_register *)data; 361 error = vm_get_register(sc->vm, vmreg->cpuid, vmreg->regnum, 362 &vmreg->regval); 363 break; 364 case VM_SET_REGISTER: 365 vmreg = (struct vm_register *)data; 366 error = vm_set_register(sc->vm, vmreg->cpuid, vmreg->regnum, 367 vmreg->regval); 368 break; 369 case VM_SET_SEGMENT_DESCRIPTOR: 370 vmsegdesc = (struct vm_seg_desc *)data; 371 error = vm_set_seg_desc(sc->vm, vmsegdesc->cpuid, 372 vmsegdesc->regnum, 373 &vmsegdesc->desc); 374 break; 375 case VM_GET_SEGMENT_DESCRIPTOR: 376 vmsegdesc = (struct vm_seg_desc *)data; 377 error = vm_get_seg_desc(sc->vm, vmsegdesc->cpuid, 378 vmsegdesc->regnum, 379 &vmsegdesc->desc); 380 break; 381 case VM_GET_CAPABILITY: 382 vmcap = (struct vm_capability *)data; 383 error = vm_get_capability(sc->vm, vmcap->cpuid, 384 vmcap->captype, 385 &vmcap->capval); 386 break; 387 case VM_SET_CAPABILITY: 388 vmcap = (struct vm_capability *)data; 389 error = vm_set_capability(sc->vm, vmcap->cpuid, 390 vmcap->captype, 391 vmcap->capval); 392 break; 393 case VM_SET_X2APIC_STATE: 394 x2apic = (struct vm_x2apic *)data; 395 error = vm_set_x2apic_state(sc->vm, 396 x2apic->cpuid, x2apic->state); 397 break; 398 case VM_GET_X2APIC_STATE: 399 x2apic = (struct vm_x2apic *)data; 400 error = vm_get_x2apic_state(sc->vm, 401 x2apic->cpuid, &x2apic->state); 402 break; 403 case VM_GET_GPA_PMAP: 404 gpapte = (struct vm_gpa_pte *)data; 405 pmap_get_mapping(vmspace_pmap(vm_get_vmspace(sc->vm)), 406 gpapte->gpa, gpapte->pte, &gpapte->ptenum); 407 error = 0; 408 break; 409 case VM_GET_HPET_CAPABILITIES: 410 error = vhpet_getcap((struct vm_hpet_cap *)data); 411 break; 412 default: 413 error = ENOTTY; 414 break; 415 } 416 417 if (state_changed == 1) { 418 vcpu_set_state(sc->vm, vcpu, VCPU_IDLE, false); 419 } else if (state_changed == 2) { 420 for (vcpu = 0; vcpu < VM_MAXCPU; vcpu++) 421 vcpu_set_state(sc->vm, vcpu, VCPU_IDLE, false); 422 } 423 424done: 425 /* Make sure that no handler returns a bogus value like ERESTART */ 426 KASSERT(error >= 0, ("vmmdev_ioctl: invalid error return %d", error)); 427 return (error); 428} 429 430static int 431vmmdev_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, 432 vm_size_t size, struct vm_object **object, int nprot) 433{ 434 int error; 435 struct vmmdev_softc *sc; 436 437 sc = vmmdev_lookup2(cdev); 438 if (sc != NULL && (nprot & PROT_EXEC) == 0) 439 error = vm_get_memobj(sc->vm, *offset, size, offset, object); 440 else 441 error = EINVAL; 442 443 return (error); 444} 445 446static void 447vmmdev_destroy(void *arg) 448{ 449 450 struct vmmdev_softc *sc = arg; 451 452 if (sc->cdev != NULL) 453 destroy_dev(sc->cdev); 454 455 if (sc->vm != NULL) 456 vm_destroy(sc->vm); 457 458 if ((sc->flags & VSC_LINKED) != 0) { 459 mtx_lock(&vmmdev_mtx); 460 SLIST_REMOVE(&head, sc, vmmdev_softc, link); 461 mtx_unlock(&vmmdev_mtx); 462 } 463 464 free(sc, M_VMMDEV); 465} 466 467static int 468sysctl_vmm_destroy(SYSCTL_HANDLER_ARGS) 469{ 470 int error; 471 char buf[VM_MAX_NAMELEN]; 472 struct vmmdev_softc *sc; 473 struct cdev *cdev; 474 475 strlcpy(buf, "beavis", sizeof(buf)); 476 error = sysctl_handle_string(oidp, buf, sizeof(buf), req); 477 if (error != 0 || req->newptr == NULL) 478 return (error); 479 480 mtx_lock(&vmmdev_mtx); 481 sc = vmmdev_lookup(buf); 482 if (sc == NULL || sc->cdev == NULL) { 483 mtx_unlock(&vmmdev_mtx); 484 return (EINVAL); 485 } 486 487 /* 488 * The 'cdev' will be destroyed asynchronously when 'si_threadcount' 489 * goes down to 0 so we should not do it again in the callback. 490 */ 491 cdev = sc->cdev; 492 sc->cdev = NULL; 493 mtx_unlock(&vmmdev_mtx); 494 495 /* 496 * Schedule the 'cdev' to be destroyed: 497 * 498 * - any new operations on this 'cdev' will return an error (ENXIO). 499 * 500 * - when the 'si_threadcount' dwindles down to zero the 'cdev' will 501 * be destroyed and the callback will be invoked in a taskqueue 502 * context. 503 */ 504 destroy_dev_sched_cb(cdev, vmmdev_destroy, sc); 505 506 return (0); 507} 508SYSCTL_PROC(_hw_vmm, OID_AUTO, destroy, CTLTYPE_STRING | CTLFLAG_RW, 509 NULL, 0, sysctl_vmm_destroy, "A", NULL); 510 511static struct cdevsw vmmdevsw = { 512 .d_name = "vmmdev", 513 .d_version = D_VERSION, 514 .d_ioctl = vmmdev_ioctl, 515 .d_mmap_single = vmmdev_mmap_single, 516 .d_read = vmmdev_rw, 517 .d_write = vmmdev_rw, 518}; 519 520static int 521sysctl_vmm_create(SYSCTL_HANDLER_ARGS) 522{ 523 int error; 524 struct vm *vm; 525 struct cdev *cdev; 526 struct vmmdev_softc *sc, *sc2; 527 char buf[VM_MAX_NAMELEN]; 528 529 strlcpy(buf, "beavis", sizeof(buf)); 530 error = sysctl_handle_string(oidp, buf, sizeof(buf), req); 531 if (error != 0 || req->newptr == NULL) 532 return (error); 533 534 mtx_lock(&vmmdev_mtx); 535 sc = vmmdev_lookup(buf); 536 mtx_unlock(&vmmdev_mtx); 537 if (sc != NULL) 538 return (EEXIST); 539 540 error = vm_create(buf, &vm); 541 if (error != 0) 542 return (error); 543 544 sc = malloc(sizeof(struct vmmdev_softc), M_VMMDEV, M_WAITOK | M_ZERO); 545 sc->vm = vm; 546 547 /* 548 * Lookup the name again just in case somebody sneaked in when we 549 * dropped the lock. 550 */ 551 mtx_lock(&vmmdev_mtx); 552 sc2 = vmmdev_lookup(buf); 553 if (sc2 == NULL) { 554 SLIST_INSERT_HEAD(&head, sc, link); 555 sc->flags |= VSC_LINKED; 556 } 557 mtx_unlock(&vmmdev_mtx); 558 559 if (sc2 != NULL) { 560 vmmdev_destroy(sc); 561 return (EEXIST); 562 } 563 564 error = make_dev_p(MAKEDEV_CHECKNAME, &cdev, &vmmdevsw, NULL, 565 UID_ROOT, GID_WHEEL, 0600, "vmm/%s", buf); 566 if (error != 0) { 567 vmmdev_destroy(sc); 568 return (error); 569 } 570 571 mtx_lock(&vmmdev_mtx); 572 sc->cdev = cdev; 573 sc->cdev->si_drv1 = sc; 574 mtx_unlock(&vmmdev_mtx); 575 576 return (0); 577} 578SYSCTL_PROC(_hw_vmm, OID_AUTO, create, CTLTYPE_STRING | CTLFLAG_RW, 579 NULL, 0, sysctl_vmm_create, "A", NULL); 580 581void 582vmmdev_init(void) 583{ 584 mtx_init(&vmmdev_mtx, "vmm device mutex", NULL, MTX_DEF); 585} 586 587int 588vmmdev_cleanup(void) 589{ 590 int error; 591 592 if (SLIST_EMPTY(&head)) 593 error = 0; 594 else 595 error = EBUSY; 596 597 return (error); 598} 599