/* vmm_dev.c — FreeBSD stable/10, revision 267393 */
/*-
 * Copyright (c) 2011 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD: stable/10/sys/amd64/vmm/vmm_dev.c 267393 2014-06-12 13:13:15Z jhb $
 */

/*
 * /dev/vmm/<name> character-device front end for the bhyve hypervisor.
 * VM instances are created and destroyed through the hw.vmm.create and
 * hw.vmm.destroy sysctls; a per-VM cdev provides read/write access to
 * guest physical memory, an mmap path for memory segments, and the ioctl
 * surface used by bhyve(8)/bhyvectl(8).
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: stable/10/sys/amd64/vmm/vmm_dev.c 267393 2014-06-12 13:13:15Z jhb $");

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/queue.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/malloc.h>
#include <sys/conf.h>
#include <sys/sysctl.h>
#include <sys/libkern.h>
#include <sys/ioccom.h>
#include <sys/mman.h>
#include <sys/uio.h>

#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>

#include <machine/vmparam.h>
#include <machine/vmm.h>
#include <machine/vmm_dev.h>

#include "vmm_lapic.h"
#include "vmm_stat.h"
#include "vmm_mem.h"
#include "io/ppt.h"
#include "io/vioapic.h"
#include "io/vhpet.h"

/*
 * Per-VM device state.  One of these is allocated for each VM created via
 * the hw.vmm.create sysctl and linked onto the global 'head' list.
 */
struct vmmdev_softc {
	struct vm	*vm;		/* vm instance cookie */
	struct cdev	*cdev;
	SLIST_ENTRY(vmmdev_softc) link;
	int		flags;
};
#define	VSC_LINKED		0x01	/* softc is on the global list */

static SLIST_HEAD(, vmmdev_softc) head;

/* Protects 'head' and the softc create/destroy handoff. */
static struct mtx vmmdev_mtx;

static MALLOC_DEFINE(M_VMMDEV, "vmmdev", "vmmdev");

SYSCTL_DECL(_hw_vmm);

/*
 * Return the softc whose VM is named 'name', or NULL if no such VM exists.
 * Callers are expected to hold vmmdev_mtx (the assertion is compiled out
 * because the kernel is not built with invariants).
 */
static struct vmmdev_softc *
vmmdev_lookup(const char *name)
{
	struct vmmdev_softc *sc;

#ifdef notyet	/* XXX kernel is not compiled with invariants */
	mtx_assert(&vmmdev_mtx, MA_OWNED);
#endif

	SLIST_FOREACH(sc, &head, link) {
		if (strcmp(name, vm_name(sc->vm)) == 0)
			break;
	}

	return (sc);
}

/*
 * Map a cdev back to its softc.  si_drv1 is set in sysctl_vmm_create();
 * it may be NULL if open races with VM creation/destruction.
 */
static struct vmmdev_softc *
vmmdev_lookup2(struct cdev *cdev)
{

	return (cdev->si_drv1);
}

/*
 * read(2)/write(2) handler: copy data between the caller and guest
 * physical memory, one page-bounded chunk at a time.  uio_offset is
 * interpreted as a guest physical address.
 */
static int
vmmdev_rw(struct cdev *cdev, struct uio *uio, int flags)
{
	int error, off, c, prot;
	vm_paddr_t gpa;
	void *hpa, *cookie;
	struct vmmdev_softc *sc;

	/* Page of zeroes returned for reads of unbacked guest addresses. */
	static char zerobuf[PAGE_SIZE];

	error = 0;
	sc = vmmdev_lookup2(cdev);
	if (sc == NULL)
		error = ENXIO;

	prot = (uio->uio_rw == UIO_WRITE ? VM_PROT_WRITE : VM_PROT_READ);
	while (uio->uio_resid > 0 && error == 0) {
		gpa = uio->uio_offset;
		off = gpa & PAGE_MASK;
		/* Never cross a page boundary in a single hold/uiomove. */
		c = min(uio->uio_resid, PAGE_SIZE - off);

		/*
		 * The VM has a hole in its physical memory map. If we want to
		 * use 'dd' to inspect memory beyond the hole we need to
		 * provide bogus data for memory that lies in the hole.
		 *
		 * Since this device does not support lseek(2), dd(1) will
		 * read(2) blocks of data to simulate the lseek(2).
		 */
		hpa = vm_gpa_hold(sc->vm, gpa, c, prot, &cookie);
		if (hpa == NULL) {
			if (uio->uio_rw == UIO_READ)
				error = uiomove(zerobuf, c, uio);
			else
				error = EFAULT;
		} else {
			error = uiomove(hpa, c, uio);
			vm_gpa_release(cookie);
		}
	}
	return (error);
}

/*
 * ioctl(2) handler: the control-plane entry point for bhyve userland.
 *
 * Before dispatching, freeze the vcpu(s) an ioctl operates on:
 * state_changed == 1 means a single vcpu was frozen, state_changed == 2
 * means all vcpus were frozen; the matching unfreeze happens after the
 * dispatch switch.  Errors from handlers must be >= 0 (no ERESTART).
 */
static int
vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
	     struct thread *td)
{
	int error, vcpu, state_changed;
	struct vmmdev_softc *sc;
	struct vm_memory_segment *seg;
	struct vm_register *vmreg;
	struct vm_seg_desc *vmsegdesc;
	struct vm_run *vmrun;
	struct vm_event *vmevent;
	struct vm_lapic_irq *vmirq;
	struct vm_lapic_msi *vmmsi;
	struct vm_ioapic_irq *ioapic_irq;
	struct vm_capability *vmcap;
	struct vm_pptdev *pptdev;
	struct vm_pptdev_mmio *pptmmio;
	struct vm_pptdev_msi *pptmsi;
	struct vm_pptdev_msix *pptmsix;
	struct vm_nmi *vmnmi;
	struct vm_stats *vmstats;
	struct vm_stat_desc *statdesc;
	struct vm_x2apic *x2apic;
	struct vm_gpa_pte *gpapte;

	sc = vmmdev_lookup2(cdev);
	if (sc == NULL)
		return (ENXIO);

	vcpu = -1;
	state_changed = 0;

	/*
	 * Some VMM ioctls can operate only on vcpus that are not running.
	 */
	switch (cmd) {
	case VM_RUN:
	case VM_GET_REGISTER:
	case VM_SET_REGISTER:
	case VM_GET_SEGMENT_DESCRIPTOR:
	case VM_SET_SEGMENT_DESCRIPTOR:
	case VM_INJECT_EVENT:
	case VM_GET_CAPABILITY:
	case VM_SET_CAPABILITY:
	case VM_PPTDEV_MSI:
	case VM_PPTDEV_MSIX:
	case VM_SET_X2APIC_STATE:
		/*
		 * XXX fragile, handle with care
		 * Assumes that the first field of the ioctl data is the vcpu.
		 */
		vcpu = *(int *)data;
		if (vcpu < 0 || vcpu >= VM_MAXCPU) {
			error = EINVAL;
			goto done;
		}

		error = vcpu_set_state(sc->vm, vcpu, VCPU_FROZEN, true);
		if (error)
			goto done;

		state_changed = 1;
		break;

	case VM_MAP_PPTDEV_MMIO:
	case VM_BIND_PPTDEV:
	case VM_UNBIND_PPTDEV:
	case VM_MAP_MEMORY:
		/*
		 * ioctls that operate on the entire virtual machine must
		 * prevent all vcpus from running.
		 */
		error = 0;
		for (vcpu = 0; vcpu < VM_MAXCPU; vcpu++) {
			error = vcpu_set_state(sc->vm, vcpu, VCPU_FROZEN, true);
			if (error)
				break;
		}

		if (error) {
			/* Roll back the vcpus frozen so far. */
			while (--vcpu >= 0)
				vcpu_set_state(sc->vm, vcpu, VCPU_IDLE, false);
			goto done;
		}

		state_changed = 2;
		break;

	default:
		break;
	}

	switch(cmd) {
	case VM_RUN:
		vmrun = (struct vm_run *)data;
		error = vm_run(sc->vm, vmrun);
		break;
	case VM_STAT_DESC: {
		statdesc = (struct vm_stat_desc *)data;
		error = vmm_stat_desc_copy(statdesc->index,
					statdesc->desc, sizeof(statdesc->desc));
		break;
	}
	case VM_STATS: {
		CTASSERT(MAX_VM_STATS >= MAX_VMM_STAT_ELEMS);
		vmstats = (struct vm_stats *)data;
		getmicrotime(&vmstats->tv);
		error = vmm_stat_copy(sc->vm, vmstats->cpuid,
				      &vmstats->num_entries, vmstats->statbuf);
		break;
	}
	case VM_PPTDEV_MSI:
		pptmsi = (struct vm_pptdev_msi *)data;
		error = ppt_setup_msi(sc->vm, pptmsi->vcpu,
				      pptmsi->bus, pptmsi->slot, pptmsi->func,
				      pptmsi->addr, pptmsi->msg,
				      pptmsi->numvec);
		break;
	case VM_PPTDEV_MSIX:
		pptmsix = (struct vm_pptdev_msix *)data;
		error = ppt_setup_msix(sc->vm, pptmsix->vcpu,
				       pptmsix->bus, pptmsix->slot, 
				       pptmsix->func, pptmsix->idx,
				       pptmsix->addr, pptmsix->msg,
				       pptmsix->vector_control);
		break;
	case VM_MAP_PPTDEV_MMIO:
		pptmmio = (struct vm_pptdev_mmio *)data;
		error = ppt_map_mmio(sc->vm, pptmmio->bus, pptmmio->slot,
				     pptmmio->func, pptmmio->gpa, pptmmio->len,
				     pptmmio->hpa);
		break;
	case VM_BIND_PPTDEV:
		pptdev = (struct vm_pptdev *)data;
		error = vm_assign_pptdev(sc->vm, pptdev->bus, pptdev->slot,
					 pptdev->func);
		break;
	case VM_UNBIND_PPTDEV:
		pptdev = (struct vm_pptdev *)data;
		error = vm_unassign_pptdev(sc->vm, pptdev->bus, pptdev->slot,
					   pptdev->func);
		break;
	case VM_INJECT_EVENT:
		vmevent = (struct vm_event *)data;
		error = vm_inject_event(sc->vm, vmevent->cpuid, vmevent->type,
					vmevent->vector,
					vmevent->error_code,
					vmevent->error_code_valid);
		break;
	case VM_INJECT_NMI:
		vmnmi = (struct vm_nmi *)data;
		error = vm_inject_nmi(sc->vm, vmnmi->cpuid);
		break;
	case VM_LAPIC_IRQ:
		vmirq = (struct vm_lapic_irq *)data;
		error = lapic_intr_edge(sc->vm, vmirq->cpuid, vmirq->vector);
		break;
	case VM_LAPIC_LOCAL_IRQ:
		vmirq = (struct vm_lapic_irq *)data;
		error = lapic_set_local_intr(sc->vm, vmirq->cpuid,
		    vmirq->vector);
		break;
	case VM_LAPIC_MSI:
		vmmsi = (struct vm_lapic_msi *)data;
		error = lapic_intr_msi(sc->vm, vmmsi->addr, vmmsi->msg);
		break;
	case VM_IOAPIC_ASSERT_IRQ:
		ioapic_irq = (struct vm_ioapic_irq *)data;
		error = vioapic_assert_irq(sc->vm, ioapic_irq->irq);
		break;
	case VM_IOAPIC_DEASSERT_IRQ:
		ioapic_irq = (struct vm_ioapic_irq *)data;
		error = vioapic_deassert_irq(sc->vm, ioapic_irq->irq);
		break;
	case VM_IOAPIC_PULSE_IRQ:
		ioapic_irq = (struct vm_ioapic_irq *)data;
		error = vioapic_pulse_irq(sc->vm, ioapic_irq->irq);
		break;
	case VM_IOAPIC_PINCOUNT:
		*(int *)data = vioapic_pincount(sc->vm);
		break;
	case VM_MAP_MEMORY:
		seg = (struct vm_memory_segment *)data;
		error = vm_malloc(sc->vm, seg->gpa, seg->len);
		break;
	case VM_GET_MEMORY_SEG:
		seg = (struct vm_memory_segment *)data;
		seg->len = 0;
		(void)vm_gpabase2memseg(sc->vm, seg->gpa, seg);
		error = 0;
		break;
	case VM_GET_REGISTER:
		vmreg = (struct vm_register *)data;
		error = vm_get_register(sc->vm, vmreg->cpuid, vmreg->regnum,
					&vmreg->regval);
		break;
	case VM_SET_REGISTER:
		vmreg = (struct vm_register *)data;
		error = vm_set_register(sc->vm, vmreg->cpuid, vmreg->regnum,
					vmreg->regval);
		break;
	case VM_SET_SEGMENT_DESCRIPTOR:
		vmsegdesc = (struct vm_seg_desc *)data;
		error = vm_set_seg_desc(sc->vm, vmsegdesc->cpuid,
					vmsegdesc->regnum,
					&vmsegdesc->desc);
		break;
	case VM_GET_SEGMENT_DESCRIPTOR:
		vmsegdesc = (struct vm_seg_desc *)data;
		error = vm_get_seg_desc(sc->vm, vmsegdesc->cpuid,
					vmsegdesc->regnum,
					&vmsegdesc->desc);
		break;
	case VM_GET_CAPABILITY:
		vmcap = (struct vm_capability *)data;
		error = vm_get_capability(sc->vm, vmcap->cpuid,
					  vmcap->captype,
					  &vmcap->capval);
		break;
	case VM_SET_CAPABILITY:
		vmcap = (struct vm_capability *)data;
		error = vm_set_capability(sc->vm, vmcap->cpuid,
					  vmcap->captype,
					  vmcap->capval);
		break;
	case VM_SET_X2APIC_STATE:
		x2apic = (struct vm_x2apic *)data;
		error = vm_set_x2apic_state(sc->vm,
					    x2apic->cpuid, x2apic->state);
		break;
	case VM_GET_X2APIC_STATE:
		x2apic = (struct vm_x2apic *)data;
		error = vm_get_x2apic_state(sc->vm,
					    x2apic->cpuid, &x2apic->state);
		break;
	case VM_GET_GPA_PMAP:
		gpapte = (struct vm_gpa_pte *)data;
		pmap_get_mapping(vmspace_pmap(vm_get_vmspace(sc->vm)),
				 gpapte->gpa, gpapte->pte, &gpapte->ptenum);
		error = 0;
		break;
	case VM_GET_HPET_CAPABILITIES:
		error = vhpet_getcap((struct vm_hpet_cap *)data);
		break;
	default:
		error = ENOTTY;
		break;
	}

	/* Undo the vcpu freeze done before dispatch. */
	if (state_changed == 1) {
		vcpu_set_state(sc->vm, vcpu, VCPU_IDLE, false);
	} else if (state_changed == 2) {
		for (vcpu = 0; vcpu < VM_MAXCPU; vcpu++)
			vcpu_set_state(sc->vm, vcpu, VCPU_IDLE, false);
	}

done:
	/* Make sure that no handler returns a bogus value like ERESTART */
	KASSERT(error >= 0, ("vmmdev_ioctl: invalid error return %d", error));
	return (error);
}

/*
 * mmap(2) backend: hand back the VM object backing the guest memory
 * segment at *offset.  Executable mappings of guest memory are refused.
 */
static int
vmmdev_mmap_single(struct cdev *cdev, vm_ooffset_t *offset,
		   vm_size_t size, struct vm_object **object, int nprot)
{
	int error;
	struct vmmdev_softc *sc;

	sc = vmmdev_lookup2(cdev);
	if (sc != NULL && (nprot & PROT_EXEC) == 0)
		error = vm_get_memobj(sc->vm, *offset, size, offset, object);
	else
		error = EINVAL;

	return (error);
}

/*
 * Final teardown of a softc.  Invoked either directly from
 * sysctl_vmm_create() on a failed creation, or from taskqueue context as
 * the destroy_dev_sched_cb() callback scheduled by sysctl_vmm_destroy()
 * (in which case sc->cdev has already been cleared).
 */
static void
vmmdev_destroy(void *arg)
{

	struct vmmdev_softc *sc = arg;

	if (sc->cdev != NULL)
		destroy_dev(sc->cdev);

	if (sc->vm != NULL)
		vm_destroy(sc->vm);

	if ((sc->flags & VSC_LINKED) != 0) {
		mtx_lock(&vmmdev_mtx);
		SLIST_REMOVE(&head, sc, vmmdev_softc, link);
		mtx_unlock(&vmmdev_mtx);
	}

	free(sc, M_VMMDEV);
}

/*
 * hw.vmm.destroy sysctl handler: tear down the VM whose name is written
 * to the sysctl.  The cdev is destroyed asynchronously; see below.
 */
static int
sysctl_vmm_destroy(SYSCTL_HANDLER_ARGS)
{
	int error;
	char buf[VM_MAX_NAMELEN];
	struct vmmdev_softc *sc;
	struct cdev *cdev;

	strlcpy(buf, "beavis", sizeof(buf));
	error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
	if (error != 0 || req->newptr == NULL)
		return (error);

	mtx_lock(&vmmdev_mtx);
	sc = vmmdev_lookup(buf);
	if (sc == NULL || sc->cdev == NULL) {
		mtx_unlock(&vmmdev_mtx);
		return (EINVAL);
	}

	/*
	 * The 'cdev' will be destroyed asynchronously when 'si_threadcount'
	 * goes down to 0 so we should not do it again in the callback.
	 */
	cdev = sc->cdev;
	sc->cdev = NULL;
	mtx_unlock(&vmmdev_mtx);

	/*
	 * Schedule the 'cdev' to be destroyed:
	 *
	 * - any new operations on this 'cdev' will return an error (ENXIO).
	 *
	 * - when the 'si_threadcount' dwindles down to zero the 'cdev' will
	 *   be destroyed and the callback will be invoked in a taskqueue
	 *   context.
	 */
	destroy_dev_sched_cb(cdev, vmmdev_destroy, sc);

	return (0);
}
SYSCTL_PROC(_hw_vmm, OID_AUTO, destroy, CTLTYPE_STRING | CTLFLAG_RW,
	    NULL, 0, sysctl_vmm_destroy, "A", NULL);

static struct cdevsw vmmdevsw = {
	.d_name		= "vmmdev",
	.d_version	= D_VERSION,
	.d_ioctl	= vmmdev_ioctl,
	.d_mmap_single	= vmmdev_mmap_single,
	.d_read		= vmmdev_rw,
	.d_write	= vmmdev_rw,
};

/*
 * hw.vmm.create sysctl handler: create a VM named by the string written
 * to the sysctl and expose it as /dev/vmm/<name>.
 */
static int
sysctl_vmm_create(SYSCTL_HANDLER_ARGS)
{
	int error;
	struct vm *vm;
	struct cdev *cdev;
	struct vmmdev_softc *sc, *sc2;
	char buf[VM_MAX_NAMELEN];

	strlcpy(buf, "beavis", sizeof(buf));
	error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
	if (error != 0 || req->newptr == NULL)
		return (error);

	mtx_lock(&vmmdev_mtx);
	sc = vmmdev_lookup(buf);
	mtx_unlock(&vmmdev_mtx);
	if (sc != NULL)
		return (EEXIST);

	/* vm_create() may sleep, so it is called without the mutex held. */
	error = vm_create(buf, &vm);
	if (error != 0)
		return (error);

	sc = malloc(sizeof(struct vmmdev_softc), M_VMMDEV, M_WAITOK | M_ZERO);
	sc->vm = vm;

	/*
	 * Lookup the name again just in case somebody sneaked in when we
	 * dropped the lock.
	 */
	mtx_lock(&vmmdev_mtx);
	sc2 = vmmdev_lookup(buf);
	if (sc2 == NULL) {
		SLIST_INSERT_HEAD(&head, sc, link);
		sc->flags |= VSC_LINKED;
	}
	mtx_unlock(&vmmdev_mtx);

	if (sc2 != NULL) {
		vmmdev_destroy(sc);
		return (EEXIST);
	}

	error = make_dev_p(MAKEDEV_CHECKNAME, &cdev, &vmmdevsw, NULL,
			   UID_ROOT, GID_WHEEL, 0600, "vmm/%s", buf);
	if (error != 0) {
		vmmdev_destroy(sc);
		return (error);
	}

	mtx_lock(&vmmdev_mtx);
	sc->cdev = cdev;
	sc->cdev->si_drv1 = sc;
	mtx_unlock(&vmmdev_mtx);

	return (0);
}
SYSCTL_PROC(_hw_vmm, OID_AUTO, create, CTLTYPE_STRING | CTLFLAG_RW,
	    NULL, 0, sysctl_vmm_create, "A", NULL);

/* Module initialization: set up the global mutex. */
void
vmmdev_init(void)
{
	mtx_init(&vmmdev_mtx, "vmm device mutex", NULL, MTX_DEF);
}

/*
 * Module unload check: refuse (EBUSY) while any VM still exists.
 */
int
vmmdev_cleanup(void)
{
	int error;

	if (SLIST_EMPTY(&head))
		error = 0;
	else
		error = EBUSY;

	return (error);
}