vmm_dev.c revision 262350
/*-
 * Copyright (c) 2011 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD: stable/10/sys/amd64/vmm/vmm_dev.c 262350 2014-02-23 00:46:05Z jhb $
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: stable/10/sys/amd64/vmm/vmm_dev.c 262350 2014-02-23 00:46:05Z jhb $");

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/queue.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/malloc.h>
#include <sys/conf.h>
#include <sys/sysctl.h>
#include <sys/libkern.h>
#include <sys/ioccom.h>
#include <sys/mman.h>
#include <sys/uio.h>

#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>

#include <machine/vmparam.h>
#include <machine/vmm.h>
#include <machine/vmm_dev.h>

#include "vmm_lapic.h"
#include "vmm_stat.h"
#include "vmm_mem.h"
#include "io/ppt.h"
#include "io/vioapic.h"
#include "io/vhpet.h"

struct vmmdev_softc {
	struct vm	*vm;		/* vm instance cookie */
	struct cdev	*cdev;
	SLIST_ENTRY(vmmdev_softc) link;
	int		flags;
};
#define	VSC_LINKED		0x01

static SLIST_HEAD(, vmmdev_softc) head;

static struct mtx vmmdev_mtx;

static MALLOC_DEFINE(M_VMMDEV, "vmmdev", "vmmdev");

SYSCTL_DECL(_hw_vmm);

static struct vmmdev_softc *
vmmdev_lookup(const char *name)
{
	struct vmmdev_softc *sc;

#ifdef notyet	/* XXX kernel is not compiled with invariants */
	mtx_assert(&vmmdev_mtx, MA_OWNED);
#endif

	SLIST_FOREACH(sc, &head, link) {
		if (strcmp(name, vm_name(sc->vm)) == 0)
			break;
	}

	return (sc);
}

static struct vmmdev_softc *
vmmdev_lookup2(struct cdev *cdev)
{

	return (cdev->si_drv1);
}

static int
vmmdev_rw(struct cdev *cdev, struct uio *uio, int flags)
{
	int error, off, c, prot;
	vm_paddr_t gpa;
	void *hpa, *cookie;
	struct vmmdev_softc *sc;

	static char zerobuf[PAGE_SIZE];

	error = 0;
	sc = vmmdev_lookup2(cdev);
	if (sc == NULL)
		error = ENXIO;

	prot = (uio->uio_rw == UIO_WRITE ? VM_PROT_WRITE : VM_PROT_READ);
	while (uio->uio_resid > 0 && error == 0) {
		gpa = uio->uio_offset;
		off = gpa & PAGE_MASK;
		c = min(uio->uio_resid, PAGE_SIZE - off);

		/*
		 * The VM has a hole in its physical memory map. If we want to
		 * use 'dd' to inspect memory beyond the hole we need to
		 * provide bogus data for memory that lies in the hole.
		 *
		 * Since this device does not support lseek(2), dd(1) will
		 * read(2) blocks of data to simulate the lseek(2).
		 */
		hpa = vm_gpa_hold(sc->vm, gpa, c, prot, &cookie);
		if (hpa == NULL) {
			if (uio->uio_rw == UIO_READ)
				error = uiomove(zerobuf, c, uio);
			else
				error = EFAULT;
		} else {
			error = uiomove(hpa, c, uio);
			vm_gpa_release(cookie);
		}
	}
	return (error);
}

static int
vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
	     struct thread *td)
{
	int error, vcpu, state_changed;
	struct vmmdev_softc *sc;
	struct vm_memory_segment *seg;
	struct vm_register *vmreg;
	struct vm_seg_desc *vmsegdesc;
	struct vm_run *vmrun;
	struct vm_event *vmevent;
	struct vm_lapic_irq *vmirq;
	struct vm_lapic_msi *vmmsi;
	struct vm_ioapic_irq *ioapic_irq;
	struct vm_capability *vmcap;
	struct vm_pptdev *pptdev;
	struct vm_pptdev_mmio *pptmmio;
	struct vm_pptdev_msi *pptmsi;
	struct vm_pptdev_msix *pptmsix;
	struct vm_nmi *vmnmi;
	struct vm_stats *vmstats;
	struct vm_stat_desc *statdesc;
	struct vm_x2apic *x2apic;
	struct vm_gpa_pte *gpapte;

	sc = vmmdev_lookup2(cdev);
	if (sc == NULL)
		return (ENXIO);

	vcpu = -1;
	state_changed = 0;

	/*
	 * Some VMM ioctls can operate only on vcpus that are not running.
	 */
	switch (cmd) {
	case VM_RUN:
	case VM_GET_REGISTER:
	case VM_SET_REGISTER:
	case VM_GET_SEGMENT_DESCRIPTOR:
	case VM_SET_SEGMENT_DESCRIPTOR:
	case VM_INJECT_EVENT:
	case VM_GET_CAPABILITY:
	case VM_SET_CAPABILITY:
	case VM_PPTDEV_MSI:
	case VM_PPTDEV_MSIX:
	case VM_SET_X2APIC_STATE:
		/*
		 * XXX fragile, handle with care
		 * Assumes that the first field of the ioctl data is the vcpu.
		 */
		vcpu = *(int *)data;
		if (vcpu < 0 || vcpu >= VM_MAXCPU) {
			error = EINVAL;
			goto done;
		}

		error = vcpu_set_state(sc->vm, vcpu, VCPU_FROZEN);
		if (error)
			goto done;

		state_changed = 1;
		break;

	case VM_MAP_PPTDEV_MMIO:
	case VM_BIND_PPTDEV:
	case VM_UNBIND_PPTDEV:
	case VM_MAP_MEMORY:
		/*
		 * ioctls that operate on the entire virtual machine must
		 * prevent all vcpus from running.
		 */
		error = 0;
		for (vcpu = 0; vcpu < VM_MAXCPU; vcpu++) {
			error = vcpu_set_state(sc->vm, vcpu, VCPU_FROZEN);
			if (error)
				break;
		}

		if (error) {
			while (--vcpu >= 0)
				vcpu_set_state(sc->vm, vcpu, VCPU_IDLE);
			goto done;
		}

		state_changed = 2;
		break;

	default:
		break;
	}

	switch(cmd) {
	case VM_RUN:
		vmrun = (struct vm_run *)data;
		error = vm_run(sc->vm, vmrun);
		break;
	case VM_STAT_DESC: {
		statdesc = (struct vm_stat_desc *)data;
		error = vmm_stat_desc_copy(statdesc->index,
		    statdesc->desc, sizeof(statdesc->desc));
		break;
	}
	case VM_STATS: {
		CTASSERT(MAX_VM_STATS >= MAX_VMM_STAT_ELEMS);
		vmstats = (struct vm_stats *)data;
		getmicrotime(&vmstats->tv);
		error = vmm_stat_copy(sc->vm, vmstats->cpuid,
		    &vmstats->num_entries, vmstats->statbuf);
		break;
	}
	case VM_PPTDEV_MSI:
		pptmsi = (struct vm_pptdev_msi *)data;
		error = ppt_setup_msi(sc->vm, pptmsi->vcpu,
		    pptmsi->bus, pptmsi->slot, pptmsi->func,
		    pptmsi->addr, pptmsi->msg,
		    pptmsi->numvec);
		break;
	case VM_PPTDEV_MSIX:
		pptmsix = (struct vm_pptdev_msix *)data;
		error = ppt_setup_msix(sc->vm, pptmsix->vcpu,
		    pptmsix->bus, pptmsix->slot,
		    pptmsix->func, pptmsix->idx,
		    pptmsix->addr, pptmsix->msg,
		    pptmsix->vector_control);
		break;
	case VM_MAP_PPTDEV_MMIO:
		pptmmio = (struct vm_pptdev_mmio *)data;
		error = ppt_map_mmio(sc->vm, pptmmio->bus, pptmmio->slot,
		    pptmmio->func, pptmmio->gpa, pptmmio->len,
		    pptmmio->hpa);
		break;
	case VM_BIND_PPTDEV:
		pptdev = (struct vm_pptdev *)data;
		error = vm_assign_pptdev(sc->vm, pptdev->bus, pptdev->slot,
		    pptdev->func);
		break;
	case VM_UNBIND_PPTDEV:
		pptdev = (struct vm_pptdev *)data;
		error = vm_unassign_pptdev(sc->vm, pptdev->bus, pptdev->slot,
		    pptdev->func);
		break;
	case VM_INJECT_EVENT:
		vmevent = (struct vm_event *)data;
		error = vm_inject_event(sc->vm, vmevent->cpuid, vmevent->type,
		    vmevent->vector,
		    vmevent->error_code,
		    vmevent->error_code_valid);
		break;
	case VM_INJECT_NMI:
		vmnmi = (struct vm_nmi *)data;
		error = vm_inject_nmi(sc->vm, vmnmi->cpuid);
		break;
	case VM_LAPIC_IRQ:
		vmirq = (struct vm_lapic_irq *)data;
		error = lapic_intr_edge(sc->vm, vmirq->cpuid, vmirq->vector);
		break;
	case VM_LAPIC_LOCAL_IRQ:
		vmirq = (struct vm_lapic_irq *)data;
		error = lapic_set_local_intr(sc->vm, vmirq->cpuid,
		    vmirq->vector);
		break;
	case VM_LAPIC_MSI:
		vmmsi = (struct vm_lapic_msi *)data;
		error = lapic_intr_msi(sc->vm, vmmsi->addr, vmmsi->msg);
		break;
	case VM_IOAPIC_ASSERT_IRQ:
		ioapic_irq = (struct vm_ioapic_irq *)data;
		error = vioapic_assert_irq(sc->vm, ioapic_irq->irq);
		break;
	case VM_IOAPIC_DEASSERT_IRQ:
		ioapic_irq = (struct vm_ioapic_irq *)data;
		error = vioapic_deassert_irq(sc->vm, ioapic_irq->irq);
		break;
	case VM_IOAPIC_PULSE_IRQ:
		ioapic_irq = (struct vm_ioapic_irq *)data;
		error = vioapic_pulse_irq(sc->vm, ioapic_irq->irq);
		break;
	case VM_MAP_MEMORY:
		seg = (struct vm_memory_segment *)data;
		error = vm_malloc(sc->vm, seg->gpa, seg->len);
		break;
	case VM_GET_MEMORY_SEG:
		seg = (struct vm_memory_segment *)data;
		seg->len = 0;
		(void)vm_gpabase2memseg(sc->vm, seg->gpa, seg);
		error = 0;
		break;
	case VM_GET_REGISTER:
		vmreg = (struct vm_register *)data;
		error = vm_get_register(sc->vm, vmreg->cpuid, vmreg->regnum,
		    &vmreg->regval);
		break;
	case VM_SET_REGISTER:
		vmreg = (struct vm_register *)data;
		error = vm_set_register(sc->vm, vmreg->cpuid, vmreg->regnum,
		    vmreg->regval);
		break;
	case VM_SET_SEGMENT_DESCRIPTOR:
		vmsegdesc = (struct vm_seg_desc *)data;
		error = vm_set_seg_desc(sc->vm, vmsegdesc->cpuid,
		    vmsegdesc->regnum,
		    &vmsegdesc->desc);
		break;
	case VM_GET_SEGMENT_DESCRIPTOR:
		vmsegdesc = (struct vm_seg_desc *)data;
		error = vm_get_seg_desc(sc->vm, vmsegdesc->cpuid,
		    vmsegdesc->regnum,
		    &vmsegdesc->desc);
		break;
	case VM_GET_CAPABILITY:
		vmcap = (struct vm_capability *)data;
		error = vm_get_capability(sc->vm, vmcap->cpuid,
		    vmcap->captype,
		    &vmcap->capval);
		break;
	case VM_SET_CAPABILITY:
		vmcap = (struct vm_capability *)data;
		error = vm_set_capability(sc->vm, vmcap->cpuid,
		    vmcap->captype,
		    vmcap->capval);
		break;
	case VM_SET_X2APIC_STATE:
		x2apic = (struct vm_x2apic *)data;
		error = vm_set_x2apic_state(sc->vm,
		    x2apic->cpuid, x2apic->state);
		break;
	case VM_GET_X2APIC_STATE:
		x2apic = (struct vm_x2apic *)data;
		error = vm_get_x2apic_state(sc->vm,
		    x2apic->cpuid, &x2apic->state);
		break;
	case VM_GET_GPA_PMAP:
		gpapte = (struct vm_gpa_pte *)data;
		pmap_get_mapping(vmspace_pmap(vm_get_vmspace(sc->vm)),
		    gpapte->gpa, gpapte->pte, &gpapte->ptenum);
		error = 0;
		break;
	case VM_GET_HPET_CAPABILITIES:
		error = vhpet_getcap((struct vm_hpet_cap *)data);
		break;
	default:
		error = ENOTTY;
		break;
	}

	if (state_changed == 1) {
		vcpu_set_state(sc->vm, vcpu, VCPU_IDLE);
	} else if (state_changed == 2) {
		for (vcpu = 0; vcpu < VM_MAXCPU; vcpu++)
			vcpu_set_state(sc->vm, vcpu, VCPU_IDLE);
	}

done:
	/* Make sure that no handler returns a bogus value like ERESTART */
	KASSERT(error >= 0, ("vmmdev_ioctl: invalid error return %d", error));
	return (error);
}

static int
vmmdev_mmap_single(struct cdev *cdev, vm_ooffset_t *offset,
    vm_size_t size, struct vm_object **object, int nprot)
{
	int error;
	struct vmmdev_softc *sc;

	sc = vmmdev_lookup2(cdev);
	if (sc != NULL && (nprot & PROT_EXEC) == 0)
		error = vm_get_memobj(sc->vm, *offset, size, offset, object);
	else
		error = EINVAL;

	return (error);
}

static void
vmmdev_destroy(void *arg)
{

	struct vmmdev_softc *sc = arg;

	if (sc->cdev != NULL)
		destroy_dev(sc->cdev);

	if (sc->vm != NULL)
		vm_destroy(sc->vm);

	if ((sc->flags & VSC_LINKED) != 0) {
		mtx_lock(&vmmdev_mtx);
		SLIST_REMOVE(&head, sc, vmmdev_softc, link);
		mtx_unlock(&vmmdev_mtx);
	}

	free(sc, M_VMMDEV);
}

static int
sysctl_vmm_destroy(SYSCTL_HANDLER_ARGS)
{
	int error;
	char buf[VM_MAX_NAMELEN];
	struct vmmdev_softc *sc;
	struct cdev *cdev;

	strlcpy(buf, "beavis", sizeof(buf));
	error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
	if (error != 0 || req->newptr == NULL)
		return (error);

	mtx_lock(&vmmdev_mtx);
	sc = vmmdev_lookup(buf);
	if (sc == NULL || sc->cdev == NULL) {
		mtx_unlock(&vmmdev_mtx);
		return (EINVAL);
	}

	/*
	 * The 'cdev' will be destroyed asynchronously when 'si_threadcount'
	 * goes down to 0 so we should not do it again in the callback.
	 */
	cdev = sc->cdev;
	sc->cdev = NULL;
	mtx_unlock(&vmmdev_mtx);

	/*
	 * Schedule the 'cdev' to be destroyed:
	 *
	 * - any new operations on this 'cdev' will return an error (ENXIO).
	 *
	 * - when the 'si_threadcount' dwindles down to zero the 'cdev' will
	 *   be destroyed and the callback will be invoked in a taskqueue
	 *   context.
	 */
	destroy_dev_sched_cb(cdev, vmmdev_destroy, sc);

	return (0);
}
SYSCTL_PROC(_hw_vmm, OID_AUTO, destroy, CTLTYPE_STRING | CTLFLAG_RW,
    NULL, 0, sysctl_vmm_destroy, "A", NULL);

static struct cdevsw vmmdevsw = {
	.d_name		= "vmmdev",
	.d_version	= D_VERSION,
	.d_ioctl	= vmmdev_ioctl,
	.d_mmap_single	= vmmdev_mmap_single,
	.d_read		= vmmdev_rw,
	.d_write	= vmmdev_rw,
};

static int
sysctl_vmm_create(SYSCTL_HANDLER_ARGS)
{
	int error;
	struct vm *vm;
	struct cdev *cdev;
	struct vmmdev_softc *sc, *sc2;
	char buf[VM_MAX_NAMELEN];

	strlcpy(buf, "beavis", sizeof(buf));
	error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
	if (error != 0 || req->newptr == NULL)
		return (error);

	mtx_lock(&vmmdev_mtx);
	sc = vmmdev_lookup(buf);
	mtx_unlock(&vmmdev_mtx);
	if (sc != NULL)
		return (EEXIST);

	error = vm_create(buf, &vm);
	if (error != 0)
		return (error);

	sc = malloc(sizeof(struct vmmdev_softc), M_VMMDEV, M_WAITOK | M_ZERO);
	sc->vm = vm;

	/*
	 * Lookup the name again just in case somebody sneaked in when we
	 * dropped the lock.
	 */
	mtx_lock(&vmmdev_mtx);
	sc2 = vmmdev_lookup(buf);
	if (sc2 == NULL) {
		SLIST_INSERT_HEAD(&head, sc, link);
		sc->flags |= VSC_LINKED;
	}
	mtx_unlock(&vmmdev_mtx);

	if (sc2 != NULL) {
		vmmdev_destroy(sc);
		return (EEXIST);
	}

	error = make_dev_p(MAKEDEV_CHECKNAME, &cdev, &vmmdevsw, NULL,
	    UID_ROOT, GID_WHEEL, 0600, "vmm/%s", buf);
	if (error != 0) {
		vmmdev_destroy(sc);
		return (error);
	}

	mtx_lock(&vmmdev_mtx);
	sc->cdev = cdev;
	sc->cdev->si_drv1 = sc;
	mtx_unlock(&vmmdev_mtx);

	return (0);
}
SYSCTL_PROC(_hw_vmm, OID_AUTO, create, CTLTYPE_STRING | CTLFLAG_RW,
    NULL, 0, sysctl_vmm_create, "A", NULL);

void
vmmdev_init(void)
{
	mtx_init(&vmmdev_mtx, "vmm device mutex", NULL, MTX_DEF);
}

int
vmmdev_cleanup(void)
{
	int error;

	if (SLIST_EMPTY(&head))
		error = 0;
	else
		error = EBUSY;

	return (error);
}
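
For context, the control path implemented above is driven entirely from userland: a VM instance is created by writing its name to the hw.vmm.create sysctl, the resulting /dev/vmm/<name> node is then opened and manipulated through the ioctls dispatched by vmmdev_ioctl(), and the instance is torn down through hw.vmm.destroy. The following is a minimal userland sketch of that flow, not part of vmm_dev.c; it assumes the ioctl and structure definitions from <machine/vmm_dev.h>, uses a made-up VM name ("testvm"), and keeps error handling to a minimum.

/* Illustrative userland sketch -- not part of vmm_dev.c. */
#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/sysctl.h>

#include <machine/vmm.h>
#include <machine/vmm_dev.h>

#include <err.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	const char *name = "testvm";	/* hypothetical VM name */
	struct vm_memory_segment seg;
	char path[64];
	int fd;

	/* Create the VM: equivalent to 'sysctl hw.vmm.create=testvm'. */
	if (sysctlbyname("hw.vmm.create", NULL, NULL, name,
	    strlen(name) + 1) != 0)
		err(1, "hw.vmm.create");

	/* Open the character device created by sysctl_vmm_create(). */
	snprintf(path, sizeof(path), "/dev/vmm/%s", name);
	fd = open(path, O_RDWR);
	if (fd < 0)
		err(1, "open %s", path);

	/* Back guest physical address 0 with 64MB (handled by VM_MAP_MEMORY). */
	memset(&seg, 0, sizeof(seg));
	seg.gpa = 0;
	seg.len = 64 * 1024 * 1024;
	if (ioctl(fd, VM_MAP_MEMORY, &seg) != 0)
		err(1, "VM_MAP_MEMORY");

	close(fd);

	/* Destroy the VM: equivalent to 'sysctl hw.vmm.destroy=testvm'. */
	if (sysctlbyname("hw.vmm.destroy", NULL, NULL, name,
	    strlen(name) + 1) != 0)
		err(1, "hw.vmm.destroy");

	return (0);
}

As the comment in vmmdev_rw() notes, the same device node also exposes guest physical memory through d_read/d_write, so a tool such as dd(1) can inspect guest memory by issuing read(2) calls against /dev/vmm/<name>.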