vmm_dev.c revision 262349
1/*- 2 * Copyright (c) 2011 NetApp, Inc. 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND 15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17 * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE 18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 24 * SUCH DAMAGE. 25 * 26 * $FreeBSD: stable/10/sys/amd64/vmm/vmm_dev.c 262349 2014-02-22 23:34:39Z jhb $ 27 */ 28 29#include <sys/cdefs.h> 30__FBSDID("$FreeBSD: stable/10/sys/amd64/vmm/vmm_dev.c 262349 2014-02-22 23:34:39Z jhb $"); 31 32#include <sys/param.h> 33#include <sys/kernel.h> 34#include <sys/queue.h> 35#include <sys/lock.h> 36#include <sys/mutex.h> 37#include <sys/malloc.h> 38#include <sys/conf.h> 39#include <sys/sysctl.h> 40#include <sys/libkern.h> 41#include <sys/ioccom.h> 42#include <sys/mman.h> 43#include <sys/uio.h> 44 45#include <vm/vm.h> 46#include <vm/pmap.h> 47#include <vm/vm_map.h> 48 49#include <machine/vmparam.h> 50#include <machine/vmm.h> 51#include <machine/vmm_dev.h> 52 53#include "vmm_lapic.h" 54#include "vmm_stat.h" 55#include "vmm_mem.h" 56#include "io/ppt.h" 57#include "io/vioapic.h" 58#include "io/vhpet.h" 59 60struct vmmdev_softc { 61 struct vm *vm; /* vm instance cookie */ 62 struct cdev *cdev; 63 SLIST_ENTRY(vmmdev_softc) link; 64 int flags; 65}; 66#define VSC_LINKED 0x01 67 68static SLIST_HEAD(, vmmdev_softc) head; 69 70static struct mtx vmmdev_mtx; 71 72static MALLOC_DEFINE(M_VMMDEV, "vmmdev", "vmmdev"); 73 74SYSCTL_DECL(_hw_vmm); 75 76static struct vmmdev_softc * 77vmmdev_lookup(const char *name) 78{ 79 struct vmmdev_softc *sc; 80 81#ifdef notyet /* XXX kernel is not compiled with invariants */ 82 mtx_assert(&vmmdev_mtx, MA_OWNED); 83#endif 84 85 SLIST_FOREACH(sc, &head, link) { 86 if (strcmp(name, vm_name(sc->vm)) == 0) 87 break; 88 } 89 90 return (sc); 91} 92 93static struct vmmdev_softc * 94vmmdev_lookup2(struct cdev *cdev) 95{ 96 97 return (cdev->si_drv1); 98} 99 100static int 101vmmdev_rw(struct cdev *cdev, struct uio *uio, int flags) 102{ 103 int error, off, c, prot; 104 vm_paddr_t gpa; 105 void *hpa, *cookie; 106 struct vmmdev_softc *sc; 107 108 static char zerobuf[PAGE_SIZE]; 109 110 error = 0; 111 sc = vmmdev_lookup2(cdev); 112 if (sc == NULL) 113 error = ENXIO; 114 115 prot = (uio->uio_rw == UIO_WRITE ? VM_PROT_WRITE : VM_PROT_READ); 116 while (uio->uio_resid > 0 && error == 0) { 117 gpa = uio->uio_offset; 118 off = gpa & PAGE_MASK; 119 c = min(uio->uio_resid, PAGE_SIZE - off); 120 121 /* 122 * The VM has a hole in its physical memory map. If we want to 123 * use 'dd' to inspect memory beyond the hole we need to 124 * provide bogus data for memory that lies in the hole. 125 * 126 * Since this device does not support lseek(2), dd(1) will 127 * read(2) blocks of data to simulate the lseek(2). 128 */ 129 hpa = vm_gpa_hold(sc->vm, gpa, c, prot, &cookie); 130 if (hpa == NULL) { 131 if (uio->uio_rw == UIO_READ) 132 error = uiomove(zerobuf, c, uio); 133 else 134 error = EFAULT; 135 } else { 136 error = uiomove(hpa, c, uio); 137 vm_gpa_release(cookie); 138 } 139 } 140 return (error); 141} 142 143static int 144vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag, 145 struct thread *td) 146{ 147 int error, vcpu, state_changed; 148 struct vmmdev_softc *sc; 149 struct vm_memory_segment *seg; 150 struct vm_register *vmreg; 151 struct vm_seg_desc *vmsegdesc; 152 struct vm_run *vmrun; 153 struct vm_event *vmevent; 154 struct vm_lapic_irq *vmirq; 155 struct vm_ioapic_irq *ioapic_irq; 156 struct vm_capability *vmcap; 157 struct vm_pptdev *pptdev; 158 struct vm_pptdev_mmio *pptmmio; 159 struct vm_pptdev_msi *pptmsi; 160 struct vm_pptdev_msix *pptmsix; 161 struct vm_nmi *vmnmi; 162 struct vm_stats *vmstats; 163 struct vm_stat_desc *statdesc; 164 struct vm_x2apic *x2apic; 165 struct vm_gpa_pte *gpapte; 166 167 sc = vmmdev_lookup2(cdev); 168 if (sc == NULL) 169 return (ENXIO); 170 171 vcpu = -1; 172 state_changed = 0; 173 174 /* 175 * Some VMM ioctls can operate only on vcpus that are not running. 176 */ 177 switch (cmd) { 178 case VM_RUN: 179 case VM_GET_REGISTER: 180 case VM_SET_REGISTER: 181 case VM_GET_SEGMENT_DESCRIPTOR: 182 case VM_SET_SEGMENT_DESCRIPTOR: 183 case VM_INJECT_EVENT: 184 case VM_GET_CAPABILITY: 185 case VM_SET_CAPABILITY: 186 case VM_PPTDEV_MSI: 187 case VM_PPTDEV_MSIX: 188 case VM_SET_X2APIC_STATE: 189 /* 190 * XXX fragile, handle with care 191 * Assumes that the first field of the ioctl data is the vcpu. 192 */ 193 vcpu = *(int *)data; 194 if (vcpu < 0 || vcpu >= VM_MAXCPU) { 195 error = EINVAL; 196 goto done; 197 } 198 199 error = vcpu_set_state(sc->vm, vcpu, VCPU_FROZEN); 200 if (error) 201 goto done; 202 203 state_changed = 1; 204 break; 205 206 case VM_MAP_PPTDEV_MMIO: 207 case VM_BIND_PPTDEV: 208 case VM_UNBIND_PPTDEV: 209 case VM_MAP_MEMORY: 210 /* 211 * ioctls that operate on the entire virtual machine must 212 * prevent all vcpus from running. 213 */ 214 error = 0; 215 for (vcpu = 0; vcpu < VM_MAXCPU; vcpu++) { 216 error = vcpu_set_state(sc->vm, vcpu, VCPU_FROZEN); 217 if (error) 218 break; 219 } 220 221 if (error) { 222 while (--vcpu >= 0) 223 vcpu_set_state(sc->vm, vcpu, VCPU_IDLE); 224 goto done; 225 } 226 227 state_changed = 2; 228 break; 229 230 default: 231 break; 232 } 233 234 switch(cmd) { 235 case VM_RUN: 236 vmrun = (struct vm_run *)data; 237 error = vm_run(sc->vm, vmrun); 238 break; 239 case VM_STAT_DESC: { 240 statdesc = (struct vm_stat_desc *)data; 241 error = vmm_stat_desc_copy(statdesc->index, 242 statdesc->desc, sizeof(statdesc->desc)); 243 break; 244 } 245 case VM_STATS: { 246 CTASSERT(MAX_VM_STATS >= MAX_VMM_STAT_ELEMS); 247 vmstats = (struct vm_stats *)data; 248 getmicrotime(&vmstats->tv); 249 error = vmm_stat_copy(sc->vm, vmstats->cpuid, 250 &vmstats->num_entries, vmstats->statbuf); 251 break; 252 } 253 case VM_PPTDEV_MSI: 254 pptmsi = (struct vm_pptdev_msi *)data; 255 error = ppt_setup_msi(sc->vm, pptmsi->vcpu, 256 pptmsi->bus, pptmsi->slot, pptmsi->func, 257 pptmsi->destcpu, pptmsi->vector, 258 pptmsi->numvec); 259 break; 260 case VM_PPTDEV_MSIX: 261 pptmsix = (struct vm_pptdev_msix *)data; 262 error = ppt_setup_msix(sc->vm, pptmsix->vcpu, 263 pptmsix->bus, pptmsix->slot, 264 pptmsix->func, pptmsix->idx, 265 pptmsix->msg, pptmsix->vector_control, 266 pptmsix->addr); 267 break; 268 case VM_MAP_PPTDEV_MMIO: 269 pptmmio = (struct vm_pptdev_mmio *)data; 270 error = ppt_map_mmio(sc->vm, pptmmio->bus, pptmmio->slot, 271 pptmmio->func, pptmmio->gpa, pptmmio->len, 272 pptmmio->hpa); 273 break; 274 case VM_BIND_PPTDEV: 275 pptdev = (struct vm_pptdev *)data; 276 error = vm_assign_pptdev(sc->vm, pptdev->bus, pptdev->slot, 277 pptdev->func); 278 break; 279 case VM_UNBIND_PPTDEV: 280 pptdev = (struct vm_pptdev *)data; 281 error = vm_unassign_pptdev(sc->vm, pptdev->bus, pptdev->slot, 282 pptdev->func); 283 break; 284 case VM_INJECT_EVENT: 285 vmevent = (struct vm_event *)data; 286 error = vm_inject_event(sc->vm, vmevent->cpuid, vmevent->type, 287 vmevent->vector, 288 vmevent->error_code, 289 vmevent->error_code_valid); 290 break; 291 case VM_INJECT_NMI: 292 vmnmi = (struct vm_nmi *)data; 293 error = vm_inject_nmi(sc->vm, vmnmi->cpuid); 294 break; 295 case VM_LAPIC_IRQ: 296 vmirq = (struct vm_lapic_irq *)data; 297 error = lapic_intr_edge(sc->vm, vmirq->cpuid, vmirq->vector); 298 break; 299 case VM_IOAPIC_ASSERT_IRQ: 300 ioapic_irq = (struct vm_ioapic_irq *)data; 301 error = vioapic_assert_irq(sc->vm, ioapic_irq->irq); 302 break; 303 case VM_IOAPIC_DEASSERT_IRQ: 304 ioapic_irq = (struct vm_ioapic_irq *)data; 305 error = vioapic_deassert_irq(sc->vm, ioapic_irq->irq); 306 break; 307 case VM_IOAPIC_PULSE_IRQ: 308 ioapic_irq = (struct vm_ioapic_irq *)data; 309 error = vioapic_pulse_irq(sc->vm, ioapic_irq->irq); 310 break; 311 case VM_MAP_MEMORY: 312 seg = (struct vm_memory_segment *)data; 313 error = vm_malloc(sc->vm, seg->gpa, seg->len); 314 break; 315 case VM_GET_MEMORY_SEG: 316 seg = (struct vm_memory_segment *)data; 317 seg->len = 0; 318 (void)vm_gpabase2memseg(sc->vm, seg->gpa, seg); 319 error = 0; 320 break; 321 case VM_GET_REGISTER: 322 vmreg = (struct vm_register *)data; 323 error = vm_get_register(sc->vm, vmreg->cpuid, vmreg->regnum, 324 &vmreg->regval); 325 break; 326 case VM_SET_REGISTER: 327 vmreg = (struct vm_register *)data; 328 error = vm_set_register(sc->vm, vmreg->cpuid, vmreg->regnum, 329 vmreg->regval); 330 break; 331 case VM_SET_SEGMENT_DESCRIPTOR: 332 vmsegdesc = (struct vm_seg_desc *)data; 333 error = vm_set_seg_desc(sc->vm, vmsegdesc->cpuid, 334 vmsegdesc->regnum, 335 &vmsegdesc->desc); 336 break; 337 case VM_GET_SEGMENT_DESCRIPTOR: 338 vmsegdesc = (struct vm_seg_desc *)data; 339 error = vm_get_seg_desc(sc->vm, vmsegdesc->cpuid, 340 vmsegdesc->regnum, 341 &vmsegdesc->desc); 342 break; 343 case VM_GET_CAPABILITY: 344 vmcap = (struct vm_capability *)data; 345 error = vm_get_capability(sc->vm, vmcap->cpuid, 346 vmcap->captype, 347 &vmcap->capval); 348 break; 349 case VM_SET_CAPABILITY: 350 vmcap = (struct vm_capability *)data; 351 error = vm_set_capability(sc->vm, vmcap->cpuid, 352 vmcap->captype, 353 vmcap->capval); 354 break; 355 case VM_SET_X2APIC_STATE: 356 x2apic = (struct vm_x2apic *)data; 357 error = vm_set_x2apic_state(sc->vm, 358 x2apic->cpuid, x2apic->state); 359 break; 360 case VM_GET_X2APIC_STATE: 361 x2apic = (struct vm_x2apic *)data; 362 error = vm_get_x2apic_state(sc->vm, 363 x2apic->cpuid, &x2apic->state); 364 break; 365 case VM_GET_GPA_PMAP: 366 gpapte = (struct vm_gpa_pte *)data; 367 pmap_get_mapping(vmspace_pmap(vm_get_vmspace(sc->vm)), 368 gpapte->gpa, gpapte->pte, &gpapte->ptenum); 369 error = 0; 370 break; 371 case VM_GET_HPET_CAPABILITIES: 372 error = vhpet_getcap((struct vm_hpet_cap *)data); 373 break; 374 default: 375 error = ENOTTY; 376 break; 377 } 378 379 if (state_changed == 1) { 380 vcpu_set_state(sc->vm, vcpu, VCPU_IDLE); 381 } else if (state_changed == 2) { 382 for (vcpu = 0; vcpu < VM_MAXCPU; vcpu++) 383 vcpu_set_state(sc->vm, vcpu, VCPU_IDLE); 384 } 385 386done: 387 /* Make sure that no handler returns a bogus value like ERESTART */ 388 KASSERT(error >= 0, ("vmmdev_ioctl: invalid error return %d", error)); 389 return (error); 390} 391 392static int 393vmmdev_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, 394 vm_size_t size, struct vm_object **object, int nprot) 395{ 396 int error; 397 struct vmmdev_softc *sc; 398 399 sc = vmmdev_lookup2(cdev); 400 if (sc != NULL && (nprot & PROT_EXEC) == 0) 401 error = vm_get_memobj(sc->vm, *offset, size, offset, object); 402 else 403 error = EINVAL; 404 405 return (error); 406} 407 408static void 409vmmdev_destroy(void *arg) 410{ 411 412 struct vmmdev_softc *sc = arg; 413 414 if (sc->cdev != NULL) 415 destroy_dev(sc->cdev); 416 417 if (sc->vm != NULL) 418 vm_destroy(sc->vm); 419 420 if ((sc->flags & VSC_LINKED) != 0) { 421 mtx_lock(&vmmdev_mtx); 422 SLIST_REMOVE(&head, sc, vmmdev_softc, link); 423 mtx_unlock(&vmmdev_mtx); 424 } 425 426 free(sc, M_VMMDEV); 427} 428 429static int 430sysctl_vmm_destroy(SYSCTL_HANDLER_ARGS) 431{ 432 int error; 433 char buf[VM_MAX_NAMELEN]; 434 struct vmmdev_softc *sc; 435 struct cdev *cdev; 436 437 strlcpy(buf, "beavis", sizeof(buf)); 438 error = sysctl_handle_string(oidp, buf, sizeof(buf), req); 439 if (error != 0 || req->newptr == NULL) 440 return (error); 441 442 mtx_lock(&vmmdev_mtx); 443 sc = vmmdev_lookup(buf); 444 if (sc == NULL || sc->cdev == NULL) { 445 mtx_unlock(&vmmdev_mtx); 446 return (EINVAL); 447 } 448 449 /* 450 * The 'cdev' will be destroyed asynchronously when 'si_threadcount' 451 * goes down to 0 so we should not do it again in the callback. 452 */ 453 cdev = sc->cdev; 454 sc->cdev = NULL; 455 mtx_unlock(&vmmdev_mtx); 456 457 /* 458 * Schedule the 'cdev' to be destroyed: 459 * 460 * - any new operations on this 'cdev' will return an error (ENXIO). 461 * 462 * - when the 'si_threadcount' dwindles down to zero the 'cdev' will 463 * be destroyed and the callback will be invoked in a taskqueue 464 * context. 465 */ 466 destroy_dev_sched_cb(cdev, vmmdev_destroy, sc); 467 468 return (0); 469} 470SYSCTL_PROC(_hw_vmm, OID_AUTO, destroy, CTLTYPE_STRING | CTLFLAG_RW, 471 NULL, 0, sysctl_vmm_destroy, "A", NULL); 472 473static struct cdevsw vmmdevsw = { 474 .d_name = "vmmdev", 475 .d_version = D_VERSION, 476 .d_ioctl = vmmdev_ioctl, 477 .d_mmap_single = vmmdev_mmap_single, 478 .d_read = vmmdev_rw, 479 .d_write = vmmdev_rw, 480}; 481 482static int 483sysctl_vmm_create(SYSCTL_HANDLER_ARGS) 484{ 485 int error; 486 struct vm *vm; 487 struct cdev *cdev; 488 struct vmmdev_softc *sc, *sc2; 489 char buf[VM_MAX_NAMELEN]; 490 491 strlcpy(buf, "beavis", sizeof(buf)); 492 error = sysctl_handle_string(oidp, buf, sizeof(buf), req); 493 if (error != 0 || req->newptr == NULL) 494 return (error); 495 496 mtx_lock(&vmmdev_mtx); 497 sc = vmmdev_lookup(buf); 498 mtx_unlock(&vmmdev_mtx); 499 if (sc != NULL) 500 return (EEXIST); 501 502 error = vm_create(buf, &vm); 503 if (error != 0) 504 return (error); 505 506 sc = malloc(sizeof(struct vmmdev_softc), M_VMMDEV, M_WAITOK | M_ZERO); 507 sc->vm = vm; 508 509 /* 510 * Lookup the name again just in case somebody sneaked in when we 511 * dropped the lock. 512 */ 513 mtx_lock(&vmmdev_mtx); 514 sc2 = vmmdev_lookup(buf); 515 if (sc2 == NULL) { 516 SLIST_INSERT_HEAD(&head, sc, link); 517 sc->flags |= VSC_LINKED; 518 } 519 mtx_unlock(&vmmdev_mtx); 520 521 if (sc2 != NULL) { 522 vmmdev_destroy(sc); 523 return (EEXIST); 524 } 525 526 error = make_dev_p(MAKEDEV_CHECKNAME, &cdev, &vmmdevsw, NULL, 527 UID_ROOT, GID_WHEEL, 0600, "vmm/%s", buf); 528 if (error != 0) { 529 vmmdev_destroy(sc); 530 return (error); 531 } 532 533 mtx_lock(&vmmdev_mtx); 534 sc->cdev = cdev; 535 sc->cdev->si_drv1 = sc; 536 mtx_unlock(&vmmdev_mtx); 537 538 return (0); 539} 540SYSCTL_PROC(_hw_vmm, OID_AUTO, create, CTLTYPE_STRING | CTLFLAG_RW, 541 NULL, 0, sysctl_vmm_create, "A", NULL); 542 543void 544vmmdev_init(void) 545{ 546 mtx_init(&vmmdev_mtx, "vmm device mutex", NULL, MTX_DEF); 547} 548 549int 550vmmdev_cleanup(void) 551{ 552 int error; 553 554 if (SLIST_EMPTY(&head)) 555 error = 0; 556 else 557 error = EBUSY; 558 559 return (error); 560} 561