vmm_dev.c revision 261088
1/*- 2 * Copyright (c) 2011 NetApp, Inc. 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND 15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17 * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE 18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 24 * SUCH DAMAGE. 25 * 26 * $FreeBSD: stable/10/sys/amd64/vmm/vmm_dev.c 261088 2014-01-23 20:21:39Z jhb $ 27 */ 28 29#include <sys/cdefs.h> 30__FBSDID("$FreeBSD: stable/10/sys/amd64/vmm/vmm_dev.c 261088 2014-01-23 20:21:39Z jhb $"); 31 32#include <sys/param.h> 33#include <sys/kernel.h> 34#include <sys/queue.h> 35#include <sys/lock.h> 36#include <sys/mutex.h> 37#include <sys/malloc.h> 38#include <sys/conf.h> 39#include <sys/sysctl.h> 40#include <sys/libkern.h> 41#include <sys/ioccom.h> 42#include <sys/mman.h> 43#include <sys/uio.h> 44 45#include <vm/vm.h> 46#include <vm/pmap.h> 47#include <vm/vm_map.h> 48 49#include <machine/pmap.h> 50#include <machine/vmparam.h> 51#include <machine/vmm.h> 52#include <machine/vmm_dev.h> 53 54#include "vmm_lapic.h" 55#include "vmm_stat.h" 56#include "vmm_mem.h" 57#include "io/ppt.h" 58#include "io/vioapic.h" 59#include "io/vhpet.h" 60 61struct vmmdev_softc { 62 struct vm *vm; /* vm instance cookie */ 63 struct cdev *cdev; 64 SLIST_ENTRY(vmmdev_softc) link; 65 int flags; 66}; 67#define VSC_LINKED 0x01 68 69static SLIST_HEAD(, vmmdev_softc) head; 70 71static struct mtx vmmdev_mtx; 72 73static MALLOC_DEFINE(M_VMMDEV, "vmmdev", "vmmdev"); 74 75SYSCTL_DECL(_hw_vmm); 76 77static struct vmmdev_softc * 78vmmdev_lookup(const char *name) 79{ 80 struct vmmdev_softc *sc; 81 82#ifdef notyet /* XXX kernel is not compiled with invariants */ 83 mtx_assert(&vmmdev_mtx, MA_OWNED); 84#endif 85 86 SLIST_FOREACH(sc, &head, link) { 87 if (strcmp(name, vm_name(sc->vm)) == 0) 88 break; 89 } 90 91 return (sc); 92} 93 94static struct vmmdev_softc * 95vmmdev_lookup2(struct cdev *cdev) 96{ 97 98 return (cdev->si_drv1); 99} 100 101static int 102vmmdev_rw(struct cdev *cdev, struct uio *uio, int flags) 103{ 104 int error, off, c, prot; 105 vm_paddr_t gpa; 106 void *hpa, *cookie; 107 struct vmmdev_softc *sc; 108 109 static char zerobuf[PAGE_SIZE]; 110 111 error = 0; 112 sc = vmmdev_lookup2(cdev); 113 if (sc == NULL) 114 error = ENXIO; 115 116 prot = (uio->uio_rw == UIO_WRITE ? VM_PROT_WRITE : VM_PROT_READ); 117 while (uio->uio_resid > 0 && error == 0) { 118 gpa = uio->uio_offset; 119 off = gpa & PAGE_MASK; 120 c = min(uio->uio_resid, PAGE_SIZE - off); 121 122 /* 123 * The VM has a hole in its physical memory map. If we want to 124 * use 'dd' to inspect memory beyond the hole we need to 125 * provide bogus data for memory that lies in the hole. 126 * 127 * Since this device does not support lseek(2), dd(1) will 128 * read(2) blocks of data to simulate the lseek(2). 129 */ 130 hpa = vm_gpa_hold(sc->vm, gpa, c, prot, &cookie); 131 if (hpa == NULL) { 132 if (uio->uio_rw == UIO_READ) 133 error = uiomove(zerobuf, c, uio); 134 else 135 error = EFAULT; 136 } else { 137 error = uiomove(hpa, c, uio); 138 vm_gpa_release(cookie); 139 } 140 } 141 return (error); 142} 143 144static int 145vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag, 146 struct thread *td) 147{ 148 int error, vcpu, state_changed; 149 struct vmmdev_softc *sc; 150 struct vm_memory_segment *seg; 151 struct vm_register *vmreg; 152 struct vm_seg_desc *vmsegdesc; 153 struct vm_run *vmrun; 154 struct vm_event *vmevent; 155 struct vm_lapic_irq *vmirq; 156 struct vm_ioapic_irq *ioapic_irq; 157 struct vm_capability *vmcap; 158 struct vm_pptdev *pptdev; 159 struct vm_pptdev_mmio *pptmmio; 160 struct vm_pptdev_msi *pptmsi; 161 struct vm_pptdev_msix *pptmsix; 162 struct vm_nmi *vmnmi; 163 struct vm_stats *vmstats; 164 struct vm_stat_desc *statdesc; 165 struct vm_x2apic *x2apic; 166 struct vm_gpa_pte *gpapte; 167 168 sc = vmmdev_lookup2(cdev); 169 if (sc == NULL) 170 return (ENXIO); 171 172 vcpu = -1; 173 state_changed = 0; 174 175 /* 176 * Some VMM ioctls can operate only on vcpus that are not running. 177 */ 178 switch (cmd) { 179 case VM_RUN: 180 case VM_GET_REGISTER: 181 case VM_SET_REGISTER: 182 case VM_GET_SEGMENT_DESCRIPTOR: 183 case VM_SET_SEGMENT_DESCRIPTOR: 184 case VM_INJECT_EVENT: 185 case VM_GET_CAPABILITY: 186 case VM_SET_CAPABILITY: 187 case VM_PPTDEV_MSI: 188 case VM_PPTDEV_MSIX: 189 case VM_SET_X2APIC_STATE: 190 /* 191 * XXX fragile, handle with care 192 * Assumes that the first field of the ioctl data is the vcpu. 193 */ 194 vcpu = *(int *)data; 195 if (vcpu < 0 || vcpu >= VM_MAXCPU) { 196 error = EINVAL; 197 goto done; 198 } 199 200 error = vcpu_set_state(sc->vm, vcpu, VCPU_FROZEN); 201 if (error) 202 goto done; 203 204 state_changed = 1; 205 break; 206 207 case VM_MAP_PPTDEV_MMIO: 208 case VM_BIND_PPTDEV: 209 case VM_UNBIND_PPTDEV: 210 case VM_MAP_MEMORY: 211 /* 212 * ioctls that operate on the entire virtual machine must 213 * prevent all vcpus from running. 214 */ 215 error = 0; 216 for (vcpu = 0; vcpu < VM_MAXCPU; vcpu++) { 217 error = vcpu_set_state(sc->vm, vcpu, VCPU_FROZEN); 218 if (error) 219 break; 220 } 221 222 if (error) { 223 while (--vcpu >= 0) 224 vcpu_set_state(sc->vm, vcpu, VCPU_IDLE); 225 goto done; 226 } 227 228 state_changed = 2; 229 break; 230 231 default: 232 break; 233 } 234 235 switch(cmd) { 236 case VM_RUN: 237 vmrun = (struct vm_run *)data; 238 error = vm_run(sc->vm, vmrun); 239 break; 240 case VM_STAT_DESC: { 241 statdesc = (struct vm_stat_desc *)data; 242 error = vmm_stat_desc_copy(statdesc->index, 243 statdesc->desc, sizeof(statdesc->desc)); 244 break; 245 } 246 case VM_STATS: { 247 CTASSERT(MAX_VM_STATS >= MAX_VMM_STAT_ELEMS); 248 vmstats = (struct vm_stats *)data; 249 getmicrotime(&vmstats->tv); 250 error = vmm_stat_copy(sc->vm, vmstats->cpuid, 251 &vmstats->num_entries, vmstats->statbuf); 252 break; 253 } 254 case VM_PPTDEV_MSI: 255 pptmsi = (struct vm_pptdev_msi *)data; 256 error = ppt_setup_msi(sc->vm, pptmsi->vcpu, 257 pptmsi->bus, pptmsi->slot, pptmsi->func, 258 pptmsi->destcpu, pptmsi->vector, 259 pptmsi->numvec); 260 break; 261 case VM_PPTDEV_MSIX: 262 pptmsix = (struct vm_pptdev_msix *)data; 263 error = ppt_setup_msix(sc->vm, pptmsix->vcpu, 264 pptmsix->bus, pptmsix->slot, 265 pptmsix->func, pptmsix->idx, 266 pptmsix->msg, pptmsix->vector_control, 267 pptmsix->addr); 268 break; 269 case VM_MAP_PPTDEV_MMIO: 270 pptmmio = (struct vm_pptdev_mmio *)data; 271 error = ppt_map_mmio(sc->vm, pptmmio->bus, pptmmio->slot, 272 pptmmio->func, pptmmio->gpa, pptmmio->len, 273 pptmmio->hpa); 274 break; 275 case VM_BIND_PPTDEV: 276 pptdev = (struct vm_pptdev *)data; 277 error = vm_assign_pptdev(sc->vm, pptdev->bus, pptdev->slot, 278 pptdev->func); 279 break; 280 case VM_UNBIND_PPTDEV: 281 pptdev = (struct vm_pptdev *)data; 282 error = vm_unassign_pptdev(sc->vm, pptdev->bus, pptdev->slot, 283 pptdev->func); 284 break; 285 case VM_INJECT_EVENT: 286 vmevent = (struct vm_event *)data; 287 error = vm_inject_event(sc->vm, vmevent->cpuid, vmevent->type, 288 vmevent->vector, 289 vmevent->error_code, 290 vmevent->error_code_valid); 291 break; 292 case VM_INJECT_NMI: 293 vmnmi = (struct vm_nmi *)data; 294 error = vm_inject_nmi(sc->vm, vmnmi->cpuid); 295 break; 296 case VM_LAPIC_IRQ: 297 vmirq = (struct vm_lapic_irq *)data; 298 error = lapic_intr_edge(sc->vm, vmirq->cpuid, vmirq->vector); 299 break; 300 case VM_IOAPIC_ASSERT_IRQ: 301 ioapic_irq = (struct vm_ioapic_irq *)data; 302 error = vioapic_assert_irq(sc->vm, ioapic_irq->irq); 303 break; 304 case VM_IOAPIC_DEASSERT_IRQ: 305 ioapic_irq = (struct vm_ioapic_irq *)data; 306 error = vioapic_deassert_irq(sc->vm, ioapic_irq->irq); 307 break; 308 case VM_IOAPIC_PULSE_IRQ: 309 ioapic_irq = (struct vm_ioapic_irq *)data; 310 error = vioapic_pulse_irq(sc->vm, ioapic_irq->irq); 311 break; 312 case VM_MAP_MEMORY: 313 seg = (struct vm_memory_segment *)data; 314 error = vm_malloc(sc->vm, seg->gpa, seg->len); 315 break; 316 case VM_GET_MEMORY_SEG: 317 seg = (struct vm_memory_segment *)data; 318 seg->len = 0; 319 (void)vm_gpabase2memseg(sc->vm, seg->gpa, seg); 320 error = 0; 321 break; 322 case VM_GET_REGISTER: 323 vmreg = (struct vm_register *)data; 324 error = vm_get_register(sc->vm, vmreg->cpuid, vmreg->regnum, 325 &vmreg->regval); 326 break; 327 case VM_SET_REGISTER: 328 vmreg = (struct vm_register *)data; 329 error = vm_set_register(sc->vm, vmreg->cpuid, vmreg->regnum, 330 vmreg->regval); 331 break; 332 case VM_SET_SEGMENT_DESCRIPTOR: 333 vmsegdesc = (struct vm_seg_desc *)data; 334 error = vm_set_seg_desc(sc->vm, vmsegdesc->cpuid, 335 vmsegdesc->regnum, 336 &vmsegdesc->desc); 337 break; 338 case VM_GET_SEGMENT_DESCRIPTOR: 339 vmsegdesc = (struct vm_seg_desc *)data; 340 error = vm_get_seg_desc(sc->vm, vmsegdesc->cpuid, 341 vmsegdesc->regnum, 342 &vmsegdesc->desc); 343 break; 344 case VM_GET_CAPABILITY: 345 vmcap = (struct vm_capability *)data; 346 error = vm_get_capability(sc->vm, vmcap->cpuid, 347 vmcap->captype, 348 &vmcap->capval); 349 break; 350 case VM_SET_CAPABILITY: 351 vmcap = (struct vm_capability *)data; 352 error = vm_set_capability(sc->vm, vmcap->cpuid, 353 vmcap->captype, 354 vmcap->capval); 355 break; 356 case VM_SET_X2APIC_STATE: 357 x2apic = (struct vm_x2apic *)data; 358 error = vm_set_x2apic_state(sc->vm, 359 x2apic->cpuid, x2apic->state); 360 break; 361 case VM_GET_X2APIC_STATE: 362 x2apic = (struct vm_x2apic *)data; 363 error = vm_get_x2apic_state(sc->vm, 364 x2apic->cpuid, &x2apic->state); 365 break; 366 case VM_GET_GPA_PMAP: 367 gpapte = (struct vm_gpa_pte *)data; 368 pmap_get_mapping(vmspace_pmap(vm_get_vmspace(sc->vm)), 369 gpapte->gpa, gpapte->pte, &gpapte->ptenum); 370 error = 0; 371 break; 372 case VM_GET_HPET_CAPABILITIES: 373 error = vhpet_getcap((struct vm_hpet_cap *)data); 374 break; 375 default: 376 error = ENOTTY; 377 break; 378 } 379 380 if (state_changed == 1) { 381 vcpu_set_state(sc->vm, vcpu, VCPU_IDLE); 382 } else if (state_changed == 2) { 383 for (vcpu = 0; vcpu < VM_MAXCPU; vcpu++) 384 vcpu_set_state(sc->vm, vcpu, VCPU_IDLE); 385 } 386 387done: 388 /* Make sure that no handler returns a bogus value like ERESTART */ 389 KASSERT(error >= 0, ("vmmdev_ioctl: invalid error return %d", error)); 390 return (error); 391} 392 393static int 394vmmdev_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, 395 vm_size_t size, struct vm_object **object, int nprot) 396{ 397 int error; 398 struct vmmdev_softc *sc; 399 400 sc = vmmdev_lookup2(cdev); 401 if (sc != NULL && (nprot & PROT_EXEC) == 0) 402 error = vm_get_memobj(sc->vm, *offset, size, offset, object); 403 else 404 error = EINVAL; 405 406 return (error); 407} 408 409static void 410vmmdev_destroy(void *arg) 411{ 412 413 struct vmmdev_softc *sc = arg; 414 415 if (sc->cdev != NULL) 416 destroy_dev(sc->cdev); 417 418 if (sc->vm != NULL) 419 vm_destroy(sc->vm); 420 421 if ((sc->flags & VSC_LINKED) != 0) { 422 mtx_lock(&vmmdev_mtx); 423 SLIST_REMOVE(&head, sc, vmmdev_softc, link); 424 mtx_unlock(&vmmdev_mtx); 425 } 426 427 free(sc, M_VMMDEV); 428} 429 430static int 431sysctl_vmm_destroy(SYSCTL_HANDLER_ARGS) 432{ 433 int error; 434 char buf[VM_MAX_NAMELEN]; 435 struct vmmdev_softc *sc; 436 struct cdev *cdev; 437 438 strlcpy(buf, "beavis", sizeof(buf)); 439 error = sysctl_handle_string(oidp, buf, sizeof(buf), req); 440 if (error != 0 || req->newptr == NULL) 441 return (error); 442 443 mtx_lock(&vmmdev_mtx); 444 sc = vmmdev_lookup(buf); 445 if (sc == NULL || sc->cdev == NULL) { 446 mtx_unlock(&vmmdev_mtx); 447 return (EINVAL); 448 } 449 450 /* 451 * The 'cdev' will be destroyed asynchronously when 'si_threadcount' 452 * goes down to 0 so we should not do it again in the callback. 453 */ 454 cdev = sc->cdev; 455 sc->cdev = NULL; 456 mtx_unlock(&vmmdev_mtx); 457 458 /* 459 * Schedule the 'cdev' to be destroyed: 460 * 461 * - any new operations on this 'cdev' will return an error (ENXIO). 462 * 463 * - when the 'si_threadcount' dwindles down to zero the 'cdev' will 464 * be destroyed and the callback will be invoked in a taskqueue 465 * context. 466 */ 467 destroy_dev_sched_cb(cdev, vmmdev_destroy, sc); 468 469 return (0); 470} 471SYSCTL_PROC(_hw_vmm, OID_AUTO, destroy, CTLTYPE_STRING | CTLFLAG_RW, 472 NULL, 0, sysctl_vmm_destroy, "A", NULL); 473 474static struct cdevsw vmmdevsw = { 475 .d_name = "vmmdev", 476 .d_version = D_VERSION, 477 .d_ioctl = vmmdev_ioctl, 478 .d_mmap_single = vmmdev_mmap_single, 479 .d_read = vmmdev_rw, 480 .d_write = vmmdev_rw, 481}; 482 483static int 484sysctl_vmm_create(SYSCTL_HANDLER_ARGS) 485{ 486 int error; 487 struct vm *vm; 488 struct cdev *cdev; 489 struct vmmdev_softc *sc, *sc2; 490 char buf[VM_MAX_NAMELEN]; 491 492 strlcpy(buf, "beavis", sizeof(buf)); 493 error = sysctl_handle_string(oidp, buf, sizeof(buf), req); 494 if (error != 0 || req->newptr == NULL) 495 return (error); 496 497 mtx_lock(&vmmdev_mtx); 498 sc = vmmdev_lookup(buf); 499 mtx_unlock(&vmmdev_mtx); 500 if (sc != NULL) 501 return (EEXIST); 502 503 error = vm_create(buf, &vm); 504 if (error != 0) 505 return (error); 506 507 sc = malloc(sizeof(struct vmmdev_softc), M_VMMDEV, M_WAITOK | M_ZERO); 508 sc->vm = vm; 509 510 /* 511 * Lookup the name again just in case somebody sneaked in when we 512 * dropped the lock. 513 */ 514 mtx_lock(&vmmdev_mtx); 515 sc2 = vmmdev_lookup(buf); 516 if (sc2 == NULL) { 517 SLIST_INSERT_HEAD(&head, sc, link); 518 sc->flags |= VSC_LINKED; 519 } 520 mtx_unlock(&vmmdev_mtx); 521 522 if (sc2 != NULL) { 523 vmmdev_destroy(sc); 524 return (EEXIST); 525 } 526 527 error = make_dev_p(MAKEDEV_CHECKNAME, &cdev, &vmmdevsw, NULL, 528 UID_ROOT, GID_WHEEL, 0600, "vmm/%s", buf); 529 if (error != 0) { 530 vmmdev_destroy(sc); 531 return (error); 532 } 533 534 mtx_lock(&vmmdev_mtx); 535 sc->cdev = cdev; 536 sc->cdev->si_drv1 = sc; 537 mtx_unlock(&vmmdev_mtx); 538 539 return (0); 540} 541SYSCTL_PROC(_hw_vmm, OID_AUTO, create, CTLTYPE_STRING | CTLFLAG_RW, 542 NULL, 0, sysctl_vmm_create, "A", NULL); 543 544void 545vmmdev_init(void) 546{ 547 mtx_init(&vmmdev_mtx, "vmm device mutex", NULL, MTX_DEF); 548} 549 550int 551vmmdev_cleanup(void) 552{ 553 int error; 554 555 if (SLIST_EMPTY(&head)) 556 error = 0; 557 else 558 error = EBUSY; 559 560 return (error); 561} 562