vmm_dev.c revision 258494
/*-
 * Copyright (c) 2011 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD: head/sys/amd64/vmm/vmm_dev.c 258494 2013-11-23 03:56:03Z neel $
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/amd64/vmm/vmm_dev.c 258494 2013-11-23 03:56:03Z neel $");

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/queue.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/malloc.h>
#include <sys/conf.h>
#include <sys/sysctl.h>
#include <sys/libkern.h>
#include <sys/ioccom.h>
#include <sys/mman.h>
#include <sys/uio.h>

#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>

#include <machine/vmparam.h>

#include <machine/vmm.h>
#include "vmm_lapic.h"
#include "vmm_stat.h"
#include "vmm_mem.h"
#include "io/ppt.h"
#include "io/vioapic.h"
#include <machine/vmm_dev.h>

struct vmmdev_softc {
	struct vm	*vm;		/* vm instance cookie */
	struct cdev	*cdev;
	SLIST_ENTRY(vmmdev_softc) link;
	int		flags;
};
#define	VSC_LINKED		0x01

static SLIST_HEAD(, vmmdev_softc) head;

static struct mtx vmmdev_mtx;

static MALLOC_DEFINE(M_VMMDEV, "vmmdev", "vmmdev");

SYSCTL_DECL(_hw_vmm);

static struct vmmdev_softc *
vmmdev_lookup(const char *name)
{
	struct vmmdev_softc *sc;

#ifdef notyet	/* XXX kernel is not compiled with invariants */
	mtx_assert(&vmmdev_mtx, MA_OWNED);
#endif

	SLIST_FOREACH(sc, &head, link) {
		if (strcmp(name, vm_name(sc->vm)) == 0)
			break;
	}

	return (sc);
}

static struct vmmdev_softc *
vmmdev_lookup2(struct cdev *cdev)
{

	return (cdev->si_drv1);
}

static int
vmmdev_rw(struct cdev *cdev, struct uio *uio, int flags)
{
	int error, off, c, prot;
	vm_paddr_t gpa;
	void *hpa, *cookie;
	struct vmmdev_softc *sc;

	static char zerobuf[PAGE_SIZE];

	error = 0;
	sc = vmmdev_lookup2(cdev);
	if (sc == NULL)
		error = ENXIO;

	prot = (uio->uio_rw == UIO_WRITE ? VM_PROT_WRITE : VM_PROT_READ);
	while (uio->uio_resid > 0 && error == 0) {
		gpa = uio->uio_offset;
		off = gpa & PAGE_MASK;
		c = min(uio->uio_resid, PAGE_SIZE - off);

		/*
		 * The VM has a hole in its physical memory map. If we want to
		 * use 'dd' to inspect memory beyond the hole we need to
		 * provide bogus data for memory that lies in the hole.
		 *
		 * Since this device does not support lseek(2), dd(1) will
		 * read(2) blocks of data to simulate the lseek(2).
		 */
		hpa = vm_gpa_hold(sc->vm, gpa, c, prot, &cookie);
		if (hpa == NULL) {
			if (uio->uio_rw == UIO_READ)
				error = uiomove(zerobuf, c, uio);
			else
				error = EFAULT;
		} else {
			error = uiomove(hpa, c, uio);
			vm_gpa_release(cookie);
		}
	}
	return (error);
}
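
/*
 * Usage sketch for the read/write path above (illustrative note, not part
 * of the original revision; the VM name 'testvm' is hypothetical). Guest
 * physical memory can be inspected from the host with dd(1), which
 * simulates the unsupported lseek(2) by read(2)-ing and discarding blocks:
 *
 *	dd if=/dev/vmm/testvm bs=4096 skip=256 count=1 | hexdump -C
 *
 * This dumps the page at guest physical address 0x100000 (256 * 4096).
 * Pages that fall into a hole in the guest physical memory map read back
 * as zeroes, per the zerobuf handling above.
 */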

static int
vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
	     struct thread *td)
{
	int error, vcpu, state_changed;
	struct vmmdev_softc *sc;
	struct vm_memory_segment *seg;
	struct vm_register *vmreg;
	struct vm_seg_desc *vmsegdesc;
	struct vm_run *vmrun;
	struct vm_event *vmevent;
	struct vm_lapic_irq *vmirq;
	struct vm_ioapic_irq *ioapic_irq;
	struct vm_capability *vmcap;
	struct vm_pptdev *pptdev;
	struct vm_pptdev_mmio *pptmmio;
	struct vm_pptdev_msi *pptmsi;
	struct vm_pptdev_msix *pptmsix;
	struct vm_nmi *vmnmi;
	struct vm_stats *vmstats;
	struct vm_stat_desc *statdesc;
	struct vm_x2apic *x2apic;
	struct vm_gpa_pte *gpapte;

	sc = vmmdev_lookup2(cdev);
	if (sc == NULL)
		return (ENXIO);

	vcpu = -1;
	state_changed = 0;

	/*
	 * Some VMM ioctls can operate only on vcpus that are not running.
	 */
	switch (cmd) {
	case VM_RUN:
	case VM_GET_REGISTER:
	case VM_SET_REGISTER:
	case VM_GET_SEGMENT_DESCRIPTOR:
	case VM_SET_SEGMENT_DESCRIPTOR:
	case VM_INJECT_EVENT:
	case VM_GET_CAPABILITY:
	case VM_SET_CAPABILITY:
	case VM_PPTDEV_MSI:
	case VM_PPTDEV_MSIX:
	case VM_SET_X2APIC_STATE:
		/*
		 * XXX fragile, handle with care
		 * Assumes that the first field of the ioctl data is the vcpu.
		 */
		vcpu = *(int *)data;
		if (vcpu < 0 || vcpu >= VM_MAXCPU) {
			error = EINVAL;
			goto done;
		}

		error = vcpu_set_state(sc->vm, vcpu, VCPU_FROZEN);
		if (error)
			goto done;

		state_changed = 1;
		break;

	case VM_MAP_PPTDEV_MMIO:
	case VM_BIND_PPTDEV:
	case VM_UNBIND_PPTDEV:
	case VM_MAP_MEMORY:
		/*
		 * ioctls that operate on the entire virtual machine must
		 * prevent all vcpus from running.
		 */
		error = 0;
		for (vcpu = 0; vcpu < VM_MAXCPU; vcpu++) {
			error = vcpu_set_state(sc->vm, vcpu, VCPU_FROZEN);
			if (error)
				break;
		}

		if (error) {
			while (--vcpu >= 0)
				vcpu_set_state(sc->vm, vcpu, VCPU_IDLE);
			goto done;
		}

		state_changed = 2;
		break;

	default:
		break;
	}
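
	/*
	 * Note (explanatory, not in the original revision): at this point
	 * state_changed records what the switch above did. A value of 1
	 * means the single vcpu named by the ioctl argument is frozen; 2
	 * means every vcpu is frozen. The unwind code after the dispatch
	 * switch below moves exactly those vcpus back to VCPU_IDLE, and a
	 * partial freeze failure was already rolled back above, so 'done:'
	 * never runs with a half-frozen VM.
	 */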

	switch(cmd) {
	case VM_RUN:
		vmrun = (struct vm_run *)data;
		error = vm_run(sc->vm, vmrun);
		break;
	case VM_STAT_DESC: {
		statdesc = (struct vm_stat_desc *)data;
		error = vmm_stat_desc_copy(statdesc->index,
		    statdesc->desc, sizeof(statdesc->desc));
		break;
	}
	case VM_STATS: {
		CTASSERT(MAX_VM_STATS >= MAX_VMM_STAT_ELEMS);
		vmstats = (struct vm_stats *)data;
		getmicrotime(&vmstats->tv);
		error = vmm_stat_copy(sc->vm, vmstats->cpuid,
		    &vmstats->num_entries, vmstats->statbuf);
		break;
	}
	case VM_PPTDEV_MSI:
		pptmsi = (struct vm_pptdev_msi *)data;
		error = ppt_setup_msi(sc->vm, pptmsi->vcpu,
		    pptmsi->bus, pptmsi->slot, pptmsi->func,
		    pptmsi->destcpu, pptmsi->vector,
		    pptmsi->numvec);
		break;
	case VM_PPTDEV_MSIX:
		pptmsix = (struct vm_pptdev_msix *)data;
		error = ppt_setup_msix(sc->vm, pptmsix->vcpu,
		    pptmsix->bus, pptmsix->slot,
		    pptmsix->func, pptmsix->idx,
		    pptmsix->msg, pptmsix->vector_control,
		    pptmsix->addr);
		break;
	case VM_MAP_PPTDEV_MMIO:
		pptmmio = (struct vm_pptdev_mmio *)data;
		error = ppt_map_mmio(sc->vm, pptmmio->bus, pptmmio->slot,
		    pptmmio->func, pptmmio->gpa, pptmmio->len,
		    pptmmio->hpa);
		break;
	case VM_BIND_PPTDEV:
		pptdev = (struct vm_pptdev *)data;
		error = vm_assign_pptdev(sc->vm, pptdev->bus, pptdev->slot,
		    pptdev->func);
		break;
	case VM_UNBIND_PPTDEV:
		pptdev = (struct vm_pptdev *)data;
		error = vm_unassign_pptdev(sc->vm, pptdev->bus, pptdev->slot,
		    pptdev->func);
		break;
	case VM_INJECT_EVENT:
		vmevent = (struct vm_event *)data;
		error = vm_inject_event(sc->vm, vmevent->cpuid, vmevent->type,
		    vmevent->vector,
		    vmevent->error_code,
		    vmevent->error_code_valid);
		break;
	case VM_INJECT_NMI:
		vmnmi = (struct vm_nmi *)data;
		error = vm_inject_nmi(sc->vm, vmnmi->cpuid);
		break;
	case VM_LAPIC_IRQ:
		vmirq = (struct vm_lapic_irq *)data;
		error = lapic_set_intr(sc->vm, vmirq->cpuid, vmirq->vector);
		break;
	case VM_IOAPIC_ASSERT_IRQ:
		ioapic_irq = (struct vm_ioapic_irq *)data;
		error = vioapic_assert_irq(sc->vm, ioapic_irq->irq);
		break;
	case VM_IOAPIC_DEASSERT_IRQ:
		ioapic_irq = (struct vm_ioapic_irq *)data;
		error = vioapic_deassert_irq(sc->vm, ioapic_irq->irq);
		break;
	case VM_IOAPIC_PULSE_IRQ:
		ioapic_irq = (struct vm_ioapic_irq *)data;
		error = vioapic_pulse_irq(sc->vm, ioapic_irq->irq);
		break;
	case VM_MAP_MEMORY:
		seg = (struct vm_memory_segment *)data;
		error = vm_malloc(sc->vm, seg->gpa, seg->len);
		break;
	case VM_GET_MEMORY_SEG:
		seg = (struct vm_memory_segment *)data;
		seg->len = 0;
		(void)vm_gpabase2memseg(sc->vm, seg->gpa, seg);
		error = 0;
		break;
	case VM_GET_REGISTER:
		vmreg = (struct vm_register *)data;
		error = vm_get_register(sc->vm, vmreg->cpuid, vmreg->regnum,
		    &vmreg->regval);
		break;
	case VM_SET_REGISTER:
		vmreg = (struct vm_register *)data;
		error = vm_set_register(sc->vm, vmreg->cpuid, vmreg->regnum,
		    vmreg->regval);
		break;
	case VM_SET_SEGMENT_DESCRIPTOR:
		vmsegdesc = (struct vm_seg_desc *)data;
		error = vm_set_seg_desc(sc->vm, vmsegdesc->cpuid,
		    vmsegdesc->regnum,
		    &vmsegdesc->desc);
		break;
	case VM_GET_SEGMENT_DESCRIPTOR:
		vmsegdesc = (struct vm_seg_desc *)data;
		error = vm_get_seg_desc(sc->vm, vmsegdesc->cpuid,
		    vmsegdesc->regnum,
		    &vmsegdesc->desc);
		break;
	case VM_GET_CAPABILITY:
		vmcap = (struct vm_capability *)data;
		error = vm_get_capability(sc->vm, vmcap->cpuid,
		    vmcap->captype,
		    &vmcap->capval);
		break;
	case VM_SET_CAPABILITY:
		vmcap = (struct vm_capability *)data;
		error = vm_set_capability(sc->vm, vmcap->cpuid,
		    vmcap->captype,
		    vmcap->capval);
		break;
	case VM_SET_X2APIC_STATE:
		x2apic = (struct vm_x2apic *)data;
		error = vm_set_x2apic_state(sc->vm,
		    x2apic->cpuid, x2apic->state);
		break;
	case VM_GET_X2APIC_STATE:
		x2apic = (struct vm_x2apic *)data;
		error = vm_get_x2apic_state(sc->vm,
		    x2apic->cpuid, &x2apic->state);
		break;
	case VM_GET_GPA_PMAP:
		gpapte = (struct vm_gpa_pte *)data;
		pmap_get_mapping(vmspace_pmap(vm_get_vmspace(sc->vm)),
		    gpapte->gpa, gpapte->pte, &gpapte->ptenum);
		error = 0;
		break;
	default:
		error = ENOTTY;
		break;
	}

	if (state_changed == 1) {
		vcpu_set_state(sc->vm, vcpu, VCPU_IDLE);
	} else if (state_changed == 2) {
		for (vcpu = 0; vcpu < VM_MAXCPU; vcpu++)
			vcpu_set_state(sc->vm, vcpu, VCPU_IDLE);
	}

done:
	/* Make sure that no handler returns a bogus value like ERESTART */
	KASSERT(error >= 0, ("vmmdev_ioctl: invalid error return %d", error));
	return (error);
}
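
/*
 * Userland usage sketch for the ioctl interface above (illustrative, not
 * part of the original revision; error handling omitted). 'fd' is assumed
 * to be an open descriptor for /dev/vmm/<name>; the vm_register fields
 * (cpuid, regnum, regval) mirror the accesses in vmmdev_ioctl(), and
 * VM_REG_GUEST_RIP comes from <machine/vmm.h>:
 *
 *	struct vm_register vmreg;
 *
 *	vmreg.cpuid = 0;
 *	vmreg.regnum = VM_REG_GUEST_RIP;
 *	if (ioctl(fd, VM_GET_REGISTER, &vmreg) == 0)
 *		printf("vcpu0 %%rip = 0x%016lx\n", vmreg.regval);
 *
 * Because VM_GET_REGISTER appears in the freeze list above, the ioctl
 * transparently freezes vcpu 0 for the duration of the call.
 */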

static int
vmmdev_mmap_single(struct cdev *cdev, vm_ooffset_t *offset,
		   vm_size_t size, struct vm_object **object, int nprot)
{
	int error;
	struct vmmdev_softc *sc;

	sc = vmmdev_lookup2(cdev);
	if (sc != NULL && (nprot & PROT_EXEC) == 0)
		error = vm_get_memobj(sc->vm, *offset, size, offset, object);
	else
		error = EINVAL;

	return (error);
}

static void
vmmdev_destroy(void *arg)
{

	struct vmmdev_softc *sc = arg;

	if (sc->cdev != NULL)
		destroy_dev(sc->cdev);

	if (sc->vm != NULL)
		vm_destroy(sc->vm);

	if ((sc->flags & VSC_LINKED) != 0) {
		mtx_lock(&vmmdev_mtx);
		SLIST_REMOVE(&head, sc, vmmdev_softc, link);
		mtx_unlock(&vmmdev_mtx);
	}

	free(sc, M_VMMDEV);
}

static int
sysctl_vmm_destroy(SYSCTL_HANDLER_ARGS)
{
	int error;
	char buf[VM_MAX_NAMELEN];
	struct vmmdev_softc *sc;
	struct cdev *cdev;

	strlcpy(buf, "beavis", sizeof(buf));
	error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
	if (error != 0 || req->newptr == NULL)
		return (error);

	mtx_lock(&vmmdev_mtx);
	sc = vmmdev_lookup(buf);
	if (sc == NULL || sc->cdev == NULL) {
		mtx_unlock(&vmmdev_mtx);
		return (EINVAL);
	}

	/*
	 * The 'cdev' will be destroyed asynchronously when 'si_threadcount'
	 * goes down to 0 so we should not do it again in the callback.
	 */
	cdev = sc->cdev;
	sc->cdev = NULL;
	mtx_unlock(&vmmdev_mtx);

	/*
	 * Schedule the 'cdev' to be destroyed:
	 *
	 * - any new operations on this 'cdev' will return an error (ENXIO).
	 *
	 * - when the 'si_threadcount' dwindles down to zero the 'cdev' will
	 *   be destroyed and the callback will be invoked in a taskqueue
	 *   context.
	 */
	destroy_dev_sched_cb(cdev, vmmdev_destroy, sc);

	return (0);
}
SYSCTL_PROC(_hw_vmm, OID_AUTO, destroy, CTLTYPE_STRING | CTLFLAG_RW,
	    NULL, 0, sysctl_vmm_destroy, "A", NULL);
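
/*
 * Usage sketch (illustrative, not part of the original revision): a VM is
 * torn down from the host shell by writing its name to the sysctl declared
 * above, e.g.
 *
 *	sysctl hw.vmm.destroy=testvm
 *
 * where 'testvm' is a hypothetical name previously passed to
 * hw.vmm.create. The handler returns EINVAL if no such VM exists.
 */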

static struct cdevsw vmmdevsw = {
	.d_name		= "vmmdev",
	.d_version	= D_VERSION,
	.d_ioctl	= vmmdev_ioctl,
	.d_mmap_single	= vmmdev_mmap_single,
	.d_read		= vmmdev_rw,
	.d_write	= vmmdev_rw,
};

static int
sysctl_vmm_create(SYSCTL_HANDLER_ARGS)
{
	int error;
	struct vm *vm;
	struct cdev *cdev;
	struct vmmdev_softc *sc, *sc2;
	char buf[VM_MAX_NAMELEN];

	strlcpy(buf, "beavis", sizeof(buf));
	error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
	if (error != 0 || req->newptr == NULL)
		return (error);

	mtx_lock(&vmmdev_mtx);
	sc = vmmdev_lookup(buf);
	mtx_unlock(&vmmdev_mtx);
	if (sc != NULL)
		return (EEXIST);

	error = vm_create(buf, &vm);
	if (error != 0)
		return (error);

	sc = malloc(sizeof(struct vmmdev_softc), M_VMMDEV, M_WAITOK | M_ZERO);
	sc->vm = vm;

	/*
	 * Lookup the name again just in case somebody sneaked in when we
	 * dropped the lock.
	 */
	mtx_lock(&vmmdev_mtx);
	sc2 = vmmdev_lookup(buf);
	if (sc2 == NULL) {
		SLIST_INSERT_HEAD(&head, sc, link);
		sc->flags |= VSC_LINKED;
	}
	mtx_unlock(&vmmdev_mtx);

	if (sc2 != NULL) {
		vmmdev_destroy(sc);
		return (EEXIST);
	}

	error = make_dev_p(MAKEDEV_CHECKNAME, &cdev, &vmmdevsw, NULL,
	    UID_ROOT, GID_WHEEL, 0600, "vmm/%s", buf);
	if (error != 0) {
		vmmdev_destroy(sc);
		return (error);
	}

	mtx_lock(&vmmdev_mtx);
	sc->cdev = cdev;
	sc->cdev->si_drv1 = sc;
	mtx_unlock(&vmmdev_mtx);

	return (0);
}
SYSCTL_PROC(_hw_vmm, OID_AUTO, create, CTLTYPE_STRING | CTLFLAG_RW,
	    NULL, 0, sysctl_vmm_create, "A", NULL);

void
vmmdev_init(void)
{
	mtx_init(&vmmdev_mtx, "vmm device mutex", NULL, MTX_DEF);
}

int
vmmdev_cleanup(void)
{
	int error;

	if (SLIST_EMPTY(&head))
		error = 0;
	else
		error = EBUSY;

	return (error);
}
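
/*
 * End-to-end lifecycle sketch (illustrative, not part of the original
 * revision). The VM name 'testvm' and the 1MB segment are hypothetical and
 * error handling is omitted. The vm_memory_segment fields match the uses
 * in vmmdev_ioctl() above, and zero-initializing vm_run selects vcpu 0
 * because its first field is the vcpu id, as the freeze logic in
 * vmmdev_ioctl() assumes:
 *
 *	char name[] = "testvm";
 *	int fd;
 *	struct vm_memory_segment seg;
 *	struct vm_run vmrun;
 *
 *	sysctlbyname("hw.vmm.create", NULL, NULL, name, strlen(name));
 *	fd = open("/dev/vmm/testvm", O_RDWR);
 *
 *	memset(&seg, 0, sizeof(seg));
 *	seg.gpa = 0;
 *	seg.len = 1024 * 1024;
 *	ioctl(fd, VM_MAP_MEMORY, &seg);		(backs guest [0, 1MB))
 *
 *	...set up guest registers, then run vcpu 0 until its next exit...
 *	memset(&vmrun, 0, sizeof(vmrun));
 *	ioctl(fd, VM_RUN, &vmrun);
 *
 *	close(fd);
 *	sysctlbyname("hw.vmm.destroy", NULL, NULL, name, strlen(name));
 */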