vmm_dev.c revision 245652
1/*- 2 * Copyright (c) 2011 NetApp, Inc. 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND 15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17 * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE 18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 24 * SUCH DAMAGE. 25 * 26 * $FreeBSD$ 27 */ 28 29#include <sys/cdefs.h> 30__FBSDID("$FreeBSD$"); 31 32#include <sys/param.h> 33#include <sys/kernel.h> 34#include <sys/queue.h> 35#include <sys/lock.h> 36#include <sys/mutex.h> 37#include <sys/malloc.h> 38#include <sys/conf.h> 39#include <sys/sysctl.h> 40#include <sys/libkern.h> 41#include <sys/ioccom.h> 42#include <sys/mman.h> 43#include <sys/uio.h> 44 45#include <vm/vm.h> 46#include <vm/pmap.h> 47 48#include <machine/pmap.h> 49#include <machine/vmparam.h> 50 51#include <machine/vmm.h> 52#include "vmm_lapic.h" 53#include "vmm_stat.h" 54#include "vmm_mem.h" 55#include "io/ppt.h" 56#include <machine/vmm_dev.h> 57 58struct vmmdev_softc { 59 struct vm *vm; /* vm instance cookie */ 60 struct cdev *cdev; 61 SLIST_ENTRY(vmmdev_softc) link; 62}; 63static SLIST_HEAD(, vmmdev_softc) head; 64 65static struct mtx vmmdev_mtx; 66 67static MALLOC_DEFINE(M_VMMDEV, "vmmdev", "vmmdev"); 68 69SYSCTL_DECL(_hw_vmm); 70 71static struct vmmdev_softc * 72vmmdev_lookup(const char *name) 73{ 74 struct vmmdev_softc *sc; 75 76#ifdef notyet /* XXX kernel is not compiled with invariants */ 77 mtx_assert(&vmmdev_mtx, MA_OWNED); 78#endif 79 80 SLIST_FOREACH(sc, &head, link) { 81 if (strcmp(name, vm_name(sc->vm)) == 0) 82 break; 83 } 84 85 return (sc); 86} 87 88static struct vmmdev_softc * 89vmmdev_lookup2(struct cdev *cdev) 90{ 91 92 return (cdev->si_drv1); 93} 94 95static int 96vmmdev_rw(struct cdev *cdev, struct uio *uio, int flags) 97{ 98 int error, off, c; 99 vm_paddr_t hpa, gpa; 100 struct vmmdev_softc *sc; 101 102 static char zerobuf[PAGE_SIZE]; 103 104 error = 0; 105 mtx_lock(&vmmdev_mtx); 106 sc = vmmdev_lookup2(cdev); 107 if (sc == NULL) 108 error = ENXIO; 109 110 while (uio->uio_resid > 0 && error == 0) { 111 gpa = uio->uio_offset; 112 off = gpa & PAGE_MASK; 113 c = min(uio->uio_resid, PAGE_SIZE - off); 114 115 /* 116 * The VM has a hole in its physical memory map. If we want to 117 * use 'dd' to inspect memory beyond the hole we need to 118 * provide bogus data for memory that lies in the hole. 119 * 120 * Since this device does not support lseek(2), dd(1) will 121 * read(2) blocks of data to simulate the lseek(2). 122 */ 123 hpa = vm_gpa2hpa(sc->vm, gpa, c); 124 if (hpa == (vm_paddr_t)-1) { 125 if (uio->uio_rw == UIO_READ) 126 error = uiomove(zerobuf, c, uio); 127 else 128 error = EFAULT; 129 } else 130 error = uiomove((void *)PHYS_TO_DMAP(hpa), c, uio); 131 } 132 133 mtx_unlock(&vmmdev_mtx); 134 return (error); 135} 136 137static int 138vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag, 139 struct thread *td) 140{ 141 int error, vcpu, state_changed; 142 enum vcpu_state new_state; 143 struct vmmdev_softc *sc; 144 struct vm_memory_segment *seg; 145 struct vm_register *vmreg; 146 struct vm_seg_desc* vmsegdesc; 147 struct vm_pin *vmpin; 148 struct vm_run *vmrun; 149 struct vm_event *vmevent; 150 struct vm_lapic_irq *vmirq; 151 struct vm_capability *vmcap; 152 struct vm_pptdev *pptdev; 153 struct vm_pptdev_mmio *pptmmio; 154 struct vm_pptdev_msi *pptmsi; 155 struct vm_pptdev_msix *pptmsix; 156 struct vm_nmi *vmnmi; 157 struct vm_stats *vmstats; 158 struct vm_stat_desc *statdesc; 159 struct vm_x2apic *x2apic; 160 161 sc = vmmdev_lookup2(cdev); 162 if (sc == NULL) 163 return (ENXIO); 164 165 vcpu = -1; 166 state_changed = 0; 167 168 /* 169 * Some VMM ioctls can operate only on vcpus that are not running. 170 */ 171 switch (cmd) { 172 case VM_RUN: 173 case VM_SET_PINNING: 174 case VM_GET_REGISTER: 175 case VM_SET_REGISTER: 176 case VM_GET_SEGMENT_DESCRIPTOR: 177 case VM_SET_SEGMENT_DESCRIPTOR: 178 case VM_INJECT_EVENT: 179 case VM_GET_CAPABILITY: 180 case VM_SET_CAPABILITY: 181 case VM_PPTDEV_MSI: 182 case VM_PPTDEV_MSIX: 183 case VM_SET_X2APIC_STATE: 184 /* 185 * XXX fragile, handle with care 186 * Assumes that the first field of the ioctl data is the vcpu. 187 */ 188 vcpu = *(int *)data; 189 if (vcpu < 0 || vcpu >= VM_MAXCPU) { 190 error = EINVAL; 191 goto done; 192 } 193 194 if (cmd == VM_RUN) 195 new_state = VCPU_RUNNING; 196 else 197 new_state = VCPU_CANNOT_RUN; 198 199 error = vcpu_set_state(sc->vm, vcpu, new_state); 200 if (error) 201 goto done; 202 203 state_changed = 1; 204 break; 205 206 case VM_MAP_PPTDEV_MMIO: 207 case VM_BIND_PPTDEV: 208 case VM_UNBIND_PPTDEV: 209 case VM_MAP_MEMORY: 210 /* 211 * ioctls that operate on the entire virtual machine must 212 * prevent all vcpus from running. 213 */ 214 error = 0; 215 for (vcpu = 0; vcpu < VM_MAXCPU; vcpu++) { 216 error = vcpu_set_state(sc->vm, vcpu, VCPU_CANNOT_RUN); 217 if (error) 218 break; 219 } 220 221 if (error) { 222 while (--vcpu >= 0) 223 vcpu_set_state(sc->vm, vcpu, VCPU_IDLE); 224 goto done; 225 } 226 227 state_changed = 2; 228 break; 229 230 default: 231 break; 232 } 233 234 switch(cmd) { 235 case VM_RUN: 236 vmrun = (struct vm_run *)data; 237 error = vm_run(sc->vm, vmrun); 238 break; 239 case VM_STAT_DESC: { 240 const char *desc; 241 statdesc = (struct vm_stat_desc *)data; 242 desc = vmm_stat_desc(statdesc->index); 243 if (desc != NULL) { 244 error = 0; 245 strlcpy(statdesc->desc, desc, sizeof(statdesc->desc)); 246 } else 247 error = EINVAL; 248 break; 249 } 250 case VM_STATS: { 251 CTASSERT(MAX_VM_STATS >= MAX_VMM_STAT_TYPES); 252 vmstats = (struct vm_stats *)data; 253 getmicrotime(&vmstats->tv); 254 error = vmm_stat_copy(sc->vm, vmstats->cpuid, 255 &vmstats->num_entries, vmstats->statbuf); 256 break; 257 } 258 case VM_PPTDEV_MSI: 259 pptmsi = (struct vm_pptdev_msi *)data; 260 error = ppt_setup_msi(sc->vm, pptmsi->vcpu, 261 pptmsi->bus, pptmsi->slot, pptmsi->func, 262 pptmsi->destcpu, pptmsi->vector, 263 pptmsi->numvec); 264 break; 265 case VM_PPTDEV_MSIX: 266 pptmsix = (struct vm_pptdev_msix *)data; 267 error = ppt_setup_msix(sc->vm, pptmsix->vcpu, 268 pptmsix->bus, pptmsix->slot, 269 pptmsix->func, pptmsix->idx, 270 pptmsix->msg, pptmsix->vector_control, 271 pptmsix->addr); 272 break; 273 case VM_MAP_PPTDEV_MMIO: 274 pptmmio = (struct vm_pptdev_mmio *)data; 275 error = ppt_map_mmio(sc->vm, pptmmio->bus, pptmmio->slot, 276 pptmmio->func, pptmmio->gpa, pptmmio->len, 277 pptmmio->hpa); 278 break; 279 case VM_BIND_PPTDEV: 280 pptdev = (struct vm_pptdev *)data; 281 error = ppt_assign_device(sc->vm, pptdev->bus, pptdev->slot, 282 pptdev->func); 283 break; 284 case VM_UNBIND_PPTDEV: 285 pptdev = (struct vm_pptdev *)data; 286 error = ppt_unassign_device(sc->vm, pptdev->bus, pptdev->slot, 287 pptdev->func); 288 break; 289 case VM_INJECT_EVENT: 290 vmevent = (struct vm_event *)data; 291 error = vm_inject_event(sc->vm, vmevent->cpuid, vmevent->type, 292 vmevent->vector, 293 vmevent->error_code, 294 vmevent->error_code_valid); 295 break; 296 case VM_INJECT_NMI: 297 vmnmi = (struct vm_nmi *)data; 298 error = vm_inject_nmi(sc->vm, vmnmi->cpuid); 299 break; 300 case VM_LAPIC_IRQ: 301 vmirq = (struct vm_lapic_irq *)data; 302 error = lapic_set_intr(sc->vm, vmirq->cpuid, vmirq->vector); 303 break; 304 case VM_SET_PINNING: 305 vmpin = (struct vm_pin *)data; 306 error = vm_set_pinning(sc->vm, vmpin->vm_cpuid, 307 vmpin->host_cpuid); 308 break; 309 case VM_GET_PINNING: 310 vmpin = (struct vm_pin *)data; 311 error = vm_get_pinning(sc->vm, vmpin->vm_cpuid, 312 &vmpin->host_cpuid); 313 break; 314 case VM_MAP_MEMORY: 315 seg = (struct vm_memory_segment *)data; 316 error = vm_malloc(sc->vm, seg->gpa, seg->len); 317 break; 318 case VM_GET_MEMORY_SEG: 319 seg = (struct vm_memory_segment *)data; 320 seg->len = 0; 321 (void)vm_gpabase2memseg(sc->vm, seg->gpa, seg); 322 error = 0; 323 break; 324 case VM_GET_REGISTER: 325 vmreg = (struct vm_register *)data; 326 error = vm_get_register(sc->vm, vmreg->cpuid, vmreg->regnum, 327 &vmreg->regval); 328 break; 329 case VM_SET_REGISTER: 330 vmreg = (struct vm_register *)data; 331 error = vm_set_register(sc->vm, vmreg->cpuid, vmreg->regnum, 332 vmreg->regval); 333 break; 334 case VM_SET_SEGMENT_DESCRIPTOR: 335 vmsegdesc = (struct vm_seg_desc *)data; 336 error = vm_set_seg_desc(sc->vm, vmsegdesc->cpuid, 337 vmsegdesc->regnum, 338 &vmsegdesc->desc); 339 break; 340 case VM_GET_SEGMENT_DESCRIPTOR: 341 vmsegdesc = (struct vm_seg_desc *)data; 342 error = vm_get_seg_desc(sc->vm, vmsegdesc->cpuid, 343 vmsegdesc->regnum, 344 &vmsegdesc->desc); 345 break; 346 case VM_GET_CAPABILITY: 347 vmcap = (struct vm_capability *)data; 348 error = vm_get_capability(sc->vm, vmcap->cpuid, 349 vmcap->captype, 350 &vmcap->capval); 351 break; 352 case VM_SET_CAPABILITY: 353 vmcap = (struct vm_capability *)data; 354 error = vm_set_capability(sc->vm, vmcap->cpuid, 355 vmcap->captype, 356 vmcap->capval); 357 break; 358 case VM_SET_X2APIC_STATE: 359 x2apic = (struct vm_x2apic *)data; 360 error = vm_set_x2apic_state(sc->vm, 361 x2apic->cpuid, x2apic->state); 362 break; 363 case VM_GET_X2APIC_STATE: 364 x2apic = (struct vm_x2apic *)data; 365 error = vm_get_x2apic_state(sc->vm, 366 x2apic->cpuid, &x2apic->state); 367 break; 368 default: 369 error = ENOTTY; 370 break; 371 } 372 373 if (state_changed == 1) { 374 vcpu_set_state(sc->vm, vcpu, VCPU_IDLE); 375 } else if (state_changed == 2) { 376 for (vcpu = 0; vcpu < VM_MAXCPU; vcpu++) 377 vcpu_set_state(sc->vm, vcpu, VCPU_IDLE); 378 } 379 380done: 381 return (error); 382} 383 384static int 385vmmdev_mmap(struct cdev *cdev, vm_ooffset_t offset, vm_paddr_t *paddr, 386 int nprot, vm_memattr_t *memattr) 387{ 388 int error; 389 struct vmmdev_softc *sc; 390 391 error = -1; 392 mtx_lock(&vmmdev_mtx); 393 394 sc = vmmdev_lookup2(cdev); 395 if (sc != NULL && (nprot & PROT_EXEC) == 0) { 396 *paddr = vm_gpa2hpa(sc->vm, (vm_paddr_t)offset, PAGE_SIZE); 397 if (*paddr != (vm_paddr_t)-1) 398 error = 0; 399 } 400 401 mtx_unlock(&vmmdev_mtx); 402 403 return (error); 404} 405 406static void 407vmmdev_destroy(struct vmmdev_softc *sc, boolean_t unlink) 408{ 409 410 /* 411 * XXX must stop virtual machine instances that may be still 412 * running and cleanup their state. 413 */ 414 if (sc->cdev) 415 destroy_dev(sc->cdev); 416 417 if (sc->vm) 418 vm_destroy(sc->vm); 419 420 if (unlink) { 421 mtx_lock(&vmmdev_mtx); 422 SLIST_REMOVE(&head, sc, vmmdev_softc, link); 423 mtx_unlock(&vmmdev_mtx); 424 } 425 426 free(sc, M_VMMDEV); 427} 428 429static int 430sysctl_vmm_destroy(SYSCTL_HANDLER_ARGS) 431{ 432 int error; 433 char buf[VM_MAX_NAMELEN]; 434 struct vmmdev_softc *sc; 435 436 strlcpy(buf, "beavis", sizeof(buf)); 437 error = sysctl_handle_string(oidp, buf, sizeof(buf), req); 438 if (error != 0 || req->newptr == NULL) 439 return (error); 440 441 /* 442 * XXX TODO if any process has this device open then fail 443 */ 444 445 mtx_lock(&vmmdev_mtx); 446 sc = vmmdev_lookup(buf); 447 if (sc == NULL) { 448 mtx_unlock(&vmmdev_mtx); 449 return (EINVAL); 450 } 451 452 sc->cdev->si_drv1 = NULL; 453 mtx_unlock(&vmmdev_mtx); 454 455 vmmdev_destroy(sc, TRUE); 456 457 return (0); 458} 459SYSCTL_PROC(_hw_vmm, OID_AUTO, destroy, CTLTYPE_STRING | CTLFLAG_RW, 460 NULL, 0, sysctl_vmm_destroy, "A", NULL); 461 462static struct cdevsw vmmdevsw = { 463 .d_name = "vmmdev", 464 .d_version = D_VERSION, 465 .d_ioctl = vmmdev_ioctl, 466 .d_mmap = vmmdev_mmap, 467 .d_read = vmmdev_rw, 468 .d_write = vmmdev_rw, 469}; 470 471static int 472sysctl_vmm_create(SYSCTL_HANDLER_ARGS) 473{ 474 int error; 475 struct vm *vm; 476 struct vmmdev_softc *sc, *sc2; 477 char buf[VM_MAX_NAMELEN]; 478 479 strlcpy(buf, "beavis", sizeof(buf)); 480 error = sysctl_handle_string(oidp, buf, sizeof(buf), req); 481 if (error != 0 || req->newptr == NULL) 482 return (error); 483 484 mtx_lock(&vmmdev_mtx); 485 sc = vmmdev_lookup(buf); 486 mtx_unlock(&vmmdev_mtx); 487 if (sc != NULL) 488 return (EEXIST); 489 490 vm = vm_create(buf); 491 if (vm == NULL) 492 return (EINVAL); 493 494 sc = malloc(sizeof(struct vmmdev_softc), M_VMMDEV, M_WAITOK | M_ZERO); 495 sc->vm = vm; 496 497 /* 498 * Lookup the name again just in case somebody sneaked in when we 499 * dropped the lock. 500 */ 501 mtx_lock(&vmmdev_mtx); 502 sc2 = vmmdev_lookup(buf); 503 if (sc2 == NULL) 504 SLIST_INSERT_HEAD(&head, sc, link); 505 mtx_unlock(&vmmdev_mtx); 506 507 if (sc2 != NULL) { 508 vmmdev_destroy(sc, FALSE); 509 return (EEXIST); 510 } 511 512 sc->cdev = make_dev(&vmmdevsw, 0, UID_ROOT, GID_WHEEL, 0600, 513 "vmm/%s", buf); 514 sc->cdev->si_drv1 = sc; 515 516 return (0); 517} 518SYSCTL_PROC(_hw_vmm, OID_AUTO, create, CTLTYPE_STRING | CTLFLAG_RW, 519 NULL, 0, sysctl_vmm_create, "A", NULL); 520 521void 522vmmdev_init(void) 523{ 524 mtx_init(&vmmdev_mtx, "vmm device mutex", NULL, MTX_DEF); 525} 526 527int 528vmmdev_cleanup(void) 529{ 530 int error; 531 532 if (SLIST_EMPTY(&head)) 533 error = 0; 534 else 535 error = EBUSY; 536 537 return (error); 538} 539