/*-
 * Copyright (c) 2011 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD: stable/10/sys/amd64/vmm/vmm_dev.c 268935 2014-07-21 02:39:17Z jhb $
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: stable/10/sys/amd64/vmm/vmm_dev.c 268935 2014-07-21 02:39:17Z jhb $");

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/queue.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/malloc.h>
#include <sys/conf.h>
#include <sys/sysctl.h>
#include <sys/libkern.h>
#include <sys/ioccom.h>
#include <sys/mman.h>
#include <sys/uio.h>

#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>

#include <machine/vmparam.h>
#include <machine/vmm.h>
#include <machine/vmm_dev.h>

#include "vmm_lapic.h"
#include "vmm_stat.h"
#include "vmm_mem.h"
#include "io/ppt.h"
#include "io/vatpic.h"
#include "io/vioapic.h"
#include "io/vhpet.h"

struct vmmdev_softc {
	struct vm	*vm;		/* vm instance cookie */
	struct cdev	*cdev;
	SLIST_ENTRY(vmmdev_softc) link;
	int		flags;
};
#define	VSC_LINKED		0x01

static SLIST_HEAD(, vmmdev_softc) head;

static struct mtx vmmdev_mtx;

static MALLOC_DEFINE(M_VMMDEV, "vmmdev", "vmmdev");

SYSCTL_DECL(_hw_vmm);

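/*
 * Look up a softc by VM name; returns NULL if no such VM exists.  The
 * global list is protected by vmmdev_mtx (see the disabled assertion
 * below).
 */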
static struct vmmdev_softc *
vmmdev_lookup(const char *name)
{
	struct vmmdev_softc *sc;

#ifdef notyet	/* XXX kernel is not compiled with invariants */
	mtx_assert(&vmmdev_mtx, MA_OWNED);
#endif

	SLIST_FOREACH(sc, &head, link) {
		if (strcmp(name, vm_name(sc->vm)) == 0)
			break;
	}

	return (sc);
}

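/*
 * Fast-path lookup: the softc is cached in the cdev's si_drv1 field by
 * sysctl_vmm_create(), so no list walk or lock is needed here.
 */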
static struct vmmdev_softc *
vmmdev_lookup2(struct cdev *cdev)
{

	return (cdev->si_drv1);
}

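/*
 * read(2)/write(2) handler for /dev/vmm/<name>.  The file offset is
 * interpreted as a guest physical address and data is copied at most a
 * page at a time through a host mapping obtained from vm_gpa_hold().
 * For example, "dd if=/dev/vmm/myvm of=lowmem.img bs=1m count=1" (run
 * as root, with a hypothetical VM named 'myvm') dumps the first
 * megabyte of guest memory.
 */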
static int
vmmdev_rw(struct cdev *cdev, struct uio *uio, int flags)
{
	int error, off, c, prot;
	vm_paddr_t gpa;
	void *hpa, *cookie;
	struct vmmdev_softc *sc;

	static char zerobuf[PAGE_SIZE];

	error = 0;
	sc = vmmdev_lookup2(cdev);
	if (sc == NULL)
		error = ENXIO;

	prot = (uio->uio_rw == UIO_WRITE ? VM_PROT_WRITE : VM_PROT_READ);
	while (uio->uio_resid > 0 && error == 0) {
		gpa = uio->uio_offset;
		off = gpa & PAGE_MASK;
		c = min(uio->uio_resid, PAGE_SIZE - off);

		/*
		 * The VM has a hole in its physical memory map. If we want to
		 * use 'dd' to inspect memory beyond the hole we need to
		 * provide bogus data for memory that lies in the hole.
		 *
		 * Since this device does not support lseek(2), dd(1) will
		 * read(2) blocks of data to simulate the lseek(2).
		 */
		hpa = vm_gpa_hold(sc->vm, gpa, c, prot, &cookie);
		if (hpa == NULL) {
			if (uio->uio_rw == UIO_READ)
				error = uiomove(zerobuf, c, uio);
			else
				error = EFAULT;
		} else {
			error = uiomove(hpa, c, uio);
			vm_gpa_release(cookie);
		}
	}
	return (error);
}

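/*
 * ioctl(2) handler for /dev/vmm/<name>, the main control interface
 * used by, e.g., bhyve(8) and bhyvectl(8) via libvmmapi.  Ioctls that
 * act on a single vcpu freeze that vcpu first; ioctls that act on the
 * whole VM freeze every vcpu.  Frozen vcpus are returned to VCPU_IDLE
 * before the handler returns.  A minimal userland sketch, bypassing
 * libvmmapi and assuming a VM named 'myvm' already exists (field
 * names follow this file's conventions; see machine/vmm_dev.h for the
 * authoritative layout):
 *
 *	int fd = open("/dev/vmm/myvm", O_RDWR);
 *	struct vm_run vmrun = { .cpuid = 0 };
 *	ioctl(fd, VM_RUN, &vmrun);	(blocks until the vcpu exits)
 */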
static int
vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
	     struct thread *td)
{
	int error, vcpu, state_changed;
	struct vmmdev_softc *sc;
	struct vm_memory_segment *seg;
	struct vm_register *vmreg;
	struct vm_seg_desc *vmsegdesc;
	struct vm_run *vmrun;
	struct vm_exception *vmexc;
	struct vm_lapic_irq *vmirq;
	struct vm_lapic_msi *vmmsi;
	struct vm_ioapic_irq *ioapic_irq;
	struct vm_isa_irq *isa_irq;
	struct vm_capability *vmcap;
	struct vm_pptdev *pptdev;
	struct vm_pptdev_mmio *pptmmio;
	struct vm_pptdev_msi *pptmsi;
	struct vm_pptdev_msix *pptmsix;
	struct vm_nmi *vmnmi;
	struct vm_stats *vmstats;
	struct vm_stat_desc *statdesc;
	struct vm_x2apic *x2apic;
	struct vm_gpa_pte *gpapte;
	struct vm_suspend *vmsuspend;

	sc = vmmdev_lookup2(cdev);
	if (sc == NULL)
		return (ENXIO);

	error = 0;
	vcpu = -1;
	state_changed = 0;

	/*
	 * Some VMM ioctls can operate only on vcpus that are not running.
	 */
	switch (cmd) {
	case VM_RUN:
	case VM_GET_REGISTER:
	case VM_SET_REGISTER:
	case VM_GET_SEGMENT_DESCRIPTOR:
	case VM_SET_SEGMENT_DESCRIPTOR:
	case VM_INJECT_EXCEPTION:
	case VM_GET_CAPABILITY:
	case VM_SET_CAPABILITY:
	case VM_PPTDEV_MSI:
	case VM_PPTDEV_MSIX:
	case VM_SET_X2APIC_STATE:
		/*
		 * XXX fragile, handle with care
		 * Assumes that the first field of the ioctl data is the vcpu.
		 */
		vcpu = *(int *)data;
		if (vcpu < 0 || vcpu >= VM_MAXCPU) {
			error = EINVAL;
			goto done;
		}

		error = vcpu_set_state(sc->vm, vcpu, VCPU_FROZEN, true);
		if (error)
			goto done;

		state_changed = 1;
		break;

	case VM_MAP_PPTDEV_MMIO:
	case VM_BIND_PPTDEV:
	case VM_UNBIND_PPTDEV:
	case VM_MAP_MEMORY:
		/*
		 * ioctls that operate on the entire virtual machine must
		 * prevent all vcpus from running.
		 */
		error = 0;
		for (vcpu = 0; vcpu < VM_MAXCPU; vcpu++) {
			error = vcpu_set_state(sc->vm, vcpu, VCPU_FROZEN, true);
			if (error)
				break;
		}

		if (error) {
			while (--vcpu >= 0)
				vcpu_set_state(sc->vm, vcpu, VCPU_IDLE, false);
			goto done;
		}

		state_changed = 2;
		break;

	default:
		break;
	}

	switch (cmd) {
	case VM_RUN:
		vmrun = (struct vm_run *)data;
		error = vm_run(sc->vm, vmrun);
		break;
	case VM_SUSPEND:
		vmsuspend = (struct vm_suspend *)data;
		error = vm_suspend(sc->vm, vmsuspend->how);
		break;
	case VM_STAT_DESC: {
		statdesc = (struct vm_stat_desc *)data;
		error = vmm_stat_desc_copy(statdesc->index,
					statdesc->desc, sizeof(statdesc->desc));
		break;
	}
	case VM_STATS: {
		CTASSERT(MAX_VM_STATS >= MAX_VMM_STAT_ELEMS);
		vmstats = (struct vm_stats *)data;
		getmicrotime(&vmstats->tv);
		error = vmm_stat_copy(sc->vm, vmstats->cpuid,
				      &vmstats->num_entries, vmstats->statbuf);
		break;
	}
	case VM_PPTDEV_MSI:
		pptmsi = (struct vm_pptdev_msi *)data;
		error = ppt_setup_msi(sc->vm, pptmsi->vcpu,
				      pptmsi->bus, pptmsi->slot, pptmsi->func,
				      pptmsi->addr, pptmsi->msg,
				      pptmsi->numvec);
		break;
	case VM_PPTDEV_MSIX:
		pptmsix = (struct vm_pptdev_msix *)data;
		error = ppt_setup_msix(sc->vm, pptmsix->vcpu,
				       pptmsix->bus, pptmsix->slot,
				       pptmsix->func, pptmsix->idx,
				       pptmsix->addr, pptmsix->msg,
				       pptmsix->vector_control);
		break;
	case VM_MAP_PPTDEV_MMIO:
		pptmmio = (struct vm_pptdev_mmio *)data;
		error = ppt_map_mmio(sc->vm, pptmmio->bus, pptmmio->slot,
				     pptmmio->func, pptmmio->gpa, pptmmio->len,
				     pptmmio->hpa);
		break;
	case VM_BIND_PPTDEV:
		pptdev = (struct vm_pptdev *)data;
		error = vm_assign_pptdev(sc->vm, pptdev->bus, pptdev->slot,
					 pptdev->func);
		break;
	case VM_UNBIND_PPTDEV:
		pptdev = (struct vm_pptdev *)data;
		error = vm_unassign_pptdev(sc->vm, pptdev->bus, pptdev->slot,
					   pptdev->func);
		break;
	case VM_INJECT_EXCEPTION:
		vmexc = (struct vm_exception *)data;
		error = vm_inject_exception(sc->vm, vmexc->cpuid, vmexc);
		break;
	case VM_INJECT_NMI:
		vmnmi = (struct vm_nmi *)data;
		error = vm_inject_nmi(sc->vm, vmnmi->cpuid);
		break;
	case VM_LAPIC_IRQ:
		vmirq = (struct vm_lapic_irq *)data;
		error = lapic_intr_edge(sc->vm, vmirq->cpuid, vmirq->vector);
		break;
	case VM_LAPIC_LOCAL_IRQ:
		vmirq = (struct vm_lapic_irq *)data;
		error = lapic_set_local_intr(sc->vm, vmirq->cpuid,
		    vmirq->vector);
		break;
	case VM_LAPIC_MSI:
		vmmsi = (struct vm_lapic_msi *)data;
		error = lapic_intr_msi(sc->vm, vmmsi->addr, vmmsi->msg);
		break;
	case VM_IOAPIC_ASSERT_IRQ:
		ioapic_irq = (struct vm_ioapic_irq *)data;
		error = vioapic_assert_irq(sc->vm, ioapic_irq->irq);
		break;
	case VM_IOAPIC_DEASSERT_IRQ:
		ioapic_irq = (struct vm_ioapic_irq *)data;
		error = vioapic_deassert_irq(sc->vm, ioapic_irq->irq);
		break;
	case VM_IOAPIC_PULSE_IRQ:
		ioapic_irq = (struct vm_ioapic_irq *)data;
		error = vioapic_pulse_irq(sc->vm, ioapic_irq->irq);
		break;
	case VM_IOAPIC_PINCOUNT:
		*(int *)data = vioapic_pincount(sc->vm);
		break;
	case VM_ISA_ASSERT_IRQ:
		isa_irq = (struct vm_isa_irq *)data;
		error = vatpic_assert_irq(sc->vm, isa_irq->atpic_irq);
		if (error == 0 && isa_irq->ioapic_irq != -1)
			error = vioapic_assert_irq(sc->vm,
			    isa_irq->ioapic_irq);
		break;
	case VM_ISA_DEASSERT_IRQ:
		isa_irq = (struct vm_isa_irq *)data;
		error = vatpic_deassert_irq(sc->vm, isa_irq->atpic_irq);
		if (error == 0 && isa_irq->ioapic_irq != -1)
			error = vioapic_deassert_irq(sc->vm,
			    isa_irq->ioapic_irq);
		break;
	case VM_ISA_PULSE_IRQ:
		isa_irq = (struct vm_isa_irq *)data;
		error = vatpic_pulse_irq(sc->vm, isa_irq->atpic_irq);
		if (error == 0 && isa_irq->ioapic_irq != -1)
			error = vioapic_pulse_irq(sc->vm, isa_irq->ioapic_irq);
		break;
	case VM_MAP_MEMORY:
		seg = (struct vm_memory_segment *)data;
		error = vm_malloc(sc->vm, seg->gpa, seg->len);
		break;
	case VM_GET_MEMORY_SEG:
		seg = (struct vm_memory_segment *)data;
		seg->len = 0;
		(void)vm_gpabase2memseg(sc->vm, seg->gpa, seg);
		error = 0;
		break;
	case VM_GET_REGISTER:
		vmreg = (struct vm_register *)data;
		error = vm_get_register(sc->vm, vmreg->cpuid, vmreg->regnum,
					&vmreg->regval);
		break;
	case VM_SET_REGISTER:
		vmreg = (struct vm_register *)data;
		error = vm_set_register(sc->vm, vmreg->cpuid, vmreg->regnum,
					vmreg->regval);
		break;
	case VM_SET_SEGMENT_DESCRIPTOR:
		vmsegdesc = (struct vm_seg_desc *)data;
		error = vm_set_seg_desc(sc->vm, vmsegdesc->cpuid,
					vmsegdesc->regnum,
					&vmsegdesc->desc);
		break;
	case VM_GET_SEGMENT_DESCRIPTOR:
		vmsegdesc = (struct vm_seg_desc *)data;
		error = vm_get_seg_desc(sc->vm, vmsegdesc->cpuid,
					vmsegdesc->regnum,
					&vmsegdesc->desc);
		break;
	case VM_GET_CAPABILITY:
		vmcap = (struct vm_capability *)data;
		error = vm_get_capability(sc->vm, vmcap->cpuid,
					  vmcap->captype,
					  &vmcap->capval);
		break;
	case VM_SET_CAPABILITY:
		vmcap = (struct vm_capability *)data;
		error = vm_set_capability(sc->vm, vmcap->cpuid,
					  vmcap->captype,
					  vmcap->capval);
		break;
	case VM_SET_X2APIC_STATE:
		x2apic = (struct vm_x2apic *)data;
		error = vm_set_x2apic_state(sc->vm,
					    x2apic->cpuid, x2apic->state);
		break;
	case VM_GET_X2APIC_STATE:
		x2apic = (struct vm_x2apic *)data;
		error = vm_get_x2apic_state(sc->vm,
					    x2apic->cpuid, &x2apic->state);
		break;
	case VM_GET_GPA_PMAP:
		gpapte = (struct vm_gpa_pte *)data;
		pmap_get_mapping(vmspace_pmap(vm_get_vmspace(sc->vm)),
				 gpapte->gpa, gpapte->pte, &gpapte->ptenum);
		error = 0;
		break;
	case VM_GET_HPET_CAPABILITIES:
		error = vhpet_getcap((struct vm_hpet_cap *)data);
		break;
	default:
		error = ENOTTY;
		break;
	}

	if (state_changed == 1) {
		vcpu_set_state(sc->vm, vcpu, VCPU_IDLE, false);
	} else if (state_changed == 2) {
		for (vcpu = 0; vcpu < VM_MAXCPU; vcpu++)
			vcpu_set_state(sc->vm, vcpu, VCPU_IDLE, false);
	}

done:
	/* Make sure that no handler returns a bogus value like ERESTART */
	KASSERT(error >= 0, ("vmmdev_ioctl: invalid error return %d", error));
	return (error);
}

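/*
 * d_mmap_single handler: lets a host process mmap(2) guest physical
 * memory.  PROT_EXEC is refused; otherwise vm_get_memobj() translates
 * the requested range into the VM object backing the guest's memory.
 */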
static int
vmmdev_mmap_single(struct cdev *cdev, vm_ooffset_t *offset,
		   vm_size_t size, struct vm_object **object, int nprot)
{
	int error;
	struct vmmdev_softc *sc;

	sc = vmmdev_lookup2(cdev);
	if (sc != NULL && (nprot & PROT_EXEC) == 0)
		error = vm_get_memobj(sc->vm, *offset, size, offset, object);
	else
		error = EINVAL;

	return (error);
}

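/*
 * Final teardown of a VM: destroy the cdev (unless it was already
 * handed off for asynchronous destruction), destroy the vm instance,
 * and unlink the softc from the global list.  Called from the error
 * paths in sysctl_vmm_create() and, via destroy_dev_sched_cb(), from
 * sysctl_vmm_destroy(), where sc->cdev has been cleared beforehand.
 */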
static void
vmmdev_destroy(void *arg)
{

	struct vmmdev_softc *sc = arg;

	if (sc->cdev != NULL)
		destroy_dev(sc->cdev);

	if (sc->vm != NULL)
		vm_destroy(sc->vm);

	if ((sc->flags & VSC_LINKED) != 0) {
		mtx_lock(&vmmdev_mtx);
		SLIST_REMOVE(&head, sc, vmmdev_softc, link);
		mtx_unlock(&vmmdev_mtx);
	}

	free(sc, M_VMMDEV);
}

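/*
 * Handler for the hw.vmm.destroy sysctl.  Writing a VM name tears that
 * VM down, e.g. "sysctl hw.vmm.destroy=myvm" (assuming a VM named
 * 'myvm' exists).  The dummy string seeded into 'buf' is what reads of
 * the sysctl return.
 */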
static int
sysctl_vmm_destroy(SYSCTL_HANDLER_ARGS)
{
	int error;
	char buf[VM_MAX_NAMELEN];
	struct vmmdev_softc *sc;
	struct cdev *cdev;

	strlcpy(buf, "beavis", sizeof(buf));
	error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
	if (error != 0 || req->newptr == NULL)
		return (error);

	mtx_lock(&vmmdev_mtx);
	sc = vmmdev_lookup(buf);
	if (sc == NULL || sc->cdev == NULL) {
		mtx_unlock(&vmmdev_mtx);
		return (EINVAL);
	}

	/*
	 * The 'cdev' will be destroyed asynchronously when 'si_threadcount'
	 * goes down to 0 so we should not do it again in the callback.
	 */
	cdev = sc->cdev;
	sc->cdev = NULL;
	mtx_unlock(&vmmdev_mtx);

	/*
	 * Schedule the 'cdev' to be destroyed:
	 *
	 * - any new operations on this 'cdev' will return an error (ENXIO).
	 *
	 * - when the 'si_threadcount' dwindles down to zero the 'cdev' will
	 *   be destroyed and the callback will be invoked in a taskqueue
	 *   context.
	 */
	destroy_dev_sched_cb(cdev, vmmdev_destroy, sc);

	return (0);
}
SYSCTL_PROC(_hw_vmm, OID_AUTO, destroy, CTLTYPE_STRING | CTLFLAG_RW,
	    NULL, 0, sysctl_vmm_destroy, "A", NULL);

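/*
 * Character device switch for the /dev/vmm/<name> nodes.  There are no
 * open/close methods; per-VM state hangs off si_drv1 instead.
 */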
static struct cdevsw vmmdevsw = {
	.d_name		= "vmmdev",
	.d_version	= D_VERSION,
	.d_ioctl	= vmmdev_ioctl,
	.d_mmap_single	= vmmdev_mmap_single,
	.d_read		= vmmdev_rw,
	.d_write	= vmmdev_rw,
};

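/*
 * Handler for the hw.vmm.create sysctl.  Writing a name creates a new
 * VM and its /dev/vmm/<name> node, e.g. "sysctl hw.vmm.create=myvm"
 * (with 'myvm' a caller-chosen name).  vm_create() and make_dev_p()
 * may sleep, so vmmdev_mtx is dropped around them and the name is
 * looked up again afterwards to catch a concurrent create of the same
 * VM.
 */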
static int
sysctl_vmm_create(SYSCTL_HANDLER_ARGS)
{
	int error;
	struct vm *vm;
	struct cdev *cdev;
	struct vmmdev_softc *sc, *sc2;
	char buf[VM_MAX_NAMELEN];

	strlcpy(buf, "beavis", sizeof(buf));
	error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
	if (error != 0 || req->newptr == NULL)
		return (error);

	mtx_lock(&vmmdev_mtx);
	sc = vmmdev_lookup(buf);
	mtx_unlock(&vmmdev_mtx);
	if (sc != NULL)
		return (EEXIST);

	error = vm_create(buf, &vm);
	if (error != 0)
		return (error);

	sc = malloc(sizeof(struct vmmdev_softc), M_VMMDEV, M_WAITOK | M_ZERO);
	sc->vm = vm;

	/*
	 * Lookup the name again just in case somebody sneaked in when we
	 * dropped the lock.
	 */
	mtx_lock(&vmmdev_mtx);
	sc2 = vmmdev_lookup(buf);
	if (sc2 == NULL) {
		SLIST_INSERT_HEAD(&head, sc, link);
		sc->flags |= VSC_LINKED;
	}
	mtx_unlock(&vmmdev_mtx);

	if (sc2 != NULL) {
		vmmdev_destroy(sc);
		return (EEXIST);
	}

	error = make_dev_p(MAKEDEV_CHECKNAME, &cdev, &vmmdevsw, NULL,
			   UID_ROOT, GID_WHEEL, 0600, "vmm/%s", buf);
	if (error != 0) {
		vmmdev_destroy(sc);
		return (error);
	}

	mtx_lock(&vmmdev_mtx);
	sc->cdev = cdev;
	sc->cdev->si_drv1 = sc;
	mtx_unlock(&vmmdev_mtx);

	return (0);
}
SYSCTL_PROC(_hw_vmm, OID_AUTO, create, CTLTYPE_STRING | CTLFLAG_RW,
	    NULL, 0, sysctl_vmm_create, "A", NULL);

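/*
 * Module attach/detach hooks.  vmmdev_init() sets up the global mutex;
 * vmmdev_cleanup() reports EBUSY while any VM still exists so that the
 * caller can refuse to unload vmm.ko out from under active guests.
 */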
void
vmmdev_init(void)
{
	mtx_init(&vmmdev_mtx, "vmm device mutex", NULL, MTX_DEF);
}

int
vmmdev_cleanup(void)
{
	int error;

	if (SLIST_EMPTY(&head))
		error = 0;
	else
		error = EBUSY;

	return (error);
}