vmm_dev.c revision 258494
/*-
 * Copyright (c) 2011 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD: head/sys/amd64/vmm/vmm_dev.c 258494 2013-11-23 03:56:03Z neel $
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/amd64/vmm/vmm_dev.c 258494 2013-11-23 03:56:03Z neel $");

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/queue.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/malloc.h>
#include <sys/conf.h>
#include <sys/sysctl.h>
#include <sys/libkern.h>
#include <sys/ioccom.h>
#include <sys/mman.h>
#include <sys/uio.h>

#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>

#include <machine/vmparam.h>

#include <machine/vmm.h>
#include "vmm_lapic.h"
#include "vmm_stat.h"
#include "vmm_mem.h"
#include "io/ppt.h"
#include "io/vioapic.h"
#include <machine/vmm_dev.h>

struct vmmdev_softc {
	struct vm	*vm;		/* vm instance cookie */
	struct cdev	*cdev;
	SLIST_ENTRY(vmmdev_softc) link;
	int		flags;
};
#define	VSC_LINKED		0x01

static SLIST_HEAD(, vmmdev_softc) head;

static struct mtx vmmdev_mtx;

static MALLOC_DEFINE(M_VMMDEV, "vmmdev", "vmmdev");

SYSCTL_DECL(_hw_vmm);

/*
 * Find the softc for the named VM. The caller is expected to hold
 * 'vmmdev_mtx' while the list is traversed.
 */
static struct vmmdev_softc *
vmmdev_lookup(const char *name)
{
	struct vmmdev_softc *sc;

#ifdef notyet	/* XXX kernel is not compiled with invariants */
	mtx_assert(&vmmdev_mtx, MA_OWNED);
#endif

	SLIST_FOREACH(sc, &head, link) {
		if (strcmp(name, vm_name(sc->vm)) == 0)
			break;
	}

	return (sc);
}

/*
 * Fast path: the softc is stashed in the cdev's private data pointer
 * when the device node is created.
 */
static struct vmmdev_softc *
vmmdev_lookup2(struct cdev *cdev)
{

	return (cdev->si_drv1);
}

/*
 * read(2)/write(2) handler for the vmm device. The byte offset into the
 * device is interpreted as a guest physical address and the transfer is
 * done one page at a time.
 */
static int
vmmdev_rw(struct cdev *cdev, struct uio *uio, int flags)
{
	int error, off, c, prot;
	vm_paddr_t gpa;
	void *hpa, *cookie;
	struct vmmdev_softc *sc;

	static char zerobuf[PAGE_SIZE];

	error = 0;
	sc = vmmdev_lookup2(cdev);
	if (sc == NULL)
		error = ENXIO;

	prot = (uio->uio_rw == UIO_WRITE ? VM_PROT_WRITE : VM_PROT_READ);
	while (uio->uio_resid > 0 && error == 0) {
		gpa = uio->uio_offset;
		off = gpa & PAGE_MASK;
		c = min(uio->uio_resid, PAGE_SIZE - off);

		/*
		 * The VM has a hole in its physical memory map. If we want to
		 * use 'dd' to inspect memory beyond the hole we need to
		 * provide bogus data for memory that lies in the hole.
		 *
		 * Since this device does not support lseek(2), dd(1) will
		 * read(2) blocks of data to simulate the lseek(2).
		 */
		hpa = vm_gpa_hold(sc->vm, gpa, c, prot, &cookie);
		if (hpa == NULL) {
			if (uio->uio_rw == UIO_READ)
				error = uiomove(zerobuf, c, uio);
			else
				error = EFAULT;
		} else {
			error = uiomove(hpa, c, uio);
			vm_gpa_release(cookie);
		}
	}
	return (error);
}

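/*
 * Illustrative sketch (not part of this driver): because the read/write
 * handler above interprets the file offset as a guest physical address, a
 * userland tool could inspect guest memory with pread(2) on the device
 * node.  The device path and the gpa below are hypothetical and depend on
 * the name passed to hw.vmm.create and on the guest's memory layout.
 *
 *	int fd = open("/dev/vmm/testvm", O_RDONLY);
 *	char page[4096];
 *	// Read one page of guest physical memory starting at gpa 0x100000.
 *	ssize_t n = pread(fd, page, sizeof(page), 0x100000);
 *
 * Reads that fall into a hole in the guest physical memory map return
 * zero-filled data, as handled in vmmdev_rw() above.
 */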
/*
 * Main ioctl entry point for the vmm device. Per-vcpu ioctls freeze the
 * target vcpu for the duration of the operation; ioctls that operate on
 * the whole virtual machine freeze every vcpu.
 */
static int
vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
	     struct thread *td)
{
	int error, vcpu, state_changed;
	struct vmmdev_softc *sc;
	struct vm_memory_segment *seg;
	struct vm_register *vmreg;
	struct vm_seg_desc *vmsegdesc;
	struct vm_run *vmrun;
	struct vm_event *vmevent;
	struct vm_lapic_irq *vmirq;
	struct vm_ioapic_irq *ioapic_irq;
	struct vm_capability *vmcap;
	struct vm_pptdev *pptdev;
	struct vm_pptdev_mmio *pptmmio;
	struct vm_pptdev_msi *pptmsi;
	struct vm_pptdev_msix *pptmsix;
	struct vm_nmi *vmnmi;
	struct vm_stats *vmstats;
	struct vm_stat_desc *statdesc;
	struct vm_x2apic *x2apic;
	struct vm_gpa_pte *gpapte;

	sc = vmmdev_lookup2(cdev);
	if (sc == NULL)
		return (ENXIO);

	vcpu = -1;
	state_changed = 0;

	/*
	 * Some VMM ioctls can operate only on vcpus that are not running.
	 */
	switch (cmd) {
	case VM_RUN:
	case VM_GET_REGISTER:
	case VM_SET_REGISTER:
	case VM_GET_SEGMENT_DESCRIPTOR:
	case VM_SET_SEGMENT_DESCRIPTOR:
	case VM_INJECT_EVENT:
	case VM_GET_CAPABILITY:
	case VM_SET_CAPABILITY:
	case VM_PPTDEV_MSI:
	case VM_PPTDEV_MSIX:
	case VM_SET_X2APIC_STATE:
		/*
		 * XXX fragile, handle with care
		 * Assumes that the first field of the ioctl data is the vcpu.
		 */
		vcpu = *(int *)data;
		if (vcpu < 0 || vcpu >= VM_MAXCPU) {
			error = EINVAL;
			goto done;
		}

		error = vcpu_set_state(sc->vm, vcpu, VCPU_FROZEN);
		if (error)
			goto done;

		state_changed = 1;
		break;

	case VM_MAP_PPTDEV_MMIO:
	case VM_BIND_PPTDEV:
	case VM_UNBIND_PPTDEV:
	case VM_MAP_MEMORY:
		/*
		 * ioctls that operate on the entire virtual machine must
		 * prevent all vcpus from running.
		 */
		error = 0;
		for (vcpu = 0; vcpu < VM_MAXCPU; vcpu++) {
			error = vcpu_set_state(sc->vm, vcpu, VCPU_FROZEN);
			if (error)
				break;
		}

		if (error) {
			while (--vcpu >= 0)
				vcpu_set_state(sc->vm, vcpu, VCPU_IDLE);
			goto done;
		}

		state_changed = 2;
		break;

	default:
		break;
	}

	switch(cmd) {
	case VM_RUN:
		vmrun = (struct vm_run *)data;
		error = vm_run(sc->vm, vmrun);
		break;
	case VM_STAT_DESC: {
		statdesc = (struct vm_stat_desc *)data;
		error = vmm_stat_desc_copy(statdesc->index,
					statdesc->desc, sizeof(statdesc->desc));
		break;
	}
	case VM_STATS: {
		CTASSERT(MAX_VM_STATS >= MAX_VMM_STAT_ELEMS);
		vmstats = (struct vm_stats *)data;
		getmicrotime(&vmstats->tv);
		error = vmm_stat_copy(sc->vm, vmstats->cpuid,
				      &vmstats->num_entries, vmstats->statbuf);
		break;
	}
	case VM_PPTDEV_MSI:
		pptmsi = (struct vm_pptdev_msi *)data;
		error = ppt_setup_msi(sc->vm, pptmsi->vcpu,
				      pptmsi->bus, pptmsi->slot, pptmsi->func,
				      pptmsi->destcpu, pptmsi->vector,
				      pptmsi->numvec);
		break;
	case VM_PPTDEV_MSIX:
		pptmsix = (struct vm_pptdev_msix *)data;
		error = ppt_setup_msix(sc->vm, pptmsix->vcpu,
				       pptmsix->bus, pptmsix->slot,
				       pptmsix->func, pptmsix->idx,
				       pptmsix->msg, pptmsix->vector_control,
				       pptmsix->addr);
		break;
	case VM_MAP_PPTDEV_MMIO:
		pptmmio = (struct vm_pptdev_mmio *)data;
		error = ppt_map_mmio(sc->vm, pptmmio->bus, pptmmio->slot,
				     pptmmio->func, pptmmio->gpa, pptmmio->len,
				     pptmmio->hpa);
		break;
	case VM_BIND_PPTDEV:
		pptdev = (struct vm_pptdev *)data;
		error = vm_assign_pptdev(sc->vm, pptdev->bus, pptdev->slot,
					 pptdev->func);
		break;
	case VM_UNBIND_PPTDEV:
		pptdev = (struct vm_pptdev *)data;
		error = vm_unassign_pptdev(sc->vm, pptdev->bus, pptdev->slot,
					   pptdev->func);
		break;
	case VM_INJECT_EVENT:
		vmevent = (struct vm_event *)data;
		error = vm_inject_event(sc->vm, vmevent->cpuid, vmevent->type,
					vmevent->vector,
					vmevent->error_code,
					vmevent->error_code_valid);
		break;
	case VM_INJECT_NMI:
		vmnmi = (struct vm_nmi *)data;
		error = vm_inject_nmi(sc->vm, vmnmi->cpuid);
		break;
	case VM_LAPIC_IRQ:
		vmirq = (struct vm_lapic_irq *)data;
		error = lapic_set_intr(sc->vm, vmirq->cpuid, vmirq->vector);
		break;
	case VM_IOAPIC_ASSERT_IRQ:
		ioapic_irq = (struct vm_ioapic_irq *)data;
		error = vioapic_assert_irq(sc->vm, ioapic_irq->irq);
		break;
	case VM_IOAPIC_DEASSERT_IRQ:
		ioapic_irq = (struct vm_ioapic_irq *)data;
		error = vioapic_deassert_irq(sc->vm, ioapic_irq->irq);
		break;
	case VM_IOAPIC_PULSE_IRQ:
		ioapic_irq = (struct vm_ioapic_irq *)data;
		error = vioapic_pulse_irq(sc->vm, ioapic_irq->irq);
		break;
	case VM_MAP_MEMORY:
		seg = (struct vm_memory_segment *)data;
		error = vm_malloc(sc->vm, seg->gpa, seg->len);
		break;
	case VM_GET_MEMORY_SEG:
		seg = (struct vm_memory_segment *)data;
		seg->len = 0;
		(void)vm_gpabase2memseg(sc->vm, seg->gpa, seg);
		error = 0;
		break;
	case VM_GET_REGISTER:
		vmreg = (struct vm_register *)data;
		error = vm_get_register(sc->vm, vmreg->cpuid, vmreg->regnum,
					&vmreg->regval);
		break;
	case VM_SET_REGISTER:
		vmreg = (struct vm_register *)data;
		error = vm_set_register(sc->vm, vmreg->cpuid, vmreg->regnum,
					vmreg->regval);
		break;
	case VM_SET_SEGMENT_DESCRIPTOR:
		vmsegdesc = (struct vm_seg_desc *)data;
		error = vm_set_seg_desc(sc->vm, vmsegdesc->cpuid,
					vmsegdesc->regnum,
					&vmsegdesc->desc);
		break;
	case VM_GET_SEGMENT_DESCRIPTOR:
		vmsegdesc = (struct vm_seg_desc *)data;
		error = vm_get_seg_desc(sc->vm, vmsegdesc->cpuid,
					vmsegdesc->regnum,
					&vmsegdesc->desc);
		break;
	case VM_GET_CAPABILITY:
		vmcap = (struct vm_capability *)data;
		error = vm_get_capability(sc->vm, vmcap->cpuid,
					  vmcap->captype,
					  &vmcap->capval);
		break;
	case VM_SET_CAPABILITY:
		vmcap = (struct vm_capability *)data;
		error = vm_set_capability(sc->vm, vmcap->cpuid,
					  vmcap->captype,
					  vmcap->capval);
		break;
	case VM_SET_X2APIC_STATE:
		x2apic = (struct vm_x2apic *)data;
		error = vm_set_x2apic_state(sc->vm,
					    x2apic->cpuid, x2apic->state);
		break;
	case VM_GET_X2APIC_STATE:
		x2apic = (struct vm_x2apic *)data;
		error = vm_get_x2apic_state(sc->vm,
					    x2apic->cpuid, &x2apic->state);
		break;
	case VM_GET_GPA_PMAP:
		gpapte = (struct vm_gpa_pte *)data;
		pmap_get_mapping(vmspace_pmap(vm_get_vmspace(sc->vm)),
				 gpapte->gpa, gpapte->pte, &gpapte->ptenum);
		error = 0;
		break;
	default:
		error = ENOTTY;
		break;
	}

	if (state_changed == 1) {
		vcpu_set_state(sc->vm, vcpu, VCPU_IDLE);
	} else if (state_changed == 2) {
		for (vcpu = 0; vcpu < VM_MAXCPU; vcpu++)
			vcpu_set_state(sc->vm, vcpu, VCPU_IDLE);
	}

done:
	/* Make sure that no handler returns a bogus value like ERESTART */
	KASSERT(error >= 0, ("vmmdev_ioctl: invalid error return %d", error));
	return (error);
}

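/*
 * Illustrative sketch (not part of this driver): userland drives the ioctls
 * above through the same device node.  For the per-vcpu commands the handler
 * relies on the vcpu id being the first field of the argument structure, so
 * a hypothetical caller of VM_SET_X2APIC_STATE might look like:
 *
 *	struct vm_x2apic x2apic;
 *
 *	memset(&x2apic, 0, sizeof(x2apic));
 *	x2apic.cpuid = 0;			// vcpu id, first field by design
 *	x2apic.state = X2APIC_DISABLED;		// assumed enum value
 *	if (ioctl(fd, VM_SET_X2APIC_STATE, &x2apic) != 0)
 *		err(1, "VM_SET_X2APIC_STATE");
 *
 * 'fd' is an open descriptor for /dev/vmm/<name>; in practice bhyve(8) and
 * bhyvectl(8) go through libvmmapi rather than issuing raw ioctls.
 */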
/*
 * Handle mmap(2) of the vmm device: look up the memory object backing the
 * requested range of guest physical memory so it can be mapped into the
 * calling process. Executable mappings are rejected.
 */
static int
vmmdev_mmap_single(struct cdev *cdev, vm_ooffset_t *offset,
		   vm_size_t size, struct vm_object **object, int nprot)
{
	int error;
	struct vmmdev_softc *sc;

	sc = vmmdev_lookup2(cdev);
	if (sc != NULL && (nprot & PROT_EXEC) == 0)
		error = vm_get_memobj(sc->vm, *offset, size, offset, object);
	else
		error = EINVAL;

	return (error);
}

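/*
 * Illustrative sketch (not part of this driver): with the mmap_single
 * handler above, a userland process can map guest physical memory by
 * mmap(2)ing the device with the guest physical address as the file offset,
 * provided the mapping is not executable and a memory segment has been
 * created at that address via VM_MAP_MEMORY.  A hypothetical mapping of the
 * first 1MB of guest memory:
 *
 *	void *gva = mmap(NULL, 1024 * 1024, PROT_READ | PROT_WRITE,
 *	    MAP_SHARED, fd, 0);
 *
 * This is, roughly, how libvmmapi maps guest memory on behalf of bhyve(8).
 */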
static void
vmmdev_destroy(void *arg)
{

	struct vmmdev_softc *sc = arg;

	if (sc->cdev != NULL)
		destroy_dev(sc->cdev);

	if (sc->vm != NULL)
		vm_destroy(sc->vm);

	if ((sc->flags & VSC_LINKED) != 0) {
		mtx_lock(&vmmdev_mtx);
		SLIST_REMOVE(&head, sc, vmmdev_softc, link);
		mtx_unlock(&vmmdev_mtx);
	}

	free(sc, M_VMMDEV);
}

static int
sysctl_vmm_destroy(SYSCTL_HANDLER_ARGS)
{
	int error;
	char buf[VM_MAX_NAMELEN];
	struct vmmdev_softc *sc;
	struct cdev *cdev;

	strlcpy(buf, "beavis", sizeof(buf));
	error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
	if (error != 0 || req->newptr == NULL)
		return (error);

	mtx_lock(&vmmdev_mtx);
	sc = vmmdev_lookup(buf);
	if (sc == NULL || sc->cdev == NULL) {
		mtx_unlock(&vmmdev_mtx);
		return (EINVAL);
	}

	/*
	 * The 'cdev' will be destroyed asynchronously when 'si_threadcount'
	 * goes down to 0 so we should not do it again in the callback.
	 */
	cdev = sc->cdev;
	sc->cdev = NULL;
	mtx_unlock(&vmmdev_mtx);

	/*
	 * Schedule the 'cdev' to be destroyed:
	 *
	 * - any new operations on this 'cdev' will return an error (ENXIO).
	 *
	 * - when the 'si_threadcount' dwindles down to zero the 'cdev' will
	 *   be destroyed and the callback will be invoked in a taskqueue
	 *   context.
	 */
	destroy_dev_sched_cb(cdev, vmmdev_destroy, sc);

	return (0);
}
SYSCTL_PROC(_hw_vmm, OID_AUTO, destroy, CTLTYPE_STRING | CTLFLAG_RW,
	    NULL, 0, sysctl_vmm_destroy, "A", NULL);

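/*
 * Illustrative sketch (not part of this driver): the handler above is driven
 * by writing a VM name to the hw.vmm.destroy sysctl, e.g. from C:
 *
 *	const char *name = "testvm";		// hypothetical VM name
 *
 *	if (sysctlbyname("hw.vmm.destroy", NULL, NULL, name,
 *	    strlen(name)) != 0)
 *		err(1, "hw.vmm.destroy");
 *
 * The equivalent from the shell is 'sysctl hw.vmm.destroy=testvm'.
 */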
static struct cdevsw vmmdevsw = {
	.d_name		= "vmmdev",
	.d_version	= D_VERSION,
	.d_ioctl	= vmmdev_ioctl,
	.d_mmap_single	= vmmdev_mmap_single,
	.d_read		= vmmdev_rw,
	.d_write	= vmmdev_rw,
};

static int
sysctl_vmm_create(SYSCTL_HANDLER_ARGS)
{
	int error;
	struct vm *vm;
	struct cdev *cdev;
	struct vmmdev_softc *sc, *sc2;
	char buf[VM_MAX_NAMELEN];

	strlcpy(buf, "beavis", sizeof(buf));
	error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
	if (error != 0 || req->newptr == NULL)
		return (error);

	mtx_lock(&vmmdev_mtx);
	sc = vmmdev_lookup(buf);
	mtx_unlock(&vmmdev_mtx);
	if (sc != NULL)
		return (EEXIST);

	error = vm_create(buf, &vm);
	if (error != 0)
		return (error);

	sc = malloc(sizeof(struct vmmdev_softc), M_VMMDEV, M_WAITOK | M_ZERO);
	sc->vm = vm;

	/*
	 * Lookup the name again just in case somebody sneaked in when we
	 * dropped the lock.
	 */
	mtx_lock(&vmmdev_mtx);
	sc2 = vmmdev_lookup(buf);
	if (sc2 == NULL) {
		SLIST_INSERT_HEAD(&head, sc, link);
		sc->flags |= VSC_LINKED;
	}
	mtx_unlock(&vmmdev_mtx);

	if (sc2 != NULL) {
		vmmdev_destroy(sc);
		return (EEXIST);
	}

	error = make_dev_p(MAKEDEV_CHECKNAME, &cdev, &vmmdevsw, NULL,
			   UID_ROOT, GID_WHEEL, 0600, "vmm/%s", buf);
	if (error != 0) {
		vmmdev_destroy(sc);
		return (error);
	}

	mtx_lock(&vmmdev_mtx);
	sc->cdev = cdev;
	sc->cdev->si_drv1 = sc;
	mtx_unlock(&vmmdev_mtx);

	return (0);
}
SYSCTL_PROC(_hw_vmm, OID_AUTO, create, CTLTYPE_STRING | CTLFLAG_RW,
	    NULL, 0, sysctl_vmm_create, "A", NULL);

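/*
 * Illustrative sketch (not part of this driver): creating a VM is the mirror
 * image of destroying one.  Writing a name to hw.vmm.create runs the handler
 * above and produces /dev/vmm/<name>:
 *
 *	const char *name = "testvm";		// hypothetical VM name
 *
 *	if (sysctlbyname("hw.vmm.create", NULL, NULL, name,
 *	    strlen(name)) != 0)
 *		err(1, "hw.vmm.create");
 *
 * The shell equivalent is 'sysctl hw.vmm.create=testvm'; the new node can
 * then be opened and used with the read/write, ioctl and mmap entry points
 * defined earlier in this file.
 */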
/* Initialize the global state of the vmm device (the softc list mutex). */
void
vmmdev_init(void)
{
	mtx_init(&vmmdev_mtx, "vmm device mutex", NULL, MTX_DEF);
}

/* Report EBUSY while any VM device instances still exist. */
int
vmmdev_cleanup(void)
{
	int error;

	if (SLIST_EMPTY(&head))
		error = 0;
	else
		error = EBUSY;

	return (error);
}