/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2011 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/jail.h>
#include <sys/queue.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/malloc.h>
#include <sys/conf.h>
#include <sys/sysctl.h>
#include <sys/libkern.h>
#include <sys/ioccom.h>
#include <sys/mman.h>
#include <sys/uio.h>
#include <sys/proc.h>

#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>

#include <machine/vmparam.h>
#include <machine/vmm.h>
#include <machine/vmm_instruction_emul.h>
#include <machine/vmm_dev.h>

#include "vmm_lapic.h"
#include "vmm_stat.h"
#include "vmm_mem.h"
#include "io/ppt.h"
#include "io/vatpic.h"
#include "io/vioapic.h"
#include "io/vhpet.h"
#include "io/vrtc.h"

struct devmem_softc {
	int	segid;
	char	*name;
	struct cdev *cdev;
	struct vmmdev_softc *sc;
	SLIST_ENTRY(devmem_softc) link;
};

struct vmmdev_softc {
	struct vm	*vm;		/* vm instance cookie */
	struct cdev	*cdev;
	SLIST_ENTRY(vmmdev_softc) link;
	SLIST_HEAD(, devmem_softc) devmem;
	int		flags;
};
#define	VSC_LINKED		0x01

static SLIST_HEAD(, vmmdev_softc) head;

static unsigned pr_allow_flag;
static struct mtx vmmdev_mtx;

static MALLOC_DEFINE(M_VMMDEV, "vmmdev", "vmmdev");

SYSCTL_DECL(_hw_vmm);

static int vmm_priv_check(struct ucred *ucred);
static int devmem_create_cdev(const char *vmname, int id, char *devmem);
static void devmem_destroy(void *arg);

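/*
 * Access check shared by the vmm cdevs and sysctl handlers: jailed processes
 * may use vmm(4) only if their prison grants the allow.vmm permission that
 * is registered in vmmdev_init() below.
 */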
static int
vmm_priv_check(struct ucred *ucred)
{

	if (jailed(ucred) &&
	    !(ucred->cr_prison->pr_allow & pr_allow_flag))
		return (EPERM);

	return (0);
}

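/*
 * Freeze a single vcpu (transition it to VCPU_FROZEN) so that its state can
 * be inspected or modified while it is guaranteed not to be running.
 */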
static int
vcpu_lock_one(struct vmmdev_softc *sc, int vcpu)
{
	int error;

	if (vcpu < 0 || vcpu >= vm_get_maxcpus(sc->vm))
		return (EINVAL);

	error = vcpu_set_state(sc->vm, vcpu, VCPU_FROZEN, true);
	return (error);
}

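/*
 * Undo vcpu_lock_one(): move a vcpu that is expected to be frozen back to
 * VCPU_IDLE.  Panics if the vcpu is not in the VCPU_FROZEN state.
 */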
static void
vcpu_unlock_one(struct vmmdev_softc *sc, int vcpu)
{
	enum vcpu_state state;

	state = vcpu_get_state(sc->vm, vcpu, NULL);
	if (state != VCPU_FROZEN) {
		panic("vcpu %s(%d) has invalid state %d", vm_name(sc->vm),
		    vcpu, state);
	}

	vcpu_set_state(sc->vm, vcpu, VCPU_IDLE, false);
}

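/*
 * Freeze every vcpu of the virtual machine.  On failure any vcpus that were
 * already frozen are unwound back to VCPU_IDLE.
 */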
static int
vcpu_lock_all(struct vmmdev_softc *sc)
{
	int error, vcpu;
	uint16_t maxcpus;

	error = 0;
	maxcpus = vm_get_maxcpus(sc->vm);
	for (vcpu = 0; vcpu < maxcpus; vcpu++) {
		error = vcpu_lock_one(sc, vcpu);
		if (error)
			break;
	}

	if (error) {
		while (--vcpu >= 0)
			vcpu_unlock_one(sc, vcpu);
	}

	return (error);
}

static void
vcpu_unlock_all(struct vmmdev_softc *sc)
{
	int vcpu;
	uint16_t maxcpus;

	maxcpus = vm_get_maxcpus(sc->vm);
	for (vcpu = 0; vcpu < maxcpus; vcpu++)
		vcpu_unlock_one(sc, vcpu);
}

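/*
 * Find the softc for a virtual machine by name.  Callers are expected to
 * hold vmmdev_mtx; the assertion is disabled (see the XXX comment) because
 * the kernel may not be built with INVARIANTS.
 */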
static struct vmmdev_softc *
vmmdev_lookup(const char *name)
{
	struct vmmdev_softc *sc;

#ifdef notyet	/* XXX kernel is not compiled with invariants */
	mtx_assert(&vmmdev_mtx, MA_OWNED);
#endif

	SLIST_FOREACH(sc, &head, link) {
		if (strcmp(name, vm_name(sc->vm)) == 0)
			break;
	}

	return (sc);
}

static struct vmmdev_softc *
vmmdev_lookup2(struct cdev *cdev)
{

	return (cdev->si_drv1);
}

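/*
 * read(2)/write(2) handler for /dev/vmm/<name>.  The uio offset is treated
 * as a guest physical address and data is copied a page at a time between
 * the caller and guest memory.  Reads from holes in the guest physical map
 * that lie below the top of system memory return zeroes so that dd(1) can
 * step past them (the device does not support lseek(2)).
 */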
static int
vmmdev_rw(struct cdev *cdev, struct uio *uio, int flags)
{
	int error, off, c, prot;
	vm_paddr_t gpa, maxaddr;
	void *hpa, *cookie;
	struct vmmdev_softc *sc;
	uint16_t lastcpu;

	error = vmm_priv_check(curthread->td_ucred);
	if (error)
		return (error);

	sc = vmmdev_lookup2(cdev);
	if (sc == NULL)
		return (ENXIO);

	/*
	 * Get a read lock on the guest memory map by freezing any vcpu.
	 */
	lastcpu = vm_get_maxcpus(sc->vm) - 1;
	error = vcpu_lock_one(sc, lastcpu);
	if (error)
		return (error);

	prot = (uio->uio_rw == UIO_WRITE ? VM_PROT_WRITE : VM_PROT_READ);
	maxaddr = vmm_sysmem_maxaddr(sc->vm);
	while (uio->uio_resid > 0 && error == 0) {
		gpa = uio->uio_offset;
		off = gpa & PAGE_MASK;
		c = min(uio->uio_resid, PAGE_SIZE - off);

		/*
		 * The VM has a hole in its physical memory map. If we want to
		 * use 'dd' to inspect memory beyond the hole we need to
		 * provide bogus data for memory that lies in the hole.
		 *
		 * Since this device does not support lseek(2), dd(1) will
		 * read(2) blocks of data to simulate the lseek(2).
		 */
		hpa = vm_gpa_hold(sc->vm, lastcpu, gpa, c,
		    prot, &cookie);
		if (hpa == NULL) {
			if (uio->uio_rw == UIO_READ && gpa < maxaddr)
				error = uiomove(__DECONST(void *, zero_region),
				    c, uio);
			else
				error = EFAULT;
		} else {
			error = uiomove(hpa, c, uio);
			vm_gpa_release(cookie);
		}
	}
	vcpu_unlock_one(sc, lastcpu);
	return (error);
}

CTASSERT(sizeof(((struct vm_memseg *)0)->name) >= SPECNAMELEN + 1);

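/*
 * VM_GET_MEMSEG handler: report the length of a memory segment and, for a
 * device memory segment, the name under which its cdev was created.
 */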
static int
get_memseg(struct vmmdev_softc *sc, struct vm_memseg *mseg)
{
	struct devmem_softc *dsc;
	int error;
	bool sysmem;

	error = vm_get_memseg(sc->vm, mseg->segid, &mseg->len, &sysmem, NULL);
	if (error || mseg->len == 0)
		return (error);

	if (!sysmem) {
		SLIST_FOREACH(dsc, &sc->devmem, link) {
			if (dsc->segid == mseg->segid)
				break;
		}
		KASSERT(dsc != NULL, ("%s: devmem segment %d not found",
		    __func__, mseg->segid));
		error = copystr(dsc->name, mseg->name, SPECNAMELEN + 1, NULL);
	} else {
		bzero(mseg->name, sizeof(mseg->name));
	}

	return (error);
}

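/*
 * VM_ALLOC_MEMSEG handler: allocate a system memory segment or, if a name is
 * supplied, a device memory segment that is also exposed to userspace as a
 * cdev under /dev/vmm.io/ (see devmem_create_cdev() below).
 */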
static int
alloc_memseg(struct vmmdev_softc *sc, struct vm_memseg *mseg)
{
	char *name;
	int error;
	bool sysmem;

	error = 0;
	name = NULL;
	sysmem = true;

	if (VM_MEMSEG_NAME(mseg)) {
		sysmem = false;
		name = malloc(SPECNAMELEN + 1, M_VMMDEV, M_WAITOK);
		error = copystr(mseg->name, name, SPECNAMELEN + 1, 0);
		if (error)
			goto done;
	}

	error = vm_alloc_memseg(sc->vm, mseg->segid, mseg->len, sysmem);
	if (error)
		goto done;

	if (VM_MEMSEG_NAME(mseg)) {
		error = devmem_create_cdev(vm_name(sc->vm), mseg->segid, name);
		if (error)
			vm_free_memseg(sc->vm, mseg->segid);
		else
			name = NULL;	/* freed when 'cdev' is destroyed */
	}
done:
	free(name, M_VMMDEV);
	return (error);
}

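/*
 * Helpers for the VM_GET_REGISTER_SET and VM_SET_REGISTER_SET ioctls: fetch
 * or store a batch of registers, stopping at the first error.
 */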
static int
vm_get_register_set(struct vm *vm, int vcpu, unsigned int count, int *regnum,
    uint64_t *regval)
{
	int error, i;

	error = 0;
	for (i = 0; i < count; i++) {
		error = vm_get_register(vm, vcpu, regnum[i], &regval[i]);
		if (error)
			break;
	}
	return (error);
}

static int
vm_set_register_set(struct vm *vm, int vcpu, unsigned int count, int *regnum,
    uint64_t *regval)
{
	int error, i;

	error = 0;
	for (i = 0; i < count; i++) {
		error = vm_set_register(vm, vcpu, regnum[i], regval[i]);
		if (error)
			break;
	}
	return (error);
}

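/*
 * ioctl(2) handler for /dev/vmm/<name>.  Ioctls that operate on a single
 * vcpu freeze that vcpu for the duration of the call; ioctls that affect the
 * whole virtual machine (memory segment and passthru changes, reinit) freeze
 * all vcpus; VM_GET_MEMSEG and VM_MMAP_GETNEXT freeze one vcpu to read-lock
 * the memory map.  Everything else runs without changing vcpu state.
 */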
static int
vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
	     struct thread *td)
{
	int error, vcpu, state_changed, size;
	cpuset_t *cpuset;
	struct vmmdev_softc *sc;
	struct vm_register *vmreg;
	struct vm_seg_desc *vmsegdesc;
	struct vm_register_set *vmregset;
	struct vm_run *vmrun;
	struct vm_exception *vmexc;
	struct vm_lapic_irq *vmirq;
	struct vm_lapic_msi *vmmsi;
	struct vm_ioapic_irq *ioapic_irq;
	struct vm_isa_irq *isa_irq;
	struct vm_isa_irq_trigger *isa_irq_trigger;
	struct vm_capability *vmcap;
	struct vm_pptdev *pptdev;
	struct vm_pptdev_mmio *pptmmio;
	struct vm_pptdev_msi *pptmsi;
	struct vm_pptdev_msix *pptmsix;
	struct vm_nmi *vmnmi;
	struct vm_stats *vmstats;
	struct vm_stat_desc *statdesc;
	struct vm_x2apic *x2apic;
	struct vm_gpa_pte *gpapte;
	struct vm_suspend *vmsuspend;
	struct vm_gla2gpa *gg;
	struct vm_activate_cpu *vac;
	struct vm_cpuset *vm_cpuset;
	struct vm_intinfo *vmii;
	struct vm_rtc_time *rtctime;
	struct vm_rtc_data *rtcdata;
	struct vm_memmap *mm;
	struct vm_cpu_topology *topology;
	uint64_t *regvals;
	int *regnums;

	error = vmm_priv_check(curthread->td_ucred);
	if (error)
		return (error);

	sc = vmmdev_lookup2(cdev);
	if (sc == NULL)
		return (ENXIO);

	vcpu = -1;
	state_changed = 0;

	/*
	 * Some VMM ioctls can operate only on vcpus that are not running.
	 */
	switch (cmd) {
	case VM_RUN:
	case VM_GET_REGISTER:
	case VM_SET_REGISTER:
	case VM_GET_SEGMENT_DESCRIPTOR:
	case VM_SET_SEGMENT_DESCRIPTOR:
	case VM_GET_REGISTER_SET:
	case VM_SET_REGISTER_SET:
	case VM_INJECT_EXCEPTION:
	case VM_GET_CAPABILITY:
	case VM_SET_CAPABILITY:
	case VM_PPTDEV_MSI:
	case VM_PPTDEV_MSIX:
	case VM_SET_X2APIC_STATE:
	case VM_GLA2GPA:
	case VM_GLA2GPA_NOFAULT:
	case VM_ACTIVATE_CPU:
	case VM_SET_INTINFO:
	case VM_GET_INTINFO:
	case VM_RESTART_INSTRUCTION:
		/*
		 * XXX fragile, handle with care
		 * Assumes that the first field of the ioctl data is the vcpu.
		 */
		vcpu = *(int *)data;
		error = vcpu_lock_one(sc, vcpu);
		if (error)
			goto done;
		state_changed = 1;
		break;

	case VM_MAP_PPTDEV_MMIO:
	case VM_BIND_PPTDEV:
	case VM_UNBIND_PPTDEV:
	case VM_ALLOC_MEMSEG:
	case VM_MMAP_MEMSEG:
	case VM_REINIT:
		/*
		 * ioctls that operate on the entire virtual machine must
		 * prevent all vcpus from running.
		 */
		error = vcpu_lock_all(sc);
		if (error)
			goto done;
		state_changed = 2;
		break;

	case VM_GET_MEMSEG:
	case VM_MMAP_GETNEXT:
		/*
		 * Lock a vcpu to make sure that the memory map cannot be
		 * modified while it is being inspected.
		 */
		vcpu = vm_get_maxcpus(sc->vm) - 1;
		error = vcpu_lock_one(sc, vcpu);
		if (error)
			goto done;
		state_changed = 1;
		break;

	default:
		break;
	}

	switch(cmd) {
	case VM_RUN:
		vmrun = (struct vm_run *)data;
		error = vm_run(sc->vm, vmrun);
		break;
	case VM_SUSPEND:
		vmsuspend = (struct vm_suspend *)data;
		error = vm_suspend(sc->vm, vmsuspend->how);
		break;
	case VM_REINIT:
		error = vm_reinit(sc->vm);
		break;
	case VM_STAT_DESC: {
		statdesc = (struct vm_stat_desc *)data;
		error = vmm_stat_desc_copy(statdesc->index,
					statdesc->desc, sizeof(statdesc->desc));
		break;
	}
	case VM_STATS: {
		CTASSERT(MAX_VM_STATS >= MAX_VMM_STAT_ELEMS);
		vmstats = (struct vm_stats *)data;
		getmicrotime(&vmstats->tv);
		error = vmm_stat_copy(sc->vm, vmstats->cpuid,
				      &vmstats->num_entries, vmstats->statbuf);
		break;
	}
	case VM_PPTDEV_MSI:
		pptmsi = (struct vm_pptdev_msi *)data;
		error = ppt_setup_msi(sc->vm, pptmsi->vcpu,
				      pptmsi->bus, pptmsi->slot, pptmsi->func,
				      pptmsi->addr, pptmsi->msg,
				      pptmsi->numvec);
		break;
	case VM_PPTDEV_MSIX:
		pptmsix = (struct vm_pptdev_msix *)data;
		error = ppt_setup_msix(sc->vm, pptmsix->vcpu,
				       pptmsix->bus, pptmsix->slot,
				       pptmsix->func, pptmsix->idx,
				       pptmsix->addr, pptmsix->msg,
				       pptmsix->vector_control);
		break;
	case VM_PPTDEV_DISABLE_MSIX:
		pptdev = (struct vm_pptdev *)data;
		error = ppt_disable_msix(sc->vm, pptdev->bus, pptdev->slot,
					 pptdev->func);
		break;
	case VM_MAP_PPTDEV_MMIO:
		pptmmio = (struct vm_pptdev_mmio *)data;
		error = ppt_map_mmio(sc->vm, pptmmio->bus, pptmmio->slot,
				     pptmmio->func, pptmmio->gpa, pptmmio->len,
				     pptmmio->hpa);
		break;
	case VM_BIND_PPTDEV:
		pptdev = (struct vm_pptdev *)data;
		error = vm_assign_pptdev(sc->vm, pptdev->bus, pptdev->slot,
					 pptdev->func);
		break;
	case VM_UNBIND_PPTDEV:
		pptdev = (struct vm_pptdev *)data;
		error = vm_unassign_pptdev(sc->vm, pptdev->bus, pptdev->slot,
					   pptdev->func);
		break;
	case VM_INJECT_EXCEPTION:
		vmexc = (struct vm_exception *)data;
		error = vm_inject_exception(sc->vm, vmexc->cpuid,
		    vmexc->vector, vmexc->error_code_valid, vmexc->error_code,
		    vmexc->restart_instruction);
		break;
	case VM_INJECT_NMI:
		vmnmi = (struct vm_nmi *)data;
		error = vm_inject_nmi(sc->vm, vmnmi->cpuid);
		break;
	case VM_LAPIC_IRQ:
		vmirq = (struct vm_lapic_irq *)data;
		error = lapic_intr_edge(sc->vm, vmirq->cpuid, vmirq->vector);
		break;
	case VM_LAPIC_LOCAL_IRQ:
		vmirq = (struct vm_lapic_irq *)data;
		error = lapic_set_local_intr(sc->vm, vmirq->cpuid,
		    vmirq->vector);
		break;
	case VM_LAPIC_MSI:
		vmmsi = (struct vm_lapic_msi *)data;
		error = lapic_intr_msi(sc->vm, vmmsi->addr, vmmsi->msg);
		break;
	case VM_IOAPIC_ASSERT_IRQ:
		ioapic_irq = (struct vm_ioapic_irq *)data;
		error = vioapic_assert_irq(sc->vm, ioapic_irq->irq);
		break;
	case VM_IOAPIC_DEASSERT_IRQ:
		ioapic_irq = (struct vm_ioapic_irq *)data;
		error = vioapic_deassert_irq(sc->vm, ioapic_irq->irq);
		break;
	case VM_IOAPIC_PULSE_IRQ:
		ioapic_irq = (struct vm_ioapic_irq *)data;
		error = vioapic_pulse_irq(sc->vm, ioapic_irq->irq);
		break;
	case VM_IOAPIC_PINCOUNT:
		*(int *)data = vioapic_pincount(sc->vm);
		break;
	case VM_ISA_ASSERT_IRQ:
		isa_irq = (struct vm_isa_irq *)data;
		error = vatpic_assert_irq(sc->vm, isa_irq->atpic_irq);
		if (error == 0 && isa_irq->ioapic_irq != -1)
			error = vioapic_assert_irq(sc->vm,
			    isa_irq->ioapic_irq);
		break;
	case VM_ISA_DEASSERT_IRQ:
		isa_irq = (struct vm_isa_irq *)data;
		error = vatpic_deassert_irq(sc->vm, isa_irq->atpic_irq);
		if (error == 0 && isa_irq->ioapic_irq != -1)
			error = vioapic_deassert_irq(sc->vm,
			    isa_irq->ioapic_irq);
		break;
	case VM_ISA_PULSE_IRQ:
		isa_irq = (struct vm_isa_irq *)data;
		error = vatpic_pulse_irq(sc->vm, isa_irq->atpic_irq);
		if (error == 0 && isa_irq->ioapic_irq != -1)
			error = vioapic_pulse_irq(sc->vm, isa_irq->ioapic_irq);
		break;
	case VM_ISA_SET_IRQ_TRIGGER:
		isa_irq_trigger = (struct vm_isa_irq_trigger *)data;
		error = vatpic_set_irq_trigger(sc->vm,
		    isa_irq_trigger->atpic_irq, isa_irq_trigger->trigger);
		break;
	case VM_MMAP_GETNEXT:
		mm = (struct vm_memmap *)data;
		error = vm_mmap_getnext(sc->vm, &mm->gpa, &mm->segid,
		    &mm->segoff, &mm->len, &mm->prot, &mm->flags);
		break;
	case VM_MMAP_MEMSEG:
		mm = (struct vm_memmap *)data;
		error = vm_mmap_memseg(sc->vm, mm->gpa, mm->segid, mm->segoff,
		    mm->len, mm->prot, mm->flags);
		break;
	case VM_ALLOC_MEMSEG:
		error = alloc_memseg(sc, (struct vm_memseg *)data);
		break;
	case VM_GET_MEMSEG:
		error = get_memseg(sc, (struct vm_memseg *)data);
		break;
	case VM_GET_REGISTER:
		vmreg = (struct vm_register *)data;
		error = vm_get_register(sc->vm, vmreg->cpuid, vmreg->regnum,
					&vmreg->regval);
		break;
	case VM_SET_REGISTER:
		vmreg = (struct vm_register *)data;
		error = vm_set_register(sc->vm, vmreg->cpuid, vmreg->regnum,
					vmreg->regval);
		break;
	case VM_SET_SEGMENT_DESCRIPTOR:
		vmsegdesc = (struct vm_seg_desc *)data;
		error = vm_set_seg_desc(sc->vm, vmsegdesc->cpuid,
					vmsegdesc->regnum,
					&vmsegdesc->desc);
		break;
	case VM_GET_SEGMENT_DESCRIPTOR:
		vmsegdesc = (struct vm_seg_desc *)data;
		error = vm_get_seg_desc(sc->vm, vmsegdesc->cpuid,
					vmsegdesc->regnum,
					&vmsegdesc->desc);
		break;
	case VM_GET_REGISTER_SET:
		vmregset = (struct vm_register_set *)data;
		if (vmregset->count > VM_REG_LAST) {
			error = EINVAL;
			break;
		}
		regvals = malloc(sizeof(regvals[0]) * vmregset->count, M_VMMDEV,
		    M_WAITOK);
		regnums = malloc(sizeof(regnums[0]) * vmregset->count, M_VMMDEV,
		    M_WAITOK);
		error = copyin(vmregset->regnums, regnums, sizeof(regnums[0]) *
		    vmregset->count);
		if (error == 0)
			error = vm_get_register_set(sc->vm, vmregset->cpuid,
			    vmregset->count, regnums, regvals);
		if (error == 0)
			error = copyout(regvals, vmregset->regvals,
			    sizeof(regvals[0]) * vmregset->count);
		free(regvals, M_VMMDEV);
		free(regnums, M_VMMDEV);
		break;
	case VM_SET_REGISTER_SET:
		vmregset = (struct vm_register_set *)data;
		if (vmregset->count > VM_REG_LAST) {
			error = EINVAL;
			break;
		}
		regvals = malloc(sizeof(regvals[0]) * vmregset->count, M_VMMDEV,
		    M_WAITOK);
		regnums = malloc(sizeof(regnums[0]) * vmregset->count, M_VMMDEV,
		    M_WAITOK);
		error = copyin(vmregset->regnums, regnums, sizeof(regnums[0]) *
		    vmregset->count);
		if (error == 0)
			error = copyin(vmregset->regvals, regvals,
			    sizeof(regvals[0]) * vmregset->count);
		if (error == 0)
			error = vm_set_register_set(sc->vm, vmregset->cpuid,
			    vmregset->count, regnums, regvals);
		free(regvals, M_VMMDEV);
		free(regnums, M_VMMDEV);
		break;
	case VM_GET_CAPABILITY:
		vmcap = (struct vm_capability *)data;
		error = vm_get_capability(sc->vm, vmcap->cpuid,
					  vmcap->captype,
					  &vmcap->capval);
		break;
	case VM_SET_CAPABILITY:
		vmcap = (struct vm_capability *)data;
		error = vm_set_capability(sc->vm, vmcap->cpuid,
					  vmcap->captype,
					  vmcap->capval);
		break;
	case VM_SET_X2APIC_STATE:
		x2apic = (struct vm_x2apic *)data;
		error = vm_set_x2apic_state(sc->vm,
					    x2apic->cpuid, x2apic->state);
		break;
	case VM_GET_X2APIC_STATE:
		x2apic = (struct vm_x2apic *)data;
		error = vm_get_x2apic_state(sc->vm,
					    x2apic->cpuid, &x2apic->state);
		break;
	case VM_GET_GPA_PMAP:
		gpapte = (struct vm_gpa_pte *)data;
		pmap_get_mapping(vmspace_pmap(vm_get_vmspace(sc->vm)),
				 gpapte->gpa, gpapte->pte, &gpapte->ptenum);
		error = 0;
		break;
	case VM_GET_HPET_CAPABILITIES:
		error = vhpet_getcap((struct vm_hpet_cap *)data);
		break;
	case VM_GLA2GPA: {
		CTASSERT(PROT_READ == VM_PROT_READ);
		CTASSERT(PROT_WRITE == VM_PROT_WRITE);
		CTASSERT(PROT_EXEC == VM_PROT_EXECUTE);
		gg = (struct vm_gla2gpa *)data;
		error = vm_gla2gpa(sc->vm, gg->vcpuid, &gg->paging, gg->gla,
		    gg->prot, &gg->gpa, &gg->fault);
		KASSERT(error == 0 || error == EFAULT,
		    ("%s: vm_gla2gpa unknown error %d", __func__, error));
		break;
	}
	case VM_GLA2GPA_NOFAULT:
		gg = (struct vm_gla2gpa *)data;
		error = vm_gla2gpa_nofault(sc->vm, gg->vcpuid, &gg->paging,
		    gg->gla, gg->prot, &gg->gpa, &gg->fault);
		KASSERT(error == 0 || error == EFAULT,
		    ("%s: vm_gla2gpa unknown error %d", __func__, error));
		break;
	case VM_ACTIVATE_CPU:
		vac = (struct vm_activate_cpu *)data;
		error = vm_activate_cpu(sc->vm, vac->vcpuid);
		break;
	case VM_GET_CPUS:
		error = 0;
		vm_cpuset = (struct vm_cpuset *)data;
		size = vm_cpuset->cpusetsize;
		if (size < sizeof(cpuset_t) || size > CPU_MAXSIZE / NBBY) {
			error = ERANGE;
			break;
		}
		cpuset = malloc(size, M_TEMP, M_WAITOK | M_ZERO);
		if (vm_cpuset->which == VM_ACTIVE_CPUS)
			*cpuset = vm_active_cpus(sc->vm);
		else if (vm_cpuset->which == VM_SUSPENDED_CPUS)
			*cpuset = vm_suspended_cpus(sc->vm);
		else if (vm_cpuset->which == VM_DEBUG_CPUS)
			*cpuset = vm_debug_cpus(sc->vm);
		else
			error = EINVAL;
		if (error == 0)
			error = copyout(cpuset, vm_cpuset->cpus, size);
		free(cpuset, M_TEMP);
		break;
	case VM_SUSPEND_CPU:
		vac = (struct vm_activate_cpu *)data;
		error = vm_suspend_cpu(sc->vm, vac->vcpuid);
		break;
	case VM_RESUME_CPU:
		vac = (struct vm_activate_cpu *)data;
		error = vm_resume_cpu(sc->vm, vac->vcpuid);
		break;
	case VM_SET_INTINFO:
		vmii = (struct vm_intinfo *)data;
		error = vm_exit_intinfo(sc->vm, vmii->vcpuid, vmii->info1);
		break;
	case VM_GET_INTINFO:
		vmii = (struct vm_intinfo *)data;
		error = vm_get_intinfo(sc->vm, vmii->vcpuid, &vmii->info1,
		    &vmii->info2);
		break;
	case VM_RTC_WRITE:
		rtcdata = (struct vm_rtc_data *)data;
		error = vrtc_nvram_write(sc->vm, rtcdata->offset,
		    rtcdata->value);
		break;
	case VM_RTC_READ:
		rtcdata = (struct vm_rtc_data *)data;
		error = vrtc_nvram_read(sc->vm, rtcdata->offset,
		    &rtcdata->value);
		break;
	case VM_RTC_SETTIME:
		rtctime = (struct vm_rtc_time *)data;
		error = vrtc_set_time(sc->vm, rtctime->secs);
		break;
	case VM_RTC_GETTIME:
		error = 0;
		rtctime = (struct vm_rtc_time *)data;
		rtctime->secs = vrtc_get_time(sc->vm);
		break;
	case VM_RESTART_INSTRUCTION:
		error = vm_restart_instruction(sc->vm, vcpu);
		break;
	case VM_SET_TOPOLOGY:
		topology = (struct vm_cpu_topology *)data;
		error = vm_set_topology(sc->vm, topology->sockets,
		    topology->cores, topology->threads, topology->maxcpus);
		break;
	case VM_GET_TOPOLOGY:
		topology = (struct vm_cpu_topology *)data;
		vm_get_topology(sc->vm, &topology->sockets, &topology->cores,
		    &topology->threads, &topology->maxcpus);
		error = 0;
		break;
	default:
		error = ENOTTY;
		break;
	}

	if (state_changed == 1)
		vcpu_unlock_one(sc, vcpu);
	else if (state_changed == 2)
		vcpu_unlock_all(sc);

done:
	/*
	 * Make sure that no handler returns a kernel-internal
	 * error value to userspace.
	 */
	KASSERT(error == ERESTART || error >= 0,
	    ("vmmdev_ioctl: invalid error return %d", error));
	return (error);
}

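/*
 * d_mmap_single handler for /dev/vmm/<name>: translate a file offset, which
 * is interpreted as a guest physical address, into the VM object backing the
 * containing system memory segment.  Device memory segments must be mapped
 * through their own cdev (see devmem_mmap_single()), and executable mappings
 * are refused.
 */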
static int
vmmdev_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t mapsize,
    struct vm_object **objp, int nprot)
{
	struct vmmdev_softc *sc;
	vm_paddr_t gpa;
	size_t len;
	vm_ooffset_t segoff, first, last;
	int error, found, segid;
	uint16_t lastcpu;
	bool sysmem;

	error = vmm_priv_check(curthread->td_ucred);
	if (error)
		return (error);

	first = *offset;
	last = first + mapsize;
	if ((nprot & PROT_EXEC) || first < 0 || first >= last)
		return (EINVAL);

	sc = vmmdev_lookup2(cdev);
	if (sc == NULL) {
		/* virtual machine is in the process of being created */
		return (EINVAL);
	}

	/*
	 * Get a read lock on the guest memory map by freezing any vcpu.
	 */
	lastcpu = vm_get_maxcpus(sc->vm) - 1;
	error = vcpu_lock_one(sc, lastcpu);
	if (error)
		return (error);

	gpa = 0;
	found = 0;
	while (!found) {
		error = vm_mmap_getnext(sc->vm, &gpa, &segid, &segoff, &len,
		    NULL, NULL);
		if (error)
			break;

		if (first >= gpa && last <= gpa + len)
			found = 1;
		else
			gpa += len;
	}

	if (found) {
		error = vm_get_memseg(sc->vm, segid, &len, &sysmem, objp);
		KASSERT(error == 0 && *objp != NULL,
		    ("%s: invalid memory segment %d", __func__, segid));
		if (sysmem) {
			vm_object_reference(*objp);
			*offset = segoff + (first - gpa);
		} else {
			error = EINVAL;
		}
	}
	vcpu_unlock_one(sc, lastcpu);
	return (error);
}

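/*
 * Tear down a virtual machine and free its softc.  This is called directly
 * when creation fails part-way, and via destroy_dev_sched_cb() (in taskqueue
 * context) when an existing VM is destroyed; by then the devmem cdevs are
 * expected to have been destroyed already.
 */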
static void
vmmdev_destroy(void *arg)
{
	struct vmmdev_softc *sc = arg;
	struct devmem_softc *dsc;
	int error;

	error = vcpu_lock_all(sc);
	KASSERT(error == 0, ("%s: error %d freezing vcpus", __func__, error));

	while ((dsc = SLIST_FIRST(&sc->devmem)) != NULL) {
		KASSERT(dsc->cdev == NULL, ("%s: devmem not free", __func__));
		SLIST_REMOVE_HEAD(&sc->devmem, link);
		free(dsc->name, M_VMMDEV);
		free(dsc, M_VMMDEV);
	}

	if (sc->cdev != NULL)
		destroy_dev(sc->cdev);

	if (sc->vm != NULL)
		vm_destroy(sc->vm);

	if ((sc->flags & VSC_LINKED) != 0) {
		mtx_lock(&vmmdev_mtx);
		SLIST_REMOVE(&head, sc, vmmdev_softc, link);
		mtx_unlock(&vmmdev_mtx);
	}

	free(sc, M_VMMDEV);
}

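/*
 * Sysctl handler for hw.vmm.destroy.  Writing a VM name schedules that VM's
 * cdev (and any devmem cdevs) for destruction; the actual teardown happens
 * asynchronously in vmmdev_destroy() once all open references are gone.
 */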
static int
sysctl_vmm_destroy(SYSCTL_HANDLER_ARGS)
{
	int error;
	char buf[VM_MAX_NAMELEN];
	struct devmem_softc *dsc;
	struct vmmdev_softc *sc;
	struct cdev *cdev;

	error = vmm_priv_check(req->td->td_ucred);
	if (error)
		return (error);

	strlcpy(buf, "beavis", sizeof(buf));
	error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
	if (error != 0 || req->newptr == NULL)
		return (error);

	mtx_lock(&vmmdev_mtx);
	sc = vmmdev_lookup(buf);
	if (sc == NULL || sc->cdev == NULL) {
		mtx_unlock(&vmmdev_mtx);
		return (EINVAL);
	}

	/*
	 * The 'cdev' will be destroyed asynchronously when 'si_threadcount'
	 * goes down to 0 so we should not do it again in the callback.
	 *
	 * Setting 'sc->cdev' to NULL is also used to indicate that the VM
	 * is scheduled for destruction.
	 */
	cdev = sc->cdev;
	sc->cdev = NULL;
	mtx_unlock(&vmmdev_mtx);

	/*
	 * Schedule all cdevs to be destroyed:
	 *
	 * - any new operations on the 'cdev' will return an error (ENXIO).
	 *
	 * - when the 'si_threadcount' dwindles down to zero the 'cdev' will
	 *   be destroyed and the callback will be invoked in a taskqueue
	 *   context.
	 *
	 * - the 'devmem' cdevs are destroyed before the virtual machine 'cdev'
	 */
	SLIST_FOREACH(dsc, &sc->devmem, link) {
		KASSERT(dsc->cdev != NULL, ("devmem cdev already destroyed"));
		destroy_dev_sched_cb(dsc->cdev, devmem_destroy, dsc);
	}
	destroy_dev_sched_cb(cdev, vmmdev_destroy, sc);
	return (0);
}
SYSCTL_PROC(_hw_vmm, OID_AUTO, destroy,
	    CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_PRISON,
	    NULL, 0, sysctl_vmm_destroy, "A", NULL);

static struct cdevsw vmmdevsw = {
	.d_name		= "vmmdev",
	.d_version	= D_VERSION,
	.d_ioctl	= vmmdev_ioctl,
	.d_mmap_single	= vmmdev_mmap_single,
	.d_read		= vmmdev_rw,
	.d_write	= vmmdev_rw,
};

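/*
 * Sysctl handler for hw.vmm.create.  Writing a VM name creates the VM and
 * its /dev/vmm/<name> node; roughly, from userspace:
 *
 *	sysctlbyname("hw.vmm.create", NULL, NULL, "testvm", strlen("testvm"));
 *	fd = open("/dev/vmm/testvm", O_RDWR);
 *
 * libvmmapi's vm_create() and vm_open() wrap this interface.
 */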
static int
sysctl_vmm_create(SYSCTL_HANDLER_ARGS)
{
	int error;
	struct vm *vm;
	struct cdev *cdev;
	struct vmmdev_softc *sc, *sc2;
	char buf[VM_MAX_NAMELEN];

	error = vmm_priv_check(req->td->td_ucred);
	if (error)
		return (error);

	strlcpy(buf, "beavis", sizeof(buf));
	error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
	if (error != 0 || req->newptr == NULL)
		return (error);

	mtx_lock(&vmmdev_mtx);
	sc = vmmdev_lookup(buf);
	mtx_unlock(&vmmdev_mtx);
	if (sc != NULL)
		return (EEXIST);

	error = vm_create(buf, &vm);
	if (error != 0)
		return (error);

	sc = malloc(sizeof(struct vmmdev_softc), M_VMMDEV, M_WAITOK | M_ZERO);
	sc->vm = vm;
	SLIST_INIT(&sc->devmem);

	/*
	 * Lookup the name again just in case somebody sneaked in when we
	 * dropped the lock.
	 */
	mtx_lock(&vmmdev_mtx);
	sc2 = vmmdev_lookup(buf);
	if (sc2 == NULL) {
		SLIST_INSERT_HEAD(&head, sc, link);
		sc->flags |= VSC_LINKED;
	}
	mtx_unlock(&vmmdev_mtx);

	if (sc2 != NULL) {
		vmmdev_destroy(sc);
		return (EEXIST);
	}

	error = make_dev_p(MAKEDEV_CHECKNAME, &cdev, &vmmdevsw, NULL,
			   UID_ROOT, GID_WHEEL, 0600, "vmm/%s", buf);
	if (error != 0) {
		vmmdev_destroy(sc);
		return (error);
	}

	mtx_lock(&vmmdev_mtx);
	sc->cdev = cdev;
	sc->cdev->si_drv1 = sc;
	mtx_unlock(&vmmdev_mtx);

	return (0);
}
SYSCTL_PROC(_hw_vmm, OID_AUTO, create,
	    CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_PRISON,
	    NULL, 0, sysctl_vmm_create, "A", NULL);

void
vmmdev_init(void)
{
	mtx_init(&vmmdev_mtx, "vmm device mutex", NULL, MTX_DEF);
	pr_allow_flag = prison_add_allow(NULL, "vmm", NULL,
	    "Allow use of vmm in a jail.");
}

int
vmmdev_cleanup(void)
{
	int error;

	if (SLIST_EMPTY(&head))
		error = 0;
	else
		error = EBUSY;

	return (error);
}

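/*
 * d_mmap_single handler for a devmem cdev (/dev/vmm.io/<vm>.<name>): return
 * the VM object backing the named device memory segment, provided the
 * requested range lies within the segment and the mapping is not executable.
 */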
static int
devmem_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t len,
    struct vm_object **objp, int nprot)
{
	struct devmem_softc *dsc;
	vm_ooffset_t first, last;
	size_t seglen;
	int error;
	uint16_t lastcpu;
	bool sysmem;

	dsc = cdev->si_drv1;
	if (dsc == NULL) {
		/* 'cdev' has been created but is not ready for use */
		return (ENXIO);
	}

	first = *offset;
	last = *offset + len;
	if ((nprot & PROT_EXEC) || first < 0 || first >= last)
		return (EINVAL);

	lastcpu = vm_get_maxcpus(dsc->sc->vm) - 1;
	error = vcpu_lock_one(dsc->sc, lastcpu);
	if (error)
		return (error);

	error = vm_get_memseg(dsc->sc->vm, dsc->segid, &seglen, &sysmem, objp);
	KASSERT(error == 0 && !sysmem && *objp != NULL,
	    ("%s: invalid devmem segment %d", __func__, dsc->segid));

	vcpu_unlock_one(dsc->sc, lastcpu);

	if (seglen >= last) {
		vm_object_reference(*objp);
		return (0);
	} else {
		return (EINVAL);
	}
}

static struct cdevsw devmemsw = {
	.d_name		= "devmem",
	.d_version	= D_VERSION,
	.d_mmap_single	= devmem_mmap_single,
};

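/*
 * Create the /dev/vmm.io/<vmname>.<devname> cdev for a device memory segment
 * and link its devmem_softc into the owning VM's list.  On success 'devname'
 * is taken over by the devmem_softc and freed when the VM is torn down.
 */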
static int
devmem_create_cdev(const char *vmname, int segid, char *devname)
{
	struct devmem_softc *dsc;
	struct vmmdev_softc *sc;
	struct cdev *cdev;
	int error;

	error = make_dev_p(MAKEDEV_CHECKNAME, &cdev, &devmemsw, NULL,
	    UID_ROOT, GID_WHEEL, 0600, "vmm.io/%s.%s", vmname, devname);
	if (error)
		return (error);

	dsc = malloc(sizeof(struct devmem_softc), M_VMMDEV, M_WAITOK | M_ZERO);

	mtx_lock(&vmmdev_mtx);
	sc = vmmdev_lookup(vmname);
	KASSERT(sc != NULL, ("%s: vm %s softc not found", __func__, vmname));
	if (sc->cdev == NULL) {
		/* virtual machine is being created or destroyed */
		mtx_unlock(&vmmdev_mtx);
		free(dsc, M_VMMDEV);
		destroy_dev_sched_cb(cdev, NULL, 0);
		return (ENODEV);
	}

	dsc->segid = segid;
	dsc->name = devname;
	dsc->cdev = cdev;
	dsc->sc = sc;
	SLIST_INSERT_HEAD(&sc->devmem, dsc, link);
	mtx_unlock(&vmmdev_mtx);

	/* The 'cdev' is ready for use after 'si_drv1' is initialized */
	cdev->si_drv1 = dsc;
	return (0);
}

static void
devmem_destroy(void *arg)
{
	struct devmem_softc *dsc = arg;

	KASSERT(dsc->cdev, ("%s: devmem cdev already destroyed", __func__));
	dsc->cdev = NULL;
	dsc->sc = NULL;
}