/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2011 NetApp, Inc.
 * Copyright (C) 2015 Mihai Carabas <mihai.carabas@gmail.com>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/jail.h>
#include <sys/queue.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/malloc.h>
#include <sys/conf.h>
#include <sys/sysctl.h>
#include <sys/libkern.h>
#include <sys/ioccom.h>
#include <sys/mman.h>
#include <sys/uio.h>
#include <sys/proc.h>

#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>

#include <machine/machdep.h>
#include <machine/vmparam.h>
#include <machine/vmm.h>
#include <machine/vmm_dev.h>

#include "vmm_stat.h"

#include "io/vgic.h"

struct devmem_softc {
	int	segid;
	char	*name;
	struct cdev *cdev;
	struct vmmdev_softc *sc;
	SLIST_ENTRY(devmem_softc) link;
};

struct vmmdev_softc {
	struct vm	*vm;		/* vm instance cookie */
	struct cdev	*cdev;
	struct ucred	*ucred;
	SLIST_ENTRY(vmmdev_softc) link;
	SLIST_HEAD(, devmem_softc) devmem;
	int		flags;
};
#define	VSC_LINKED		0x01

static SLIST_HEAD(, vmmdev_softc) head;

static unsigned pr_allow_flag;
static struct mtx vmmdev_mtx;
MTX_SYSINIT(vmmdev_mtx, &vmmdev_mtx, "vmm device mutex", MTX_DEF);

static MALLOC_DEFINE(M_VMMDEV, "vmmdev", "vmmdev");

SYSCTL_DECL(_hw_vmm);

static int vmm_priv_check(struct ucred *ucred);
static int devmem_create_cdev(const char *vmname, int id, char *devmem);
static void devmem_destroy(void *arg);

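/*
 * A jailed process may use vmm(4) only if its prison has been granted the
 * "vmm" allow permission (registered as 'pr_allow_flag' in vmmdev_init()).
 */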
static int
vmm_priv_check(struct ucred *ucred)
{

	if (jailed(ucred) &&
	    !(ucred->cr_prison->pr_allow & pr_allow_flag))
		return (EPERM);

	return (0);
}

static int
vcpu_lock_one(struct vcpu *vcpu)
{
	return (vcpu_set_state(vcpu, VCPU_FROZEN, true));
}

static void
vcpu_unlock_one(struct vcpu *vcpu)
{
	enum vcpu_state state;

	state = vcpu_get_state(vcpu, NULL);
	if (state != VCPU_FROZEN) {
		panic("vcpu %s(%d) has invalid state %d",
		    vm_name(vcpu_vm(vcpu)), vcpu_vcpuid(vcpu), state);
	}

	vcpu_set_state(vcpu, VCPU_IDLE, false);
}

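/*
 * Freeze every vcpu in the VM.  If any vcpu cannot be frozen, the vcpus that
 * were already frozen are thawed again and the vcpu list lock is dropped.
 */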
static int
vcpu_lock_all(struct vmmdev_softc *sc)
{
	struct vcpu *vcpu;
	int error;
	uint16_t i, j, maxcpus;

	error = 0;
	vm_slock_vcpus(sc->vm);
	maxcpus = vm_get_maxcpus(sc->vm);
	for (i = 0; i < maxcpus; i++) {
		vcpu = vm_vcpu(sc->vm, i);
		if (vcpu == NULL)
			continue;
		error = vcpu_lock_one(vcpu);
		if (error)
			break;
	}

	if (error) {
		for (j = 0; j < i; j++) {
			vcpu = vm_vcpu(sc->vm, j);
			if (vcpu == NULL)
				continue;
			vcpu_unlock_one(vcpu);
		}
		vm_unlock_vcpus(sc->vm);
	}

	return (error);
}

static void
vcpu_unlock_all(struct vmmdev_softc *sc)
{
	struct vcpu *vcpu;
	uint16_t i, maxcpus;

	maxcpus = vm_get_maxcpus(sc->vm);
	for (i = 0; i < maxcpus; i++) {
		vcpu = vm_vcpu(sc->vm, i);
		if (vcpu == NULL)
			continue;
		vcpu_unlock_one(vcpu);
	}
	vm_unlock_vcpus(sc->vm);
}

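/*
 * Look up a VM softc by name.  Returns NULL if no such VM exists or if the
 * caller's credentials are not allowed to see the VM's owner.
 */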
static struct vmmdev_softc *
vmmdev_lookup(const char *name)
{
	struct vmmdev_softc *sc;

#ifdef notyet	/* XXX kernel is not compiled with invariants */
	mtx_assert(&vmmdev_mtx, MA_OWNED);
#endif

	SLIST_FOREACH(sc, &head, link) {
		if (strcmp(name, vm_name(sc->vm)) == 0)
			break;
	}

	if (sc == NULL)
		return (NULL);

	if (cr_cansee(curthread->td_ucred, sc->ucred))
		return (NULL);

	return (sc);
}

static struct vmmdev_softc *
vmmdev_lookup2(struct cdev *cdev)
{

	return (cdev->si_drv1);
}

static int
vmmdev_rw(struct cdev *cdev, struct uio *uio, int flags)
{
	int error, off, c, prot;
	vm_paddr_t gpa, maxaddr;
	void *hpa, *cookie;
	struct vmmdev_softc *sc;

	error = vmm_priv_check(curthread->td_ucred);
	if (error)
		return (error);

	sc = vmmdev_lookup2(cdev);
	if (sc == NULL)
		return (ENXIO);

	/*
	 * Get a read lock on the guest memory map.
	 */
	vm_slock_memsegs(sc->vm);

	prot = (uio->uio_rw == UIO_WRITE ? VM_PROT_WRITE : VM_PROT_READ);
	maxaddr = vmm_sysmem_maxaddr(sc->vm);
	while (uio->uio_resid > 0 && error == 0) {
		gpa = uio->uio_offset;
		off = gpa & PAGE_MASK;
		c = min(uio->uio_resid, PAGE_SIZE - off);

		/*
		 * The VM has a hole in its physical memory map. If we want to
		 * use 'dd' to inspect memory beyond the hole we need to
		 * provide bogus data for memory that lies in the hole.
		 *
		 * Since this device does not support lseek(2), dd(1) will
		 * read(2) blocks of data to simulate the lseek(2).
		 */
		hpa = vm_gpa_hold_global(sc->vm, gpa, c, prot, &cookie);
		if (hpa == NULL) {
			if (uio->uio_rw == UIO_READ && gpa < maxaddr)
				error = uiomove(__DECONST(void *, zero_region),
				    c, uio);
			else
				error = EFAULT;
		} else {
			error = uiomove(hpa, c, uio);
			vm_gpa_release(cookie);
		}
	}
	vm_unlock_memsegs(sc->vm);
	return (error);
}

CTASSERT(sizeof(((struct vm_memseg *)0)->name) >= VM_MAX_SUFFIXLEN + 1);

static int
get_memseg(struct vmmdev_softc *sc, struct vm_memseg *mseg, size_t len)
{
	struct devmem_softc *dsc;
	int error;
	bool sysmem;

	error = vm_get_memseg(sc->vm, mseg->segid, &mseg->len, &sysmem, NULL);
	if (error || mseg->len == 0)
		return (error);

	if (!sysmem) {
		SLIST_FOREACH(dsc, &sc->devmem, link) {
			if (dsc->segid == mseg->segid)
				break;
		}
		KASSERT(dsc != NULL, ("%s: devmem segment %d not found",
		    __func__, mseg->segid));
		error = copystr(dsc->name, mseg->name, len, NULL);
	} else {
		bzero(mseg->name, len);
	}

	return (error);
}

static int
alloc_memseg(struct vmmdev_softc *sc, struct vm_memseg *mseg, size_t len)
{
	char *name;
	int error;
	bool sysmem;

	error = 0;
	name = NULL;
	sysmem = true;

	/*
	 * The allocation is lengthened by 1 to hold a terminating NUL.  It'll
	 * be stripped off when devfs processes the full string.
	 */
	if (VM_MEMSEG_NAME(mseg)) {
		sysmem = false;
		name = malloc(len, M_VMMDEV, M_WAITOK);
		error = copystr(mseg->name, name, len, NULL);
		if (error)
			goto done;
	}

	error = vm_alloc_memseg(sc->vm, mseg->segid, mseg->len, sysmem);
	if (error)
		goto done;

	if (VM_MEMSEG_NAME(mseg)) {
		error = devmem_create_cdev(vm_name(sc->vm), mseg->segid, name);
		if (error)
			vm_free_memseg(sc->vm, mseg->segid);
		else
			name = NULL;	/* freed when 'cdev' is destroyed */
	}
done:
	free(name, M_VMMDEV);
	return (error);
}

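/*
 * Read a batch of registers into 'regval'; the first register that cannot be
 * read stops the loop and its error is returned.
 */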
static int
vm_get_register_set(struct vcpu *vcpu, unsigned int count, int *regnum,
    uint64_t *regval)
{
	int error, i;

	error = 0;
	for (i = 0; i < count; i++) {
		error = vm_get_register(vcpu, regnum[i], &regval[i]);
		if (error)
			break;
	}
	return (error);
}

static int
vm_set_register_set(struct vcpu *vcpu, unsigned int count, int *regnum,
    uint64_t *regval)
{
	int error, i;

	error = 0;
	for (i = 0; i < count; i++) {
		error = vm_set_register(vcpu, regnum[i], regval[i]);
		if (error)
			break;
	}
	return (error);
}

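/*
 * Main ioctl handler for /dev/vmm/<name>.  Depending on the command, a single
 * vcpu, all vcpus, or the memory segment list is locked before the command is
 * dispatched; the locks are dropped again at 'done'.
 */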
static int
vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
	     struct thread *td)
{
	int error, vcpuid, size;
	cpuset_t *cpuset;
	struct vmmdev_softc *sc;
	struct vcpu *vcpu;
	struct vm_register *vmreg;
	struct vm_register_set *vmregset;
	struct vm_run *vmrun;
	struct vm_vgic_version *vgv;
	struct vm_vgic_descr *vgic;
	struct vm_cpuset *vm_cpuset;
	struct vm_irq *vi;
	struct vm_capability *vmcap;
	struct vm_stats *vmstats;
	struct vm_stat_desc *statdesc;
	struct vm_suspend *vmsuspend;
	struct vm_exception *vmexc;
	struct vm_gla2gpa *gg;
	struct vm_memmap *mm;
	struct vm_munmap *mu;
	struct vm_msi *vmsi;
	struct vm_cpu_topology *topology;
	uint64_t *regvals;
	int *regnums;
	enum { NONE, SINGLE, ALL } vcpus_locked;
	bool memsegs_locked;

	error = vmm_priv_check(curthread->td_ucred);
	if (error)
		return (error);

	sc = vmmdev_lookup2(cdev);
	if (sc == NULL)
		return (ENXIO);

	error = 0;
	vcpuid = -1;
	vcpu = NULL;
	vcpus_locked = NONE;
	memsegs_locked = false;

	/*
	 * Some VMM ioctls can operate only on vcpus that are not running.
	 */
	switch (cmd) {
	case VM_RUN:
	case VM_GET_REGISTER:
	case VM_SET_REGISTER:
	case VM_GET_REGISTER_SET:
	case VM_SET_REGISTER_SET:
	case VM_INJECT_EXCEPTION:
	case VM_GET_CAPABILITY:
	case VM_SET_CAPABILITY:
	case VM_GLA2GPA_NOFAULT:
	case VM_ACTIVATE_CPU:
		/*
		 * ioctls that can operate only on vcpus that are not running.
		 */
		vcpuid = *(int *)data;
		vcpu = vm_alloc_vcpu(sc->vm, vcpuid);
		if (vcpu == NULL) {
			error = EINVAL;
			goto done;
		}
		error = vcpu_lock_one(vcpu);
		if (error)
			goto done;
		vcpus_locked = SINGLE;
		break;

	case VM_ALLOC_MEMSEG:
	case VM_MMAP_MEMSEG:
	case VM_MUNMAP_MEMSEG:
	case VM_REINIT:
	case VM_ATTACH_VGIC:
		/*
		 * ioctls that modify the memory map must lock memory
		 * segments exclusively.
		 */
		vm_xlock_memsegs(sc->vm);
		memsegs_locked = true;

		/*
		 * ioctls that operate on the entire virtual machine must
		 * prevent all vcpus from running.
		 */
		error = vcpu_lock_all(sc);
		if (error)
			goto done;
		vcpus_locked = ALL;
		break;
	case VM_GET_MEMSEG:
	case VM_MMAP_GETNEXT:
		/*
		 * Lock the memory map while it is being inspected.
		 */
		vm_slock_memsegs(sc->vm);
		memsegs_locked = true;
		break;

	case VM_STATS:
		/*
		 * These do not need the vCPU locked but do operate on
		 * a specific vCPU.
		 */
		vcpuid = *(int *)data;
		vcpu = vm_alloc_vcpu(sc->vm, vcpuid);
		if (vcpu == NULL) {
			error = EINVAL;
			goto done;
		}
		break;

	case VM_SUSPEND_CPU:
	case VM_RESUME_CPU:
		/*
		 * These can either operate on all CPUs via a vcpuid of
		 * -1 or on a specific vCPU.
		 */
		vcpuid = *(int *)data;
		if (vcpuid == -1)
			break;
		vcpu = vm_alloc_vcpu(sc->vm, vcpuid);
		if (vcpu == NULL) {
			error = EINVAL;
			goto done;
		}
		break;

	case VM_ASSERT_IRQ:
		vi = (struct vm_irq *)data;
		error = vm_assert_irq(sc->vm, vi->irq);
		break;
	case VM_DEASSERT_IRQ:
		vi = (struct vm_irq *)data;
		error = vm_deassert_irq(sc->vm, vi->irq);
		break;
	default:
		break;
	}

	switch (cmd) {
	case VM_RUN: {
		struct vm_exit *vme;

		vmrun = (struct vm_run *)data;
		vme = vm_exitinfo(vcpu);

		error = vm_run(vcpu);
		if (error != 0)
			break;

		error = copyout(vme, vmrun->vm_exit, sizeof(*vme));
		if (error != 0)
			break;
		break;
	}
	case VM_SUSPEND:
		vmsuspend = (struct vm_suspend *)data;
		error = vm_suspend(sc->vm, vmsuspend->how);
		break;
	case VM_REINIT:
		error = vm_reinit(sc->vm);
		break;
	case VM_STAT_DESC: {
		statdesc = (struct vm_stat_desc *)data;
		error = vmm_stat_desc_copy(statdesc->index,
					statdesc->desc, sizeof(statdesc->desc));
		break;
	}
	case VM_STATS: {
		CTASSERT(MAX_VM_STATS >= MAX_VMM_STAT_ELEMS);
		vmstats = (struct vm_stats *)data;
		getmicrotime(&vmstats->tv);
		error = vmm_stat_copy(vcpu, vmstats->index,
				      nitems(vmstats->statbuf),
				      &vmstats->num_entries, vmstats->statbuf);
		break;
	}
	case VM_MMAP_GETNEXT:
		mm = (struct vm_memmap *)data;
		error = vm_mmap_getnext(sc->vm, &mm->gpa, &mm->segid,
		    &mm->segoff, &mm->len, &mm->prot, &mm->flags);
		break;
	case VM_MMAP_MEMSEG:
		mm = (struct vm_memmap *)data;
		error = vm_mmap_memseg(sc->vm, mm->gpa, mm->segid, mm->segoff,
		    mm->len, mm->prot, mm->flags);
		break;
	case VM_MUNMAP_MEMSEG:
		mu = (struct vm_munmap *)data;
		error = vm_munmap_memseg(sc->vm, mu->gpa, mu->len);
		break;
	case VM_ALLOC_MEMSEG:
		error = alloc_memseg(sc, (struct vm_memseg *)data,
		    sizeof(((struct vm_memseg *)0)->name));
		break;
	case VM_GET_MEMSEG:
		error = get_memseg(sc, (struct vm_memseg *)data,
		    sizeof(((struct vm_memseg *)0)->name));
		break;
	case VM_GET_REGISTER:
		vmreg = (struct vm_register *)data;
		error = vm_get_register(vcpu, vmreg->regnum, &vmreg->regval);
		break;
	case VM_SET_REGISTER:
		vmreg = (struct vm_register *)data;
		error = vm_set_register(vcpu, vmreg->regnum, vmreg->regval);
		break;
	case VM_GET_REGISTER_SET:
		vmregset = (struct vm_register_set *)data;
		if (vmregset->count > VM_REG_LAST) {
			error = EINVAL;
			break;
		}
		regvals = malloc(sizeof(regvals[0]) * vmregset->count, M_VMMDEV,
		    M_WAITOK);
		regnums = malloc(sizeof(regnums[0]) * vmregset->count, M_VMMDEV,
		    M_WAITOK);
		error = copyin(vmregset->regnums, regnums, sizeof(regnums[0]) *
		    vmregset->count);
		if (error == 0)
			error = vm_get_register_set(vcpu, vmregset->count,
			    regnums, regvals);
		if (error == 0)
			error = copyout(regvals, vmregset->regvals,
			    sizeof(regvals[0]) * vmregset->count);
		free(regvals, M_VMMDEV);
		free(regnums, M_VMMDEV);
		break;
	case VM_SET_REGISTER_SET:
		vmregset = (struct vm_register_set *)data;
		if (vmregset->count > VM_REG_LAST) {
			error = EINVAL;
			break;
		}
		regvals = malloc(sizeof(regvals[0]) * vmregset->count, M_VMMDEV,
		    M_WAITOK);
		regnums = malloc(sizeof(regnums[0]) * vmregset->count, M_VMMDEV,
		    M_WAITOK);
		error = copyin(vmregset->regnums, regnums, sizeof(regnums[0]) *
		    vmregset->count);
		if (error == 0)
			error = copyin(vmregset->regvals, regvals,
			    sizeof(regvals[0]) * vmregset->count);
		if (error == 0)
			error = vm_set_register_set(vcpu, vmregset->count,
			    regnums, regvals);
		free(regvals, M_VMMDEV);
		free(regnums, M_VMMDEV);
		break;
	case VM_GET_CAPABILITY:
		vmcap = (struct vm_capability *)data;
		error = vm_get_capability(vcpu,
					  vmcap->captype,
					  &vmcap->capval);
		break;
	case VM_SET_CAPABILITY:
		vmcap = (struct vm_capability *)data;
		error = vm_set_capability(vcpu,
					  vmcap->captype,
					  vmcap->capval);
		break;
	case VM_INJECT_EXCEPTION:
		vmexc = (struct vm_exception *)data;
		error = vm_inject_exception(vcpu, vmexc->esr, vmexc->far);
		break;
	case VM_GLA2GPA_NOFAULT:
		gg = (struct vm_gla2gpa *)data;
		error = vm_gla2gpa_nofault(vcpu, &gg->paging, gg->gla,
		    gg->prot, &gg->gpa, &gg->fault);
		KASSERT(error == 0 || error == EFAULT,
		    ("%s: vm_gla2gpa unknown error %d", __func__, error));
		break;
	case VM_ACTIVATE_CPU:
		error = vm_activate_cpu(vcpu);
		break;
	case VM_GET_CPUS:
		error = 0;
		vm_cpuset = (struct vm_cpuset *)data;
		size = vm_cpuset->cpusetsize;
		if (size < sizeof(cpuset_t) || size > CPU_MAXSIZE / NBBY) {
			error = ERANGE;
			break;
		}
		cpuset = malloc(size, M_TEMP, M_WAITOK | M_ZERO);
		if (vm_cpuset->which == VM_ACTIVE_CPUS)
			*cpuset = vm_active_cpus(sc->vm);
		else if (vm_cpuset->which == VM_SUSPENDED_CPUS)
			*cpuset = vm_suspended_cpus(sc->vm);
		else if (vm_cpuset->which == VM_DEBUG_CPUS)
			*cpuset = vm_debug_cpus(sc->vm);
		else
			error = EINVAL;
		if (error == 0)
			error = copyout(cpuset, vm_cpuset->cpus, size);
		free(cpuset, M_TEMP);
		break;
	case VM_SUSPEND_CPU:
		error = vm_suspend_cpu(sc->vm, vcpu);
		break;
	case VM_RESUME_CPU:
		error = vm_resume_cpu(sc->vm, vcpu);
		break;
	case VM_GET_VGIC_VERSION:
		vgv = (struct vm_vgic_version *)data;
		/* TODO: Query the vgic driver for this */
		vgv->version = 3;
		vgv->flags = 0;
		error = 0;
		break;
	case VM_ATTACH_VGIC:
		vgic = (struct vm_vgic_descr *)data;
		error = vm_attach_vgic(sc->vm, vgic);
		break;
	case VM_RAISE_MSI:
		vmsi = (struct vm_msi *)data;
		error = vm_raise_msi(sc->vm, vmsi->msg, vmsi->addr, vmsi->bus,
		    vmsi->slot, vmsi->func);
		break;
	case VM_SET_TOPOLOGY:
		topology = (struct vm_cpu_topology *)data;
		error = vm_set_topology(sc->vm, topology->sockets,
		    topology->cores, topology->threads, topology->maxcpus);
		break;
	case VM_GET_TOPOLOGY:
		topology = (struct vm_cpu_topology *)data;
		vm_get_topology(sc->vm, &topology->sockets, &topology->cores,
		    &topology->threads, &topology->maxcpus);
		error = 0;
		break;
	default:
		error = ENOTTY;
		break;
	}

done:
	if (vcpus_locked == SINGLE)
		vcpu_unlock_one(vcpu);
	else if (vcpus_locked == ALL)
		vcpu_unlock_all(sc);
	if (memsegs_locked)
		vm_unlock_memsegs(sc->vm);

	/*
	 * Make sure that no handler returns a kernel-internal
	 * error value to userspace.
	 */
	KASSERT(error == ERESTART || error >= 0,
	    ("vmmdev_ioctl: invalid error return %d", error));
	return (error);
}

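/*
 * Back an mmap(2) of /dev/vmm/<name> with the VM object of the system memory
 * segment that contains the requested range of guest physical addresses.
 */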
static int
vmmdev_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t mapsize,
    struct vm_object **objp, int nprot)
{
	struct vmmdev_softc *sc;
	vm_paddr_t gpa;
	size_t len;
	vm_ooffset_t segoff, first, last;
	int error, found, segid;
	bool sysmem;

	error = vmm_priv_check(curthread->td_ucred);
	if (error)
		return (error);

	first = *offset;
	last = first + mapsize;
	if ((nprot & PROT_EXEC) || first < 0 || first >= last)
		return (EINVAL);

	sc = vmmdev_lookup2(cdev);
	if (sc == NULL) {
		/* virtual machine is in the process of being created */
		return (EINVAL);
	}

	/*
	 * Get a read lock on the guest memory map.
	 */
	vm_slock_memsegs(sc->vm);

	gpa = 0;
	found = 0;
	while (!found) {
		error = vm_mmap_getnext(sc->vm, &gpa, &segid, &segoff, &len,
		    NULL, NULL);
		if (error)
			break;

		if (first >= gpa && last <= gpa + len)
			found = 1;
		else
			gpa += len;
	}

	if (found) {
		error = vm_get_memseg(sc->vm, segid, &len, &sysmem, objp);
		KASSERT(error == 0 && *objp != NULL,
		    ("%s: invalid memory segment %d", __func__, segid));
		if (sysmem) {
			vm_object_reference(*objp);
			*offset = segoff + (first - gpa);
		} else {
			error = EINVAL;
		}
	}
	vm_unlock_memsegs(sc->vm);
	return (error);
}

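/*
 * Tear down a VM: free the devmem bookkeeping, destroy the cdev and the VM
 * itself, drop the owner's credentials and unlink the softc from the global
 * list.
 */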
static void
vmmdev_destroy(void *arg)
{
	struct vmmdev_softc *sc = arg;
	struct devmem_softc *dsc;
	int error __diagused;

	error = vcpu_lock_all(sc);
	KASSERT(error == 0, ("%s: error %d freezing vcpus", __func__, error));
	vm_unlock_vcpus(sc->vm);

	while ((dsc = SLIST_FIRST(&sc->devmem)) != NULL) {
		KASSERT(dsc->cdev == NULL, ("%s: devmem not free", __func__));
		SLIST_REMOVE_HEAD(&sc->devmem, link);
		free(dsc->name, M_VMMDEV);
		free(dsc, M_VMMDEV);
	}

	if (sc->cdev != NULL)
		destroy_dev(sc->cdev);

	if (sc->vm != NULL)
		vm_destroy(sc->vm);

	if (sc->ucred != NULL)
		crfree(sc->ucred);

	if ((sc->flags & VSC_LINKED) != 0) {
		mtx_lock(&vmmdev_mtx);
		SLIST_REMOVE(&head, sc, vmmdev_softc, link);
		mtx_unlock(&vmmdev_mtx);
	}

	free(sc, M_VMMDEV);
}

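/*
 * Handler for the hw.vmm.destroy sysctl: look up the named VM, destroy its
 * cdevs and then the VM itself.
 */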
static int
sysctl_vmm_destroy(SYSCTL_HANDLER_ARGS)
{
	struct devmem_softc *dsc;
	struct vmmdev_softc *sc;
	struct cdev *cdev;
	char *buf;
	int error, buflen;

	error = vmm_priv_check(req->td->td_ucred);
	if (error)
		return (error);

	buflen = VM_MAX_NAMELEN + 1;
	buf = malloc(buflen, M_VMMDEV, M_WAITOK | M_ZERO);
	strlcpy(buf, "beavis", buflen);
	error = sysctl_handle_string(oidp, buf, buflen, req);
	if (error != 0 || req->newptr == NULL)
		goto out;

	mtx_lock(&vmmdev_mtx);
	sc = vmmdev_lookup(buf);
	if (sc == NULL || sc->cdev == NULL) {
		mtx_unlock(&vmmdev_mtx);
		error = EINVAL;
		goto out;
	}

	/*
	 * Setting 'sc->cdev' to NULL is used to indicate that the VM
	 * is scheduled for destruction.
	 */
	cdev = sc->cdev;
	sc->cdev = NULL;
	mtx_unlock(&vmmdev_mtx);

	/*
	 * Destroy all cdevs:
	 *
	 * - any new operations on the 'cdev' will return an error (ENXIO).
	 *
	 * - the 'devmem' cdevs are destroyed before the virtual machine 'cdev'
	 */
	SLIST_FOREACH(dsc, &sc->devmem, link) {
		KASSERT(dsc->cdev != NULL, ("devmem cdev already destroyed"));
		destroy_dev(dsc->cdev);
		devmem_destroy(dsc);
	}
	destroy_dev(cdev);
	vmmdev_destroy(sc);
	error = 0;

out:
	free(buf, M_VMMDEV);
	return (error);
}
SYSCTL_PROC(_hw_vmm, OID_AUTO, destroy,
    CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_MPSAFE,
    NULL, 0, sysctl_vmm_destroy, "A",
    NULL);

static struct cdevsw vmmdevsw = {
	.d_name		= "vmmdev",
	.d_version	= D_VERSION,
	.d_ioctl	= vmmdev_ioctl,
	.d_mmap_single	= vmmdev_mmap_single,
	.d_read		= vmmdev_rw,
	.d_write	= vmmdev_rw,
};

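/*
 * Handler for the hw.vmm.create sysctl: create a new VM, link its softc into
 * the global list and create the /dev/vmm/<name> node.
 */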
static int
sysctl_vmm_create(SYSCTL_HANDLER_ARGS)
{
	struct vm *vm;
	struct cdev *cdev;
	struct vmmdev_softc *sc, *sc2;
	char *buf;
	int error, buflen;

	error = vmm_priv_check(req->td->td_ucred);
	if (error)
		return (error);

	buflen = VM_MAX_NAMELEN + 1;
	buf = malloc(buflen, M_VMMDEV, M_WAITOK | M_ZERO);
	strlcpy(buf, "beavis", buflen);
	error = sysctl_handle_string(oidp, buf, buflen, req);
	if (error != 0 || req->newptr == NULL)
		goto out;

	mtx_lock(&vmmdev_mtx);
	sc = vmmdev_lookup(buf);
	mtx_unlock(&vmmdev_mtx);
	if (sc != NULL) {
		error = EEXIST;
		goto out;
	}

	error = vm_create(buf, &vm);
	if (error != 0)
		goto out;

	sc = malloc(sizeof(struct vmmdev_softc), M_VMMDEV, M_WAITOK | M_ZERO);
	sc->ucred = crhold(curthread->td_ucred);
	sc->vm = vm;
	SLIST_INIT(&sc->devmem);

	/*
	 * Lookup the name again just in case somebody sneaked in when we
	 * dropped the lock.
	 */
	mtx_lock(&vmmdev_mtx);
	sc2 = vmmdev_lookup(buf);
	if (sc2 == NULL) {
		SLIST_INSERT_HEAD(&head, sc, link);
		sc->flags |= VSC_LINKED;
	}
	mtx_unlock(&vmmdev_mtx);

	if (sc2 != NULL) {
		vmmdev_destroy(sc);
		error = EEXIST;
		goto out;
	}

	error = make_dev_p(MAKEDEV_CHECKNAME, &cdev, &vmmdevsw, sc->ucred,
	    UID_ROOT, GID_WHEEL, 0600, "vmm/%s", buf);
	if (error != 0) {
		vmmdev_destroy(sc);
		goto out;
	}

	mtx_lock(&vmmdev_mtx);
	sc->cdev = cdev;
	sc->cdev->si_drv1 = sc;
	mtx_unlock(&vmmdev_mtx);

out:
	free(buf, M_VMMDEV);
	return (error);
}
SYSCTL_PROC(_hw_vmm, OID_AUTO, create,
    CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_MPSAFE,
    NULL, 0, sysctl_vmm_create, "A",
    NULL);

void
vmmdev_init(void)
{
	pr_allow_flag = prison_add_allow(NULL, "vmm", NULL,
	    "Allow use of vmm in a jail.");
}

int
vmmdev_cleanup(void)
{
	int error;

	if (SLIST_EMPTY(&head))
		error = 0;
	else
		error = EBUSY;

	return (error);
}

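/*
 * mmap(2) handler for devmem nodes (/dev/vmm.io/<vm>.<name>); the requested
 * range must lie entirely within the backing memory segment.
 */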
static int
devmem_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t len,
    struct vm_object **objp, int nprot)
{
	struct devmem_softc *dsc;
	vm_ooffset_t first, last;
	size_t seglen;
	int error;
	bool sysmem;

	dsc = cdev->si_drv1;
	if (dsc == NULL) {
		/* 'cdev' has been created but is not ready for use */
		return (ENXIO);
	}

	first = *offset;
	last = *offset + len;
	if ((nprot & PROT_EXEC) || first < 0 || first >= last)
		return (EINVAL);

	vm_slock_memsegs(dsc->sc->vm);

	error = vm_get_memseg(dsc->sc->vm, dsc->segid, &seglen, &sysmem, objp);
	KASSERT(error == 0 && !sysmem && *objp != NULL,
	    ("%s: invalid devmem segment %d", __func__, dsc->segid));

	if (seglen >= last)
		vm_object_reference(*objp);
	else
		error = EINVAL;

	vm_unlock_memsegs(dsc->sc->vm);
	return (error);
}

static struct cdevsw devmemsw = {
	.d_name		= "devmem",
	.d_version	= D_VERSION,
	.d_mmap_single	= devmem_mmap_single,
};

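/*
 * Create the /dev/vmm.io/<vmname>.<devname> node for a devmem segment.  On
 * success the devmem softc takes ownership of 'devname'.
 */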
static int
devmem_create_cdev(const char *vmname, int segid, char *devname)
{
	struct devmem_softc *dsc;
	struct vmmdev_softc *sc;
	struct cdev *cdev;
	int error;

	error = make_dev_p(MAKEDEV_CHECKNAME, &cdev, &devmemsw, NULL,
	    UID_ROOT, GID_WHEEL, 0600, "vmm.io/%s.%s", vmname, devname);
	if (error)
		return (error);

	dsc = malloc(sizeof(struct devmem_softc), M_VMMDEV, M_WAITOK | M_ZERO);

	mtx_lock(&vmmdev_mtx);
	sc = vmmdev_lookup(vmname);
	KASSERT(sc != NULL, ("%s: vm %s softc not found", __func__, vmname));
	if (sc->cdev == NULL) {
		/* virtual machine is being created or destroyed */
		mtx_unlock(&vmmdev_mtx);
		free(dsc, M_VMMDEV);
		destroy_dev_sched_cb(cdev, NULL, 0);
		return (ENODEV);
	}

	dsc->segid = segid;
	dsc->name = devname;
	dsc->cdev = cdev;
	dsc->sc = sc;
	SLIST_INSERT_HEAD(&sc->devmem, dsc, link);
	mtx_unlock(&vmmdev_mtx);

	/* The 'cdev' is ready for use after 'si_drv1' is initialized */
	cdev->si_drv1 = dsc;
	return (0);
}

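/*
 * Called once the devmem cdev has been destroyed; the softc itself is freed
 * later by vmmdev_destroy().
 */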
static void
devmem_destroy(void *arg)
{
	struct devmem_softc *dsc = arg;

	KASSERT(dsc->cdev, ("%s: devmem cdev already destroyed", __func__));
	dsc->cdev = NULL;
	dsc->sc = NULL;
}