/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2011 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include "opt_bhyve_snapshot.h"

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/jail.h>
#include <sys/queue.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/malloc.h>
#include <sys/conf.h>
#include <sys/sysctl.h>
#include <sys/libkern.h>
#include <sys/ioccom.h>
#include <sys/mman.h>
#include <sys/uio.h>
#include <sys/proc.h>

#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>

#include <machine/vmparam.h>
#include <machine/vmm.h>
#include <machine/vmm_dev.h>
#include <machine/vmm_instruction_emul.h>
#include <machine/vmm_snapshot.h>
#include <x86/apicreg.h>

#include "vmm_lapic.h"
#include "vmm_stat.h"
#include "vmm_mem.h"
#include "io/ppt.h"
#include "io/vatpic.h"
#include "io/vioapic.h"
#include "io/vhpet.h"
#include "io/vrtc.h"

#ifdef COMPAT_FREEBSD13
struct vm_stats_old {
	int		cpuid;				/* in */
	int		num_entries;			/* out */
	struct timeval	tv;
	uint64_t	statbuf[MAX_VM_STATS];
};

#define	VM_STATS_OLD \
	_IOWR('v', IOCNUM_VM_STATS, struct vm_stats_old)

struct vm_snapshot_meta_old {
	void *ctx;			/* unused */
	void *dev_data;
	const char *dev_name;      /* identify userspace devices */
	enum snapshot_req dev_req; /* identify kernel structs */

	struct vm_snapshot_buffer buffer;

	enum vm_snapshot_op op;
};

#define VM_SNAPSHOT_REQ_OLD \
	_IOWR('v', IOCNUM_SNAPSHOT_REQ, struct vm_snapshot_meta_old)

struct vm_exit_ipi_13 {
	uint32_t	mode;
	uint8_t		vector;
	__BITSET_DEFINE(, 256) dmask;
};

struct vm_exit_13 {
	uint32_t	exitcode;
	int32_t		inst_length;
	uint64_t	rip;
	uint64_t	u[120 / sizeof(uint64_t)];
};

struct vm_run_13 {
	int		cpuid;
	struct vm_exit_13 vm_exit;
};

#define	VM_RUN_13 \
	_IOWR('v', IOCNUM_RUN, struct vm_run_13)

#endif /* COMPAT_FREEBSD13 */

struct devmem_softc {
	int	segid;
	char	*name;
	struct cdev *cdev;
	struct vmmdev_softc *sc;
	SLIST_ENTRY(devmem_softc) link;
};

struct vmmdev_softc {
	struct vm	*vm;		/* vm instance cookie */
	struct cdev	*cdev;
	struct ucred	*ucred;
	SLIST_ENTRY(vmmdev_softc) link;
	SLIST_HEAD(, devmem_softc) devmem;
	int		flags;
};
#define	VSC_LINKED		0x01

static SLIST_HEAD(, vmmdev_softc) head;

static unsigned pr_allow_flag;
static struct mtx vmmdev_mtx;
MTX_SYSINIT(vmmdev_mtx, &vmmdev_mtx, "vmm device mutex", MTX_DEF);

static MALLOC_DEFINE(M_VMMDEV, "vmmdev", "vmmdev");

SYSCTL_DECL(_hw_vmm);

static int vmm_priv_check(struct ucred *ucred);
static int devmem_create_cdev(const char *vmname, int id, char *devmem);
static void devmem_destroy(void *arg);

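/*
 * Check whether the caller may use vmm(4).  Unjailed processes always
 * pass; jailed processes need the "allow.vmm" permission registered in
 * vmmdev_init() below.
 */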
static int
vmm_priv_check(struct ucred *ucred)
{

	if (jailed(ucred) &&
	    !(ucred->cr_prison->pr_allow & pr_allow_flag))
		return (EPERM);

	return (0);
}

static int
vcpu_lock_one(struct vcpu *vcpu)
{
	return (vcpu_set_state(vcpu, VCPU_FROZEN, true));
}

static void
vcpu_unlock_one(struct vcpu *vcpu)
{
	enum vcpu_state state;

	state = vcpu_get_state(vcpu, NULL);
	if (state != VCPU_FROZEN) {
		panic("vcpu %s(%d) has invalid state %d",
		    vm_name(vcpu_vm(vcpu)), vcpu_vcpuid(vcpu), state);
	}

	vcpu_set_state(vcpu, VCPU_IDLE, false);
}

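/*
 * Freeze all vCPUs of the VM so that VM-wide state can be inspected or
 * modified safely.  On failure, any vCPUs frozen so far are returned to
 * VCPU_IDLE before the error is propagated.
 */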
static int
vcpu_lock_all(struct vmmdev_softc *sc)
{
	struct vcpu *vcpu;
	int error;
	uint16_t i, j, maxcpus;

	error = 0;
	vm_slock_vcpus(sc->vm);
	maxcpus = vm_get_maxcpus(sc->vm);
	for (i = 0; i < maxcpus; i++) {
		vcpu = vm_vcpu(sc->vm, i);
		if (vcpu == NULL)
			continue;
		error = vcpu_lock_one(vcpu);
		if (error)
			break;
	}

	if (error) {
		for (j = 0; j < i; j++) {
			vcpu = vm_vcpu(sc->vm, j);
			if (vcpu == NULL)
				continue;
			vcpu_unlock_one(vcpu);
		}
		vm_unlock_vcpus(sc->vm);
	}

	return (error);
}

static void
vcpu_unlock_all(struct vmmdev_softc *sc)
{
	struct vcpu *vcpu;
	uint16_t i, maxcpus;

	maxcpus = vm_get_maxcpus(sc->vm);
	for (i = 0; i < maxcpus; i++) {
		vcpu = vm_vcpu(sc->vm, i);
		if (vcpu == NULL)
			continue;
		vcpu_unlock_one(vcpu);
	}
	vm_unlock_vcpus(sc->vm);
}

static struct vmmdev_softc *
vmmdev_lookup(const char *name)
{
	struct vmmdev_softc *sc;

#ifdef notyet	/* XXX kernel is not compiled with invariants */
	mtx_assert(&vmmdev_mtx, MA_OWNED);
#endif

	SLIST_FOREACH(sc, &head, link) {
		if (strcmp(name, vm_name(sc->vm)) == 0)
			break;
	}

	if (sc == NULL)
		return (NULL);

	if (cr_cansee(curthread->td_ucred, sc->ucred))
		return (NULL);

	return (sc);
}

static struct vmmdev_softc *
vmmdev_lookup2(struct cdev *cdev)
{

	return (cdev->si_drv1);
}

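/*
 * read(2)/write(2) on /dev/vmm/<name> access guest physical memory at the
 * file offset, one page per iteration.  A hypothetical way to inspect the
 * page at GPA 0x100000 (the device itself cannot seek, so dd(1) reads to
 * skip):
 *
 *	dd if=/dev/vmm/vm0 bs=4096 skip=256 count=1 | hexdump -C
 */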
static int
vmmdev_rw(struct cdev *cdev, struct uio *uio, int flags)
{
	int error, off, c, prot;
	vm_paddr_t gpa, maxaddr;
	void *hpa, *cookie;
	struct vmmdev_softc *sc;

	error = vmm_priv_check(curthread->td_ucred);
	if (error)
		return (error);

	sc = vmmdev_lookup2(cdev);
	if (sc == NULL)
		return (ENXIO);

	/*
	 * Get a read lock on the guest memory map.
	 */
	vm_slock_memsegs(sc->vm);

	prot = (uio->uio_rw == UIO_WRITE ? VM_PROT_WRITE : VM_PROT_READ);
	maxaddr = vmm_sysmem_maxaddr(sc->vm);
	while (uio->uio_resid > 0 && error == 0) {
		gpa = uio->uio_offset;
		off = gpa & PAGE_MASK;
		c = min(uio->uio_resid, PAGE_SIZE - off);

		/*
		 * The VM has a hole in its physical memory map. If we want to
		 * use 'dd' to inspect memory beyond the hole we need to
		 * provide bogus data for memory that lies in the hole.
		 *
		 * Since this device does not support lseek(2), dd(1) will
		 * read(2) blocks of data to simulate the lseek(2).
		 */
		hpa = vm_gpa_hold_global(sc->vm, gpa, c, prot, &cookie);
		if (hpa == NULL) {
			if (uio->uio_rw == UIO_READ && gpa < maxaddr)
				error = uiomove(__DECONST(void *, zero_region),
				    c, uio);
			else
				error = EFAULT;
		} else {
			error = uiomove(hpa, c, uio);
			vm_gpa_release(cookie);
		}
	}
	vm_unlock_memsegs(sc->vm);
	return (error);
}

CTASSERT(sizeof(((struct vm_memseg *)0)->name) >= VM_MAX_SUFFIXLEN + 1);

static int
get_memseg(struct vmmdev_softc *sc, struct vm_memseg *mseg, size_t len)
{
	struct devmem_softc *dsc;
	int error;
	bool sysmem;

	error = vm_get_memseg(sc->vm, mseg->segid, &mseg->len, &sysmem, NULL);
	if (error || mseg->len == 0)
		return (error);

	if (!sysmem) {
		SLIST_FOREACH(dsc, &sc->devmem, link) {
			if (dsc->segid == mseg->segid)
				break;
		}
		KASSERT(dsc != NULL, ("%s: devmem segment %d not found",
		    __func__, mseg->segid));
		error = copystr(dsc->name, mseg->name, len, NULL);
	} else {
		bzero(mseg->name, len);
	}

	return (error);
}

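/*
 * Create a memory segment.  Named segments ("devmem") additionally get a
 * /dev/vmm.io/<vm>.<name> cdev through which they can be mapped; see
 * devmem_create_cdev() below.
 */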
static int
alloc_memseg(struct vmmdev_softc *sc, struct vm_memseg *mseg, size_t len)
{
	char *name;
	int error;
	bool sysmem;

	error = 0;
	name = NULL;
	sysmem = true;

	/*
	 * The allocation is lengthened by 1 to hold a terminating NUL.  It'll
	 * be stripped off when devfs processes the full string.
	 */
	if (VM_MEMSEG_NAME(mseg)) {
		sysmem = false;
		name = malloc(len, M_VMMDEV, M_WAITOK);
		error = copystr(mseg->name, name, len, NULL);
		if (error)
			goto done;
	}

	error = vm_alloc_memseg(sc->vm, mseg->segid, mseg->len, sysmem);
	if (error)
		goto done;

	if (VM_MEMSEG_NAME(mseg)) {
		error = devmem_create_cdev(vm_name(sc->vm), mseg->segid, name);
		if (error)
			vm_free_memseg(sc->vm, mseg->segid);
		else
			name = NULL;	/* freed when 'cdev' is destroyed */
	}
done:
	free(name, M_VMMDEV);
	return (error);
}

static int
vm_get_register_set(struct vcpu *vcpu, unsigned int count, int *regnum,
    uint64_t *regval)
{
	int error, i;

	error = 0;
	for (i = 0; i < count; i++) {
		error = vm_get_register(vcpu, regnum[i], &regval[i]);
		if (error)
			break;
	}
	return (error);
}

static int
vm_set_register_set(struct vcpu *vcpu, unsigned int count, int *regnum,
    uint64_t *regval)
{
	int error, i;

	error = 0;
	for (i = 0; i < count; i++) {
		error = vm_set_register(vcpu, regnum[i], regval[i]);
		if (error)
			break;
	}
	return (error);
}

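/*
 * Main ioctl dispatcher for /dev/vmm/<name>.  As a sketch of the calling
 * convention (hypothetical file descriptor and vCPU id), reading a guest
 * register from userspace might look like:
 *
 *	struct vm_register vmreg = {
 *		.cpuid = 0,
 *		.regnum = VM_REG_GUEST_RIP,
 *	};
 *	if (ioctl(vmfd, VM_GET_REGISTER, &vmreg) == 0)
 *		printf("rip 0x%lx\n", vmreg.regval);
 */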
static int
vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
	     struct thread *td)
{
	int error, vcpuid, size;
	cpuset_t *cpuset;
	struct vmmdev_softc *sc;
	struct vcpu *vcpu;
	struct vm_register *vmreg;
	struct vm_seg_desc *vmsegdesc;
	struct vm_register_set *vmregset;
	struct vm_run *vmrun;
#ifdef COMPAT_FREEBSD13
	struct vm_run_13 *vmrun_13;
#endif
	struct vm_exception *vmexc;
	struct vm_lapic_irq *vmirq;
	struct vm_lapic_msi *vmmsi;
	struct vm_ioapic_irq *ioapic_irq;
	struct vm_isa_irq *isa_irq;
	struct vm_isa_irq_trigger *isa_irq_trigger;
	struct vm_capability *vmcap;
	struct vm_pptdev *pptdev;
	struct vm_pptdev_mmio *pptmmio;
	struct vm_pptdev_msi *pptmsi;
	struct vm_pptdev_msix *pptmsix;
#ifdef COMPAT_FREEBSD13
	struct vm_stats_old *vmstats_old;
#endif
	struct vm_stats *vmstats;
	struct vm_stat_desc *statdesc;
	struct vm_x2apic *x2apic;
	struct vm_gpa_pte *gpapte;
	struct vm_suspend *vmsuspend;
	struct vm_gla2gpa *gg;
	struct vm_cpuset *vm_cpuset;
	struct vm_intinfo *vmii;
	struct vm_rtc_time *rtctime;
	struct vm_rtc_data *rtcdata;
	struct vm_memmap *mm;
	struct vm_munmap *mu;
	struct vm_cpu_topology *topology;
	struct vm_readwrite_kernemu_device *kernemu;
	uint64_t *regvals;
	int *regnums;
	enum { NONE, SINGLE, ALL } vcpus_locked;
	bool memsegs_locked;
#ifdef BHYVE_SNAPSHOT
	struct vm_snapshot_meta *snapshot_meta;
#ifdef COMPAT_FREEBSD13
	struct vm_snapshot_meta_old *snapshot_old;
#endif
#endif

	error = vmm_priv_check(curthread->td_ucred);
	if (error)
		return (error);

	sc = vmmdev_lookup2(cdev);
	if (sc == NULL)
		return (ENXIO);

	vcpuid = -1;
	vcpu = NULL;
	vcpus_locked = NONE;
	memsegs_locked = false;

	/*
	 * For VMM ioctls that operate on a single vCPU, look up the
	 * vcpu.  For VMM ioctls which require one or more vCPUs to
	 * not be running, lock the necessary vCPUs.
	 *
	 * XXX fragile, handle with care
	 * Most of these assume that the first field of the ioctl data
	 * is the vcpuid.
	 */
	switch (cmd) {
	case VM_RUN:
#ifdef COMPAT_FREEBSD13
	case VM_RUN_13:
#endif
	case VM_GET_REGISTER:
	case VM_SET_REGISTER:
	case VM_GET_SEGMENT_DESCRIPTOR:
	case VM_SET_SEGMENT_DESCRIPTOR:
	case VM_GET_REGISTER_SET:
	case VM_SET_REGISTER_SET:
	case VM_INJECT_EXCEPTION:
	case VM_GET_CAPABILITY:
	case VM_SET_CAPABILITY:
	case VM_SET_X2APIC_STATE:
	case VM_GLA2GPA:
	case VM_GLA2GPA_NOFAULT:
	case VM_ACTIVATE_CPU:
	case VM_SET_INTINFO:
	case VM_GET_INTINFO:
	case VM_RESTART_INSTRUCTION:
	case VM_GET_KERNEMU_DEV:
	case VM_SET_KERNEMU_DEV:
		/*
		 * ioctls that can operate only on vcpus that are not running.
		 */
		vcpuid = *(int *)data;
		vcpu = vm_alloc_vcpu(sc->vm, vcpuid);
		if (vcpu == NULL) {
			error = EINVAL;
			goto done;
		}
		error = vcpu_lock_one(vcpu);
		if (error)
			goto done;
		vcpus_locked = SINGLE;
		break;

#ifdef COMPAT_FREEBSD12
	case VM_ALLOC_MEMSEG_FBSD12:
#endif
	case VM_ALLOC_MEMSEG:
	case VM_BIND_PPTDEV:
	case VM_UNBIND_PPTDEV:
	case VM_MMAP_MEMSEG:
	case VM_MUNMAP_MEMSEG:
	case VM_REINIT:
		/*
		 * ioctls that modify the memory map must lock memory
		 * segments exclusively.
		 */
		vm_xlock_memsegs(sc->vm);
		memsegs_locked = true;
		/* FALLTHROUGH */
	case VM_MAP_PPTDEV_MMIO:
	case VM_UNMAP_PPTDEV_MMIO:
#ifdef BHYVE_SNAPSHOT
	case VM_SNAPSHOT_REQ:
#ifdef COMPAT_FREEBSD13
	case VM_SNAPSHOT_REQ_OLD:
#endif
	case VM_RESTORE_TIME:
#endif
		/*
		 * ioctls that operate on the entire virtual machine must
		 * prevent all vcpus from running.
		 */
		error = vcpu_lock_all(sc);
		if (error)
			goto done;
		vcpus_locked = ALL;
		break;

#ifdef COMPAT_FREEBSD12
	case VM_GET_MEMSEG_FBSD12:
#endif
	case VM_GET_MEMSEG:
	case VM_MMAP_GETNEXT:
		/*
		 * Lock the memory map while it is being inspected.
		 */
		vm_slock_memsegs(sc->vm);
		memsegs_locked = true;
		break;

#ifdef COMPAT_FREEBSD13
	case VM_STATS_OLD:
#endif
	case VM_STATS:
	case VM_INJECT_NMI:
	case VM_LAPIC_IRQ:
	case VM_GET_X2APIC_STATE:
		/*
		 * These do not need the vCPU locked but do operate on
		 * a specific vCPU.
		 */
		vcpuid = *(int *)data;
		vcpu = vm_alloc_vcpu(sc->vm, vcpuid);
		if (vcpu == NULL) {
			error = EINVAL;
			goto done;
		}
		break;

	case VM_LAPIC_LOCAL_IRQ:
	case VM_SUSPEND_CPU:
	case VM_RESUME_CPU:
		/*
		 * These can either operate on all CPUs via a vcpuid of
		 * -1 or on a specific vCPU.
		 */
		vcpuid = *(int *)data;
		if (vcpuid == -1)
			break;
		vcpu = vm_alloc_vcpu(sc->vm, vcpuid);
		if (vcpu == NULL) {
			error = EINVAL;
			goto done;
		}
		break;

	default:
		break;
	}

	switch (cmd) {
	case VM_RUN: {
		struct vm_exit *vme;

		vmrun = (struct vm_run *)data;
		vme = vm_exitinfo(vcpu);

		error = vm_run(vcpu);
		if (error != 0)
			break;

		error = copyout(vme, vmrun->vm_exit, sizeof(*vme));
		if (error != 0)
			break;
		if (vme->exitcode == VM_EXITCODE_IPI) {
			error = copyout(vm_exitinfo_cpuset(vcpu),
			    vmrun->cpuset,
			    min(vmrun->cpusetsize, sizeof(cpuset_t)));
			if (error != 0)
				break;
			if (sizeof(cpuset_t) < vmrun->cpusetsize) {
				uint8_t *p;

				p = (uint8_t *)vmrun->cpuset +
				    sizeof(cpuset_t);
				while (p < (uint8_t *)vmrun->cpuset +
				    vmrun->cpusetsize) {
					if (subyte(p++, 0) != 0) {
						error = EFAULT;
						break;
					}
				}
			}
		}
		break;
	}
#ifdef COMPAT_FREEBSD13
	case VM_RUN_13: {
		struct vm_exit *vme;
		struct vm_exit_13 *vme_13;

		vmrun_13 = (struct vm_run_13 *)data;
		vme_13 = &vmrun_13->vm_exit;
		vme = vm_exitinfo(vcpu);

		error = vm_run(vcpu);
		if (error == 0) {
			vme_13->exitcode = vme->exitcode;
			vme_13->inst_length = vme->inst_length;
			vme_13->rip = vme->rip;
			memcpy(vme_13->u, &vme->u, sizeof(vme_13->u));
			if (vme->exitcode == VM_EXITCODE_IPI) {
				struct vm_exit_ipi_13 *ipi;
				cpuset_t *dmask;
				int cpu;

				dmask = vm_exitinfo_cpuset(vcpu);
				ipi = (struct vm_exit_ipi_13 *)&vme_13->u[0];
				BIT_ZERO(256, &ipi->dmask);
				CPU_FOREACH_ISSET(cpu, dmask) {
					if (cpu >= 256)
						break;
					BIT_SET(256, cpu, &ipi->dmask);
				}
			}
		}
		break;
	}
#endif
	case VM_SUSPEND:
		vmsuspend = (struct vm_suspend *)data;
		error = vm_suspend(sc->vm, vmsuspend->how);
		break;
	case VM_REINIT:
		error = vm_reinit(sc->vm);
		break;
	case VM_STAT_DESC: {
		statdesc = (struct vm_stat_desc *)data;
		error = vmm_stat_desc_copy(statdesc->index,
					statdesc->desc, sizeof(statdesc->desc));
		break;
	}
#ifdef COMPAT_FREEBSD13
	case VM_STATS_OLD:
		vmstats_old = (struct vm_stats_old *)data;
		getmicrotime(&vmstats_old->tv);
		error = vmm_stat_copy(vcpu, 0,
				      nitems(vmstats_old->statbuf),
				      &vmstats_old->num_entries,
				      vmstats_old->statbuf);
		break;
#endif
	case VM_STATS: {
		vmstats = (struct vm_stats *)data;
		getmicrotime(&vmstats->tv);
		error = vmm_stat_copy(vcpu, vmstats->index,
				      nitems(vmstats->statbuf),
				      &vmstats->num_entries, vmstats->statbuf);
		break;
	}
	case VM_PPTDEV_MSI:
		pptmsi = (struct vm_pptdev_msi *)data;
		error = ppt_setup_msi(sc->vm,
				      pptmsi->bus, pptmsi->slot, pptmsi->func,
				      pptmsi->addr, pptmsi->msg,
				      pptmsi->numvec);
		break;
	case VM_PPTDEV_MSIX:
		pptmsix = (struct vm_pptdev_msix *)data;
		error = ppt_setup_msix(sc->vm,
				       pptmsix->bus, pptmsix->slot,
				       pptmsix->func, pptmsix->idx,
				       pptmsix->addr, pptmsix->msg,
				       pptmsix->vector_control);
		break;
	case VM_PPTDEV_DISABLE_MSIX:
		pptdev = (struct vm_pptdev *)data;
		error = ppt_disable_msix(sc->vm, pptdev->bus, pptdev->slot,
					 pptdev->func);
		break;
	case VM_MAP_PPTDEV_MMIO:
		pptmmio = (struct vm_pptdev_mmio *)data;
		error = ppt_map_mmio(sc->vm, pptmmio->bus, pptmmio->slot,
				     pptmmio->func, pptmmio->gpa, pptmmio->len,
				     pptmmio->hpa);
		break;
	case VM_UNMAP_PPTDEV_MMIO:
		pptmmio = (struct vm_pptdev_mmio *)data;
		error = ppt_unmap_mmio(sc->vm, pptmmio->bus, pptmmio->slot,
				       pptmmio->func, pptmmio->gpa, pptmmio->len);
		break;
	case VM_BIND_PPTDEV:
		pptdev = (struct vm_pptdev *)data;
		error = vm_assign_pptdev(sc->vm, pptdev->bus, pptdev->slot,
					 pptdev->func);
		break;
	case VM_UNBIND_PPTDEV:
		pptdev = (struct vm_pptdev *)data;
		error = vm_unassign_pptdev(sc->vm, pptdev->bus, pptdev->slot,
					   pptdev->func);
		break;
	case VM_INJECT_EXCEPTION:
		vmexc = (struct vm_exception *)data;
		error = vm_inject_exception(vcpu,
		    vmexc->vector, vmexc->error_code_valid, vmexc->error_code,
		    vmexc->restart_instruction);
		break;
	case VM_INJECT_NMI:
		error = vm_inject_nmi(vcpu);
		break;
	case VM_LAPIC_IRQ:
		vmirq = (struct vm_lapic_irq *)data;
		error = lapic_intr_edge(vcpu, vmirq->vector);
		break;
	case VM_LAPIC_LOCAL_IRQ:
		vmirq = (struct vm_lapic_irq *)data;
		error = lapic_set_local_intr(sc->vm, vcpu, vmirq->vector);
		break;
	case VM_LAPIC_MSI:
		vmmsi = (struct vm_lapic_msi *)data;
		error = lapic_intr_msi(sc->vm, vmmsi->addr, vmmsi->msg);
		break;
	case VM_IOAPIC_ASSERT_IRQ:
		ioapic_irq = (struct vm_ioapic_irq *)data;
		error = vioapic_assert_irq(sc->vm, ioapic_irq->irq);
		break;
	case VM_IOAPIC_DEASSERT_IRQ:
		ioapic_irq = (struct vm_ioapic_irq *)data;
		error = vioapic_deassert_irq(sc->vm, ioapic_irq->irq);
		break;
	case VM_IOAPIC_PULSE_IRQ:
		ioapic_irq = (struct vm_ioapic_irq *)data;
		error = vioapic_pulse_irq(sc->vm, ioapic_irq->irq);
		break;
	case VM_IOAPIC_PINCOUNT:
		*(int *)data = vioapic_pincount(sc->vm);
		break;
	case VM_SET_KERNEMU_DEV:
	case VM_GET_KERNEMU_DEV: {
		mem_region_write_t mwrite;
		mem_region_read_t mread;
		bool arg;

		kernemu = (void *)data;

		if (kernemu->access_width > 0)
			size = (1u << kernemu->access_width);
		else
			size = 1;

		if (kernemu->gpa >= DEFAULT_APIC_BASE && kernemu->gpa < DEFAULT_APIC_BASE + PAGE_SIZE) {
			mread = lapic_mmio_read;
			mwrite = lapic_mmio_write;
		} else if (kernemu->gpa >= VIOAPIC_BASE && kernemu->gpa < VIOAPIC_BASE + VIOAPIC_SIZE) {
			mread = vioapic_mmio_read;
			mwrite = vioapic_mmio_write;
		} else if (kernemu->gpa >= VHPET_BASE && kernemu->gpa < VHPET_BASE + VHPET_SIZE) {
			mread = vhpet_mmio_read;
			mwrite = vhpet_mmio_write;
		} else {
			error = EINVAL;
			break;
		}

		if (cmd == VM_SET_KERNEMU_DEV)
			error = mwrite(vcpu, kernemu->gpa,
			    kernemu->value, size, &arg);
		else
			error = mread(vcpu, kernemu->gpa,
			    &kernemu->value, size, &arg);
		break;
		}
	case VM_ISA_ASSERT_IRQ:
		isa_irq = (struct vm_isa_irq *)data;
		error = vatpic_assert_irq(sc->vm, isa_irq->atpic_irq);
		if (error == 0 && isa_irq->ioapic_irq != -1)
			error = vioapic_assert_irq(sc->vm,
			    isa_irq->ioapic_irq);
		break;
	case VM_ISA_DEASSERT_IRQ:
		isa_irq = (struct vm_isa_irq *)data;
		error = vatpic_deassert_irq(sc->vm, isa_irq->atpic_irq);
		if (error == 0 && isa_irq->ioapic_irq != -1)
			error = vioapic_deassert_irq(sc->vm,
			    isa_irq->ioapic_irq);
		break;
	case VM_ISA_PULSE_IRQ:
		isa_irq = (struct vm_isa_irq *)data;
		error = vatpic_pulse_irq(sc->vm, isa_irq->atpic_irq);
		if (error == 0 && isa_irq->ioapic_irq != -1)
			error = vioapic_pulse_irq(sc->vm, isa_irq->ioapic_irq);
		break;
	case VM_ISA_SET_IRQ_TRIGGER:
		isa_irq_trigger = (struct vm_isa_irq_trigger *)data;
		error = vatpic_set_irq_trigger(sc->vm,
		    isa_irq_trigger->atpic_irq, isa_irq_trigger->trigger);
		break;
	case VM_MMAP_GETNEXT:
		mm = (struct vm_memmap *)data;
		error = vm_mmap_getnext(sc->vm, &mm->gpa, &mm->segid,
		    &mm->segoff, &mm->len, &mm->prot, &mm->flags);
		break;
	case VM_MMAP_MEMSEG:
		mm = (struct vm_memmap *)data;
		error = vm_mmap_memseg(sc->vm, mm->gpa, mm->segid, mm->segoff,
		    mm->len, mm->prot, mm->flags);
		break;
	case VM_MUNMAP_MEMSEG:
		mu = (struct vm_munmap *)data;
		error = vm_munmap_memseg(sc->vm, mu->gpa, mu->len);
		break;
#ifdef COMPAT_FREEBSD12
	case VM_ALLOC_MEMSEG_FBSD12:
		error = alloc_memseg(sc, (struct vm_memseg *)data,
		    sizeof(((struct vm_memseg_fbsd12 *)0)->name));
		break;
#endif
	case VM_ALLOC_MEMSEG:
		error = alloc_memseg(sc, (struct vm_memseg *)data,
		    sizeof(((struct vm_memseg *)0)->name));
		break;
#ifdef COMPAT_FREEBSD12
	case VM_GET_MEMSEG_FBSD12:
		error = get_memseg(sc, (struct vm_memseg *)data,
		    sizeof(((struct vm_memseg_fbsd12 *)0)->name));
		break;
#endif
	case VM_GET_MEMSEG:
		error = get_memseg(sc, (struct vm_memseg *)data,
		    sizeof(((struct vm_memseg *)0)->name));
		break;
	case VM_GET_REGISTER:
		vmreg = (struct vm_register *)data;
		error = vm_get_register(vcpu, vmreg->regnum, &vmreg->regval);
		break;
	case VM_SET_REGISTER:
		vmreg = (struct vm_register *)data;
		error = vm_set_register(vcpu, vmreg->regnum, vmreg->regval);
		break;
	case VM_SET_SEGMENT_DESCRIPTOR:
		vmsegdesc = (struct vm_seg_desc *)data;
		error = vm_set_seg_desc(vcpu,
					vmsegdesc->regnum,
					&vmsegdesc->desc);
		break;
	case VM_GET_SEGMENT_DESCRIPTOR:
		vmsegdesc = (struct vm_seg_desc *)data;
		error = vm_get_seg_desc(vcpu,
					vmsegdesc->regnum,
					&vmsegdesc->desc);
		break;
	case VM_GET_REGISTER_SET:
		vmregset = (struct vm_register_set *)data;
		if (vmregset->count > VM_REG_LAST) {
			error = EINVAL;
			break;
		}
		regvals = malloc(sizeof(regvals[0]) * vmregset->count, M_VMMDEV,
		    M_WAITOK);
		regnums = malloc(sizeof(regnums[0]) * vmregset->count, M_VMMDEV,
		    M_WAITOK);
		error = copyin(vmregset->regnums, regnums, sizeof(regnums[0]) *
		    vmregset->count);
		if (error == 0)
			error = vm_get_register_set(vcpu,
			    vmregset->count, regnums, regvals);
		if (error == 0)
			error = copyout(regvals, vmregset->regvals,
			    sizeof(regvals[0]) * vmregset->count);
		free(regvals, M_VMMDEV);
		free(regnums, M_VMMDEV);
		break;
	case VM_SET_REGISTER_SET:
		vmregset = (struct vm_register_set *)data;
		if (vmregset->count > VM_REG_LAST) {
			error = EINVAL;
			break;
		}
		regvals = malloc(sizeof(regvals[0]) * vmregset->count, M_VMMDEV,
		    M_WAITOK);
		regnums = malloc(sizeof(regnums[0]) * vmregset->count, M_VMMDEV,
		    M_WAITOK);
		error = copyin(vmregset->regnums, regnums, sizeof(regnums[0]) *
		    vmregset->count);
		if (error == 0)
			error = copyin(vmregset->regvals, regvals,
			    sizeof(regvals[0]) * vmregset->count);
		if (error == 0)
			error = vm_set_register_set(vcpu,
			    vmregset->count, regnums, regvals);
		free(regvals, M_VMMDEV);
		free(regnums, M_VMMDEV);
		break;
	case VM_GET_CAPABILITY:
		vmcap = (struct vm_capability *)data;
		error = vm_get_capability(vcpu,
					  vmcap->captype,
					  &vmcap->capval);
		break;
	case VM_SET_CAPABILITY:
		vmcap = (struct vm_capability *)data;
		error = vm_set_capability(vcpu,
					  vmcap->captype,
					  vmcap->capval);
		break;
	case VM_SET_X2APIC_STATE:
		x2apic = (struct vm_x2apic *)data;
		error = vm_set_x2apic_state(vcpu, x2apic->state);
		break;
	case VM_GET_X2APIC_STATE:
		x2apic = (struct vm_x2apic *)data;
		error = vm_get_x2apic_state(vcpu, &x2apic->state);
		break;
	case VM_GET_GPA_PMAP:
		gpapte = (struct vm_gpa_pte *)data;
		pmap_get_mapping(vmspace_pmap(vm_get_vmspace(sc->vm)),
				 gpapte->gpa, gpapte->pte, &gpapte->ptenum);
		error = 0;
		break;
	case VM_GET_HPET_CAPABILITIES:
		error = vhpet_getcap((struct vm_hpet_cap *)data);
		break;
	case VM_GLA2GPA: {
		CTASSERT(PROT_READ == VM_PROT_READ);
		CTASSERT(PROT_WRITE == VM_PROT_WRITE);
		CTASSERT(PROT_EXEC == VM_PROT_EXECUTE);
		gg = (struct vm_gla2gpa *)data;
		error = vm_gla2gpa(vcpu, &gg->paging, gg->gla,
		    gg->prot, &gg->gpa, &gg->fault);
		KASSERT(error == 0 || error == EFAULT,
		    ("%s: vm_gla2gpa unknown error %d", __func__, error));
		break;
	}
	case VM_GLA2GPA_NOFAULT:
		gg = (struct vm_gla2gpa *)data;
		error = vm_gla2gpa_nofault(vcpu, &gg->paging, gg->gla,
		    gg->prot, &gg->gpa, &gg->fault);
		KASSERT(error == 0 || error == EFAULT,
		    ("%s: vm_gla2gpa unknown error %d", __func__, error));
		break;
	case VM_ACTIVATE_CPU:
		error = vm_activate_cpu(vcpu);
		break;
	case VM_GET_CPUS:
		error = 0;
		vm_cpuset = (struct vm_cpuset *)data;
		size = vm_cpuset->cpusetsize;
		if (size < 1 || size > CPU_MAXSIZE / NBBY) {
			error = ERANGE;
			break;
		}
		cpuset = malloc(max(size, sizeof(cpuset_t)), M_TEMP,
		    M_WAITOK | M_ZERO);
		if (vm_cpuset->which == VM_ACTIVE_CPUS)
			*cpuset = vm_active_cpus(sc->vm);
		else if (vm_cpuset->which == VM_SUSPENDED_CPUS)
			*cpuset = vm_suspended_cpus(sc->vm);
		else if (vm_cpuset->which == VM_DEBUG_CPUS)
			*cpuset = vm_debug_cpus(sc->vm);
		else
			error = EINVAL;
		if (error == 0 && size < howmany(CPU_FLS(cpuset), NBBY))
			error = ERANGE;
		if (error == 0)
			error = copyout(cpuset, vm_cpuset->cpus, size);
		free(cpuset, M_TEMP);
		break;
	case VM_SUSPEND_CPU:
		error = vm_suspend_cpu(sc->vm, vcpu);
		break;
	case VM_RESUME_CPU:
		error = vm_resume_cpu(sc->vm, vcpu);
		break;
	case VM_SET_INTINFO:
		vmii = (struct vm_intinfo *)data;
		error = vm_exit_intinfo(vcpu, vmii->info1);
		break;
	case VM_GET_INTINFO:
		vmii = (struct vm_intinfo *)data;
		error = vm_get_intinfo(vcpu, &vmii->info1, &vmii->info2);
		break;
	case VM_RTC_WRITE:
		rtcdata = (struct vm_rtc_data *)data;
		error = vrtc_nvram_write(sc->vm, rtcdata->offset,
		    rtcdata->value);
		break;
	case VM_RTC_READ:
		rtcdata = (struct vm_rtc_data *)data;
		error = vrtc_nvram_read(sc->vm, rtcdata->offset,
		    &rtcdata->value);
		break;
	case VM_RTC_SETTIME:
		rtctime = (struct vm_rtc_time *)data;
		error = vrtc_set_time(sc->vm, rtctime->secs);
		break;
	case VM_RTC_GETTIME:
		error = 0;
		rtctime = (struct vm_rtc_time *)data;
		rtctime->secs = vrtc_get_time(sc->vm);
		break;
	case VM_RESTART_INSTRUCTION:
		error = vm_restart_instruction(vcpu);
		break;
	case VM_SET_TOPOLOGY:
		topology = (struct vm_cpu_topology *)data;
		error = vm_set_topology(sc->vm, topology->sockets,
		    topology->cores, topology->threads, topology->maxcpus);
		break;
	case VM_GET_TOPOLOGY:
		topology = (struct vm_cpu_topology *)data;
		vm_get_topology(sc->vm, &topology->sockets, &topology->cores,
		    &topology->threads, &topology->maxcpus);
		error = 0;
		break;
#ifdef BHYVE_SNAPSHOT
	case VM_SNAPSHOT_REQ:
		snapshot_meta = (struct vm_snapshot_meta *)data;
		error = vm_snapshot_req(sc->vm, snapshot_meta);
		break;
#ifdef COMPAT_FREEBSD13
	case VM_SNAPSHOT_REQ_OLD:
		/*
		 * The old structure just has an additional pointer at
		 * the start that is ignored.
		 */
		snapshot_old = (struct vm_snapshot_meta_old *)data;
		snapshot_meta =
		    (struct vm_snapshot_meta *)&snapshot_old->dev_data;
		error = vm_snapshot_req(sc->vm, snapshot_meta);
		break;
#endif
	case VM_RESTORE_TIME:
		error = vm_restore_time(sc->vm);
		break;
#endif
	default:
		error = ENOTTY;
		break;
	}

done:
	if (vcpus_locked == SINGLE)
		vcpu_unlock_one(vcpu);
	else if (vcpus_locked == ALL)
		vcpu_unlock_all(sc);
	if (memsegs_locked)
		vm_unlock_memsegs(sc->vm);

	/*
	 * Make sure that no handler returns a kernel-internal
	 * error value to userspace.
	 */
	KASSERT(error == ERESTART || error >= 0,
	    ("vmmdev_ioctl: invalid error return %d", error));
	return (error);
}

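/*
 * d_mmap_single handler: translates an mmap(2) of /dev/vmm/<name> at a
 * guest-physical offset into the backing VM object.  The requested range
 * must fall entirely within one sysmem mapping and may not be executable.
 * A hypothetical mapping of the first page of guest memory:
 *
 *	char *gpa0 = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED,
 *	    vmfd, 0);
 */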
static int
vmmdev_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t mapsize,
    struct vm_object **objp, int nprot)
{
	struct vmmdev_softc *sc;
	vm_paddr_t gpa;
	size_t len;
	vm_ooffset_t segoff, first, last;
	int error, found, segid;
	bool sysmem;

	error = vmm_priv_check(curthread->td_ucred);
	if (error)
		return (error);

	first = *offset;
	last = first + mapsize;
	if ((nprot & PROT_EXEC) || first < 0 || first >= last)
		return (EINVAL);

	sc = vmmdev_lookup2(cdev);
	if (sc == NULL) {
		/* virtual machine is in the process of being created */
		return (EINVAL);
	}

	/*
	 * Get a read lock on the guest memory map.
	 */
	vm_slock_memsegs(sc->vm);

	gpa = 0;
	found = 0;
	while (!found) {
		error = vm_mmap_getnext(sc->vm, &gpa, &segid, &segoff, &len,
		    NULL, NULL);
		if (error)
			break;

		if (first >= gpa && last <= gpa + len)
			found = 1;
		else
			gpa += len;
	}

	if (found) {
		error = vm_get_memseg(sc->vm, segid, &len, &sysmem, objp);
		KASSERT(error == 0 && *objp != NULL,
		    ("%s: invalid memory segment %d", __func__, segid));
		if (sysmem) {
			vm_object_reference(*objp);
			*offset = segoff + (first - gpa);
		} else {
			error = EINVAL;
		}
	}
	vm_unlock_memsegs(sc->vm);
	return (error);
}

static void
vmmdev_destroy(void *arg)
{
	struct vmmdev_softc *sc = arg;
	struct devmem_softc *dsc;
	int error __diagused;

	vm_disable_vcpu_creation(sc->vm);
	error = vcpu_lock_all(sc);
	KASSERT(error == 0, ("%s: error %d freezing vcpus", __func__, error));
	vm_unlock_vcpus(sc->vm);

	while ((dsc = SLIST_FIRST(&sc->devmem)) != NULL) {
		KASSERT(dsc->cdev == NULL, ("%s: devmem not free", __func__));
		SLIST_REMOVE_HEAD(&sc->devmem, link);
		free(dsc->name, M_VMMDEV);
		free(dsc, M_VMMDEV);
	}

	if (sc->cdev != NULL)
		destroy_dev(sc->cdev);

	if (sc->vm != NULL)
		vm_destroy(sc->vm);

	if (sc->ucred != NULL)
		crfree(sc->ucred);

	if ((sc->flags & VSC_LINKED) != 0) {
		mtx_lock(&vmmdev_mtx);
		SLIST_REMOVE(&head, sc, vmmdev_softc, link);
		mtx_unlock(&vmmdev_mtx);
	}

	free(sc, M_VMMDEV);
}

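/*
 * Destroy a virtual machine by writing its name to this sysctl, e.g.:
 *
 *	sysctl hw.vmm.destroy=vm0
 */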
static int
sysctl_vmm_destroy(SYSCTL_HANDLER_ARGS)
{
	struct devmem_softc *dsc;
	struct vmmdev_softc *sc;
	struct cdev *cdev;
	char *buf;
	int error, buflen;

	error = vmm_priv_check(req->td->td_ucred);
	if (error)
		return (error);

	buflen = VM_MAX_NAMELEN + 1;
	buf = malloc(buflen, M_VMMDEV, M_WAITOK | M_ZERO);
	strlcpy(buf, "beavis", buflen);
	error = sysctl_handle_string(oidp, buf, buflen, req);
	if (error != 0 || req->newptr == NULL)
		goto out;

	mtx_lock(&vmmdev_mtx);
	sc = vmmdev_lookup(buf);
	if (sc == NULL || sc->cdev == NULL) {
		mtx_unlock(&vmmdev_mtx);
		error = EINVAL;
		goto out;
	}

	/*
	 * Setting 'sc->cdev' to NULL is used to indicate that the VM
	 * is scheduled for destruction.
	 */
	cdev = sc->cdev;
	sc->cdev = NULL;
	mtx_unlock(&vmmdev_mtx);

	/*
	 * Destroy all cdevs:
	 *
	 * - any new operations on the 'cdev' will return an error (ENXIO).
	 *
	 * - the 'devmem' cdevs are destroyed before the virtual machine 'cdev'
	 */
	SLIST_FOREACH(dsc, &sc->devmem, link) {
		KASSERT(dsc->cdev != NULL, ("devmem cdev already destroyed"));
		destroy_dev(dsc->cdev);
		devmem_destroy(dsc);
	}
	destroy_dev(cdev);
	vmmdev_destroy(sc);
	error = 0;

out:
	free(buf, M_VMMDEV);
	return (error);
}
SYSCTL_PROC(_hw_vmm, OID_AUTO, destroy,
    CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_MPSAFE,
    NULL, 0, sysctl_vmm_destroy, "A",
    NULL);

static struct cdevsw vmmdevsw = {
	.d_name		= "vmmdev",
	.d_version	= D_VERSION,
	.d_ioctl	= vmmdev_ioctl,
	.d_mmap_single	= vmmdev_mmap_single,
	.d_read		= vmmdev_rw,
	.d_write	= vmmdev_rw,
};

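/*
 * Create a virtual machine by writing its name to this sysctl, e.g.:
 *
 *	sysctl hw.vmm.create=vm0
 */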
static int
sysctl_vmm_create(SYSCTL_HANDLER_ARGS)
{
	struct vm *vm;
	struct cdev *cdev;
	struct vmmdev_softc *sc, *sc2;
	char *buf;
	int error, buflen;

	error = vmm_priv_check(req->td->td_ucred);
	if (error)
		return (error);

	buflen = VM_MAX_NAMELEN + 1;
	buf = malloc(buflen, M_VMMDEV, M_WAITOK | M_ZERO);
	strlcpy(buf, "beavis", buflen);
	error = sysctl_handle_string(oidp, buf, buflen, req);
	if (error != 0 || req->newptr == NULL)
		goto out;

	mtx_lock(&vmmdev_mtx);
	sc = vmmdev_lookup(buf);
	mtx_unlock(&vmmdev_mtx);
	if (sc != NULL) {
		error = EEXIST;
		goto out;
	}

	error = vm_create(buf, &vm);
	if (error != 0)
		goto out;

	sc = malloc(sizeof(struct vmmdev_softc), M_VMMDEV, M_WAITOK | M_ZERO);
	sc->ucred = crhold(curthread->td_ucred);
	sc->vm = vm;
	SLIST_INIT(&sc->devmem);

	/*
	 * Look up the name again just in case somebody sneaked in when we
	 * dropped the lock.
	 */
	mtx_lock(&vmmdev_mtx);
	sc2 = vmmdev_lookup(buf);
	if (sc2 == NULL) {
		SLIST_INSERT_HEAD(&head, sc, link);
		sc->flags |= VSC_LINKED;
	}
	mtx_unlock(&vmmdev_mtx);

	if (sc2 != NULL) {
		vmmdev_destroy(sc);
		error = EEXIST;
		goto out;
	}

	error = make_dev_p(MAKEDEV_CHECKNAME, &cdev, &vmmdevsw, sc->ucred,
	    UID_ROOT, GID_WHEEL, 0600, "vmm/%s", buf);
	if (error != 0) {
		vmmdev_destroy(sc);
		goto out;
	}

	mtx_lock(&vmmdev_mtx);
	sc->cdev = cdev;
	sc->cdev->si_drv1 = sc;
	mtx_unlock(&vmmdev_mtx);

out:
	free(buf, M_VMMDEV);
	return (error);
}
SYSCTL_PROC(_hw_vmm, OID_AUTO, create,
    CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_MPSAFE,
    NULL, 0, sysctl_vmm_create, "A",
    NULL);

void
vmmdev_init(void)
{
	pr_allow_flag = prison_add_allow(NULL, "vmm", NULL,
	    "Allow use of vmm in a jail.");
}

int
vmmdev_cleanup(void)
{
	int error;

	if (SLIST_EMPTY(&head))
		error = 0;
	else
		error = EBUSY;

	return (error);
}

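/*
 * d_mmap_single handler for a devmem segment's cdev.  The requested range
 * must lie entirely within the (always non-sysmem) segment.
 */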
static int
devmem_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t len,
    struct vm_object **objp, int nprot)
{
	struct devmem_softc *dsc;
	vm_ooffset_t first, last;
	size_t seglen;
	int error;
	bool sysmem;

	dsc = cdev->si_drv1;
	if (dsc == NULL) {
		/* 'cdev' has been created but is not ready for use */
		return (ENXIO);
	}

	first = *offset;
	last = *offset + len;
	if ((nprot & PROT_EXEC) || first < 0 || first >= last)
		return (EINVAL);

	vm_slock_memsegs(dsc->sc->vm);

	error = vm_get_memseg(dsc->sc->vm, dsc->segid, &seglen, &sysmem, objp);
	KASSERT(error == 0 && !sysmem && *objp != NULL,
	    ("%s: invalid devmem segment %d", __func__, dsc->segid));

	if (seglen >= last)
		vm_object_reference(*objp);
	else
		error = EINVAL;

	vm_unlock_memsegs(dsc->sc->vm);
	return (error);
}

static struct cdevsw devmemsw = {
	.d_name		= "devmem",
	.d_version	= D_VERSION,
	.d_mmap_single	= devmem_mmap_single,
};

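/*
 * Create the /dev/vmm.io/<vmname>.<devname> node for a devmem segment.
 * On success ownership of 'devname' passes to the devmem softc; it is
 * freed when the cdev is destroyed.
 */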
static int
devmem_create_cdev(const char *vmname, int segid, char *devname)
{
	struct devmem_softc *dsc;
	struct vmmdev_softc *sc;
	struct cdev *cdev;
	int error;

	error = make_dev_p(MAKEDEV_CHECKNAME, &cdev, &devmemsw, NULL,
	    UID_ROOT, GID_WHEEL, 0600, "vmm.io/%s.%s", vmname, devname);
	if (error)
		return (error);

	dsc = malloc(sizeof(struct devmem_softc), M_VMMDEV, M_WAITOK | M_ZERO);

	mtx_lock(&vmmdev_mtx);
	sc = vmmdev_lookup(vmname);
	KASSERT(sc != NULL, ("%s: vm %s softc not found", __func__, vmname));
	if (sc->cdev == NULL) {
		/* virtual machine is being created or destroyed */
		mtx_unlock(&vmmdev_mtx);
		free(dsc, M_VMMDEV);
		destroy_dev_sched_cb(cdev, NULL, 0);
		return (ENODEV);
	}

	dsc->segid = segid;
	dsc->name = devname;
	dsc->cdev = cdev;
	dsc->sc = sc;
	SLIST_INSERT_HEAD(&sc->devmem, dsc, link);
	mtx_unlock(&vmmdev_mtx);

	/* The 'cdev' is ready for use after 'si_drv1' is initialized */
	cdev->si_drv1 = dsc;
	return (0);
}

static void
devmem_destroy(void *arg)
{
	struct devmem_softc *dsc = arg;

	KASSERT(dsc->cdev, ("%s: devmem cdev already destroyed", __func__));
	dsc->cdev = NULL;
	dsc->sc = NULL;
}

1455