/*-
 * Copyright (c) 2011 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD: stable/11/sys/amd64/vmm/vmm_dev.c 348271 2019-05-25 11:27:56Z rgrimes $
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: stable/11/sys/amd64/vmm/vmm_dev.c 348271 2019-05-25 11:27:56Z rgrimes $");

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/queue.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/malloc.h>
#include <sys/conf.h>
#include <sys/sysctl.h>
#include <sys/libkern.h>
#include <sys/ioccom.h>
#include <sys/mman.h>
#include <sys/uio.h>

#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>

#include <machine/vmparam.h>
#include <machine/vmm.h>
#include <machine/vmm_instruction_emul.h>
#include <machine/vmm_dev.h>

#include "vmm_lapic.h"
#include "vmm_stat.h"
#include "vmm_mem.h"
#include "io/ppt.h"
#include "io/vatpic.h"
#include "io/vioapic.h"
#include "io/vhpet.h"
#include "io/vrtc.h"

struct devmem_softc {
	int	segid;
	char	*name;
	struct cdev *cdev;
	struct vmmdev_softc *sc;
	SLIST_ENTRY(devmem_softc) link;
};

struct vmmdev_softc {
	struct vm	*vm;		/* vm instance cookie */
	struct cdev	*cdev;
	SLIST_ENTRY(vmmdev_softc) link;
	SLIST_HEAD(, devmem_softc) devmem;
	int		flags;
};
#define	VSC_LINKED		0x01

static SLIST_HEAD(, vmmdev_softc) head;

static struct mtx vmmdev_mtx;

static MALLOC_DEFINE(M_VMMDEV, "vmmdev", "vmmdev");

SYSCTL_DECL(_hw_vmm);

static int devmem_create_cdev(const char *vmname, int id, char *devmem);
static void devmem_destroy(void *arg);

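/*
 * A vcpu is "locked" by moving it into the VCPU_FROZEN state and "unlocked"
 * by returning it to VCPU_IDLE.  Freezing a single vcpu doubles as a read
 * lock on the guest memory map; freezing all vcpus is required for the
 * ioctls that modify state of the entire virtual machine.
 */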
static int
vcpu_lock_one(struct vmmdev_softc *sc, int vcpu)
{
	int error;

	if (vcpu < 0 || vcpu >= vm_get_maxcpus(sc->vm))
		return (EINVAL);

	error = vcpu_set_state(sc->vm, vcpu, VCPU_FROZEN, true);
	return (error);
}

static void
vcpu_unlock_one(struct vmmdev_softc *sc, int vcpu)
{
	enum vcpu_state state;

	state = vcpu_get_state(sc->vm, vcpu, NULL);
	if (state != VCPU_FROZEN) {
		panic("vcpu %s(%d) has invalid state %d", vm_name(sc->vm),
		    vcpu, state);
	}

	vcpu_set_state(sc->vm, vcpu, VCPU_IDLE, false);
}

static int
vcpu_lock_all(struct vmmdev_softc *sc)
{
	int error, vcpu;
	uint16_t maxcpus;

	maxcpus = vm_get_maxcpus(sc->vm);
	for (vcpu = 0; vcpu < maxcpus; vcpu++) {
		error = vcpu_lock_one(sc, vcpu);
		if (error)
			break;
	}

	if (error) {
		while (--vcpu >= 0)
			vcpu_unlock_one(sc, vcpu);
	}

	return (error);
}

static void
vcpu_unlock_all(struct vmmdev_softc *sc)
{
	int vcpu;
	uint16_t maxcpus;

	maxcpus = vm_get_maxcpus(sc->vm);
	for (vcpu = 0; vcpu < maxcpus; vcpu++)
		vcpu_unlock_one(sc, vcpu);
}

static struct vmmdev_softc *
vmmdev_lookup(const char *name)
{
	struct vmmdev_softc *sc;

#ifdef notyet	/* XXX kernel is not compiled with invariants */
	mtx_assert(&vmmdev_mtx, MA_OWNED);
#endif

	SLIST_FOREACH(sc, &head, link) {
		if (strcmp(name, vm_name(sc->vm)) == 0)
			break;
	}

	return (sc);
}

static struct vmmdev_softc *
vmmdev_lookup2(struct cdev *cdev)
{

	return (cdev->si_drv1);
}

static int
vmmdev_rw(struct cdev *cdev, struct uio *uio, int flags)
{
	int error, off, c, prot;
	vm_paddr_t gpa, maxaddr;
	void *hpa, *cookie;
	struct vmmdev_softc *sc;
	uint16_t lastcpu;

	sc = vmmdev_lookup2(cdev);
	if (sc == NULL)
		return (ENXIO);

	/*
	 * Get a read lock on the guest memory map by freezing any vcpu.
	 */
	lastcpu = vm_get_maxcpus(sc->vm) - 1;
	error = vcpu_lock_one(sc, lastcpu);
	if (error)
		return (error);

	prot = (uio->uio_rw == UIO_WRITE ? VM_PROT_WRITE : VM_PROT_READ);
	maxaddr = vmm_sysmem_maxaddr(sc->vm);
	while (uio->uio_resid > 0 && error == 0) {
		gpa = uio->uio_offset;
		off = gpa & PAGE_MASK;
		c = min(uio->uio_resid, PAGE_SIZE - off);

		/*
		 * The VM has a hole in its physical memory map. If we want to
		 * use 'dd' to inspect memory beyond the hole we need to
		 * provide bogus data for memory that lies in the hole.
		 *
		 * Since this device does not support lseek(2), dd(1) will
		 * read(2) blocks of data to simulate the lseek(2).
		 */
		hpa = vm_gpa_hold(sc->vm, lastcpu, gpa, c,
		    prot, &cookie);
		if (hpa == NULL) {
			if (uio->uio_rw == UIO_READ && gpa < maxaddr)
				error = uiomove(__DECONST(void *, zero_region),
				    c, uio);
			else
				error = EFAULT;
		} else {
			error = uiomove(hpa, c, uio);
			vm_gpa_release(cookie);
		}
	}
	vcpu_unlock_one(sc, lastcpu);
	return (error);
}
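
/*
 * Illustrative use of the read/write interface above (an example, not part
 * of the driver): guest physical memory can be inspected from userland by
 * reading /dev/vmm/<vmname> at the desired guest-physical offset, e.g. with
 * something like:
 *
 *	dd if=/dev/vmm/myvm bs=4k skip=256 count=1 | hexdump -C
 *
 * The vm name and offsets are placeholders.  Holes below the top of guest
 * system memory read back as zeroes, as implemented above.
 */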

CTASSERT(sizeof(((struct vm_memseg *)0)->name) >= SPECNAMELEN + 1);

static int
get_memseg(struct vmmdev_softc *sc, struct vm_memseg *mseg)
{
	struct devmem_softc *dsc;
	int error;
	bool sysmem;

	error = vm_get_memseg(sc->vm, mseg->segid, &mseg->len, &sysmem, NULL);
	if (error || mseg->len == 0)
		return (error);

	if (!sysmem) {
		SLIST_FOREACH(dsc, &sc->devmem, link) {
			if (dsc->segid == mseg->segid)
				break;
		}
		KASSERT(dsc != NULL, ("%s: devmem segment %d not found",
		    __func__, mseg->segid));
		error = copystr(dsc->name, mseg->name, SPECNAMELEN + 1, NULL);
	} else {
		bzero(mseg->name, sizeof(mseg->name));
	}

	return (error);
}

static int
alloc_memseg(struct vmmdev_softc *sc, struct vm_memseg *mseg)
{
	char *name;
	int error;
	bool sysmem;

	error = 0;
	name = NULL;
	sysmem = true;

	if (VM_MEMSEG_NAME(mseg)) {
		sysmem = false;
		name = malloc(SPECNAMELEN + 1, M_VMMDEV, M_WAITOK);
		error = copystr(mseg->name, name, SPECNAMELEN + 1, 0);
		if (error)
			goto done;
	}

	error = vm_alloc_memseg(sc->vm, mseg->segid, mseg->len, sysmem);
	if (error)
		goto done;

	if (VM_MEMSEG_NAME(mseg)) {
		error = devmem_create_cdev(vm_name(sc->vm), mseg->segid, name);
		if (error)
			vm_free_memseg(sc->vm, mseg->segid);
		else
			name = NULL;	/* freed when 'cdev' is destroyed */
	}
done:
	free(name, M_VMMDEV);
	return (error);
}

static int
vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
	     struct thread *td)
{
	int error, vcpu, state_changed, size;
	cpuset_t *cpuset;
	struct vmmdev_softc *sc;
	struct vm_register *vmreg;
	struct vm_seg_desc *vmsegdesc;
	struct vm_run *vmrun;
	struct vm_exception *vmexc;
	struct vm_lapic_irq *vmirq;
	struct vm_lapic_msi *vmmsi;
	struct vm_ioapic_irq *ioapic_irq;
	struct vm_isa_irq *isa_irq;
	struct vm_isa_irq_trigger *isa_irq_trigger;
	struct vm_capability *vmcap;
	struct vm_pptdev *pptdev;
	struct vm_pptdev_mmio *pptmmio;
	struct vm_pptdev_msi *pptmsi;
	struct vm_pptdev_msix *pptmsix;
	struct vm_nmi *vmnmi;
	struct vm_stats *vmstats;
	struct vm_stat_desc *statdesc;
	struct vm_x2apic *x2apic;
	struct vm_gpa_pte *gpapte;
	struct vm_suspend *vmsuspend;
	struct vm_gla2gpa *gg;
	struct vm_activate_cpu *vac;
	struct vm_cpuset *vm_cpuset;
	struct vm_intinfo *vmii;
	struct vm_rtc_time *rtctime;
	struct vm_rtc_data *rtcdata;
	struct vm_memmap *mm;
	struct vm_cpu_topology *topology;

	sc = vmmdev_lookup2(cdev);
	if (sc == NULL)
		return (ENXIO);

	error = 0;
	vcpu = -1;
	state_changed = 0;

	/*
	 * Some VMM ioctls can operate only on vcpus that are not running.
	 */
	switch (cmd) {
	case VM_RUN:
	case VM_GET_REGISTER:
	case VM_SET_REGISTER:
	case VM_GET_SEGMENT_DESCRIPTOR:
	case VM_SET_SEGMENT_DESCRIPTOR:
	case VM_INJECT_EXCEPTION:
	case VM_GET_CAPABILITY:
	case VM_SET_CAPABILITY:
	case VM_PPTDEV_MSI:
	case VM_PPTDEV_MSIX:
	case VM_SET_X2APIC_STATE:
	case VM_GLA2GPA:
	case VM_ACTIVATE_CPU:
	case VM_SET_INTINFO:
	case VM_GET_INTINFO:
	case VM_RESTART_INSTRUCTION:
		/*
		 * XXX fragile, handle with care
		 * Assumes that the first field of the ioctl data is the vcpu.
		 */
		vcpu = *(int *)data;
		error = vcpu_lock_one(sc, vcpu);
		if (error)
			goto done;
		state_changed = 1;
		break;

	case VM_MAP_PPTDEV_MMIO:
	case VM_BIND_PPTDEV:
	case VM_UNBIND_PPTDEV:
	case VM_ALLOC_MEMSEG:
	case VM_MMAP_MEMSEG:
	case VM_REINIT:
		/*
		 * ioctls that operate on the entire virtual machine must
		 * prevent all vcpus from running.
		 */
		error = vcpu_lock_all(sc);
		if (error)
			goto done;
		state_changed = 2;
		break;

	case VM_GET_MEMSEG:
	case VM_MMAP_GETNEXT:
		/*
		 * Lock a vcpu to make sure that the memory map cannot be
		 * modified while it is being inspected.
		 */
		vcpu = vm_get_maxcpus(sc->vm) - 1;
		error = vcpu_lock_one(sc, vcpu);
		if (error)
			goto done;
		state_changed = 1;
		break;

	default:
		break;
	}

	switch(cmd) {
	case VM_RUN:
		vmrun = (struct vm_run *)data;
		error = vm_run(sc->vm, vmrun);
		break;
	case VM_SUSPEND:
		vmsuspend = (struct vm_suspend *)data;
		error = vm_suspend(sc->vm, vmsuspend->how);
		break;
	case VM_REINIT:
		error = vm_reinit(sc->vm);
		break;
	case VM_STAT_DESC: {
		statdesc = (struct vm_stat_desc *)data;
		error = vmm_stat_desc_copy(statdesc->index,
					statdesc->desc, sizeof(statdesc->desc));
		break;
	}
	case VM_STATS: {
		CTASSERT(MAX_VM_STATS >= MAX_VMM_STAT_ELEMS);
		vmstats = (struct vm_stats *)data;
		getmicrotime(&vmstats->tv);
		error = vmm_stat_copy(sc->vm, vmstats->cpuid,
				      &vmstats->num_entries, vmstats->statbuf);
		break;
	}
	case VM_PPTDEV_MSI:
		pptmsi = (struct vm_pptdev_msi *)data;
		error = ppt_setup_msi(sc->vm, pptmsi->vcpu,
				      pptmsi->bus, pptmsi->slot, pptmsi->func,
				      pptmsi->addr, pptmsi->msg,
				      pptmsi->numvec);
		break;
	case VM_PPTDEV_MSIX:
		pptmsix = (struct vm_pptdev_msix *)data;
		error = ppt_setup_msix(sc->vm, pptmsix->vcpu,
				       pptmsix->bus, pptmsix->slot,
				       pptmsix->func, pptmsix->idx,
				       pptmsix->addr, pptmsix->msg,
				       pptmsix->vector_control);
		break;
	case VM_MAP_PPTDEV_MMIO:
		pptmmio = (struct vm_pptdev_mmio *)data;
		error = ppt_map_mmio(sc->vm, pptmmio->bus, pptmmio->slot,
				     pptmmio->func, pptmmio->gpa, pptmmio->len,
				     pptmmio->hpa);
		break;
	case VM_BIND_PPTDEV:
		pptdev = (struct vm_pptdev *)data;
		error = vm_assign_pptdev(sc->vm, pptdev->bus, pptdev->slot,
					 pptdev->func);
		break;
	case VM_UNBIND_PPTDEV:
		pptdev = (struct vm_pptdev *)data;
		error = vm_unassign_pptdev(sc->vm, pptdev->bus, pptdev->slot,
					   pptdev->func);
		break;
	case VM_INJECT_EXCEPTION:
		vmexc = (struct vm_exception *)data;
		error = vm_inject_exception(sc->vm, vmexc->cpuid,
		    vmexc->vector, vmexc->error_code_valid, vmexc->error_code,
		    vmexc->restart_instruction);
		break;
	case VM_INJECT_NMI:
		vmnmi = (struct vm_nmi *)data;
		error = vm_inject_nmi(sc->vm, vmnmi->cpuid);
		break;
	case VM_LAPIC_IRQ:
		vmirq = (struct vm_lapic_irq *)data;
		error = lapic_intr_edge(sc->vm, vmirq->cpuid, vmirq->vector);
		break;
	case VM_LAPIC_LOCAL_IRQ:
		vmirq = (struct vm_lapic_irq *)data;
		error = lapic_set_local_intr(sc->vm, vmirq->cpuid,
		    vmirq->vector);
		break;
	case VM_LAPIC_MSI:
		vmmsi = (struct vm_lapic_msi *)data;
		error = lapic_intr_msi(sc->vm, vmmsi->addr, vmmsi->msg);
		break;
	case VM_IOAPIC_ASSERT_IRQ:
		ioapic_irq = (struct vm_ioapic_irq *)data;
		error = vioapic_assert_irq(sc->vm, ioapic_irq->irq);
		break;
	case VM_IOAPIC_DEASSERT_IRQ:
		ioapic_irq = (struct vm_ioapic_irq *)data;
		error = vioapic_deassert_irq(sc->vm, ioapic_irq->irq);
		break;
	case VM_IOAPIC_PULSE_IRQ:
		ioapic_irq = (struct vm_ioapic_irq *)data;
		error = vioapic_pulse_irq(sc->vm, ioapic_irq->irq);
		break;
	case VM_IOAPIC_PINCOUNT:
		*(int *)data = vioapic_pincount(sc->vm);
		break;
	case VM_ISA_ASSERT_IRQ:
		isa_irq = (struct vm_isa_irq *)data;
		error = vatpic_assert_irq(sc->vm, isa_irq->atpic_irq);
		if (error == 0 && isa_irq->ioapic_irq != -1)
			error = vioapic_assert_irq(sc->vm,
			    isa_irq->ioapic_irq);
		break;
	case VM_ISA_DEASSERT_IRQ:
		isa_irq = (struct vm_isa_irq *)data;
		error = vatpic_deassert_irq(sc->vm, isa_irq->atpic_irq);
		if (error == 0 && isa_irq->ioapic_irq != -1)
			error = vioapic_deassert_irq(sc->vm,
			    isa_irq->ioapic_irq);
		break;
	case VM_ISA_PULSE_IRQ:
		isa_irq = (struct vm_isa_irq *)data;
		error = vatpic_pulse_irq(sc->vm, isa_irq->atpic_irq);
		if (error == 0 && isa_irq->ioapic_irq != -1)
			error = vioapic_pulse_irq(sc->vm, isa_irq->ioapic_irq);
		break;
	case VM_ISA_SET_IRQ_TRIGGER:
		isa_irq_trigger = (struct vm_isa_irq_trigger *)data;
		error = vatpic_set_irq_trigger(sc->vm,
		    isa_irq_trigger->atpic_irq, isa_irq_trigger->trigger);
		break;
	case VM_MMAP_GETNEXT:
		mm = (struct vm_memmap *)data;
		error = vm_mmap_getnext(sc->vm, &mm->gpa, &mm->segid,
		    &mm->segoff, &mm->len, &mm->prot, &mm->flags);
		break;
	case VM_MMAP_MEMSEG:
		mm = (struct vm_memmap *)data;
		error = vm_mmap_memseg(sc->vm, mm->gpa, mm->segid, mm->segoff,
		    mm->len, mm->prot, mm->flags);
		break;
	case VM_ALLOC_MEMSEG:
		error = alloc_memseg(sc, (struct vm_memseg *)data);
		break;
	case VM_GET_MEMSEG:
		error = get_memseg(sc, (struct vm_memseg *)data);
		break;
	case VM_GET_REGISTER:
		vmreg = (struct vm_register *)data;
		error = vm_get_register(sc->vm, vmreg->cpuid, vmreg->regnum,
					&vmreg->regval);
		break;
	case VM_SET_REGISTER:
		vmreg = (struct vm_register *)data;
		error = vm_set_register(sc->vm, vmreg->cpuid, vmreg->regnum,
					vmreg->regval);
		break;
	case VM_SET_SEGMENT_DESCRIPTOR:
		vmsegdesc = (struct vm_seg_desc *)data;
		error = vm_set_seg_desc(sc->vm, vmsegdesc->cpuid,
					vmsegdesc->regnum,
					&vmsegdesc->desc);
		break;
	case VM_GET_SEGMENT_DESCRIPTOR:
		vmsegdesc = (struct vm_seg_desc *)data;
		error = vm_get_seg_desc(sc->vm, vmsegdesc->cpuid,
					vmsegdesc->regnum,
					&vmsegdesc->desc);
		break;
	case VM_GET_CAPABILITY:
		vmcap = (struct vm_capability *)data;
		error = vm_get_capability(sc->vm, vmcap->cpuid,
					  vmcap->captype,
					  &vmcap->capval);
		break;
	case VM_SET_CAPABILITY:
		vmcap = (struct vm_capability *)data;
		error = vm_set_capability(sc->vm, vmcap->cpuid,
					  vmcap->captype,
					  vmcap->capval);
		break;
	case VM_SET_X2APIC_STATE:
		x2apic = (struct vm_x2apic *)data;
		error = vm_set_x2apic_state(sc->vm,
					    x2apic->cpuid, x2apic->state);
		break;
	case VM_GET_X2APIC_STATE:
		x2apic = (struct vm_x2apic *)data;
		error = vm_get_x2apic_state(sc->vm,
					    x2apic->cpuid, &x2apic->state);
		break;
	case VM_GET_GPA_PMAP:
		gpapte = (struct vm_gpa_pte *)data;
		pmap_get_mapping(vmspace_pmap(vm_get_vmspace(sc->vm)),
				 gpapte->gpa, gpapte->pte, &gpapte->ptenum);
		error = 0;
		break;
	case VM_GET_HPET_CAPABILITIES:
		error = vhpet_getcap((struct vm_hpet_cap *)data);
		break;
	case VM_GLA2GPA: {
		CTASSERT(PROT_READ == VM_PROT_READ);
		CTASSERT(PROT_WRITE == VM_PROT_WRITE);
		CTASSERT(PROT_EXEC == VM_PROT_EXECUTE);
		gg = (struct vm_gla2gpa *)data;
		error = vm_gla2gpa(sc->vm, gg->vcpuid, &gg->paging, gg->gla,
		    gg->prot, &gg->gpa, &gg->fault);
		KASSERT(error == 0 || error == EFAULT,
		    ("%s: vm_gla2gpa unknown error %d", __func__, error));
		break;
	}
	case VM_ACTIVATE_CPU:
		vac = (struct vm_activate_cpu *)data;
		error = vm_activate_cpu(sc->vm, vac->vcpuid);
		break;
	case VM_GET_CPUS:
		error = 0;
		vm_cpuset = (struct vm_cpuset *)data;
		size = vm_cpuset->cpusetsize;
		if (size < sizeof(cpuset_t) || size > CPU_MAXSIZE / NBBY) {
			error = ERANGE;
			break;
		}
		cpuset = malloc(size, M_TEMP, M_WAITOK | M_ZERO);
		if (vm_cpuset->which == VM_ACTIVE_CPUS)
			*cpuset = vm_active_cpus(sc->vm);
		else if (vm_cpuset->which == VM_SUSPENDED_CPUS)
			*cpuset = vm_suspended_cpus(sc->vm);
		else
			error = EINVAL;
		if (error == 0)
			error = copyout(cpuset, vm_cpuset->cpus, size);
		free(cpuset, M_TEMP);
		break;
	case VM_SET_INTINFO:
		vmii = (struct vm_intinfo *)data;
		error = vm_exit_intinfo(sc->vm, vmii->vcpuid, vmii->info1);
		break;
	case VM_GET_INTINFO:
		vmii = (struct vm_intinfo *)data;
		error = vm_get_intinfo(sc->vm, vmii->vcpuid, &vmii->info1,
		    &vmii->info2);
		break;
	case VM_RTC_WRITE:
		rtcdata = (struct vm_rtc_data *)data;
		error = vrtc_nvram_write(sc->vm, rtcdata->offset,
		    rtcdata->value);
		break;
	case VM_RTC_READ:
		rtcdata = (struct vm_rtc_data *)data;
		error = vrtc_nvram_read(sc->vm, rtcdata->offset,
		    &rtcdata->value);
		break;
	case VM_RTC_SETTIME:
		rtctime = (struct vm_rtc_time *)data;
		error = vrtc_set_time(sc->vm, rtctime->secs);
		break;
	case VM_RTC_GETTIME:
		error = 0;
		rtctime = (struct vm_rtc_time *)data;
		rtctime->secs = vrtc_get_time(sc->vm);
		break;
	case VM_RESTART_INSTRUCTION:
		error = vm_restart_instruction(sc->vm, vcpu);
		break;
	case VM_SET_TOPOLOGY:
		topology = (struct vm_cpu_topology *)data;
		error = vm_set_topology(sc->vm, topology->sockets,
		    topology->cores, topology->threads, topology->maxcpus);
		break;
	case VM_GET_TOPOLOGY:
		topology = (struct vm_cpu_topology *)data;
		vm_get_topology(sc->vm, &topology->sockets, &topology->cores,
		    &topology->threads, &topology->maxcpus);
		error = 0;
		break;
	default:
		error = ENOTTY;
		break;
	}

	if (state_changed == 1)
		vcpu_unlock_one(sc, vcpu);
	else if (state_changed == 2)
		vcpu_unlock_all(sc);

done:
	/* Make sure that no handler returns a bogus value like ERESTART */
	KASSERT(error >= 0, ("vmmdev_ioctl: invalid error return %d", error));
	return (error);
}
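
/*
 * Sketch of how a userland consumer is expected to drive the ioctl handler
 * above (assumed usage, not part of this file): open the /dev/vmm/<vmname>
 * node created by sysctl_vmm_create() and issue the commands defined in
 * <machine/vmm_dev.h>, e.g.
 *
 *	int fd = open("/dev/vmm/myvm", O_RDWR);
 *	struct vm_run vmrun = { .cpuid = 0 };
 *	if (ioctl(fd, VM_RUN, &vmrun) == 0)
 *		... examine the exit information returned in 'vmrun' ...
 *
 * Per-vcpu commands freeze only the target vcpu; VM-wide commands freeze
 * all vcpus for the duration of the call, as arranged above.
 */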

static int
vmmdev_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t mapsize,
    struct vm_object **objp, int nprot)
{
	struct vmmdev_softc *sc;
	vm_paddr_t gpa;
	size_t len;
	vm_ooffset_t segoff, first, last;
	int error, found, segid;
	uint16_t lastcpu;
	bool sysmem;

	first = *offset;
	last = first + mapsize;
	if ((nprot & PROT_EXEC) || first < 0 || first >= last)
		return (EINVAL);

	sc = vmmdev_lookup2(cdev);
	if (sc == NULL) {
		/* virtual machine is in the process of being created */
		return (EINVAL);
	}

	/*
	 * Get a read lock on the guest memory map by freezing any vcpu.
	 */
	lastcpu = vm_get_maxcpus(sc->vm) - 1;
	error = vcpu_lock_one(sc, lastcpu);
	if (error)
		return (error);

	gpa = 0;
	found = 0;
	while (!found) {
		error = vm_mmap_getnext(sc->vm, &gpa, &segid, &segoff, &len,
		    NULL, NULL);
		if (error)
			break;

		if (first >= gpa && last <= gpa + len)
			found = 1;
		else
			gpa += len;
	}

	if (found) {
		error = vm_get_memseg(sc->vm, segid, &len, &sysmem, objp);
		KASSERT(error == 0 && *objp != NULL,
		    ("%s: invalid memory segment %d", __func__, segid));
		if (sysmem) {
			vm_object_reference(*objp);
			*offset = segoff + (first - gpa);
		} else {
			error = EINVAL;
		}
	}
	vcpu_unlock_one(sc, lastcpu);
	return (error);
}
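
/*
 * Example of the mapping path above (illustrative only): a process holding
 * the VM's device descriptor can map guest system memory into its address
 * space by passing the guest-physical address as the file offset, e.g.
 *
 *	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED,
 *	    vmfd, gpa);
 *
 * Mappings are permitted only for 'sysmem' segments and must not request
 * PROT_EXEC, per the checks above.
 */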

static void
vmmdev_destroy(void *arg)
{
	struct vmmdev_softc *sc = arg;
	struct devmem_softc *dsc;
	int error;

	error = vcpu_lock_all(sc);
	KASSERT(error == 0, ("%s: error %d freezing vcpus", __func__, error));

	while ((dsc = SLIST_FIRST(&sc->devmem)) != NULL) {
		KASSERT(dsc->cdev == NULL, ("%s: devmem not free", __func__));
		SLIST_REMOVE_HEAD(&sc->devmem, link);
		free(dsc->name, M_VMMDEV);
		free(dsc, M_VMMDEV);
	}

	if (sc->cdev != NULL)
		destroy_dev(sc->cdev);

	if (sc->vm != NULL)
		vm_destroy(sc->vm);

	if ((sc->flags & VSC_LINKED) != 0) {
		mtx_lock(&vmmdev_mtx);
		SLIST_REMOVE(&head, sc, vmmdev_softc, link);
		mtx_unlock(&vmmdev_mtx);
	}

	free(sc, M_VMMDEV);
}

static int
sysctl_vmm_destroy(SYSCTL_HANDLER_ARGS)
{
	int error;
	char buf[VM_MAX_NAMELEN];
	struct devmem_softc *dsc;
	struct vmmdev_softc *sc;
	struct cdev *cdev;

	strlcpy(buf, "beavis", sizeof(buf));
	error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
	if (error != 0 || req->newptr == NULL)
		return (error);

	mtx_lock(&vmmdev_mtx);
	sc = vmmdev_lookup(buf);
	if (sc == NULL || sc->cdev == NULL) {
		mtx_unlock(&vmmdev_mtx);
		return (EINVAL);
	}

	/*
	 * The 'cdev' will be destroyed asynchronously when 'si_threadcount'
	 * goes down to 0 so we should not do it again in the callback.
	 *
	 * Setting 'sc->cdev' to NULL is also used to indicate that the VM
	 * is scheduled for destruction.
	 */
	cdev = sc->cdev;
	sc->cdev = NULL;
	mtx_unlock(&vmmdev_mtx);

	/*
	 * Schedule all cdevs to be destroyed:
	 *
	 * - any new operations on the 'cdev' will return an error (ENXIO).
	 *
	 * - when the 'si_threadcount' dwindles down to zero the 'cdev' will
	 *   be destroyed and the callback will be invoked in a taskqueue
	 *   context.
	 *
	 * - the 'devmem' cdevs are destroyed before the virtual machine 'cdev'
	 */
	SLIST_FOREACH(dsc, &sc->devmem, link) {
		KASSERT(dsc->cdev != NULL, ("devmem cdev already destroyed"));
		destroy_dev_sched_cb(dsc->cdev, devmem_destroy, dsc);
	}
	destroy_dev_sched_cb(cdev, vmmdev_destroy, sc);
	return (0);
}
SYSCTL_PROC(_hw_vmm, OID_AUTO, destroy, CTLTYPE_STRING | CTLFLAG_RW,
	    NULL, 0, sysctl_vmm_destroy, "A", NULL);

static struct cdevsw vmmdevsw = {
	.d_name		= "vmmdev",
	.d_version	= D_VERSION,
	.d_ioctl	= vmmdev_ioctl,
	.d_mmap_single	= vmmdev_mmap_single,
	.d_read		= vmmdev_rw,
	.d_write	= vmmdev_rw,
};

static int
sysctl_vmm_create(SYSCTL_HANDLER_ARGS)
{
	int error;
	struct vm *vm;
	struct cdev *cdev;
	struct vmmdev_softc *sc, *sc2;
	char buf[VM_MAX_NAMELEN];

	strlcpy(buf, "beavis", sizeof(buf));
	error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
	if (error != 0 || req->newptr == NULL)
		return (error);

	mtx_lock(&vmmdev_mtx);
	sc = vmmdev_lookup(buf);
	mtx_unlock(&vmmdev_mtx);
	if (sc != NULL)
		return (EEXIST);

	error = vm_create(buf, &vm);
	if (error != 0)
		return (error);

	sc = malloc(sizeof(struct vmmdev_softc), M_VMMDEV, M_WAITOK | M_ZERO);
	sc->vm = vm;
	SLIST_INIT(&sc->devmem);

	/*
	 * Lookup the name again just in case somebody sneaked in when we
	 * dropped the lock.
	 */
	mtx_lock(&vmmdev_mtx);
	sc2 = vmmdev_lookup(buf);
	if (sc2 == NULL) {
		SLIST_INSERT_HEAD(&head, sc, link);
		sc->flags |= VSC_LINKED;
	}
	mtx_unlock(&vmmdev_mtx);

	if (sc2 != NULL) {
		vmmdev_destroy(sc);
		return (EEXIST);
	}

	error = make_dev_p(MAKEDEV_CHECKNAME, &cdev, &vmmdevsw, NULL,
			   UID_ROOT, GID_WHEEL, 0600, "vmm/%s", buf);
	if (error != 0) {
		vmmdev_destroy(sc);
		return (error);
	}

	mtx_lock(&vmmdev_mtx);
	sc->cdev = cdev;
	sc->cdev->si_drv1 = sc;
	mtx_unlock(&vmmdev_mtx);

	return (0);
}
SYSCTL_PROC(_hw_vmm, OID_AUTO, create, CTLTYPE_STRING | CTLFLAG_RW,
	    NULL, 0, sysctl_vmm_create, "A", NULL);
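
/*
 * VM instances are created and destroyed through the sysctl nodes defined
 * above, for example (vm name is illustrative):
 *
 *	sysctl hw.vmm.create=myvm	# creates /dev/vmm/myvm
 *	sysctl hw.vmm.destroy=myvm	# schedules the cdevs for destruction
 */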

void
vmmdev_init(void)
{
	mtx_init(&vmmdev_mtx, "vmm device mutex", NULL, MTX_DEF);
}

int
vmmdev_cleanup(void)
{
	int error;

	if (SLIST_EMPTY(&head))
		error = 0;
	else
		error = EBUSY;

	return (error);
}

static int
devmem_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t len,
    struct vm_object **objp, int nprot)
{
	struct devmem_softc *dsc;
	vm_ooffset_t first, last;
	size_t seglen;
	int error;
	uint16_t lastcpu;
	bool sysmem;

	dsc = cdev->si_drv1;
	if (dsc == NULL) {
		/* 'cdev' has been created but is not ready for use */
		return (ENXIO);
	}

	first = *offset;
	last = *offset + len;
	if ((nprot & PROT_EXEC) || first < 0 || first >= last)
		return (EINVAL);

	lastcpu = vm_get_maxcpus(dsc->sc->vm) - 1;
	error = vcpu_lock_one(dsc->sc, lastcpu);
	if (error)
		return (error);

	error = vm_get_memseg(dsc->sc->vm, dsc->segid, &seglen, &sysmem, objp);
	KASSERT(error == 0 && !sysmem && *objp != NULL,
	    ("%s: invalid devmem segment %d", __func__, dsc->segid));

	vcpu_unlock_one(dsc->sc, lastcpu);

	if (seglen >= last) {
		vm_object_reference(*objp);
		return (0);
	} else {
		return (EINVAL);
	}
}

static struct cdevsw devmemsw = {
	.d_name		= "devmem",
	.d_version	= D_VERSION,
	.d_mmap_single	= devmem_mmap_single,
};

static int
devmem_create_cdev(const char *vmname, int segid, char *devname)
{
	struct devmem_softc *dsc;
	struct vmmdev_softc *sc;
	struct cdev *cdev;
	int error;

	error = make_dev_p(MAKEDEV_CHECKNAME, &cdev, &devmemsw, NULL,
	    UID_ROOT, GID_WHEEL, 0600, "vmm.io/%s.%s", vmname, devname);
	if (error)
		return (error);

	dsc = malloc(sizeof(struct devmem_softc), M_VMMDEV, M_WAITOK | M_ZERO);

	mtx_lock(&vmmdev_mtx);
	sc = vmmdev_lookup(vmname);
	KASSERT(sc != NULL, ("%s: vm %s softc not found", __func__, vmname));
	if (sc->cdev == NULL) {
		/* virtual machine is being created or destroyed */
		mtx_unlock(&vmmdev_mtx);
		free(dsc, M_VMMDEV);
		destroy_dev_sched_cb(cdev, NULL, 0);
		return (ENODEV);
	}

	dsc->segid = segid;
	dsc->name = devname;
	dsc->cdev = cdev;
	dsc->sc = sc;
	SLIST_INSERT_HEAD(&sc->devmem, dsc, link);
	mtx_unlock(&vmmdev_mtx);

	/* The 'cdev' is ready for use after 'si_drv1' is initialized */
	cdev->si_drv1 = dsc;
	return (0);
}
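
/*
 * Descriptive note: device memory segments allocated with a name via
 * VM_ALLOC_MEMSEG get their own cdev at /dev/vmm.io/<vmname>.<name>,
 * created here, and are mapped through devmem_mmap_single() above rather
 * than through the main /dev/vmm/<vmname> node.
 */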

static void
devmem_destroy(void *arg)
{
	struct devmem_softc *dsc = arg;

	KASSERT(dsc->cdev, ("%s: devmem cdev already destroyed", __func__));
	dsc->cdev = NULL;
	dsc->sc = NULL;
}