/*-
 * Copyright (c) 2011 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD: stable/10/sys/amd64/vmm/vmm.c 262350 2014-02-23 00:46:05Z jhb $
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: stable/10/sys/amd64/vmm/vmm.c 262350 2014-02-23 00:46:05Z jhb $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/sysctl.h>
#include <sys/malloc.h>
#include <sys/pcpu.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/smp.h>

#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_extern.h>
#include <vm/vm_param.h>

#include <machine/cpu.h>
#include <machine/vm.h>
#include <machine/pcb.h>
#include <machine/smp.h>
#include <x86/psl.h>
#include <x86/apicreg.h>
#include <machine/vmparam.h>

#include <machine/vmm.h>
#include <machine/vmm_dev.h>

#include "vmm_ktr.h"
#include "vmm_host.h"
#include "vmm_mem.h"
#include "vmm_util.h"
#include "vhpet.h"
#include "vioapic.h"
#include "vlapic.h"
#include "vmm_msr.h"
#include "vmm_ipi.h"
#include "vmm_stat.h"
#include "vmm_lapic.h"

#include "io/ppt.h"
#include "io/iommu.h"

struct vlapic;

struct vcpu {
	int		flags;
	enum vcpu_state	state;
	struct mtx	mtx;
	int		hostcpu;	/* host cpuid this vcpu last ran on */
	uint64_t	guest_msrs[VMM_MSR_NUM];
	struct vlapic	*vlapic;
	int		 vcpuid;
	struct savefpu	*guestfpu;	/* guest fpu state */
	void		*stats;
	struct vm_exit	exitinfo;
	enum x2apic_state x2apic_state;
	int		nmi_pending;
};

#define	vcpu_lock_init(v)	mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN)
#define	vcpu_lock(v)		mtx_lock_spin(&((v)->mtx))
#define	vcpu_unlock(v)		mtx_unlock_spin(&((v)->mtx))
#define	vcpu_assert_locked(v)	mtx_assert(&((v)->mtx), MA_OWNED)

struct mem_seg {
	vm_paddr_t	gpa;
	size_t		len;
	boolean_t	wired;
	vm_object_t	object;
};
#define	VM_MAX_MEMORY_SEGMENTS	2

struct vm {
	void		*cookie;	/* processor-specific data */
	void		*iommu;		/* iommu-specific data */
	struct vhpet	*vhpet;		/* virtual HPET */
	struct vioapic	*vioapic;	/* virtual ioapic */
	struct vmspace	*vmspace;	/* guest's address space */
	struct vcpu	vcpu[VM_MAXCPU];
	int		num_mem_segs;
	struct mem_seg	mem_segs[VM_MAX_MEMORY_SEGMENTS];
	char		name[VM_MAX_NAMELEN];

	/*
	 * Set of active vcpus.
	 * An active vcpu is one that has been started implicitly (BSP) or
	 * explicitly (AP) by sending it a startup ipi.
	 */
	cpuset_t	active_cpus;
};

static int vmm_initialized;

static struct vmm_ops *ops;
#define	VMM_INIT()	(ops != NULL ? (*ops->init)() : 0)
#define	VMM_CLEANUP()	(ops != NULL ? (*ops->cleanup)() : 0)
#define	VMM_RESUME()	(ops != NULL ? (*ops->resume)() : 0)

#define	VMINIT(vm, pmap) (ops != NULL ? (*ops->vminit)(vm, pmap) : NULL)
#define	VMRUN(vmi, vcpu, rip, pmap) \
	(ops != NULL ? (*ops->vmrun)(vmi, vcpu, rip, pmap) : ENXIO)
#define	VMCLEANUP(vmi)	(ops != NULL ? (*ops->vmcleanup)(vmi) : NULL)
#define	VMSPACE_ALLOC(min, max) \
	(ops != NULL ? (*ops->vmspace_alloc)(min, max) : NULL)
#define	VMSPACE_FREE(vmspace) \
	(ops != NULL ? (*ops->vmspace_free)(vmspace) : ENXIO)
#define	VMGETREG(vmi, vcpu, num, retval)		\
	(ops != NULL ? (*ops->vmgetreg)(vmi, vcpu, num, retval) : ENXIO)
#define	VMSETREG(vmi, vcpu, num, val)		\
	(ops != NULL ? (*ops->vmsetreg)(vmi, vcpu, num, val) : ENXIO)
#define	VMGETDESC(vmi, vcpu, num, desc)		\
	(ops != NULL ? (*ops->vmgetdesc)(vmi, vcpu, num, desc) : ENXIO)
#define	VMSETDESC(vmi, vcpu, num, desc)		\
	(ops != NULL ? (*ops->vmsetdesc)(vmi, vcpu, num, desc) : ENXIO)
#define	VMINJECT(vmi, vcpu, type, vec, ec, ecv)	\
	(ops != NULL ? (*ops->vminject)(vmi, vcpu, type, vec, ec, ecv) : ENXIO)
#define	VMGETCAP(vmi, vcpu, num, retval)	\
	(ops != NULL ? (*ops->vmgetcap)(vmi, vcpu, num, retval) : ENXIO)
#define	VMSETCAP(vmi, vcpu, num, val)		\
	(ops != NULL ? (*ops->vmsetcap)(vmi, vcpu, num, val) : ENXIO)

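/*
 * The guest FPU state is loaded lazily.  fpu_start_emulating() sets CR0.TS
 * so that the next FPU/SSE access traps with a #NM exception, which catches
 * any stray host use of the FPU while the guest's state is loaded.
 * fpu_stop_emulating() clears CR0.TS (clts) to allow direct FPU access.
 * See restore_guest_fpustate() and save_guest_fpustate() below.
 */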
#define	fpu_start_emulating()	load_cr0(rcr0() | CR0_TS)
#define	fpu_stop_emulating()	clts()

static MALLOC_DEFINE(M_VM, "vm", "vm");
CTASSERT(VMM_MSR_NUM <= 64);	/* msr_mask can keep track of up to 64 msrs */

/* statistics */
static VMM_STAT(VCPU_TOTAL_RUNTIME, "vcpu total runtime");

static void
vcpu_cleanup(struct vcpu *vcpu)
{
	vlapic_cleanup(vcpu->vlapic);
	vmm_stat_free(vcpu->stats);
	fpu_save_area_free(vcpu->guestfpu);
}

static void
vcpu_init(struct vm *vm, uint32_t vcpu_id)
{
	struct vcpu *vcpu;

	vcpu = &vm->vcpu[vcpu_id];

	vcpu_lock_init(vcpu);
	vcpu->hostcpu = NOCPU;
	vcpu->vcpuid = vcpu_id;
	vcpu->vlapic = vlapic_init(vm, vcpu_id);
	vm_set_x2apic_state(vm, vcpu_id, X2APIC_ENABLED);
	vcpu->guestfpu = fpu_save_area_alloc();
	fpu_save_area_reset(vcpu->guestfpu);
	vcpu->stats = vmm_stat_alloc();
}

struct vm_exit *
vm_exitinfo(struct vm *vm, int cpuid)
{
	struct vcpu *vcpu;

	if (cpuid < 0 || cpuid >= VM_MAXCPU)
		panic("vm_exitinfo: invalid cpuid %d", cpuid);

	vcpu = &vm->vcpu[cpuid];

	return (&vcpu->exitinfo);
}

static void
vmm_resume(void)
{
	VMM_RESUME();
}

static int
vmm_init(void)
{
	int error;

	vmm_host_state_init();
	vmm_ipi_init();

	error = vmm_mem_init();
	if (error)
		return (error);

	if (vmm_is_intel())
		ops = &vmm_ops_intel;
	else if (vmm_is_amd())
		ops = &vmm_ops_amd;
	else
		return (ENXIO);

	vmm_msr_init();
	vmm_resume_p = vmm_resume;

	return (VMM_INIT());
}

static int
vmm_handler(module_t mod, int what, void *arg)
{
	int error;

	switch (what) {
	case MOD_LOAD:
		vmmdev_init();
		iommu_init();
		error = vmm_init();
		if (error == 0)
			vmm_initialized = 1;
		break;
	case MOD_UNLOAD:
		error = vmmdev_cleanup();
		if (error == 0) {
			vmm_resume_p = NULL;
			iommu_cleanup();
			vmm_ipi_cleanup();
			error = VMM_CLEANUP();
			/*
			 * Something bad happened - prevent new
			 * VMs from being created
			 */
			if (error)
				vmm_initialized = 0;
		}
		break;
	default:
		error = 0;
		break;
	}
	return (error);
}

static moduledata_t vmm_kmod = {
	"vmm",
	vmm_handler,
	NULL
};

/*
 * vmm initialization has the following dependencies:
 *
 * - iommu initialization must happen after the pci passthru driver has had
 *   a chance to attach to any passthru devices (after SI_SUB_CONFIGURE).
 *
 * - VT-x initialization requires smp_rendezvous() and therefore must happen
 *   after SMP is fully functional (after SI_SUB_SMP).
 */
DECLARE_MODULE(vmm, vmm_kmod, SI_SUB_SMP + 1, SI_ORDER_ANY);
MODULE_VERSION(vmm, 1);

SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW, NULL, NULL);

int
vm_create(const char *name, struct vm **retvm)
{
	int i;
	struct vm *vm;
	struct vmspace *vmspace;

	const int BSP = 0;

	/*
	 * If vmm.ko could not be successfully initialized then don't attempt
	 * to create the virtual machine.
	 */
	if (!vmm_initialized)
		return (ENXIO);

	if (name == NULL || strlen(name) >= VM_MAX_NAMELEN)
		return (EINVAL);

	vmspace = VMSPACE_ALLOC(VM_MIN_ADDRESS, VM_MAXUSER_ADDRESS);
	if (vmspace == NULL)
		return (ENOMEM);

	vm = malloc(sizeof(struct vm), M_VM, M_WAITOK | M_ZERO);
	strcpy(vm->name, name);
	vm->cookie = VMINIT(vm, vmspace_pmap(vmspace));
	vm->vioapic = vioapic_init(vm);
	vm->vhpet = vhpet_init(vm);

	for (i = 0; i < VM_MAXCPU; i++) {
		vcpu_init(vm, i);
		guest_msrs_init(vm, i);
	}

	vm_activate_cpu(vm, BSP);
	vm->vmspace = vmspace;

	*retvm = vm;
	return (0);
}
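
/*
 * Rough illustration of how the entry points in this file fit together.
 * This is only a sketch of the in-kernel call sequence (in practice these
 * functions are driven from userland through the vmm device ioctls); error
 * handling is omitted and the entry point value is hypothetical.
 *
 *	struct vm *vm;
 *	struct vm_run vmrun;
 *
 *	vm_create("testvm", &vm);
 *	vm_malloc(vm, 0, 256 * 1024 * 1024);	// page-aligned gpa/len
 *
 *	vmrun.cpuid = 0;			// the BSP is already active
 *	vmrun.rip = 0xfff0;			// hypothetical guest entry point
 *	vm_run(vm, &vmrun);			// returns with vmrun.vm_exit filled in
 *
 *	vm_destroy(vm);
 */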

static void
vm_free_mem_seg(struct vm *vm, struct mem_seg *seg)
{

	if (seg->object != NULL)
		vmm_mem_free(vm->vmspace, seg->gpa, seg->len);

	bzero(seg, sizeof(*seg));
}

void
vm_destroy(struct vm *vm)
{
	int i;

	ppt_unassign_all(vm);

	if (vm->iommu != NULL)
		iommu_destroy_domain(vm->iommu);

	vhpet_cleanup(vm->vhpet);
	vioapic_cleanup(vm->vioapic);

	for (i = 0; i < vm->num_mem_segs; i++)
		vm_free_mem_seg(vm, &vm->mem_segs[i]);

	vm->num_mem_segs = 0;

	for (i = 0; i < VM_MAXCPU; i++)
		vcpu_cleanup(&vm->vcpu[i]);

	VMSPACE_FREE(vm->vmspace);

	VMCLEANUP(vm->cookie);

	free(vm, M_VM);
}

const char *
vm_name(struct vm *vm)
{
	return (vm->name);
}

int
vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
{
	vm_object_t obj;

	if ((obj = vmm_mmio_alloc(vm->vmspace, gpa, len, hpa)) == NULL)
		return (ENOMEM);
	else
		return (0);
}

int
vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len)
{

	vmm_mmio_free(vm->vmspace, gpa, len);
	return (0);
}

boolean_t
vm_mem_allocated(struct vm *vm, vm_paddr_t gpa)
{
	int i;
	vm_paddr_t gpabase, gpalimit;

	for (i = 0; i < vm->num_mem_segs; i++) {
		gpabase = vm->mem_segs[i].gpa;
		gpalimit = gpabase + vm->mem_segs[i].len;
		if (gpa >= gpabase && gpa < gpalimit)
			return (TRUE);		/* 'gpa' is regular memory */
	}

	if (ppt_is_mmio(vm, gpa))
		return (TRUE);			/* 'gpa' is pci passthru mmio */

	return (FALSE);
}

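/*
 * Allocate 'len' bytes of guest physical memory starting at 'gpa' and back
 * it with a vm_object in the guest's vmspace.  Both 'gpa' and 'len' must be
 * page-aligned, the range must not partially overlap an existing segment,
 * and at most VM_MAX_MEMORY_SEGMENTS segments may be created per VM.
 */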
int
vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len)
{
	int available, allocated;
	struct mem_seg *seg;
	vm_object_t object;
	vm_paddr_t g;

	if ((gpa & PAGE_MASK) || (len & PAGE_MASK) || len == 0)
		return (EINVAL);

	available = allocated = 0;
	g = gpa;
	while (g < gpa + len) {
		if (vm_mem_allocated(vm, g))
			allocated++;
		else
			available++;

		g += PAGE_SIZE;
	}

	/*
	 * If there are some allocated and some available pages in the address
	 * range then it is an error.
	 */
	if (allocated && available)
		return (EINVAL);

	/*
	 * If the entire address range being requested has already been
	 * allocated then there isn't anything more to do.
	 */
	if (allocated && available == 0)
		return (0);

	if (vm->num_mem_segs >= VM_MAX_MEMORY_SEGMENTS)
		return (E2BIG);

	seg = &vm->mem_segs[vm->num_mem_segs];

	if ((object = vmm_mem_alloc(vm->vmspace, gpa, len)) == NULL)
		return (ENOMEM);

	seg->gpa = gpa;
	seg->len = len;
	seg->object = object;
	seg->wired = FALSE;

	vm->num_mem_segs++;

	return (0);
}

static void
vm_gpa_unwire(struct vm *vm)
{
	int i, rv;
	struct mem_seg *seg;

	for (i = 0; i < vm->num_mem_segs; i++) {
		seg = &vm->mem_segs[i];
		if (!seg->wired)
			continue;

		rv = vm_map_unwire(&vm->vmspace->vm_map,
				   seg->gpa, seg->gpa + seg->len,
				   VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
		KASSERT(rv == KERN_SUCCESS, ("vm(%s) memory segment "
		    "%#lx/%ld could not be unwired: %d",
		    vm_name(vm), seg->gpa, seg->len, rv));

		seg->wired = FALSE;
	}
}

static int
vm_gpa_wire(struct vm *vm)
{
	int i, rv;
	struct mem_seg *seg;

	for (i = 0; i < vm->num_mem_segs; i++) {
		seg = &vm->mem_segs[i];
		if (seg->wired)
			continue;

		/* XXX rlimits? */
		rv = vm_map_wire(&vm->vmspace->vm_map,
				 seg->gpa, seg->gpa + seg->len,
				 VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
		if (rv != KERN_SUCCESS)
			break;

		seg->wired = TRUE;
	}

	if (i < vm->num_mem_segs) {
		/*
		 * Undo the wiring before returning an error.
		 */
		vm_gpa_unwire(vm);
		return (EAGAIN);
	}

	return (0);
}

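/*
 * Establish or tear down the IOMMU mappings for the guest's wired memory
 * segments.  When mapping, each gpa->hpa translation is added to the VM's
 * IOMMU domain and the corresponding hpa identity mapping is removed from
 * the host domain; unmapping reverses this.  The IOTLB of the domain that
 * lost mappings is invalidated at the end.
 */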
static void
vm_iommu_modify(struct vm *vm, boolean_t map)
{
	int i, sz;
	vm_paddr_t gpa, hpa;
	struct mem_seg *seg;
	void *vp, *cookie, *host_domain;

	sz = PAGE_SIZE;
	host_domain = iommu_host_domain();

	for (i = 0; i < vm->num_mem_segs; i++) {
		seg = &vm->mem_segs[i];
		KASSERT(seg->wired, ("vm(%s) memory segment %#lx/%ld not wired",
		    vm_name(vm), seg->gpa, seg->len));

		gpa = seg->gpa;
		while (gpa < seg->gpa + seg->len) {
			vp = vm_gpa_hold(vm, gpa, PAGE_SIZE, VM_PROT_WRITE,
					 &cookie);
			KASSERT(vp != NULL, ("vm(%s) could not map gpa %#lx",
			    vm_name(vm), gpa));

			vm_gpa_release(cookie);

			hpa = DMAP_TO_PHYS((uintptr_t)vp);
			if (map) {
				iommu_create_mapping(vm->iommu, gpa, hpa, sz);
				iommu_remove_mapping(host_domain, hpa, sz);
			} else {
				iommu_remove_mapping(vm->iommu, gpa, sz);
				iommu_create_mapping(host_domain, hpa, hpa, sz);
			}

			gpa += PAGE_SIZE;
		}
	}

	/*
	 * Invalidate the cached translations associated with the domain
	 * from which pages were removed.
	 */
	if (map)
		iommu_invalidate_tlb(host_domain);
	else
		iommu_invalidate_tlb(vm->iommu);
}

#define	vm_iommu_unmap(vm)	vm_iommu_modify((vm), FALSE)
#define	vm_iommu_map(vm)	vm_iommu_modify((vm), TRUE)

int
vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func)
{
	int error;

	error = ppt_unassign_device(vm, bus, slot, func);
	if (error)
		return (error);

	if (ppt_num_devices(vm) == 0) {
		vm_iommu_unmap(vm);
		vm_gpa_unwire(vm);
	}
	return (0);
}

int
vm_assign_pptdev(struct vm *vm, int bus, int slot, int func)
{
	int error;
	vm_paddr_t maxaddr;

	/*
	 * Virtual machines with pci passthru devices get special treatment:
	 * - the guest physical memory is wired
	 * - the iommu is programmed to do the 'gpa' to 'hpa' translation
	 *
	 * We need to do this before the first pci passthru device is attached.
	 */
	if (ppt_num_devices(vm) == 0) {
		KASSERT(vm->iommu == NULL,
		    ("vm_assign_pptdev: iommu must be NULL"));
		maxaddr = vmm_mem_maxaddr();
		vm->iommu = iommu_create_domain(maxaddr);

		error = vm_gpa_wire(vm);
		if (error)
			return (error);

		vm_iommu_map(vm);
	}

	error = ppt_assign_device(vm, bus, slot, func);
	return (error);
}

void *
vm_gpa_hold(struct vm *vm, vm_paddr_t gpa, size_t len, int reqprot,
	    void **cookie)
{
	int count, pageoff;
	vm_page_t m;

	pageoff = gpa & PAGE_MASK;
	if (len > PAGE_SIZE - pageoff)
		panic("vm_gpa_hold: invalid gpa/len: 0x%016lx/%lu", gpa, len);

	count = vm_fault_quick_hold_pages(&vm->vmspace->vm_map,
	    trunc_page(gpa), PAGE_SIZE, reqprot, &m, 1);

	if (count == 1) {
		*cookie = m;
		return ((void *)(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)) + pageoff));
	} else {
		*cookie = NULL;
		return (NULL);
	}
}

void
vm_gpa_release(void *cookie)
{
	vm_page_t m = cookie;

	vm_page_lock(m);
	vm_page_unhold(m);
	vm_page_unlock(m);
}
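
/*
 * Typical vm_gpa_hold()/vm_gpa_release() usage (sketch only): the returned
 * pointer is a host direct-map address that remains valid until the cookie
 * is released, and a single hold never spans a page boundary.
 *
 *	void *cookie, *hva;
 *
 *	hva = vm_gpa_hold(vm, gpa, sizeof(uint32_t), VM_PROT_READ, &cookie);
 *	if (hva != NULL) {
 *		uint32_t val = *(uint32_t *)hva;	// hypothetical access
 *		vm_gpa_release(cookie);
 *	}
 */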

int
vm_gpabase2memseg(struct vm *vm, vm_paddr_t gpabase,
		  struct vm_memory_segment *seg)
{
	int i;

	for (i = 0; i < vm->num_mem_segs; i++) {
		if (gpabase == vm->mem_segs[i].gpa) {
			seg->gpa = vm->mem_segs[i].gpa;
			seg->len = vm->mem_segs[i].len;
			seg->wired = vm->mem_segs[i].wired;
			return (0);
		}
	}
	return (-1);
}

int
vm_get_memobj(struct vm *vm, vm_paddr_t gpa, size_t len,
	      vm_offset_t *offset, struct vm_object **object)
{
	int i;
	size_t seg_len;
	vm_paddr_t seg_gpa;
	vm_object_t seg_obj;

	for (i = 0; i < vm->num_mem_segs; i++) {
		if ((seg_obj = vm->mem_segs[i].object) == NULL)
			continue;

		seg_gpa = vm->mem_segs[i].gpa;
		seg_len = vm->mem_segs[i].len;

		if (gpa >= seg_gpa && gpa < seg_gpa + seg_len) {
			*offset = gpa - seg_gpa;
			*object = seg_obj;
			vm_object_reference(seg_obj);
			return (0);
		}
	}

	return (EINVAL);
}

int
vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval)
{

	if (vcpu < 0 || vcpu >= VM_MAXCPU)
		return (EINVAL);

	if (reg >= VM_REG_LAST)
		return (EINVAL);

	return (VMGETREG(vm->cookie, vcpu, reg, retval));
}

int
vm_set_register(struct vm *vm, int vcpu, int reg, uint64_t val)
{

	if (vcpu < 0 || vcpu >= VM_MAXCPU)
		return (EINVAL);

	if (reg >= VM_REG_LAST)
		return (EINVAL);

	return (VMSETREG(vm->cookie, vcpu, reg, val));
}

static boolean_t
is_descriptor_table(int reg)
{

	switch (reg) {
	case VM_REG_GUEST_IDTR:
	case VM_REG_GUEST_GDTR:
		return (TRUE);
	default:
		return (FALSE);
	}
}

static boolean_t
is_segment_register(int reg)
{

	switch (reg) {
	case VM_REG_GUEST_ES:
	case VM_REG_GUEST_CS:
	case VM_REG_GUEST_SS:
	case VM_REG_GUEST_DS:
	case VM_REG_GUEST_FS:
	case VM_REG_GUEST_GS:
	case VM_REG_GUEST_TR:
	case VM_REG_GUEST_LDTR:
		return (TRUE);
	default:
		return (FALSE);
	}
}

int
vm_get_seg_desc(struct vm *vm, int vcpu, int reg,
		struct seg_desc *desc)
{

	if (vcpu < 0 || vcpu >= VM_MAXCPU)
		return (EINVAL);

	if (!is_segment_register(reg) && !is_descriptor_table(reg))
		return (EINVAL);

	return (VMGETDESC(vm->cookie, vcpu, reg, desc));
}

int
vm_set_seg_desc(struct vm *vm, int vcpu, int reg,
		struct seg_desc *desc)
{
	if (vcpu < 0 || vcpu >= VM_MAXCPU)
		return (EINVAL);

	if (!is_segment_register(reg) && !is_descriptor_table(reg))
		return (EINVAL);

	return (VMSETDESC(vm->cookie, vcpu, reg, desc));
}

static void
restore_guest_fpustate(struct vcpu *vcpu)
{

	/* flush host state to the pcb */
	fpuexit(curthread);

	/* restore guest FPU state */
	fpu_stop_emulating();
	fpurestore(vcpu->guestfpu);

	/*
	 * The FPU is now "dirty" with the guest's state so turn on emulation
	 * to trap any access to the FPU by the host.
	 */
	fpu_start_emulating();
}

static void
save_guest_fpustate(struct vcpu *vcpu)
{

	if ((rcr0() & CR0_TS) == 0)
		panic("fpu emulation not enabled in host!");

	/* save guest FPU state */
	fpu_stop_emulating();
	fpusave(vcpu->guestfpu);
	fpu_start_emulating();
}

static VMM_STAT(VCPU_IDLE_TICKS, "number of ticks vcpu was idle");

static int
vcpu_set_state_locked(struct vcpu *vcpu, enum vcpu_state newstate)
{
	int error;

	vcpu_assert_locked(vcpu);

	/*
	 * The following state transitions are allowed:
	 * IDLE -> FROZEN -> IDLE
	 * FROZEN -> RUNNING -> FROZEN
	 * FROZEN -> SLEEPING -> FROZEN
	 */
	switch (vcpu->state) {
	case VCPU_IDLE:
	case VCPU_RUNNING:
	case VCPU_SLEEPING:
		error = (newstate != VCPU_FROZEN);
		break;
	case VCPU_FROZEN:
		error = (newstate == VCPU_FROZEN);
		break;
	default:
		error = 1;
		break;
	}

	if (error == 0)
		vcpu->state = newstate;
	else
		error = EBUSY;

	return (error);
}

static void
vcpu_require_state(struct vm *vm, int vcpuid, enum vcpu_state newstate)
{
	int error;

	if ((error = vcpu_set_state(vm, vcpuid, newstate)) != 0)
		panic("Error %d setting state to %d\n", error, newstate);
}

static void
vcpu_require_state_locked(struct vcpu *vcpu, enum vcpu_state newstate)
{
	int error;

	if ((error = vcpu_set_state_locked(vcpu, newstate)) != 0)
		panic("Error %d setting state to %d", error, newstate);
}

/*
 * Emulate a guest 'hlt' by sleeping until the vcpu is ready to run.
 */
static int
vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled, bool *retu)
{
	struct vm_exit *vmexit;
	struct vcpu *vcpu;
	int t, timo;

	vcpu = &vm->vcpu[vcpuid];

	vcpu_lock(vcpu);

	/*
	 * Do a final check for pending NMI or interrupts before
	 * really putting this thread to sleep.
	 *
	 * These interrupts could have happened any time after we
	 * returned from VMRUN() and before we grabbed the vcpu lock.
	 */
	if (!vm_nmi_pending(vm, vcpuid) &&
	    (intr_disabled || vlapic_pending_intr(vcpu->vlapic) < 0)) {
		t = ticks;
		vcpu_require_state_locked(vcpu, VCPU_SLEEPING);
		if (vlapic_enabled(vcpu->vlapic)) {
			/*
			 * XXX msleep_spin() is not interruptible so use the
			 * 'timo' to put an upper bound on the sleep time.
			 */
			timo = hz;
			msleep_spin(vcpu, &vcpu->mtx, "vmidle", timo);
		} else {
			/*
			 * Spindown the vcpu if the apic is disabled and it
			 * had entered the halted state.
			 */
			*retu = true;
			vmexit = vm_exitinfo(vm, vcpuid);
			vmexit->exitcode = VM_EXITCODE_SPINDOWN_CPU;
			VCPU_CTR0(vm, vcpuid, "spinning down cpu");
		}
		vcpu_require_state_locked(vcpu, VCPU_FROZEN);
		vmm_stat_incr(vm, vcpuid, VCPU_IDLE_TICKS, ticks - t);
	}
	vcpu_unlock(vcpu);

	return (0);
}

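/*
 * Handle a nested page fault exit.  Accessed/dirty bit updates are emulated
 * directly in the nested pmap when possible; otherwise the faulting page is
 * brought in via vm_fault() on the guest's vmspace.  Execution restarts at
 * the faulting instruction.
 */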
static int
vm_handle_paging(struct vm *vm, int vcpuid, bool *retu)
{
	int rv, ftype;
	struct vm_map *map;
	struct vcpu *vcpu;
	struct vm_exit *vme;

	vcpu = &vm->vcpu[vcpuid];
	vme = &vcpu->exitinfo;

	ftype = vme->u.paging.fault_type;
	KASSERT(ftype == VM_PROT_READ ||
	    ftype == VM_PROT_WRITE || ftype == VM_PROT_EXECUTE,
	    ("vm_handle_paging: invalid fault_type %d", ftype));

	if (ftype == VM_PROT_READ || ftype == VM_PROT_WRITE) {
		rv = pmap_emulate_accessed_dirty(vmspace_pmap(vm->vmspace),
		    vme->u.paging.gpa, ftype);
		if (rv == 0)
			goto done;
	}

	map = &vm->vmspace->vm_map;
	rv = vm_fault(map, vme->u.paging.gpa, ftype, VM_FAULT_NORMAL);

	VCPU_CTR3(vm, vcpuid, "vm_handle_paging rv = %d, gpa = %#lx, "
	    "ftype = %d", rv, vme->u.paging.gpa, ftype);

	if (rv != KERN_SUCCESS)
		return (EFAULT);
done:
	/* restart execution at the faulting instruction */
	vme->inst_length = 0;

	return (0);
}

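/*
 * Handle an instruction-emulation exit: fetch and decode the instruction
 * that faulted on an emulated MMIO range and dispatch it to the in-kernel
 * device model (local APIC, I/O APIC or HPET).  Accesses outside those
 * ranges are bounced to userland by setting 'retu'.
 */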
static int
vm_handle_inst_emul(struct vm *vm, int vcpuid, bool *retu)
{
	struct vie *vie;
	struct vcpu *vcpu;
	struct vm_exit *vme;
	int error, inst_length;
	uint64_t rip, gla, gpa, cr3;
	mem_region_read_t mread;
	mem_region_write_t mwrite;

	vcpu = &vm->vcpu[vcpuid];
	vme = &vcpu->exitinfo;

	rip = vme->rip;
	inst_length = vme->inst_length;

	gla = vme->u.inst_emul.gla;
	gpa = vme->u.inst_emul.gpa;
	cr3 = vme->u.inst_emul.cr3;
	vie = &vme->u.inst_emul.vie;

	vie_init(vie);

	/* Fetch, decode and emulate the faulting instruction */
	if (vmm_fetch_instruction(vm, vcpuid, rip, inst_length, cr3, vie) != 0)
		return (EFAULT);

	if (vmm_decode_instruction(vm, vcpuid, gla, vie) != 0)
		return (EFAULT);

	/* return to userland unless this is an in-kernel emulated device */
	if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) {
		mread = lapic_mmio_read;
		mwrite = lapic_mmio_write;
	} else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) {
		mread = vioapic_mmio_read;
		mwrite = vioapic_mmio_write;
	} else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) {
		mread = vhpet_mmio_read;
		mwrite = vhpet_mmio_write;
	} else {
		*retu = true;
		return (0);
	}

	error = vmm_emulate_instruction(vm, vcpuid, gpa, vie, mread, mwrite,
	    retu);

	return (error);
}

int
vm_run(struct vm *vm, struct vm_run *vmrun)
{
	int error, vcpuid;
	struct vcpu *vcpu;
	struct pcb *pcb;
	uint64_t tscval, rip;
	struct vm_exit *vme;
	bool retu, intr_disabled;
	pmap_t pmap;

	vcpuid = vmrun->cpuid;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		return (EINVAL);

	pmap = vmspace_pmap(vm->vmspace);
	vcpu = &vm->vcpu[vcpuid];
	vme = &vcpu->exitinfo;
	rip = vmrun->rip;
restart:
	critical_enter();

	KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active),
	    ("vm_run: absurd pm_active"));

	tscval = rdtsc();

	pcb = PCPU_GET(curpcb);
	set_pcb_flags(pcb, PCB_FULL_IRET);

	restore_guest_msrs(vm, vcpuid);
	restore_guest_fpustate(vcpu);

	vcpu_require_state(vm, vcpuid, VCPU_RUNNING);
	vcpu->hostcpu = curcpu;
	error = VMRUN(vm->cookie, vcpuid, rip, pmap);
	vcpu->hostcpu = NOCPU;
	vcpu_require_state(vm, vcpuid, VCPU_FROZEN);

	save_guest_fpustate(vcpu);
	restore_host_msrs(vm, vcpuid);

	vmm_stat_incr(vm, vcpuid, VCPU_TOTAL_RUNTIME, rdtsc() - tscval);

	critical_exit();

	if (error == 0) {
		retu = false;
		switch (vme->exitcode) {
		case VM_EXITCODE_HLT:
			intr_disabled = ((vme->u.hlt.rflags & PSL_I) == 0);
			error = vm_handle_hlt(vm, vcpuid, intr_disabled, &retu);
			break;
		case VM_EXITCODE_PAGING:
			error = vm_handle_paging(vm, vcpuid, &retu);
			break;
		case VM_EXITCODE_INST_EMUL:
			error = vm_handle_inst_emul(vm, vcpuid, &retu);
			break;
		default:
			retu = true;	/* handled in userland */
			break;
		}
	}

	if (error == 0 && retu == false) {
		rip = vme->rip + vme->inst_length;
		goto restart;
	}

	/* copy the exit information */
	bcopy(vme, &vmrun->vm_exit, sizeof(struct vm_exit));
	return (error);
}

int
vm_inject_event(struct vm *vm, int vcpuid, int type,
		int vector, uint32_t code, int code_valid)
{
	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		return (EINVAL);

	if ((type > VM_EVENT_NONE && type < VM_EVENT_MAX) == 0)
		return (EINVAL);

	if (vector < 0 || vector > 255)
		return (EINVAL);

	return (VMINJECT(vm->cookie, vcpuid, type, vector, code, code_valid));
}

static VMM_STAT(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu");

int
vm_inject_nmi(struct vm *vm, int vcpuid)
{
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		return (EINVAL);

	vcpu = &vm->vcpu[vcpuid];

	vcpu->nmi_pending = 1;
	vcpu_notify_event(vm, vcpuid);
	return (0);
}

int
vm_nmi_pending(struct vm *vm, int vcpuid)
{
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		panic("vm_nmi_pending: invalid vcpuid %d", vcpuid);

	vcpu = &vm->vcpu[vcpuid];

	return (vcpu->nmi_pending);
}

void
vm_nmi_clear(struct vm *vm, int vcpuid)
{
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		panic("vm_nmi_clear: invalid vcpuid %d", vcpuid);

	vcpu = &vm->vcpu[vcpuid];

	if (vcpu->nmi_pending == 0)
		panic("vm_nmi_clear: inconsistent nmi_pending state");

	vcpu->nmi_pending = 0;
	vmm_stat_incr(vm, vcpuid, VCPU_NMI_COUNT, 1);
}

int
vm_get_capability(struct vm *vm, int vcpu, int type, int *retval)
{
	if (vcpu < 0 || vcpu >= VM_MAXCPU)
		return (EINVAL);

	if (type < 0 || type >= VM_CAP_MAX)
		return (EINVAL);

	return (VMGETCAP(vm->cookie, vcpu, type, retval));
}

int
vm_set_capability(struct vm *vm, int vcpu, int type, int val)
{
	if (vcpu < 0 || vcpu >= VM_MAXCPU)
		return (EINVAL);

	if (type < 0 || type >= VM_CAP_MAX)
		return (EINVAL);

	return (VMSETCAP(vm->cookie, vcpu, type, val));
}

uint64_t *
vm_guest_msrs(struct vm *vm, int cpu)
{
	return (vm->vcpu[cpu].guest_msrs);
}

struct vlapic *
vm_lapic(struct vm *vm, int cpu)
{
	return (vm->vcpu[cpu].vlapic);
}

struct vioapic *
vm_ioapic(struct vm *vm)
{

	return (vm->vioapic);
}

struct vhpet *
vm_hpet(struct vm *vm)
{

	return (vm->vhpet);
}

boolean_t
vmm_is_pptdev(int bus, int slot, int func)
{
	int found, i, n;
	int b, s, f;
	char *val, *cp, *cp2;

	/*
	 * XXX
	 * The length of an environment variable is limited to 128 bytes which
	 * puts an upper limit on the number of passthru devices that may be
	 * specified using a single environment variable.
	 *
	 * Work around this by scanning multiple environment variable
	 * names instead of a single one - yuck!
	 */
	const char *names[] = { "pptdevs", "pptdevs2", "pptdevs3", NULL };

	/* set pptdevs="1/2/3 4/5/6 7/8/9 10/11/12" */
	found = 0;
	for (i = 0; names[i] != NULL && !found; i++) {
		cp = val = getenv(names[i]);
		while (cp != NULL && *cp != '\0') {
			if ((cp2 = strchr(cp, ' ')) != NULL)
				*cp2 = '\0';

			n = sscanf(cp, "%d/%d/%d", &b, &s, &f);
			if (n == 3 && bus == b && slot == s && func == f) {
				found = 1;
				break;
			}

			if (cp2 != NULL)
				*cp2++ = ' ';

			cp = cp2;
		}
		freeenv(val);
	}
	return (found);
}

void *
vm_iommu_domain(struct vm *vm)
{

	return (vm->iommu);
}

int
vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state newstate)
{
	int error;
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		panic("vcpu_set_state: invalid vcpuid %d", vcpuid);

	vcpu = &vm->vcpu[vcpuid];

	vcpu_lock(vcpu);
	error = vcpu_set_state_locked(vcpu, newstate);
	vcpu_unlock(vcpu);

	return (error);
}

enum vcpu_state
vcpu_get_state(struct vm *vm, int vcpuid, int *hostcpu)
{
	struct vcpu *vcpu;
	enum vcpu_state state;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		panic("vcpu_get_state: invalid vcpuid %d", vcpuid);

	vcpu = &vm->vcpu[vcpuid];

	vcpu_lock(vcpu);
	state = vcpu->state;
	if (hostcpu != NULL)
		*hostcpu = vcpu->hostcpu;
	vcpu_unlock(vcpu);

	return (state);
}

void
vm_activate_cpu(struct vm *vm, int vcpuid)
{

	if (vcpuid >= 0 && vcpuid < VM_MAXCPU)
		CPU_SET(vcpuid, &vm->active_cpus);
}

cpuset_t
vm_active_cpus(struct vm *vm)
{

	return (vm->active_cpus);
}

void *
vcpu_stats(struct vm *vm, int vcpuid)
{

	return (vm->vcpu[vcpuid].stats);
}

int
vm_get_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state *state)
{
	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		return (EINVAL);

	*state = vm->vcpu[vcpuid].x2apic_state;

	return (0);
}

int
vm_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state)
{
	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		return (EINVAL);

	if (state >= X2APIC_STATE_LAST)
		return (EINVAL);

	vm->vcpu[vcpuid].x2apic_state = state;

	vlapic_set_x2apic_state(vm, vcpuid, state);

	return (0);
}

/*
 * This function is called to ensure that a vcpu "sees" a pending event
 * as soon as possible:
 * - If the vcpu thread is sleeping then it is woken up.
 * - If the vcpu is running on a different host_cpu then an IPI will be directed
 *   to the host_cpu to cause the vcpu to trap into the hypervisor.
 */
void
vcpu_notify_event(struct vm *vm, int vcpuid)
{
	int hostcpu;
	struct vcpu *vcpu;

	vcpu = &vm->vcpu[vcpuid];

	vcpu_lock(vcpu);
	hostcpu = vcpu->hostcpu;
	if (hostcpu == NOCPU) {
		if (vcpu->state == VCPU_SLEEPING)
			wakeup_one(vcpu);
	} else {
		if (vcpu->state != VCPU_RUNNING)
			panic("invalid vcpu state %d", vcpu->state);
		if (hostcpu != curcpu)
			ipi_cpu(hostcpu, vmm_ipinum);
	}
	vcpu_unlock(vcpu);
}

struct vmspace *
vm_get_vmspace(struct vm *vm)
{

	return (vm->vmspace);
}

int
vm_apicid2vcpuid(struct vm *vm, int apicid)
{
	/*
	 * XXX apic id is assumed to be numerically identical to vcpu id
	 */
	return (apicid);
}