bhyverun.c revision 246686
1169689Skan/*-
2169689Skan * Copyright (c) 2011 NetApp, Inc.
3132718Skan * All rights reserved.
4169689Skan *
590075Sobrien * Redistribution and use in source and binary forms, with or without
690075Sobrien * modification, are permitted provided that the following conditions
790075Sobrien * are met:
890075Sobrien * 1. Redistributions of source code must retain the above copyright
990075Sobrien *    notice, this list of conditions and the following disclaimer.
1090075Sobrien * 2. Redistributions in binary form must reproduce the above copyright
1190075Sobrien *    notice, this list of conditions and the following disclaimer in the
1290075Sobrien *    documentation and/or other materials provided with the distribution.
1390075Sobrien *
14169689Skan * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
15169689Skan * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16169689Skan * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17169689Skan * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
1890075Sobrien * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19169689Skan * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20169689Skan * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21169689Skan * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22169689Skan * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23169689Skan * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24169689Skan * SUCH DAMAGE.
25169689Skan *
2690075Sobrien * $FreeBSD: head/usr.sbin/bhyve/bhyverun.c 246686 2013-02-11 20:36:07Z neel $
27169689Skan */
28169689Skan
29169689Skan#include <sys/cdefs.h>
30169689Skan__FBSDID("$FreeBSD: head/usr.sbin/bhyve/bhyverun.c 246686 2013-02-11 20:36:07Z neel $");
31169689Skan
32169689Skan#include <sys/types.h>
33169689Skan#include <sys/mman.h>
34169689Skan#include <sys/time.h>
35169689Skan
36169689Skan#include <machine/segments.h>
37169689Skan
38169689Skan#include <stdio.h>
39169689Skan#include <stdlib.h>
40169689Skan#include <libgen.h>
41169689Skan#include <unistd.h>
42169689Skan#include <assert.h>
43169689Skan#include <errno.h>
44169689Skan#include <signal.h>
45169689Skan#include <pthread.h>
46169689Skan#include <pthread_np.h>
47169689Skan
48169689Skan#include <machine/vmm.h>
49169689Skan#include <vmmapi.h>
50169689Skan
51169689Skan#include "bhyverun.h"
52169689Skan#include "acpi.h"
53169689Skan#include "inout.h"
54169689Skan#include "dbgport.h"
55169689Skan#include "mem.h"
56169689Skan#include "mevent.h"
57169689Skan#include "mptbl.h"
58169689Skan#include "pci_emul.h"
59169689Skan#include "xmsr.h"
60169689Skan#include "ioapic.h"
61169689Skan#include "spinup_ap.h"
62169689Skan
/* Defaults reported by usage() and used to initialise guest_hz/guest_tslice. */
#define	DEFAULT_GUEST_HZ	100
#define	DEFAULT_GUEST_TSLICE	200

/* I/O port the guest writes to in order to notify the host. */
#define	GUEST_NIO_PORT		0x488

/*
 * Return codes of the vmexit handlers; vm_loop() uses them to decide
 * where the vcpu resumes.
 */
#define	VMEXIT_SWITCH		0	/* force vcpu switch in mux mode */
#define	VMEXIT_CONTINUE		1	/* continue from next instruction */
#define	VMEXIT_RESTART		2	/* restart current instruction */
#define	VMEXIT_ABORT		3	/* abort the vm run loop */
#define	VMEXIT_RESET		4	/* guest machine has reset */

/* Size units, as unsigned long. */
#define	MB			(1UL << 20)
#define	GB			(1UL << 30)
76169689Skan
77169689Skantypedef int (*vmexit_handler_t)(struct vmctx *, struct vm_exit *, int *vcpu);
78169689Skan
79169689Skanint guest_tslice = DEFAULT_GUEST_TSLICE;
80169689Skanint guest_hz = DEFAULT_GUEST_HZ;
81169689Skanchar *vmname;
82169689Skan
83169689Skanu_long lomem_sz;
84169689Skanu_long himem_sz;
85169689Skan
8690075Sobrienint guest_ncpus;
87169689Skan
88169689Skanstatic int pincpu = -1;
89169689Skanstatic int guest_vcpu_mux;
90169689Skanstatic int guest_vmexit_on_hlt, guest_vmexit_on_pause, disable_x2apic;
91169689Skan
92169689Skanstatic int foundcpus;
93169689Skan
9490075Sobrienstatic int strictio;
95169689Skan
96169689Skanstatic int acpi;
97169689Skan
98169689Skanstatic char *lomem_addr;
99169689Skanstatic char *himem_addr;
100169689Skan
101169689Skanstatic char *progname;
10290075Sobrienstatic const int BSP = 0;
103169689Skan
104169689Skanstatic int cpumask;
105169689Skan
106169689Skanstatic void vm_loop(struct vmctx *ctx, int vcpu, uint64_t rip);
107169689Skan
108169689Skanstruct vm_exit vmexit[VM_MAXCPU];
109169689Skan
/*
 * Event counters bumped by the vmexit handlers and by vm_loop().
 */
struct fbsdstats {
	uint64_t	vmexit_bogus;		/* vmexit_bogus() calls */
	uint64_t	vmexit_bogus_switch;	/* ... that switched vcpu */
	uint64_t	vmexit_hlt;		/* vmexit_hlt() calls */
	uint64_t	vmexit_pause;		/* vmexit_pause() calls */
	uint64_t	vmexit_mtrap;		/* vmexit_mtrap() calls */
	uint64_t	vmexit_paging;		/* vmexit_paging() calls */
	uint64_t	cpu_switch_rotate;	/* switches to the next vcpu */
	uint64_t	cpu_switch_direct;	/* switches to a chosen vcpu */
	int		io_reset;		/* vmexit_catch_reset() calls */
} stats;
121169689Skan
/*
 * Per-vcpu thread bookkeeping.  fbsdrun_addcpu() fills in the slot for a
 * vcpu and passes its address to the thread it creates.
 */
struct mt_vmm_info {
	pthread_t	mt_thr;		/* thread running the vcpu */
	struct vmctx	*mt_ctx;	/* VM context for that thread */
	int		mt_vcpu;	/* vcpu id, also the slot index */
} mt_vmm_info[VM_MAXCPU];
127169689Skan
128169689Skanstatic void
129169689Skanusage(int code)
130169689Skan{
131169689Skan
132169689Skan        fprintf(stderr,
133169689Skan                "Usage: %s [-aehABHIP][-g <gdb port>][-z <hz>][-s <pci>]"
134169689Skan		"[-S <pci>][-p pincpu][-n <pci>][-m lowmem][-M highmem]"
135169689Skan		" <vmname>\n"
136169689Skan		"       -a: local apic is in XAPIC mode (default is X2APIC)\n"
137169689Skan		"       -A: create an ACPI table\n"
138169689Skan		"       -g: gdb port (default is %d and 0 means don't open)\n"
139169689Skan		"       -c: # cpus (default 1)\n"
140169689Skan		"       -p: pin vcpu 'n' to host cpu 'pincpu + n'\n"
141169689Skan		"       -B: inject breakpoint exception on vm entry\n"
142169689Skan		"       -H: vmexit from the guest on hlt\n"
143169689Skan		"       -I: present an ioapic to the guest\n"
144169689Skan		"       -P: vmexit from the guest on pause\n"
145169689Skan		"	-e: exit on unhandled i/o access\n"
146169689Skan		"       -h: help\n"
147169689Skan		"       -z: guest hz (default is %d)\n"
148169689Skan		"       -s: <slot,driver,configinfo> PCI slot config\n"
149169689Skan		"       -S: <slot,driver,configinfo> legacy PCI slot config\n"
150169689Skan		"       -m: lowmem in MB\n"
151169689Skan		"       -M: highmem in MB\n"
152169689Skan		"       -x: mux vcpus to 1 hcpu\n"
153169689Skan		"       -t: mux vcpu timeslice hz (default %d)\n",
154169689Skan		progname, DEFAULT_GDB_PORT, DEFAULT_GUEST_HZ,
155169689Skan		DEFAULT_GUEST_TSLICE);
156169689Skan	exit(code);
157169689Skan}
158169689Skan
159169689Skanvoid *
160169689Skanpaddr_guest2host(uintptr_t gaddr)
161169689Skan{
162169689Skan	if (lomem_sz == 0)
163169689Skan		return (NULL);
164169689Skan
165169689Skan	if (gaddr < lomem_sz) {
166169689Skan		return ((void *)(lomem_addr + gaddr));
167169689Skan	} else if (gaddr >= 4*GB && gaddr < (4*GB + himem_sz)) {
168169689Skan		return ((void *)(himem_addr + gaddr - 4*GB));
169169689Skan	} else
170169689Skan		return (NULL);
171169689Skan}
172169689Skan
173169689Skanint
174169689Skanfbsdrun_disable_x2apic(void)
175169689Skan{
176169689Skan
177169689Skan	return (disable_x2apic);
17890075Sobrien}
179169689Skan
18090075Sobrienint
181169689Skanfbsdrun_vmexit_on_pause(void)
182169689Skan{
183169689Skan
184169689Skan	return (guest_vmexit_on_pause);
185169689Skan}
186169689Skan
187169689Skanint
188169689Skanfbsdrun_vmexit_on_hlt(void)
189169689Skan{
19090075Sobrien
191169689Skan	return (guest_vmexit_on_hlt);
19290075Sobrien}
193169689Skan
194169689Skanint
195169689Skanfbsdrun_muxed(void)
196169689Skan{
197169689Skan
198169689Skan	return (guest_vcpu_mux);
199169689Skan}
200169689Skan
201169689Skanstatic void *
202169689Skanfbsdrun_start_thread(void *param)
203169689Skan{
20490075Sobrien	char tname[MAXCOMLEN + 1];
205169689Skan	struct mt_vmm_info *mtp;
20690075Sobrien	int vcpu;
207169689Skan
208169689Skan	mtp = param;
20990075Sobrien	vcpu = mtp->mt_vcpu;
210169689Skan
211169689Skan	snprintf(tname, sizeof(tname), "%s vcpu %d", vmname, vcpu);
212169689Skan	pthread_set_name_np(mtp->mt_thr, tname);
213169689Skan
214169689Skan	vm_loop(mtp->mt_ctx, vcpu, vmexit[vcpu].rip);
215169689Skan
21690075Sobrien	/* not reached */
217169689Skan	exit(1);
21890075Sobrien	return (NULL);
219169689Skan}
220169689Skan
221169689Skanvoid
222169689Skanfbsdrun_addcpu(struct vmctx *ctx, int vcpu, uint64_t rip)
22390075Sobrien{
224169689Skan	int error;
22590075Sobrien
226169689Skan	if (cpumask & (1 << vcpu)) {
227169689Skan		fprintf(stderr, "addcpu: attempting to add existing cpu %d\n",
228169689Skan		    vcpu);
229169689Skan		exit(1);
230169689Skan	}
231169689Skan
232169689Skan	cpumask |= 1 << vcpu;
233169689Skan	foundcpus++;
23490075Sobrien
235169689Skan	/*
23690075Sobrien	 * Set up the vmexit struct to allow execution to start
237169689Skan	 * at the given RIP
238169689Skan	 */
239169689Skan	vmexit[vcpu].rip = rip;
240169689Skan	vmexit[vcpu].inst_length = 0;
241169689Skan
242169689Skan	if (vcpu == BSP || !guest_vcpu_mux){
243169689Skan		mt_vmm_info[vcpu].mt_ctx = ctx;
244169689Skan		mt_vmm_info[vcpu].mt_vcpu = vcpu;
245169689Skan
246169689Skan		error = pthread_create(&mt_vmm_info[vcpu].mt_thr, NULL,
24790075Sobrien				fbsdrun_start_thread, &mt_vmm_info[vcpu]);
248169689Skan		assert(error == 0);
249169689Skan	}
250169689Skan}
251169689Skan
252169689Skanstatic int
253169689Skanfbsdrun_get_next_cpu(int curcpu)
254169689Skan{
255169689Skan
256169689Skan	/*
257169689Skan	 * Get the next available CPU. Assumes they arrive
258169689Skan	 * in ascending order with no gaps.
259169689Skan	 */
260169689Skan	return ((curcpu + 1) % foundcpus);
261169689Skan}
262169689Skan
263169689Skanstatic int
264169689Skanvmexit_catch_reset(void)
265169689Skan{
266169689Skan        stats.io_reset++;
267169689Skan        return (VMEXIT_RESET);
268169689Skan}
269169689Skan
270169689Skanstatic int
271169689Skanvmexit_catch_inout(void)
272169689Skan{
273169689Skan	return (VMEXIT_ABORT);
274169689Skan}
275169689Skan
276169689Skanstatic int
277169689Skanvmexit_handle_notify(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu,
278169689Skan		     uint32_t eax)
279169689Skan{
280169689Skan#if PG_DEBUG /* put all types of debug here */
281169689Skan        if (eax == 0) {
282169689Skan		pause_noswitch = 1;
283169689Skan	} else if (eax == 1) {
284169689Skan		pause_noswitch = 0;
285169689Skan	} else {
286169689Skan		pause_noswitch = 0;
287169689Skan		if (eax == 5) {
288169689Skan			vm_set_capability(ctx, *pvcpu, VM_CAP_MTRAP_EXIT, 1);
289169689Skan		}
290169689Skan	}
291169689Skan#endif
292169689Skan        return (VMEXIT_CONTINUE);
293169689Skan}
294169689Skan
295169689Skanstatic int
296169689Skanvmexit_inout(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
297169689Skan{
298169689Skan	int error;
299169689Skan	int bytes, port, in, out;
300169689Skan	uint32_t eax;
301169689Skan	int vcpu;
302169689Skan
303169689Skan	vcpu = *pvcpu;
304169689Skan
305169689Skan	port = vme->u.inout.port;
306169689Skan	bytes = vme->u.inout.bytes;
307169689Skan	eax = vme->u.inout.eax;
308169689Skan	in = vme->u.inout.in;
309169689Skan	out = !in;
310169689Skan
311169689Skan	/* We don't deal with these */
312169689Skan	if (vme->u.inout.string || vme->u.inout.rep)
313169689Skan		return (VMEXIT_ABORT);
314169689Skan
315169689Skan	/* Special case of guest reset */
316169689Skan	if (out && port == 0x64 && (uint8_t)eax == 0xFE)
317169689Skan		return (vmexit_catch_reset());
318169689Skan
319169689Skan        /* Extra-special case of host notifications */
320169689Skan        if (out && port == GUEST_NIO_PORT)
321169689Skan                return (vmexit_handle_notify(ctx, vme, pvcpu, eax));
322169689Skan
323169689Skan	error = emulate_inout(ctx, vcpu, in, port, bytes, &eax, strictio);
324169689Skan	if (error == 0 && in)
325169689Skan		error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RAX, eax);
326169689Skan
327169689Skan	if (error == 0)
328169689Skan		return (VMEXIT_CONTINUE);
329169689Skan	else {
330169689Skan		fprintf(stderr, "Unhandled %s%c 0x%04x\n",
331169689Skan			in ? "in" : "out",
332169689Skan			bytes == 1 ? 'b' : (bytes == 2 ? 'w' : 'l'), port);
333169689Skan		return (vmexit_catch_inout());
334169689Skan	}
335169689Skan}
336169689Skan
337169689Skanstatic int
338169689Skanvmexit_rdmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
339169689Skan{
340169689Skan	fprintf(stderr, "vm exit rdmsr 0x%x, cpu %d\n", vme->u.msr.code,
341169689Skan	    *pvcpu);
342169689Skan	return (VMEXIT_ABORT);
343169689Skan}
344169689Skan
345169689Skanstatic int
346169689Skanvmexit_wrmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
347169689Skan{
348169689Skan	int newcpu;
349169689Skan	int retval = VMEXIT_CONTINUE;
350169689Skan
351169689Skan	newcpu = emulate_wrmsr(ctx, *pvcpu, vme->u.msr.code,vme->u.msr.wval);
352169689Skan
353169689Skan	if (guest_vcpu_mux && *pvcpu != newcpu) {
354169689Skan                retval = VMEXIT_SWITCH;
355169689Skan                *pvcpu = newcpu;
356169689Skan        }
357169689Skan
358169689Skan        return (retval);
359169689Skan}
360169689Skan
361169689Skanstatic int
362169689Skanvmexit_spinup_ap(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
363169689Skan{
364169689Skan	int newcpu;
365169689Skan	int retval = VMEXIT_CONTINUE;
366169689Skan
367169689Skan	newcpu = spinup_ap(ctx, *pvcpu,
368169689Skan			   vme->u.spinup_ap.vcpu, vme->u.spinup_ap.rip);
369169689Skan
370169689Skan	if (guest_vcpu_mux && *pvcpu != newcpu) {
371169689Skan		retval = VMEXIT_SWITCH;
372169689Skan		*pvcpu = newcpu;
373169689Skan	}
374169689Skan
375169689Skan	return (retval);
376169689Skan}
377169689Skan
378169689Skanstatic int
379169689Skanvmexit_vmx(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
380169689Skan{
381169689Skan
382169689Skan	fprintf(stderr, "vm exit[%d]\n", *pvcpu);
383169689Skan	fprintf(stderr, "\treason\t\tVMX\n");
384169689Skan	fprintf(stderr, "\trip\t\t0x%016lx\n", vmexit->rip);
385169689Skan	fprintf(stderr, "\tinst_length\t%d\n", vmexit->inst_length);
386169689Skan	fprintf(stderr, "\terror\t\t%d\n", vmexit->u.vmx.error);
387169689Skan	fprintf(stderr, "\texit_reason\t%u\n", vmexit->u.vmx.exit_reason);
388169689Skan	fprintf(stderr, "\tqualification\t0x%016lx\n",
389169689Skan	    vmexit->u.vmx.exit_qualification);
390169689Skan
391169689Skan	return (VMEXIT_ABORT);
392169689Skan}
393169689Skan
394169689Skanstatic int bogus_noswitch = 1;
395169689Skan
396169689Skanstatic int
397169689Skanvmexit_bogus(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
398169689Skan{
399169689Skan	stats.vmexit_bogus++;
400169689Skan
401169689Skan	if (!guest_vcpu_mux || guest_ncpus == 1 || bogus_noswitch) {
402169689Skan		return (VMEXIT_RESTART);
403169689Skan	} else {
404169689Skan		stats.vmexit_bogus_switch++;
405169689Skan		vmexit->inst_length = 0;
406169689Skan		*pvcpu = -1;
407169689Skan		return (VMEXIT_SWITCH);
408169689Skan	}
409169689Skan}
410169689Skan
411169689Skanstatic int
412169689Skanvmexit_hlt(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
413169689Skan{
414169689Skan	stats.vmexit_hlt++;
415169689Skan	if (fbsdrun_muxed()) {
416169689Skan		*pvcpu = -1;
417169689Skan		return (VMEXIT_SWITCH);
418169689Skan	} else {
419169689Skan		/*
420169689Skan		 * Just continue execution with the next instruction. We use
421169689Skan		 * the HLT VM exit as a way to be friendly with the host
422169689Skan		 * scheduler.
423169689Skan		 */
424169689Skan		return (VMEXIT_CONTINUE);
425169689Skan	}
426169689Skan}
427169689Skan
428169689Skanstatic int pause_noswitch;
429169689Skan
430169689Skanstatic int
431169689Skanvmexit_pause(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
432169689Skan{
433169689Skan	stats.vmexit_pause++;
434169689Skan
435169689Skan	if (fbsdrun_muxed() && !pause_noswitch) {
436169689Skan		*pvcpu = -1;
437169689Skan		return (VMEXIT_SWITCH);
438169689Skan        } else {
439169689Skan		return (VMEXIT_CONTINUE);
440169689Skan	}
441169689Skan}
442169689Skan
443169689Skanstatic int
444169689Skanvmexit_mtrap(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
445169689Skan{
446169689Skan	stats.vmexit_mtrap++;
447169689Skan
448169689Skan	return (VMEXIT_RESTART);
449169689Skan}
450169689Skan
451169689Skanstatic int
452169689Skanvmexit_paging(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
453169689Skan{
454169689Skan	int err;
455169689Skan	stats.vmexit_paging++;
456169689Skan
457169689Skan	err = emulate_mem(ctx, *pvcpu, vmexit->u.paging.gpa,
458169689Skan			  &vmexit->u.paging.vie);
459169689Skan
460169689Skan	if (err) {
461169689Skan		if (err == EINVAL) {
462169689Skan			fprintf(stderr,
463169689Skan			    "Failed to emulate instruction at 0x%lx\n",
464169689Skan			    vmexit->rip);
465169689Skan		} else if (err == ESRCH) {
466169689Skan			fprintf(stderr, "Unhandled memory access to 0x%lx\n",
467169689Skan			    vmexit->u.paging.gpa);
468169689Skan		}
469169689Skan
470169689Skan		return (VMEXIT_ABORT);
471169689Skan	}
472169689Skan
473169689Skan	return (VMEXIT_CONTINUE);
474169689Skan}
475169689Skan
/*
 * SIGALRM handler installed by setup_timeslice().  Deliberately does
 * nothing.
 */
static void
sigalrm(int sig)
{
}
481169689Skan
482169689Skanstatic void
483169689Skansetup_timeslice(void)
484169689Skan{
485169689Skan	struct sigaction sa;
486169689Skan	struct itimerval itv;
487169689Skan	int error;
488169689Skan
489169689Skan	/*
490169689Skan	 * Setup a realtime timer to generate a SIGALRM at a
491169689Skan	 * frequency of 'guest_tslice' ticks per second.
492169689Skan	 */
493169689Skan	sigemptyset(&sa.sa_mask);
494169689Skan	sa.sa_flags = 0;
495169689Skan	sa.sa_handler = sigalrm;
496169689Skan
497169689Skan	error = sigaction(SIGALRM, &sa, NULL);
498169689Skan	assert(error == 0);
499169689Skan
500169689Skan	itv.it_interval.tv_sec = 0;
501169689Skan	itv.it_interval.tv_usec = 1000000 / guest_tslice;
502169689Skan	itv.it_value.tv_sec = 0;
503169689Skan	itv.it_value.tv_usec = 1000000 / guest_tslice;
504169689Skan
505169689Skan	error = setitimer(ITIMER_REAL, &itv, NULL);
506169689Skan	assert(error == 0);
507169689Skan}
508169689Skan
509169689Skanstatic vmexit_handler_t handler[VM_EXITCODE_MAX] = {
510169689Skan	[VM_EXITCODE_INOUT]  = vmexit_inout,
511169689Skan	[VM_EXITCODE_VMX]    = vmexit_vmx,
512169689Skan	[VM_EXITCODE_BOGUS]  = vmexit_bogus,
513169689Skan	[VM_EXITCODE_RDMSR]  = vmexit_rdmsr,
514169689Skan	[VM_EXITCODE_WRMSR]  = vmexit_wrmsr,
515169689Skan	[VM_EXITCODE_MTRAP]  = vmexit_mtrap,
516169689Skan	[VM_EXITCODE_PAGING] = vmexit_paging,
517169689Skan	[VM_EXITCODE_SPINUP_AP] = vmexit_spinup_ap,
518169689Skan};
519169689Skan
520169689Skanstatic void
521169689Skanvm_loop(struct vmctx *ctx, int vcpu, uint64_t rip)
522169689Skan{
523169689Skan	cpuset_t mask;
524169689Skan	int error, rc, prevcpu;
525169689Skan
526169689Skan	if (guest_vcpu_mux)
527169689Skan		setup_timeslice();
528169689Skan
529169689Skan	if (pincpu >= 0) {
530169689Skan		CPU_ZERO(&mask);
531169689Skan		CPU_SET(pincpu + vcpu, &mask);
532169689Skan		error = pthread_setaffinity_np(pthread_self(),
533169689Skan					       sizeof(mask), &mask);
534169689Skan		assert(error == 0);
535169689Skan	}
536169689Skan
537169689Skan	while (1) {
538169689Skan		error = vm_run(ctx, vcpu, rip, &vmexit[vcpu]);
539169689Skan		if (error != 0) {
540169689Skan			/*
541169689Skan			 * It is possible that 'vmmctl' or some other process
542169689Skan			 * has transitioned the vcpu to CANNOT_RUN state right
543169689Skan			 * before we tried to transition it to RUNNING.
544169689Skan			 *
545169689Skan			 * This is expected to be temporary so just retry.
546169689Skan			 */
547169689Skan			if (errno == EBUSY)
548169689Skan				continue;
549169689Skan			else
550169689Skan				break;
551169689Skan		}
552169689Skan
553169689Skan		prevcpu = vcpu;
554169689Skan                rc = (*handler[vmexit[vcpu].exitcode])(ctx, &vmexit[vcpu],
555169689Skan                                                       &vcpu);
556169689Skan		switch (rc) {
557169689Skan                case VMEXIT_SWITCH:
558169689Skan			assert(guest_vcpu_mux);
559169689Skan			if (vcpu == -1) {
560169689Skan				stats.cpu_switch_rotate++;
561169689Skan				vcpu = fbsdrun_get_next_cpu(prevcpu);
562169689Skan			} else {
563169689Skan				stats.cpu_switch_direct++;
564169689Skan			}
565169689Skan			/* fall through */
566169689Skan		case VMEXIT_CONTINUE:
567169689Skan                        rip = vmexit[vcpu].rip + vmexit[vcpu].inst_length;
568169689Skan			break;
569169689Skan		case VMEXIT_RESTART:
570169689Skan                        rip = vmexit[vcpu].rip;
571169689Skan			break;
572169689Skan		case VMEXIT_RESET:
573169689Skan			exit(0);
574169689Skan		default:
575169689Skan			exit(1);
576169689Skan		}
577169689Skan	}
578169689Skan	fprintf(stderr, "vm_run error %d, errno %d\n", error, errno);
579169689Skan}
580169689Skan
581169689Skanstatic int
582169689Skannum_vcpus_allowed(struct vmctx *ctx)
583169689Skan{
584169689Skan	int tmp, error;
585169689Skan
586169689Skan	error = vm_get_capability(ctx, BSP, VM_CAP_UNRESTRICTED_GUEST, &tmp);
587169689Skan
588169689Skan	/*
589169689Skan	 * The guest is allowed to spinup more than one processor only if the
590169689Skan	 * UNRESTRICTED_GUEST capability is available.
591169689Skan	 */
592169689Skan	if (error == 0)
593169689Skan		return (VM_MAXCPU);
594169689Skan	else
595169689Skan		return (1);
596169689Skan}
597169689Skan
598169689Skanint
599169689Skanmain(int argc, char *argv[])
600169689Skan{
601169689Skan	int c, error, gdb_port, inject_bkpt, tmp, err, ioapic, bvmcons;
602169689Skan	int max_vcpus;
603169689Skan	struct vmctx *ctx;
604169689Skan	uint64_t rip;
605169689Skan
606169689Skan	bvmcons = 0;
607169689Skan	inject_bkpt = 0;
608169689Skan	progname = basename(argv[0]);
609169689Skan	gdb_port = DEFAULT_GDB_PORT;
610169689Skan	guest_ncpus = 1;
611169689Skan	ioapic = 0;
612169689Skan
613169689Skan	while ((c = getopt(argc, argv, "abehABHIPxp:g:c:z:s:S:n:m:M:")) != -1) {
614169689Skan		switch (c) {
615169689Skan		case 'a':
616169689Skan			disable_x2apic = 1;
617169689Skan			break;
618169689Skan		case 'A':
619169689Skan			acpi = 1;
620169689Skan			break;
621169689Skan		case 'b':
622169689Skan			bvmcons = 1;
623169689Skan			break;
624169689Skan		case 'B':
625169689Skan			inject_bkpt = 1;
626169689Skan			break;
627169689Skan		case 'x':
628169689Skan			guest_vcpu_mux = 1;
629169689Skan			break;
630169689Skan		case 'p':
631169689Skan			pincpu = atoi(optarg);
632169689Skan			break;
633169689Skan                case 'c':
634169689Skan			guest_ncpus = atoi(optarg);
635169689Skan			break;
636169689Skan		case 'g':
637169689Skan			gdb_port = atoi(optarg);
638169689Skan			break;
639169689Skan		case 'z':
640169689Skan			guest_hz = atoi(optarg);
641169689Skan			break;
642169689Skan		case 't':
643169689Skan			guest_tslice = atoi(optarg);
644169689Skan			break;
645169689Skan		case 's':
646169689Skan			pci_parse_slot(optarg, 0);
647169689Skan			break;
648169689Skan		case 'S':
64990075Sobrien			pci_parse_slot(optarg, 1);
65090075Sobrien			break;
65190075Sobrien                case 'm':
65290075Sobrien			lomem_sz = strtoul(optarg, NULL, 0) * MB;
65390075Sobrien			break;
65490075Sobrien                case 'M':
65590075Sobrien			himem_sz = strtoul(optarg, NULL, 0) * MB;
65690075Sobrien			break;
65790075Sobrien		case 'H':
65890075Sobrien			guest_vmexit_on_hlt = 1;
65990075Sobrien			break;
66090075Sobrien		case 'I':
66190075Sobrien			ioapic = 1;
66290075Sobrien			break;
66390075Sobrien		case 'P':
66490075Sobrien			guest_vmexit_on_pause = 1;
66590075Sobrien			break;
66690075Sobrien		case 'e':
66790075Sobrien			strictio = 1;
66890075Sobrien			break;
66990075Sobrien		case 'h':
67090075Sobrien			usage(0);
671169689Skan		default:
67290075Sobrien			usage(1);
673169689Skan		}
674169689Skan	}
675169689Skan	argc -= optind;
676169689Skan	argv += optind;
677169689Skan
67890075Sobrien	if (argc != 1)
679169689Skan		usage(1);
68090075Sobrien
681169689Skan	/* No need to mux if guest is uni-processor */
682169689Skan	if (guest_ncpus <= 1)
683169689Skan		guest_vcpu_mux = 0;
684169689Skan
685169689Skan	/* vmexit on hlt if guest is muxed */
686169689Skan	if (guest_vcpu_mux) {
68790075Sobrien		guest_vmexit_on_hlt = 1;
688169689Skan		guest_vmexit_on_pause = 1;
68990075Sobrien	}
690169689Skan
691169689Skan	vmname = argv[0];
692169689Skan
69390075Sobrien	ctx = vm_open(vmname);
694169689Skan	if (ctx == NULL) {
69590075Sobrien		perror("vm_open");
696169689Skan		exit(1);
69790075Sobrien	}
69890075Sobrien
69990075Sobrien	max_vcpus = num_vcpus_allowed(ctx);
70090075Sobrien	if (guest_ncpus > max_vcpus) {
70190075Sobrien		fprintf(stderr, "%d vCPUs requested but only %d available\n",
70290075Sobrien			guest_ncpus, max_vcpus);
70390075Sobrien		exit(1);
70490075Sobrien	}
70590075Sobrien
70690075Sobrien	if (fbsdrun_vmexit_on_hlt()) {
70790075Sobrien		err = vm_get_capability(ctx, BSP, VM_CAP_HALT_EXIT, &tmp);
70890075Sobrien		if (err < 0) {
70990075Sobrien			fprintf(stderr, "VM exit on HLT not supported\n");
71090075Sobrien			exit(1);
71190075Sobrien		}
71290075Sobrien		vm_set_capability(ctx, BSP, VM_CAP_HALT_EXIT, 1);
71390075Sobrien		handler[VM_EXITCODE_HLT] = vmexit_hlt;
714169689Skan	}
71590075Sobrien
716169689Skan        if (fbsdrun_vmexit_on_pause()) {
717169689Skan		/*
718169689Skan		 * pause exit support required for this mode
719169689Skan		 */
720169689Skan		err = vm_get_capability(ctx, BSP, VM_CAP_PAUSE_EXIT, &tmp);
721169689Skan		if (err < 0) {
722169689Skan			fprintf(stderr,
723169689Skan			    "SMP mux requested, no pause support\n");
724169689Skan			exit(1);
72590075Sobrien		}
726169689Skan		vm_set_capability(ctx, BSP, VM_CAP_PAUSE_EXIT, 1);
727132718Skan		handler[VM_EXITCODE_PAUSE] = vmexit_pause;
728169689Skan        }
729169689Skan
730169689Skan	if (fbsdrun_disable_x2apic())
731132718Skan		err = vm_set_x2apic_state(ctx, BSP, X2APIC_DISABLED);
732169689Skan	else
733132718Skan		err = vm_set_x2apic_state(ctx, BSP, X2APIC_ENABLED);
734169689Skan
735169689Skan	if (err) {
736169689Skan		fprintf(stderr, "Unable to set x2apic state (%d)\n", err);
737169689Skan		exit(1);
738169689Skan	}
739132718Skan
740169689Skan	if (lomem_sz != 0) {
741132718Skan		lomem_addr = vm_map_memory(ctx, 0, lomem_sz);
742169689Skan		if (lomem_addr == (char *) MAP_FAILED) {
743169689Skan			lomem_sz = 0;
744169689Skan		} else if (himem_sz != 0) {
745132718Skan			himem_addr = vm_map_memory(ctx, 4*GB, himem_sz);
746169689Skan			if (himem_addr == (char *) MAP_FAILED) {
74790075Sobrien				lomem_sz = 0;
748169689Skan				himem_sz = 0;
749169689Skan			}
750169689Skan		}
751169689Skan	}
752169689Skan
753169689Skan	init_inout();
75490075Sobrien	init_pci(ctx);
755169689Skan	if (ioapic)
75690075Sobrien		ioapic_init(0);
757169689Skan
758169689Skan	if (gdb_port != 0)
759169689Skan		init_dbgport(gdb_port);
760169689Skan
761169689Skan	if (bvmcons)
76290075Sobrien		init_bvmcons();
763169689Skan
76490075Sobrien	error = vm_get_register(ctx, BSP, VM_REG_GUEST_RIP, &rip);
765169689Skan	assert(error == 0);
766169689Skan
767169689Skan	if (inject_bkpt) {
768169689Skan		error = vm_inject_event(ctx, BSP, VM_HW_EXCEPTION, IDT_BP);
769169689Skan		assert(error == 0);
77090075Sobrien	}
771169689Skan
77290075Sobrien	/*
773169689Skan	 * build the guest tables, MP etc.
774169689Skan	 */
775169689Skan	mptable_build(ctx, guest_ncpus, ioapic);
776169689Skan
777169689Skan	if (acpi) {
77890075Sobrien		error = acpi_build(ctx, guest_ncpus, ioapic);
779169689Skan		assert(error == 0);
780169689Skan	}
781169689Skan
78290075Sobrien	/*
783169689Skan	 * Add CPU 0
784169689Skan	 */
785169689Skan	fbsdrun_addcpu(ctx, BSP, rip);
786169689Skan
78790075Sobrien	/*
788169689Skan	 * Head off to the main event dispatch loop
78990075Sobrien	 */
790169689Skan	mevent_dispatch();
791169689Skan
792169689Skan	exit(1);
793169689Skan}
794169689Skan