1/*-
2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3 *
4 * Copyright (c) 2011 NetApp, Inc.
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 *    notice, this list of conditions and the following disclaimer in the
14 *    documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 * SUCH DAMAGE.
27 *
28 * $FreeBSD: stable/11/usr.sbin/bhyve/bhyverun.c 363059 2020-07-09 20:55:18Z kaktus $
29 */
30
31#include <sys/cdefs.h>
32__FBSDID("$FreeBSD: stable/11/usr.sbin/bhyve/bhyverun.c 363059 2020-07-09 20:55:18Z kaktus $");
33
34#include <sys/types.h>
35#ifndef WITHOUT_CAPSICUM
36#include <sys/capsicum.h>
37#endif
38#include <sys/mman.h>
39#include <sys/time.h>
40
41#include <amd64/vmm/intel/vmcs.h>
42
43#include <machine/atomic.h>
44#include <machine/segments.h>
45
46#include <stdio.h>
47#include <stdlib.h>
48#include <string.h>
49#include <err.h>
50#include <errno.h>
51#include <libgen.h>
52#include <unistd.h>
53#include <assert.h>
54#include <errno.h>
55#include <pthread.h>
56#include <pthread_np.h>
57#include <sysexits.h>
58#include <stdbool.h>
59#include <stdint.h>
60#ifndef WITHOUT_CAPSICUM
61#include <nl_types.h>
62#include <termios.h>
63#endif
64
65#include <machine/vmm.h>
66#ifndef WITHOUT_CAPSICUM
67#include <machine/vmm_dev.h>
68#endif
69#include <vmmapi.h>
70
71#include "bhyverun.h"
72#include "acpi.h"
73#include "atkbdc.h"
74#include "inout.h"
75#include "dbgport.h"
76#include "fwctl.h"
77#include "ioapic.h"
78#include "mem.h"
79#include "mevent.h"
80#include "mptbl.h"
81#include "pci_emul.h"
82#include "pci_irq.h"
83#include "pci_lpc.h"
84#include "smbiostbl.h"
85#include "xmsr.h"
86#include "spinup_ap.h"
87#include "rtc.h"
88
89#define GUEST_NIO_PORT		0x488	/* guest upcalls via i/o port */
90
91#define MB		(1024UL * 1024)
92#define GB		(1024UL * MB)
93
94static const char * const vmx_exit_reason_desc[] = {
95	[EXIT_REASON_EXCEPTION] = "Exception or non-maskable interrupt (NMI)",
96	[EXIT_REASON_EXT_INTR] = "External interrupt",
97	[EXIT_REASON_TRIPLE_FAULT] = "Triple fault",
98	[EXIT_REASON_INIT] = "INIT signal",
99	[EXIT_REASON_SIPI] = "Start-up IPI (SIPI)",
100	[EXIT_REASON_IO_SMI] = "I/O system-management interrupt (SMI)",
101	[EXIT_REASON_SMI] = "Other SMI",
102	[EXIT_REASON_INTR_WINDOW] = "Interrupt window",
103	[EXIT_REASON_NMI_WINDOW] = "NMI window",
104	[EXIT_REASON_TASK_SWITCH] = "Task switch",
105	[EXIT_REASON_CPUID] = "CPUID",
106	[EXIT_REASON_GETSEC] = "GETSEC",
107	[EXIT_REASON_HLT] = "HLT",
108	[EXIT_REASON_INVD] = "INVD",
109	[EXIT_REASON_INVLPG] = "INVLPG",
110	[EXIT_REASON_RDPMC] = "RDPMC",
111	[EXIT_REASON_RDTSC] = "RDTSC",
112	[EXIT_REASON_RSM] = "RSM",
113	[EXIT_REASON_VMCALL] = "VMCALL",
114	[EXIT_REASON_VMCLEAR] = "VMCLEAR",
115	[EXIT_REASON_VMLAUNCH] = "VMLAUNCH",
116	[EXIT_REASON_VMPTRLD] = "VMPTRLD",
117	[EXIT_REASON_VMPTRST] = "VMPTRST",
118	[EXIT_REASON_VMREAD] = "VMREAD",
119	[EXIT_REASON_VMRESUME] = "VMRESUME",
120	[EXIT_REASON_VMWRITE] = "VMWRITE",
121	[EXIT_REASON_VMXOFF] = "VMXOFF",
122	[EXIT_REASON_VMXON] = "VMXON",
123	[EXIT_REASON_CR_ACCESS] = "Control-register accesses",
124	[EXIT_REASON_DR_ACCESS] = "MOV DR",
125	[EXIT_REASON_INOUT] = "I/O instruction",
126	[EXIT_REASON_RDMSR] = "RDMSR",
127	[EXIT_REASON_WRMSR] = "WRMSR",
128	[EXIT_REASON_INVAL_VMCS] =
129	    "VM-entry failure due to invalid guest state",
130	[EXIT_REASON_INVAL_MSR] = "VM-entry failure due to MSR loading",
131	[EXIT_REASON_MWAIT] = "MWAIT",
132	[EXIT_REASON_MTF] = "Monitor trap flag",
133	[EXIT_REASON_MONITOR] = "MONITOR",
134	[EXIT_REASON_PAUSE] = "PAUSE",
135	[EXIT_REASON_MCE_DURING_ENTRY] =
136	    "VM-entry failure due to machine-check event",
137	[EXIT_REASON_TPR] = "TPR below threshold",
138	[EXIT_REASON_APIC_ACCESS] = "APIC access",
139	[EXIT_REASON_VIRTUALIZED_EOI] = "Virtualized EOI",
140	[EXIT_REASON_GDTR_IDTR] = "Access to GDTR or IDTR",
141	[EXIT_REASON_LDTR_TR] = "Access to LDTR or TR",
142	[EXIT_REASON_EPT_FAULT] = "EPT violation",
143	[EXIT_REASON_EPT_MISCONFIG] = "EPT misconfiguration",
144	[EXIT_REASON_INVEPT] = "INVEPT",
145	[EXIT_REASON_RDTSCP] = "RDTSCP",
146	[EXIT_REASON_VMX_PREEMPT] = "VMX-preemption timer expired",
147	[EXIT_REASON_INVVPID] = "INVVPID",
148	[EXIT_REASON_WBINVD] = "WBINVD",
149	[EXIT_REASON_XSETBV] = "XSETBV",
150	[EXIT_REASON_APIC_WRITE] = "APIC write",
151	[EXIT_REASON_RDRAND] = "RDRAND",
152	[EXIT_REASON_INVPCID] = "INVPCID",
153	[EXIT_REASON_VMFUNC] = "VMFUNC",
154	[EXIT_REASON_ENCLS] = "ENCLS",
155	[EXIT_REASON_RDSEED] = "RDSEED",
156	[EXIT_REASON_PM_LOG_FULL] = "Page-modification log full",
157	[EXIT_REASON_XSAVES] = "XSAVES",
158	[EXIT_REASON_XRSTORS] = "XRSTORS"
159};
160
161typedef int (*vmexit_handler_t)(struct vmctx *, struct vm_exit *, int *vcpu);
162extern int vmexit_task_switch(struct vmctx *, struct vm_exit *, int *vcpu);
163
164char *vmname;
165
166int guest_ncpus;
167uint16_t cores, maxcpus, sockets, threads;
168
169char *guest_uuid_str;
170
171static int guest_vmexit_on_hlt, guest_vmexit_on_pause;
172static int virtio_msix = 1;
173static int x2apic_mode = 0;	/* default is xAPIC */
174static int destroy_on_poweroff = 0;
175
176static int strictio;
177static int strictmsr = 1;
178
179static int acpi;
180
181static char *progname;
182static const int BSP = 0;
183
184static cpuset_t cpumask;
185
186static void vm_loop(struct vmctx *ctx, int vcpu, uint64_t rip);
187
188static struct vm_exit vmexit[VM_MAXCPU];
189
190struct bhyvestats {
191	uint64_t	vmexit_bogus;
192	uint64_t	vmexit_reqidle;
193	uint64_t	vmexit_hlt;
194	uint64_t	vmexit_pause;
195	uint64_t	vmexit_mtrap;
196	uint64_t	vmexit_inst_emul;
197	uint64_t	cpu_switch_rotate;
198	uint64_t	cpu_switch_direct;
199} stats;
200
201struct mt_vmm_info {
202	pthread_t	mt_thr;
203	struct vmctx	*mt_ctx;
204	int		mt_vcpu;
205} mt_vmm_info[VM_MAXCPU];
206
207static cpuset_t *vcpumap[VM_MAXCPU] = { NULL };
208
209static void
210usage(int code)
211{
212
213        fprintf(stderr,
214		"Usage: %s [-abehuwxACDHPSWY]\n"
215		"       %*s [-c [[cpus=]numcpus][,sockets=n][,cores=n][,threads=n]]\n"
216		"       %*s [-g <gdb port>] [-l <lpc>]\n"
217		"       %*s [-m mem] [-p vcpu:hostcpu] [-s <pci>] [-U uuid] <vm>\n"
218		"       -a: local apic is in xAPIC mode (deprecated)\n"
219		"       -A: create ACPI tables\n"
220		"       -c: number of cpus and/or topology specification\n"
221		"       -C: include guest memory in core file\n"
222		"       -D: destroy on power-off\n"
223		"       -e: exit on unhandled I/O access\n"
224		"       -g: gdb port\n"
225		"       -h: help\n"
226		"       -H: vmexit from the guest on hlt\n"
227		"       -l: LPC device configuration\n"
228		"       -m: memory size in MB\n"
229		"       -p: pin 'vcpu' to 'hostcpu'\n"
230		"       -P: vmexit from the guest on pause\n"
231		"       -s: <slot,driver,configinfo> PCI slot config\n"
232		"       -S: guest memory cannot be swapped\n"
233		"       -u: RTC keeps UTC time\n"
234		"       -U: uuid\n"
235		"       -w: ignore unimplemented MSRs\n"
236		"       -W: force virtio to use single-vector MSI\n"
237		"       -x: local apic is in x2APIC mode\n"
238		"       -Y: disable MPtable generation\n",
239		progname, (int)strlen(progname), "", (int)strlen(progname), "",
240		(int)strlen(progname), "");
241
242	exit(code);
243}
244
245#ifndef WITHOUT_CAPSICUM
246/*
247 * 11-stable capsicum helpers
248 */
static void
bhyve_caph_cache_catpages(void)
{
	nl_catd libc_cat;

	/*
	 * Open the "libc" message catalog now; main() calls this just
	 * before cap_enter().  The descriptor itself is not needed.
	 */
	libc_cat = catopen("libc", NL_CAT_LOCALE);
	(void)libc_cat;
}
255
256static int
257bhyve_caph_limit_stdoe(void)
258{
259	cap_rights_t rights;
260	unsigned long cmds[] = { TIOCGETA, TIOCGWINSZ };
261	int i, fds[] = { STDOUT_FILENO, STDERR_FILENO };
262
263	cap_rights_init(&rights, CAP_FCNTL, CAP_FSTAT, CAP_IOCTL);
264	cap_rights_set(&rights, CAP_WRITE);
265
266	for (i = 0; i < nitems(fds); i++) {
267		if (cap_rights_limit(fds[i], &rights) < 0 && errno != ENOSYS)
268			return (-1);
269
270		if (cap_ioctls_limit(fds[i], cmds, nitems(cmds)) < 0 && errno != ENOSYS)
271			return (-1);
272
273		if (cap_fcntls_limit(fds[i], CAP_FCNTL_GETFL) < 0 && errno != ENOSYS)
274			return (-1);
275	}
276
277	return (0);
278}
279
280#endif
281
282/*
283 * XXX This parser is known to have the following issues:
284 * 1.  It accepts null key=value tokens ",,".
285 * 2.  It accepts whitespace after = and before value.
286 * 3.  Values out of range of INT are silently wrapped.
287 * 4.  It doesn't check non-final values.
288 * 5.  The apparently bogus limits of UINT16_MAX are for future expansion.
289 *
290 * The acceptance of a null specification ('-c ""') is by design to match the
291 * manual page syntax specification, this results in a topology of 1 vCPU.
292 */
293static int
294topology_parse(const char *opt)
295{
296	uint64_t ncpus;
297	int c, chk, n, s, t, tmp;
298	char *cp, *str;
299	bool ns, scts;
300
301	c = 1, n = 1, s = 1, t = 1;
302	ns = false, scts = false;
303	str = strdup(opt);
304	if (str == NULL)
305		goto out;
306
307	while ((cp = strsep(&str, ",")) != NULL) {
308		if (sscanf(cp, "%i%n", &tmp, &chk) == 1) {
309			n = tmp;
310			ns = true;
311		} else if (sscanf(cp, "cpus=%i%n", &tmp, &chk) == 1) {
312			n = tmp;
313			ns = true;
314		} else if (sscanf(cp, "sockets=%i%n", &tmp, &chk) == 1) {
315			s = tmp;
316			scts = true;
317		} else if (sscanf(cp, "cores=%i%n", &tmp, &chk) == 1) {
318			c = tmp;
319			scts = true;
320		} else if (sscanf(cp, "threads=%i%n", &tmp, &chk) == 1) {
321			t = tmp;
322			scts = true;
323#ifdef notyet  /* Do not expose this until vmm.ko implements it */
324		} else if (sscanf(cp, "maxcpus=%i%n", &tmp, &chk) == 1) {
325			m = tmp;
326#endif
327		/* Skip the empty argument case from -c "" */
328		} else if (cp[0] == '\0')
329			continue;
330		else
331			goto out;
332		/* Any trailing garbage causes an error */
333		if (cp[chk] != '\0')
334			goto out;
335	}
336	free(str);
337	str = NULL;
338
339	/*
340	 * Range check 1 <= n <= UINT16_MAX all values
341	 */
342	if (n < 1 || s < 1 || c < 1 || t < 1 ||
343	    n > UINT16_MAX || s > UINT16_MAX || c > UINT16_MAX  ||
344	    t > UINT16_MAX)
345		return (-1);
346
347	/* If only the cpus was specified, use that as sockets */
348	if (!scts)
349		s = n;
350	/*
351	 * Compute sockets * cores * threads avoiding overflow
352	 * The range check above insures these are 16 bit values
353	 * If n was specified check it against computed ncpus
354	 */
355	ncpus = (uint64_t)s * c * t;
356	if (ncpus > UINT16_MAX || (ns && n != ncpus))
357		return (-1);
358
359	guest_ncpus = ncpus;
360	sockets = s;
361	cores = c;
362	threads = t;
363	return(0);
364
365out:
366	free(str);
367	return (-1);
368}
369
370static int
371pincpu_parse(const char *opt)
372{
373	int vcpu, pcpu;
374
375	if (sscanf(opt, "%d:%d", &vcpu, &pcpu) != 2) {
376		fprintf(stderr, "invalid format: %s\n", opt);
377		return (-1);
378	}
379
380	if (vcpu < 0 || vcpu >= VM_MAXCPU) {
381		fprintf(stderr, "vcpu '%d' outside valid range from 0 to %d\n",
382		    vcpu, VM_MAXCPU - 1);
383		return (-1);
384	}
385
386	if (pcpu < 0 || pcpu >= CPU_SETSIZE) {
387		fprintf(stderr, "hostcpu '%d' outside valid range from "
388		    "0 to %d\n", pcpu, CPU_SETSIZE - 1);
389		return (-1);
390	}
391
392	if (vcpumap[vcpu] == NULL) {
393		if ((vcpumap[vcpu] = malloc(sizeof(cpuset_t))) == NULL) {
394			perror("malloc");
395			return (-1);
396		}
397		CPU_ZERO(vcpumap[vcpu]);
398	}
399	CPU_SET(pcpu, vcpumap[vcpu]);
400	return (0);
401}
402
/*
 * Inject exception 'vector' into 'vcpu' through vm_inject_exception(),
 * always passing a restart_instruction argument of 1.  'arg' is the VM
 * context.  Injection failure trips an assertion.
 */
void
vm_inject_fault(void *arg, int vcpu, int vector, int errcode_valid,
    int errcode)
{
	struct vmctx *vm = arg;
	const int restart = 1;
	int rc;

	rc = vm_inject_exception(vm, vcpu, vector, errcode_valid, errcode,
	    restart);
	assert(rc == 0);
}
417
/*
 * Thin wrapper around vm_map_gpa(): returns whatever it yields for the
 * range ['gaddr', 'gaddr' + 'len').
 */
void *
paddr_guest2host(struct vmctx *ctx, uintptr_t gaddr, size_t len)
{
	void *hostptr;

	hostptr = vm_map_gpa(ctx, gaddr, len);
	return (hostptr);
}
424
425int
426fbsdrun_vmexit_on_pause(void)
427{
428
429	return (guest_vmexit_on_pause);
430}
431
432int
433fbsdrun_vmexit_on_hlt(void)
434{
435
436	return (guest_vmexit_on_hlt);
437}
438
439int
440fbsdrun_virtio_msix(void)
441{
442
443	return (virtio_msix);
444}
445
446static void *
447fbsdrun_start_thread(void *param)
448{
449	char tname[MAXCOMLEN + 1];
450	struct mt_vmm_info *mtp;
451	int vcpu;
452
453	mtp = param;
454	vcpu = mtp->mt_vcpu;
455
456	snprintf(tname, sizeof(tname), "vcpu %d", vcpu);
457	pthread_set_name_np(mtp->mt_thr, tname);
458
459	vm_loop(mtp->mt_ctx, vcpu, vmexit[vcpu].rip);
460
461	/* not reached */
462	exit(1);
463	return (NULL);
464}
465
466void
467fbsdrun_addcpu(struct vmctx *ctx, int fromcpu, int newcpu, uint64_t rip)
468{
469	int error;
470
471	assert(fromcpu == BSP);
472
473	/*
474	 * The 'newcpu' must be activated in the context of 'fromcpu'. If
475	 * vm_activate_cpu() is delayed until newcpu's pthread starts running
476	 * then vmm.ko is out-of-sync with bhyve and this can create a race
477	 * with vm_suspend().
478	 */
479	error = vm_activate_cpu(ctx, newcpu);
480	if (error != 0)
481		err(EX_OSERR, "could not activate CPU %d", newcpu);
482
483	CPU_SET_ATOMIC(newcpu, &cpumask);
484
485	/*
486	 * Set up the vmexit struct to allow execution to start
487	 * at the given RIP
488	 */
489	vmexit[newcpu].rip = rip;
490	vmexit[newcpu].inst_length = 0;
491
492	mt_vmm_info[newcpu].mt_ctx = ctx;
493	mt_vmm_info[newcpu].mt_vcpu = newcpu;
494
495	error = pthread_create(&mt_vmm_info[newcpu].mt_thr, NULL,
496	    fbsdrun_start_thread, &mt_vmm_info[newcpu]);
497	assert(error == 0);
498}
499
500static int
501fbsdrun_deletecpu(struct vmctx *ctx, int vcpu)
502{
503
504	if (!CPU_ISSET(vcpu, &cpumask)) {
505		fprintf(stderr, "Attempting to delete unknown cpu %d\n", vcpu);
506		exit(4);
507	}
508
509	CPU_CLR_ATOMIC(vcpu, &cpumask);
510	return (CPU_EMPTY(&cpumask));
511}
512
513static int
514vmexit_handle_notify(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu,
515		     uint32_t eax)
516{
517#if BHYVE_DEBUG
518	/*
519	 * put guest-driven debug here
520	 */
521#endif
522	return (VMEXIT_CONTINUE);
523}
524
525static int
526vmexit_inout(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
527{
528	int error;
529	int bytes, port, in, out;
530	int vcpu;
531
532	vcpu = *pvcpu;
533
534	port = vme->u.inout.port;
535	bytes = vme->u.inout.bytes;
536	in = vme->u.inout.in;
537	out = !in;
538
539        /* Extra-special case of host notifications */
540        if (out && port == GUEST_NIO_PORT) {
541                error = vmexit_handle_notify(ctx, vme, pvcpu, vme->u.inout.eax);
542		return (error);
543	}
544
545	error = emulate_inout(ctx, vcpu, vme, strictio);
546	if (error) {
547		fprintf(stderr, "Unhandled %s%c 0x%04x at 0x%lx\n",
548		    in ? "in" : "out",
549		    bytes == 1 ? 'b' : (bytes == 2 ? 'w' : 'l'),
550		    port, vmexit->rip);
551		return (VMEXIT_ABORT);
552	} else {
553		return (VMEXIT_CONTINUE);
554	}
555}
556
557static int
558vmexit_rdmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
559{
560	uint64_t val;
561	uint32_t eax, edx;
562	int error;
563
564	val = 0;
565	error = emulate_rdmsr(ctx, *pvcpu, vme->u.msr.code, &val);
566	if (error != 0) {
567		fprintf(stderr, "rdmsr to register %#x on vcpu %d\n",
568		    vme->u.msr.code, *pvcpu);
569		if (strictmsr) {
570			vm_inject_gp(ctx, *pvcpu);
571			return (VMEXIT_CONTINUE);
572		}
573	}
574
575	eax = val;
576	error = vm_set_register(ctx, *pvcpu, VM_REG_GUEST_RAX, eax);
577	assert(error == 0);
578
579	edx = val >> 32;
580	error = vm_set_register(ctx, *pvcpu, VM_REG_GUEST_RDX, edx);
581	assert(error == 0);
582
583	return (VMEXIT_CONTINUE);
584}
585
586static int
587vmexit_wrmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
588{
589	int error;
590
591	error = emulate_wrmsr(ctx, *pvcpu, vme->u.msr.code, vme->u.msr.wval);
592	if (error != 0) {
593		fprintf(stderr, "wrmsr to register %#x(%#lx) on vcpu %d\n",
594		    vme->u.msr.code, vme->u.msr.wval, *pvcpu);
595		if (strictmsr) {
596			vm_inject_gp(ctx, *pvcpu);
597			return (VMEXIT_CONTINUE);
598		}
599	}
600	return (VMEXIT_CONTINUE);
601}
602
603static int
604vmexit_spinup_ap(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
605{
606
607	(void)spinup_ap(ctx, *pvcpu,
608		    vme->u.spinup_ap.vcpu, vme->u.spinup_ap.rip);
609
610	return (VMEXIT_CONTINUE);
611}
612
/*
 * Extra diagnostics for EPT misconfiguration exits.  vmexit_vmx() fills
 * in the variables below before printing them.
 */
#define	DEBUG_EPT_MISCONFIG
#ifdef DEBUG_EPT_MISCONFIG
#define	VMCS_GUEST_PHYSICAL_ADDRESS	0x00002400

static uint64_t ept_misconfig_gpa;
static uint64_t ept_misconfig_pte[4];
static int ept_misconfig_ptenum;
#endif
620
621static const char *
622vmexit_vmx_desc(uint32_t exit_reason)
623{
624
625	if (exit_reason >= nitems(vmx_exit_reason_desc) ||
626	    vmx_exit_reason_desc[exit_reason] == NULL)
627		return ("Unknown");
628	return (vmx_exit_reason_desc[exit_reason]);
629}
630
631static int
632vmexit_vmx(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
633{
634
635	fprintf(stderr, "vm exit[%d]\n", *pvcpu);
636	fprintf(stderr, "\treason\t\tVMX\n");
637	fprintf(stderr, "\trip\t\t0x%016lx\n", vmexit->rip);
638	fprintf(stderr, "\tinst_length\t%d\n", vmexit->inst_length);
639	fprintf(stderr, "\tstatus\t\t%d\n", vmexit->u.vmx.status);
640	fprintf(stderr, "\texit_reason\t%u (%s)\n", vmexit->u.vmx.exit_reason,
641	    vmexit_vmx_desc(vmexit->u.vmx.exit_reason));
642	fprintf(stderr, "\tqualification\t0x%016lx\n",
643	    vmexit->u.vmx.exit_qualification);
644	fprintf(stderr, "\tinst_type\t\t%d\n", vmexit->u.vmx.inst_type);
645	fprintf(stderr, "\tinst_error\t\t%d\n", vmexit->u.vmx.inst_error);
646#ifdef DEBUG_EPT_MISCONFIG
647	if (vmexit->u.vmx.exit_reason == EXIT_REASON_EPT_MISCONFIG) {
648		vm_get_register(ctx, *pvcpu,
649		    VMCS_IDENT(VMCS_GUEST_PHYSICAL_ADDRESS),
650		    &ept_misconfig_gpa);
651		vm_get_gpa_pmap(ctx, ept_misconfig_gpa, ept_misconfig_pte,
652		    &ept_misconfig_ptenum);
653		fprintf(stderr, "\tEPT misconfiguration:\n");
654		fprintf(stderr, "\t\tGPA: %#lx\n", ept_misconfig_gpa);
655		fprintf(stderr, "\t\tPTE(%d): %#lx %#lx %#lx %#lx\n",
656		    ept_misconfig_ptenum, ept_misconfig_pte[0],
657		    ept_misconfig_pte[1], ept_misconfig_pte[2],
658		    ept_misconfig_pte[3]);
659	}
660#endif	/* DEBUG_EPT_MISCONFIG */
661	return (VMEXIT_ABORT);
662}
663
664static int
665vmexit_svm(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
666{
667
668	fprintf(stderr, "vm exit[%d]\n", *pvcpu);
669	fprintf(stderr, "\treason\t\tSVM\n");
670	fprintf(stderr, "\trip\t\t0x%016lx\n", vmexit->rip);
671	fprintf(stderr, "\tinst_length\t%d\n", vmexit->inst_length);
672	fprintf(stderr, "\texitcode\t%#lx\n", vmexit->u.svm.exitcode);
673	fprintf(stderr, "\texitinfo1\t%#lx\n", vmexit->u.svm.exitinfo1);
674	fprintf(stderr, "\texitinfo2\t%#lx\n", vmexit->u.svm.exitinfo2);
675	return (VMEXIT_ABORT);
676}
677
678static int
679vmexit_bogus(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
680{
681
682	assert(vmexit->inst_length == 0);
683
684	stats.vmexit_bogus++;
685
686	return (VMEXIT_CONTINUE);
687}
688
689static int
690vmexit_reqidle(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
691{
692
693	assert(vmexit->inst_length == 0);
694
695	stats.vmexit_reqidle++;
696
697	return (VMEXIT_CONTINUE);
698}
699
700static int
701vmexit_hlt(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
702{
703
704	stats.vmexit_hlt++;
705
706	/*
707	 * Just continue execution with the next instruction. We use
708	 * the HLT VM exit as a way to be friendly with the host
709	 * scheduler.
710	 */
711	return (VMEXIT_CONTINUE);
712}
713
714static int
715vmexit_pause(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
716{
717
718	stats.vmexit_pause++;
719
720	return (VMEXIT_CONTINUE);
721}
722
723static int
724vmexit_mtrap(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
725{
726
727	assert(vmexit->inst_length == 0);
728
729	stats.vmexit_mtrap++;
730
731	return (VMEXIT_CONTINUE);
732}
733
734static int
735vmexit_inst_emul(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
736{
737	int err, i;
738	struct vie *vie;
739
740	stats.vmexit_inst_emul++;
741
742	vie = &vmexit->u.inst_emul.vie;
743	err = emulate_mem(ctx, *pvcpu, vmexit->u.inst_emul.gpa,
744	    vie, &vmexit->u.inst_emul.paging);
745
746	if (err) {
747		if (err == ESRCH) {
748			fprintf(stderr, "Unhandled memory access to 0x%lx\n",
749			    vmexit->u.inst_emul.gpa);
750		}
751
752		fprintf(stderr, "Failed to emulate instruction [");
753		for (i = 0; i < vie->num_valid; i++) {
754			fprintf(stderr, "0x%02x%s", vie->inst[i],
755			    i != (vie->num_valid - 1) ? " " : "");
756		}
757		fprintf(stderr, "] at 0x%lx\n", vmexit->rip);
758		return (VMEXIT_ABORT);
759	}
760
761	return (VMEXIT_CONTINUE);
762}
763
764static pthread_mutex_t resetcpu_mtx = PTHREAD_MUTEX_INITIALIZER;
765static pthread_cond_t resetcpu_cond = PTHREAD_COND_INITIALIZER;
766
767static int
768vmexit_suspend(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
769{
770	enum vm_suspend_how how;
771
772	how = vmexit->u.suspended.how;
773
774	fbsdrun_deletecpu(ctx, *pvcpu);
775
776	if (*pvcpu != BSP) {
777		pthread_mutex_lock(&resetcpu_mtx);
778		pthread_cond_signal(&resetcpu_cond);
779		pthread_mutex_unlock(&resetcpu_mtx);
780		pthread_exit(NULL);
781	}
782
783	pthread_mutex_lock(&resetcpu_mtx);
784	while (!CPU_EMPTY(&cpumask)) {
785		pthread_cond_wait(&resetcpu_cond, &resetcpu_mtx);
786	}
787	pthread_mutex_unlock(&resetcpu_mtx);
788
789	switch (how) {
790	case VM_SUSPEND_RESET:
791		exit(0);
792	case VM_SUSPEND_POWEROFF:
793		if (destroy_on_poweroff)
794			vm_destroy(ctx);
795		exit(1);
796	case VM_SUSPEND_HALT:
797		exit(2);
798	case VM_SUSPEND_TRIPLEFAULT:
799		exit(3);
800	default:
801		fprintf(stderr, "vmexit_suspend: invalid reason %d\n", how);
802		exit(100);
803	}
804	return (0);	/* NOTREACHED */
805}
806
807static vmexit_handler_t handler[VM_EXITCODE_MAX] = {
808	[VM_EXITCODE_INOUT]  = vmexit_inout,
809	[VM_EXITCODE_INOUT_STR]  = vmexit_inout,
810	[VM_EXITCODE_VMX]    = vmexit_vmx,
811	[VM_EXITCODE_SVM]    = vmexit_svm,
812	[VM_EXITCODE_BOGUS]  = vmexit_bogus,
813	[VM_EXITCODE_REQIDLE] = vmexit_reqidle,
814	[VM_EXITCODE_RDMSR]  = vmexit_rdmsr,
815	[VM_EXITCODE_WRMSR]  = vmexit_wrmsr,
816	[VM_EXITCODE_MTRAP]  = vmexit_mtrap,
817	[VM_EXITCODE_INST_EMUL] = vmexit_inst_emul,
818	[VM_EXITCODE_SPINUP_AP] = vmexit_spinup_ap,
819	[VM_EXITCODE_SUSPENDED] = vmexit_suspend,
820	[VM_EXITCODE_TASK_SWITCH] = vmexit_task_switch,
821};
822
823static void
824vm_loop(struct vmctx *ctx, int vcpu, uint64_t startrip)
825{
826	int error, rc;
827	enum vm_exitcode exitcode;
828	cpuset_t active_cpus;
829
830	if (vcpumap[vcpu] != NULL) {
831		error = pthread_setaffinity_np(pthread_self(),
832		    sizeof(cpuset_t), vcpumap[vcpu]);
833		assert(error == 0);
834	}
835
836	error = vm_active_cpus(ctx, &active_cpus);
837	assert(CPU_ISSET(vcpu, &active_cpus));
838
839	error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RIP, startrip);
840	assert(error == 0);
841
842	while (1) {
843		error = vm_run(ctx, vcpu, &vmexit[vcpu]);
844		if (error != 0)
845			break;
846
847		exitcode = vmexit[vcpu].exitcode;
848		if (exitcode >= VM_EXITCODE_MAX || handler[exitcode] == NULL) {
849			fprintf(stderr, "vm_loop: unexpected exitcode 0x%x\n",
850			    exitcode);
851			exit(4);
852		}
853
854		rc = (*handler[exitcode])(ctx, &vmexit[vcpu], &vcpu);
855
856		switch (rc) {
857		case VMEXIT_CONTINUE:
858			break;
859		case VMEXIT_ABORT:
860			abort();
861		default:
862			exit(4);
863		}
864	}
865	fprintf(stderr, "vm_run error %d, errno %d\n", error, errno);
866}
867
868static int
869num_vcpus_allowed(struct vmctx *ctx)
870{
871	int tmp, error;
872
873	error = vm_get_capability(ctx, BSP, VM_CAP_UNRESTRICTED_GUEST, &tmp);
874
875	/*
876	 * The guest is allowed to spinup more than one processor only if the
877	 * UNRESTRICTED_GUEST capability is available.
878	 */
879	if (error == 0)
880		return (VM_MAXCPU);
881	else
882		return (1);
883}
884
885void
886fbsdrun_set_capabilities(struct vmctx *ctx, int cpu)
887{
888	int err, tmp;
889
890	if (fbsdrun_vmexit_on_hlt()) {
891		err = vm_get_capability(ctx, cpu, VM_CAP_HALT_EXIT, &tmp);
892		if (err < 0) {
893			fprintf(stderr, "VM exit on HLT not supported\n");
894			exit(4);
895		}
896		vm_set_capability(ctx, cpu, VM_CAP_HALT_EXIT, 1);
897		if (cpu == BSP)
898			handler[VM_EXITCODE_HLT] = vmexit_hlt;
899	}
900
901        if (fbsdrun_vmexit_on_pause()) {
902		/*
903		 * pause exit support required for this mode
904		 */
905		err = vm_get_capability(ctx, cpu, VM_CAP_PAUSE_EXIT, &tmp);
906		if (err < 0) {
907			fprintf(stderr,
908			    "SMP mux requested, no pause support\n");
909			exit(4);
910		}
911		vm_set_capability(ctx, cpu, VM_CAP_PAUSE_EXIT, 1);
912		if (cpu == BSP)
913			handler[VM_EXITCODE_PAUSE] = vmexit_pause;
914        }
915
916	if (x2apic_mode)
917		err = vm_set_x2apic_state(ctx, cpu, X2APIC_ENABLED);
918	else
919		err = vm_set_x2apic_state(ctx, cpu, X2APIC_DISABLED);
920
921	if (err) {
922		fprintf(stderr, "Unable to set x2apic state (%d)\n", err);
923		exit(4);
924	}
925
926	vm_set_capability(ctx, cpu, VM_CAP_ENABLE_INVPCID, 1);
927}
928
929static struct vmctx *
930do_open(const char *vmname)
931{
932	struct vmctx *ctx;
933	int error;
934	bool reinit, romboot;
935#ifndef WITHOUT_CAPSICUM
936	cap_rights_t rights;
937	const cap_ioctl_t *cmds;
938	size_t ncmds;
939#endif
940
941	reinit = romboot = false;
942
943	if (lpc_bootrom())
944		romboot = true;
945
946	error = vm_create(vmname);
947	if (error) {
948		if (errno == EEXIST) {
949			if (romboot) {
950				reinit = true;
951			} else {
952				/*
953				 * The virtual machine has been setup by the
954				 * userspace bootloader.
955				 */
956			}
957		} else {
958			perror("vm_create");
959			exit(4);
960		}
961	} else {
962		if (!romboot) {
963			/*
964			 * If the virtual machine was just created then a
965			 * bootrom must be configured to boot it.
966			 */
967			fprintf(stderr, "virtual machine cannot be booted\n");
968			exit(4);
969		}
970	}
971
972	ctx = vm_open(vmname);
973	if (ctx == NULL) {
974		perror("vm_open");
975		exit(4);
976	}
977
978#ifndef WITHOUT_CAPSICUM
979	cap_rights_init(&rights, CAP_IOCTL, CAP_MMAP_RW);
980	if (cap_rights_limit(vm_get_device_fd(ctx), &rights) == -1 &&
981	    errno != ENOSYS)
982		errx(EX_OSERR, "Unable to apply rights for sandbox");
983	vm_get_ioctls(&ncmds);
984	cmds = vm_get_ioctls(NULL);
985	if (cmds == NULL)
986		errx(EX_OSERR, "out of memory");
987	if (cap_ioctls_limit(vm_get_device_fd(ctx), cmds, ncmds) == -1 &&
988	    errno != ENOSYS)
989		errx(EX_OSERR, "Unable to apply rights for sandbox");
990	free((cap_ioctl_t *)cmds);
991#endif
992
993	if (reinit) {
994		error = vm_reinit(ctx);
995		if (error) {
996			perror("vm_reinit");
997			exit(4);
998		}
999	}
1000	error = vm_set_topology(ctx, sockets, cores, threads, maxcpus);
1001	if (error)
1002		errx(EX_OSERR, "vm_set_topology");
1003	return (ctx);
1004}
1005
1006int
1007main(int argc, char *argv[])
1008{
1009	int c, error, gdb_port, err, bvmcons;
1010	int max_vcpus, mptgen, memflags;
1011	int rtc_localtime;
1012	struct vmctx *ctx;
1013	uint64_t rip;
1014	size_t memsize;
1015	char *optstr;
1016
1017	bvmcons = 0;
1018	progname = basename(argv[0]);
1019	gdb_port = 0;
1020	guest_ncpus = 1;
1021	sockets = cores = threads = 1;
1022	maxcpus = 0;
1023	memsize = 256 * MB;
1024	mptgen = 1;
1025	rtc_localtime = 1;
1026	memflags = 0;
1027
1028	optstr = "abehuwxACDHIPSWYp:g:c:s:m:l:U:";
1029	while ((c = getopt(argc, argv, optstr)) != -1) {
1030		switch (c) {
1031		case 'a':
1032			x2apic_mode = 0;
1033			break;
1034		case 'A':
1035			acpi = 1;
1036			break;
1037		case 'b':
1038			bvmcons = 1;
1039			break;
1040		case 'D':
1041			destroy_on_poweroff = 1;
1042			break;
1043		case 'p':
1044                        if (pincpu_parse(optarg) != 0) {
1045                            errx(EX_USAGE, "invalid vcpu pinning "
1046                                 "configuration '%s'", optarg);
1047                        }
1048			break;
1049                case 'c':
1050			if (topology_parse(optarg) != 0) {
1051			    errx(EX_USAGE, "invalid cpu topology "
1052				"'%s'", optarg);
1053			}
1054			break;
1055		case 'C':
1056			memflags |= VM_MEM_F_INCORE;
1057			break;
1058		case 'g':
1059			gdb_port = atoi(optarg);
1060			break;
1061		case 'l':
1062			if (lpc_device_parse(optarg) != 0) {
1063				errx(EX_USAGE, "invalid lpc device "
1064				    "configuration '%s'", optarg);
1065			}
1066			break;
1067		case 's':
1068			if (pci_parse_slot(optarg) != 0)
1069				exit(4);
1070			else
1071				break;
1072		case 'S':
1073			memflags |= VM_MEM_F_WIRED;
1074			break;
1075                case 'm':
1076			error = vm_parse_memsize(optarg, &memsize);
1077			if (error)
1078				errx(EX_USAGE, "invalid memsize '%s'", optarg);
1079			break;
1080		case 'H':
1081			guest_vmexit_on_hlt = 1;
1082			break;
1083		case 'I':
1084			/*
1085			 * The "-I" option was used to add an ioapic to the
1086			 * virtual machine.
1087			 *
1088			 * An ioapic is now provided unconditionally for each
1089			 * virtual machine and this option is now deprecated.
1090			 */
1091			break;
1092		case 'P':
1093			guest_vmexit_on_pause = 1;
1094			break;
1095		case 'e':
1096			strictio = 1;
1097			break;
1098		case 'u':
1099			rtc_localtime = 0;
1100			break;
1101		case 'U':
1102			guest_uuid_str = optarg;
1103			break;
1104		case 'w':
1105			strictmsr = 0;
1106			break;
1107		case 'W':
1108			virtio_msix = 0;
1109			break;
1110		case 'x':
1111			x2apic_mode = 1;
1112			break;
1113		case 'Y':
1114			mptgen = 0;
1115			break;
1116		case 'h':
1117			usage(0);
1118		default:
1119			usage(1);
1120		}
1121	}
1122	argc -= optind;
1123	argv += optind;
1124
1125	if (argc != 1)
1126		usage(1);
1127
1128	vmname = argv[0];
1129	ctx = do_open(vmname);
1130
1131	max_vcpus = num_vcpus_allowed(ctx);
1132	if (guest_ncpus > max_vcpus) {
1133		fprintf(stderr, "%d vCPUs requested but only %d available\n",
1134			guest_ncpus, max_vcpus);
1135		exit(4);
1136	}
1137
1138	fbsdrun_set_capabilities(ctx, BSP);
1139
1140	vm_set_memflags(ctx, memflags);
1141	err = vm_setup_memory(ctx, memsize, VM_MMAP_ALL);
1142	if (err) {
1143		fprintf(stderr, "Unable to setup memory (%d)\n", errno);
1144		exit(4);
1145	}
1146
1147	error = init_msr();
1148	if (error) {
1149		fprintf(stderr, "init_msr error %d", error);
1150		exit(4);
1151	}
1152
1153	init_mem();
1154	init_inout();
1155	atkbdc_init(ctx);
1156	pci_irq_init(ctx);
1157	ioapic_init(ctx);
1158
1159	rtc_init(ctx, rtc_localtime);
1160	sci_init(ctx);
1161
1162	/*
1163	 * Exit if a device emulation finds an error in it's initilization
1164	 */
1165	if (init_pci(ctx) != 0) {
1166		perror("device emulation initialization error");
1167		exit(4);
1168	}
1169
1170	if (gdb_port != 0)
1171		init_dbgport(gdb_port);
1172
1173	if (bvmcons)
1174		init_bvmcons();
1175
1176	if (lpc_bootrom()) {
1177		if (vm_set_capability(ctx, BSP, VM_CAP_UNRESTRICTED_GUEST, 1)) {
1178			fprintf(stderr, "ROM boot failed: unrestricted guest "
1179			    "capability not available\n");
1180			exit(4);
1181		}
1182		error = vcpu_reset(ctx, BSP);
1183		assert(error == 0);
1184	}
1185
1186	error = vm_get_register(ctx, BSP, VM_REG_GUEST_RIP, &rip);
1187	assert(error == 0);
1188
1189	/*
1190	 * build the guest tables, MP etc.
1191	 */
1192	if (mptgen) {
1193		error = mptable_build(ctx, guest_ncpus);
1194		if (error) {
1195			perror("error to build the guest tables");
1196			exit(4);
1197		}
1198	}
1199
1200	error = smbios_build(ctx);
1201	assert(error == 0);
1202
1203	if (acpi) {
1204		error = acpi_build(ctx, guest_ncpus);
1205		assert(error == 0);
1206	}
1207
1208	if (lpc_bootrom())
1209		fwctl_init();
1210
1211	/*
1212	 * Change the proc title to include the VM name.
1213	 */
1214	setproctitle("%s", vmname);
1215
1216#ifndef WITHOUT_CAPSICUM
1217	bhyve_caph_cache_catpages();
1218
1219	if (bhyve_caph_limit_stdoe() == -1)
1220		errx(EX_OSERR, "Unable to apply rights for sandbox");
1221
1222	if (cap_enter() == -1 && errno != ENOSYS)
1223		errx(EX_OSERR, "cap_enter() failed");
1224#endif
1225
1226	/*
1227	 * Add CPU 0
1228	 */
1229	fbsdrun_addcpu(ctx, BSP, BSP, rip);
1230
1231	/*
1232	 * Head off to the main event dispatch loop
1233	 */
1234	mevent_dispatch();
1235
1236	exit(4);
1237}
1238