bhyverun.c revision 256281
1/*-
2 * Copyright (c) 2011 NetApp, Inc.
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 *
26 * $FreeBSD: stable/10/usr.sbin/bhyve/bhyverun.c 256176 2013-10-09 03:56:07Z neel $
27 */
28
29#include <sys/cdefs.h>
30__FBSDID("$FreeBSD: stable/10/usr.sbin/bhyve/bhyverun.c 256176 2013-10-09 03:56:07Z neel $");
31
32#include <sys/types.h>
33#include <sys/mman.h>
34#include <sys/time.h>
35
36#include <machine/segments.h>
37
38#include <stdio.h>
39#include <stdlib.h>
40#include <err.h>
41#include <libgen.h>
42#include <unistd.h>
43#include <assert.h>
44#include <errno.h>
45#include <pthread.h>
46#include <pthread_np.h>
47#include <sysexits.h>
48
49#include <machine/vmm.h>
50#include <vmmapi.h>
51
52#include "bhyverun.h"
53#include "acpi.h"
54#include "inout.h"
55#include "dbgport.h"
56#include "mem.h"
57#include "mevent.h"
58#include "mptbl.h"
59#include "pci_emul.h"
60#include "xmsr.h"
61#include "ioapic.h"
62#include "spinup_ap.h"
63#include "rtc.h"
64
#define	GUEST_NIO_PORT		0x488	/* guest upcalls via i/o port */

/*
 * Codes returned by the vmexit handlers in this file; vm_loop() inspects
 * them to decide how the vcpu is resumed or whether the process exits.
 */
#define	VMEXIT_SWITCH		0	/* force vcpu switch in mux mode */
#define	VMEXIT_CONTINUE		1	/* continue from next instruction */
#define	VMEXIT_RESTART		2	/* restart current instruction */
#define	VMEXIT_ABORT		3	/* abort the vm run loop */
#define	VMEXIT_RESET		4	/* guest machine has reset */

/* Size units, expressed as unsigned long byte counts. */
#define	MB		(1UL << 20)
#define	GB		(1UL << 30)
76typedef int (*vmexit_handler_t)(struct vmctx *, struct vm_exit *, int *vcpu);
77
78char *vmname;
79
80int guest_ncpus;
81
82static int pincpu = -1;
83static int guest_vmexit_on_hlt, guest_vmexit_on_pause, disable_x2apic;
84
85static int foundcpus;
86
87static int strictio;
88
89static int acpi;
90
91static char *progname;
92static const int BSP = 0;
93
94static int cpumask;
95
96static void vm_loop(struct vmctx *ctx, int vcpu, uint64_t rip);
97
98struct vm_exit vmexit[VM_MAXCPU];
99
/*
 * Event counters kept by the vmexit handlers.  Each handler in this file
 * bumps the counter matching its exit type; vmexit_bogus_switch and the
 * cpu_switch_* counters are not updated anywhere in this file.
 */
struct bhyvestats {
	uint64_t	vmexit_bogus;
	uint64_t	vmexit_bogus_switch;
	uint64_t	vmexit_hlt;
	uint64_t	vmexit_pause;
	uint64_t	vmexit_mtrap;
	uint64_t	vmexit_inst_emul;
	uint64_t	cpu_switch_rotate;
	uint64_t	cpu_switch_direct;
	int		io_reset;	/* resets seen by vmexit_catch_reset() */
};

struct bhyvestats stats;
111
/*
 * Per-vcpu thread bookkeeping.  fbsdrun_addcpu() fills in one slot per vcpu
 * and passes its address to the newly created thread.
 */
struct mt_vmm_info {
	pthread_t	mt_thr;		/* thread handle from pthread_create() */
	struct vmctx	*mt_ctx;	/* VM context the thread runs against */
	int		mt_vcpu;	/* vcpu number served by the thread */
};

struct mt_vmm_info mt_vmm_info[VM_MAXCPU];
117
118static void
119usage(int code)
120{
121
122        fprintf(stderr,
123                "Usage: %s [-aehAHIP][-g <gdb port>][-s <pci>][-S <pci>]"
124		"[-c vcpus][-p pincpu][-m mem]"
125		" <vmname>\n"
126		"       -a: local apic is in XAPIC mode (default is X2APIC)\n"
127		"       -A: create an ACPI table\n"
128		"       -g: gdb port\n"
129		"       -c: # cpus (default 1)\n"
130		"       -p: pin vcpu 'n' to host cpu 'pincpu + n'\n"
131		"       -H: vmexit from the guest on hlt\n"
132		"       -I: present an ioapic to the guest\n"
133		"       -P: vmexit from the guest on pause\n"
134		"	-e: exit on unhandled i/o access\n"
135		"       -h: help\n"
136		"       -s: <slot,driver,configinfo> PCI slot config\n"
137		"       -S: <slot,driver,configinfo> legacy PCI slot config\n"
138		"       -m: memory size in MB\n",
139		progname);
140
141	exit(code);
142}
143
/*
 * Translate a guest physical address range into a host pointer.
 *
 * This is a thin wrapper: the range [gaddr, gaddr + len) is handed to
 * vm_map_gpa() and its result is returned unchanged.
 */
void *
paddr_guest2host(struct vmctx *ctx, uintptr_t gaddr, size_t len)
{
	void *hostaddr;

	hostaddr = vm_map_gpa(ctx, gaddr, len);
	return (hostaddr);
}
150
151int
152fbsdrun_disable_x2apic(void)
153{
154
155	return (disable_x2apic);
156}
157
158int
159fbsdrun_vmexit_on_pause(void)
160{
161
162	return (guest_vmexit_on_pause);
163}
164
165int
166fbsdrun_vmexit_on_hlt(void)
167{
168
169	return (guest_vmexit_on_hlt);
170}
171
172static void *
173fbsdrun_start_thread(void *param)
174{
175	char tname[MAXCOMLEN + 1];
176	struct mt_vmm_info *mtp;
177	int vcpu;
178
179	mtp = param;
180	vcpu = mtp->mt_vcpu;
181
182	snprintf(tname, sizeof(tname), "%s vcpu %d", vmname, vcpu);
183	pthread_set_name_np(mtp->mt_thr, tname);
184
185	vm_loop(mtp->mt_ctx, vcpu, vmexit[vcpu].rip);
186
187	/* not reached */
188	exit(1);
189	return (NULL);
190}
191
192void
193fbsdrun_addcpu(struct vmctx *ctx, int vcpu, uint64_t rip)
194{
195	int error;
196
197	if (cpumask & (1 << vcpu)) {
198		fprintf(stderr, "addcpu: attempting to add existing cpu %d\n",
199		    vcpu);
200		exit(1);
201	}
202
203	cpumask |= 1 << vcpu;
204	foundcpus++;
205
206	/*
207	 * Set up the vmexit struct to allow execution to start
208	 * at the given RIP
209	 */
210	vmexit[vcpu].rip = rip;
211	vmexit[vcpu].inst_length = 0;
212
213	mt_vmm_info[vcpu].mt_ctx = ctx;
214	mt_vmm_info[vcpu].mt_vcpu = vcpu;
215
216	error = pthread_create(&mt_vmm_info[vcpu].mt_thr, NULL,
217	    fbsdrun_start_thread, &mt_vmm_info[vcpu]);
218	assert(error == 0);
219}
220
221static int
222vmexit_catch_reset(void)
223{
224        stats.io_reset++;
225        return (VMEXIT_RESET);
226}
227
228static int
229vmexit_catch_inout(void)
230{
231	return (VMEXIT_ABORT);
232}
233
234static int
235vmexit_handle_notify(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu,
236		     uint32_t eax)
237{
238#if BHYVE_DEBUG
239	/*
240	 * put guest-driven debug here
241	 */
242#endif
243        return (VMEXIT_CONTINUE);
244}
245
246static int
247vmexit_inout(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
248{
249	int error;
250	int bytes, port, in, out;
251	uint32_t eax;
252	int vcpu;
253
254	vcpu = *pvcpu;
255
256	port = vme->u.inout.port;
257	bytes = vme->u.inout.bytes;
258	eax = vme->u.inout.eax;
259	in = vme->u.inout.in;
260	out = !in;
261
262	/* We don't deal with these */
263	if (vme->u.inout.string || vme->u.inout.rep)
264		return (VMEXIT_ABORT);
265
266	/* Special case of guest reset */
267	if (out && port == 0x64 && (uint8_t)eax == 0xFE)
268		return (vmexit_catch_reset());
269
270        /* Extra-special case of host notifications */
271        if (out && port == GUEST_NIO_PORT)
272                return (vmexit_handle_notify(ctx, vme, pvcpu, eax));
273
274	error = emulate_inout(ctx, vcpu, in, port, bytes, &eax, strictio);
275	if (error == 0 && in)
276		error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RAX, eax);
277
278	if (error == 0)
279		return (VMEXIT_CONTINUE);
280	else {
281		fprintf(stderr, "Unhandled %s%c 0x%04x\n",
282			in ? "in" : "out",
283			bytes == 1 ? 'b' : (bytes == 2 ? 'w' : 'l'), port);
284		return (vmexit_catch_inout());
285	}
286}
287
288static int
289vmexit_rdmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
290{
291	fprintf(stderr, "vm exit rdmsr 0x%x, cpu %d\n", vme->u.msr.code,
292	    *pvcpu);
293	return (VMEXIT_ABORT);
294}
295
296static int
297vmexit_wrmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
298{
299	int newcpu;
300	int retval = VMEXIT_CONTINUE;
301
302	newcpu = emulate_wrmsr(ctx, *pvcpu, vme->u.msr.code,vme->u.msr.wval);
303
304        return (retval);
305}
306
307static int
308vmexit_spinup_ap(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
309{
310	int newcpu;
311	int retval = VMEXIT_CONTINUE;
312
313	newcpu = spinup_ap(ctx, *pvcpu,
314			   vme->u.spinup_ap.vcpu, vme->u.spinup_ap.rip);
315
316	return (retval);
317}
318
319static int
320vmexit_vmx(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
321{
322
323	fprintf(stderr, "vm exit[%d]\n", *pvcpu);
324	fprintf(stderr, "\treason\t\tVMX\n");
325	fprintf(stderr, "\trip\t\t0x%016lx\n", vmexit->rip);
326	fprintf(stderr, "\tinst_length\t%d\n", vmexit->inst_length);
327	fprintf(stderr, "\terror\t\t%d\n", vmexit->u.vmx.error);
328	fprintf(stderr, "\texit_reason\t%u\n", vmexit->u.vmx.exit_reason);
329	fprintf(stderr, "\tqualification\t0x%016lx\n",
330	    vmexit->u.vmx.exit_qualification);
331
332	return (VMEXIT_ABORT);
333}
334
335static int
336vmexit_bogus(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
337{
338
339	stats.vmexit_bogus++;
340
341	return (VMEXIT_RESTART);
342}
343
344static int
345vmexit_hlt(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
346{
347
348	stats.vmexit_hlt++;
349
350	/*
351	 * Just continue execution with the next instruction. We use
352	 * the HLT VM exit as a way to be friendly with the host
353	 * scheduler.
354	 */
355	return (VMEXIT_CONTINUE);
356}
357
358static int
359vmexit_pause(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
360{
361
362	stats.vmexit_pause++;
363
364	return (VMEXIT_CONTINUE);
365}
366
367static int
368vmexit_mtrap(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
369{
370
371	stats.vmexit_mtrap++;
372
373	return (VMEXIT_RESTART);
374}
375
376static int
377vmexit_inst_emul(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
378{
379	int err;
380	stats.vmexit_inst_emul++;
381
382	err = emulate_mem(ctx, *pvcpu, vmexit->u.inst_emul.gpa,
383			  &vmexit->u.inst_emul.vie);
384
385	if (err) {
386		if (err == EINVAL) {
387			fprintf(stderr,
388			    "Failed to emulate instruction at 0x%lx\n",
389			    vmexit->rip);
390		} else if (err == ESRCH) {
391			fprintf(stderr, "Unhandled memory access to 0x%lx\n",
392			    vmexit->u.inst_emul.gpa);
393		}
394
395		return (VMEXIT_ABORT);
396	}
397
398	return (VMEXIT_CONTINUE);
399}
400
401static vmexit_handler_t handler[VM_EXITCODE_MAX] = {
402	[VM_EXITCODE_INOUT]  = vmexit_inout,
403	[VM_EXITCODE_VMX]    = vmexit_vmx,
404	[VM_EXITCODE_BOGUS]  = vmexit_bogus,
405	[VM_EXITCODE_RDMSR]  = vmexit_rdmsr,
406	[VM_EXITCODE_WRMSR]  = vmexit_wrmsr,
407	[VM_EXITCODE_MTRAP]  = vmexit_mtrap,
408	[VM_EXITCODE_INST_EMUL] = vmexit_inst_emul,
409	[VM_EXITCODE_SPINUP_AP] = vmexit_spinup_ap,
410};
411
412static void
413vm_loop(struct vmctx *ctx, int vcpu, uint64_t rip)
414{
415	cpuset_t mask;
416	int error, rc, prevcpu;
417	enum vm_exitcode exitcode;
418
419	if (pincpu >= 0) {
420		CPU_ZERO(&mask);
421		CPU_SET(pincpu + vcpu, &mask);
422		error = pthread_setaffinity_np(pthread_self(),
423					       sizeof(mask), &mask);
424		assert(error == 0);
425	}
426
427	while (1) {
428		error = vm_run(ctx, vcpu, rip, &vmexit[vcpu]);
429		if (error != 0) {
430			/*
431			 * It is possible that 'vmmctl' or some other process
432			 * has transitioned the vcpu to CANNOT_RUN state right
433			 * before we tried to transition it to RUNNING.
434			 *
435			 * This is expected to be temporary so just retry.
436			 */
437			if (errno == EBUSY)
438				continue;
439			else
440				break;
441		}
442
443		prevcpu = vcpu;
444
445		exitcode = vmexit[vcpu].exitcode;
446		if (exitcode >= VM_EXITCODE_MAX || handler[exitcode] == NULL) {
447			fprintf(stderr, "vm_loop: unexpected exitcode 0x%x\n",
448			    exitcode);
449			exit(1);
450		}
451
452                rc = (*handler[exitcode])(ctx, &vmexit[vcpu], &vcpu);
453
454		switch (rc) {
455		case VMEXIT_CONTINUE:
456                        rip = vmexit[vcpu].rip + vmexit[vcpu].inst_length;
457			break;
458		case VMEXIT_RESTART:
459                        rip = vmexit[vcpu].rip;
460			break;
461		case VMEXIT_RESET:
462			exit(0);
463		default:
464			exit(1);
465		}
466	}
467	fprintf(stderr, "vm_run error %d, errno %d\n", error, errno);
468}
469
470static int
471num_vcpus_allowed(struct vmctx *ctx)
472{
473	int tmp, error;
474
475	error = vm_get_capability(ctx, BSP, VM_CAP_UNRESTRICTED_GUEST, &tmp);
476
477	/*
478	 * The guest is allowed to spinup more than one processor only if the
479	 * UNRESTRICTED_GUEST capability is available.
480	 */
481	if (error == 0)
482		return (VM_MAXCPU);
483	else
484		return (1);
485}
486
487int
488main(int argc, char *argv[])
489{
490	int c, error, gdb_port, tmp, err, ioapic, bvmcons;
491	int max_vcpus;
492	struct vmctx *ctx;
493	uint64_t rip;
494	size_t memsize;
495
496	bvmcons = 0;
497	progname = basename(argv[0]);
498	gdb_port = 0;
499	guest_ncpus = 1;
500	ioapic = 0;
501	memsize = 256 * MB;
502
503	while ((c = getopt(argc, argv, "abehAHIPp:g:c:s:S:m:")) != -1) {
504		switch (c) {
505		case 'a':
506			disable_x2apic = 1;
507			break;
508		case 'A':
509			acpi = 1;
510			break;
511		case 'b':
512			bvmcons = 1;
513			break;
514		case 'p':
515			pincpu = atoi(optarg);
516			break;
517                case 'c':
518			guest_ncpus = atoi(optarg);
519			break;
520		case 'g':
521			gdb_port = atoi(optarg);
522			break;
523		case 's':
524			if (pci_parse_slot(optarg, 0) != 0)
525				exit(1);
526			else
527				break;
528		case 'S':
529			if (pci_parse_slot(optarg, 1) != 0)
530				exit(1);
531			else
532				break;
533                case 'm':
534			error = vm_parse_memsize(optarg, &memsize);
535			if (error)
536				errx(EX_USAGE, "invalid memsize '%s'", optarg);
537			break;
538		case 'H':
539			guest_vmexit_on_hlt = 1;
540			break;
541		case 'I':
542			ioapic = 1;
543			break;
544		case 'P':
545			guest_vmexit_on_pause = 1;
546			break;
547		case 'e':
548			strictio = 1;
549			break;
550		case 'h':
551			usage(0);
552		default:
553			usage(1);
554		}
555	}
556	argc -= optind;
557	argv += optind;
558
559	if (argc != 1)
560		usage(1);
561
562	vmname = argv[0];
563
564	ctx = vm_open(vmname);
565	if (ctx == NULL) {
566		perror("vm_open");
567		exit(1);
568	}
569
570	max_vcpus = num_vcpus_allowed(ctx);
571	if (guest_ncpus > max_vcpus) {
572		fprintf(stderr, "%d vCPUs requested but only %d available\n",
573			guest_ncpus, max_vcpus);
574		exit(1);
575	}
576
577	if (fbsdrun_vmexit_on_hlt()) {
578		err = vm_get_capability(ctx, BSP, VM_CAP_HALT_EXIT, &tmp);
579		if (err < 0) {
580			fprintf(stderr, "VM exit on HLT not supported\n");
581			exit(1);
582		}
583		vm_set_capability(ctx, BSP, VM_CAP_HALT_EXIT, 1);
584		handler[VM_EXITCODE_HLT] = vmexit_hlt;
585	}
586
587        if (fbsdrun_vmexit_on_pause()) {
588		/*
589		 * pause exit support required for this mode
590		 */
591		err = vm_get_capability(ctx, BSP, VM_CAP_PAUSE_EXIT, &tmp);
592		if (err < 0) {
593			fprintf(stderr,
594			    "SMP mux requested, no pause support\n");
595			exit(1);
596		}
597		vm_set_capability(ctx, BSP, VM_CAP_PAUSE_EXIT, 1);
598		handler[VM_EXITCODE_PAUSE] = vmexit_pause;
599        }
600
601	if (fbsdrun_disable_x2apic())
602		err = vm_set_x2apic_state(ctx, BSP, X2APIC_DISABLED);
603	else
604		err = vm_set_x2apic_state(ctx, BSP, X2APIC_ENABLED);
605
606	if (err) {
607		fprintf(stderr, "Unable to set x2apic state (%d)\n", err);
608		exit(1);
609	}
610
611	err = vm_setup_memory(ctx, memsize, VM_MMAP_ALL);
612	if (err) {
613		fprintf(stderr, "Unable to setup memory (%d)\n", err);
614		exit(1);
615	}
616
617	init_mem();
618	init_inout();
619
620	rtc_init(ctx);
621
622	/*
623	 * Exit if a device emulation finds an error in it's initilization
624	 */
625	if (init_pci(ctx) != 0)
626		exit(1);
627
628	if (ioapic)
629		ioapic_init(0);
630
631	if (gdb_port != 0)
632		init_dbgport(gdb_port);
633
634	if (bvmcons)
635		init_bvmcons();
636
637	error = vm_get_register(ctx, BSP, VM_REG_GUEST_RIP, &rip);
638	assert(error == 0);
639
640	/*
641	 * build the guest tables, MP etc.
642	 */
643	mptable_build(ctx, guest_ncpus, ioapic);
644
645	if (acpi) {
646		error = acpi_build(ctx, guest_ncpus, ioapic);
647		assert(error == 0);
648	}
649
650	/*
651	 * Add CPU 0
652	 */
653	fbsdrun_addcpu(ctx, BSP, rip);
654
655	/*
656	 * Head off to the main event dispatch loop
657	 */
658	mevent_dispatch();
659
660	exit(1);
661}
662