bhyverun.c revision 239043
1/*-
2 * Copyright (c) 2011 NetApp, Inc.
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 *
26 * $FreeBSD$
27 */
28
29#include <sys/cdefs.h>
30__FBSDID("$FreeBSD$");
31
32#include <sys/types.h>
33#include <sys/mman.h>
34#include <sys/time.h>
35
36#include <machine/segments.h>
37
38#include <stdio.h>
39#include <stdlib.h>
40#include <libgen.h>
41#include <unistd.h>
42#include <assert.h>
43#include <errno.h>
44#include <signal.h>
45#include <pthread.h>
46
47#include <machine/vmm.h>
48#include <vmmapi.h>
49
50#include "fbsdrun.h"
51#include "inout.h"
52#include "dbgport.h"
53#include "mevent.h"
54#include "pci_emul.h"
55#include "xmsr.h"
56#include "instruction_emul.h"
57
/* Defaults for the guest clock rate (-z) and the mux-mode timeslice rate. */
#define	DEFAULT_GUEST_HZ	100
#define	DEFAULT_GUEST_TSLICE	200

/* I/O port the guest writes to in order to make an upcall to the host. */
#define	GUEST_NIO_PORT		0x488

/*
 * Values returned by the vmexit handlers.  vm_loop() uses them to decide
 * how (and on which vcpu) guest execution is resumed.
 */
#define	VMEXIT_SWITCH		0	/* mux mode: force a vcpu switch */
#define	VMEXIT_CONTINUE		1	/* resume at the next instruction */
#define	VMEXIT_RESTART		2	/* re-execute the current instruction */
#define	VMEXIT_ABORT		3	/* give up on the vm run loop */
#define	VMEXIT_RESET		4	/* the guest machine has reset */

/* Byte-count helpers; unsigned long so that 4*GB does not overflow an int. */
#define	MB			(1024UL * 1024)
#define	GB			(1024UL * MB)
71
72typedef int (*vmexit_handler_t)(struct vmctx *, struct vm_exit *, int *vcpu);
73
74int guest_tslice = DEFAULT_GUEST_TSLICE;
75int guest_hz = DEFAULT_GUEST_HZ;
76char *vmname;
77
78u_long lomem_sz;
79u_long himem_sz;
80
81int guest_ncpus;
82
83static int pincpu = -1;
84static int guest_vcpu_mux;
85static int guest_vmexit_on_hlt, guest_vmexit_on_pause;
86
87static int foundcpus;
88
89static int strictio;
90
91static char *lomem_addr;
92static char *himem_addr;
93
94static char *progname;
95static const int BSP = 0;
96
97static int cpumask;
98
99static void *oem_tbl_start;
100static int oem_tbl_size;
101
102static void vm_loop(struct vmctx *ctx, int vcpu, uint64_t rip);
103
104struct vm_exit vmexit[VM_MAXCPU];
105
/*
 * Event counters maintained by the vmexit handlers and by vm_loop().
 */
struct fbsdstats {
	uint64_t	vmexit_bogus;		/* calls to vmexit_bogus() */
	uint64_t	vmexit_bogus_switch;	/* bogus exits that switched vcpu */
	uint64_t	vmexit_hlt;		/* calls to vmexit_hlt() */
	uint64_t	vmexit_pause;		/* calls to vmexit_pause() */
	uint64_t	vmexit_mtrap;		/* calls to vmexit_mtrap() */
	uint64_t	vmexit_paging;		/* calls to vmexit_paging() */
	uint64_t	cpu_switch_rotate;	/* switches to the next vcpu in order */
	uint64_t	cpu_switch_direct;	/* switches to a handler-chosen vcpu */
	int		io_reset;		/* guest reset requests seen */
} stats;
117
/*
 * Per-vcpu thread bookkeeping, indexed by vcpu number.  An entry is filled
 * in by fbsdrun_addcpu() and handed to the new thread as its argument.
 */
struct mt_vmm_info {
	pthread_t	mt_thr;		/* thread running vm_loop() */
	struct vmctx	*mt_ctx;	/* VM context passed to vm_loop() */
	int		mt_vcpu;	/* vcpu the thread starts on */
} mt_vmm_info[VM_MAXCPU];
123
124static void
125usage(int code)
126{
127
128        fprintf(stderr,
129                "Usage: %s [-ehBHIP][-g <gdb port>][-z <hz>][-s <pci>]"
130		"[-S <pci>][-p pincpu][-n <pci>][-m lowmem][-M highmem] <vm>\n"
131		"       -g: gdb port (default is %d and 0 means don't open)\n"
132		"       -c: # cpus (default 1)\n"
133		"       -p: pin vcpu 'n' to host cpu 'pincpu + n'\n"
134		"       -B: inject breakpoint exception on vm entry\n"
135		"       -H: vmexit from the guest on hlt\n"
136		"       -I: present an ioapic to the guest\n"
137		"       -P: vmexit from the guest on pause\n"
138		"	-e: exit on unhandled i/o access\n"
139		"       -h: help\n"
140		"       -z: guest hz (default is %d)\n"
141		"       -s: <slot,driver,configinfo> PCI slot config\n"
142		"       -S: <slot,driver,configinfo> legacy PCI slot config\n"
143		"	-n: <slot,name> PCI slot naming\n"
144		"       -m: lowmem in MB\n"
145		"       -M: highmem in MB\n"
146		"       -x: mux vcpus to 1 hcpu\n"
147		"       -t: mux vcpu timeslice hz (default %d)\n",
148		progname, DEFAULT_GDB_PORT, DEFAULT_GUEST_HZ,
149		DEFAULT_GUEST_TSLICE);
150	exit(code);
151}
152
153void *
154paddr_guest2host(uintptr_t gaddr)
155{
156	if (lomem_sz == 0)
157		return (NULL);
158
159	if (gaddr < lomem_sz) {
160		return ((void *)(lomem_addr + gaddr));
161	} else if (gaddr >= 4*GB && gaddr < (4*GB + himem_sz)) {
162		return ((void *)(himem_addr + gaddr - 4*GB));
163	} else
164		return (NULL);
165}
166
167void
168fbsdrun_add_oemtbl(void *tbl, int tblsz)
169{
170	oem_tbl_start = tbl;
171	oem_tbl_size = tblsz;
172}
173
174int
175fbsdrun_vmexit_on_pause(void)
176{
177
178	return (guest_vmexit_on_pause);
179}
180
181int
182fbsdrun_vmexit_on_hlt(void)
183{
184
185	return (guest_vmexit_on_hlt);
186}
187
188int
189fbsdrun_muxed(void)
190{
191
192	return (guest_vcpu_mux);
193}
194
195static void *
196fbsdrun_start_thread(void *param)
197{
198	int vcpu;
199	struct mt_vmm_info *mtp = param;
200
201	vcpu = mtp->mt_vcpu;
202	vm_loop(mtp->mt_ctx, vcpu, vmexit[vcpu].rip);
203
204	/* not reached */
205	exit(1);
206	return (NULL);
207}
208
209void
210fbsdrun_addcpu(struct vmctx *ctx, int vcpu, uint64_t rip)
211{
212	int error;
213
214	if (cpumask & (1 << vcpu)) {
215		printf("addcpu: attempting to add existing cpu %d\n", vcpu);
216		exit(1);
217	}
218
219	cpumask |= 1 << vcpu;
220	foundcpus++;
221
222	/*
223	 * Set up the vmexit struct to allow execution to start
224	 * at the given RIP
225	 */
226	vmexit[vcpu].rip = rip;
227	vmexit[vcpu].inst_length = 0;
228
229	if (vcpu == BSP || !guest_vcpu_mux){
230		mt_vmm_info[vcpu].mt_ctx = ctx;
231		mt_vmm_info[vcpu].mt_vcpu = vcpu;
232
233		error = pthread_create(&mt_vmm_info[vcpu].mt_thr, NULL,
234				fbsdrun_start_thread, &mt_vmm_info[vcpu]);
235		assert(error == 0);
236	}
237}
238
239static int
240fbsdrun_get_next_cpu(int curcpu)
241{
242
243	/*
244	 * Get the next available CPU. Assumes they arrive
245	 * in ascending order with no gaps.
246	 */
247	return ((curcpu + 1) % foundcpus);
248}
249
250static int
251vmexit_catch_reset(void)
252{
253        stats.io_reset++;
254        return (VMEXIT_RESET);
255}
256
257static int
258vmexit_catch_inout(void)
259{
260	return (VMEXIT_ABORT);
261}
262
263static int
264vmexit_handle_notify(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu,
265		     uint32_t eax)
266{
267#if PG_DEBUG /* put all types of debug here */
268        if (eax == 0) {
269		pause_noswitch = 1;
270	} else if (eax == 1) {
271		pause_noswitch = 0;
272	} else {
273		pause_noswitch = 0;
274		if (eax == 5) {
275			vm_set_capability(ctx, *pvcpu, VM_CAP_MTRAP_EXIT, 1);
276		}
277	}
278#endif
279        return (VMEXIT_CONTINUE);
280}
281
282static int
283vmexit_inout(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
284{
285	int error;
286	int bytes, port, in, out;
287	uint32_t eax;
288	int vcpu;
289
290	vcpu = *pvcpu;
291
292	port = vme->u.inout.port;
293	bytes = vme->u.inout.bytes;
294	eax = vme->u.inout.eax;
295	in = vme->u.inout.in;
296	out = !in;
297
298	/* We don't deal with these */
299	if (vme->u.inout.string || vme->u.inout.rep)
300		return (VMEXIT_ABORT);
301
302	/* Special case of guest reset */
303	if (out && port == 0x64 && (uint8_t)eax == 0xFE)
304		return (vmexit_catch_reset());
305
306        /* Extra-special case of host notifications */
307        if (out && port == GUEST_NIO_PORT)
308                return (vmexit_handle_notify(ctx, vme, pvcpu, eax));
309
310	error = emulate_inout(ctx, vcpu, in, port, bytes, &eax, strictio);
311	if (error == 0 && in)
312		error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RAX, eax);
313
314	if (error == 0)
315		return (VMEXIT_CONTINUE);
316	else {
317		fprintf(stderr, "Unhandled %s%c 0x%04x\n",
318			in ? "in" : "out",
319			bytes == 1 ? 'b' : (bytes == 2 ? 'w' : 'l'), port);
320		return (vmexit_catch_inout());
321	}
322}
323
324static int
325vmexit_rdmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
326{
327	printf("vm exit rdmsr 0x%x, cpu %d\n", vme->u.msr.code, *pvcpu);
328	return (VMEXIT_ABORT);
329}
330
331static int
332vmexit_wrmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
333{
334	int newcpu;
335	int retval = VMEXIT_CONTINUE;
336
337	newcpu = emulate_wrmsr(ctx, *pvcpu, vme->u.msr.code,vme->u.msr.wval);
338
339	if (guest_vcpu_mux && *pvcpu != newcpu) {
340                retval = VMEXIT_SWITCH;
341                *pvcpu = newcpu;
342        }
343
344        return (retval);
345}
346
347static int
348vmexit_vmx(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
349{
350
351	printf("vm exit[%d]\n", *pvcpu);
352	printf("\treason\t\tVMX\n");
353	printf("\trip\t\t0x%016lx\n", vmexit->rip);
354	printf("\tinst_length\t%d\n", vmexit->inst_length);
355	printf("\terror\t\t%d\n", vmexit->u.vmx.error);
356	printf("\texit_reason\t%u\n", vmexit->u.vmx.exit_reason);
357	printf("\tqualification\t0x%016lx\n", vmexit->u.vmx.exit_qualification);
358
359	return (VMEXIT_ABORT);
360}
361
362static int bogus_noswitch = 1;
363
364static int
365vmexit_bogus(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
366{
367	stats.vmexit_bogus++;
368
369	if (!guest_vcpu_mux || guest_ncpus == 1 || bogus_noswitch) {
370		return (VMEXIT_RESTART);
371	} else {
372		stats.vmexit_bogus_switch++;
373		vmexit->inst_length = 0;
374		*pvcpu = -1;
375		return (VMEXIT_SWITCH);
376	}
377}
378
379static int
380vmexit_hlt(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
381{
382	stats.vmexit_hlt++;
383	if (fbsdrun_muxed()) {
384		*pvcpu = -1;
385		return (VMEXIT_SWITCH);
386	} else {
387		/*
388		 * Just continue execution with the next instruction. We use
389		 * the HLT VM exit as a way to be friendly with the host
390		 * scheduler.
391		 */
392		return (VMEXIT_CONTINUE);
393	}
394}
395
396static int pause_noswitch;
397
398static int
399vmexit_pause(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
400{
401	stats.vmexit_pause++;
402
403	if (fbsdrun_muxed() && !pause_noswitch) {
404		*pvcpu = -1;
405		return (VMEXIT_SWITCH);
406        } else {
407		return (VMEXIT_CONTINUE);
408	}
409}
410
411static int
412vmexit_mtrap(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
413{
414	stats.vmexit_mtrap++;
415
416	return (VMEXIT_RESTART);
417}
418
419static int
420vmexit_paging(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
421{
422
423	stats.vmexit_paging++;
424
425	if (emulate_instruction(ctx, *pvcpu, vmexit->rip, vmexit->u.paging.cr3) != 0) {
426		printf("Failed to emulate instruction at 0x%lx\n", vmexit->rip);
427		return (VMEXIT_ABORT);
428	}
429
430	return (VMEXIT_CONTINUE);
431}
432
/*
 * SIGALRM handler installed by setup_timeslice().  It deliberately does
 * nothing; only the delivery of the signal matters.
 */
static void
sigalrm(int sig)
{

	(void)sig;
}
438
439static void
440setup_timeslice(void)
441{
442	struct sigaction sa;
443	struct itimerval itv;
444	int error;
445
446	/*
447	 * Setup a realtime timer to generate a SIGALRM at a
448	 * frequency of 'guest_tslice' ticks per second.
449	 */
450	sigemptyset(&sa.sa_mask);
451	sa.sa_flags = 0;
452	sa.sa_handler = sigalrm;
453
454	error = sigaction(SIGALRM, &sa, NULL);
455	assert(error == 0);
456
457	itv.it_interval.tv_sec = 0;
458	itv.it_interval.tv_usec = 1000000 / guest_tslice;
459	itv.it_value.tv_sec = 0;
460	itv.it_value.tv_usec = 1000000 / guest_tslice;
461
462	error = setitimer(ITIMER_REAL, &itv, NULL);
463	assert(error == 0);
464}
465
466static vmexit_handler_t handler[VM_EXITCODE_MAX] = {
467	[VM_EXITCODE_INOUT]  = vmexit_inout,
468	[VM_EXITCODE_VMX]    = vmexit_vmx,
469	[VM_EXITCODE_BOGUS]  = vmexit_bogus,
470	[VM_EXITCODE_RDMSR]  = vmexit_rdmsr,
471	[VM_EXITCODE_WRMSR]  = vmexit_wrmsr,
472	[VM_EXITCODE_MTRAP]  = vmexit_mtrap,
473	[VM_EXITCODE_PAGING] = vmexit_paging
474};
475
476static void
477vm_loop(struct vmctx *ctx, int vcpu, uint64_t rip)
478{
479	int error, rc, prevcpu;
480
481	if (guest_vcpu_mux)
482		setup_timeslice();
483
484	if (pincpu >= 0) {
485		error = vm_set_pinning(ctx, vcpu, pincpu + vcpu);
486		assert(error == 0);
487	}
488
489	while (1) {
490		error = vm_run(ctx, vcpu, rip, &vmexit[vcpu]);
491		if (error != 0)
492			break;
493
494		prevcpu = vcpu;
495                rc = (*handler[vmexit[vcpu].exitcode])(ctx, &vmexit[vcpu],
496                                                       &vcpu);
497		switch (rc) {
498                case VMEXIT_SWITCH:
499			assert(guest_vcpu_mux);
500			if (vcpu == -1) {
501				stats.cpu_switch_rotate++;
502				vcpu = fbsdrun_get_next_cpu(prevcpu);
503			} else {
504				stats.cpu_switch_direct++;
505			}
506			/* fall through */
507		case VMEXIT_CONTINUE:
508                        rip = vmexit[vcpu].rip + vmexit[vcpu].inst_length;
509			break;
510		case VMEXIT_RESTART:
511                        rip = vmexit[vcpu].rip;
512			break;
513		case VMEXIT_RESET:
514			exit(0);
515		default:
516			exit(1);
517		}
518	}
519	fprintf(stderr, "vm_run error %d, errno %d\n", error, errno);
520}
521
522
523int
524main(int argc, char *argv[])
525{
526	int c, error, gdb_port, inject_bkpt, tmp, err, ioapic;
527	struct vmctx *ctx;
528	uint64_t rip;
529
530	inject_bkpt = 0;
531	progname = basename(argv[0]);
532	gdb_port = DEFAULT_GDB_PORT;
533	guest_ncpus = 1;
534	ioapic = 0;
535
536	while ((c = getopt(argc, argv, "ehBHIPxp:g:c:z:s:S:n:m:M:")) != -1) {
537		switch (c) {
538		case 'B':
539			inject_bkpt = 1;
540			break;
541		case 'x':
542			guest_vcpu_mux = 1;
543			break;
544		case 'p':
545			pincpu = atoi(optarg);
546			break;
547                case 'c':
548			guest_ncpus = atoi(optarg);
549			break;
550		case 'g':
551			gdb_port = atoi(optarg);
552			break;
553		case 'z':
554			guest_hz = atoi(optarg);
555			break;
556		case 't':
557			guest_tslice = atoi(optarg);
558			break;
559		case 's':
560			pci_parse_slot(optarg, 0);
561			break;
562		case 'S':
563			pci_parse_slot(optarg, 1);
564			break;
565		case 'n':
566			pci_parse_name(optarg);
567			break;
568                case 'm':
569			lomem_sz = strtoul(optarg, NULL, 0) * MB;
570			break;
571                case 'M':
572			himem_sz = strtoul(optarg, NULL, 0) * MB;
573			break;
574		case 'H':
575			guest_vmexit_on_hlt = 1;
576			break;
577		case 'I':
578			ioapic = 1;
579			break;
580		case 'P':
581			guest_vmexit_on_pause = 1;
582			break;
583		case 'e':
584			strictio = 1;
585			break;
586		case 'h':
587			usage(0);
588		default:
589			usage(1);
590		}
591	}
592	argc -= optind;
593	argv += optind;
594
595	if (argc != 1)
596		usage(1);
597
598	/* No need to mux if guest is uni-processor */
599	if (guest_ncpus <= 1)
600		guest_vcpu_mux = 0;
601
602	/* vmexit on hlt if guest is muxed */
603	if (guest_vcpu_mux) {
604		guest_vmexit_on_hlt = 1;
605		guest_vmexit_on_pause = 1;
606	}
607
608	vmname = argv[0];
609
610	ctx = vm_open(vmname);
611	if (ctx == NULL) {
612		perror("vm_open");
613		exit(1);
614	}
615
616	if (fbsdrun_vmexit_on_hlt()) {
617		err = vm_get_capability(ctx, BSP, VM_CAP_HALT_EXIT, &tmp);
618		if (err < 0) {
619			printf("VM exit on HLT not supported\n");
620			exit(1);
621		}
622		vm_set_capability(ctx, BSP, VM_CAP_HALT_EXIT, 1);
623		handler[VM_EXITCODE_HLT] = vmexit_hlt;
624	}
625
626        if (fbsdrun_vmexit_on_pause()) {
627		/*
628		 * pause exit support required for this mode
629		 */
630		err = vm_get_capability(ctx, BSP, VM_CAP_PAUSE_EXIT, &tmp);
631		if (err < 0) {
632			printf("SMP mux requested, no pause support\n");
633			exit(1);
634		}
635		vm_set_capability(ctx, BSP, VM_CAP_PAUSE_EXIT, 1);
636		handler[VM_EXITCODE_PAUSE] = vmexit_pause;
637        }
638
639	if (lomem_sz != 0) {
640		lomem_addr = vm_map_memory(ctx, 0, lomem_sz);
641		if (lomem_addr == (char *) MAP_FAILED) {
642			lomem_sz = 0;
643		} else if (himem_sz != 0) {
644			himem_addr = vm_map_memory(ctx, 4*GB, himem_sz);
645			if (himem_addr == (char *) MAP_FAILED) {
646				lomem_sz = 0;
647				himem_sz = 0;
648			}
649		}
650	}
651
652	init_inout();
653	init_pci(ctx);
654
655	if (gdb_port != 0)
656		init_dbgport(gdb_port);
657
658	error = vm_get_register(ctx, BSP, VM_REG_GUEST_RIP, &rip);
659	assert(error == 0);
660
661	if (inject_bkpt) {
662		error = vm_inject_event(ctx, BSP, VM_HW_EXCEPTION, IDT_BP);
663		assert(error == 0);
664	}
665
666	/*
667	 * build the guest tables, MP etc.
668	 */
669	vm_build_tables(ctx, guest_ncpus, ioapic, oem_tbl_start, oem_tbl_size);
670
671	/*
672	 * Add CPU 0
673	 */
674	fbsdrun_addcpu(ctx, BSP, rip);
675
676	/*
677	 * Head off to the main event dispatch loop
678	 */
679	mevent_dispatch();
680
681	exit(1);
682}
683