bhyverun.c revision 222105
1254219Scy/*-
2254219Scy * Copyright (c) 2011 NetApp, Inc.
3254219Scy * All rights reserved.
4254219Scy *
5254219Scy * Redistribution and use in source and binary forms, with or without
6254219Scy * modification, are permitted provided that the following conditions
7254219Scy * are met:
8254219Scy * 1. Redistributions of source code must retain the above copyright
9254219Scy *    notice, this list of conditions and the following disclaimer.
10254219Scy * 2. Redistributions in binary form must reproduce the above copyright
11254219Scy *    notice, this list of conditions and the following disclaimer in the
12254219Scy *    documentation and/or other materials provided with the distribution.
13254219Scy *
14254219Scy * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
15254219Scy * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16254219Scy * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17254219Scy * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
18254219Scy * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19254219Scy * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20254219Scy * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21254219Scy * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22254219Scy * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23254219Scy * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24254219Scy * SUCH DAMAGE.
25254219Scy *
26 * $FreeBSD$
27 */
28
29#include <sys/cdefs.h>
30__FBSDID("$FreeBSD$");
31
32#include <sys/types.h>
33#include <sys/mman.h>
34#include <sys/time.h>
35
36#include <machine/segments.h>
37
38#include <stdio.h>
39#include <stdlib.h>
40#include <libgen.h>
41#include <unistd.h>
42#include <assert.h>
43#include <errno.h>
44#include <signal.h>
45#include <pthread.h>
46
47#include <machine/vmm.h>
48#include <vmmapi.h>
49
50#include "fbsdrun.h"
51#include "inout.h"
52#include "dbgport.h"
53#include "mevent.h"
54#include "pci_emul.h"
55#include "xmsr.h"
56
/* Default guest clock rate and default mux-mode timeslice rate (both in hz). */
#define	DEFAULT_GUEST_HZ	100
#define	DEFAULT_GUEST_TSLICE	200

/* I/O port the guest writes to in order to make upcalls to the host. */
#define	GUEST_NIO_PORT		0x488

/*
 * Return values of the vmexit handlers.  They tell vm_loop() how to
 * resume the guest after an exit has been processed.
 */
#define	VMEXIT_SWITCH		0	/* force vcpu switch in mux mode */
#define	VMEXIT_CONTINUE		1	/* continue from next instruction */
#define	VMEXIT_RESTART		2	/* restart current instruction */
#define	VMEXIT_ABORT		3	/* abort the vm run loop */
#define	VMEXIT_RESET		4	/* guest machine has reset */

/* Memory size units, in bytes. */
#define	MB			(1024UL * 1024)
#define	GB			(1024UL * MB)
70
71typedef int (*vmexit_handler_t)(struct vmctx *, struct vm_exit *, int *vcpu);
72
73int guest_tslice = DEFAULT_GUEST_TSLICE;
74int guest_hz = DEFAULT_GUEST_HZ;
75char *vmname;
76
77u_long lomem_sz;
78u_long himem_sz;
79
80int guest_ncpus;
81
82static int pincpu = -1;
83static int guest_vcpu_mux;
84static int guest_vmexit_on_hlt, guest_vmexit_on_pause;
85
86static int foundcpus;
87
88static int strictio;
89
90static char *lomem_addr;
91static char *himem_addr;
92
93static char *progname;
94static const int BSP = 0;
95
96static int cpumask;
97
98static void *oem_tbl_start;
99static int oem_tbl_size;
100
101static void vm_loop(struct vmctx *ctx, int vcpu, uint64_t rip);
102
103struct vm_exit vmexit[VM_MAXCPU];
104
/*
 * Event counters maintained by the vmexit handlers and by vm_loop().
 */
struct fbsdstats {
	uint64_t	vmexit_bogus;		/* bogus exits seen */
	uint64_t	vmexit_bogus_switch;	/* bogus exits causing a switch */
	uint64_t	vmexit_hlt;		/* hlt exits seen */
	uint64_t	vmexit_pause;		/* pause exits seen */
	uint64_t	vmexit_mtrap;		/* mtrap exits seen */
	uint64_t	cpu_switch_rotate;	/* switches to the next vcpu */
	uint64_t	cpu_switch_direct;	/* switches to a chosen vcpu */
	int		io_reset;		/* guest resets via i/o port */
} stats;
115
/*
 * Per-vcpu thread bookkeeping, filled in by fbsdrun_addcpu() and handed
 * to the thread start routine.
 */
struct mt_vmm_info {
	pthread_t	mt_thr;		/* thread running this vcpu */
	struct vmctx	*mt_ctx;	/* vm context given to vm_loop() */
	int		mt_vcpu;	/* vcpu id */
} mt_vmm_info[VM_MAXCPU];
121
122static void
123usage(int code)
124{
125
126        fprintf(stderr,
127                "Usage: %s [-ehBHP][-g <gdb port>][-z <hz>][-s <pci>][-p pincpu]"
128		"[-n <pci>][-m lowmem][-M highmem] <vm>\n"
129		"       -g: gdb port (default is %d and 0 means don't open)\n"
130		"       -c: # cpus (default 1)\n"
131		"       -p: pin vcpu 'n' to host cpu 'pincpu + n'\n"
132		"       -B: inject breakpoint exception on vm entry\n"
133		"       -H: vmexit from the guest on hlt\n"
134		"       -P: vmexit from the guest on pause\n"
135		"	-e: exit on unhandled i/o access\n"
136		"       -h: help\n"
137		"       -z: guest hz (default is %d)\n"
138		"       -s: <slot,driver,configinfo> PCI slot config\n"
139		"	-n: <slot,name> PCI slot naming\n"
140		"       -m: lowmem in MB\n"
141		"       -M: highmem in MB\n"
142		"       -x: mux vcpus to 1 hcpu\n"
143		"       -t: mux vcpu timeslice hz (default %d)\n",
144		progname, DEFAULT_GDB_PORT, DEFAULT_GUEST_HZ,
145		DEFAULT_GUEST_TSLICE);
146	exit(code);
147}
148
149void *
150paddr_guest2host(uintptr_t gaddr)
151{
152	if (lomem_sz == 0)
153		return (NULL);
154
155	if (gaddr < lomem_sz) {
156		return ((void *)(lomem_addr + gaddr));
157	} else if (gaddr >= 4*GB && gaddr < (4*GB + himem_sz)) {
158		return ((void *)(himem_addr + gaddr - 4*GB));
159	} else
160		return (NULL);
161}
162
163void
164fbsdrun_add_oemtbl(void *tbl, int tblsz)
165{
166	oem_tbl_start = tbl;
167	oem_tbl_size = tblsz;
168}
169
170int
171fbsdrun_vmexit_on_pause(void)
172{
173
174	return (guest_vmexit_on_pause);
175}
176
177int
178fbsdrun_vmexit_on_hlt(void)
179{
180
181	return (guest_vmexit_on_hlt);
182}
183
184int
185fbsdrun_muxed(void)
186{
187
188	return (guest_vcpu_mux);
189}
190
191static void *
192fbsdrun_start_thread(void *param)
193{
194	int vcpu;
195	struct mt_vmm_info *mtp = param;
196
197	vcpu = mtp->mt_vcpu;
198	vm_loop(mtp->mt_ctx, vcpu, vmexit[vcpu].rip);
199
200	/* not reached */
201	exit(1);
202	return (NULL);
203}
204
205void
206fbsdrun_addcpu(struct vmctx *ctx, int vcpu, uint64_t rip)
207{
208	int error;
209
210	if (cpumask & (1 << vcpu)) {
211		printf("addcpu: attempting to add existing cpu %d\n", vcpu);
212		exit(1);
213	}
214
215	cpumask |= 1 << vcpu;
216	foundcpus++;
217
218	/*
219	 * Set up the vmexit struct to allow execution to start
220	 * at the given RIP
221	 */
222	vmexit[vcpu].rip = rip;
223	vmexit[vcpu].inst_length = 0;
224
225	if (vcpu == BSP || !guest_vcpu_mux){
226		mt_vmm_info[vcpu].mt_ctx = ctx;
227		mt_vmm_info[vcpu].mt_vcpu = vcpu;
228
229		error = pthread_create(&mt_vmm_info[vcpu].mt_thr, NULL,
230				fbsdrun_start_thread, &mt_vmm_info[vcpu]);
231		assert(error == 0);
232	}
233}
234
235static int
236fbsdrun_get_next_cpu(int curcpu)
237{
238
239	/*
240	 * Get the next available CPU. Assumes they arrive
241	 * in ascending order with no gaps.
242	 */
243	return ((curcpu + 1) % foundcpus);
244}
245
246static int
247vmexit_catch_reset(void)
248{
249        stats.io_reset++;
250        return (VMEXIT_RESET);
251}
252
253static int
254vmexit_catch_inout(void)
255{
256	return (VMEXIT_ABORT);
257}
258
259static int
260vmexit_handle_notify(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu,
261		     uint32_t eax)
262{
263#if PG_DEBUG /* put all types of debug here */
264        if (eax == 0) {
265		pause_noswitch = 1;
266	} else if (eax == 1) {
267		pause_noswitch = 0;
268	} else {
269		pause_noswitch = 0;
270		if (eax == 5) {
271			vm_set_capability(ctx, *pvcpu, VM_CAP_MTRAP_EXIT, 1);
272		}
273	}
274#endif
275        return (VMEXIT_CONTINUE);
276}
277
278static int
279vmexit_inout(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
280{
281	int error;
282	int bytes, port, in, out;
283	uint32_t eax;
284	int vcpu;
285
286	vcpu = *pvcpu;
287
288	port = vme->u.inout.port;
289	bytes = vme->u.inout.bytes;
290	eax = vme->u.inout.eax;
291	in = vme->u.inout.in;
292	out = !in;
293
294	/* We don't deal with these */
295	if (vme->u.inout.string || vme->u.inout.rep)
296		return (VMEXIT_ABORT);
297
298	/* Special case of guest reset */
299	if (out && port == 0x64 && (uint8_t)eax == 0xFE)
300		return (vmexit_catch_reset());
301
302        /* Extra-special case of host notifications */
303        if (out && port == GUEST_NIO_PORT)
304                return (vmexit_handle_notify(ctx, vme, pvcpu, eax));
305
306	error = emulate_inout(ctx, vcpu, in, port, bytes, &eax, strictio);
307	if (error == 0 && in)
308		error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RAX, eax);
309
310	if (error == 0)
311		return (VMEXIT_CONTINUE);
312	else {
313		fprintf(stderr, "Unhandled %s%c 0x%04x\n",
314			in ? "in" : "out",
315			bytes == 1 ? 'b' : (bytes == 2 ? 'w' : 'l'), port);
316		return (vmexit_catch_inout());
317	}
318}
319
320static int
321vmexit_rdmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
322{
323	printf("vm exit rdmsr 0x%x, cpu %d\n", vme->u.msr.code, *pvcpu);
324	return (VMEXIT_ABORT);
325}
326
327static int
328vmexit_wrmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
329{
330	int newcpu;
331	int retval = VMEXIT_CONTINUE;
332
333	newcpu = emulate_wrmsr(ctx, *pvcpu, vme->u.msr.code,vme->u.msr.wval);
334
335	if (guest_vcpu_mux && *pvcpu != newcpu) {
336                retval = VMEXIT_SWITCH;
337                *pvcpu = newcpu;
338        }
339
340        return (retval);
341}
342
343static int
344vmexit_vmx(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
345{
346
347	printf("vm exit[%d]\n", *pvcpu);
348	printf("\treason\t\tVMX\n");
349	printf("\trip\t\t0x%016lx\n", vmexit->rip);
350	printf("\tinst_length\t%d\n", vmexit->inst_length);
351	printf("\terror\t\t%d\n", vmexit->u.vmx.error);
352	printf("\texit_reason\t%u\n", vmexit->u.vmx.exit_reason);
353	printf("\tqualification\t0x%016lx\n", vmexit->u.vmx.exit_qualification);
354
355	return (VMEXIT_ABORT);
356}
357
358static int bogus_noswitch = 1;
359
360static int
361vmexit_bogus(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
362{
363	stats.vmexit_bogus++;
364
365	if (!guest_vcpu_mux || guest_ncpus == 1 || bogus_noswitch) {
366		return (VMEXIT_RESTART);
367	} else {
368		stats.vmexit_bogus_switch++;
369		vmexit->inst_length = 0;
370		*pvcpu = -1;
371		return (VMEXIT_SWITCH);
372	}
373}
374
375static int
376vmexit_hlt(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
377{
378	stats.vmexit_hlt++;
379	if (fbsdrun_muxed()) {
380		*pvcpu = -1;
381		return (VMEXIT_SWITCH);
382	} else {
383		/*
384		 * Just continue execution with the next instruction. We use
385		 * the HLT VM exit as a way to be friendly with the host
386		 * scheduler.
387		 */
388		return (VMEXIT_CONTINUE);
389	}
390}
391
392static int pause_noswitch;
393
394static int
395vmexit_pause(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
396{
397	stats.vmexit_pause++;
398
399	if (fbsdrun_muxed() && !pause_noswitch) {
400		*pvcpu = -1;
401		return (VMEXIT_SWITCH);
402        } else {
403		return (VMEXIT_CONTINUE);
404	}
405}
406
407static int
408vmexit_mtrap(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
409{
410	stats.vmexit_mtrap++;
411
412	return (VMEXIT_RESTART);
413}
414
/*
 * SIGALRM handler installed by setup_timeslice().  It intentionally
 * does nothing; only the delivery of the signal matters.
 */
static void
sigalrm(int sig)
{

	(void)sig;
}
420
421static void
422setup_timeslice(void)
423{
424	struct sigaction sa;
425	struct itimerval itv;
426	int error;
427
428	/*
429	 * Setup a realtime timer to generate a SIGALRM at a
430	 * frequency of 'guest_tslice' ticks per second.
431	 */
432	sigemptyset(&sa.sa_mask);
433	sa.sa_flags = 0;
434	sa.sa_handler = sigalrm;
435
436	error = sigaction(SIGALRM, &sa, NULL);
437	assert(error == 0);
438
439	itv.it_interval.tv_sec = 0;
440	itv.it_interval.tv_usec = 1000000 / guest_tslice;
441	itv.it_value.tv_sec = 0;
442	itv.it_value.tv_usec = 1000000 / guest_tslice;
443
444	error = setitimer(ITIMER_REAL, &itv, NULL);
445	assert(error == 0);
446}
447
448static vmexit_handler_t handler[VM_EXITCODE_MAX] = {
449	[VM_EXITCODE_INOUT] = vmexit_inout,
450	[VM_EXITCODE_VMX]   = vmexit_vmx,
451	[VM_EXITCODE_BOGUS] = vmexit_bogus,
452	[VM_EXITCODE_RDMSR] = vmexit_rdmsr,
453	[VM_EXITCODE_WRMSR] = vmexit_wrmsr,
454	[VM_EXITCODE_MTRAP] = vmexit_mtrap,
455};
456
457static void
458vm_loop(struct vmctx *ctx, int vcpu, uint64_t rip)
459{
460	int error, rc, prevcpu;
461
462	if (guest_vcpu_mux)
463		setup_timeslice();
464
465	if (pincpu >= 0) {
466		error = vm_set_pinning(ctx, vcpu, pincpu + vcpu);
467		assert(error == 0);
468	}
469
470	while (1) {
471		error = vm_run(ctx, vcpu, rip, &vmexit[vcpu]);
472		if (error != 0)
473			break;
474
475		prevcpu = vcpu;
476                rc = (*handler[vmexit[vcpu].exitcode])(ctx, &vmexit[vcpu],
477                                                       &vcpu);
478		switch (rc) {
479                case VMEXIT_SWITCH:
480			assert(guest_vcpu_mux);
481			if (vcpu == -1) {
482				stats.cpu_switch_rotate++;
483				vcpu = fbsdrun_get_next_cpu(prevcpu);
484			} else {
485				stats.cpu_switch_direct++;
486			}
487			/* fall through */
488		case VMEXIT_CONTINUE:
489                        rip = vmexit[vcpu].rip + vmexit[vcpu].inst_length;
490			break;
491		case VMEXIT_RESTART:
492                        rip = vmexit[vcpu].rip;
493			break;
494		case VMEXIT_RESET:
495			exit(0);
496		default:
497			exit(1);
498		}
499	}
500	fprintf(stderr, "vm_run error %d, errno %d\n", error, errno);
501}
502
503
504int
505main(int argc, char *argv[])
506{
507	int c, error, gdb_port, inject_bkpt, tmp, err;
508	struct vmctx *ctx;
509	uint64_t rip;
510
511	inject_bkpt = 0;
512	progname = basename(argv[0]);
513	gdb_port = DEFAULT_GDB_PORT;
514	guest_ncpus = 1;
515
516	while ((c = getopt(argc, argv, "ehBHPxp:g:c:z:s:n:m:M:")) != -1) {
517		switch (c) {
518		case 'B':
519			inject_bkpt = 1;
520			break;
521		case 'x':
522			guest_vcpu_mux = 1;
523			break;
524		case 'p':
525			pincpu = atoi(optarg);
526			break;
527                case 'c':
528			guest_ncpus = atoi(optarg);
529			break;
530		case 'g':
531			gdb_port = atoi(optarg);
532			break;
533		case 'z':
534			guest_hz = atoi(optarg);
535			break;
536		case 't':
537			guest_tslice = atoi(optarg);
538			break;
539		case 's':
540			pci_parse_slot(optarg);
541			break;
542		case 'n':
543			pci_parse_name(optarg);
544			break;
545                case 'm':
546			lomem_sz = strtoul(optarg, NULL, 0) * MB;
547			break;
548                case 'M':
549			himem_sz = strtoul(optarg, NULL, 0) * MB;
550			break;
551		case 'H':
552			guest_vmexit_on_hlt = 1;
553			break;
554		case 'P':
555			guest_vmexit_on_pause = 1;
556			break;
557		case 'e':
558			strictio = 1;
559			break;
560		case 'h':
561			usage(0);
562		default:
563			usage(1);
564		}
565	}
566	argc -= optind;
567	argv += optind;
568
569	if (argc != 1)
570		usage(1);
571
572	/* No need to mux if guest is uni-processor */
573	if (guest_ncpus <= 1)
574		guest_vcpu_mux = 0;
575
576	/* vmexit on hlt if guest is muxed */
577	if (guest_vcpu_mux) {
578		guest_vmexit_on_hlt = 1;
579		guest_vmexit_on_pause = 1;
580	}
581
582	vmname = argv[0];
583
584	ctx = vm_open(vmname);
585	if (ctx == NULL) {
586		perror("vm_open");
587		exit(1);
588	}
589
590	if (fbsdrun_vmexit_on_hlt()) {
591		err = vm_get_capability(ctx, BSP, VM_CAP_HALT_EXIT, &tmp);
592		if (err < 0) {
593			printf("VM exit on HLT not supported\n");
594			exit(1);
595		}
596		vm_set_capability(ctx, BSP, VM_CAP_HALT_EXIT, 1);
597		handler[VM_EXITCODE_HLT] = vmexit_hlt;
598	}
599
600        if (fbsdrun_vmexit_on_pause()) {
601		/*
602		 * pause exit support required for this mode
603		 */
604		err = vm_get_capability(ctx, BSP, VM_CAP_PAUSE_EXIT, &tmp);
605		if (err < 0) {
606			printf("SMP mux requested, no pause support\n");
607			exit(1);
608		}
609		vm_set_capability(ctx, BSP, VM_CAP_PAUSE_EXIT, 1);
610		handler[VM_EXITCODE_PAUSE] = vmexit_pause;
611        }
612
613	if (lomem_sz != 0) {
614		lomem_addr = vm_map_memory(ctx, 0, lomem_sz);
615		if (lomem_addr == (char *) MAP_FAILED) {
616			lomem_sz = 0;
617		} else if (himem_sz != 0) {
618			himem_addr = vm_map_memory(ctx, 4*GB, himem_sz);
619			if (himem_addr == (char *) MAP_FAILED) {
620				lomem_sz = 0;
621				himem_sz = 0;
622			}
623		}
624	}
625
626	init_inout();
627	init_pci(ctx);
628
629	if (gdb_port != 0)
630		init_dbgport(gdb_port);
631
632	error = vm_get_register(ctx, BSP, VM_REG_GUEST_RIP, &rip);
633	assert(error == 0);
634
635	if (inject_bkpt) {
636		error = vm_inject_event(ctx, BSP, VM_HW_EXCEPTION, IDT_BP);
637		assert(error == 0);
638	}
639
640	/*
641	 * build the guest tables, MP etc.
642	 */
643	vm_build_tables(ctx, guest_ncpus, oem_tbl_start, oem_tbl_size);
644
645	/*
646	 * Add CPU 0
647	 */
648	fbsdrun_addcpu(ctx, BSP, rip);
649
650	/*
651	 * Head off to the main event dispatch loop
652	 */
653	mevent_dispatch();
654
655	exit(1);
656}
657