/* bhyverun.c revision 239043 */
1/*- 2 * Copyright (c) 2011 NetApp, Inc. 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND 15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17 * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE 18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 24 * SUCH DAMAGE. 
25 * 26 * $FreeBSD$ 27 */ 28 29#include <sys/cdefs.h> 30__FBSDID("$FreeBSD$"); 31 32#include <sys/types.h> 33#include <sys/mman.h> 34#include <sys/time.h> 35 36#include <machine/segments.h> 37 38#include <stdio.h> 39#include <stdlib.h> 40#include <libgen.h> 41#include <unistd.h> 42#include <assert.h> 43#include <errno.h> 44#include <signal.h> 45#include <pthread.h> 46 47#include <machine/vmm.h> 48#include <vmmapi.h> 49 50#include "fbsdrun.h" 51#include "inout.h" 52#include "dbgport.h" 53#include "mevent.h" 54#include "pci_emul.h" 55#include "xmsr.h" 56#include "instruction_emul.h" 57 58#define DEFAULT_GUEST_HZ 100 59#define DEFAULT_GUEST_TSLICE 200 60 61#define GUEST_NIO_PORT 0x488 /* guest upcalls via i/o port */ 62 63#define VMEXIT_SWITCH 0 /* force vcpu switch in mux mode */ 64#define VMEXIT_CONTINUE 1 /* continue from next instruction */ 65#define VMEXIT_RESTART 2 /* restart current instruction */ 66#define VMEXIT_ABORT 3 /* abort the vm run loop */ 67#define VMEXIT_RESET 4 /* guest machine has reset */ 68 69#define MB (1024UL * 1024) 70#define GB (1024UL * MB) 71 72typedef int (*vmexit_handler_t)(struct vmctx *, struct vm_exit *, int *vcpu); 73 74int guest_tslice = DEFAULT_GUEST_TSLICE; 75int guest_hz = DEFAULT_GUEST_HZ; 76char *vmname; 77 78u_long lomem_sz; 79u_long himem_sz; 80 81int guest_ncpus; 82 83static int pincpu = -1; 84static int guest_vcpu_mux; 85static int guest_vmexit_on_hlt, guest_vmexit_on_pause; 86 87static int foundcpus; 88 89static int strictio; 90 91static char *lomem_addr; 92static char *himem_addr; 93 94static char *progname; 95static const int BSP = 0; 96 97static int cpumask; 98 99static void *oem_tbl_start; 100static int oem_tbl_size; 101 102static void vm_loop(struct vmctx *ctx, int vcpu, uint64_t rip); 103 104struct vm_exit vmexit[VM_MAXCPU]; 105 106struct fbsdstats { 107 uint64_t vmexit_bogus; 108 uint64_t vmexit_bogus_switch; 109 uint64_t vmexit_hlt; 110 uint64_t vmexit_pause; 111 uint64_t vmexit_mtrap; 112 uint64_t vmexit_paging; 
113 uint64_t cpu_switch_rotate; 114 uint64_t cpu_switch_direct; 115 int io_reset; 116} stats; 117 118struct mt_vmm_info { 119 pthread_t mt_thr; 120 struct vmctx *mt_ctx; 121 int mt_vcpu; 122} mt_vmm_info[VM_MAXCPU]; 123 124static void 125usage(int code) 126{ 127 128 fprintf(stderr, 129 "Usage: %s [-ehBHIP][-g <gdb port>][-z <hz>][-s <pci>]" 130 "[-S <pci>][-p pincpu][-n <pci>][-m lowmem][-M highmem] <vm>\n" 131 " -g: gdb port (default is %d and 0 means don't open)\n" 132 " -c: # cpus (default 1)\n" 133 " -p: pin vcpu 'n' to host cpu 'pincpu + n'\n" 134 " -B: inject breakpoint exception on vm entry\n" 135 " -H: vmexit from the guest on hlt\n" 136 " -I: present an ioapic to the guest\n" 137 " -P: vmexit from the guest on pause\n" 138 " -e: exit on unhandled i/o access\n" 139 " -h: help\n" 140 " -z: guest hz (default is %d)\n" 141 " -s: <slot,driver,configinfo> PCI slot config\n" 142 " -S: <slot,driver,configinfo> legacy PCI slot config\n" 143 " -n: <slot,name> PCI slot naming\n" 144 " -m: lowmem in MB\n" 145 " -M: highmem in MB\n" 146 " -x: mux vcpus to 1 hcpu\n" 147 " -t: mux vcpu timeslice hz (default %d)\n", 148 progname, DEFAULT_GDB_PORT, DEFAULT_GUEST_HZ, 149 DEFAULT_GUEST_TSLICE); 150 exit(code); 151} 152 153void * 154paddr_guest2host(uintptr_t gaddr) 155{ 156 if (lomem_sz == 0) 157 return (NULL); 158 159 if (gaddr < lomem_sz) { 160 return ((void *)(lomem_addr + gaddr)); 161 } else if (gaddr >= 4*GB && gaddr < (4*GB + himem_sz)) { 162 return ((void *)(himem_addr + gaddr - 4*GB)); 163 } else 164 return (NULL); 165} 166 167void 168fbsdrun_add_oemtbl(void *tbl, int tblsz) 169{ 170 oem_tbl_start = tbl; 171 oem_tbl_size = tblsz; 172} 173 174int 175fbsdrun_vmexit_on_pause(void) 176{ 177 178 return (guest_vmexit_on_pause); 179} 180 181int 182fbsdrun_vmexit_on_hlt(void) 183{ 184 185 return (guest_vmexit_on_hlt); 186} 187 188int 189fbsdrun_muxed(void) 190{ 191 192 return (guest_vcpu_mux); 193} 194 195static void * 196fbsdrun_start_thread(void *param) 197{ 198 int vcpu; 
199 struct mt_vmm_info *mtp = param; 200 201 vcpu = mtp->mt_vcpu; 202 vm_loop(mtp->mt_ctx, vcpu, vmexit[vcpu].rip); 203 204 /* not reached */ 205 exit(1); 206 return (NULL); 207} 208 209void 210fbsdrun_addcpu(struct vmctx *ctx, int vcpu, uint64_t rip) 211{ 212 int error; 213 214 if (cpumask & (1 << vcpu)) { 215 printf("addcpu: attempting to add existing cpu %d\n", vcpu); 216 exit(1); 217 } 218 219 cpumask |= 1 << vcpu; 220 foundcpus++; 221 222 /* 223 * Set up the vmexit struct to allow execution to start 224 * at the given RIP 225 */ 226 vmexit[vcpu].rip = rip; 227 vmexit[vcpu].inst_length = 0; 228 229 if (vcpu == BSP || !guest_vcpu_mux){ 230 mt_vmm_info[vcpu].mt_ctx = ctx; 231 mt_vmm_info[vcpu].mt_vcpu = vcpu; 232 233 error = pthread_create(&mt_vmm_info[vcpu].mt_thr, NULL, 234 fbsdrun_start_thread, &mt_vmm_info[vcpu]); 235 assert(error == 0); 236 } 237} 238 239static int 240fbsdrun_get_next_cpu(int curcpu) 241{ 242 243 /* 244 * Get the next available CPU. Assumes they arrive 245 * in ascending order with no gaps. 
246 */ 247 return ((curcpu + 1) % foundcpus); 248} 249 250static int 251vmexit_catch_reset(void) 252{ 253 stats.io_reset++; 254 return (VMEXIT_RESET); 255} 256 257static int 258vmexit_catch_inout(void) 259{ 260 return (VMEXIT_ABORT); 261} 262 263static int 264vmexit_handle_notify(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu, 265 uint32_t eax) 266{ 267#if PG_DEBUG /* put all types of debug here */ 268 if (eax == 0) { 269 pause_noswitch = 1; 270 } else if (eax == 1) { 271 pause_noswitch = 0; 272 } else { 273 pause_noswitch = 0; 274 if (eax == 5) { 275 vm_set_capability(ctx, *pvcpu, VM_CAP_MTRAP_EXIT, 1); 276 } 277 } 278#endif 279 return (VMEXIT_CONTINUE); 280} 281 282static int 283vmexit_inout(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) 284{ 285 int error; 286 int bytes, port, in, out; 287 uint32_t eax; 288 int vcpu; 289 290 vcpu = *pvcpu; 291 292 port = vme->u.inout.port; 293 bytes = vme->u.inout.bytes; 294 eax = vme->u.inout.eax; 295 in = vme->u.inout.in; 296 out = !in; 297 298 /* We don't deal with these */ 299 if (vme->u.inout.string || vme->u.inout.rep) 300 return (VMEXIT_ABORT); 301 302 /* Special case of guest reset */ 303 if (out && port == 0x64 && (uint8_t)eax == 0xFE) 304 return (vmexit_catch_reset()); 305 306 /* Extra-special case of host notifications */ 307 if (out && port == GUEST_NIO_PORT) 308 return (vmexit_handle_notify(ctx, vme, pvcpu, eax)); 309 310 error = emulate_inout(ctx, vcpu, in, port, bytes, &eax, strictio); 311 if (error == 0 && in) 312 error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RAX, eax); 313 314 if (error == 0) 315 return (VMEXIT_CONTINUE); 316 else { 317 fprintf(stderr, "Unhandled %s%c 0x%04x\n", 318 in ? "in" : "out", 319 bytes == 1 ? 'b' : (bytes == 2 ? 
'w' : 'l'), port); 320 return (vmexit_catch_inout()); 321 } 322} 323 324static int 325vmexit_rdmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) 326{ 327 printf("vm exit rdmsr 0x%x, cpu %d\n", vme->u.msr.code, *pvcpu); 328 return (VMEXIT_ABORT); 329} 330 331static int 332vmexit_wrmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) 333{ 334 int newcpu; 335 int retval = VMEXIT_CONTINUE; 336 337 newcpu = emulate_wrmsr(ctx, *pvcpu, vme->u.msr.code,vme->u.msr.wval); 338 339 if (guest_vcpu_mux && *pvcpu != newcpu) { 340 retval = VMEXIT_SWITCH; 341 *pvcpu = newcpu; 342 } 343 344 return (retval); 345} 346 347static int 348vmexit_vmx(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) 349{ 350 351 printf("vm exit[%d]\n", *pvcpu); 352 printf("\treason\t\tVMX\n"); 353 printf("\trip\t\t0x%016lx\n", vmexit->rip); 354 printf("\tinst_length\t%d\n", vmexit->inst_length); 355 printf("\terror\t\t%d\n", vmexit->u.vmx.error); 356 printf("\texit_reason\t%u\n", vmexit->u.vmx.exit_reason); 357 printf("\tqualification\t0x%016lx\n", vmexit->u.vmx.exit_qualification); 358 359 return (VMEXIT_ABORT); 360} 361 362static int bogus_noswitch = 1; 363 364static int 365vmexit_bogus(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) 366{ 367 stats.vmexit_bogus++; 368 369 if (!guest_vcpu_mux || guest_ncpus == 1 || bogus_noswitch) { 370 return (VMEXIT_RESTART); 371 } else { 372 stats.vmexit_bogus_switch++; 373 vmexit->inst_length = 0; 374 *pvcpu = -1; 375 return (VMEXIT_SWITCH); 376 } 377} 378 379static int 380vmexit_hlt(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) 381{ 382 stats.vmexit_hlt++; 383 if (fbsdrun_muxed()) { 384 *pvcpu = -1; 385 return (VMEXIT_SWITCH); 386 } else { 387 /* 388 * Just continue execution with the next instruction. We use 389 * the HLT VM exit as a way to be friendly with the host 390 * scheduler. 
391 */ 392 return (VMEXIT_CONTINUE); 393 } 394} 395 396static int pause_noswitch; 397 398static int 399vmexit_pause(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) 400{ 401 stats.vmexit_pause++; 402 403 if (fbsdrun_muxed() && !pause_noswitch) { 404 *pvcpu = -1; 405 return (VMEXIT_SWITCH); 406 } else { 407 return (VMEXIT_CONTINUE); 408 } 409} 410 411static int 412vmexit_mtrap(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) 413{ 414 stats.vmexit_mtrap++; 415 416 return (VMEXIT_RESTART); 417} 418 419static int 420vmexit_paging(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) 421{ 422 423 stats.vmexit_paging++; 424 425 if (emulate_instruction(ctx, *pvcpu, vmexit->rip, vmexit->u.paging.cr3) != 0) { 426 printf("Failed to emulate instruction at 0x%lx\n", vmexit->rip); 427 return (VMEXIT_ABORT); 428 } 429 430 return (VMEXIT_CONTINUE); 431} 432 433static void 434sigalrm(int sig) 435{ 436 return; 437} 438 439static void 440setup_timeslice(void) 441{ 442 struct sigaction sa; 443 struct itimerval itv; 444 int error; 445 446 /* 447 * Setup a realtime timer to generate a SIGALRM at a 448 * frequency of 'guest_tslice' ticks per second. 
449 */ 450 sigemptyset(&sa.sa_mask); 451 sa.sa_flags = 0; 452 sa.sa_handler = sigalrm; 453 454 error = sigaction(SIGALRM, &sa, NULL); 455 assert(error == 0); 456 457 itv.it_interval.tv_sec = 0; 458 itv.it_interval.tv_usec = 1000000 / guest_tslice; 459 itv.it_value.tv_sec = 0; 460 itv.it_value.tv_usec = 1000000 / guest_tslice; 461 462 error = setitimer(ITIMER_REAL, &itv, NULL); 463 assert(error == 0); 464} 465 466static vmexit_handler_t handler[VM_EXITCODE_MAX] = { 467 [VM_EXITCODE_INOUT] = vmexit_inout, 468 [VM_EXITCODE_VMX] = vmexit_vmx, 469 [VM_EXITCODE_BOGUS] = vmexit_bogus, 470 [VM_EXITCODE_RDMSR] = vmexit_rdmsr, 471 [VM_EXITCODE_WRMSR] = vmexit_wrmsr, 472 [VM_EXITCODE_MTRAP] = vmexit_mtrap, 473 [VM_EXITCODE_PAGING] = vmexit_paging 474}; 475 476static void 477vm_loop(struct vmctx *ctx, int vcpu, uint64_t rip) 478{ 479 int error, rc, prevcpu; 480 481 if (guest_vcpu_mux) 482 setup_timeslice(); 483 484 if (pincpu >= 0) { 485 error = vm_set_pinning(ctx, vcpu, pincpu + vcpu); 486 assert(error == 0); 487 } 488 489 while (1) { 490 error = vm_run(ctx, vcpu, rip, &vmexit[vcpu]); 491 if (error != 0) 492 break; 493 494 prevcpu = vcpu; 495 rc = (*handler[vmexit[vcpu].exitcode])(ctx, &vmexit[vcpu], 496 &vcpu); 497 switch (rc) { 498 case VMEXIT_SWITCH: 499 assert(guest_vcpu_mux); 500 if (vcpu == -1) { 501 stats.cpu_switch_rotate++; 502 vcpu = fbsdrun_get_next_cpu(prevcpu); 503 } else { 504 stats.cpu_switch_direct++; 505 } 506 /* fall through */ 507 case VMEXIT_CONTINUE: 508 rip = vmexit[vcpu].rip + vmexit[vcpu].inst_length; 509 break; 510 case VMEXIT_RESTART: 511 rip = vmexit[vcpu].rip; 512 break; 513 case VMEXIT_RESET: 514 exit(0); 515 default: 516 exit(1); 517 } 518 } 519 fprintf(stderr, "vm_run error %d, errno %d\n", error, errno); 520} 521 522 523int 524main(int argc, char *argv[]) 525{ 526 int c, error, gdb_port, inject_bkpt, tmp, err, ioapic; 527 struct vmctx *ctx; 528 uint64_t rip; 529 530 inject_bkpt = 0; 531 progname = basename(argv[0]); 532 gdb_port = 
DEFAULT_GDB_PORT; 533 guest_ncpus = 1; 534 ioapic = 0; 535 536 while ((c = getopt(argc, argv, "ehBHIPxp:g:c:z:s:S:n:m:M:")) != -1) { 537 switch (c) { 538 case 'B': 539 inject_bkpt = 1; 540 break; 541 case 'x': 542 guest_vcpu_mux = 1; 543 break; 544 case 'p': 545 pincpu = atoi(optarg); 546 break; 547 case 'c': 548 guest_ncpus = atoi(optarg); 549 break; 550 case 'g': 551 gdb_port = atoi(optarg); 552 break; 553 case 'z': 554 guest_hz = atoi(optarg); 555 break; 556 case 't': 557 guest_tslice = atoi(optarg); 558 break; 559 case 's': 560 pci_parse_slot(optarg, 0); 561 break; 562 case 'S': 563 pci_parse_slot(optarg, 1); 564 break; 565 case 'n': 566 pci_parse_name(optarg); 567 break; 568 case 'm': 569 lomem_sz = strtoul(optarg, NULL, 0) * MB; 570 break; 571 case 'M': 572 himem_sz = strtoul(optarg, NULL, 0) * MB; 573 break; 574 case 'H': 575 guest_vmexit_on_hlt = 1; 576 break; 577 case 'I': 578 ioapic = 1; 579 break; 580 case 'P': 581 guest_vmexit_on_pause = 1; 582 break; 583 case 'e': 584 strictio = 1; 585 break; 586 case 'h': 587 usage(0); 588 default: 589 usage(1); 590 } 591 } 592 argc -= optind; 593 argv += optind; 594 595 if (argc != 1) 596 usage(1); 597 598 /* No need to mux if guest is uni-processor */ 599 if (guest_ncpus <= 1) 600 guest_vcpu_mux = 0; 601 602 /* vmexit on hlt if guest is muxed */ 603 if (guest_vcpu_mux) { 604 guest_vmexit_on_hlt = 1; 605 guest_vmexit_on_pause = 1; 606 } 607 608 vmname = argv[0]; 609 610 ctx = vm_open(vmname); 611 if (ctx == NULL) { 612 perror("vm_open"); 613 exit(1); 614 } 615 616 if (fbsdrun_vmexit_on_hlt()) { 617 err = vm_get_capability(ctx, BSP, VM_CAP_HALT_EXIT, &tmp); 618 if (err < 0) { 619 printf("VM exit on HLT not supported\n"); 620 exit(1); 621 } 622 vm_set_capability(ctx, BSP, VM_CAP_HALT_EXIT, 1); 623 handler[VM_EXITCODE_HLT] = vmexit_hlt; 624 } 625 626 if (fbsdrun_vmexit_on_pause()) { 627 /* 628 * pause exit support required for this mode 629 */ 630 err = vm_get_capability(ctx, BSP, VM_CAP_PAUSE_EXIT, &tmp); 631 if (err < 
0) { 632 printf("SMP mux requested, no pause support\n"); 633 exit(1); 634 } 635 vm_set_capability(ctx, BSP, VM_CAP_PAUSE_EXIT, 1); 636 handler[VM_EXITCODE_PAUSE] = vmexit_pause; 637 } 638 639 if (lomem_sz != 0) { 640 lomem_addr = vm_map_memory(ctx, 0, lomem_sz); 641 if (lomem_addr == (char *) MAP_FAILED) { 642 lomem_sz = 0; 643 } else if (himem_sz != 0) { 644 himem_addr = vm_map_memory(ctx, 4*GB, himem_sz); 645 if (himem_addr == (char *) MAP_FAILED) { 646 lomem_sz = 0; 647 himem_sz = 0; 648 } 649 } 650 } 651 652 init_inout(); 653 init_pci(ctx); 654 655 if (gdb_port != 0) 656 init_dbgport(gdb_port); 657 658 error = vm_get_register(ctx, BSP, VM_REG_GUEST_RIP, &rip); 659 assert(error == 0); 660 661 if (inject_bkpt) { 662 error = vm_inject_event(ctx, BSP, VM_HW_EXCEPTION, IDT_BP); 663 assert(error == 0); 664 } 665 666 /* 667 * build the guest tables, MP etc. 668 */ 669 vm_build_tables(ctx, guest_ncpus, ioapic, oem_tbl_start, oem_tbl_size); 670 671 /* 672 * Add CPU 0 673 */ 674 fbsdrun_addcpu(ctx, BSP, rip); 675 676 /* 677 * Head off to the main event dispatch loop 678 */ 679 mevent_dispatch(); 680 681 exit(1); 682} 683