/* bhyverun.c revision 222105 */
1254219Scy/*- 2254219Scy * Copyright (c) 2011 NetApp, Inc. 3254219Scy * All rights reserved. 4254219Scy * 5254219Scy * Redistribution and use in source and binary forms, with or without 6254219Scy * modification, are permitted provided that the following conditions 7254219Scy * are met: 8254219Scy * 1. Redistributions of source code must retain the above copyright 9254219Scy * notice, this list of conditions and the following disclaimer. 10254219Scy * 2. Redistributions in binary form must reproduce the above copyright 11254219Scy * notice, this list of conditions and the following disclaimer in the 12254219Scy * documentation and/or other materials provided with the distribution. 13254219Scy * 14254219Scy * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND 15254219Scy * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16254219Scy * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17254219Scy * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE 18254219Scy * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19254219Scy * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 20254219Scy * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 21254219Scy * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22254219Scy * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23254219Scy * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 24254219Scy * SUCH DAMAGE. 
25254219Scy * 26 * $FreeBSD$ 27 */ 28 29#include <sys/cdefs.h> 30__FBSDID("$FreeBSD$"); 31 32#include <sys/types.h> 33#include <sys/mman.h> 34#include <sys/time.h> 35 36#include <machine/segments.h> 37 38#include <stdio.h> 39#include <stdlib.h> 40#include <libgen.h> 41#include <unistd.h> 42#include <assert.h> 43#include <errno.h> 44#include <signal.h> 45#include <pthread.h> 46 47#include <machine/vmm.h> 48#include <vmmapi.h> 49 50#include "fbsdrun.h" 51#include "inout.h" 52#include "dbgport.h" 53#include "mevent.h" 54#include "pci_emul.h" 55#include "xmsr.h" 56 57#define DEFAULT_GUEST_HZ 100 58#define DEFAULT_GUEST_TSLICE 200 59 60#define GUEST_NIO_PORT 0x488 /* guest upcalls via i/o port */ 61 62#define VMEXIT_SWITCH 0 /* force vcpu switch in mux mode */ 63#define VMEXIT_CONTINUE 1 /* continue from next instruction */ 64#define VMEXIT_RESTART 2 /* restart current instruction */ 65#define VMEXIT_ABORT 3 /* abort the vm run loop */ 66#define VMEXIT_RESET 4 /* guest machine has reset */ 67 68#define MB (1024UL * 1024) 69#define GB (1024UL * MB) 70 71typedef int (*vmexit_handler_t)(struct vmctx *, struct vm_exit *, int *vcpu); 72 73int guest_tslice = DEFAULT_GUEST_TSLICE; 74int guest_hz = DEFAULT_GUEST_HZ; 75char *vmname; 76 77u_long lomem_sz; 78u_long himem_sz; 79 80int guest_ncpus; 81 82static int pincpu = -1; 83static int guest_vcpu_mux; 84static int guest_vmexit_on_hlt, guest_vmexit_on_pause; 85 86static int foundcpus; 87 88static int strictio; 89 90static char *lomem_addr; 91static char *himem_addr; 92 93static char *progname; 94static const int BSP = 0; 95 96static int cpumask; 97 98static void *oem_tbl_start; 99static int oem_tbl_size; 100 101static void vm_loop(struct vmctx *ctx, int vcpu, uint64_t rip); 102 103struct vm_exit vmexit[VM_MAXCPU]; 104 105struct fbsdstats { 106 uint64_t vmexit_bogus; 107 uint64_t vmexit_bogus_switch; 108 uint64_t vmexit_hlt; 109 uint64_t vmexit_pause; 110 uint64_t vmexit_mtrap; 111 uint64_t cpu_switch_rotate; 112 uint64_t 
cpu_switch_direct; 113 int io_reset; 114} stats; 115 116struct mt_vmm_info { 117 pthread_t mt_thr; 118 struct vmctx *mt_ctx; 119 int mt_vcpu; 120} mt_vmm_info[VM_MAXCPU]; 121 122static void 123usage(int code) 124{ 125 126 fprintf(stderr, 127 "Usage: %s [-ehBHP][-g <gdb port>][-z <hz>][-s <pci>][-p pincpu]" 128 "[-n <pci>][-m lowmem][-M highmem] <vm>\n" 129 " -g: gdb port (default is %d and 0 means don't open)\n" 130 " -c: # cpus (default 1)\n" 131 " -p: pin vcpu 'n' to host cpu 'pincpu + n'\n" 132 " -B: inject breakpoint exception on vm entry\n" 133 " -H: vmexit from the guest on hlt\n" 134 " -P: vmexit from the guest on pause\n" 135 " -e: exit on unhandled i/o access\n" 136 " -h: help\n" 137 " -z: guest hz (default is %d)\n" 138 " -s: <slot,driver,configinfo> PCI slot config\n" 139 " -n: <slot,name> PCI slot naming\n" 140 " -m: lowmem in MB\n" 141 " -M: highmem in MB\n" 142 " -x: mux vcpus to 1 hcpu\n" 143 " -t: mux vcpu timeslice hz (default %d)\n", 144 progname, DEFAULT_GDB_PORT, DEFAULT_GUEST_HZ, 145 DEFAULT_GUEST_TSLICE); 146 exit(code); 147} 148 149void * 150paddr_guest2host(uintptr_t gaddr) 151{ 152 if (lomem_sz == 0) 153 return (NULL); 154 155 if (gaddr < lomem_sz) { 156 return ((void *)(lomem_addr + gaddr)); 157 } else if (gaddr >= 4*GB && gaddr < (4*GB + himem_sz)) { 158 return ((void *)(himem_addr + gaddr - 4*GB)); 159 } else 160 return (NULL); 161} 162 163void 164fbsdrun_add_oemtbl(void *tbl, int tblsz) 165{ 166 oem_tbl_start = tbl; 167 oem_tbl_size = tblsz; 168} 169 170int 171fbsdrun_vmexit_on_pause(void) 172{ 173 174 return (guest_vmexit_on_pause); 175} 176 177int 178fbsdrun_vmexit_on_hlt(void) 179{ 180 181 return (guest_vmexit_on_hlt); 182} 183 184int 185fbsdrun_muxed(void) 186{ 187 188 return (guest_vcpu_mux); 189} 190 191static void * 192fbsdrun_start_thread(void *param) 193{ 194 int vcpu; 195 struct mt_vmm_info *mtp = param; 196 197 vcpu = mtp->mt_vcpu; 198 vm_loop(mtp->mt_ctx, vcpu, vmexit[vcpu].rip); 199 200 /* not reached */ 201 exit(1); 202 
return (NULL); 203} 204 205void 206fbsdrun_addcpu(struct vmctx *ctx, int vcpu, uint64_t rip) 207{ 208 int error; 209 210 if (cpumask & (1 << vcpu)) { 211 printf("addcpu: attempting to add existing cpu %d\n", vcpu); 212 exit(1); 213 } 214 215 cpumask |= 1 << vcpu; 216 foundcpus++; 217 218 /* 219 * Set up the vmexit struct to allow execution to start 220 * at the given RIP 221 */ 222 vmexit[vcpu].rip = rip; 223 vmexit[vcpu].inst_length = 0; 224 225 if (vcpu == BSP || !guest_vcpu_mux){ 226 mt_vmm_info[vcpu].mt_ctx = ctx; 227 mt_vmm_info[vcpu].mt_vcpu = vcpu; 228 229 error = pthread_create(&mt_vmm_info[vcpu].mt_thr, NULL, 230 fbsdrun_start_thread, &mt_vmm_info[vcpu]); 231 assert(error == 0); 232 } 233} 234 235static int 236fbsdrun_get_next_cpu(int curcpu) 237{ 238 239 /* 240 * Get the next available CPU. Assumes they arrive 241 * in ascending order with no gaps. 242 */ 243 return ((curcpu + 1) % foundcpus); 244} 245 246static int 247vmexit_catch_reset(void) 248{ 249 stats.io_reset++; 250 return (VMEXIT_RESET); 251} 252 253static int 254vmexit_catch_inout(void) 255{ 256 return (VMEXIT_ABORT); 257} 258 259static int 260vmexit_handle_notify(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu, 261 uint32_t eax) 262{ 263#if PG_DEBUG /* put all types of debug here */ 264 if (eax == 0) { 265 pause_noswitch = 1; 266 } else if (eax == 1) { 267 pause_noswitch = 0; 268 } else { 269 pause_noswitch = 0; 270 if (eax == 5) { 271 vm_set_capability(ctx, *pvcpu, VM_CAP_MTRAP_EXIT, 1); 272 } 273 } 274#endif 275 return (VMEXIT_CONTINUE); 276} 277 278static int 279vmexit_inout(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) 280{ 281 int error; 282 int bytes, port, in, out; 283 uint32_t eax; 284 int vcpu; 285 286 vcpu = *pvcpu; 287 288 port = vme->u.inout.port; 289 bytes = vme->u.inout.bytes; 290 eax = vme->u.inout.eax; 291 in = vme->u.inout.in; 292 out = !in; 293 294 /* We don't deal with these */ 295 if (vme->u.inout.string || vme->u.inout.rep) 296 return (VMEXIT_ABORT); 297 298 /* 
Special case of guest reset */ 299 if (out && port == 0x64 && (uint8_t)eax == 0xFE) 300 return (vmexit_catch_reset()); 301 302 /* Extra-special case of host notifications */ 303 if (out && port == GUEST_NIO_PORT) 304 return (vmexit_handle_notify(ctx, vme, pvcpu, eax)); 305 306 error = emulate_inout(ctx, vcpu, in, port, bytes, &eax, strictio); 307 if (error == 0 && in) 308 error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RAX, eax); 309 310 if (error == 0) 311 return (VMEXIT_CONTINUE); 312 else { 313 fprintf(stderr, "Unhandled %s%c 0x%04x\n", 314 in ? "in" : "out", 315 bytes == 1 ? 'b' : (bytes == 2 ? 'w' : 'l'), port); 316 return (vmexit_catch_inout()); 317 } 318} 319 320static int 321vmexit_rdmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) 322{ 323 printf("vm exit rdmsr 0x%x, cpu %d\n", vme->u.msr.code, *pvcpu); 324 return (VMEXIT_ABORT); 325} 326 327static int 328vmexit_wrmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) 329{ 330 int newcpu; 331 int retval = VMEXIT_CONTINUE; 332 333 newcpu = emulate_wrmsr(ctx, *pvcpu, vme->u.msr.code,vme->u.msr.wval); 334 335 if (guest_vcpu_mux && *pvcpu != newcpu) { 336 retval = VMEXIT_SWITCH; 337 *pvcpu = newcpu; 338 } 339 340 return (retval); 341} 342 343static int 344vmexit_vmx(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) 345{ 346 347 printf("vm exit[%d]\n", *pvcpu); 348 printf("\treason\t\tVMX\n"); 349 printf("\trip\t\t0x%016lx\n", vmexit->rip); 350 printf("\tinst_length\t%d\n", vmexit->inst_length); 351 printf("\terror\t\t%d\n", vmexit->u.vmx.error); 352 printf("\texit_reason\t%u\n", vmexit->u.vmx.exit_reason); 353 printf("\tqualification\t0x%016lx\n", vmexit->u.vmx.exit_qualification); 354 355 return (VMEXIT_ABORT); 356} 357 358static int bogus_noswitch = 1; 359 360static int 361vmexit_bogus(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) 362{ 363 stats.vmexit_bogus++; 364 365 if (!guest_vcpu_mux || guest_ncpus == 1 || bogus_noswitch) { 366 return (VMEXIT_RESTART); 367 } else { 368 
stats.vmexit_bogus_switch++; 369 vmexit->inst_length = 0; 370 *pvcpu = -1; 371 return (VMEXIT_SWITCH); 372 } 373} 374 375static int 376vmexit_hlt(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) 377{ 378 stats.vmexit_hlt++; 379 if (fbsdrun_muxed()) { 380 *pvcpu = -1; 381 return (VMEXIT_SWITCH); 382 } else { 383 /* 384 * Just continue execution with the next instruction. We use 385 * the HLT VM exit as a way to be friendly with the host 386 * scheduler. 387 */ 388 return (VMEXIT_CONTINUE); 389 } 390} 391 392static int pause_noswitch; 393 394static int 395vmexit_pause(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) 396{ 397 stats.vmexit_pause++; 398 399 if (fbsdrun_muxed() && !pause_noswitch) { 400 *pvcpu = -1; 401 return (VMEXIT_SWITCH); 402 } else { 403 return (VMEXIT_CONTINUE); 404 } 405} 406 407static int 408vmexit_mtrap(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) 409{ 410 stats.vmexit_mtrap++; 411 412 return (VMEXIT_RESTART); 413} 414 415static void 416sigalrm(int sig) 417{ 418 return; 419} 420 421static void 422setup_timeslice(void) 423{ 424 struct sigaction sa; 425 struct itimerval itv; 426 int error; 427 428 /* 429 * Setup a realtime timer to generate a SIGALRM at a 430 * frequency of 'guest_tslice' ticks per second. 
431 */ 432 sigemptyset(&sa.sa_mask); 433 sa.sa_flags = 0; 434 sa.sa_handler = sigalrm; 435 436 error = sigaction(SIGALRM, &sa, NULL); 437 assert(error == 0); 438 439 itv.it_interval.tv_sec = 0; 440 itv.it_interval.tv_usec = 1000000 / guest_tslice; 441 itv.it_value.tv_sec = 0; 442 itv.it_value.tv_usec = 1000000 / guest_tslice; 443 444 error = setitimer(ITIMER_REAL, &itv, NULL); 445 assert(error == 0); 446} 447 448static vmexit_handler_t handler[VM_EXITCODE_MAX] = { 449 [VM_EXITCODE_INOUT] = vmexit_inout, 450 [VM_EXITCODE_VMX] = vmexit_vmx, 451 [VM_EXITCODE_BOGUS] = vmexit_bogus, 452 [VM_EXITCODE_RDMSR] = vmexit_rdmsr, 453 [VM_EXITCODE_WRMSR] = vmexit_wrmsr, 454 [VM_EXITCODE_MTRAP] = vmexit_mtrap, 455}; 456 457static void 458vm_loop(struct vmctx *ctx, int vcpu, uint64_t rip) 459{ 460 int error, rc, prevcpu; 461 462 if (guest_vcpu_mux) 463 setup_timeslice(); 464 465 if (pincpu >= 0) { 466 error = vm_set_pinning(ctx, vcpu, pincpu + vcpu); 467 assert(error == 0); 468 } 469 470 while (1) { 471 error = vm_run(ctx, vcpu, rip, &vmexit[vcpu]); 472 if (error != 0) 473 break; 474 475 prevcpu = vcpu; 476 rc = (*handler[vmexit[vcpu].exitcode])(ctx, &vmexit[vcpu], 477 &vcpu); 478 switch (rc) { 479 case VMEXIT_SWITCH: 480 assert(guest_vcpu_mux); 481 if (vcpu == -1) { 482 stats.cpu_switch_rotate++; 483 vcpu = fbsdrun_get_next_cpu(prevcpu); 484 } else { 485 stats.cpu_switch_direct++; 486 } 487 /* fall through */ 488 case VMEXIT_CONTINUE: 489 rip = vmexit[vcpu].rip + vmexit[vcpu].inst_length; 490 break; 491 case VMEXIT_RESTART: 492 rip = vmexit[vcpu].rip; 493 break; 494 case VMEXIT_RESET: 495 exit(0); 496 default: 497 exit(1); 498 } 499 } 500 fprintf(stderr, "vm_run error %d, errno %d\n", error, errno); 501} 502 503 504int 505main(int argc, char *argv[]) 506{ 507 int c, error, gdb_port, inject_bkpt, tmp, err; 508 struct vmctx *ctx; 509 uint64_t rip; 510 511 inject_bkpt = 0; 512 progname = basename(argv[0]); 513 gdb_port = DEFAULT_GDB_PORT; 514 guest_ncpus = 1; 515 516 while ((c = 
getopt(argc, argv, "ehBHPxp:g:c:z:s:n:m:M:")) != -1) { 517 switch (c) { 518 case 'B': 519 inject_bkpt = 1; 520 break; 521 case 'x': 522 guest_vcpu_mux = 1; 523 break; 524 case 'p': 525 pincpu = atoi(optarg); 526 break; 527 case 'c': 528 guest_ncpus = atoi(optarg); 529 break; 530 case 'g': 531 gdb_port = atoi(optarg); 532 break; 533 case 'z': 534 guest_hz = atoi(optarg); 535 break; 536 case 't': 537 guest_tslice = atoi(optarg); 538 break; 539 case 's': 540 pci_parse_slot(optarg); 541 break; 542 case 'n': 543 pci_parse_name(optarg); 544 break; 545 case 'm': 546 lomem_sz = strtoul(optarg, NULL, 0) * MB; 547 break; 548 case 'M': 549 himem_sz = strtoul(optarg, NULL, 0) * MB; 550 break; 551 case 'H': 552 guest_vmexit_on_hlt = 1; 553 break; 554 case 'P': 555 guest_vmexit_on_pause = 1; 556 break; 557 case 'e': 558 strictio = 1; 559 break; 560 case 'h': 561 usage(0); 562 default: 563 usage(1); 564 } 565 } 566 argc -= optind; 567 argv += optind; 568 569 if (argc != 1) 570 usage(1); 571 572 /* No need to mux if guest is uni-processor */ 573 if (guest_ncpus <= 1) 574 guest_vcpu_mux = 0; 575 576 /* vmexit on hlt if guest is muxed */ 577 if (guest_vcpu_mux) { 578 guest_vmexit_on_hlt = 1; 579 guest_vmexit_on_pause = 1; 580 } 581 582 vmname = argv[0]; 583 584 ctx = vm_open(vmname); 585 if (ctx == NULL) { 586 perror("vm_open"); 587 exit(1); 588 } 589 590 if (fbsdrun_vmexit_on_hlt()) { 591 err = vm_get_capability(ctx, BSP, VM_CAP_HALT_EXIT, &tmp); 592 if (err < 0) { 593 printf("VM exit on HLT not supported\n"); 594 exit(1); 595 } 596 vm_set_capability(ctx, BSP, VM_CAP_HALT_EXIT, 1); 597 handler[VM_EXITCODE_HLT] = vmexit_hlt; 598 } 599 600 if (fbsdrun_vmexit_on_pause()) { 601 /* 602 * pause exit support required for this mode 603 */ 604 err = vm_get_capability(ctx, BSP, VM_CAP_PAUSE_EXIT, &tmp); 605 if (err < 0) { 606 printf("SMP mux requested, no pause support\n"); 607 exit(1); 608 } 609 vm_set_capability(ctx, BSP, VM_CAP_PAUSE_EXIT, 1); 610 handler[VM_EXITCODE_PAUSE] = 
vmexit_pause; 611 } 612 613 if (lomem_sz != 0) { 614 lomem_addr = vm_map_memory(ctx, 0, lomem_sz); 615 if (lomem_addr == (char *) MAP_FAILED) { 616 lomem_sz = 0; 617 } else if (himem_sz != 0) { 618 himem_addr = vm_map_memory(ctx, 4*GB, himem_sz); 619 if (himem_addr == (char *) MAP_FAILED) { 620 lomem_sz = 0; 621 himem_sz = 0; 622 } 623 } 624 } 625 626 init_inout(); 627 init_pci(ctx); 628 629 if (gdb_port != 0) 630 init_dbgport(gdb_port); 631 632 error = vm_get_register(ctx, BSP, VM_REG_GUEST_RIP, &rip); 633 assert(error == 0); 634 635 if (inject_bkpt) { 636 error = vm_inject_event(ctx, BSP, VM_HW_EXCEPTION, IDT_BP); 637 assert(error == 0); 638 } 639 640 /* 641 * build the guest tables, MP etc. 642 */ 643 vm_build_tables(ctx, guest_ncpus, oem_tbl_start, oem_tbl_size); 644 645 /* 646 * Add CPU 0 647 */ 648 fbsdrun_addcpu(ctx, BSP, rip); 649 650 /* 651 * Head off to the main event dispatch loop 652 */ 653 mevent_dispatch(); 654 655 exit(1); 656} 657