/* bhyverun.c revision 240943 */
1/*- 2 * Copyright (c) 2011 NetApp, Inc. 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND 15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17 * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE 18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 24 * SUCH DAMAGE. 
25 * 26 * $FreeBSD$ 27 */ 28 29#include <sys/cdefs.h> 30__FBSDID("$FreeBSD$"); 31 32#include <sys/types.h> 33#include <sys/mman.h> 34#include <sys/time.h> 35 36#include <machine/segments.h> 37 38#include <stdio.h> 39#include <stdlib.h> 40#include <libgen.h> 41#include <unistd.h> 42#include <assert.h> 43#include <errno.h> 44#include <signal.h> 45#include <pthread.h> 46 47#include <machine/vmm.h> 48#include <vmmapi.h> 49 50#include "fbsdrun.h" 51#include "inout.h" 52#include "dbgport.h" 53#include "mevent.h" 54#include "pci_emul.h" 55#include "xmsr.h" 56#include "instruction_emul.h" 57#include "ioapic.h" 58#include "spinup_ap.h" 59 60#define DEFAULT_GUEST_HZ 100 61#define DEFAULT_GUEST_TSLICE 200 62 63#define GUEST_NIO_PORT 0x488 /* guest upcalls via i/o port */ 64 65#define VMEXIT_SWITCH 0 /* force vcpu switch in mux mode */ 66#define VMEXIT_CONTINUE 1 /* continue from next instruction */ 67#define VMEXIT_RESTART 2 /* restart current instruction */ 68#define VMEXIT_ABORT 3 /* abort the vm run loop */ 69#define VMEXIT_RESET 4 /* guest machine has reset */ 70 71#define MB (1024UL * 1024) 72#define GB (1024UL * MB) 73 74typedef int (*vmexit_handler_t)(struct vmctx *, struct vm_exit *, int *vcpu); 75 76int guest_tslice = DEFAULT_GUEST_TSLICE; 77int guest_hz = DEFAULT_GUEST_HZ; 78char *vmname; 79 80u_long lomem_sz; 81u_long himem_sz; 82 83int guest_ncpus; 84 85static int pincpu = -1; 86static int guest_vcpu_mux; 87static int guest_vmexit_on_hlt, guest_vmexit_on_pause, disable_x2apic; 88 89static int foundcpus; 90 91static int strictio; 92 93static char *lomem_addr; 94static char *himem_addr; 95 96static char *progname; 97static const int BSP = 0; 98 99static int cpumask; 100 101static void *oem_tbl_start; 102static int oem_tbl_size; 103 104static void vm_loop(struct vmctx *ctx, int vcpu, uint64_t rip); 105 106struct vm_exit vmexit[VM_MAXCPU]; 107 108struct fbsdstats { 109 uint64_t vmexit_bogus; 110 uint64_t vmexit_bogus_switch; 111 uint64_t vmexit_hlt; 112 uint64_t 
vmexit_pause; 113 uint64_t vmexit_mtrap; 114 uint64_t vmexit_paging; 115 uint64_t cpu_switch_rotate; 116 uint64_t cpu_switch_direct; 117 int io_reset; 118} stats; 119 120struct mt_vmm_info { 121 pthread_t mt_thr; 122 struct vmctx *mt_ctx; 123 int mt_vcpu; 124} mt_vmm_info[VM_MAXCPU]; 125 126static void 127usage(int code) 128{ 129 130 fprintf(stderr, 131 "Usage: %s [-aehBHIP][-g <gdb port>][-z <hz>][-s <pci>]" 132 "[-S <pci>][-p pincpu][-n <pci>][-m lowmem][-M highmem] <vm>\n" 133 " -a: local apic is in XAPIC mode (default is X2APIC)\n" 134 " -g: gdb port (default is %d and 0 means don't open)\n" 135 " -c: # cpus (default 1)\n" 136 " -p: pin vcpu 'n' to host cpu 'pincpu + n'\n" 137 " -B: inject breakpoint exception on vm entry\n" 138 " -H: vmexit from the guest on hlt\n" 139 " -I: present an ioapic to the guest\n" 140 " -P: vmexit from the guest on pause\n" 141 " -e: exit on unhandled i/o access\n" 142 " -h: help\n" 143 " -z: guest hz (default is %d)\n" 144 " -s: <slot,driver,configinfo> PCI slot config\n" 145 " -S: <slot,driver,configinfo> legacy PCI slot config\n" 146 " -n: <slot,name> PCI slot naming\n" 147 " -m: lowmem in MB\n" 148 " -M: highmem in MB\n" 149 " -x: mux vcpus to 1 hcpu\n" 150 " -t: mux vcpu timeslice hz (default %d)\n", 151 progname, DEFAULT_GDB_PORT, DEFAULT_GUEST_HZ, 152 DEFAULT_GUEST_TSLICE); 153 exit(code); 154} 155 156void * 157paddr_guest2host(uintptr_t gaddr) 158{ 159 if (lomem_sz == 0) 160 return (NULL); 161 162 if (gaddr < lomem_sz) { 163 return ((void *)(lomem_addr + gaddr)); 164 } else if (gaddr >= 4*GB && gaddr < (4*GB + himem_sz)) { 165 return ((void *)(himem_addr + gaddr - 4*GB)); 166 } else 167 return (NULL); 168} 169 170void 171fbsdrun_add_oemtbl(void *tbl, int tblsz) 172{ 173 oem_tbl_start = tbl; 174 oem_tbl_size = tblsz; 175} 176 177int 178fbsdrun_disable_x2apic(void) 179{ 180 181 return (disable_x2apic); 182} 183 184int 185fbsdrun_vmexit_on_pause(void) 186{ 187 188 return (guest_vmexit_on_pause); 189} 190 191int 
192fbsdrun_vmexit_on_hlt(void) 193{ 194 195 return (guest_vmexit_on_hlt); 196} 197 198int 199fbsdrun_muxed(void) 200{ 201 202 return (guest_vcpu_mux); 203} 204 205static void * 206fbsdrun_start_thread(void *param) 207{ 208 int vcpu; 209 struct mt_vmm_info *mtp = param; 210 211 vcpu = mtp->mt_vcpu; 212 vm_loop(mtp->mt_ctx, vcpu, vmexit[vcpu].rip); 213 214 /* not reached */ 215 exit(1); 216 return (NULL); 217} 218 219void 220fbsdrun_addcpu(struct vmctx *ctx, int vcpu, uint64_t rip) 221{ 222 int error; 223 224 if (cpumask & (1 << vcpu)) { 225 printf("addcpu: attempting to add existing cpu %d\n", vcpu); 226 exit(1); 227 } 228 229 cpumask |= 1 << vcpu; 230 foundcpus++; 231 232 /* 233 * Set up the vmexit struct to allow execution to start 234 * at the given RIP 235 */ 236 vmexit[vcpu].rip = rip; 237 vmexit[vcpu].inst_length = 0; 238 239 if (vcpu == BSP || !guest_vcpu_mux){ 240 mt_vmm_info[vcpu].mt_ctx = ctx; 241 mt_vmm_info[vcpu].mt_vcpu = vcpu; 242 243 error = pthread_create(&mt_vmm_info[vcpu].mt_thr, NULL, 244 fbsdrun_start_thread, &mt_vmm_info[vcpu]); 245 assert(error == 0); 246 } 247} 248 249static int 250fbsdrun_get_next_cpu(int curcpu) 251{ 252 253 /* 254 * Get the next available CPU. Assumes they arrive 255 * in ascending order with no gaps. 
256 */ 257 return ((curcpu + 1) % foundcpus); 258} 259 260static int 261vmexit_catch_reset(void) 262{ 263 stats.io_reset++; 264 return (VMEXIT_RESET); 265} 266 267static int 268vmexit_catch_inout(void) 269{ 270 return (VMEXIT_ABORT); 271} 272 273static int 274vmexit_handle_notify(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu, 275 uint32_t eax) 276{ 277#if PG_DEBUG /* put all types of debug here */ 278 if (eax == 0) { 279 pause_noswitch = 1; 280 } else if (eax == 1) { 281 pause_noswitch = 0; 282 } else { 283 pause_noswitch = 0; 284 if (eax == 5) { 285 vm_set_capability(ctx, *pvcpu, VM_CAP_MTRAP_EXIT, 1); 286 } 287 } 288#endif 289 return (VMEXIT_CONTINUE); 290} 291 292static int 293vmexit_inout(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) 294{ 295 int error; 296 int bytes, port, in, out; 297 uint32_t eax; 298 int vcpu; 299 300 vcpu = *pvcpu; 301 302 port = vme->u.inout.port; 303 bytes = vme->u.inout.bytes; 304 eax = vme->u.inout.eax; 305 in = vme->u.inout.in; 306 out = !in; 307 308 /* We don't deal with these */ 309 if (vme->u.inout.string || vme->u.inout.rep) 310 return (VMEXIT_ABORT); 311 312 /* Special case of guest reset */ 313 if (out && port == 0x64 && (uint8_t)eax == 0xFE) 314 return (vmexit_catch_reset()); 315 316 /* Extra-special case of host notifications */ 317 if (out && port == GUEST_NIO_PORT) 318 return (vmexit_handle_notify(ctx, vme, pvcpu, eax)); 319 320 error = emulate_inout(ctx, vcpu, in, port, bytes, &eax, strictio); 321 if (error == 0 && in) 322 error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RAX, eax); 323 324 if (error == 0) 325 return (VMEXIT_CONTINUE); 326 else { 327 fprintf(stderr, "Unhandled %s%c 0x%04x\n", 328 in ? "in" : "out", 329 bytes == 1 ? 'b' : (bytes == 2 ? 
'w' : 'l'), port); 330 return (vmexit_catch_inout()); 331 } 332} 333 334static int 335vmexit_rdmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) 336{ 337 printf("vm exit rdmsr 0x%x, cpu %d\n", vme->u.msr.code, *pvcpu); 338 return (VMEXIT_ABORT); 339} 340 341static int 342vmexit_wrmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) 343{ 344 int newcpu; 345 int retval = VMEXIT_CONTINUE; 346 347 newcpu = emulate_wrmsr(ctx, *pvcpu, vme->u.msr.code,vme->u.msr.wval); 348 349 if (guest_vcpu_mux && *pvcpu != newcpu) { 350 retval = VMEXIT_SWITCH; 351 *pvcpu = newcpu; 352 } 353 354 return (retval); 355} 356 357static int 358vmexit_spinup_ap(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) 359{ 360 int newcpu; 361 int retval = VMEXIT_CONTINUE; 362 363 newcpu = spinup_ap(ctx, *pvcpu, 364 vme->u.spinup_ap.vcpu, vme->u.spinup_ap.rip); 365 366 if (guest_vcpu_mux && *pvcpu != newcpu) { 367 retval = VMEXIT_SWITCH; 368 *pvcpu = newcpu; 369 } 370 371 return (retval); 372} 373 374static int 375vmexit_vmx(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) 376{ 377 378 printf("vm exit[%d]\n", *pvcpu); 379 printf("\treason\t\tVMX\n"); 380 printf("\trip\t\t0x%016lx\n", vmexit->rip); 381 printf("\tinst_length\t%d\n", vmexit->inst_length); 382 printf("\terror\t\t%d\n", vmexit->u.vmx.error); 383 printf("\texit_reason\t%u\n", vmexit->u.vmx.exit_reason); 384 printf("\tqualification\t0x%016lx\n", vmexit->u.vmx.exit_qualification); 385 386 return (VMEXIT_ABORT); 387} 388 389static int bogus_noswitch = 1; 390 391static int 392vmexit_bogus(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) 393{ 394 stats.vmexit_bogus++; 395 396 if (!guest_vcpu_mux || guest_ncpus == 1 || bogus_noswitch) { 397 return (VMEXIT_RESTART); 398 } else { 399 stats.vmexit_bogus_switch++; 400 vmexit->inst_length = 0; 401 *pvcpu = -1; 402 return (VMEXIT_SWITCH); 403 } 404} 405 406static int 407vmexit_hlt(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) 408{ 409 stats.vmexit_hlt++; 410 if 
(fbsdrun_muxed()) { 411 *pvcpu = -1; 412 return (VMEXIT_SWITCH); 413 } else { 414 /* 415 * Just continue execution with the next instruction. We use 416 * the HLT VM exit as a way to be friendly with the host 417 * scheduler. 418 */ 419 return (VMEXIT_CONTINUE); 420 } 421} 422 423static int pause_noswitch; 424 425static int 426vmexit_pause(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) 427{ 428 stats.vmexit_pause++; 429 430 if (fbsdrun_muxed() && !pause_noswitch) { 431 *pvcpu = -1; 432 return (VMEXIT_SWITCH); 433 } else { 434 return (VMEXIT_CONTINUE); 435 } 436} 437 438static int 439vmexit_mtrap(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) 440{ 441 stats.vmexit_mtrap++; 442 443 return (VMEXIT_RESTART); 444} 445 446static int 447vmexit_paging(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) 448{ 449 450 stats.vmexit_paging++; 451 452 if (emulate_instruction(ctx, *pvcpu, vmexit->rip, vmexit->u.paging.cr3) != 0) { 453 printf("Failed to emulate instruction at 0x%lx\n", vmexit->rip); 454 return (VMEXIT_ABORT); 455 } 456 457 return (VMEXIT_CONTINUE); 458} 459 460static void 461sigalrm(int sig) 462{ 463 return; 464} 465 466static void 467setup_timeslice(void) 468{ 469 struct sigaction sa; 470 struct itimerval itv; 471 int error; 472 473 /* 474 * Setup a realtime timer to generate a SIGALRM at a 475 * frequency of 'guest_tslice' ticks per second. 
476 */ 477 sigemptyset(&sa.sa_mask); 478 sa.sa_flags = 0; 479 sa.sa_handler = sigalrm; 480 481 error = sigaction(SIGALRM, &sa, NULL); 482 assert(error == 0); 483 484 itv.it_interval.tv_sec = 0; 485 itv.it_interval.tv_usec = 1000000 / guest_tslice; 486 itv.it_value.tv_sec = 0; 487 itv.it_value.tv_usec = 1000000 / guest_tslice; 488 489 error = setitimer(ITIMER_REAL, &itv, NULL); 490 assert(error == 0); 491} 492 493static vmexit_handler_t handler[VM_EXITCODE_MAX] = { 494 [VM_EXITCODE_INOUT] = vmexit_inout, 495 [VM_EXITCODE_VMX] = vmexit_vmx, 496 [VM_EXITCODE_BOGUS] = vmexit_bogus, 497 [VM_EXITCODE_RDMSR] = vmexit_rdmsr, 498 [VM_EXITCODE_WRMSR] = vmexit_wrmsr, 499 [VM_EXITCODE_MTRAP] = vmexit_mtrap, 500 [VM_EXITCODE_PAGING] = vmexit_paging, 501 [VM_EXITCODE_SPINUP_AP] = vmexit_spinup_ap, 502}; 503 504static void 505vm_loop(struct vmctx *ctx, int vcpu, uint64_t rip) 506{ 507 int error, rc, prevcpu; 508 509 if (guest_vcpu_mux) 510 setup_timeslice(); 511 512 if (pincpu >= 0) { 513 error = vm_set_pinning(ctx, vcpu, pincpu + vcpu); 514 assert(error == 0); 515 } 516 517 while (1) { 518 error = vm_run(ctx, vcpu, rip, &vmexit[vcpu]); 519 if (error != 0) 520 break; 521 522 prevcpu = vcpu; 523 rc = (*handler[vmexit[vcpu].exitcode])(ctx, &vmexit[vcpu], 524 &vcpu); 525 switch (rc) { 526 case VMEXIT_SWITCH: 527 assert(guest_vcpu_mux); 528 if (vcpu == -1) { 529 stats.cpu_switch_rotate++; 530 vcpu = fbsdrun_get_next_cpu(prevcpu); 531 } else { 532 stats.cpu_switch_direct++; 533 } 534 /* fall through */ 535 case VMEXIT_CONTINUE: 536 rip = vmexit[vcpu].rip + vmexit[vcpu].inst_length; 537 break; 538 case VMEXIT_RESTART: 539 rip = vmexit[vcpu].rip; 540 break; 541 case VMEXIT_RESET: 542 exit(0); 543 default: 544 exit(1); 545 } 546 } 547 fprintf(stderr, "vm_run error %d, errno %d\n", error, errno); 548} 549 550 551int 552main(int argc, char *argv[]) 553{ 554 int c, error, gdb_port, inject_bkpt, tmp, err, ioapic; 555 struct vmctx *ctx; 556 uint64_t rip; 557 558 inject_bkpt = 0; 559 progname 
= basename(argv[0]); 560 gdb_port = DEFAULT_GDB_PORT; 561 guest_ncpus = 1; 562 ioapic = 0; 563 564 while ((c = getopt(argc, argv, "aehBHIPxp:g:c:z:s:S:n:m:M:")) != -1) { 565 switch (c) { 566 case 'a': 567 disable_x2apic = 1; 568 break; 569 case 'B': 570 inject_bkpt = 1; 571 break; 572 case 'x': 573 guest_vcpu_mux = 1; 574 break; 575 case 'p': 576 pincpu = atoi(optarg); 577 break; 578 case 'c': 579 guest_ncpus = atoi(optarg); 580 break; 581 case 'g': 582 gdb_port = atoi(optarg); 583 break; 584 case 'z': 585 guest_hz = atoi(optarg); 586 break; 587 case 't': 588 guest_tslice = atoi(optarg); 589 break; 590 case 's': 591 pci_parse_slot(optarg, 0); 592 break; 593 case 'S': 594 pci_parse_slot(optarg, 1); 595 break; 596 case 'n': 597 pci_parse_name(optarg); 598 break; 599 case 'm': 600 lomem_sz = strtoul(optarg, NULL, 0) * MB; 601 break; 602 case 'M': 603 himem_sz = strtoul(optarg, NULL, 0) * MB; 604 break; 605 case 'H': 606 guest_vmexit_on_hlt = 1; 607 break; 608 case 'I': 609 ioapic = 1; 610 break; 611 case 'P': 612 guest_vmexit_on_pause = 1; 613 break; 614 case 'e': 615 strictio = 1; 616 break; 617 case 'h': 618 usage(0); 619 default: 620 usage(1); 621 } 622 } 623 argc -= optind; 624 argv += optind; 625 626 if (argc != 1) 627 usage(1); 628 629 /* No need to mux if guest is uni-processor */ 630 if (guest_ncpus <= 1) 631 guest_vcpu_mux = 0; 632 633 /* vmexit on hlt if guest is muxed */ 634 if (guest_vcpu_mux) { 635 guest_vmexit_on_hlt = 1; 636 guest_vmexit_on_pause = 1; 637 } 638 639 vmname = argv[0]; 640 641 ctx = vm_open(vmname); 642 if (ctx == NULL) { 643 perror("vm_open"); 644 exit(1); 645 } 646 647 if (fbsdrun_vmexit_on_hlt()) { 648 err = vm_get_capability(ctx, BSP, VM_CAP_HALT_EXIT, &tmp); 649 if (err < 0) { 650 printf("VM exit on HLT not supported\n"); 651 exit(1); 652 } 653 vm_set_capability(ctx, BSP, VM_CAP_HALT_EXIT, 1); 654 handler[VM_EXITCODE_HLT] = vmexit_hlt; 655 } 656 657 if (fbsdrun_vmexit_on_pause()) { 658 /* 659 * pause exit support required for this 
mode 660 */ 661 err = vm_get_capability(ctx, BSP, VM_CAP_PAUSE_EXIT, &tmp); 662 if (err < 0) { 663 printf("SMP mux requested, no pause support\n"); 664 exit(1); 665 } 666 vm_set_capability(ctx, BSP, VM_CAP_PAUSE_EXIT, 1); 667 handler[VM_EXITCODE_PAUSE] = vmexit_pause; 668 } 669 670 if (fbsdrun_disable_x2apic()) 671 err = vm_set_x2apic_state(ctx, BSP, X2APIC_DISABLED); 672 else 673 err = vm_set_x2apic_state(ctx, BSP, X2APIC_ENABLED); 674 675 if (err) { 676 printf("Unable to set x2apic state (%d)\n", err); 677 exit(1); 678 } 679 680 if (lomem_sz != 0) { 681 lomem_addr = vm_map_memory(ctx, 0, lomem_sz); 682 if (lomem_addr == (char *) MAP_FAILED) { 683 lomem_sz = 0; 684 } else if (himem_sz != 0) { 685 himem_addr = vm_map_memory(ctx, 4*GB, himem_sz); 686 if (himem_addr == (char *) MAP_FAILED) { 687 lomem_sz = 0; 688 himem_sz = 0; 689 } 690 } 691 } 692 693 init_inout(); 694 init_pci(ctx); 695 if (ioapic) 696 ioapic_init(0); 697 698 if (gdb_port != 0) 699 init_dbgport(gdb_port); 700 701 error = vm_get_register(ctx, BSP, VM_REG_GUEST_RIP, &rip); 702 assert(error == 0); 703 704 if (inject_bkpt) { 705 error = vm_inject_event(ctx, BSP, VM_HW_EXCEPTION, IDT_BP); 706 assert(error == 0); 707 } 708 709 /* 710 * build the guest tables, MP etc. 711 */ 712 vm_build_tables(ctx, guest_ncpus, ioapic, oem_tbl_start, oem_tbl_size); 713 714 /* 715 * Add CPU 0 716 */ 717 fbsdrun_addcpu(ctx, BSP, rip); 718 719 /* 720 * Head off to the main event dispatch loop 721 */ 722 mevent_dispatch(); 723 724 exit(1); 725} 726