/* bhyverun.c revision 242131 */
1/*- 2 * Copyright (c) 2011 NetApp, Inc. 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND 15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17 * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE 18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 24 * SUCH DAMAGE. 
25 * 26 * $FreeBSD$ 27 */ 28 29#include <sys/cdefs.h> 30__FBSDID("$FreeBSD$"); 31 32#include <sys/types.h> 33#include <sys/mman.h> 34#include <sys/time.h> 35 36#include <machine/segments.h> 37 38#include <stdio.h> 39#include <stdlib.h> 40#include <libgen.h> 41#include <unistd.h> 42#include <assert.h> 43#include <errno.h> 44#include <signal.h> 45#include <pthread.h> 46 47#include <machine/vmm.h> 48#include <vmmapi.h> 49 50#include "fbsdrun.h" 51#include "inout.h" 52#include "dbgport.h" 53#include "mem.h" 54#include "mevent.h" 55#include "mptbl.h" 56#include "pci_emul.h" 57#include "xmsr.h" 58#include "instruction_emul.h" 59#include "ioapic.h" 60#include "spinup_ap.h" 61 62#define DEFAULT_GUEST_HZ 100 63#define DEFAULT_GUEST_TSLICE 200 64 65#define GUEST_NIO_PORT 0x488 /* guest upcalls via i/o port */ 66 67#define VMEXIT_SWITCH 0 /* force vcpu switch in mux mode */ 68#define VMEXIT_CONTINUE 1 /* continue from next instruction */ 69#define VMEXIT_RESTART 2 /* restart current instruction */ 70#define VMEXIT_ABORT 3 /* abort the vm run loop */ 71#define VMEXIT_RESET 4 /* guest machine has reset */ 72 73#define MB (1024UL * 1024) 74#define GB (1024UL * MB) 75 76typedef int (*vmexit_handler_t)(struct vmctx *, struct vm_exit *, int *vcpu); 77 78int guest_tslice = DEFAULT_GUEST_TSLICE; 79int guest_hz = DEFAULT_GUEST_HZ; 80char *vmname; 81 82u_long lomem_sz; 83u_long himem_sz; 84 85int guest_ncpus; 86 87static int pincpu = -1; 88static int guest_vcpu_mux; 89static int guest_vmexit_on_hlt, guest_vmexit_on_pause, disable_x2apic; 90 91static int foundcpus; 92 93static int strictio; 94 95static char *lomem_addr; 96static char *himem_addr; 97 98static char *progname; 99static const int BSP = 0; 100 101static int cpumask; 102 103static void vm_loop(struct vmctx *ctx, int vcpu, uint64_t rip); 104 105struct vm_exit vmexit[VM_MAXCPU]; 106 107struct fbsdstats { 108 uint64_t vmexit_bogus; 109 uint64_t vmexit_bogus_switch; 110 uint64_t vmexit_hlt; 111 uint64_t vmexit_pause; 112 uint64_t 
vmexit_mtrap; 113 uint64_t vmexit_paging; 114 uint64_t cpu_switch_rotate; 115 uint64_t cpu_switch_direct; 116 int io_reset; 117} stats; 118 119struct mt_vmm_info { 120 pthread_t mt_thr; 121 struct vmctx *mt_ctx; 122 int mt_vcpu; 123} mt_vmm_info[VM_MAXCPU]; 124 125static void 126usage(int code) 127{ 128 129 fprintf(stderr, 130 "Usage: %s [-aehBHIP][-g <gdb port>][-z <hz>][-s <pci>]" 131 "[-S <pci>][-p pincpu][-n <pci>][-m lowmem][-M highmem] <vm>\n" 132 " -a: local apic is in XAPIC mode (default is X2APIC)\n" 133 " -g: gdb port (default is %d and 0 means don't open)\n" 134 " -c: # cpus (default 1)\n" 135 " -p: pin vcpu 'n' to host cpu 'pincpu + n'\n" 136 " -B: inject breakpoint exception on vm entry\n" 137 " -H: vmexit from the guest on hlt\n" 138 " -I: present an ioapic to the guest\n" 139 " -P: vmexit from the guest on pause\n" 140 " -e: exit on unhandled i/o access\n" 141 " -h: help\n" 142 " -z: guest hz (default is %d)\n" 143 " -s: <slot,driver,configinfo> PCI slot config\n" 144 " -S: <slot,driver,configinfo> legacy PCI slot config\n" 145 " -m: lowmem in MB\n" 146 " -M: highmem in MB\n" 147 " -x: mux vcpus to 1 hcpu\n" 148 " -t: mux vcpu timeslice hz (default %d)\n", 149 progname, DEFAULT_GDB_PORT, DEFAULT_GUEST_HZ, 150 DEFAULT_GUEST_TSLICE); 151 exit(code); 152} 153 154void * 155paddr_guest2host(uintptr_t gaddr) 156{ 157 if (lomem_sz == 0) 158 return (NULL); 159 160 if (gaddr < lomem_sz) { 161 return ((void *)(lomem_addr + gaddr)); 162 } else if (gaddr >= 4*GB && gaddr < (4*GB + himem_sz)) { 163 return ((void *)(himem_addr + gaddr - 4*GB)); 164 } else 165 return (NULL); 166} 167 168int 169fbsdrun_disable_x2apic(void) 170{ 171 172 return (disable_x2apic); 173} 174 175int 176fbsdrun_vmexit_on_pause(void) 177{ 178 179 return (guest_vmexit_on_pause); 180} 181 182int 183fbsdrun_vmexit_on_hlt(void) 184{ 185 186 return (guest_vmexit_on_hlt); 187} 188 189int 190fbsdrun_muxed(void) 191{ 192 193 return (guest_vcpu_mux); 194} 195 196static void * 
197fbsdrun_start_thread(void *param) 198{ 199 int vcpu; 200 struct mt_vmm_info *mtp = param; 201 202 vcpu = mtp->mt_vcpu; 203 vm_loop(mtp->mt_ctx, vcpu, vmexit[vcpu].rip); 204 205 /* not reached */ 206 exit(1); 207 return (NULL); 208} 209 210void 211fbsdrun_addcpu(struct vmctx *ctx, int vcpu, uint64_t rip) 212{ 213 int error; 214 215 if (cpumask & (1 << vcpu)) { 216 printf("addcpu: attempting to add existing cpu %d\n", vcpu); 217 exit(1); 218 } 219 220 cpumask |= 1 << vcpu; 221 foundcpus++; 222 223 /* 224 * Set up the vmexit struct to allow execution to start 225 * at the given RIP 226 */ 227 vmexit[vcpu].rip = rip; 228 vmexit[vcpu].inst_length = 0; 229 230 if (vcpu == BSP || !guest_vcpu_mux){ 231 mt_vmm_info[vcpu].mt_ctx = ctx; 232 mt_vmm_info[vcpu].mt_vcpu = vcpu; 233 234 error = pthread_create(&mt_vmm_info[vcpu].mt_thr, NULL, 235 fbsdrun_start_thread, &mt_vmm_info[vcpu]); 236 assert(error == 0); 237 } 238} 239 240static int 241fbsdrun_get_next_cpu(int curcpu) 242{ 243 244 /* 245 * Get the next available CPU. Assumes they arrive 246 * in ascending order with no gaps. 
247 */ 248 return ((curcpu + 1) % foundcpus); 249} 250 251static int 252vmexit_catch_reset(void) 253{ 254 stats.io_reset++; 255 return (VMEXIT_RESET); 256} 257 258static int 259vmexit_catch_inout(void) 260{ 261 return (VMEXIT_ABORT); 262} 263 264static int 265vmexit_handle_notify(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu, 266 uint32_t eax) 267{ 268#if PG_DEBUG /* put all types of debug here */ 269 if (eax == 0) { 270 pause_noswitch = 1; 271 } else if (eax == 1) { 272 pause_noswitch = 0; 273 } else { 274 pause_noswitch = 0; 275 if (eax == 5) { 276 vm_set_capability(ctx, *pvcpu, VM_CAP_MTRAP_EXIT, 1); 277 } 278 } 279#endif 280 return (VMEXIT_CONTINUE); 281} 282 283static int 284vmexit_inout(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) 285{ 286 int error; 287 int bytes, port, in, out; 288 uint32_t eax; 289 int vcpu; 290 291 vcpu = *pvcpu; 292 293 port = vme->u.inout.port; 294 bytes = vme->u.inout.bytes; 295 eax = vme->u.inout.eax; 296 in = vme->u.inout.in; 297 out = !in; 298 299 /* We don't deal with these */ 300 if (vme->u.inout.string || vme->u.inout.rep) 301 return (VMEXIT_ABORT); 302 303 /* Special case of guest reset */ 304 if (out && port == 0x64 && (uint8_t)eax == 0xFE) 305 return (vmexit_catch_reset()); 306 307 /* Extra-special case of host notifications */ 308 if (out && port == GUEST_NIO_PORT) 309 return (vmexit_handle_notify(ctx, vme, pvcpu, eax)); 310 311 error = emulate_inout(ctx, vcpu, in, port, bytes, &eax, strictio); 312 if (error == 0 && in) 313 error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RAX, eax); 314 315 if (error == 0) 316 return (VMEXIT_CONTINUE); 317 else { 318 fprintf(stderr, "Unhandled %s%c 0x%04x\n", 319 in ? "in" : "out", 320 bytes == 1 ? 'b' : (bytes == 2 ? 
'w' : 'l'), port); 321 return (vmexit_catch_inout()); 322 } 323} 324 325static int 326vmexit_rdmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) 327{ 328 printf("vm exit rdmsr 0x%x, cpu %d\n", vme->u.msr.code, *pvcpu); 329 return (VMEXIT_ABORT); 330} 331 332static int 333vmexit_wrmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) 334{ 335 int newcpu; 336 int retval = VMEXIT_CONTINUE; 337 338 newcpu = emulate_wrmsr(ctx, *pvcpu, vme->u.msr.code,vme->u.msr.wval); 339 340 if (guest_vcpu_mux && *pvcpu != newcpu) { 341 retval = VMEXIT_SWITCH; 342 *pvcpu = newcpu; 343 } 344 345 return (retval); 346} 347 348static int 349vmexit_spinup_ap(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) 350{ 351 int newcpu; 352 int retval = VMEXIT_CONTINUE; 353 354 newcpu = spinup_ap(ctx, *pvcpu, 355 vme->u.spinup_ap.vcpu, vme->u.spinup_ap.rip); 356 357 if (guest_vcpu_mux && *pvcpu != newcpu) { 358 retval = VMEXIT_SWITCH; 359 *pvcpu = newcpu; 360 } 361 362 return (retval); 363} 364 365static int 366vmexit_vmx(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) 367{ 368 369 printf("vm exit[%d]\n", *pvcpu); 370 printf("\treason\t\tVMX\n"); 371 printf("\trip\t\t0x%016lx\n", vmexit->rip); 372 printf("\tinst_length\t%d\n", vmexit->inst_length); 373 printf("\terror\t\t%d\n", vmexit->u.vmx.error); 374 printf("\texit_reason\t%u\n", vmexit->u.vmx.exit_reason); 375 printf("\tqualification\t0x%016lx\n", vmexit->u.vmx.exit_qualification); 376 377 return (VMEXIT_ABORT); 378} 379 380static int bogus_noswitch = 1; 381 382static int 383vmexit_bogus(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) 384{ 385 stats.vmexit_bogus++; 386 387 if (!guest_vcpu_mux || guest_ncpus == 1 || bogus_noswitch) { 388 return (VMEXIT_RESTART); 389 } else { 390 stats.vmexit_bogus_switch++; 391 vmexit->inst_length = 0; 392 *pvcpu = -1; 393 return (VMEXIT_SWITCH); 394 } 395} 396 397static int 398vmexit_hlt(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) 399{ 400 stats.vmexit_hlt++; 401 if 
(fbsdrun_muxed()) { 402 *pvcpu = -1; 403 return (VMEXIT_SWITCH); 404 } else { 405 /* 406 * Just continue execution with the next instruction. We use 407 * the HLT VM exit as a way to be friendly with the host 408 * scheduler. 409 */ 410 return (VMEXIT_CONTINUE); 411 } 412} 413 414static int pause_noswitch; 415 416static int 417vmexit_pause(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) 418{ 419 stats.vmexit_pause++; 420 421 if (fbsdrun_muxed() && !pause_noswitch) { 422 *pvcpu = -1; 423 return (VMEXIT_SWITCH); 424 } else { 425 return (VMEXIT_CONTINUE); 426 } 427} 428 429static int 430vmexit_mtrap(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) 431{ 432 stats.vmexit_mtrap++; 433 434 return (VMEXIT_RESTART); 435} 436 437static int 438vmexit_paging(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) 439{ 440 int err; 441 stats.vmexit_paging++; 442 443 err = emulate_mem(ctx, *pvcpu, vmexit->u.paging.gpa, vmexit->rip, 444 vmexit->u.paging.cr3, vmexit->u.paging.rwx); 445 446 if (err) { 447 if (err == EINVAL) { 448 printf("Failed to emulate instruction at 0x%lx\n", 449 vmexit->rip); 450 } else if (err == ESRCH) { 451 printf("Unhandled memory access to 0x%lx\n", 452 vmexit->u.paging.gpa); 453 } 454 455 return (VMEXIT_ABORT); 456 } 457 458 return (VMEXIT_CONTINUE); 459} 460 461static void 462sigalrm(int sig) 463{ 464 return; 465} 466 467static void 468setup_timeslice(void) 469{ 470 struct sigaction sa; 471 struct itimerval itv; 472 int error; 473 474 /* 475 * Setup a realtime timer to generate a SIGALRM at a 476 * frequency of 'guest_tslice' ticks per second. 
477 */ 478 sigemptyset(&sa.sa_mask); 479 sa.sa_flags = 0; 480 sa.sa_handler = sigalrm; 481 482 error = sigaction(SIGALRM, &sa, NULL); 483 assert(error == 0); 484 485 itv.it_interval.tv_sec = 0; 486 itv.it_interval.tv_usec = 1000000 / guest_tslice; 487 itv.it_value.tv_sec = 0; 488 itv.it_value.tv_usec = 1000000 / guest_tslice; 489 490 error = setitimer(ITIMER_REAL, &itv, NULL); 491 assert(error == 0); 492} 493 494static vmexit_handler_t handler[VM_EXITCODE_MAX] = { 495 [VM_EXITCODE_INOUT] = vmexit_inout, 496 [VM_EXITCODE_VMX] = vmexit_vmx, 497 [VM_EXITCODE_BOGUS] = vmexit_bogus, 498 [VM_EXITCODE_RDMSR] = vmexit_rdmsr, 499 [VM_EXITCODE_WRMSR] = vmexit_wrmsr, 500 [VM_EXITCODE_MTRAP] = vmexit_mtrap, 501 [VM_EXITCODE_PAGING] = vmexit_paging, 502 [VM_EXITCODE_SPINUP_AP] = vmexit_spinup_ap, 503}; 504 505static void 506vm_loop(struct vmctx *ctx, int vcpu, uint64_t rip) 507{ 508 int error, rc, prevcpu; 509 510 if (guest_vcpu_mux) 511 setup_timeslice(); 512 513 if (pincpu >= 0) { 514 error = vm_set_pinning(ctx, vcpu, pincpu + vcpu); 515 assert(error == 0); 516 } 517 518 while (1) { 519 error = vm_run(ctx, vcpu, rip, &vmexit[vcpu]); 520 if (error != 0) { 521 /* 522 * It is possible that 'vmmctl' or some other process 523 * has transitioned the vcpu to CANNOT_RUN state right 524 * before we tried to transition it to RUNNING. 525 * 526 * This is expected to be temporary so just retry. 
527 */ 528 if (errno == EBUSY) 529 continue; 530 else 531 break; 532 } 533 534 prevcpu = vcpu; 535 rc = (*handler[vmexit[vcpu].exitcode])(ctx, &vmexit[vcpu], 536 &vcpu); 537 switch (rc) { 538 case VMEXIT_SWITCH: 539 assert(guest_vcpu_mux); 540 if (vcpu == -1) { 541 stats.cpu_switch_rotate++; 542 vcpu = fbsdrun_get_next_cpu(prevcpu); 543 } else { 544 stats.cpu_switch_direct++; 545 } 546 /* fall through */ 547 case VMEXIT_CONTINUE: 548 rip = vmexit[vcpu].rip + vmexit[vcpu].inst_length; 549 break; 550 case VMEXIT_RESTART: 551 rip = vmexit[vcpu].rip; 552 break; 553 case VMEXIT_RESET: 554 exit(0); 555 default: 556 exit(1); 557 } 558 } 559 fprintf(stderr, "vm_run error %d, errno %d\n", error, errno); 560} 561 562 563int 564main(int argc, char *argv[]) 565{ 566 int c, error, gdb_port, inject_bkpt, tmp, err, ioapic; 567 struct vmctx *ctx; 568 uint64_t rip; 569 570 inject_bkpt = 0; 571 progname = basename(argv[0]); 572 gdb_port = DEFAULT_GDB_PORT; 573 guest_ncpus = 1; 574 ioapic = 0; 575 576 while ((c = getopt(argc, argv, "aehBHIPxp:g:c:z:s:S:n:m:M:")) != -1) { 577 switch (c) { 578 case 'a': 579 disable_x2apic = 1; 580 break; 581 case 'B': 582 inject_bkpt = 1; 583 break; 584 case 'x': 585 guest_vcpu_mux = 1; 586 break; 587 case 'p': 588 pincpu = atoi(optarg); 589 break; 590 case 'c': 591 guest_ncpus = atoi(optarg); 592 break; 593 case 'g': 594 gdb_port = atoi(optarg); 595 break; 596 case 'z': 597 guest_hz = atoi(optarg); 598 break; 599 case 't': 600 guest_tslice = atoi(optarg); 601 break; 602 case 's': 603 pci_parse_slot(optarg, 0); 604 break; 605 case 'S': 606 pci_parse_slot(optarg, 1); 607 break; 608 case 'm': 609 lomem_sz = strtoul(optarg, NULL, 0) * MB; 610 break; 611 case 'M': 612 himem_sz = strtoul(optarg, NULL, 0) * MB; 613 break; 614 case 'H': 615 guest_vmexit_on_hlt = 1; 616 break; 617 case 'I': 618 ioapic = 1; 619 break; 620 case 'P': 621 guest_vmexit_on_pause = 1; 622 break; 623 case 'e': 624 strictio = 1; 625 break; 626 case 'h': 627 usage(0); 628 default: 629 
usage(1); 630 } 631 } 632 argc -= optind; 633 argv += optind; 634 635 if (argc != 1) 636 usage(1); 637 638 /* No need to mux if guest is uni-processor */ 639 if (guest_ncpus <= 1) 640 guest_vcpu_mux = 0; 641 642 /* vmexit on hlt if guest is muxed */ 643 if (guest_vcpu_mux) { 644 guest_vmexit_on_hlt = 1; 645 guest_vmexit_on_pause = 1; 646 } 647 648 vmname = argv[0]; 649 650 ctx = vm_open(vmname); 651 if (ctx == NULL) { 652 perror("vm_open"); 653 exit(1); 654 } 655 656 if (fbsdrun_vmexit_on_hlt()) { 657 err = vm_get_capability(ctx, BSP, VM_CAP_HALT_EXIT, &tmp); 658 if (err < 0) { 659 printf("VM exit on HLT not supported\n"); 660 exit(1); 661 } 662 vm_set_capability(ctx, BSP, VM_CAP_HALT_EXIT, 1); 663 handler[VM_EXITCODE_HLT] = vmexit_hlt; 664 } 665 666 if (fbsdrun_vmexit_on_pause()) { 667 /* 668 * pause exit support required for this mode 669 */ 670 err = vm_get_capability(ctx, BSP, VM_CAP_PAUSE_EXIT, &tmp); 671 if (err < 0) { 672 printf("SMP mux requested, no pause support\n"); 673 exit(1); 674 } 675 vm_set_capability(ctx, BSP, VM_CAP_PAUSE_EXIT, 1); 676 handler[VM_EXITCODE_PAUSE] = vmexit_pause; 677 } 678 679 if (fbsdrun_disable_x2apic()) 680 err = vm_set_x2apic_state(ctx, BSP, X2APIC_DISABLED); 681 else 682 err = vm_set_x2apic_state(ctx, BSP, X2APIC_ENABLED); 683 684 if (err) { 685 printf("Unable to set x2apic state (%d)\n", err); 686 exit(1); 687 } 688 689 if (lomem_sz != 0) { 690 lomem_addr = vm_map_memory(ctx, 0, lomem_sz); 691 if (lomem_addr == (char *) MAP_FAILED) { 692 lomem_sz = 0; 693 } else if (himem_sz != 0) { 694 himem_addr = vm_map_memory(ctx, 4*GB, himem_sz); 695 if (himem_addr == (char *) MAP_FAILED) { 696 lomem_sz = 0; 697 himem_sz = 0; 698 } 699 } 700 } 701 702 init_inout(); 703 init_pci(ctx); 704 if (ioapic) 705 ioapic_init(0); 706 707 if (gdb_port != 0) 708 init_dbgport(gdb_port); 709 710 error = vm_get_register(ctx, BSP, VM_REG_GUEST_RIP, &rip); 711 assert(error == 0); 712 713 if (inject_bkpt) { 714 error = vm_inject_event(ctx, BSP, 
VM_HW_EXCEPTION, IDT_BP); 715 assert(error == 0); 716 } 717 718 /* 719 * build the guest tables, MP etc. 720 */ 721 mptable_build(ctx, guest_ncpus, ioapic); 722 723 /* 724 * Add CPU 0 725 */ 726 fbsdrun_addcpu(ctx, BSP, rip); 727 728 /* 729 * Head off to the main event dispatch loop 730 */ 731 mevent_dispatch(); 732 733 exit(1); 734} 735