1/* 2 * Kernel-based Virtual Machine driver for Linux 3 * 4 * This module enables machines with Intel VT-x extensions to run virtual 5 * machines without emulation or binary translation. 6 * 7 * Copyright (C) 2006 Qumranet, Inc. 8 * 9 * Authors: 10 * Avi Kivity <avi@qumranet.com> 11 * Yaniv Kamay <yaniv@qumranet.com> 12 * 13 * This work is licensed under the terms of the GNU GPL, version 2. See 14 * the COPYING file in the top-level directory. 15 * 16 */ 17 18#include "kvm.h" 19#include "vmx.h" 20#include <linux/module.h> 21#include <linux/kernel.h> 22#include <linux/mm.h> 23#include <linux/highmem.h> 24#include <linux/profile.h> 25#include <linux/sched.h> 26#include <asm/io.h> 27#include <asm/desc.h> 28 29#include "segment_descriptor.h" 30 31MODULE_AUTHOR("Qumranet"); 32MODULE_LICENSE("GPL"); 33 34static DEFINE_PER_CPU(struct vmcs *, vmxarea); 35static DEFINE_PER_CPU(struct vmcs *, current_vmcs); 36 37#ifdef CONFIG_X86_64 38#define HOST_IS_64 1 39#else 40#define HOST_IS_64 0 41#endif 42 43static struct vmcs_descriptor { 44 int size; 45 int order; 46 u32 revision_id; 47} vmcs_descriptor; 48 49#define VMX_SEGMENT_FIELD(seg) \ 50 [VCPU_SREG_##seg] = { \ 51 .selector = GUEST_##seg##_SELECTOR, \ 52 .base = GUEST_##seg##_BASE, \ 53 .limit = GUEST_##seg##_LIMIT, \ 54 .ar_bytes = GUEST_##seg##_AR_BYTES, \ 55 } 56 57static struct kvm_vmx_segment_field { 58 unsigned selector; 59 unsigned base; 60 unsigned limit; 61 unsigned ar_bytes; 62} kvm_vmx_segment_fields[] = { 63 VMX_SEGMENT_FIELD(CS), 64 VMX_SEGMENT_FIELD(DS), 65 VMX_SEGMENT_FIELD(ES), 66 VMX_SEGMENT_FIELD(FS), 67 VMX_SEGMENT_FIELD(GS), 68 VMX_SEGMENT_FIELD(SS), 69 VMX_SEGMENT_FIELD(TR), 70 VMX_SEGMENT_FIELD(LDTR), 71}; 72 73/* 74 * Keep MSR_K6_STAR at the end, as setup_msrs() will try to optimize it 75 * away by decrementing the array size. 76 */ 77static const u32 vmx_msr_index[] = { 78#ifdef CONFIG_X86_64 79 MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, MSR_KERNEL_GS_BASE, 80#endif 81 MSR_EFER, MSR_K6_STAR, 82}; 83#define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index) 84 85#ifdef CONFIG_X86_64 86static unsigned msr_offset_kernel_gs_base; 87#define NR_64BIT_MSRS 4 88/* 89 * avoid save/load MSR_SYSCALL_MASK and MSR_LSTAR by std vt 90 * mechanism (cpu bug AA24) 91 */ 92#define NR_BAD_MSRS 2 93#else 94#define NR_64BIT_MSRS 0 95#define NR_BAD_MSRS 0 96#endif 97 98static inline int is_page_fault(u32 intr_info) 99{ 100 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | 101 INTR_INFO_VALID_MASK)) == 102 (INTR_TYPE_EXCEPTION | PF_VECTOR | INTR_INFO_VALID_MASK); 103} 104 105static inline int is_no_device(u32 intr_info) 106{ 107 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | 108 INTR_INFO_VALID_MASK)) == 109 (INTR_TYPE_EXCEPTION | NM_VECTOR | INTR_INFO_VALID_MASK); 110} 111 112static inline int is_external_interrupt(u32 intr_info) 113{ 114 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK)) 115 == (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK); 116} 117 118static struct vmx_msr_entry *find_msr_entry(struct kvm_vcpu *vcpu, u32 msr) 119{ 120 int i; 121 122 for (i = 0; i < vcpu->nmsrs; ++i) 123 if (vcpu->guest_msrs[i].index == msr) 124 return &vcpu->guest_msrs[i]; 125 return NULL; 126} 127 128static void vmcs_clear(struct vmcs *vmcs) 129{ 130 u64 phys_addr = __pa(vmcs); 131 u8 error; 132 133 asm volatile (ASM_VMX_VMCLEAR_RAX "; setna %0" 134 : "=g"(error) : "a"(&phys_addr), "m"(phys_addr) 135 : "cc", "memory"); 136 if (error) 137 printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n", 138 vmcs, phys_addr); 139} 140 141static void __vcpu_clear(void *arg) 142{ 143 struct kvm_vcpu *vcpu = arg; 144 int cpu = raw_smp_processor_id(); 145 146 if (vcpu->cpu == cpu) 147 vmcs_clear(vcpu->vmcs); 148 if (per_cpu(current_vmcs, cpu) == vcpu->vmcs) 149 per_cpu(current_vmcs, cpu) = NULL; 150} 151 152static void vcpu_clear(struct kvm_vcpu *vcpu) 153{ 154 if (vcpu->cpu != raw_smp_processor_id() && vcpu->cpu != -1) 155 smp_call_function_single(vcpu->cpu, __vcpu_clear, vcpu, 0, 1); 156 else 157 __vcpu_clear(vcpu); 158 vcpu->launched = 0; 159} 160 161static unsigned long vmcs_readl(unsigned long field) 162{ 163 unsigned long value; 164 165 asm volatile (ASM_VMX_VMREAD_RDX_RAX 166 : "=a"(value) : "d"(field) : "cc"); 167 return value; 168} 169 170static u16 vmcs_read16(unsigned long field) 171{ 172 return vmcs_readl(field); 173} 174 175static u32 vmcs_read32(unsigned long field) 176{ 177 return vmcs_readl(field); 178} 179 180static u64 vmcs_read64(unsigned long field) 181{ 182#ifdef CONFIG_X86_64 183 return vmcs_readl(field); 184#else 185 return vmcs_readl(field) | ((u64)vmcs_readl(field+1) << 32); 186#endif 187} 188 189static noinline void vmwrite_error(unsigned long field, unsigned long value) 190{ 191 printk(KERN_ERR "vmwrite error: reg %lx value %lx (err %d)\n", 192 field, value, vmcs_read32(VM_INSTRUCTION_ERROR)); 193 dump_stack(); 194} 195 196static void vmcs_writel(unsigned long field, unsigned long value) 197{ 198 u8 error; 199 200 asm volatile (ASM_VMX_VMWRITE_RAX_RDX "; setna %0" 201 : "=q"(error) : "a"(value), "d"(field) : "cc" ); 202 if (unlikely(error)) 203 vmwrite_error(field, value); 204} 205 206static void vmcs_write16(unsigned long field, u16 value) 207{ 208 vmcs_writel(field, value); 209} 210 211static void vmcs_write32(unsigned long field, u32 value) 212{ 213 vmcs_writel(field, value); 214} 215 216static void vmcs_write64(unsigned long field, u64 value) 217{ 218#ifdef CONFIG_X86_64 219 vmcs_writel(field, value); 220#else 221 vmcs_writel(field, value); 222 asm volatile (""); 223 vmcs_writel(field+1, value >> 32); 224#endif 225} 226 227static void vmcs_clear_bits(unsigned long field, u32 mask) 228{ 229 vmcs_writel(field, vmcs_readl(field) & ~mask); 230} 231 232static void vmcs_set_bits(unsigned long field, u32 mask) 233{ 234 vmcs_writel(field, vmcs_readl(field) | mask); 235} 236 237/* 238 * Switches to specified vcpu, until a matching vcpu_put(), but assumes 239 * vcpu mutex is already taken. 240 */ 241static void vmx_vcpu_load(struct kvm_vcpu *vcpu) 242{ 243 u64 phys_addr = __pa(vcpu->vmcs); 244 int cpu; 245 246 cpu = get_cpu(); 247 248 if (vcpu->cpu != cpu) 249 vcpu_clear(vcpu); 250 251 if (per_cpu(current_vmcs, cpu) != vcpu->vmcs) { 252 u8 error; 253 254 per_cpu(current_vmcs, cpu) = vcpu->vmcs; 255 asm volatile (ASM_VMX_VMPTRLD_RAX "; setna %0" 256 : "=g"(error) : "a"(&phys_addr), "m"(phys_addr) 257 : "cc"); 258 if (error) 259 printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n", 260 vcpu->vmcs, phys_addr); 261 } 262 263 if (vcpu->cpu != cpu) { 264 struct descriptor_table dt; 265 unsigned long sysenter_esp; 266 267 vcpu->cpu = cpu; 268 /* 269 * Linux uses per-cpu TSS and GDT, so set these when switching 270 * processors. 271 */ 272 vmcs_writel(HOST_TR_BASE, read_tr_base()); /* 22.2.4 */ 273 get_gdt(&dt); 274 vmcs_writel(HOST_GDTR_BASE, dt.base); /* 22.2.4 */ 275 276 rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp); 277 vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */ 278 } 279} 280 281static void vmx_vcpu_put(struct kvm_vcpu *vcpu) 282{ 283 kvm_put_guest_fpu(vcpu); 284 put_cpu(); 285} 286 287static void vmx_vcpu_decache(struct kvm_vcpu *vcpu) 288{ 289 vcpu_clear(vcpu); 290} 291 292static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) 293{ 294 return vmcs_readl(GUEST_RFLAGS); 295} 296 297static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) 298{ 299 vmcs_writel(GUEST_RFLAGS, rflags); 300} 301 302static void skip_emulated_instruction(struct kvm_vcpu *vcpu) 303{ 304 unsigned long rip; 305 u32 interruptibility; 306 307 rip = vmcs_readl(GUEST_RIP); 308 rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN); 309 vmcs_writel(GUEST_RIP, rip); 310 311 /* 312 * We emulated an instruction, so temporary interrupt blocking 313 * should be removed, if set. 314 */ 315 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); 316 if (interruptibility & 3) 317 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 318 interruptibility & ~3); 319 vcpu->interrupt_window_open = 1; 320} 321 322static void vmx_inject_gp(struct kvm_vcpu *vcpu, unsigned error_code) 323{ 324 printk(KERN_DEBUG "inject_general_protection: rip 0x%lx\n", 325 vmcs_readl(GUEST_RIP)); 326 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code); 327 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 328 GP_VECTOR | 329 INTR_TYPE_EXCEPTION | 330 INTR_INFO_DELIEVER_CODE_MASK | 331 INTR_INFO_VALID_MASK); 332} 333 334/* 335 * Set up the vmcs to automatically save and restore system 336 * msrs. Don't touch the 64-bit msrs if the guest is in legacy 337 * mode, as fiddling with msrs is very expensive. 338 */ 339static void setup_msrs(struct kvm_vcpu *vcpu) 340{ 341 int nr_skip, nr_good_msrs; 342 343 if (is_long_mode(vcpu)) 344 nr_skip = NR_BAD_MSRS; 345 else 346 nr_skip = NR_64BIT_MSRS; 347 nr_good_msrs = vcpu->nmsrs - nr_skip; 348 349 /* 350 * MSR_K6_STAR is only needed on long mode guests, and only 351 * if efer.sce is enabled. 352 */ 353 if (find_msr_entry(vcpu, MSR_K6_STAR)) { 354 --nr_good_msrs; 355#ifdef CONFIG_X86_64 356 if (is_long_mode(vcpu) && (vcpu->shadow_efer & EFER_SCE)) 357 ++nr_good_msrs; 358#endif 359 } 360 361 vmcs_writel(VM_ENTRY_MSR_LOAD_ADDR, 362 virt_to_phys(vcpu->guest_msrs + nr_skip)); 363 vmcs_writel(VM_EXIT_MSR_STORE_ADDR, 364 virt_to_phys(vcpu->guest_msrs + nr_skip)); 365 vmcs_writel(VM_EXIT_MSR_LOAD_ADDR, 366 virt_to_phys(vcpu->host_msrs + nr_skip)); 367 vmcs_write32(VM_EXIT_MSR_STORE_COUNT, nr_good_msrs); /* 22.2.2 */ 368 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, nr_good_msrs); /* 22.2.2 */ 369 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, nr_good_msrs); /* 22.2.2 */ 370} 371 372/* 373 * reads and returns guest's timestamp counter "register" 374 * guest_tsc = host_tsc + tsc_offset -- 21.3 375 */ 376static u64 guest_read_tsc(void) 377{ 378 u64 host_tsc, tsc_offset; 379 380 rdtscll(host_tsc); 381 tsc_offset = vmcs_read64(TSC_OFFSET); 382 return host_tsc + tsc_offset; 383} 384 385/* 386 * writes 'guest_tsc' into guest's timestamp counter "register" 387 * guest_tsc = host_tsc + tsc_offset ==> tsc_offset = guest_tsc - host_tsc 388 */ 389static void guest_write_tsc(u64 guest_tsc) 390{ 391 u64 host_tsc; 392 393 rdtscll(host_tsc); 394 vmcs_write64(TSC_OFFSET, guest_tsc - host_tsc); 395} 396 397static void reload_tss(void) 398{ 399#ifndef CONFIG_X86_64 400 401 /* 402 * VT restores TR but not its size. Useless. 403 */ 404 struct descriptor_table gdt; 405 struct segment_descriptor *descs; 406 407 get_gdt(&gdt); 408 descs = (void *)gdt.base; 409 descs[GDT_ENTRY_TSS].type = 9; /* available TSS */ 410 load_TR_desc(); 411#endif 412} 413 414/* 415 * Reads an msr value (of 'msr_index') into 'pdata'. 416 * Returns 0 on success, non-0 otherwise. 417 * Assumes vcpu_load() was already called. 418 */ 419static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) 420{ 421 u64 data; 422 struct vmx_msr_entry *msr; 423 424 if (!pdata) { 425 printk(KERN_ERR "BUG: get_msr called with NULL pdata\n"); 426 return -EINVAL; 427 } 428 429 switch (msr_index) { 430#ifdef CONFIG_X86_64 431 case MSR_FS_BASE: 432 data = vmcs_readl(GUEST_FS_BASE); 433 break; 434 case MSR_GS_BASE: 435 data = vmcs_readl(GUEST_GS_BASE); 436 break; 437 case MSR_EFER: 438 return kvm_get_msr_common(vcpu, msr_index, pdata); 439#endif 440 case MSR_IA32_TIME_STAMP_COUNTER: 441 data = guest_read_tsc(); 442 break; 443 case MSR_IA32_SYSENTER_CS: 444 data = vmcs_read32(GUEST_SYSENTER_CS); 445 break; 446 case MSR_IA32_SYSENTER_EIP: 447 data = vmcs_readl(GUEST_SYSENTER_EIP); 448 break; 449 case MSR_IA32_SYSENTER_ESP: 450 data = vmcs_readl(GUEST_SYSENTER_ESP); 451 break; 452 default: 453 msr = find_msr_entry(vcpu, msr_index); 454 if (msr) { 455 data = msr->data; 456 break; 457 } 458 return kvm_get_msr_common(vcpu, msr_index, pdata); 459 } 460 461 *pdata = data; 462 return 0; 463} 464 465/* 466 * Writes msr value into into the appropriate "register". 467 * Returns 0 on success, non-0 otherwise. 468 * Assumes vcpu_load() was already called. 469 */ 470static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) 471{ 472 struct vmx_msr_entry *msr; 473 switch (msr_index) { 474#ifdef CONFIG_X86_64 475 case MSR_EFER: 476 return kvm_set_msr_common(vcpu, msr_index, data); 477 case MSR_FS_BASE: 478 vmcs_writel(GUEST_FS_BASE, data); 479 break; 480 case MSR_GS_BASE: 481 vmcs_writel(GUEST_GS_BASE, data); 482 break; 483#endif 484 case MSR_IA32_SYSENTER_CS: 485 vmcs_write32(GUEST_SYSENTER_CS, data); 486 break; 487 case MSR_IA32_SYSENTER_EIP: 488 vmcs_writel(GUEST_SYSENTER_EIP, data); 489 break; 490 case MSR_IA32_SYSENTER_ESP: 491 vmcs_writel(GUEST_SYSENTER_ESP, data); 492 break; 493 case MSR_IA32_TIME_STAMP_COUNTER: 494 guest_write_tsc(data); 495 break; 496 default: 497 msr = find_msr_entry(vcpu, msr_index); 498 if (msr) { 499 msr->data = data; 500 break; 501 } 502 return kvm_set_msr_common(vcpu, msr_index, data); 503 msr->data = data; 504 break; 505 } 506 507 return 0; 508} 509 510/* 511 * Sync the rsp and rip registers into the vcpu structure. This allows 512 * registers to be accessed by indexing vcpu->regs. 513 */ 514static void vcpu_load_rsp_rip(struct kvm_vcpu *vcpu) 515{ 516 vcpu->regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP); 517 vcpu->rip = vmcs_readl(GUEST_RIP); 518} 519 520/* 521 * Syncs rsp and rip back into the vmcs. Should be called after possible 522 * modification. 523 */ 524static void vcpu_put_rsp_rip(struct kvm_vcpu *vcpu) 525{ 526 vmcs_writel(GUEST_RSP, vcpu->regs[VCPU_REGS_RSP]); 527 vmcs_writel(GUEST_RIP, vcpu->rip); 528} 529 530static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg) 531{ 532 unsigned long dr7 = 0x400; 533 u32 exception_bitmap; 534 int old_singlestep; 535 536 exception_bitmap = vmcs_read32(EXCEPTION_BITMAP); 537 old_singlestep = vcpu->guest_debug.singlestep; 538 539 vcpu->guest_debug.enabled = dbg->enabled; 540 if (vcpu->guest_debug.enabled) { 541 int i; 542 543 dr7 |= 0x200; /* exact */ 544 for (i = 0; i < 4; ++i) { 545 if (!dbg->breakpoints[i].enabled) 546 continue; 547 vcpu->guest_debug.bp[i] = dbg->breakpoints[i].address; 548 dr7 |= 2 << (i*2); /* global enable */ 549 dr7 |= 0 << (i*4+16); /* execution breakpoint */ 550 } 551 552 exception_bitmap |= (1u << 1); /* Trap debug exceptions */ 553 554 vcpu->guest_debug.singlestep = dbg->singlestep; 555 } else { 556 exception_bitmap &= ~(1u << 1); /* Ignore debug exceptions */ 557 vcpu->guest_debug.singlestep = 0; 558 } 559 560 if (old_singlestep && !vcpu->guest_debug.singlestep) { 561 unsigned long flags; 562 563 flags = vmcs_readl(GUEST_RFLAGS); 564 flags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF); 565 vmcs_writel(GUEST_RFLAGS, flags); 566 } 567 568 vmcs_write32(EXCEPTION_BITMAP, exception_bitmap); 569 vmcs_writel(GUEST_DR7, dr7); 570 571 return 0; 572} 573 574static __init int cpu_has_kvm_support(void) 575{ 576 unsigned long ecx = cpuid_ecx(1); 577 return test_bit(5, &ecx); /* CPUID.1:ECX.VMX[bit 5] -> VT */ 578} 579 580static __init int vmx_disabled_by_bios(void) 581{ 582 u64 msr; 583 584 rdmsrl(MSR_IA32_FEATURE_CONTROL, msr); 585 return (msr & 5) == 1; /* locked but not enabled */ 586} 587 588static void hardware_enable(void *garbage) 589{ 590 int cpu = raw_smp_processor_id(); 591 u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); 592 u64 old; 593 594 rdmsrl(MSR_IA32_FEATURE_CONTROL, old); 595 if ((old & 5) != 5) 596 /* enable and lock */ 597 wrmsrl(MSR_IA32_FEATURE_CONTROL, old | 5); 598 write_cr4(read_cr4() | CR4_VMXE); 599 asm volatile (ASM_VMX_VMXON_RAX : : "a"(&phys_addr), "m"(phys_addr) 600 : "memory", "cc"); 601} 602 603static void hardware_disable(void *garbage) 604{ 605 asm volatile (ASM_VMX_VMXOFF : : : "cc"); 606} 607 608static __init void setup_vmcs_descriptor(void) 609{ 610 u32 vmx_msr_low, vmx_msr_high; 611 612 rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high); 613 vmcs_descriptor.size = vmx_msr_high & 0x1fff; 614 vmcs_descriptor.order = get_order(vmcs_descriptor.size); 615 vmcs_descriptor.revision_id = vmx_msr_low; 616} 617 618static struct vmcs *alloc_vmcs_cpu(int cpu) 619{ 620 int node = cpu_to_node(cpu); 621 struct page *pages; 622 struct vmcs *vmcs; 623 624 pages = alloc_pages_node(node, GFP_KERNEL, vmcs_descriptor.order); 625 if (!pages) 626 return NULL; 627 vmcs = page_address(pages); 628 memset(vmcs, 0, vmcs_descriptor.size); 629 vmcs->revision_id = vmcs_descriptor.revision_id; /* vmcs revision id */ 630 return vmcs; 631} 632 633static struct vmcs *alloc_vmcs(void) 634{ 635 return alloc_vmcs_cpu(raw_smp_processor_id()); 636} 637 638static void free_vmcs(struct vmcs *vmcs) 639{ 640 free_pages((unsigned long)vmcs, vmcs_descriptor.order); 641} 642 643static void free_kvm_area(void) 644{ 645 int cpu; 646 647 for_each_online_cpu(cpu) 648 free_vmcs(per_cpu(vmxarea, cpu)); 649} 650 651extern struct vmcs *alloc_vmcs_cpu(int cpu); 652 653static __init int alloc_kvm_area(void) 654{ 655 int cpu; 656 657 for_each_online_cpu(cpu) { 658 struct vmcs *vmcs; 659 660 vmcs = alloc_vmcs_cpu(cpu); 661 if (!vmcs) { 662 free_kvm_area(); 663 return -ENOMEM; 664 } 665 666 per_cpu(vmxarea, cpu) = vmcs; 667 } 668 return 0; 669} 670 671static __init int hardware_setup(void) 672{ 673 setup_vmcs_descriptor(); 674 return alloc_kvm_area(); 675} 676 677static __exit void hardware_unsetup(void) 678{ 679 free_kvm_area(); 680} 681 682static void update_exception_bitmap(struct kvm_vcpu *vcpu) 683{ 684 if (vcpu->rmode.active) 685 vmcs_write32(EXCEPTION_BITMAP, ~0); 686 else 687 vmcs_write32(EXCEPTION_BITMAP, 1 << PF_VECTOR); 688} 689 690static void fix_pmode_dataseg(int seg, struct kvm_save_segment *save) 691{ 692 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; 693 694 if (vmcs_readl(sf->base) == save->base && (save->base & AR_S_MASK)) { 695 vmcs_write16(sf->selector, save->selector); 696 vmcs_writel(sf->base, save->base); 697 vmcs_write32(sf->limit, save->limit); 698 vmcs_write32(sf->ar_bytes, save->ar); 699 } else { 700 u32 dpl = (vmcs_read16(sf->selector) & SELECTOR_RPL_MASK) 701 << AR_DPL_SHIFT; 702 vmcs_write32(sf->ar_bytes, 0x93 | dpl); 703 } 704} 705 706static void enter_pmode(struct kvm_vcpu *vcpu) 707{ 708 unsigned long flags; 709 710 vcpu->rmode.active = 0; 711 712 vmcs_writel(GUEST_TR_BASE, vcpu->rmode.tr.base); 713 vmcs_write32(GUEST_TR_LIMIT, vcpu->rmode.tr.limit); 714 vmcs_write32(GUEST_TR_AR_BYTES, vcpu->rmode.tr.ar); 715 716 flags = vmcs_readl(GUEST_RFLAGS); 717 flags &= ~(IOPL_MASK | X86_EFLAGS_VM); 718 flags |= (vcpu->rmode.save_iopl << IOPL_SHIFT); 719 vmcs_writel(GUEST_RFLAGS, flags); 720 721 vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~CR4_VME_MASK) | 722 (vmcs_readl(CR4_READ_SHADOW) & CR4_VME_MASK)); 723 724 update_exception_bitmap(vcpu); 725 726 fix_pmode_dataseg(VCPU_SREG_ES, &vcpu->rmode.es); 727 fix_pmode_dataseg(VCPU_SREG_DS, &vcpu->rmode.ds); 728 fix_pmode_dataseg(VCPU_SREG_GS, &vcpu->rmode.gs); 729 fix_pmode_dataseg(VCPU_SREG_FS, &vcpu->rmode.fs); 730 731 vmcs_write16(GUEST_SS_SELECTOR, 0); 732 vmcs_write32(GUEST_SS_AR_BYTES, 0x93); 733 734 vmcs_write16(GUEST_CS_SELECTOR, 735 vmcs_read16(GUEST_CS_SELECTOR) & ~SELECTOR_RPL_MASK); 736 vmcs_write32(GUEST_CS_AR_BYTES, 0x9b); 737} 738 739static int rmode_tss_base(struct kvm* kvm) 740{ 741 gfn_t base_gfn = kvm->memslots[0].base_gfn + kvm->memslots[0].npages - 3; 742 return base_gfn << PAGE_SHIFT; 743} 744 745static void fix_rmode_seg(int seg, struct kvm_save_segment *save) 746{ 747 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; 748 749 save->selector = vmcs_read16(sf->selector); 750 save->base = vmcs_readl(sf->base); 751 save->limit = vmcs_read32(sf->limit); 752 save->ar = vmcs_read32(sf->ar_bytes); 753 vmcs_write16(sf->selector, vmcs_readl(sf->base) >> 4); 754 vmcs_write32(sf->limit, 0xffff); 755 vmcs_write32(sf->ar_bytes, 0xf3); 756} 757 758static void enter_rmode(struct kvm_vcpu *vcpu) 759{ 760 unsigned long flags; 761 762 vcpu->rmode.active = 1; 763 764 vcpu->rmode.tr.base = vmcs_readl(GUEST_TR_BASE); 765 vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm)); 766 767 vcpu->rmode.tr.limit = vmcs_read32(GUEST_TR_LIMIT); 768 vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1); 769 770 vcpu->rmode.tr.ar = vmcs_read32(GUEST_TR_AR_BYTES); 771 vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); 772 773 flags = vmcs_readl(GUEST_RFLAGS); 774 vcpu->rmode.save_iopl = (flags & IOPL_MASK) >> IOPL_SHIFT; 775 776 flags |= IOPL_MASK | X86_EFLAGS_VM; 777 778 vmcs_writel(GUEST_RFLAGS, flags); 779 vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | CR4_VME_MASK); 780 update_exception_bitmap(vcpu); 781 782 vmcs_write16(GUEST_SS_SELECTOR, vmcs_readl(GUEST_SS_BASE) >> 4); 783 vmcs_write32(GUEST_SS_LIMIT, 0xffff); 784 vmcs_write32(GUEST_SS_AR_BYTES, 0xf3); 785 786 vmcs_write32(GUEST_CS_AR_BYTES, 0xf3); 787 vmcs_write32(GUEST_CS_LIMIT, 0xffff); 788 if (vmcs_readl(GUEST_CS_BASE) == 0xffff0000) 789 vmcs_writel(GUEST_CS_BASE, 0xf0000); 790 vmcs_write16(GUEST_CS_SELECTOR, vmcs_readl(GUEST_CS_BASE) >> 4); 791 792 fix_rmode_seg(VCPU_SREG_ES, &vcpu->rmode.es); 793 fix_rmode_seg(VCPU_SREG_DS, &vcpu->rmode.ds); 794 fix_rmode_seg(VCPU_SREG_GS, &vcpu->rmode.gs); 795 fix_rmode_seg(VCPU_SREG_FS, &vcpu->rmode.fs); 796} 797 798#ifdef CONFIG_X86_64 799 800static void enter_lmode(struct kvm_vcpu *vcpu) 801{ 802 u32 guest_tr_ar; 803 804 guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES); 805 if ((guest_tr_ar & AR_TYPE_MASK) != AR_TYPE_BUSY_64_TSS) { 806 printk(KERN_DEBUG "%s: tss fixup for long mode. \n", 807 __FUNCTION__); 808 vmcs_write32(GUEST_TR_AR_BYTES, 809 (guest_tr_ar & ~AR_TYPE_MASK) 810 | AR_TYPE_BUSY_64_TSS); 811 } 812 813 vcpu->shadow_efer |= EFER_LMA; 814 815 find_msr_entry(vcpu, MSR_EFER)->data |= EFER_LMA | EFER_LME; 816 vmcs_write32(VM_ENTRY_CONTROLS, 817 vmcs_read32(VM_ENTRY_CONTROLS) 818 | VM_ENTRY_CONTROLS_IA32E_MASK); 819} 820 821static void exit_lmode(struct kvm_vcpu *vcpu) 822{ 823 vcpu->shadow_efer &= ~EFER_LMA; 824 825 vmcs_write32(VM_ENTRY_CONTROLS, 826 vmcs_read32(VM_ENTRY_CONTROLS) 827 & ~VM_ENTRY_CONTROLS_IA32E_MASK); 828} 829 830#endif 831 832static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) 833{ 834 vcpu->cr4 &= KVM_GUEST_CR4_MASK; 835 vcpu->cr4 |= vmcs_readl(GUEST_CR4) & ~KVM_GUEST_CR4_MASK; 836} 837 838static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) 839{ 840 if (vcpu->rmode.active && (cr0 & CR0_PE_MASK)) 841 enter_pmode(vcpu); 842 843 if (!vcpu->rmode.active && !(cr0 & CR0_PE_MASK)) 844 enter_rmode(vcpu); 845 846#ifdef CONFIG_X86_64 847 if (vcpu->shadow_efer & EFER_LME) { 848 if (!is_paging(vcpu) && (cr0 & CR0_PG_MASK)) 849 enter_lmode(vcpu); 850 if (is_paging(vcpu) && !(cr0 & CR0_PG_MASK)) 851 exit_lmode(vcpu); 852 } 853#endif 854 855 if (!(cr0 & CR0_TS_MASK)) { 856 vcpu->fpu_active = 1; 857 vmcs_clear_bits(EXCEPTION_BITMAP, CR0_TS_MASK); 858 } 859 860 vmcs_writel(CR0_READ_SHADOW, cr0); 861 vmcs_writel(GUEST_CR0, 862 (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON); 863 vcpu->cr0 = cr0; 864} 865 866static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) 867{ 868 vmcs_writel(GUEST_CR3, cr3); 869 870 if (!(vcpu->cr0 & CR0_TS_MASK)) { 871 vcpu->fpu_active = 0; 872 vmcs_set_bits(GUEST_CR0, CR0_TS_MASK); 873 vmcs_set_bits(EXCEPTION_BITMAP, 1 << NM_VECTOR); 874 } 875} 876 877static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) 878{ 879 vmcs_writel(CR4_READ_SHADOW, cr4); 880 vmcs_writel(GUEST_CR4, cr4 | (vcpu->rmode.active ? 881 KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON)); 882 vcpu->cr4 = cr4; 883} 884 885#ifdef CONFIG_X86_64 886 887static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) 888{ 889 struct vmx_msr_entry *msr = find_msr_entry(vcpu, MSR_EFER); 890 891 vcpu->shadow_efer = efer; 892 if (efer & EFER_LMA) { 893 vmcs_write32(VM_ENTRY_CONTROLS, 894 vmcs_read32(VM_ENTRY_CONTROLS) | 895 VM_ENTRY_CONTROLS_IA32E_MASK); 896 msr->data = efer; 897 898 } else { 899 vmcs_write32(VM_ENTRY_CONTROLS, 900 vmcs_read32(VM_ENTRY_CONTROLS) & 901 ~VM_ENTRY_CONTROLS_IA32E_MASK); 902 903 msr->data = efer & ~EFER_LME; 904 } 905 setup_msrs(vcpu); 906} 907 908#endif 909 910static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg) 911{ 912 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; 913 914 return vmcs_readl(sf->base); 915} 916 917static void vmx_get_segment(struct kvm_vcpu *vcpu, 918 struct kvm_segment *var, int seg) 919{ 920 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; 921 u32 ar; 922 923 var->base = vmcs_readl(sf->base); 924 var->limit = vmcs_read32(sf->limit); 925 var->selector = vmcs_read16(sf->selector); 926 ar = vmcs_read32(sf->ar_bytes); 927 if (ar & AR_UNUSABLE_MASK) 928 ar = 0; 929 var->type = ar & 15; 930 var->s = (ar >> 4) & 1; 931 var->dpl = (ar >> 5) & 3; 932 var->present = (ar >> 7) & 1; 933 var->avl = (ar >> 12) & 1; 934 var->l = (ar >> 13) & 1; 935 var->db = (ar >> 14) & 1; 936 var->g = (ar >> 15) & 1; 937 var->unusable = (ar >> 16) & 1; 938} 939 940static void vmx_set_segment(struct kvm_vcpu *vcpu, 941 struct kvm_segment *var, int seg) 942{ 943 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; 944 u32 ar; 945 946 vmcs_writel(sf->base, var->base); 947 vmcs_write32(sf->limit, var->limit); 948 vmcs_write16(sf->selector, var->selector); 949 if (vcpu->rmode.active && var->s) { 950 /* 951 * Hack real-mode segments into vm86 compatibility. 952 */ 953 if (var->base == 0xffff0000 && var->selector == 0xf000) 954 vmcs_writel(sf->base, 0xf0000); 955 ar = 0xf3; 956 } else if (var->unusable) 957 ar = 1 << 16; 958 else { 959 ar = var->type & 15; 960 ar |= (var->s & 1) << 4; 961 ar |= (var->dpl & 3) << 5; 962 ar |= (var->present & 1) << 7; 963 ar |= (var->avl & 1) << 12; 964 ar |= (var->l & 1) << 13; 965 ar |= (var->db & 1) << 14; 966 ar |= (var->g & 1) << 15; 967 } 968 if (ar == 0) /* a 0 value means unusable */ 969 ar = AR_UNUSABLE_MASK; 970 vmcs_write32(sf->ar_bytes, ar); 971} 972 973static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) 974{ 975 u32 ar = vmcs_read32(GUEST_CS_AR_BYTES); 976 977 *db = (ar >> 14) & 1; 978 *l = (ar >> 13) & 1; 979} 980 981static void vmx_get_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) 982{ 983 dt->limit = vmcs_read32(GUEST_IDTR_LIMIT); 984 dt->base = vmcs_readl(GUEST_IDTR_BASE); 985} 986 987static void vmx_set_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) 988{ 989 vmcs_write32(GUEST_IDTR_LIMIT, dt->limit); 990 vmcs_writel(GUEST_IDTR_BASE, dt->base); 991} 992 993static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) 994{ 995 dt->limit = vmcs_read32(GUEST_GDTR_LIMIT); 996 dt->base = vmcs_readl(GUEST_GDTR_BASE); 997} 998 999static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) 1000{ 1001 vmcs_write32(GUEST_GDTR_LIMIT, dt->limit); 1002 vmcs_writel(GUEST_GDTR_BASE, dt->base); 1003} 1004 1005static int init_rmode_tss(struct kvm* kvm) 1006{ 1007 struct page *p1, *p2, *p3; 1008 gfn_t fn = rmode_tss_base(kvm) >> PAGE_SHIFT; 1009 char *page; 1010 1011 p1 = gfn_to_page(kvm, fn++); 1012 p2 = gfn_to_page(kvm, fn++); 1013 p3 = gfn_to_page(kvm, fn); 1014 1015 if (!p1 || !p2 || !p3) { 1016 kvm_printf(kvm,"%s: gfn_to_page failed\n", __FUNCTION__); 1017 return 0; 1018 } 1019 1020 page = kmap_atomic(p1, KM_USER0); 1021 memset(page, 0, PAGE_SIZE); 1022 *(u16*)(page + 0x66) = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE; 1023 kunmap_atomic(page, KM_USER0); 1024 1025 page = kmap_atomic(p2, KM_USER0); 1026 memset(page, 0, PAGE_SIZE); 1027 kunmap_atomic(page, KM_USER0); 1028 1029 page = kmap_atomic(p3, KM_USER0); 1030 memset(page, 0, PAGE_SIZE); 1031 *(page + RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1) = ~0; 1032 kunmap_atomic(page, KM_USER0); 1033 1034 return 1; 1035} 1036 1037static void vmcs_write32_fixedbits(u32 msr, u32 vmcs_field, u32 val) 1038{ 1039 u32 msr_high, msr_low; 1040 1041 rdmsr(msr, msr_low, msr_high); 1042 1043 val &= msr_high; 1044 val |= msr_low; 1045 vmcs_write32(vmcs_field, val); 1046} 1047 1048static void seg_setup(int seg) 1049{ 1050 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; 1051 1052 vmcs_write16(sf->selector, 0); 1053 vmcs_writel(sf->base, 0); 1054 vmcs_write32(sf->limit, 0xffff); 1055 vmcs_write32(sf->ar_bytes, 0x93); 1056} 1057 1058/* 1059 * Sets up the vmcs for emulated real mode. 1060 */ 1061static int vmx_vcpu_setup(struct kvm_vcpu *vcpu) 1062{ 1063 u32 host_sysenter_cs; 1064 u32 junk; 1065 unsigned long a; 1066 struct descriptor_table dt; 1067 int i; 1068 int ret = 0; 1069 extern asmlinkage void kvm_vmx_return(void); 1070 1071 if (!init_rmode_tss(vcpu->kvm)) { 1072 ret = -ENOMEM; 1073 goto out; 1074 } 1075 1076 memset(vcpu->regs, 0, sizeof(vcpu->regs)); 1077 vcpu->regs[VCPU_REGS_RDX] = get_rdx_init_val(); 1078 vcpu->cr8 = 0; 1079 vcpu->apic_base = 0xfee00000 | 1080 /*for vcpu 0*/ MSR_IA32_APICBASE_BSP | 1081 MSR_IA32_APICBASE_ENABLE; 1082 1083 fx_init(vcpu); 1084 1085 /* 1086 * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode 1087 * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4. Sigh. 1088 */ 1089 vmcs_write16(GUEST_CS_SELECTOR, 0xf000); 1090 vmcs_writel(GUEST_CS_BASE, 0x000f0000); 1091 vmcs_write32(GUEST_CS_LIMIT, 0xffff); 1092 vmcs_write32(GUEST_CS_AR_BYTES, 0x9b); 1093 1094 seg_setup(VCPU_SREG_DS); 1095 seg_setup(VCPU_SREG_ES); 1096 seg_setup(VCPU_SREG_FS); 1097 seg_setup(VCPU_SREG_GS); 1098 seg_setup(VCPU_SREG_SS); 1099 1100 vmcs_write16(GUEST_TR_SELECTOR, 0); 1101 vmcs_writel(GUEST_TR_BASE, 0); 1102 vmcs_write32(GUEST_TR_LIMIT, 0xffff); 1103 vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); 1104 1105 vmcs_write16(GUEST_LDTR_SELECTOR, 0); 1106 vmcs_writel(GUEST_LDTR_BASE, 0); 1107 vmcs_write32(GUEST_LDTR_LIMIT, 0xffff); 1108 vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082); 1109 1110 vmcs_write32(GUEST_SYSENTER_CS, 0); 1111 vmcs_writel(GUEST_SYSENTER_ESP, 0); 1112 vmcs_writel(GUEST_SYSENTER_EIP, 0); 1113 1114 vmcs_writel(GUEST_RFLAGS, 0x02); 1115 vmcs_writel(GUEST_RIP, 0xfff0); 1116 vmcs_writel(GUEST_RSP, 0); 1117 1118 //todo: dr0 = dr1 = dr2 = dr3 = 0; dr6 = 0xffff0ff0 1119 vmcs_writel(GUEST_DR7, 0x400); 1120 1121 vmcs_writel(GUEST_GDTR_BASE, 0); 1122 vmcs_write32(GUEST_GDTR_LIMIT, 0xffff); 1123 1124 vmcs_writel(GUEST_IDTR_BASE, 0); 1125 vmcs_write32(GUEST_IDTR_LIMIT, 0xffff); 1126 1127 vmcs_write32(GUEST_ACTIVITY_STATE, 0); 1128 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0); 1129 vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0); 1130 1131 /* I/O */ 1132 vmcs_write64(IO_BITMAP_A, 0); 1133 vmcs_write64(IO_BITMAP_B, 0); 1134 1135 guest_write_tsc(0); 1136 1137 vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */ 1138 1139 /* Special registers */ 1140 vmcs_write64(GUEST_IA32_DEBUGCTL, 0); 1141 1142 /* Control */ 1143 vmcs_write32_fixedbits(MSR_IA32_VMX_PINBASED_CTLS, 1144 PIN_BASED_VM_EXEC_CONTROL, 1145 PIN_BASED_EXT_INTR_MASK /* 20.6.1 */ 1146 | PIN_BASED_NMI_EXITING /* 20.6.1 */ 1147 ); 1148 vmcs_write32_fixedbits(MSR_IA32_VMX_PROCBASED_CTLS, 1149 CPU_BASED_VM_EXEC_CONTROL, 1150 CPU_BASED_HLT_EXITING /* 20.6.2 */ 1151 | CPU_BASED_CR8_LOAD_EXITING /* 20.6.2 */ 1152 | CPU_BASED_CR8_STORE_EXITING /* 20.6.2 */ 1153 | CPU_BASED_UNCOND_IO_EXITING /* 20.6.2 */ 1154 | CPU_BASED_MOV_DR_EXITING 1155 | CPU_BASED_USE_TSC_OFFSETING /* 21.3 */ 1156 ); 1157 1158 vmcs_write32(EXCEPTION_BITMAP, 1 << PF_VECTOR); 1159 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0); 1160 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0); 1161 vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */ 1162 1163 vmcs_writel(HOST_CR0, read_cr0()); /* 22.2.3 */ 1164 vmcs_writel(HOST_CR4, read_cr4()); /* 22.2.3, 22.2.5 */ 1165 vmcs_writel(HOST_CR3, read_cr3()); 1166 1167 vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */ 1168 vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ 1169 vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */ 1170 vmcs_write16(HOST_FS_SELECTOR, read_fs()); /* 22.2.4 */ 1171 vmcs_write16(HOST_GS_SELECTOR, read_gs()); /* 22.2.4 */ 1172 vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ 1173#ifdef CONFIG_X86_64 1174 rdmsrl(MSR_FS_BASE, a); 1175 vmcs_writel(HOST_FS_BASE, a); /* 22.2.4 */ 1176 rdmsrl(MSR_GS_BASE, a); 1177 vmcs_writel(HOST_GS_BASE, a); /* 22.2.4 */ 1178#else 1179 vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */ 1180 vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */ 1181#endif 1182 1183 vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */ 1184 1185 get_idt(&dt); 1186 vmcs_writel(HOST_IDTR_BASE, dt.base); /* 22.2.4 */ 1187 1188 1189 vmcs_writel(HOST_RIP, (unsigned long)kvm_vmx_return); /* 22.2.5 */ 1190 1191 rdmsr(MSR_IA32_SYSENTER_CS, host_sysenter_cs, junk); 1192 vmcs_write32(HOST_IA32_SYSENTER_CS, host_sysenter_cs); 1193 rdmsrl(MSR_IA32_SYSENTER_ESP, a); 1194 vmcs_writel(HOST_IA32_SYSENTER_ESP, a); /* 22.2.3 */ 1195 rdmsrl(MSR_IA32_SYSENTER_EIP, a); 1196 vmcs_writel(HOST_IA32_SYSENTER_EIP, a); /* 22.2.3 */ 1197 1198 for (i = 0; i < NR_VMX_MSR; ++i) { 1199 u32 index = vmx_msr_index[i]; 1200 u32 data_low, data_high; 1201 u64 data; 1202 int j = vcpu->nmsrs; 1203 1204 if (rdmsr_safe(index, &data_low, &data_high) < 0) 1205 continue; 1206 if (wrmsr_safe(index, data_low, data_high) < 0) 1207 continue; 1208 data = data_low | ((u64)data_high << 32); 1209 vcpu->host_msrs[j].index = index; 1210 vcpu->host_msrs[j].reserved = 0; 1211 vcpu->host_msrs[j].data = data; 1212 vcpu->guest_msrs[j] = vcpu->host_msrs[j]; 1213#ifdef CONFIG_X86_64 1214 if (index == MSR_KERNEL_GS_BASE) 1215 msr_offset_kernel_gs_base = j; 1216#endif 1217 ++vcpu->nmsrs; 1218 } 1219 1220 setup_msrs(vcpu); 1221 1222 vmcs_write32_fixedbits(MSR_IA32_VMX_EXIT_CTLS, VM_EXIT_CONTROLS, 1223 (HOST_IS_64 << 9)); /* 22.2,1, 20.7.1 */ 1224 1225 /* 22.2.1, 20.8.1 */ 1226 vmcs_write32_fixedbits(MSR_IA32_VMX_ENTRY_CTLS, 1227 VM_ENTRY_CONTROLS, 0); 1228 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */ 1229 1230#ifdef CONFIG_X86_64 1231 vmcs_writel(VIRTUAL_APIC_PAGE_ADDR, 0); 1232 vmcs_writel(TPR_THRESHOLD, 0); 1233#endif 1234 1235 vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL); 1236 vmcs_writel(CR4_GUEST_HOST_MASK, KVM_GUEST_CR4_MASK); 1237 1238 vcpu->cr0 = 0x60000010; 1239 vmx_set_cr0(vcpu, vcpu->cr0); // enter rmode 1240 vmx_set_cr4(vcpu, 0); 1241#ifdef CONFIG_X86_64 1242 vmx_set_efer(vcpu, 0); 1243#endif 1244 1245 return 0; 1246 1247out: 1248 return ret; 1249} 1250 1251static void inject_rmode_irq(struct kvm_vcpu *vcpu, int irq) 1252{ 1253 u16 ent[2]; 1254 u16 cs; 1255 u16 ip; 1256 unsigned long flags; 1257 unsigned long ss_base = vmcs_readl(GUEST_SS_BASE); 1258 u16 sp = vmcs_readl(GUEST_RSP); 1259 u32 ss_limit = vmcs_read32(GUEST_SS_LIMIT); 1260 1261 if (sp > ss_limit || sp < 6 ) { 1262 vcpu_printf(vcpu, "%s: #SS, rsp 0x%lx ss 0x%lx limit 0x%x\n", 1263 __FUNCTION__, 1264 vmcs_readl(GUEST_RSP), 1265 vmcs_readl(GUEST_SS_BASE), 1266 vmcs_read32(GUEST_SS_LIMIT)); 1267 return; 1268 } 1269 1270 if (kvm_read_guest(vcpu, irq * sizeof(ent), sizeof(ent), &ent) != 1271 sizeof(ent)) { 1272 vcpu_printf(vcpu, "%s: read guest err\n", __FUNCTION__); 1273 return; 1274 } 1275 1276 flags = vmcs_readl(GUEST_RFLAGS); 1277 cs = vmcs_readl(GUEST_CS_BASE) >> 4; 1278 ip = vmcs_readl(GUEST_RIP); 1279 1280 1281 if (kvm_write_guest(vcpu, ss_base + sp - 2, 2, &flags) != 2 || 1282 kvm_write_guest(vcpu, ss_base + sp - 4, 2, &cs) != 2 || 1283 kvm_write_guest(vcpu, ss_base + sp - 6, 2, &ip) != 2) { 1284 vcpu_printf(vcpu, "%s: write guest err\n", __FUNCTION__); 1285 return; 1286 } 1287 1288 vmcs_writel(GUEST_RFLAGS, flags & 1289 ~( X86_EFLAGS_IF | X86_EFLAGS_AC | X86_EFLAGS_TF)); 1290 vmcs_write16(GUEST_CS_SELECTOR, ent[1]) ; 1291 vmcs_writel(GUEST_CS_BASE, ent[1] << 4); 1292 vmcs_writel(GUEST_RIP, ent[0]); 1293 vmcs_writel(GUEST_RSP, (vmcs_readl(GUEST_RSP) & ~0xffff) | (sp - 6)); 1294} 1295 1296static void kvm_do_inject_irq(struct kvm_vcpu *vcpu) 1297{ 1298 int word_index = __ffs(vcpu->irq_summary); 1299 int bit_index = __ffs(vcpu->irq_pending[word_index]); 1300 int irq = word_index * BITS_PER_LONG + bit_index; 1301 1302 clear_bit(bit_index, &vcpu->irq_pending[word_index]); 1303 if (!vcpu->irq_pending[word_index]) 1304 clear_bit(word_index, &vcpu->irq_summary); 1305 1306 if (vcpu->rmode.active) { 1307 inject_rmode_irq(vcpu, irq); 1308 return; 1309 } 1310 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 1311 irq | INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK); 1312} 1313 1314 1315static void do_interrupt_requests(struct kvm_vcpu *vcpu, 1316 struct kvm_run *kvm_run) 1317{ 1318 u32 cpu_based_vm_exec_control; 1319 1320 vcpu->interrupt_window_open = 1321 ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) && 1322 (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0); 1323 1324 if (vcpu->interrupt_window_open && 1325 vcpu->irq_summary && 1326 !(vmcs_read32(VM_ENTRY_INTR_INFO_FIELD) & INTR_INFO_VALID_MASK)) 1327 /* 1328 * If interrupts enabled, and not blocked by sti or mov ss. Good. 1329 */ 1330 kvm_do_inject_irq(vcpu); 1331 1332 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); 1333 if (!vcpu->interrupt_window_open && 1334 (vcpu->irq_summary || kvm_run->request_interrupt_window)) 1335 /* 1336 * Interrupts blocked. Wait for unblock. 1337 */ 1338 cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING; 1339 else 1340 cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING; 1341 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); 1342} 1343 1344static void kvm_guest_debug_pre(struct kvm_vcpu *vcpu) 1345{ 1346 struct kvm_guest_debug *dbg = &vcpu->guest_debug; 1347 1348 set_debugreg(dbg->bp[0], 0); 1349 set_debugreg(dbg->bp[1], 1); 1350 set_debugreg(dbg->bp[2], 2); 1351 set_debugreg(dbg->bp[3], 3); 1352 1353 if (dbg->singlestep) { 1354 unsigned long flags; 1355 1356 flags = vmcs_readl(GUEST_RFLAGS); 1357 flags |= X86_EFLAGS_TF | X86_EFLAGS_RF; 1358 vmcs_writel(GUEST_RFLAGS, flags); 1359 } 1360} 1361 1362static int handle_rmode_exception(struct kvm_vcpu *vcpu, 1363 int vec, u32 err_code) 1364{ 1365 if (!vcpu->rmode.active) 1366 return 0; 1367 1368 if (vec == GP_VECTOR && err_code == 0) 1369 if (emulate_instruction(vcpu, NULL, 0, 0) == EMULATE_DONE) 1370 return 1; 1371 return 0; 1372} 1373 1374static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 1375{ 1376 u32 intr_info, error_code; 1377 unsigned long cr2, rip; 1378 u32 vect_info; 1379 enum emulation_result er; 1380 int r; 1381 1382 vect_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); 1383 intr_info = vmcs_read32(VM_EXIT_INTR_INFO); 1384 1385 if ((vect_info & VECTORING_INFO_VALID_MASK) && 1386 !is_page_fault(intr_info)) { 1387 printk(KERN_ERR "%s: unexpected, vectoring info 0x%x " 1388 "intr info 0x%x\n", __FUNCTION__, vect_info, intr_info); 1389 } 1390 1391 if (is_external_interrupt(vect_info)) { 1392 int irq = vect_info & VECTORING_INFO_VECTOR_MASK; 1393 set_bit(irq, vcpu->irq_pending); 1394 set_bit(irq / BITS_PER_LONG, &vcpu->irq_summary); 1395 } 1396 1397 if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) { /* nmi */ 1398 asm ("int $2"); 1399 return 1; 1400 } 1401 1402 if (is_no_device(intr_info)) { 1403 vcpu->fpu_active = 1; 1404 vmcs_clear_bits(EXCEPTION_BITMAP, 1 << NM_VECTOR); 1405 if (!(vcpu->cr0 & CR0_TS_MASK)) 1406 vmcs_clear_bits(GUEST_CR0, CR0_TS_MASK); 1407 return 1; 1408 } 1409 1410 error_code = 0; 1411 rip = vmcs_readl(GUEST_RIP); 1412 if (intr_info & INTR_INFO_DELIEVER_CODE_MASK) 1413 error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); 1414 if (is_page_fault(intr_info)) { 1415 cr2 = vmcs_readl(EXIT_QUALIFICATION); 1416 1417 spin_lock(&vcpu->kvm->lock); 1418 r = kvm_mmu_page_fault(vcpu, cr2, error_code); 1419 if (r < 0) { 1420 spin_unlock(&vcpu->kvm->lock); 1421 return r; 1422 } 1423 if (!r) { 1424 spin_unlock(&vcpu->kvm->lock); 1425 return 1; 1426 } 1427 1428 er = emulate_instruction(vcpu, kvm_run, cr2, error_code); 1429 spin_unlock(&vcpu->kvm->lock); 1430 1431 switch (er) { 1432 case EMULATE_DONE: 1433 return 1; 1434 case EMULATE_DO_MMIO: 1435 ++vcpu->stat.mmio_exits; 1436 kvm_run->exit_reason = KVM_EXIT_MMIO; 1437 return 0; 1438 case EMULATE_FAIL: 1439 vcpu_printf(vcpu, "%s: emulate fail\n", __FUNCTION__); 1440 break; 1441 default: 1442 BUG(); 1443 } 1444 } 1445 1446 if (vcpu->rmode.active && 1447 handle_rmode_exception(vcpu, intr_info & INTR_INFO_VECTOR_MASK, 1448 error_code)) 1449 return 1; 1450 1451 if ((intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK)) == (INTR_TYPE_EXCEPTION | 1)) { 1452 kvm_run->exit_reason = KVM_EXIT_DEBUG; 1453 return 0; 1454 } 1455 kvm_run->exit_reason = KVM_EXIT_EXCEPTION; 1456 kvm_run->ex.exception = intr_info & INTR_INFO_VECTOR_MASK; 1457 kvm_run->ex.error_code = error_code; 1458 return 0; 1459} 1460 1461static int handle_external_interrupt(struct kvm_vcpu *vcpu, 1462 struct kvm_run *kvm_run) 1463{ 1464 ++vcpu->stat.irq_exits; 1465 return 1; 1466} 1467 1468static int handle_triple_fault(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 1469{ 1470 kvm_run->exit_reason = KVM_EXIT_SHUTDOWN; 1471 return 0; 1472} 1473 1474static int get_io_count(struct kvm_vcpu *vcpu, unsigned long *count) 1475{ 1476 u64 inst; 1477 gva_t rip; 1478 int countr_size; 1479 int i, n; 1480 1481 if ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_VM)) { 1482 countr_size = 2; 1483 } else { 1484 u32 cs_ar = vmcs_read32(GUEST_CS_AR_BYTES); 1485 1486 countr_size = (cs_ar & AR_L_MASK) ? 8: 1487 (cs_ar & AR_DB_MASK) ? 4: 2; 1488 } 1489 1490 rip = vmcs_readl(GUEST_RIP); 1491 if (countr_size != 8) 1492 rip += vmcs_readl(GUEST_CS_BASE); 1493 1494 n = kvm_read_guest(vcpu, rip, sizeof(inst), &inst); 1495 1496 for (i = 0; i < n; i++) { 1497 switch (((u8*)&inst)[i]) { 1498 case 0xf0: 1499 case 0xf2: 1500 case 0xf3: 1501 case 0x2e: 1502 case 0x36: 1503 case 0x3e: 1504 case 0x26: 1505 case 0x64: 1506 case 0x65: 1507 case 0x66: 1508 break; 1509 case 0x67: 1510 countr_size = (countr_size == 2) ? 4: (countr_size >> 1); 1511 default: 1512 goto done; 1513 } 1514 } 1515 return 0; 1516done: 1517 countr_size *= 8; 1518 *count = vcpu->regs[VCPU_REGS_RCX] & (~0ULL >> (64 - countr_size)); 1519 //printk("cx: %lx\n", vcpu->regs[VCPU_REGS_RCX]); 1520 return 1; 1521} 1522 1523static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 1524{ 1525 u64 exit_qualification; 1526 int size, down, in, string, rep; 1527 unsigned port; 1528 unsigned long count; 1529 gva_t address; 1530 1531 ++vcpu->stat.io_exits; 1532 exit_qualification = vmcs_read64(EXIT_QUALIFICATION); 1533 in = (exit_qualification & 8) != 0; 1534 size = (exit_qualification & 7) + 1; 1535 string = (exit_qualification & 16) != 0; 1536 down = (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_DF) != 0; 1537 count = 1; 1538 rep = (exit_qualification & 32) != 0; 1539 port = exit_qualification >> 16; 1540 address = 0; 1541 if (string) { 1542 if (rep && !get_io_count(vcpu, &count)) 1543 return 1; 1544 address = vmcs_readl(GUEST_LINEAR_ADDRESS); 1545 } 1546 return kvm_setup_pio(vcpu, kvm_run, in, size, count, string, down, 1547 address, rep, port); 1548} 1549 1550static void 1551vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) 1552{ 1553 /* 1554 * Patch in the VMCALL instruction: 1555 */ 1556 hypercall[0] = 0x0f; 1557 hypercall[1] = 0x01; 1558 hypercall[2] = 0xc1; 1559 hypercall[3] = 0xc3; 1560} 1561 1562static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 1563{ 1564 u64 exit_qualification; 1565 int cr; 1566 int reg; 1567 1568 exit_qualification = vmcs_read64(EXIT_QUALIFICATION); 1569 cr = exit_qualification & 15; 1570 reg = (exit_qualification >> 8) & 15; 1571 switch ((exit_qualification >> 4) & 3) { 1572 case 0: /* mov to cr */ 1573 switch (cr) { 1574 case 0: 1575 vcpu_load_rsp_rip(vcpu); 1576 set_cr0(vcpu, vcpu->regs[reg]); 1577 skip_emulated_instruction(vcpu); 1578 return 1; 1579 case 3: 1580 vcpu_load_rsp_rip(vcpu); 1581 set_cr3(vcpu, vcpu->regs[reg]); 1582 skip_emulated_instruction(vcpu); 1583 return 1; 1584 case 4: 1585 vcpu_load_rsp_rip(vcpu); 1586 set_cr4(vcpu, vcpu->regs[reg]); 1587 skip_emulated_instruction(vcpu); 1588 return 1; 1589 case 8: 1590 vcpu_load_rsp_rip(vcpu); 1591 set_cr8(vcpu, vcpu->regs[reg]); 1592 skip_emulated_instruction(vcpu); 1593 return 1; 1594 }; 1595 break; 1596 case 2: /* clts */ 1597 vcpu_load_rsp_rip(vcpu); 1598 vcpu->fpu_active = 1; 1599 vmcs_clear_bits(EXCEPTION_BITMAP, 1 << NM_VECTOR); 1600 vmcs_clear_bits(GUEST_CR0, CR0_TS_MASK); 1601 vcpu->cr0 &= ~CR0_TS_MASK; 1602 vmcs_writel(CR0_READ_SHADOW, vcpu->cr0); 1603 skip_emulated_instruction(vcpu); 1604 return 1; 1605 case 1: /*mov from cr*/ 1606 switch (cr) { 1607 case 3: 1608 vcpu_load_rsp_rip(vcpu); 1609 vcpu->regs[reg] = vcpu->cr3; 1610 vcpu_put_rsp_rip(vcpu); 1611 skip_emulated_instruction(vcpu); 1612 return 1; 1613 case 8: 1614 vcpu_load_rsp_rip(vcpu); 1615 vcpu->regs[reg] = vcpu->cr8; 1616 vcpu_put_rsp_rip(vcpu); 1617 skip_emulated_instruction(vcpu); 1618 return 1; 1619 } 1620 break; 1621 case 3: /* lmsw */ 1622 lmsw(vcpu, (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f); 1623 1624 skip_emulated_instruction(vcpu); 1625 return 1; 1626 default: 1627 break; 1628 } 1629 kvm_run->exit_reason = 0; 1630 printk(KERN_ERR "kvm: unhandled control register: op %d cr %d\n", 1631 (int)(exit_qualification >> 4) & 3, cr); 1632 return 0; 1633} 1634 1635static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 1636{ 1637 u64 exit_qualification; 1638 unsigned long val; 1639 int dr, reg; 1640 1641 exit_qualification = vmcs_read64(EXIT_QUALIFICATION); 1642 dr = exit_qualification & 7; 1643 reg = (exit_qualification >> 8) & 15; 1644 vcpu_load_rsp_rip(vcpu); 1645 if (exit_qualification & 16) { 1646 /* mov from dr */ 1647 switch (dr) { 1648 case 6: 1649 val = 0xffff0ff0; 1650 break; 1651 case 7: 1652 val = 0x400; 1653 break; 1654 default: 1655 val = 0; 1656 } 1657 vcpu->regs[reg] = val; 1658 } else { 1659 /* mov to dr */ 1660 } 1661 vcpu_put_rsp_rip(vcpu); 1662 skip_emulated_instruction(vcpu); 1663 return 1; 1664} 1665 1666static int handle_cpuid(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 1667{ 1668 kvm_emulate_cpuid(vcpu); 1669 return 1; 1670} 1671 1672static int handle_rdmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 1673{ 1674 u32 ecx = vcpu->regs[VCPU_REGS_RCX]; 1675 u64 data; 1676 1677 if (vmx_get_msr(vcpu, ecx, &data)) { 1678 vmx_inject_gp(vcpu, 0); 1679 return 1; 1680 } 1681 1682 vcpu->regs[VCPU_REGS_RAX] = data & -1u; 1683 vcpu->regs[VCPU_REGS_RDX] = (data >> 32) & -1u; 1684 skip_emulated_instruction(vcpu); 1685 return 1; 1686} 1687 1688static int handle_wrmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 1689{ 1690 u32 ecx = vcpu->regs[VCPU_REGS_RCX]; 1691 u64 data = (vcpu->regs[VCPU_REGS_RAX] & -1u) 1692 | ((u64)(vcpu->regs[VCPU_REGS_RDX] & -1u) << 32); 1693 1694 if (vmx_set_msr(vcpu, ecx, data) != 0) { 1695 vmx_inject_gp(vcpu, 0); 1696 return 1; 1697 } 1698 1699 skip_emulated_instruction(vcpu); 1700 return 1; 1701} 1702 1703static void post_kvm_run_save(struct kvm_vcpu *vcpu, 1704 struct kvm_run *kvm_run) 1705{ 1706 kvm_run->if_flag = (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) != 0; 1707 kvm_run->cr8 = vcpu->cr8; 1708 kvm_run->apic_base = vcpu->apic_base; 1709 kvm_run->ready_for_interrupt_injection = (vcpu->interrupt_window_open && 1710 vcpu->irq_summary == 0); 1711} 1712 1713static int handle_interrupt_window(struct kvm_vcpu *vcpu, 1714 struct kvm_run *kvm_run) 1715{ 1716 /* 1717 * If the user space waits to inject interrupts, exit as soon as 1718 * possible 1719 */ 1720 if (kvm_run->request_interrupt_window && 1721 !vcpu->irq_summary) { 1722 kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; 1723 ++vcpu->stat.irq_window_exits; 1724 return 0; 1725 } 1726 return 1; 1727} 1728 1729static int handle_halt(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 1730{ 1731 skip_emulated_instruction(vcpu); 1732 if (vcpu->irq_summary) 1733 return 1; 1734 1735 kvm_run->exit_reason = KVM_EXIT_HLT; 1736 ++vcpu->stat.halt_exits; 1737 return 0; 1738} 1739 1740static int handle_vmcall(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 1741{ 1742 skip_emulated_instruction(vcpu); 1743 return kvm_hypercall(vcpu, kvm_run); 1744} 1745 1746/* 1747 * The exit handlers return 1 if the exit was handled fully and guest execution 1748 * may resume. Otherwise they set the kvm_run parameter to indicate what needs 1749 * to be done to userspace and return 0. 1750 */ 1751static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu, 1752 struct kvm_run *kvm_run) = { 1753 [EXIT_REASON_EXCEPTION_NMI] = handle_exception, 1754 [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt, 1755 [EXIT_REASON_TRIPLE_FAULT] = handle_triple_fault, 1756 [EXIT_REASON_IO_INSTRUCTION] = handle_io, 1757 [EXIT_REASON_CR_ACCESS] = handle_cr, 1758 [EXIT_REASON_DR_ACCESS] = handle_dr, 1759 [EXIT_REASON_CPUID] = handle_cpuid, 1760 [EXIT_REASON_MSR_READ] = handle_rdmsr, 1761 [EXIT_REASON_MSR_WRITE] = handle_wrmsr, 1762 [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window, 1763 [EXIT_REASON_HLT] = handle_halt, 1764 [EXIT_REASON_VMCALL] = handle_vmcall, 1765}; 1766 1767static const int kvm_vmx_max_exit_handlers = 1768 sizeof(kvm_vmx_exit_handlers) / sizeof(*kvm_vmx_exit_handlers); 1769 1770/* 1771 * The guest has exited. See if we can fix it or if we need userspace 1772 * assistance. 1773 */ 1774static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) 1775{ 1776 u32 vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); 1777 u32 exit_reason = vmcs_read32(VM_EXIT_REASON); 1778 1779 if ( (vectoring_info & VECTORING_INFO_VALID_MASK) && 1780 exit_reason != EXIT_REASON_EXCEPTION_NMI ) 1781 printk(KERN_WARNING "%s: unexpected, valid vectoring info and " 1782 "exit reason is 0x%x\n", __FUNCTION__, exit_reason); 1783 if (exit_reason < kvm_vmx_max_exit_handlers 1784 && kvm_vmx_exit_handlers[exit_reason]) 1785 return kvm_vmx_exit_handlers[exit_reason](vcpu, kvm_run); 1786 else { 1787 kvm_run->exit_reason = KVM_EXIT_UNKNOWN; 1788 kvm_run->hw.hardware_exit_reason = exit_reason; 1789 } 1790 return 0; 1791} 1792 1793/* 1794 * Check if userspace requested an interrupt window, and that the 1795 * interrupt window is open. 1796 * 1797 * No need to exit to userspace if we already have an interrupt queued. 1798 */ 1799static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu, 1800 struct kvm_run *kvm_run) 1801{ 1802 return (!vcpu->irq_summary && 1803 kvm_run->request_interrupt_window && 1804 vcpu->interrupt_window_open && 1805 (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF)); 1806} 1807 1808static int vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 1809{ 1810 u8 fail; 1811 u16 fs_sel, gs_sel, ldt_sel; 1812 int fs_gs_ldt_reload_needed; 1813 int r; 1814 1815again: 1816 /* 1817 * Set host fs and gs selectors. Unfortunately, 22.2.3 does not 1818 * allow segment selectors with cpl > 0 or ti == 1. 1819 */ 1820 fs_sel = read_fs(); 1821 gs_sel = read_gs(); 1822 ldt_sel = read_ldt(); 1823 fs_gs_ldt_reload_needed = (fs_sel & 7) | (gs_sel & 7) | ldt_sel; 1824 if (!fs_gs_ldt_reload_needed) { 1825 vmcs_write16(HOST_FS_SELECTOR, fs_sel); 1826 vmcs_write16(HOST_GS_SELECTOR, gs_sel); 1827 } else { 1828 vmcs_write16(HOST_FS_SELECTOR, 0); 1829 vmcs_write16(HOST_GS_SELECTOR, 0); 1830 } 1831 1832#ifdef CONFIG_X86_64 1833 vmcs_writel(HOST_FS_BASE, read_msr(MSR_FS_BASE)); 1834 vmcs_writel(HOST_GS_BASE, read_msr(MSR_GS_BASE)); 1835#else 1836 vmcs_writel(HOST_FS_BASE, segment_base(fs_sel)); 1837 vmcs_writel(HOST_GS_BASE, segment_base(gs_sel)); 1838#endif 1839 1840 if (!vcpu->mmio_read_completed) 1841 do_interrupt_requests(vcpu, kvm_run); 1842 1843 if (vcpu->guest_debug.enabled) 1844 kvm_guest_debug_pre(vcpu); 1845 1846 kvm_load_guest_fpu(vcpu); 1847 1848 /* 1849 * Loading guest fpu may have cleared host cr0.ts 1850 */ 1851 vmcs_writel(HOST_CR0, read_cr0()); 1852 1853#ifdef CONFIG_X86_64 1854 if (is_long_mode(vcpu)) { 1855 save_msrs(vcpu->host_msrs + msr_offset_kernel_gs_base, 1); 1856 load_msrs(vcpu->guest_msrs, NR_BAD_MSRS); 1857 } 1858#endif 1859 1860 asm ( 1861 /* Store host registers */ 1862 "pushf \n\t" 1863#ifdef CONFIG_X86_64 1864 "push %%rax; push %%rbx; push %%rdx;" 1865 "push %%rsi; push %%rdi; push %%rbp;" 1866 "push %%r8; push %%r9; push %%r10; push %%r11;" 1867 "push %%r12; push %%r13; push %%r14; push %%r15;" 1868 "push %%rcx \n\t" 1869 ASM_VMX_VMWRITE_RSP_RDX "\n\t" 1870#else 1871 "pusha; push %%ecx \n\t" 1872 ASM_VMX_VMWRITE_RSP_RDX "\n\t" 1873#endif 1874 /* Check if vmlaunch of vmresume is needed */ 1875 "cmp $0, %1 \n\t" 1876 /* Load guest registers. Don't clobber flags. */ 1877#ifdef CONFIG_X86_64 1878 "mov %c[cr2](%3), %%rax \n\t" 1879 "mov %%rax, %%cr2 \n\t" 1880 "mov %c[rax](%3), %%rax \n\t" 1881 "mov %c[rbx](%3), %%rbx \n\t" 1882 "mov %c[rdx](%3), %%rdx \n\t" 1883 "mov %c[rsi](%3), %%rsi \n\t" 1884 "mov %c[rdi](%3), %%rdi \n\t" 1885 "mov %c[rbp](%3), %%rbp \n\t" 1886 "mov %c[r8](%3), %%r8 \n\t" 1887 "mov %c[r9](%3), %%r9 \n\t" 1888 "mov %c[r10](%3), %%r10 \n\t" 1889 "mov %c[r11](%3), %%r11 \n\t" 1890 "mov %c[r12](%3), %%r12 \n\t" 1891 "mov %c[r13](%3), %%r13 \n\t" 1892 "mov %c[r14](%3), %%r14 \n\t" 1893 "mov %c[r15](%3), %%r15 \n\t" 1894 "mov %c[rcx](%3), %%rcx \n\t" /* kills %3 (rcx) */ 1895#else 1896 "mov %c[cr2](%3), %%eax \n\t" 1897 "mov %%eax, %%cr2 \n\t" 1898 "mov %c[rax](%3), %%eax \n\t" 1899 "mov %c[rbx](%3), %%ebx \n\t" 1900 "mov %c[rdx](%3), %%edx \n\t" 1901 "mov %c[rsi](%3), %%esi \n\t" 1902 "mov %c[rdi](%3), %%edi \n\t" 1903 "mov %c[rbp](%3), %%ebp \n\t" 1904 "mov %c[rcx](%3), %%ecx \n\t" /* kills %3 (ecx) */ 1905#endif 1906 /* Enter guest mode */ 1907 "jne launched \n\t" 1908 ASM_VMX_VMLAUNCH "\n\t" 1909 "jmp kvm_vmx_return \n\t" 1910 "launched: " ASM_VMX_VMRESUME "\n\t" 1911 ".globl kvm_vmx_return \n\t" 1912 "kvm_vmx_return: " 1913 /* Save guest registers, load host registers, keep flags */ 1914#ifdef CONFIG_X86_64 1915 "xchg %3, (%%rsp) \n\t" 1916 "mov %%rax, %c[rax](%3) \n\t" 1917 "mov %%rbx, %c[rbx](%3) \n\t" 1918 "pushq (%%rsp); popq %c[rcx](%3) \n\t" 1919 "mov %%rdx, %c[rdx](%3) \n\t" 1920 "mov %%rsi, %c[rsi](%3) \n\t" 1921 "mov %%rdi, %c[rdi](%3) \n\t" 1922 "mov %%rbp, %c[rbp](%3) \n\t" 1923 "mov %%r8, %c[r8](%3) \n\t" 1924 "mov %%r9, %c[r9](%3) \n\t" 1925 "mov %%r10, %c[r10](%3) \n\t" 1926 "mov %%r11, %c[r11](%3) \n\t" 1927 "mov %%r12, %c[r12](%3) \n\t" 1928 "mov %%r13, %c[r13](%3) \n\t" 1929 "mov %%r14, %c[r14](%3) \n\t" 1930 "mov %%r15, %c[r15](%3) \n\t" 1931 "mov %%cr2, %%rax \n\t" 1932 "mov %%rax, %c[cr2](%3) \n\t" 1933 "mov (%%rsp), %3 \n\t" 1934 1935 "pop %%rcx; pop %%r15; pop %%r14; pop %%r13; pop %%r12;" 1936 "pop %%r11; pop %%r10; pop %%r9; pop %%r8;" 1937 "pop %%rbp; pop %%rdi; pop %%rsi;" 1938 "pop %%rdx; pop %%rbx; pop %%rax \n\t" 1939#else 1940 "xchg %3, (%%esp) \n\t" 1941 "mov %%eax, %c[rax](%3) \n\t" 1942 "mov %%ebx, %c[rbx](%3) \n\t" 1943 "pushl (%%esp); popl %c[rcx](%3) \n\t" 1944 "mov %%edx, %c[rdx](%3) \n\t" 1945 "mov %%esi, %c[rsi](%3) \n\t" 1946 "mov %%edi, %c[rdi](%3) \n\t" 1947 "mov %%ebp, %c[rbp](%3) \n\t" 1948 "mov %%cr2, %%eax \n\t" 1949 "mov %%eax, %c[cr2](%3) \n\t" 1950 "mov (%%esp), %3 \n\t" 1951 1952 "pop %%ecx; popa \n\t" 1953#endif 1954 "setbe %0 \n\t" 1955 "popf \n\t" 1956 : "=q" (fail) 1957 : "r"(vcpu->launched), "d"((unsigned long)HOST_RSP), 1958 "c"(vcpu), 1959 [rax]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RAX])), 1960 [rbx]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RBX])), 1961 [rcx]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RCX])), 1962 [rdx]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RDX])), 1963 [rsi]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RSI])), 1964 [rdi]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RDI])), 1965 [rbp]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RBP])), 1966#ifdef CONFIG_X86_64 1967 [r8 ]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R8 ])), 1968 [r9 ]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R9 ])), 1969 [r10]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R10])), 1970 [r11]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R11])), 1971 [r12]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R12])), 1972 [r13]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R13])), 1973 [r14]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R14])), 1974 [r15]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R15])), 1975#endif 1976 [cr2]"i"(offsetof(struct kvm_vcpu, cr2)) 1977 : "cc", "memory" ); 1978 1979 /* 1980 * Reload segment selectors ASAP. (it's needed for a functional 1981 * kernel: x86 relies on having __KERNEL_PDA in %fs and x86_64 1982 * relies on having 0 in %gs for the CPU PDA to work.) 1983 */ 1984 if (fs_gs_ldt_reload_needed) { 1985 load_ldt(ldt_sel); 1986 load_fs(fs_sel); 1987 /* 1988 * If we have to reload gs, we must take care to 1989 * preserve our gs base. 1990 */ 1991 local_irq_disable(); 1992 load_gs(gs_sel); 1993#ifdef CONFIG_X86_64 1994 wrmsrl(MSR_GS_BASE, vmcs_readl(HOST_GS_BASE)); 1995#endif 1996 local_irq_enable(); 1997 1998 reload_tss(); 1999 } 2000 ++vcpu->stat.exits; 2001 2002#ifdef CONFIG_X86_64 2003 if (is_long_mode(vcpu)) { 2004 save_msrs(vcpu->guest_msrs, NR_BAD_MSRS); 2005 load_msrs(vcpu->host_msrs, NR_BAD_MSRS); 2006 } 2007#endif 2008 2009 vcpu->interrupt_window_open = (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0; 2010 2011 asm ("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS)); 2012 2013 if (fail) { 2014 kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; 2015 kvm_run->fail_entry.hardware_entry_failure_reason 2016 = vmcs_read32(VM_INSTRUCTION_ERROR); 2017 r = 0; 2018 } else { 2019 /* 2020 * Profile KVM exit RIPs: 2021 */ 2022 if (unlikely(prof_on == KVM_PROFILING)) 2023 profile_hit(KVM_PROFILING, (void *)vmcs_readl(GUEST_RIP)); 2024 2025 vcpu->launched = 1; 2026 r = kvm_handle_exit(kvm_run, vcpu); 2027 if (r > 0) { 2028 /* Give scheduler a change to reschedule. */ 2029 if (signal_pending(current)) { 2030 ++vcpu->stat.signal_exits; 2031 post_kvm_run_save(vcpu, kvm_run); 2032 kvm_run->exit_reason = KVM_EXIT_INTR; 2033 return -EINTR; 2034 } 2035 2036 if (dm_request_for_irq_injection(vcpu, kvm_run)) { 2037 ++vcpu->stat.request_irq_exits; 2038 post_kvm_run_save(vcpu, kvm_run); 2039 kvm_run->exit_reason = KVM_EXIT_INTR; 2040 return -EINTR; 2041 } 2042 2043 kvm_resched(vcpu); 2044 goto again; 2045 } 2046 } 2047 2048 post_kvm_run_save(vcpu, kvm_run); 2049 return r; 2050} 2051 2052static void vmx_flush_tlb(struct kvm_vcpu *vcpu) 2053{ 2054 vmcs_writel(GUEST_CR3, vmcs_readl(GUEST_CR3)); 2055} 2056 2057static void vmx_inject_page_fault(struct kvm_vcpu *vcpu, 2058 unsigned long addr, 2059 u32 err_code) 2060{ 2061 u32 vect_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); 2062 2063 ++vcpu->stat.pf_guest; 2064 2065 if (is_page_fault(vect_info)) { 2066 printk(KERN_DEBUG "inject_page_fault: " 2067 "double fault 0x%lx @ 0x%lx\n", 2068 addr, vmcs_readl(GUEST_RIP)); 2069 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, 0); 2070 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 2071 DF_VECTOR | 2072 INTR_TYPE_EXCEPTION | 2073 INTR_INFO_DELIEVER_CODE_MASK | 2074 INTR_INFO_VALID_MASK); 2075 return; 2076 } 2077 vcpu->cr2 = addr; 2078 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, err_code); 2079 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 2080 PF_VECTOR | 2081 INTR_TYPE_EXCEPTION | 2082 INTR_INFO_DELIEVER_CODE_MASK | 2083 INTR_INFO_VALID_MASK); 2084 2085} 2086 2087static void vmx_free_vmcs(struct kvm_vcpu *vcpu) 2088{ 2089 if (vcpu->vmcs) { 2090 on_each_cpu(__vcpu_clear, vcpu, 0, 1); 2091 free_vmcs(vcpu->vmcs); 2092 vcpu->vmcs = NULL; 2093 } 2094} 2095 2096static void vmx_free_vcpu(struct kvm_vcpu *vcpu) 2097{ 2098 vmx_free_vmcs(vcpu); 2099} 2100 2101static int vmx_create_vcpu(struct kvm_vcpu *vcpu) 2102{ 2103 struct vmcs *vmcs; 2104 2105 vcpu->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL); 2106 if (!vcpu->guest_msrs) 2107 return -ENOMEM; 2108 2109 vcpu->host_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL); 2110 if (!vcpu->host_msrs) 2111 goto out_free_guest_msrs; 2112 2113 vmcs = alloc_vmcs(); 2114 if (!vmcs) 2115 goto out_free_msrs; 2116 2117 vmcs_clear(vmcs); 2118 vcpu->vmcs = vmcs; 2119 vcpu->launched = 0; 2120 vcpu->fpu_active = 1; 2121 2122 return 0; 2123 2124out_free_msrs: 2125 kfree(vcpu->host_msrs); 2126 vcpu->host_msrs = NULL; 2127 2128out_free_guest_msrs: 2129 kfree(vcpu->guest_msrs); 2130 vcpu->guest_msrs = NULL; 2131 2132 return -ENOMEM; 2133} 2134 2135static struct kvm_arch_ops vmx_arch_ops = { 2136 .cpu_has_kvm_support = cpu_has_kvm_support, 2137 .disabled_by_bios = vmx_disabled_by_bios, 2138 .hardware_setup = hardware_setup, 2139 .hardware_unsetup = hardware_unsetup, 2140 .hardware_enable = hardware_enable, 2141 .hardware_disable = hardware_disable, 2142 2143 .vcpu_create = vmx_create_vcpu, 2144 .vcpu_free = vmx_free_vcpu, 2145 2146 .vcpu_load = vmx_vcpu_load, 2147 .vcpu_put = vmx_vcpu_put, 2148 .vcpu_decache = vmx_vcpu_decache, 2149 2150 .set_guest_debug = set_guest_debug, 2151 .get_msr = vmx_get_msr, 2152 .set_msr = vmx_set_msr, 2153 .get_segment_base = vmx_get_segment_base, 2154 .get_segment = vmx_get_segment, 2155 .set_segment = vmx_set_segment, 2156 .get_cs_db_l_bits = vmx_get_cs_db_l_bits, 2157 .decache_cr4_guest_bits = vmx_decache_cr4_guest_bits, 2158 .set_cr0 = vmx_set_cr0, 2159 .set_cr3 = vmx_set_cr3, 2160 .set_cr4 = vmx_set_cr4, 2161#ifdef CONFIG_X86_64 2162 .set_efer = vmx_set_efer, 2163#endif 2164 .get_idt = vmx_get_idt, 2165 .set_idt = vmx_set_idt, 2166 .get_gdt = vmx_get_gdt, 2167 .set_gdt = vmx_set_gdt, 2168 .cache_regs = vcpu_load_rsp_rip, 2169 .decache_regs = vcpu_put_rsp_rip, 2170 .get_rflags = vmx_get_rflags, 2171 .set_rflags = vmx_set_rflags, 2172 2173 .tlb_flush = vmx_flush_tlb, 2174 .inject_page_fault = vmx_inject_page_fault, 2175 2176 .inject_gp = vmx_inject_gp, 2177 2178 .run = vmx_vcpu_run, 2179 .skip_emulated_instruction = skip_emulated_instruction, 2180 .vcpu_setup = vmx_vcpu_setup, 2181 .patch_hypercall = vmx_patch_hypercall, 2182}; 2183 2184static int __init vmx_init(void) 2185{ 2186 return kvm_init_arch(&vmx_arch_ops, THIS_MODULE); 2187} 2188 2189static void __exit vmx_exit(void) 2190{ 2191 kvm_exit_arch(); 2192} 2193 2194module_init(vmx_init) 2195module_exit(vmx_exit) 2196