1/* 2 * Kernel-based Virtual Machine driver for Linux 3 * 4 * This module enables machines with Intel VT-x extensions to run virtual 5 * machines without emulation or binary translation. 6 * 7 * Copyright (C) 2006 Qumranet, Inc. 8 * 9 * Authors: 10 * Avi Kivity <avi@qumranet.com> 11 * Yaniv Kamay <yaniv@qumranet.com> 12 * 13 * This work is licensed under the terms of the GNU GPL, version 2. See 14 * the COPYING file in the top-level directory. 15 * 16 */ 17 18#include "kvm.h" 19 20#include <linux/kvm.h> 21#include <linux/module.h> 22#include <linux/errno.h> 23#include <linux/magic.h> 24#include <asm/processor.h> 25#include <linux/percpu.h> 26#include <linux/gfp.h> 27#include <asm/msr.h> 28#include <linux/mm.h> 29#include <linux/miscdevice.h> 30#include <linux/vmalloc.h> 31#include <asm/uaccess.h> 32#include <linux/reboot.h> 33#include <asm/io.h> 34#include <linux/debugfs.h> 35#include <linux/highmem.h> 36#include <linux/file.h> 37#include <asm/desc.h> 38#include <linux/sysdev.h> 39#include <linux/cpu.h> 40#include <linux/file.h> 41#include <linux/fs.h> 42#include <linux/mount.h> 43#include <linux/sched.h> 44 45#include "x86_emulate.h" 46#include "segment_descriptor.h" 47 48MODULE_AUTHOR("Qumranet"); 49MODULE_LICENSE("GPL"); 50 51static DEFINE_SPINLOCK(kvm_lock); 52static LIST_HEAD(vm_list); 53 54struct kvm_arch_ops *kvm_arch_ops; 55 56#define STAT_OFFSET(x) offsetof(struct kvm_vcpu, stat.x) 57 58static struct kvm_stats_debugfs_item { 59 const char *name; 60 int offset; 61 struct dentry *dentry; 62} debugfs_entries[] = { 63 { "pf_fixed", STAT_OFFSET(pf_fixed) }, 64 { "pf_guest", STAT_OFFSET(pf_guest) }, 65 { "tlb_flush", STAT_OFFSET(tlb_flush) }, 66 { "invlpg", STAT_OFFSET(invlpg) }, 67 { "exits", STAT_OFFSET(exits) }, 68 { "io_exits", STAT_OFFSET(io_exits) }, 69 { "mmio_exits", STAT_OFFSET(mmio_exits) }, 70 { "signal_exits", STAT_OFFSET(signal_exits) }, 71 { "irq_window", STAT_OFFSET(irq_window_exits) }, 72 { "halt_exits", STAT_OFFSET(halt_exits) }, 73 { "request_irq", STAT_OFFSET(request_irq_exits) }, 74 { "irq_exits", STAT_OFFSET(irq_exits) }, 75 { NULL } 76}; 77 78static struct dentry *debugfs_dir; 79 80struct vfsmount *kvmfs_mnt; 81 82#define MAX_IO_MSRS 256 83 84#define CR0_RESEVED_BITS 0xffffffff1ffaffc0ULL 85#define LMSW_GUEST_MASK 0x0eULL 86#define CR4_RESEVED_BITS (~((1ULL << 11) - 1)) 87#define CR8_RESEVED_BITS (~0x0fULL) 88#define EFER_RESERVED_BITS 0xfffffffffffff2fe 89 90#ifdef CONFIG_X86_64 91// LDT or TSS descriptor in the GDT. 16 bytes. 92struct segment_descriptor_64 { 93 struct segment_descriptor s; 94 u32 base_higher; 95 u32 pad_zero; 96}; 97 98#endif 99 100static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl, 101 unsigned long arg); 102 103static struct inode *kvmfs_inode(struct file_operations *fops) 104{ 105 int error = -ENOMEM; 106 struct inode *inode = new_inode(kvmfs_mnt->mnt_sb); 107 108 if (!inode) 109 goto eexit_1; 110 111 inode->i_fop = fops; 112 113 /* 114 * Mark the inode dirty from the very beginning, 115 * that way it will never be moved to the dirty 116 * list because mark_inode_dirty() will think 117 * that it already _is_ on the dirty list. 118 */ 119 inode->i_state = I_DIRTY; 120 inode->i_mode = S_IRUSR | S_IWUSR; 121 inode->i_uid = current->fsuid; 122 inode->i_gid = current->fsgid; 123 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 124 return inode; 125 126eexit_1: 127 return ERR_PTR(error); 128} 129 130static struct file *kvmfs_file(struct inode *inode, void *private_data) 131{ 132 struct file *file = get_empty_filp(); 133 134 if (!file) 135 return ERR_PTR(-ENFILE); 136 137 file->f_path.mnt = mntget(kvmfs_mnt); 138 file->f_path.dentry = d_alloc_anon(inode); 139 if (!file->f_path.dentry) 140 return ERR_PTR(-ENOMEM); 141 file->f_mapping = inode->i_mapping; 142 143 file->f_pos = 0; 144 file->f_flags = O_RDWR; 145 file->f_op = inode->i_fop; 146 file->f_mode = FMODE_READ | FMODE_WRITE; 147 file->f_version = 0; 148 file->private_data = private_data; 149 return file; 150} 151 152unsigned long segment_base(u16 selector) 153{ 154 struct descriptor_table gdt; 155 struct segment_descriptor *d; 156 unsigned long table_base; 157 typedef unsigned long ul; 158 unsigned long v; 159 160 if (selector == 0) 161 return 0; 162 163 asm ("sgdt %0" : "=m"(gdt)); 164 table_base = gdt.base; 165 166 if (selector & 4) { /* from ldt */ 167 u16 ldt_selector; 168 169 asm ("sldt %0" : "=g"(ldt_selector)); 170 table_base = segment_base(ldt_selector); 171 } 172 d = (struct segment_descriptor *)(table_base + (selector & ~7)); 173 v = d->base_low | ((ul)d->base_mid << 16) | ((ul)d->base_high << 24); 174#ifdef CONFIG_X86_64 175 if (d->system == 0 176 && (d->type == 2 || d->type == 9 || d->type == 11)) 177 v |= ((ul)((struct segment_descriptor_64 *)d)->base_higher) << 32; 178#endif 179 return v; 180} 181EXPORT_SYMBOL_GPL(segment_base); 182 183static inline int valid_vcpu(int n) 184{ 185 return likely(n >= 0 && n < KVM_MAX_VCPUS); 186} 187 188int kvm_read_guest(struct kvm_vcpu *vcpu, gva_t addr, unsigned long size, 189 void *dest) 190{ 191 unsigned char *host_buf = dest; 192 unsigned long req_size = size; 193 194 while (size) { 195 hpa_t paddr; 196 unsigned now; 197 unsigned offset; 198 hva_t guest_buf; 199 200 paddr = gva_to_hpa(vcpu, addr); 201 202 if (is_error_hpa(paddr)) 203 break; 204 205 guest_buf = (hva_t)kmap_atomic( 206 pfn_to_page(paddr >> PAGE_SHIFT), 207 KM_USER0); 208 offset = addr & ~PAGE_MASK; 209 guest_buf |= offset; 210 now = min(size, PAGE_SIZE - offset); 211 memcpy(host_buf, (void*)guest_buf, now); 212 host_buf += now; 213 addr += now; 214 size -= now; 215 kunmap_atomic((void *)(guest_buf & PAGE_MASK), KM_USER0); 216 } 217 return req_size - size; 218} 219EXPORT_SYMBOL_GPL(kvm_read_guest); 220 221int kvm_write_guest(struct kvm_vcpu *vcpu, gva_t addr, unsigned long size, 222 void *data) 223{ 224 unsigned char *host_buf = data; 225 unsigned long req_size = size; 226 227 while (size) { 228 hpa_t paddr; 229 unsigned now; 230 unsigned offset; 231 hva_t guest_buf; 232 gfn_t gfn; 233 234 paddr = gva_to_hpa(vcpu, addr); 235 236 if (is_error_hpa(paddr)) 237 break; 238 239 gfn = vcpu->mmu.gva_to_gpa(vcpu, addr) >> PAGE_SHIFT; 240 mark_page_dirty(vcpu->kvm, gfn); 241 guest_buf = (hva_t)kmap_atomic( 242 pfn_to_page(paddr >> PAGE_SHIFT), KM_USER0); 243 offset = addr & ~PAGE_MASK; 244 guest_buf |= offset; 245 now = min(size, PAGE_SIZE - offset); 246 memcpy((void*)guest_buf, host_buf, now); 247 host_buf += now; 248 addr += now; 249 size -= now; 250 kunmap_atomic((void *)(guest_buf & PAGE_MASK), KM_USER0); 251 } 252 return req_size - size; 253} 254EXPORT_SYMBOL_GPL(kvm_write_guest); 255 256void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) 257{ 258 if (!vcpu->fpu_active || vcpu->guest_fpu_loaded) 259 return; 260 261 vcpu->guest_fpu_loaded = 1; 262 fx_save(vcpu->host_fx_image); 263 fx_restore(vcpu->guest_fx_image); 264} 265EXPORT_SYMBOL_GPL(kvm_load_guest_fpu); 266 267void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) 268{ 269 if (!vcpu->guest_fpu_loaded) 270 return; 271 272 vcpu->guest_fpu_loaded = 0; 273 fx_save(vcpu->guest_fx_image); 274 fx_restore(vcpu->host_fx_image); 275} 276EXPORT_SYMBOL_GPL(kvm_put_guest_fpu); 277 278/* 279 * Switches to specified vcpu, until a matching vcpu_put() 280 */ 281static void vcpu_load(struct kvm_vcpu *vcpu) 282{ 283 mutex_lock(&vcpu->mutex); 284 kvm_arch_ops->vcpu_load(vcpu); 285} 286 287/* 288 * Switches to specified vcpu, until a matching vcpu_put(). Will return NULL 289 * if the slot is not populated. 290 */ 291static struct kvm_vcpu *vcpu_load_slot(struct kvm *kvm, int slot) 292{ 293 struct kvm_vcpu *vcpu = &kvm->vcpus[slot]; 294 295 mutex_lock(&vcpu->mutex); 296 if (!vcpu->vmcs) { 297 mutex_unlock(&vcpu->mutex); 298 return NULL; 299 } 300 kvm_arch_ops->vcpu_load(vcpu); 301 return vcpu; 302} 303 304static void vcpu_put(struct kvm_vcpu *vcpu) 305{ 306 kvm_arch_ops->vcpu_put(vcpu); 307 mutex_unlock(&vcpu->mutex); 308} 309 310static struct kvm *kvm_create_vm(void) 311{ 312 struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL); 313 int i; 314 315 if (!kvm) 316 return ERR_PTR(-ENOMEM); 317 318 spin_lock_init(&kvm->lock); 319 INIT_LIST_HEAD(&kvm->active_mmu_pages); 320 for (i = 0; i < KVM_MAX_VCPUS; ++i) { 321 struct kvm_vcpu *vcpu = &kvm->vcpus[i]; 322 323 mutex_init(&vcpu->mutex); 324 vcpu->cpu = -1; 325 vcpu->kvm = kvm; 326 vcpu->mmu.root_hpa = INVALID_PAGE; 327 INIT_LIST_HEAD(&vcpu->free_pages); 328 spin_lock(&kvm_lock); 329 list_add(&kvm->vm_list, &vm_list); 330 spin_unlock(&kvm_lock); 331 } 332 return kvm; 333} 334 335static int kvm_dev_open(struct inode *inode, struct file *filp) 336{ 337 return 0; 338} 339 340/* 341 * Free any memory in @free but not in @dont. 342 */ 343static void kvm_free_physmem_slot(struct kvm_memory_slot *free, 344 struct kvm_memory_slot *dont) 345{ 346 int i; 347 348 if (!dont || free->phys_mem != dont->phys_mem) 349 if (free->phys_mem) { 350 for (i = 0; i < free->npages; ++i) 351 if (free->phys_mem[i]) 352 __free_page(free->phys_mem[i]); 353 vfree(free->phys_mem); 354 } 355 356 if (!dont || free->dirty_bitmap != dont->dirty_bitmap) 357 vfree(free->dirty_bitmap); 358 359 free->phys_mem = NULL; 360 free->npages = 0; 361 free->dirty_bitmap = NULL; 362} 363 364static void kvm_free_physmem(struct kvm *kvm) 365{ 366 int i; 367 368 for (i = 0; i < kvm->nmemslots; ++i) 369 kvm_free_physmem_slot(&kvm->memslots[i], NULL); 370} 371 372static void free_pio_guest_pages(struct kvm_vcpu *vcpu) 373{ 374 int i; 375 376 for (i = 0; i < 2; ++i) 377 if (vcpu->pio.guest_pages[i]) { 378 __free_page(vcpu->pio.guest_pages[i]); 379 vcpu->pio.guest_pages[i] = NULL; 380 } 381} 382 383static void kvm_free_vcpu(struct kvm_vcpu *vcpu) 384{ 385 if (!vcpu->vmcs) 386 return; 387 388 vcpu_load(vcpu); 389 kvm_mmu_destroy(vcpu); 390 vcpu_put(vcpu); 391 kvm_arch_ops->vcpu_free(vcpu); 392 free_page((unsigned long)vcpu->run); 393 vcpu->run = NULL; 394 free_page((unsigned long)vcpu->pio_data); 395 vcpu->pio_data = NULL; 396 free_pio_guest_pages(vcpu); 397} 398 399static void kvm_free_vcpus(struct kvm *kvm) 400{ 401 unsigned int i; 402 403 for (i = 0; i < KVM_MAX_VCPUS; ++i) 404 kvm_free_vcpu(&kvm->vcpus[i]); 405} 406 407static int kvm_dev_release(struct inode *inode, struct file *filp) 408{ 409 return 0; 410} 411 412static void kvm_destroy_vm(struct kvm *kvm) 413{ 414 spin_lock(&kvm_lock); 415 list_del(&kvm->vm_list); 416 spin_unlock(&kvm_lock); 417 kvm_free_vcpus(kvm); 418 kvm_free_physmem(kvm); 419 kfree(kvm); 420} 421 422static int kvm_vm_release(struct inode *inode, struct file *filp) 423{ 424 struct kvm *kvm = filp->private_data; 425 426 kvm_destroy_vm(kvm); 427 return 0; 428} 429 430static void inject_gp(struct kvm_vcpu *vcpu) 431{ 432 kvm_arch_ops->inject_gp(vcpu, 0); 433} 434 435/* 436 * Load the pae pdptrs. Return true is they are all valid. 437 */ 438static int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3) 439{ 440 gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT; 441 unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2; 442 int i; 443 u64 pdpte; 444 u64 *pdpt; 445 int ret; 446 struct page *page; 447 448 spin_lock(&vcpu->kvm->lock); 449 page = gfn_to_page(vcpu->kvm, pdpt_gfn); 450 pdpt = kmap_atomic(page, KM_USER0); 451 452 ret = 1; 453 for (i = 0; i < 4; ++i) { 454 pdpte = pdpt[offset + i]; 455 if ((pdpte & 1) && (pdpte & 0xfffffff0000001e6ull)) { 456 ret = 0; 457 goto out; 458 } 459 } 460 461 for (i = 0; i < 4; ++i) 462 vcpu->pdptrs[i] = pdpt[offset + i]; 463 464out: 465 kunmap_atomic(pdpt, KM_USER0); 466 spin_unlock(&vcpu->kvm->lock); 467 468 return ret; 469} 470 471void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) 472{ 473 if (cr0 & CR0_RESEVED_BITS) { 474 printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n", 475 cr0, vcpu->cr0); 476 inject_gp(vcpu); 477 return; 478 } 479 480 if ((cr0 & CR0_NW_MASK) && !(cr0 & CR0_CD_MASK)) { 481 printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n"); 482 inject_gp(vcpu); 483 return; 484 } 485 486 if ((cr0 & CR0_PG_MASK) && !(cr0 & CR0_PE_MASK)) { 487 printk(KERN_DEBUG "set_cr0: #GP, set PG flag " 488 "and a clear PE flag\n"); 489 inject_gp(vcpu); 490 return; 491 } 492 493 if (!is_paging(vcpu) && (cr0 & CR0_PG_MASK)) { 494#ifdef CONFIG_X86_64 495 if ((vcpu->shadow_efer & EFER_LME)) { 496 int cs_db, cs_l; 497 498 if (!is_pae(vcpu)) { 499 printk(KERN_DEBUG "set_cr0: #GP, start paging " 500 "in long mode while PAE is disabled\n"); 501 inject_gp(vcpu); 502 return; 503 } 504 kvm_arch_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); 505 if (cs_l) { 506 printk(KERN_DEBUG "set_cr0: #GP, start paging " 507 "in long mode while CS.L == 1\n"); 508 inject_gp(vcpu); 509 return; 510 511 } 512 } else 513#endif 514 if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->cr3)) { 515 printk(KERN_DEBUG "set_cr0: #GP, pdptrs " 516 "reserved bits\n"); 517 inject_gp(vcpu); 518 return; 519 } 520 521 } 522 523 kvm_arch_ops->set_cr0(vcpu, cr0); 524 vcpu->cr0 = cr0; 525 526 spin_lock(&vcpu->kvm->lock); 527 kvm_mmu_reset_context(vcpu); 528 spin_unlock(&vcpu->kvm->lock); 529 return; 530} 531EXPORT_SYMBOL_GPL(set_cr0); 532 533void lmsw(struct kvm_vcpu *vcpu, unsigned long msw) 534{ 535 set_cr0(vcpu, (vcpu->cr0 & ~0x0ful) | (msw & 0x0f)); 536} 537EXPORT_SYMBOL_GPL(lmsw); 538 539void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) 540{ 541 if (cr4 & CR4_RESEVED_BITS) { 542 printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n"); 543 inject_gp(vcpu); 544 return; 545 } 546 547 if (is_long_mode(vcpu)) { 548 if (!(cr4 & CR4_PAE_MASK)) { 549 printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while " 550 "in long mode\n"); 551 inject_gp(vcpu); 552 return; 553 } 554 } else if (is_paging(vcpu) && !is_pae(vcpu) && (cr4 & CR4_PAE_MASK) 555 && !load_pdptrs(vcpu, vcpu->cr3)) { 556 printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n"); 557 inject_gp(vcpu); 558 } 559 560 if (cr4 & CR4_VMXE_MASK) { 561 printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n"); 562 inject_gp(vcpu); 563 return; 564 } 565 kvm_arch_ops->set_cr4(vcpu, cr4); 566 spin_lock(&vcpu->kvm->lock); 567 kvm_mmu_reset_context(vcpu); 568 spin_unlock(&vcpu->kvm->lock); 569} 570EXPORT_SYMBOL_GPL(set_cr4); 571 572void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) 573{ 574 if (is_long_mode(vcpu)) { 575 if (cr3 & CR3_L_MODE_RESEVED_BITS) { 576 printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n"); 577 inject_gp(vcpu); 578 return; 579 } 580 } else { 581 if (cr3 & CR3_RESEVED_BITS) { 582 printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n"); 583 inject_gp(vcpu); 584 return; 585 } 586 if (is_paging(vcpu) && is_pae(vcpu) && 587 !load_pdptrs(vcpu, cr3)) { 588 printk(KERN_DEBUG "set_cr3: #GP, pdptrs " 589 "reserved bits\n"); 590 inject_gp(vcpu); 591 return; 592 } 593 } 594 595 vcpu->cr3 = cr3; 596 spin_lock(&vcpu->kvm->lock); 597 /* 598 * Does the new cr3 value map to physical memory? (Note, we 599 * catch an invalid cr3 even in real-mode, because it would 600 * cause trouble later on when we turn on paging anyway.) 601 * 602 * A real CPU would silently accept an invalid cr3 and would 603 * attempt to use it - with largely undefined (and often hard 604 * to debug) behavior on the guest side. 605 */ 606 if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT))) 607 inject_gp(vcpu); 608 else 609 vcpu->mmu.new_cr3(vcpu); 610 spin_unlock(&vcpu->kvm->lock); 611} 612EXPORT_SYMBOL_GPL(set_cr3); 613 614void set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) 615{ 616 if ( cr8 & CR8_RESEVED_BITS) { 617 printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8); 618 inject_gp(vcpu); 619 return; 620 } 621 vcpu->cr8 = cr8; 622} 623EXPORT_SYMBOL_GPL(set_cr8); 624 625void fx_init(struct kvm_vcpu *vcpu) 626{ 627 struct __attribute__ ((__packed__)) fx_image_s { 628 u16 control; //fcw 629 u16 status; //fsw 630 u16 tag; // ftw 631 u16 opcode; //fop 632 u64 ip; // fpu ip 633 u64 operand;// fpu dp 634 u32 mxcsr; 635 u32 mxcsr_mask; 636 637 } *fx_image; 638 639 fx_save(vcpu->host_fx_image); 640 fpu_init(); 641 fx_save(vcpu->guest_fx_image); 642 fx_restore(vcpu->host_fx_image); 643 644 fx_image = (struct fx_image_s *)vcpu->guest_fx_image; 645 fx_image->mxcsr = 0x1f80; 646 memset(vcpu->guest_fx_image + sizeof(struct fx_image_s), 647 0, FX_IMAGE_SIZE - sizeof(struct fx_image_s)); 648} 649EXPORT_SYMBOL_GPL(fx_init); 650 651static void do_remove_write_access(struct kvm_vcpu *vcpu, int slot) 652{ 653 spin_lock(&vcpu->kvm->lock); 654 kvm_mmu_slot_remove_write_access(vcpu, slot); 655 spin_unlock(&vcpu->kvm->lock); 656} 657 658/* 659 * Allocate some memory and give it an address in the guest physical address 660 * space. 661 * 662 * Discontiguous memory is allowed, mostly for framebuffers. 663 */ 664static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm, 665 struct kvm_memory_region *mem) 666{ 667 int r; 668 gfn_t base_gfn; 669 unsigned long npages; 670 unsigned long i; 671 struct kvm_memory_slot *memslot; 672 struct kvm_memory_slot old, new; 673 int memory_config_version; 674 675 r = -EINVAL; 676 /* General sanity checks */ 677 if (mem->memory_size & (PAGE_SIZE - 1)) 678 goto out; 679 if (mem->guest_phys_addr & (PAGE_SIZE - 1)) 680 goto out; 681 if (mem->slot >= KVM_MEMORY_SLOTS) 682 goto out; 683 if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr) 684 goto out; 685 686 memslot = &kvm->memslots[mem->slot]; 687 base_gfn = mem->guest_phys_addr >> PAGE_SHIFT; 688 npages = mem->memory_size >> PAGE_SHIFT; 689 690 if (!npages) 691 mem->flags &= ~KVM_MEM_LOG_DIRTY_PAGES; 692 693raced: 694 spin_lock(&kvm->lock); 695 696 memory_config_version = kvm->memory_config_version; 697 new = old = *memslot; 698 699 new.base_gfn = base_gfn; 700 new.npages = npages; 701 new.flags = mem->flags; 702 703 /* Disallow changing a memory slot's size. */ 704 r = -EINVAL; 705 if (npages && old.npages && npages != old.npages) 706 goto out_unlock; 707 708 /* Check for overlaps */ 709 r = -EEXIST; 710 for (i = 0; i < KVM_MEMORY_SLOTS; ++i) { 711 struct kvm_memory_slot *s = &kvm->memslots[i]; 712 713 if (s == memslot) 714 continue; 715 if (!((base_gfn + npages <= s->base_gfn) || 716 (base_gfn >= s->base_gfn + s->npages))) 717 goto out_unlock; 718 } 719 /* 720 * Do memory allocations outside lock. memory_config_version will 721 * detect any races. 722 */ 723 spin_unlock(&kvm->lock); 724 725 /* Deallocate if slot is being removed */ 726 if (!npages) 727 new.phys_mem = NULL; 728 729 /* Free page dirty bitmap if unneeded */ 730 if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES)) 731 new.dirty_bitmap = NULL; 732 733 r = -ENOMEM; 734 735 /* Allocate if a slot is being created */ 736 if (npages && !new.phys_mem) { 737 new.phys_mem = vmalloc(npages * sizeof(struct page *)); 738 739 if (!new.phys_mem) 740 goto out_free; 741 742 memset(new.phys_mem, 0, npages * sizeof(struct page *)); 743 for (i = 0; i < npages; ++i) { 744 new.phys_mem[i] = alloc_page(GFP_HIGHUSER 745 | __GFP_ZERO); 746 if (!new.phys_mem[i]) 747 goto out_free; 748 set_page_private(new.phys_mem[i],0); 749 } 750 } 751 752 /* Allocate page dirty bitmap if needed */ 753 if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) { 754 unsigned dirty_bytes = ALIGN(npages, BITS_PER_LONG) / 8; 755 756 new.dirty_bitmap = vmalloc(dirty_bytes); 757 if (!new.dirty_bitmap) 758 goto out_free; 759 memset(new.dirty_bitmap, 0, dirty_bytes); 760 } 761 762 spin_lock(&kvm->lock); 763 764 if (memory_config_version != kvm->memory_config_version) { 765 spin_unlock(&kvm->lock); 766 kvm_free_physmem_slot(&new, &old); 767 goto raced; 768 } 769 770 r = -EAGAIN; 771 if (kvm->busy) 772 goto out_unlock; 773 774 if (mem->slot >= kvm->nmemslots) 775 kvm->nmemslots = mem->slot + 1; 776 777 *memslot = new; 778 ++kvm->memory_config_version; 779 780 spin_unlock(&kvm->lock); 781 782 for (i = 0; i < KVM_MAX_VCPUS; ++i) { 783 struct kvm_vcpu *vcpu; 784 785 vcpu = vcpu_load_slot(kvm, i); 786 if (!vcpu) 787 continue; 788 if (new.flags & KVM_MEM_LOG_DIRTY_PAGES) 789 do_remove_write_access(vcpu, mem->slot); 790 kvm_mmu_reset_context(vcpu); 791 vcpu_put(vcpu); 792 } 793 794 kvm_free_physmem_slot(&old, &new); 795 return 0; 796 797out_unlock: 798 spin_unlock(&kvm->lock); 799out_free: 800 kvm_free_physmem_slot(&new, &old); 801out: 802 return r; 803} 804 805/* 806 * Get (and clear) the dirty memory log for a memory slot. 807 */ 808static int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, 809 struct kvm_dirty_log *log) 810{ 811 struct kvm_memory_slot *memslot; 812 int r, i; 813 int n; 814 int cleared; 815 unsigned long any = 0; 816 817 spin_lock(&kvm->lock); 818 819 /* 820 * Prevent changes to guest memory configuration even while the lock 821 * is not taken. 822 */ 823 ++kvm->busy; 824 spin_unlock(&kvm->lock); 825 r = -EINVAL; 826 if (log->slot >= KVM_MEMORY_SLOTS) 827 goto out; 828 829 memslot = &kvm->memslots[log->slot]; 830 r = -ENOENT; 831 if (!memslot->dirty_bitmap) 832 goto out; 833 834 n = ALIGN(memslot->npages, BITS_PER_LONG) / 8; 835 836 for (i = 0; !any && i < n/sizeof(long); ++i) 837 any = memslot->dirty_bitmap[i]; 838 839 r = -EFAULT; 840 if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n)) 841 goto out; 842 843 if (any) { 844 cleared = 0; 845 for (i = 0; i < KVM_MAX_VCPUS; ++i) { 846 struct kvm_vcpu *vcpu; 847 848 vcpu = vcpu_load_slot(kvm, i); 849 if (!vcpu) 850 continue; 851 if (!cleared) { 852 do_remove_write_access(vcpu, log->slot); 853 memset(memslot->dirty_bitmap, 0, n); 854 cleared = 1; 855 } 856 kvm_arch_ops->tlb_flush(vcpu); 857 vcpu_put(vcpu); 858 } 859 } 860 861 r = 0; 862 863out: 864 spin_lock(&kvm->lock); 865 --kvm->busy; 866 spin_unlock(&kvm->lock); 867 return r; 868} 869 870/* 871 * Set a new alias region. Aliases map a portion of physical memory into 872 * another portion. This is useful for memory windows, for example the PC 873 * VGA region. 874 */ 875static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm, 876 struct kvm_memory_alias *alias) 877{ 878 int r, n; 879 struct kvm_mem_alias *p; 880 881 r = -EINVAL; 882 /* General sanity checks */ 883 if (alias->memory_size & (PAGE_SIZE - 1)) 884 goto out; 885 if (alias->guest_phys_addr & (PAGE_SIZE - 1)) 886 goto out; 887 if (alias->slot >= KVM_ALIAS_SLOTS) 888 goto out; 889 if (alias->guest_phys_addr + alias->memory_size 890 < alias->guest_phys_addr) 891 goto out; 892 if (alias->target_phys_addr + alias->memory_size 893 < alias->target_phys_addr) 894 goto out; 895 896 spin_lock(&kvm->lock); 897 898 p = &kvm->aliases[alias->slot]; 899 p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT; 900 p->npages = alias->memory_size >> PAGE_SHIFT; 901 p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT; 902 903 for (n = KVM_ALIAS_SLOTS; n > 0; --n) 904 if (kvm->aliases[n - 1].npages) 905 break; 906 kvm->naliases = n; 907 908 spin_unlock(&kvm->lock); 909 910 vcpu_load(&kvm->vcpus[0]); 911 spin_lock(&kvm->lock); 912 kvm_mmu_zap_all(&kvm->vcpus[0]); 913 spin_unlock(&kvm->lock); 914 vcpu_put(&kvm->vcpus[0]); 915 916 return 0; 917 918out: 919 return r; 920} 921 922static gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn) 923{ 924 int i; 925 struct kvm_mem_alias *alias; 926 927 for (i = 0; i < kvm->naliases; ++i) { 928 alias = &kvm->aliases[i]; 929 if (gfn >= alias->base_gfn 930 && gfn < alias->base_gfn + alias->npages) 931 return alias->target_gfn + gfn - alias->base_gfn; 932 } 933 return gfn; 934} 935 936static struct kvm_memory_slot *__gfn_to_memslot(struct kvm *kvm, gfn_t gfn) 937{ 938 int i; 939 940 for (i = 0; i < kvm->nmemslots; ++i) { 941 struct kvm_memory_slot *memslot = &kvm->memslots[i]; 942 943 if (gfn >= memslot->base_gfn 944 && gfn < memslot->base_gfn + memslot->npages) 945 return memslot; 946 } 947 return NULL; 948} 949 950struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn) 951{ 952 gfn = unalias_gfn(kvm, gfn); 953 return __gfn_to_memslot(kvm, gfn); 954} 955 956struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn) 957{ 958 struct kvm_memory_slot *slot; 959 960 gfn = unalias_gfn(kvm, gfn); 961 slot = __gfn_to_memslot(kvm, gfn); 962 if (!slot) 963 return NULL; 964 return slot->phys_mem[gfn - slot->base_gfn]; 965} 966EXPORT_SYMBOL_GPL(gfn_to_page); 967 968void mark_page_dirty(struct kvm *kvm, gfn_t gfn) 969{ 970 int i; 971 struct kvm_memory_slot *memslot = NULL; 972 unsigned long rel_gfn; 973 974 for (i = 0; i < kvm->nmemslots; ++i) { 975 memslot = &kvm->memslots[i]; 976 977 if (gfn >= memslot->base_gfn 978 && gfn < memslot->base_gfn + memslot->npages) { 979 980 if (!memslot || !memslot->dirty_bitmap) 981 return; 982 983 rel_gfn = gfn - memslot->base_gfn; 984 985 /* avoid RMW */ 986 if (!test_bit(rel_gfn, memslot->dirty_bitmap)) 987 set_bit(rel_gfn, memslot->dirty_bitmap); 988 return; 989 } 990 } 991} 992 993static int emulator_read_std(unsigned long addr, 994 void *val, 995 unsigned int bytes, 996 struct x86_emulate_ctxt *ctxt) 997{ 998 struct kvm_vcpu *vcpu = ctxt->vcpu; 999 void *data = val; 1000 1001 while (bytes) { 1002 gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr); 1003 unsigned offset = addr & (PAGE_SIZE-1); 1004 unsigned tocopy = min(bytes, (unsigned)PAGE_SIZE - offset); 1005 unsigned long pfn; 1006 struct page *page; 1007 void *page_virt; 1008 1009 if (gpa == UNMAPPED_GVA) 1010 return X86EMUL_PROPAGATE_FAULT; 1011 pfn = gpa >> PAGE_SHIFT; 1012 page = gfn_to_page(vcpu->kvm, pfn); 1013 if (!page) 1014 return X86EMUL_UNHANDLEABLE; 1015 page_virt = kmap_atomic(page, KM_USER0); 1016 1017 memcpy(data, page_virt + offset, tocopy); 1018 1019 kunmap_atomic(page_virt, KM_USER0); 1020 1021 bytes -= tocopy; 1022 data += tocopy; 1023 addr += tocopy; 1024 } 1025 1026 return X86EMUL_CONTINUE; 1027} 1028 1029static int emulator_write_std(unsigned long addr, 1030 const void *val, 1031 unsigned int bytes, 1032 struct x86_emulate_ctxt *ctxt) 1033{ 1034 printk(KERN_ERR "emulator_write_std: addr %lx n %d\n", 1035 addr, bytes); 1036 return X86EMUL_UNHANDLEABLE; 1037} 1038 1039static int emulator_read_emulated(unsigned long addr, 1040 void *val, 1041 unsigned int bytes, 1042 struct x86_emulate_ctxt *ctxt) 1043{ 1044 struct kvm_vcpu *vcpu = ctxt->vcpu; 1045 1046 if (vcpu->mmio_read_completed) { 1047 memcpy(val, vcpu->mmio_data, bytes); 1048 vcpu->mmio_read_completed = 0; 1049 return X86EMUL_CONTINUE; 1050 } else if (emulator_read_std(addr, val, bytes, ctxt) 1051 == X86EMUL_CONTINUE) 1052 return X86EMUL_CONTINUE; 1053 else { 1054 gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr); 1055 1056 if (gpa == UNMAPPED_GVA) 1057 return X86EMUL_PROPAGATE_FAULT; 1058 vcpu->mmio_needed = 1; 1059 vcpu->mmio_phys_addr = gpa; 1060 vcpu->mmio_size = bytes; 1061 vcpu->mmio_is_write = 0; 1062 1063 return X86EMUL_UNHANDLEABLE; 1064 } 1065} 1066 1067static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, 1068 const void *val, int bytes) 1069{ 1070 struct page *page; 1071 void *virt; 1072 1073 if (((gpa + bytes - 1) >> PAGE_SHIFT) != (gpa >> PAGE_SHIFT)) 1074 return 0; 1075 page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); 1076 if (!page) 1077 return 0; 1078 kvm_mmu_pre_write(vcpu, gpa, bytes); 1079 mark_page_dirty(vcpu->kvm, gpa >> PAGE_SHIFT); 1080 virt = kmap_atomic(page, KM_USER0); 1081 memcpy(virt + offset_in_page(gpa), val, bytes); 1082 kunmap_atomic(virt, KM_USER0); 1083 kvm_mmu_post_write(vcpu, gpa, bytes); 1084 return 1; 1085} 1086 1087static int emulator_write_emulated(unsigned long addr, 1088 const void *val, 1089 unsigned int bytes, 1090 struct x86_emulate_ctxt *ctxt) 1091{ 1092 struct kvm_vcpu *vcpu = ctxt->vcpu; 1093 gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr); 1094 1095 if (gpa == UNMAPPED_GVA) { 1096 kvm_arch_ops->inject_page_fault(vcpu, addr, 2); 1097 return X86EMUL_PROPAGATE_FAULT; 1098 } 1099 1100 if (emulator_write_phys(vcpu, gpa, val, bytes)) 1101 return X86EMUL_CONTINUE; 1102 1103 vcpu->mmio_needed = 1; 1104 vcpu->mmio_phys_addr = gpa; 1105 vcpu->mmio_size = bytes; 1106 vcpu->mmio_is_write = 1; 1107 memcpy(vcpu->mmio_data, val, bytes); 1108 1109 return X86EMUL_CONTINUE; 1110} 1111 1112static int emulator_cmpxchg_emulated(unsigned long addr, 1113 const void *old, 1114 const void *new, 1115 unsigned int bytes, 1116 struct x86_emulate_ctxt *ctxt) 1117{ 1118 static int reported; 1119 1120 if (!reported) { 1121 reported = 1; 1122 printk(KERN_WARNING "kvm: emulating exchange as write\n"); 1123 } 1124 return emulator_write_emulated(addr, new, bytes, ctxt); 1125} 1126 1127static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg) 1128{ 1129 return kvm_arch_ops->get_segment_base(vcpu, seg); 1130} 1131 1132int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address) 1133{ 1134 return X86EMUL_CONTINUE; 1135} 1136 1137int emulate_clts(struct kvm_vcpu *vcpu) 1138{ 1139 unsigned long cr0; 1140 1141 cr0 = vcpu->cr0 & ~CR0_TS_MASK; 1142 kvm_arch_ops->set_cr0(vcpu, cr0); 1143 return X86EMUL_CONTINUE; 1144} 1145 1146int emulator_get_dr(struct x86_emulate_ctxt* ctxt, int dr, unsigned long *dest) 1147{ 1148 struct kvm_vcpu *vcpu = ctxt->vcpu; 1149 1150 switch (dr) { 1151 case 0 ... 3: 1152 *dest = kvm_arch_ops->get_dr(vcpu, dr); 1153 return X86EMUL_CONTINUE; 1154 default: 1155 printk(KERN_DEBUG "%s: unexpected dr %u\n", 1156 __FUNCTION__, dr); 1157 return X86EMUL_UNHANDLEABLE; 1158 } 1159} 1160 1161int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value) 1162{ 1163 unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U; 1164 int exception; 1165 1166 kvm_arch_ops->set_dr(ctxt->vcpu, dr, value & mask, &exception); 1167 if (exception) { 1168 return X86EMUL_UNHANDLEABLE; 1169 } 1170 return X86EMUL_CONTINUE; 1171} 1172 1173static void report_emulation_failure(struct x86_emulate_ctxt *ctxt) 1174{ 1175 static int reported; 1176 u8 opcodes[4]; 1177 unsigned long rip = ctxt->vcpu->rip; 1178 unsigned long rip_linear; 1179 1180 rip_linear = rip + get_segment_base(ctxt->vcpu, VCPU_SREG_CS); 1181 1182 if (reported) 1183 return; 1184 1185 emulator_read_std(rip_linear, (void *)opcodes, 4, ctxt); 1186 1187 printk(KERN_ERR "emulation failed but !mmio_needed?" 1188 " rip %lx %02x %02x %02x %02x\n", 1189 rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]); 1190 reported = 1; 1191} 1192 1193struct x86_emulate_ops emulate_ops = { 1194 .read_std = emulator_read_std, 1195 .write_std = emulator_write_std, 1196 .read_emulated = emulator_read_emulated, 1197 .write_emulated = emulator_write_emulated, 1198 .cmpxchg_emulated = emulator_cmpxchg_emulated, 1199}; 1200 1201int emulate_instruction(struct kvm_vcpu *vcpu, 1202 struct kvm_run *run, 1203 unsigned long cr2, 1204 u16 error_code) 1205{ 1206 struct x86_emulate_ctxt emulate_ctxt; 1207 int r; 1208 int cs_db, cs_l; 1209 1210 vcpu->mmio_fault_cr2 = cr2; 1211 kvm_arch_ops->cache_regs(vcpu); 1212 1213 kvm_arch_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); 1214 1215 emulate_ctxt.vcpu = vcpu; 1216 emulate_ctxt.eflags = kvm_arch_ops->get_rflags(vcpu); 1217 emulate_ctxt.cr2 = cr2; 1218 emulate_ctxt.mode = (emulate_ctxt.eflags & X86_EFLAGS_VM) 1219 ? X86EMUL_MODE_REAL : cs_l 1220 ? X86EMUL_MODE_PROT64 : cs_db 1221 ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; 1222 1223 if (emulate_ctxt.mode == X86EMUL_MODE_PROT64) { 1224 emulate_ctxt.cs_base = 0; 1225 emulate_ctxt.ds_base = 0; 1226 emulate_ctxt.es_base = 0; 1227 emulate_ctxt.ss_base = 0; 1228 } else { 1229 emulate_ctxt.cs_base = get_segment_base(vcpu, VCPU_SREG_CS); 1230 emulate_ctxt.ds_base = get_segment_base(vcpu, VCPU_SREG_DS); 1231 emulate_ctxt.es_base = get_segment_base(vcpu, VCPU_SREG_ES); 1232 emulate_ctxt.ss_base = get_segment_base(vcpu, VCPU_SREG_SS); 1233 } 1234 1235 emulate_ctxt.gs_base = get_segment_base(vcpu, VCPU_SREG_GS); 1236 emulate_ctxt.fs_base = get_segment_base(vcpu, VCPU_SREG_FS); 1237 1238 vcpu->mmio_is_write = 0; 1239 r = x86_emulate_memop(&emulate_ctxt, &emulate_ops); 1240 1241 if ((r || vcpu->mmio_is_write) && run) { 1242 run->mmio.phys_addr = vcpu->mmio_phys_addr; 1243 memcpy(run->mmio.data, vcpu->mmio_data, 8); 1244 run->mmio.len = vcpu->mmio_size; 1245 run->mmio.is_write = vcpu->mmio_is_write; 1246 } 1247 1248 if (r) { 1249 if (kvm_mmu_unprotect_page_virt(vcpu, cr2)) 1250 return EMULATE_DONE; 1251 if (!vcpu->mmio_needed) { 1252 report_emulation_failure(&emulate_ctxt); 1253 return EMULATE_FAIL; 1254 } 1255 return EMULATE_DO_MMIO; 1256 } 1257 1258 kvm_arch_ops->decache_regs(vcpu); 1259 kvm_arch_ops->set_rflags(vcpu, emulate_ctxt.eflags); 1260 1261 if (vcpu->mmio_is_write) { 1262 vcpu->mmio_needed = 0; 1263 return EMULATE_DO_MMIO; 1264 } 1265 1266 return EMULATE_DONE; 1267} 1268EXPORT_SYMBOL_GPL(emulate_instruction); 1269 1270int kvm_hypercall(struct kvm_vcpu *vcpu, struct kvm_run *run) 1271{ 1272 unsigned long nr, a0, a1, a2, a3, a4, a5, ret; 1273 1274 kvm_arch_ops->cache_regs(vcpu); 1275 ret = -KVM_EINVAL; 1276#ifdef CONFIG_X86_64 1277 if (is_long_mode(vcpu)) { 1278 nr = vcpu->regs[VCPU_REGS_RAX]; 1279 a0 = vcpu->regs[VCPU_REGS_RDI]; 1280 a1 = vcpu->regs[VCPU_REGS_RSI]; 1281 a2 = vcpu->regs[VCPU_REGS_RDX]; 1282 a3 = vcpu->regs[VCPU_REGS_RCX]; 1283 a4 = vcpu->regs[VCPU_REGS_R8]; 1284 a5 = vcpu->regs[VCPU_REGS_R9]; 1285 } else 1286#endif 1287 { 1288 nr = vcpu->regs[VCPU_REGS_RBX] & -1u; 1289 a0 = vcpu->regs[VCPU_REGS_RAX] & -1u; 1290 a1 = vcpu->regs[VCPU_REGS_RCX] & -1u; 1291 a2 = vcpu->regs[VCPU_REGS_RDX] & -1u; 1292 a3 = vcpu->regs[VCPU_REGS_RSI] & -1u; 1293 a4 = vcpu->regs[VCPU_REGS_RDI] & -1u; 1294 a5 = vcpu->regs[VCPU_REGS_RBP] & -1u; 1295 } 1296 switch (nr) { 1297 default: 1298 run->hypercall.args[0] = a0; 1299 run->hypercall.args[1] = a1; 1300 run->hypercall.args[2] = a2; 1301 run->hypercall.args[3] = a3; 1302 run->hypercall.args[4] = a4; 1303 run->hypercall.args[5] = a5; 1304 run->hypercall.ret = ret; 1305 run->hypercall.longmode = is_long_mode(vcpu); 1306 kvm_arch_ops->decache_regs(vcpu); 1307 return 0; 1308 } 1309 vcpu->regs[VCPU_REGS_RAX] = ret; 1310 kvm_arch_ops->decache_regs(vcpu); 1311 return 1; 1312} 1313EXPORT_SYMBOL_GPL(kvm_hypercall); 1314 1315static u64 mk_cr_64(u64 curr_cr, u32 new_val) 1316{ 1317 return (curr_cr & ~((1ULL << 32) - 1)) | new_val; 1318} 1319 1320void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base) 1321{ 1322 struct descriptor_table dt = { limit, base }; 1323 1324 kvm_arch_ops->set_gdt(vcpu, &dt); 1325} 1326 1327void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base) 1328{ 1329 struct descriptor_table dt = { limit, base }; 1330 1331 kvm_arch_ops->set_idt(vcpu, &dt); 1332} 1333 1334void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw, 1335 unsigned long *rflags) 1336{ 1337 lmsw(vcpu, msw); 1338 *rflags = kvm_arch_ops->get_rflags(vcpu); 1339} 1340 1341unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr) 1342{ 1343 kvm_arch_ops->decache_cr4_guest_bits(vcpu); 1344 switch (cr) { 1345 case 0: 1346 return vcpu->cr0; 1347 case 2: 1348 return vcpu->cr2; 1349 case 3: 1350 return vcpu->cr3; 1351 case 4: 1352 return vcpu->cr4; 1353 default: 1354 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr); 1355 return 0; 1356 } 1357} 1358 1359void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val, 1360 unsigned long *rflags) 1361{ 1362 switch (cr) { 1363 case 0: 1364 set_cr0(vcpu, mk_cr_64(vcpu->cr0, val)); 1365 *rflags = kvm_arch_ops->get_rflags(vcpu); 1366 break; 1367 case 2: 1368 vcpu->cr2 = val; 1369 break; 1370 case 3: 1371 set_cr3(vcpu, val); 1372 break; 1373 case 4: 1374 set_cr4(vcpu, mk_cr_64(vcpu->cr4, val)); 1375 break; 1376 default: 1377 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr); 1378 } 1379} 1380 1381/* 1382 * Register the para guest with the host: 1383 */ 1384static int vcpu_register_para(struct kvm_vcpu *vcpu, gpa_t para_state_gpa) 1385{ 1386 struct kvm_vcpu_para_state *para_state; 1387 hpa_t para_state_hpa, hypercall_hpa; 1388 struct page *para_state_page; 1389 unsigned char *hypercall; 1390 gpa_t hypercall_gpa; 1391 1392 printk(KERN_DEBUG "kvm: guest trying to enter paravirtual mode\n"); 1393 printk(KERN_DEBUG ".... para_state_gpa: %08Lx\n", para_state_gpa); 1394 1395 /* 1396 * Needs to be page aligned: 1397 */ 1398 if (para_state_gpa != PAGE_ALIGN(para_state_gpa)) 1399 goto err_gp; 1400 1401 para_state_hpa = gpa_to_hpa(vcpu, para_state_gpa); 1402 printk(KERN_DEBUG ".... para_state_hpa: %08Lx\n", para_state_hpa); 1403 if (is_error_hpa(para_state_hpa)) 1404 goto err_gp; 1405 1406 mark_page_dirty(vcpu->kvm, para_state_gpa >> PAGE_SHIFT); 1407 para_state_page = pfn_to_page(para_state_hpa >> PAGE_SHIFT); 1408 para_state = kmap_atomic(para_state_page, KM_USER0); 1409 1410 printk(KERN_DEBUG ".... guest version: %d\n", para_state->guest_version); 1411 printk(KERN_DEBUG ".... size: %d\n", para_state->size); 1412 1413 para_state->host_version = KVM_PARA_API_VERSION; 1414 /* 1415 * We cannot support guests that try to register themselves 1416 * with a newer API version than the host supports: 1417 */ 1418 if (para_state->guest_version > KVM_PARA_API_VERSION) { 1419 para_state->ret = -KVM_EINVAL; 1420 goto err_kunmap_skip; 1421 } 1422 1423 hypercall_gpa = para_state->hypercall_gpa; 1424 hypercall_hpa = gpa_to_hpa(vcpu, hypercall_gpa); 1425 printk(KERN_DEBUG ".... hypercall_hpa: %08Lx\n", hypercall_hpa); 1426 if (is_error_hpa(hypercall_hpa)) { 1427 para_state->ret = -KVM_EINVAL; 1428 goto err_kunmap_skip; 1429 } 1430 1431 printk(KERN_DEBUG "kvm: para guest successfully registered.\n"); 1432 vcpu->para_state_page = para_state_page; 1433 vcpu->para_state_gpa = para_state_gpa; 1434 vcpu->hypercall_gpa = hypercall_gpa; 1435 1436 mark_page_dirty(vcpu->kvm, hypercall_gpa >> PAGE_SHIFT); 1437 hypercall = kmap_atomic(pfn_to_page(hypercall_hpa >> PAGE_SHIFT), 1438 KM_USER1) + (hypercall_hpa & ~PAGE_MASK); 1439 kvm_arch_ops->patch_hypercall(vcpu, hypercall); 1440 kunmap_atomic(hypercall, KM_USER1); 1441 1442 para_state->ret = 0; 1443err_kunmap_skip: 1444 kunmap_atomic(para_state, KM_USER0); 1445 return 0; 1446err_gp: 1447 return 1; 1448} 1449 1450int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) 1451{ 1452 u64 data; 1453 1454 switch (msr) { 1455 case 0xc0010010: /* SYSCFG */ 1456 case 0xc0010015: /* HWCR */ 1457 case MSR_IA32_PLATFORM_ID: 1458 case MSR_IA32_P5_MC_ADDR: 1459 case MSR_IA32_P5_MC_TYPE: 1460 case MSR_IA32_MC0_CTL: 1461 case MSR_IA32_MCG_STATUS: 1462 case MSR_IA32_MCG_CAP: 1463 case MSR_IA32_MC0_MISC: 1464 case MSR_IA32_MC0_MISC+4: 1465 case MSR_IA32_MC0_MISC+8: 1466 case MSR_IA32_MC0_MISC+12: 1467 case MSR_IA32_MC0_MISC+16: 1468 case MSR_IA32_UCODE_REV: 1469 case MSR_IA32_PERF_STATUS: 1470 /* MTRR registers */ 1471 case 0xfe: 1472 case 0x200 ... 0x2ff: 1473 data = 0; 1474 break; 1475 case 0xcd: /* fsb frequency */ 1476 data = 3; 1477 break; 1478 case MSR_IA32_APICBASE: 1479 data = vcpu->apic_base; 1480 break; 1481 case MSR_IA32_MISC_ENABLE: 1482 data = vcpu->ia32_misc_enable_msr; 1483 break; 1484#ifdef CONFIG_X86_64 1485 case MSR_EFER: 1486 data = vcpu->shadow_efer; 1487 break; 1488#endif 1489 default: 1490 printk(KERN_ERR "kvm: unhandled rdmsr: 0x%x\n", msr); 1491 return 1; 1492 } 1493 *pdata = data; 1494 return 0; 1495} 1496EXPORT_SYMBOL_GPL(kvm_get_msr_common); 1497 1498/* 1499 * Reads an msr value (of 'msr_index') into 'pdata'. 1500 * Returns 0 on success, non-0 otherwise. 1501 * Assumes vcpu_load() was already called. 1502 */ 1503static int get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) 1504{ 1505 return kvm_arch_ops->get_msr(vcpu, msr_index, pdata); 1506} 1507 1508#ifdef CONFIG_X86_64 1509 1510static void set_efer(struct kvm_vcpu *vcpu, u64 efer) 1511{ 1512 if (efer & EFER_RESERVED_BITS) { 1513 printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n", 1514 efer); 1515 inject_gp(vcpu); 1516 return; 1517 } 1518 1519 if (is_paging(vcpu) 1520 && (vcpu->shadow_efer & EFER_LME) != (efer & EFER_LME)) { 1521 printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n"); 1522 inject_gp(vcpu); 1523 return; 1524 } 1525 1526 kvm_arch_ops->set_efer(vcpu, efer); 1527 1528 efer &= ~EFER_LMA; 1529 efer |= vcpu->shadow_efer & EFER_LMA; 1530 1531 vcpu->shadow_efer = efer; 1532} 1533 1534#endif 1535 1536int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) 1537{ 1538 switch (msr) { 1539#ifdef CONFIG_X86_64 1540 case MSR_EFER: 1541 set_efer(vcpu, data); 1542 break; 1543#endif 1544 case MSR_IA32_MC0_STATUS: 1545 printk(KERN_WARNING "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n", 1546 __FUNCTION__, data); 1547 break; 1548 case MSR_IA32_MCG_STATUS: 1549 printk(KERN_WARNING "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n", 1550 __FUNCTION__, data); 1551 break; 1552 case MSR_IA32_UCODE_REV: 1553 case MSR_IA32_UCODE_WRITE: 1554 case 0x200 ... 0x2ff: /* MTRRs */ 1555 break; 1556 case MSR_IA32_APICBASE: 1557 vcpu->apic_base = data; 1558 break; 1559 case MSR_IA32_MISC_ENABLE: 1560 vcpu->ia32_misc_enable_msr = data; 1561 break; 1562 /* 1563 * This is the 'probe whether the host is KVM' logic: 1564 */ 1565 case MSR_KVM_API_MAGIC: 1566 return vcpu_register_para(vcpu, data); 1567 1568 default: 1569 printk(KERN_ERR "kvm: unhandled wrmsr: 0x%x\n", msr); 1570 return 1; 1571 } 1572 return 0; 1573} 1574EXPORT_SYMBOL_GPL(kvm_set_msr_common); 1575 1576/* 1577 * Writes msr value into into the appropriate "register". 1578 * Returns 0 on success, non-0 otherwise. 1579 * Assumes vcpu_load() was already called. 1580 */ 1581static int set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) 1582{ 1583 return kvm_arch_ops->set_msr(vcpu, msr_index, data); 1584} 1585 1586void kvm_resched(struct kvm_vcpu *vcpu) 1587{ 1588 if (!need_resched()) 1589 return; 1590 vcpu_put(vcpu); 1591 cond_resched(); 1592 vcpu_load(vcpu); 1593} 1594EXPORT_SYMBOL_GPL(kvm_resched); 1595 1596void load_msrs(struct vmx_msr_entry *e, int n) 1597{ 1598 int i; 1599 1600 for (i = 0; i < n; ++i) 1601 wrmsrl(e[i].index, e[i].data); 1602} 1603EXPORT_SYMBOL_GPL(load_msrs); 1604 1605void save_msrs(struct vmx_msr_entry *e, int n) 1606{ 1607 int i; 1608 1609 for (i = 0; i < n; ++i) 1610 rdmsrl(e[i].index, e[i].data); 1611} 1612EXPORT_SYMBOL_GPL(save_msrs); 1613 1614void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) 1615{ 1616 int i; 1617 u32 function; 1618 struct kvm_cpuid_entry *e, *best; 1619 1620 kvm_arch_ops->cache_regs(vcpu); 1621 function = vcpu->regs[VCPU_REGS_RAX]; 1622 vcpu->regs[VCPU_REGS_RAX] = 0; 1623 vcpu->regs[VCPU_REGS_RBX] = 0; 1624 vcpu->regs[VCPU_REGS_RCX] = 0; 1625 vcpu->regs[VCPU_REGS_RDX] = 0; 1626 best = NULL; 1627 for (i = 0; i < vcpu->cpuid_nent; ++i) { 1628 e = &vcpu->cpuid_entries[i]; 1629 if (e->function == function) { 1630 best = e; 1631 break; 1632 } 1633 /* 1634 * Both basic or both extended? 1635 */ 1636 if (((e->function ^ function) & 0x80000000) == 0) 1637 if (!best || e->function > best->function) 1638 best = e; 1639 } 1640 if (best) { 1641 vcpu->regs[VCPU_REGS_RAX] = best->eax; 1642 vcpu->regs[VCPU_REGS_RBX] = best->ebx; 1643 vcpu->regs[VCPU_REGS_RCX] = best->ecx; 1644 vcpu->regs[VCPU_REGS_RDX] = best->edx; 1645 } 1646 kvm_arch_ops->decache_regs(vcpu); 1647 kvm_arch_ops->skip_emulated_instruction(vcpu); 1648} 1649EXPORT_SYMBOL_GPL(kvm_emulate_cpuid); 1650 1651static int pio_copy_data(struct kvm_vcpu *vcpu) 1652{ 1653 void *p = vcpu->pio_data; 1654 void *q; 1655 unsigned bytes; 1656 int nr_pages = vcpu->pio.guest_pages[1] ? 2 : 1; 1657 1658 kvm_arch_ops->vcpu_put(vcpu); 1659 q = vmap(vcpu->pio.guest_pages, nr_pages, VM_READ|VM_WRITE, 1660 PAGE_KERNEL); 1661 if (!q) { 1662 kvm_arch_ops->vcpu_load(vcpu); 1663 free_pio_guest_pages(vcpu); 1664 return -ENOMEM; 1665 } 1666 q += vcpu->pio.guest_page_offset; 1667 bytes = vcpu->pio.size * vcpu->pio.cur_count; 1668 if (vcpu->pio.in) 1669 memcpy(q, p, bytes); 1670 else 1671 memcpy(p, q, bytes); 1672 q -= vcpu->pio.guest_page_offset; 1673 vunmap(q); 1674 kvm_arch_ops->vcpu_load(vcpu); 1675 free_pio_guest_pages(vcpu); 1676 return 0; 1677} 1678 1679static int complete_pio(struct kvm_vcpu *vcpu) 1680{ 1681 struct kvm_pio_request *io = &vcpu->pio; 1682 long delta; 1683 int r; 1684 1685 kvm_arch_ops->cache_regs(vcpu); 1686 1687 if (!io->string) { 1688 if (io->in) 1689 memcpy(&vcpu->regs[VCPU_REGS_RAX], vcpu->pio_data, 1690 io->size); 1691 } else { 1692 if (io->in) { 1693 r = pio_copy_data(vcpu); 1694 if (r) { 1695 kvm_arch_ops->cache_regs(vcpu); 1696 return r; 1697 } 1698 } 1699 1700 delta = 1; 1701 if (io->rep) { 1702 delta *= io->cur_count; 1703 /* 1704 * The size of the register should really depend on 1705 * current address size. 1706 */ 1707 vcpu->regs[VCPU_REGS_RCX] -= delta; 1708 } 1709 if (io->down) 1710 delta = -delta; 1711 delta *= io->size; 1712 if (io->in) 1713 vcpu->regs[VCPU_REGS_RDI] += delta; 1714 else 1715 vcpu->regs[VCPU_REGS_RSI] += delta; 1716 } 1717 1718 kvm_arch_ops->decache_regs(vcpu); 1719 1720 io->count -= io->cur_count; 1721 io->cur_count = 0; 1722 1723 if (!io->count) 1724 kvm_arch_ops->skip_emulated_instruction(vcpu); 1725 return 0; 1726} 1727 1728int kvm_setup_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, 1729 int size, unsigned long count, int string, int down, 1730 gva_t address, int rep, unsigned port) 1731{ 1732 unsigned now, in_page; 1733 int i; 1734 int nr_pages = 1; 1735 struct page *page; 1736 1737 vcpu->run->exit_reason = KVM_EXIT_IO; 1738 vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; 1739 vcpu->run->io.size = size; 1740 vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE; 1741 vcpu->run->io.count = count; 1742 vcpu->run->io.port = port; 1743 vcpu->pio.count = count; 1744 vcpu->pio.cur_count = count; 1745 vcpu->pio.size = size; 1746 vcpu->pio.in = in; 1747 vcpu->pio.string = string; 1748 vcpu->pio.down = down; 1749 vcpu->pio.guest_page_offset = offset_in_page(address); 1750 vcpu->pio.rep = rep; 1751 1752 if (!string) { 1753 kvm_arch_ops->cache_regs(vcpu); 1754 memcpy(vcpu->pio_data, &vcpu->regs[VCPU_REGS_RAX], 4); 1755 kvm_arch_ops->decache_regs(vcpu); 1756 return 0; 1757 } 1758 1759 if (!count) { 1760 kvm_arch_ops->skip_emulated_instruction(vcpu); 1761 return 1; 1762 } 1763 1764 now = min(count, PAGE_SIZE / size); 1765 1766 if (!down) 1767 in_page = PAGE_SIZE - offset_in_page(address); 1768 else 1769 in_page = offset_in_page(address) + size; 1770 now = min(count, (unsigned long)in_page / size); 1771 if (!now) { 1772 /* 1773 * String I/O straddles page boundary. Pin two guest pages 1774 * so that we satisfy atomicity constraints. Do just one 1775 * transaction to avoid complexity. 1776 */ 1777 nr_pages = 2; 1778 now = 1; 1779 } 1780 if (down) { 1781 /* 1782 * String I/O in reverse. Yuck. Kill the guest, fix later. 1783 */ 1784 printk(KERN_ERR "kvm: guest string pio down\n"); 1785 inject_gp(vcpu); 1786 return 1; 1787 } 1788 vcpu->run->io.count = now; 1789 vcpu->pio.cur_count = now; 1790 1791 for (i = 0; i < nr_pages; ++i) { 1792 spin_lock(&vcpu->kvm->lock); 1793 page = gva_to_page(vcpu, address + i * PAGE_SIZE); 1794 if (page) 1795 get_page(page); 1796 vcpu->pio.guest_pages[i] = page; 1797 spin_unlock(&vcpu->kvm->lock); 1798 if (!page) { 1799 inject_gp(vcpu); 1800 free_pio_guest_pages(vcpu); 1801 return 1; 1802 } 1803 } 1804 1805 if (!vcpu->pio.in) 1806 return pio_copy_data(vcpu); 1807 return 0; 1808} 1809EXPORT_SYMBOL_GPL(kvm_setup_pio); 1810 1811static int kvm_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 1812{ 1813 int r; 1814 sigset_t sigsaved; 1815 1816 vcpu_load(vcpu); 1817 1818 if (vcpu->sigset_active) 1819 sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); 1820 1821 /* re-sync apic's tpr */ 1822 vcpu->cr8 = kvm_run->cr8; 1823 1824 if (vcpu->pio.cur_count) { 1825 r = complete_pio(vcpu); 1826 if (r) 1827 goto out; 1828 } 1829 1830 if (vcpu->mmio_needed) { 1831 memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8); 1832 vcpu->mmio_read_completed = 1; 1833 vcpu->mmio_needed = 0; 1834 r = emulate_instruction(vcpu, kvm_run, 1835 vcpu->mmio_fault_cr2, 0); 1836 if (r == EMULATE_DO_MMIO) { 1837 /* 1838 * Read-modify-write. Back to userspace. 1839 */ 1840 kvm_run->exit_reason = KVM_EXIT_MMIO; 1841 r = 0; 1842 goto out; 1843 } 1844 } 1845 1846 if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) { 1847 kvm_arch_ops->cache_regs(vcpu); 1848 vcpu->regs[VCPU_REGS_RAX] = kvm_run->hypercall.ret; 1849 kvm_arch_ops->decache_regs(vcpu); 1850 } 1851 1852 r = kvm_arch_ops->run(vcpu, kvm_run); 1853 1854out: 1855 if (vcpu->sigset_active) 1856 sigprocmask(SIG_SETMASK, &sigsaved, NULL); 1857 1858 vcpu_put(vcpu); 1859 return r; 1860} 1861 1862static int kvm_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, 1863 struct kvm_regs *regs) 1864{ 1865 vcpu_load(vcpu); 1866 1867 kvm_arch_ops->cache_regs(vcpu); 1868 1869 regs->rax = vcpu->regs[VCPU_REGS_RAX]; 1870 regs->rbx = vcpu->regs[VCPU_REGS_RBX]; 1871 regs->rcx = vcpu->regs[VCPU_REGS_RCX]; 1872 regs->rdx = vcpu->regs[VCPU_REGS_RDX]; 1873 regs->rsi = vcpu->regs[VCPU_REGS_RSI]; 1874 regs->rdi = vcpu->regs[VCPU_REGS_RDI]; 1875 regs->rsp = vcpu->regs[VCPU_REGS_RSP]; 1876 regs->rbp = vcpu->regs[VCPU_REGS_RBP]; 1877#ifdef CONFIG_X86_64 1878 regs->r8 = vcpu->regs[VCPU_REGS_R8]; 1879 regs->r9 = vcpu->regs[VCPU_REGS_R9]; 1880 regs->r10 = vcpu->regs[VCPU_REGS_R10]; 1881 regs->r11 = vcpu->regs[VCPU_REGS_R11]; 1882 regs->r12 = vcpu->regs[VCPU_REGS_R12]; 1883 regs->r13 = vcpu->regs[VCPU_REGS_R13]; 1884 regs->r14 = vcpu->regs[VCPU_REGS_R14]; 1885 regs->r15 = vcpu->regs[VCPU_REGS_R15]; 1886#endif 1887 1888 regs->rip = vcpu->rip; 1889 regs->rflags = kvm_arch_ops->get_rflags(vcpu); 1890 1891 /* 1892 * Don't leak debug flags in case they were set for guest debugging 1893 */ 1894 if (vcpu->guest_debug.enabled && vcpu->guest_debug.singlestep) 1895 regs->rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF); 1896 1897 vcpu_put(vcpu); 1898 1899 return 0; 1900} 1901 1902static int kvm_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, 1903 struct kvm_regs *regs) 1904{ 1905 vcpu_load(vcpu); 1906 1907 vcpu->regs[VCPU_REGS_RAX] = regs->rax; 1908 vcpu->regs[VCPU_REGS_RBX] = regs->rbx; 1909 vcpu->regs[VCPU_REGS_RCX] = regs->rcx; 1910 vcpu->regs[VCPU_REGS_RDX] = regs->rdx; 1911 vcpu->regs[VCPU_REGS_RSI] = regs->rsi; 1912 vcpu->regs[VCPU_REGS_RDI] = regs->rdi; 1913 vcpu->regs[VCPU_REGS_RSP] = regs->rsp; 1914 vcpu->regs[VCPU_REGS_RBP] = regs->rbp; 1915#ifdef CONFIG_X86_64 1916 vcpu->regs[VCPU_REGS_R8] = regs->r8; 1917 vcpu->regs[VCPU_REGS_R9] = regs->r9; 1918 vcpu->regs[VCPU_REGS_R10] = regs->r10; 1919 vcpu->regs[VCPU_REGS_R11] = regs->r11; 1920 vcpu->regs[VCPU_REGS_R12] = regs->r12; 1921 vcpu->regs[VCPU_REGS_R13] = regs->r13; 1922 vcpu->regs[VCPU_REGS_R14] = regs->r14; 1923 vcpu->regs[VCPU_REGS_R15] = regs->r15; 1924#endif 1925 1926 vcpu->rip = regs->rip; 1927 kvm_arch_ops->set_rflags(vcpu, regs->rflags); 1928 1929 kvm_arch_ops->decache_regs(vcpu); 1930 1931 vcpu_put(vcpu); 1932 1933 return 0; 1934} 1935 1936static void get_segment(struct kvm_vcpu *vcpu, 1937 struct kvm_segment *var, int seg) 1938{ 1939 return kvm_arch_ops->get_segment(vcpu, var, seg); 1940} 1941 1942static int kvm_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, 1943 struct kvm_sregs *sregs) 1944{ 1945 struct descriptor_table dt; 1946 1947 vcpu_load(vcpu); 1948 1949 get_segment(vcpu, &sregs->cs, VCPU_SREG_CS); 1950 get_segment(vcpu, &sregs->ds, VCPU_SREG_DS); 1951 get_segment(vcpu, &sregs->es, VCPU_SREG_ES); 1952 get_segment(vcpu, &sregs->fs, VCPU_SREG_FS); 1953 get_segment(vcpu, &sregs->gs, VCPU_SREG_GS); 1954 get_segment(vcpu, &sregs->ss, VCPU_SREG_SS); 1955 1956 get_segment(vcpu, &sregs->tr, VCPU_SREG_TR); 1957 get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); 1958 1959 kvm_arch_ops->get_idt(vcpu, &dt); 1960 sregs->idt.limit = dt.limit; 1961 sregs->idt.base = dt.base; 1962 kvm_arch_ops->get_gdt(vcpu, &dt); 1963 sregs->gdt.limit = dt.limit; 1964 sregs->gdt.base = dt.base; 1965 1966 kvm_arch_ops->decache_cr4_guest_bits(vcpu); 1967 sregs->cr0 = vcpu->cr0; 1968 sregs->cr2 = vcpu->cr2; 1969 sregs->cr3 = vcpu->cr3; 1970 sregs->cr4 = vcpu->cr4; 1971 sregs->cr8 = vcpu->cr8; 1972 sregs->efer = vcpu->shadow_efer; 1973 sregs->apic_base = vcpu->apic_base; 1974 1975 memcpy(sregs->interrupt_bitmap, vcpu->irq_pending, 1976 sizeof sregs->interrupt_bitmap); 1977 1978 vcpu_put(vcpu); 1979 1980 return 0; 1981} 1982 1983static void set_segment(struct kvm_vcpu *vcpu, 1984 struct kvm_segment *var, int seg) 1985{ 1986 return kvm_arch_ops->set_segment(vcpu, var, seg); 1987} 1988 1989static int kvm_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, 1990 struct kvm_sregs *sregs) 1991{ 1992 int mmu_reset_needed = 0; 1993 int i; 1994 struct descriptor_table dt; 1995 1996 vcpu_load(vcpu); 1997 1998 dt.limit = sregs->idt.limit; 1999 dt.base = sregs->idt.base; 2000 kvm_arch_ops->set_idt(vcpu, &dt); 2001 dt.limit = sregs->gdt.limit; 2002 dt.base = sregs->gdt.base; 2003 kvm_arch_ops->set_gdt(vcpu, &dt); 2004 2005 vcpu->cr2 = sregs->cr2; 2006 mmu_reset_needed |= vcpu->cr3 != sregs->cr3; 2007 vcpu->cr3 = sregs->cr3; 2008 2009 vcpu->cr8 = sregs->cr8; 2010 2011 mmu_reset_needed |= vcpu->shadow_efer != sregs->efer; 2012#ifdef CONFIG_X86_64 2013 kvm_arch_ops->set_efer(vcpu, sregs->efer); 2014#endif 2015 vcpu->apic_base = sregs->apic_base; 2016 2017 kvm_arch_ops->decache_cr4_guest_bits(vcpu); 2018 2019 mmu_reset_needed |= vcpu->cr0 != sregs->cr0; 2020 kvm_arch_ops->set_cr0(vcpu, sregs->cr0); 2021 2022 mmu_reset_needed |= vcpu->cr4 != sregs->cr4; 2023 kvm_arch_ops->set_cr4(vcpu, sregs->cr4); 2024 if (!is_long_mode(vcpu) && is_pae(vcpu)) 2025 load_pdptrs(vcpu, vcpu->cr3); 2026 2027 if (mmu_reset_needed) 2028 kvm_mmu_reset_context(vcpu); 2029 2030 memcpy(vcpu->irq_pending, sregs->interrupt_bitmap, 2031 sizeof vcpu->irq_pending); 2032 vcpu->irq_summary = 0; 2033 for (i = 0; i < NR_IRQ_WORDS; ++i) 2034 if (vcpu->irq_pending[i]) 2035 __set_bit(i, &vcpu->irq_summary); 2036 2037 set_segment(vcpu, &sregs->cs, VCPU_SREG_CS); 2038 set_segment(vcpu, &sregs->ds, VCPU_SREG_DS); 2039 set_segment(vcpu, &sregs->es, VCPU_SREG_ES); 2040 set_segment(vcpu, &sregs->fs, VCPU_SREG_FS); 2041 set_segment(vcpu, &sregs->gs, VCPU_SREG_GS); 2042 set_segment(vcpu, &sregs->ss, VCPU_SREG_SS); 2043 2044 set_segment(vcpu, &sregs->tr, VCPU_SREG_TR); 2045 set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); 2046 2047 vcpu_put(vcpu); 2048 2049 return 0; 2050} 2051 2052/* 2053 * List of msr numbers which we expose to userspace through KVM_GET_MSRS 2054 * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST. 2055 * 2056 * This list is modified at module load time to reflect the 2057 * capabilities of the host cpu. 2058 */ 2059static u32 msrs_to_save[] = { 2060 MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, 2061 MSR_K6_STAR, 2062#ifdef CONFIG_X86_64 2063 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, 2064#endif 2065 MSR_IA32_TIME_STAMP_COUNTER, 2066}; 2067 2068static unsigned num_msrs_to_save; 2069 2070static u32 emulated_msrs[] = { 2071 MSR_IA32_MISC_ENABLE, 2072}; 2073 2074static __init void kvm_init_msr_list(void) 2075{ 2076 u32 dummy[2]; 2077 unsigned i, j; 2078 2079 for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) { 2080 if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0) 2081 continue; 2082 if (j < i) 2083 msrs_to_save[j] = msrs_to_save[i]; 2084 j++; 2085 } 2086 num_msrs_to_save = j; 2087} 2088 2089/* 2090 * Adapt set_msr() to msr_io()'s calling convention 2091 */ 2092static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) 2093{ 2094 return set_msr(vcpu, index, *data); 2095} 2096 2097/* 2098 * Read or write a bunch of msrs. All parameters are kernel addresses. 2099 * 2100 * @return number of msrs set successfully. 2101 */ 2102static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs, 2103 struct kvm_msr_entry *entries, 2104 int (*do_msr)(struct kvm_vcpu *vcpu, 2105 unsigned index, u64 *data)) 2106{ 2107 int i; 2108 2109 vcpu_load(vcpu); 2110 2111 for (i = 0; i < msrs->nmsrs; ++i) 2112 if (do_msr(vcpu, entries[i].index, &entries[i].data)) 2113 break; 2114 2115 vcpu_put(vcpu); 2116 2117 return i; 2118} 2119 2120/* 2121 * Read or write a bunch of msrs. Parameters are user addresses. 2122 * 2123 * @return number of msrs set successfully. 2124 */ 2125static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs, 2126 int (*do_msr)(struct kvm_vcpu *vcpu, 2127 unsigned index, u64 *data), 2128 int writeback) 2129{ 2130 struct kvm_msrs msrs; 2131 struct kvm_msr_entry *entries; 2132 int r, n; 2133 unsigned size; 2134 2135 r = -EFAULT; 2136 if (copy_from_user(&msrs, user_msrs, sizeof msrs)) 2137 goto out; 2138 2139 r = -E2BIG; 2140 if (msrs.nmsrs >= MAX_IO_MSRS) 2141 goto out; 2142 2143 r = -ENOMEM; 2144 size = sizeof(struct kvm_msr_entry) * msrs.nmsrs; 2145 entries = vmalloc(size); 2146 if (!entries) 2147 goto out; 2148 2149 r = -EFAULT; 2150 if (copy_from_user(entries, user_msrs->entries, size)) 2151 goto out_free; 2152 2153 r = n = __msr_io(vcpu, &msrs, entries, do_msr); 2154 if (r < 0) 2155 goto out_free; 2156 2157 r = -EFAULT; 2158 if (writeback && copy_to_user(user_msrs->entries, entries, size)) 2159 goto out_free; 2160 2161 r = n; 2162 2163out_free: 2164 vfree(entries); 2165out: 2166 return r; 2167} 2168 2169/* 2170 * Translate a guest virtual address to a guest physical address. 2171 */ 2172static int kvm_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, 2173 struct kvm_translation *tr) 2174{ 2175 unsigned long vaddr = tr->linear_address; 2176 gpa_t gpa; 2177 2178 vcpu_load(vcpu); 2179 spin_lock(&vcpu->kvm->lock); 2180 gpa = vcpu->mmu.gva_to_gpa(vcpu, vaddr); 2181 tr->physical_address = gpa; 2182 tr->valid = gpa != UNMAPPED_GVA; 2183 tr->writeable = 1; 2184 tr->usermode = 0; 2185 spin_unlock(&vcpu->kvm->lock); 2186 vcpu_put(vcpu); 2187 2188 return 0; 2189} 2190 2191static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, 2192 struct kvm_interrupt *irq) 2193{ 2194 if (irq->irq < 0 || irq->irq >= 256) 2195 return -EINVAL; 2196 vcpu_load(vcpu); 2197 2198 set_bit(irq->irq, vcpu->irq_pending); 2199 set_bit(irq->irq / BITS_PER_LONG, &vcpu->irq_summary); 2200 2201 vcpu_put(vcpu); 2202 2203 return 0; 2204} 2205 2206static int kvm_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu, 2207 struct kvm_debug_guest *dbg) 2208{ 2209 int r; 2210 2211 vcpu_load(vcpu); 2212 2213 r = kvm_arch_ops->set_guest_debug(vcpu, dbg); 2214 2215 vcpu_put(vcpu); 2216 2217 return r; 2218} 2219 2220static struct page *kvm_vcpu_nopage(struct vm_area_struct *vma, 2221 unsigned long address, 2222 int *type) 2223{ 2224 struct kvm_vcpu *vcpu = vma->vm_file->private_data; 2225 unsigned long pgoff; 2226 struct page *page; 2227 2228 *type = VM_FAULT_MINOR; 2229 pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; 2230 if (pgoff == 0) 2231 page = virt_to_page(vcpu->run); 2232 else if (pgoff == KVM_PIO_PAGE_OFFSET) 2233 page = virt_to_page(vcpu->pio_data); 2234 else 2235 return NOPAGE_SIGBUS; 2236 get_page(page); 2237 return page; 2238} 2239 2240static struct vm_operations_struct kvm_vcpu_vm_ops = { 2241 .nopage = kvm_vcpu_nopage, 2242}; 2243 2244static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma) 2245{ 2246 vma->vm_ops = &kvm_vcpu_vm_ops; 2247 return 0; 2248} 2249 2250static int kvm_vcpu_release(struct inode *inode, struct file *filp) 2251{ 2252 struct kvm_vcpu *vcpu = filp->private_data; 2253 2254 fput(vcpu->kvm->filp); 2255 return 0; 2256} 2257 2258static struct file_operations kvm_vcpu_fops = { 2259 .release = kvm_vcpu_release, 2260 .unlocked_ioctl = kvm_vcpu_ioctl, 2261 .compat_ioctl = kvm_vcpu_ioctl, 2262 .mmap = kvm_vcpu_mmap, 2263}; 2264 2265/* 2266 * Allocates an inode for the vcpu. 2267 */ 2268static int create_vcpu_fd(struct kvm_vcpu *vcpu) 2269{ 2270 int fd, r; 2271 struct inode *inode; 2272 struct file *file; 2273 2274 atomic_inc(&vcpu->kvm->filp->f_count); 2275 inode = kvmfs_inode(&kvm_vcpu_fops); 2276 if (IS_ERR(inode)) { 2277 r = PTR_ERR(inode); 2278 goto out1; 2279 } 2280 2281 file = kvmfs_file(inode, vcpu); 2282 if (IS_ERR(file)) { 2283 r = PTR_ERR(file); 2284 goto out2; 2285 } 2286 2287 r = get_unused_fd(); 2288 if (r < 0) 2289 goto out3; 2290 fd = r; 2291 fd_install(fd, file); 2292 2293 return fd; 2294 2295out3: 2296 fput(file); 2297out2: 2298 iput(inode); 2299out1: 2300 fput(vcpu->kvm->filp); 2301 return r; 2302} 2303 2304/* 2305 * Creates some virtual cpus. Good luck creating more than one. 2306 */ 2307static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n) 2308{ 2309 int r; 2310 struct kvm_vcpu *vcpu; 2311 struct page *page; 2312 2313 r = -EINVAL; 2314 if (!valid_vcpu(n)) 2315 goto out; 2316 2317 vcpu = &kvm->vcpus[n]; 2318 2319 mutex_lock(&vcpu->mutex); 2320 2321 if (vcpu->vmcs) { 2322 mutex_unlock(&vcpu->mutex); 2323 return -EEXIST; 2324 } 2325 2326 page = alloc_page(GFP_KERNEL | __GFP_ZERO); 2327 r = -ENOMEM; 2328 if (!page) 2329 goto out_unlock; 2330 vcpu->run = page_address(page); 2331 2332 page = alloc_page(GFP_KERNEL | __GFP_ZERO); 2333 r = -ENOMEM; 2334 if (!page) 2335 goto out_free_run; 2336 vcpu->pio_data = page_address(page); 2337 2338 vcpu->host_fx_image = (char*)ALIGN((hva_t)vcpu->fx_buf, 2339 FX_IMAGE_ALIGN); 2340 vcpu->guest_fx_image = vcpu->host_fx_image + FX_IMAGE_SIZE; 2341 vcpu->cr0 = 0x10; 2342 2343 r = kvm_arch_ops->vcpu_create(vcpu); 2344 if (r < 0) 2345 goto out_free_vcpus; 2346 2347 r = kvm_mmu_create(vcpu); 2348 if (r < 0) 2349 goto out_free_vcpus; 2350 2351 kvm_arch_ops->vcpu_load(vcpu); 2352 r = kvm_mmu_setup(vcpu); 2353 if (r >= 0) 2354 r = kvm_arch_ops->vcpu_setup(vcpu); 2355 vcpu_put(vcpu); 2356 2357 if (r < 0) 2358 goto out_free_vcpus; 2359 2360 r = create_vcpu_fd(vcpu); 2361 if (r < 0) 2362 goto out_free_vcpus; 2363 2364 return r; 2365 2366out_free_vcpus: 2367 kvm_free_vcpu(vcpu); 2368out_free_run: 2369 free_page((unsigned long)vcpu->run); 2370 vcpu->run = NULL; 2371out_unlock: 2372 mutex_unlock(&vcpu->mutex); 2373out: 2374 return r; 2375} 2376 2377static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, 2378 struct kvm_cpuid *cpuid, 2379 struct kvm_cpuid_entry __user *entries) 2380{ 2381 int r; 2382 2383 r = -E2BIG; 2384 if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) 2385 goto out; 2386 r = -EFAULT; 2387 if (copy_from_user(&vcpu->cpuid_entries, entries, 2388 cpuid->nent * sizeof(struct kvm_cpuid_entry))) 2389 goto out; 2390 vcpu->cpuid_nent = cpuid->nent; 2391 return 0; 2392 2393out: 2394 return r; 2395} 2396 2397static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset) 2398{ 2399 if (sigset) { 2400 sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP)); 2401 vcpu->sigset_active = 1; 2402 vcpu->sigset = *sigset; 2403 } else 2404 vcpu->sigset_active = 0; 2405 return 0; 2406} 2407 2408/* 2409 * fxsave fpu state. Taken from x86_64/processor.h. To be killed when 2410 * we have asm/x86/processor.h 2411 */ 2412struct fxsave { 2413 u16 cwd; 2414 u16 swd; 2415 u16 twd; 2416 u16 fop; 2417 u64 rip; 2418 u64 rdp; 2419 u32 mxcsr; 2420 u32 mxcsr_mask; 2421 u32 st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */ 2422#ifdef CONFIG_X86_64 2423 u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg = 256 bytes */ 2424#else 2425 u32 xmm_space[32]; /* 8*16 bytes for each XMM-reg = 128 bytes */ 2426#endif 2427}; 2428 2429static int kvm_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) 2430{ 2431 struct fxsave *fxsave = (struct fxsave *)vcpu->guest_fx_image; 2432 2433 vcpu_load(vcpu); 2434 2435 memcpy(fpu->fpr, fxsave->st_space, 128); 2436 fpu->fcw = fxsave->cwd; 2437 fpu->fsw = fxsave->swd; 2438 fpu->ftwx = fxsave->twd; 2439 fpu->last_opcode = fxsave->fop; 2440 fpu->last_ip = fxsave->rip; 2441 fpu->last_dp = fxsave->rdp; 2442 memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space); 2443 2444 vcpu_put(vcpu); 2445 2446 return 0; 2447} 2448 2449static int kvm_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) 2450{ 2451 struct fxsave *fxsave = (struct fxsave *)vcpu->guest_fx_image; 2452 2453 vcpu_load(vcpu); 2454 2455 memcpy(fxsave->st_space, fpu->fpr, 128); 2456 fxsave->cwd = fpu->fcw; 2457 fxsave->swd = fpu->fsw; 2458 fxsave->twd = fpu->ftwx; 2459 fxsave->fop = fpu->last_opcode; 2460 fxsave->rip = fpu->last_ip; 2461 fxsave->rdp = fpu->last_dp; 2462 memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space); 2463 2464 vcpu_put(vcpu); 2465 2466 return 0; 2467} 2468 2469static long kvm_vcpu_ioctl(struct file *filp, 2470 unsigned int ioctl, unsigned long arg) 2471{ 2472 struct kvm_vcpu *vcpu = filp->private_data; 2473 void __user *argp = (void __user *)arg; 2474 int r = -EINVAL; 2475 2476 switch (ioctl) { 2477 case KVM_RUN: 2478 r = -EINVAL; 2479 if (arg) 2480 goto out; 2481 r = kvm_vcpu_ioctl_run(vcpu, vcpu->run); 2482 break; 2483 case KVM_GET_REGS: { 2484 struct kvm_regs kvm_regs; 2485 2486 memset(&kvm_regs, 0, sizeof kvm_regs); 2487 r = kvm_vcpu_ioctl_get_regs(vcpu, &kvm_regs); 2488 if (r) 2489 goto out; 2490 r = -EFAULT; 2491 if (copy_to_user(argp, &kvm_regs, sizeof kvm_regs)) 2492 goto out; 2493 r = 0; 2494 break; 2495 } 2496 case KVM_SET_REGS: { 2497 struct kvm_regs kvm_regs; 2498 2499 r = -EFAULT; 2500 if (copy_from_user(&kvm_regs, argp, sizeof kvm_regs)) 2501 goto out; 2502 r = kvm_vcpu_ioctl_set_regs(vcpu, &kvm_regs); 2503 if (r) 2504 goto out; 2505 r = 0; 2506 break; 2507 } 2508 case KVM_GET_SREGS: { 2509 struct kvm_sregs kvm_sregs; 2510 2511 memset(&kvm_sregs, 0, sizeof kvm_sregs); 2512 r = kvm_vcpu_ioctl_get_sregs(vcpu, &kvm_sregs); 2513 if (r) 2514 goto out; 2515 r = -EFAULT; 2516 if (copy_to_user(argp, &kvm_sregs, sizeof kvm_sregs)) 2517 goto out; 2518 r = 0; 2519 break; 2520 } 2521 case KVM_SET_SREGS: { 2522 struct kvm_sregs kvm_sregs; 2523 2524 r = -EFAULT; 2525 if (copy_from_user(&kvm_sregs, argp, sizeof kvm_sregs)) 2526 goto out; 2527 r = kvm_vcpu_ioctl_set_sregs(vcpu, &kvm_sregs); 2528 if (r) 2529 goto out; 2530 r = 0; 2531 break; 2532 } 2533 case KVM_TRANSLATE: { 2534 struct kvm_translation tr; 2535 2536 r = -EFAULT; 2537 if (copy_from_user(&tr, argp, sizeof tr)) 2538 goto out; 2539 r = kvm_vcpu_ioctl_translate(vcpu, &tr); 2540 if (r) 2541 goto out; 2542 r = -EFAULT; 2543 if (copy_to_user(argp, &tr, sizeof tr)) 2544 goto out; 2545 r = 0; 2546 break; 2547 } 2548 case KVM_INTERRUPT: { 2549 struct kvm_interrupt irq; 2550 2551 r = -EFAULT; 2552 if (copy_from_user(&irq, argp, sizeof irq)) 2553 goto out; 2554 r = kvm_vcpu_ioctl_interrupt(vcpu, &irq); 2555 if (r) 2556 goto out; 2557 r = 0; 2558 break; 2559 } 2560 case KVM_DEBUG_GUEST: { 2561 struct kvm_debug_guest dbg; 2562 2563 r = -EFAULT; 2564 if (copy_from_user(&dbg, argp, sizeof dbg)) 2565 goto out; 2566 r = kvm_vcpu_ioctl_debug_guest(vcpu, &dbg); 2567 if (r) 2568 goto out; 2569 r = 0; 2570 break; 2571 } 2572 case KVM_GET_MSRS: 2573 r = msr_io(vcpu, argp, get_msr, 1); 2574 break; 2575 case KVM_SET_MSRS: 2576 r = msr_io(vcpu, argp, do_set_msr, 0); 2577 break; 2578 case KVM_SET_CPUID: { 2579 struct kvm_cpuid __user *cpuid_arg = argp; 2580 struct kvm_cpuid cpuid; 2581 2582 r = -EFAULT; 2583 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) 2584 goto out; 2585 r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries); 2586 if (r) 2587 goto out; 2588 break; 2589 } 2590 case KVM_SET_SIGNAL_MASK: { 2591 struct kvm_signal_mask __user *sigmask_arg = argp; 2592 struct kvm_signal_mask kvm_sigmask; 2593 sigset_t sigset, *p; 2594 2595 p = NULL; 2596 if (argp) { 2597 r = -EFAULT; 2598 if (copy_from_user(&kvm_sigmask, argp, 2599 sizeof kvm_sigmask)) 2600 goto out; 2601 r = -EINVAL; 2602 if (kvm_sigmask.len != sizeof sigset) 2603 goto out; 2604 r = -EFAULT; 2605 if (copy_from_user(&sigset, sigmask_arg->sigset, 2606 sizeof sigset)) 2607 goto out; 2608 p = &sigset; 2609 } 2610 r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset); 2611 break; 2612 } 2613 case KVM_GET_FPU: { 2614 struct kvm_fpu fpu; 2615 2616 memset(&fpu, 0, sizeof fpu); 2617 r = kvm_vcpu_ioctl_get_fpu(vcpu, &fpu); 2618 if (r) 2619 goto out; 2620 r = -EFAULT; 2621 if (copy_to_user(argp, &fpu, sizeof fpu)) 2622 goto out; 2623 r = 0; 2624 break; 2625 } 2626 case KVM_SET_FPU: { 2627 struct kvm_fpu fpu; 2628 2629 r = -EFAULT; 2630 if (copy_from_user(&fpu, argp, sizeof fpu)) 2631 goto out; 2632 r = kvm_vcpu_ioctl_set_fpu(vcpu, &fpu); 2633 if (r) 2634 goto out; 2635 r = 0; 2636 break; 2637 } 2638 default: 2639 ; 2640 } 2641out: 2642 return r; 2643} 2644 2645static long kvm_vm_ioctl(struct file *filp, 2646 unsigned int ioctl, unsigned long arg) 2647{ 2648 struct kvm *kvm = filp->private_data; 2649 void __user *argp = (void __user *)arg; 2650 int r = -EINVAL; 2651 2652 switch (ioctl) { 2653 case KVM_CREATE_VCPU: 2654 r = kvm_vm_ioctl_create_vcpu(kvm, arg); 2655 if (r < 0) 2656 goto out; 2657 break; 2658 case KVM_SET_MEMORY_REGION: { 2659 struct kvm_memory_region kvm_mem; 2660 2661 r = -EFAULT; 2662 if (copy_from_user(&kvm_mem, argp, sizeof kvm_mem)) 2663 goto out; 2664 r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_mem); 2665 if (r) 2666 goto out; 2667 break; 2668 } 2669 case KVM_GET_DIRTY_LOG: { 2670 struct kvm_dirty_log log; 2671 2672 r = -EFAULT; 2673 if (copy_from_user(&log, argp, sizeof log)) 2674 goto out; 2675 r = kvm_vm_ioctl_get_dirty_log(kvm, &log); 2676 if (r) 2677 goto out; 2678 break; 2679 } 2680 case KVM_SET_MEMORY_ALIAS: { 2681 struct kvm_memory_alias alias; 2682 2683 r = -EFAULT; 2684 if (copy_from_user(&alias, argp, sizeof alias)) 2685 goto out; 2686 r = kvm_vm_ioctl_set_memory_alias(kvm, &alias); 2687 if (r) 2688 goto out; 2689 break; 2690 } 2691 default: 2692 ; 2693 } 2694out: 2695 return r; 2696} 2697 2698static struct page *kvm_vm_nopage(struct vm_area_struct *vma, 2699 unsigned long address, 2700 int *type) 2701{ 2702 struct kvm *kvm = vma->vm_file->private_data; 2703 unsigned long pgoff; 2704 struct page *page; 2705 2706 *type = VM_FAULT_MINOR; 2707 pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; 2708 page = gfn_to_page(kvm, pgoff); 2709 if (!page) 2710 return NOPAGE_SIGBUS; 2711 get_page(page); 2712 return page; 2713} 2714 2715static struct vm_operations_struct kvm_vm_vm_ops = { 2716 .nopage = kvm_vm_nopage, 2717}; 2718 2719static int kvm_vm_mmap(struct file *file, struct vm_area_struct *vma) 2720{ 2721 vma->vm_ops = &kvm_vm_vm_ops; 2722 return 0; 2723} 2724 2725static struct file_operations kvm_vm_fops = { 2726 .release = kvm_vm_release, 2727 .unlocked_ioctl = kvm_vm_ioctl, 2728 .compat_ioctl = kvm_vm_ioctl, 2729 .mmap = kvm_vm_mmap, 2730}; 2731 2732static int kvm_dev_ioctl_create_vm(void) 2733{ 2734 int fd, r; 2735 struct inode *inode; 2736 struct file *file; 2737 struct kvm *kvm; 2738 2739 inode = kvmfs_inode(&kvm_vm_fops); 2740 if (IS_ERR(inode)) { 2741 r = PTR_ERR(inode); 2742 goto out1; 2743 } 2744 2745 kvm = kvm_create_vm(); 2746 if (IS_ERR(kvm)) { 2747 r = PTR_ERR(kvm); 2748 goto out2; 2749 } 2750 2751 file = kvmfs_file(inode, kvm); 2752 if (IS_ERR(file)) { 2753 r = PTR_ERR(file); 2754 goto out3; 2755 } 2756 kvm->filp = file; 2757 2758 r = get_unused_fd(); 2759 if (r < 0) 2760 goto out4; 2761 fd = r; 2762 fd_install(fd, file); 2763 2764 return fd; 2765 2766out4: 2767 fput(file); 2768out3: 2769 kvm_destroy_vm(kvm); 2770out2: 2771 iput(inode); 2772out1: 2773 return r; 2774} 2775 2776static long kvm_dev_ioctl(struct file *filp, 2777 unsigned int ioctl, unsigned long arg) 2778{ 2779 void __user *argp = (void __user *)arg; 2780 long r = -EINVAL; 2781 2782 switch (ioctl) { 2783 case KVM_GET_API_VERSION: 2784 r = -EINVAL; 2785 if (arg) 2786 goto out; 2787 r = KVM_API_VERSION; 2788 break; 2789 case KVM_CREATE_VM: 2790 r = -EINVAL; 2791 if (arg) 2792 goto out; 2793 r = kvm_dev_ioctl_create_vm(); 2794 break; 2795 case KVM_GET_MSR_INDEX_LIST: { 2796 struct kvm_msr_list __user *user_msr_list = argp; 2797 struct kvm_msr_list msr_list; 2798 unsigned n; 2799 2800 r = -EFAULT; 2801 if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list)) 2802 goto out; 2803 n = msr_list.nmsrs; 2804 msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs); 2805 if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list)) 2806 goto out; 2807 r = -E2BIG; 2808 if (n < num_msrs_to_save) 2809 goto out; 2810 r = -EFAULT; 2811 if (copy_to_user(user_msr_list->indices, &msrs_to_save, 2812 num_msrs_to_save * sizeof(u32))) 2813 goto out; 2814 if (copy_to_user(user_msr_list->indices 2815 + num_msrs_to_save * sizeof(u32), 2816 &emulated_msrs, 2817 ARRAY_SIZE(emulated_msrs) * sizeof(u32))) 2818 goto out; 2819 r = 0; 2820 break; 2821 } 2822 case KVM_CHECK_EXTENSION: 2823 /* 2824 * No extensions defined at present. 2825 */ 2826 r = 0; 2827 break; 2828 case KVM_GET_VCPU_MMAP_SIZE: 2829 r = -EINVAL; 2830 if (arg) 2831 goto out; 2832 r = 2 * PAGE_SIZE; 2833 break; 2834 default: 2835 ; 2836 } 2837out: 2838 return r; 2839} 2840 2841static struct file_operations kvm_chardev_ops = { 2842 .open = kvm_dev_open, 2843 .release = kvm_dev_release, 2844 .unlocked_ioctl = kvm_dev_ioctl, 2845 .compat_ioctl = kvm_dev_ioctl, 2846}; 2847 2848static struct miscdevice kvm_dev = { 2849 KVM_MINOR, 2850 "kvm", 2851 &kvm_chardev_ops, 2852}; 2853 2854static int kvm_reboot(struct notifier_block *notifier, unsigned long val, 2855 void *v) 2856{ 2857 if (val == SYS_RESTART) { 2858 /* 2859 * Some (well, at least mine) BIOSes hang on reboot if 2860 * in vmx root mode. 2861 */ 2862 printk(KERN_INFO "kvm: exiting hardware virtualization\n"); 2863 on_each_cpu(kvm_arch_ops->hardware_disable, NULL, 0, 1); 2864 } 2865 return NOTIFY_OK; 2866} 2867 2868static struct notifier_block kvm_reboot_notifier = { 2869 .notifier_call = kvm_reboot, 2870 .priority = 0, 2871}; 2872 2873/* 2874 * Make sure that a cpu that is being hot-unplugged does not have any vcpus 2875 * cached on it. 2876 */ 2877static void decache_vcpus_on_cpu(int cpu) 2878{ 2879 struct kvm *vm; 2880 struct kvm_vcpu *vcpu; 2881 int i; 2882 2883 spin_lock(&kvm_lock); 2884 list_for_each_entry(vm, &vm_list, vm_list) 2885 for (i = 0; i < KVM_MAX_VCPUS; ++i) { 2886 vcpu = &vm->vcpus[i]; 2887 /* 2888 * If the vcpu is locked, then it is running on some 2889 * other cpu and therefore it is not cached on the 2890 * cpu in question. 2891 * 2892 * If it's not locked, check the last cpu it executed 2893 * on. 2894 */ 2895 if (mutex_trylock(&vcpu->mutex)) { 2896 if (vcpu->cpu == cpu) { 2897 kvm_arch_ops->vcpu_decache(vcpu); 2898 vcpu->cpu = -1; 2899 } 2900 mutex_unlock(&vcpu->mutex); 2901 } 2902 } 2903 spin_unlock(&kvm_lock); 2904} 2905 2906static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val, 2907 void *v) 2908{ 2909 int cpu = (long)v; 2910 2911 switch (val) { 2912 case CPU_DOWN_PREPARE: 2913 case CPU_DOWN_PREPARE_FROZEN: 2914 case CPU_UP_CANCELED: 2915 case CPU_UP_CANCELED_FROZEN: 2916 printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n", 2917 cpu); 2918 decache_vcpus_on_cpu(cpu); 2919 smp_call_function_single(cpu, kvm_arch_ops->hardware_disable, 2920 NULL, 0, 1); 2921 break; 2922 case CPU_ONLINE: 2923 case CPU_ONLINE_FROZEN: 2924 printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n", 2925 cpu); 2926 smp_call_function_single(cpu, kvm_arch_ops->hardware_enable, 2927 NULL, 0, 1); 2928 break; 2929 } 2930 return NOTIFY_OK; 2931} 2932 2933static struct notifier_block kvm_cpu_notifier = { 2934 .notifier_call = kvm_cpu_hotplug, 2935 .priority = 20, /* must be > scheduler priority */ 2936}; 2937 2938static u64 stat_get(void *_offset) 2939{ 2940 unsigned offset = (long)_offset; 2941 u64 total = 0; 2942 struct kvm *kvm; 2943 struct kvm_vcpu *vcpu; 2944 int i; 2945 2946 spin_lock(&kvm_lock); 2947 list_for_each_entry(kvm, &vm_list, vm_list) 2948 for (i = 0; i < KVM_MAX_VCPUS; ++i) { 2949 vcpu = &kvm->vcpus[i]; 2950 total += *(u32 *)((void *)vcpu + offset); 2951 } 2952 spin_unlock(&kvm_lock); 2953 return total; 2954} 2955 2956static void stat_set(void *offset, u64 val) 2957{ 2958} 2959 2960DEFINE_SIMPLE_ATTRIBUTE(stat_fops, stat_get, stat_set, "%llu\n"); 2961 2962static __init void kvm_init_debug(void) 2963{ 2964 struct kvm_stats_debugfs_item *p; 2965 2966 debugfs_dir = debugfs_create_dir("kvm", NULL); 2967 for (p = debugfs_entries; p->name; ++p) 2968 p->dentry = debugfs_create_file(p->name, 0444, debugfs_dir, 2969 (void *)(long)p->offset, 2970 &stat_fops); 2971} 2972 2973static void kvm_exit_debug(void) 2974{ 2975 struct kvm_stats_debugfs_item *p; 2976 2977 for (p = debugfs_entries; p->name; ++p) 2978 debugfs_remove(p->dentry); 2979 debugfs_remove(debugfs_dir); 2980} 2981 2982static int kvm_suspend(struct sys_device *dev, pm_message_t state) 2983{ 2984 decache_vcpus_on_cpu(raw_smp_processor_id()); 2985 on_each_cpu(kvm_arch_ops->hardware_disable, NULL, 0, 1); 2986 return 0; 2987} 2988 2989static int kvm_resume(struct sys_device *dev) 2990{ 2991 on_each_cpu(kvm_arch_ops->hardware_enable, NULL, 0, 1); 2992 return 0; 2993} 2994 2995static struct sysdev_class kvm_sysdev_class = { 2996 set_kset_name("kvm"), 2997 .suspend = kvm_suspend, 2998 .resume = kvm_resume, 2999}; 3000 3001static struct sys_device kvm_sysdev = { 3002 .id = 0, 3003 .cls = &kvm_sysdev_class, 3004}; 3005 3006hpa_t bad_page_address; 3007 3008static int kvmfs_get_sb(struct file_system_type *fs_type, int flags, 3009 const char *dev_name, void *data, struct vfsmount *mnt) 3010{ 3011 return get_sb_pseudo(fs_type, "kvm:", NULL, KVMFS_SUPER_MAGIC, mnt); 3012} 3013 3014static struct file_system_type kvm_fs_type = { 3015 .name = "kvmfs", 3016 .get_sb = kvmfs_get_sb, 3017 .kill_sb = kill_anon_super, 3018}; 3019 3020int kvm_init_arch(struct kvm_arch_ops *ops, struct module *module) 3021{ 3022 int r; 3023 3024 if (kvm_arch_ops) { 3025 printk(KERN_ERR "kvm: already loaded the other module\n"); 3026 return -EEXIST; 3027 } 3028 3029 if (!ops->cpu_has_kvm_support()) { 3030 printk(KERN_ERR "kvm: no hardware support\n"); 3031 return -EOPNOTSUPP; 3032 } 3033 if (ops->disabled_by_bios()) { 3034 printk(KERN_ERR "kvm: disabled by bios\n"); 3035 return -EOPNOTSUPP; 3036 } 3037 3038 kvm_arch_ops = ops; 3039 3040 r = kvm_arch_ops->hardware_setup(); 3041 if (r < 0) 3042 goto out; 3043 3044 on_each_cpu(kvm_arch_ops->hardware_enable, NULL, 0, 1); 3045 r = register_cpu_notifier(&kvm_cpu_notifier); 3046 if (r) 3047 goto out_free_1; 3048 register_reboot_notifier(&kvm_reboot_notifier); 3049 3050 r = sysdev_class_register(&kvm_sysdev_class); 3051 if (r) 3052 goto out_free_2; 3053 3054 r = sysdev_register(&kvm_sysdev); 3055 if (r) 3056 goto out_free_3; 3057 3058 kvm_chardev_ops.owner = module; 3059 3060 r = misc_register(&kvm_dev); 3061 if (r) { 3062 printk (KERN_ERR "kvm: misc device register failed\n"); 3063 goto out_free; 3064 } 3065 3066 return r; 3067 3068out_free: 3069 sysdev_unregister(&kvm_sysdev); 3070out_free_3: 3071 sysdev_class_unregister(&kvm_sysdev_class); 3072out_free_2: 3073 unregister_reboot_notifier(&kvm_reboot_notifier); 3074 unregister_cpu_notifier(&kvm_cpu_notifier); 3075out_free_1: 3076 on_each_cpu(kvm_arch_ops->hardware_disable, NULL, 0, 1); 3077 kvm_arch_ops->hardware_unsetup(); 3078out: 3079 kvm_arch_ops = NULL; 3080 return r; 3081} 3082 3083void kvm_exit_arch(void) 3084{ 3085 misc_deregister(&kvm_dev); 3086 sysdev_unregister(&kvm_sysdev); 3087 sysdev_class_unregister(&kvm_sysdev_class); 3088 unregister_reboot_notifier(&kvm_reboot_notifier); 3089 unregister_cpu_notifier(&kvm_cpu_notifier); 3090 on_each_cpu(kvm_arch_ops->hardware_disable, NULL, 0, 1); 3091 kvm_arch_ops->hardware_unsetup(); 3092 kvm_arch_ops = NULL; 3093} 3094 3095static __init int kvm_init(void) 3096{ 3097 static struct page *bad_page; 3098 int r; 3099 3100 r = kvm_mmu_module_init(); 3101 if (r) 3102 goto out4; 3103 3104 r = register_filesystem(&kvm_fs_type); 3105 if (r) 3106 goto out3; 3107 3108 kvmfs_mnt = kern_mount(&kvm_fs_type); 3109 r = PTR_ERR(kvmfs_mnt); 3110 if (IS_ERR(kvmfs_mnt)) 3111 goto out2; 3112 kvm_init_debug(); 3113 3114 kvm_init_msr_list(); 3115 3116 if ((bad_page = alloc_page(GFP_KERNEL)) == NULL) { 3117 r = -ENOMEM; 3118 goto out; 3119 } 3120 3121 bad_page_address = page_to_pfn(bad_page) << PAGE_SHIFT; 3122 memset(__va(bad_page_address), 0, PAGE_SIZE); 3123 3124 return 0; 3125 3126out: 3127 kvm_exit_debug(); 3128 mntput(kvmfs_mnt); 3129out2: 3130 unregister_filesystem(&kvm_fs_type); 3131out3: 3132 kvm_mmu_module_exit(); 3133out4: 3134 return r; 3135} 3136 3137static __exit void kvm_exit(void) 3138{ 3139 kvm_exit_debug(); 3140 __free_page(pfn_to_page(bad_page_address >> PAGE_SHIFT)); 3141 mntput(kvmfs_mnt); 3142 unregister_filesystem(&kvm_fs_type); 3143 kvm_mmu_module_exit(); 3144} 3145 3146module_init(kvm_init) 3147module_exit(kvm_exit) 3148 3149EXPORT_SYMBOL_GPL(kvm_init_arch); 3150EXPORT_SYMBOL_GPL(kvm_exit_arch); 3151