1/*
2 * Kernel-based Virtual Machine driver for Linux
3 *
4 * This module enables machines with Intel VT-x extensions to run virtual
5 * machines without emulation or binary translation.
6 *
7 * Copyright (C) 2006 Qumranet, Inc.
8 *
9 * Authors:
10 *   Avi Kivity   <avi@qumranet.com>
11 *   Yaniv Kamay  <yaniv@qumranet.com>
12 *
13 * This work is licensed under the terms of the GNU GPL, version 2.  See
14 * the COPYING file in the top-level directory.
15 *
16 */
17
18#include "kvm.h"
19
20#include <linux/kvm.h>
21#include <linux/module.h>
22#include <linux/errno.h>
23#include <linux/magic.h>
24#include <asm/processor.h>
25#include <linux/percpu.h>
26#include <linux/gfp.h>
27#include <asm/msr.h>
28#include <linux/mm.h>
29#include <linux/miscdevice.h>
30#include <linux/vmalloc.h>
31#include <asm/uaccess.h>
32#include <linux/reboot.h>
33#include <asm/io.h>
34#include <linux/debugfs.h>
35#include <linux/highmem.h>
36#include <linux/file.h>
37#include <asm/desc.h>
38#include <linux/sysdev.h>
39#include <linux/cpu.h>
41#include <linux/fs.h>
42#include <linux/mount.h>
43#include <linux/sched.h>
44
45#include "x86_emulate.h"
46#include "segment_descriptor.h"
47
48MODULE_AUTHOR("Qumranet");
49MODULE_LICENSE("GPL");
50
51static DEFINE_SPINLOCK(kvm_lock);
52static LIST_HEAD(vm_list);
53
54struct kvm_arch_ops *kvm_arch_ops;
55
56#define STAT_OFFSET(x) offsetof(struct kvm_vcpu, stat.x)
57
58static struct kvm_stats_debugfs_item {
59	const char *name;
60	int offset;
61	struct dentry *dentry;
62} debugfs_entries[] = {
63	{ "pf_fixed", STAT_OFFSET(pf_fixed) },
64	{ "pf_guest", STAT_OFFSET(pf_guest) },
65	{ "tlb_flush", STAT_OFFSET(tlb_flush) },
66	{ "invlpg", STAT_OFFSET(invlpg) },
67	{ "exits", STAT_OFFSET(exits) },
68	{ "io_exits", STAT_OFFSET(io_exits) },
69	{ "mmio_exits", STAT_OFFSET(mmio_exits) },
70	{ "signal_exits", STAT_OFFSET(signal_exits) },
71	{ "irq_window", STAT_OFFSET(irq_window_exits) },
72	{ "halt_exits", STAT_OFFSET(halt_exits) },
73	{ "request_irq", STAT_OFFSET(request_irq_exits) },
74	{ "irq_exits", STAT_OFFSET(irq_exits) },
75	{ NULL }
76};
77
78static struct dentry *debugfs_dir;
79
80struct vfsmount *kvmfs_mnt;
81
82#define MAX_IO_MSRS 256
83
84#define CR0_RESEVED_BITS 0xffffffff1ffaffc0ULL
85#define LMSW_GUEST_MASK 0x0eULL
86#define CR4_RESEVED_BITS (~((1ULL << 11) - 1))
87#define CR8_RESEVED_BITS (~0x0fULL)
88#define EFER_RESERVED_BITS 0xfffffffffffff2fe
89
90#ifdef CONFIG_X86_64
/* LDT or TSS descriptor in the GDT.  16 bytes. */
92struct segment_descriptor_64 {
93	struct segment_descriptor s;
94	u32 base_higher;
95	u32 pad_zero;
96};
97
98#endif
99
100static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
101			   unsigned long arg);
102
103static struct inode *kvmfs_inode(struct file_operations *fops)
104{
105	int error = -ENOMEM;
106	struct inode *inode = new_inode(kvmfs_mnt->mnt_sb);
107
108	if (!inode)
109		goto eexit_1;
110
111	inode->i_fop = fops;
112
113	/*
114	 * Mark the inode dirty from the very beginning,
115	 * that way it will never be moved to the dirty
116	 * list because mark_inode_dirty() will think
117	 * that it already _is_ on the dirty list.
118	 */
119	inode->i_state = I_DIRTY;
120	inode->i_mode = S_IRUSR | S_IWUSR;
121	inode->i_uid = current->fsuid;
122	inode->i_gid = current->fsgid;
123	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
124	return inode;
125
126eexit_1:
127	return ERR_PTR(error);
128}
129
130static struct file *kvmfs_file(struct inode *inode, void *private_data)
131{
132	struct file *file = get_empty_filp();
133
134	if (!file)
135		return ERR_PTR(-ENFILE);
136
137	file->f_path.mnt = mntget(kvmfs_mnt);
138	file->f_path.dentry = d_alloc_anon(inode);
139	if (!file->f_path.dentry)
140		return ERR_PTR(-ENOMEM);
141	file->f_mapping = inode->i_mapping;
142
143	file->f_pos = 0;
144	file->f_flags = O_RDWR;
145	file->f_op = inode->i_fop;
146	file->f_mode = FMODE_READ | FMODE_WRITE;
147	file->f_version = 0;
148	file->private_data = private_data;
149	return file;
150}
151
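/*
 * Return the base address of the segment named by @selector, decoded from
 * the current cpu's GDT (or from its LDT when the selector's TI bit is set).
 */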
152unsigned long segment_base(u16 selector)
153{
154	struct descriptor_table gdt;
155	struct segment_descriptor *d;
156	unsigned long table_base;
157	typedef unsigned long ul;
158	unsigned long v;
159
160	if (selector == 0)
161		return 0;
162
163	asm ("sgdt %0" : "=m"(gdt));
164	table_base = gdt.base;
165
166	if (selector & 4) {           /* from ldt */
167		u16 ldt_selector;
168
169		asm ("sldt %0" : "=g"(ldt_selector));
170		table_base = segment_base(ldt_selector);
171	}
172	d = (struct segment_descriptor *)(table_base + (selector & ~7));
173	v = d->base_low | ((ul)d->base_mid << 16) | ((ul)d->base_high << 24);
174#ifdef CONFIG_X86_64
175	if (d->system == 0
176	    && (d->type == 2 || d->type == 9 || d->type == 11))
177		v |= ((ul)((struct segment_descriptor_64 *)d)->base_higher) << 32;
178#endif
179	return v;
180}
181EXPORT_SYMBOL_GPL(segment_base);
182
183static inline int valid_vcpu(int n)
184{
185	return likely(n >= 0 && n < KVM_MAX_VCPUS);
186}
187
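/*
 * Copy up to @size bytes from guest virtual address @addr into @dest,
 * translating one page at a time with gva_to_hpa().  Returns the number of
 * bytes copied, which may be short if a page is unmapped.
 */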
188int kvm_read_guest(struct kvm_vcpu *vcpu, gva_t addr, unsigned long size,
189		   void *dest)
190{
191	unsigned char *host_buf = dest;
192	unsigned long req_size = size;
193
194	while (size) {
195		hpa_t paddr;
196		unsigned now;
197		unsigned offset;
198		hva_t guest_buf;
199
200		paddr = gva_to_hpa(vcpu, addr);
201
202		if (is_error_hpa(paddr))
203			break;
204
205		guest_buf = (hva_t)kmap_atomic(
206					pfn_to_page(paddr >> PAGE_SHIFT),
207					KM_USER0);
208		offset = addr & ~PAGE_MASK;
209		guest_buf |= offset;
210		now = min(size, PAGE_SIZE - offset);
211		memcpy(host_buf, (void*)guest_buf, now);
212		host_buf += now;
213		addr += now;
214		size -= now;
215		kunmap_atomic((void *)(guest_buf & PAGE_MASK), KM_USER0);
216	}
217	return req_size - size;
218}
219EXPORT_SYMBOL_GPL(kvm_read_guest);
220
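/*
 * Copy up to @size bytes from @data to guest virtual address @addr, marking
 * each touched guest page dirty.  Returns the number of bytes written.
 */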
221int kvm_write_guest(struct kvm_vcpu *vcpu, gva_t addr, unsigned long size,
222		    void *data)
223{
224	unsigned char *host_buf = data;
225	unsigned long req_size = size;
226
227	while (size) {
228		hpa_t paddr;
229		unsigned now;
230		unsigned offset;
231		hva_t guest_buf;
232		gfn_t gfn;
233
234		paddr = gva_to_hpa(vcpu, addr);
235
236		if (is_error_hpa(paddr))
237			break;
238
239		gfn = vcpu->mmu.gva_to_gpa(vcpu, addr) >> PAGE_SHIFT;
240		mark_page_dirty(vcpu->kvm, gfn);
241		guest_buf = (hva_t)kmap_atomic(
242				pfn_to_page(paddr >> PAGE_SHIFT), KM_USER0);
243		offset = addr & ~PAGE_MASK;
244		guest_buf |= offset;
245		now = min(size, PAGE_SIZE - offset);
246		memcpy((void*)guest_buf, host_buf, now);
247		host_buf += now;
248		addr += now;
249		size -= now;
250		kunmap_atomic((void *)(guest_buf & PAGE_MASK), KM_USER0);
251	}
252	return req_size - size;
253}
254EXPORT_SYMBOL_GPL(kvm_write_guest);
255
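/*
 * Swap the host fpu state out and the guest fpu state in, unless the guest
 * state is already loaded or fpu_active is clear.
 */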
256void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
257{
258	if (!vcpu->fpu_active || vcpu->guest_fpu_loaded)
259		return;
260
261	vcpu->guest_fpu_loaded = 1;
262	fx_save(vcpu->host_fx_image);
263	fx_restore(vcpu->guest_fx_image);
264}
265EXPORT_SYMBOL_GPL(kvm_load_guest_fpu);
266
267void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
268{
269	if (!vcpu->guest_fpu_loaded)
270		return;
271
272	vcpu->guest_fpu_loaded = 0;
273	fx_save(vcpu->guest_fx_image);
274	fx_restore(vcpu->host_fx_image);
275}
276EXPORT_SYMBOL_GPL(kvm_put_guest_fpu);
277
278/*
279 * Switches to specified vcpu, until a matching vcpu_put()
280 */
281static void vcpu_load(struct kvm_vcpu *vcpu)
282{
283	mutex_lock(&vcpu->mutex);
284	kvm_arch_ops->vcpu_load(vcpu);
285}
286
287/*
288 * Switches to specified vcpu, until a matching vcpu_put(). Will return NULL
289 * if the slot is not populated.
290 */
291static struct kvm_vcpu *vcpu_load_slot(struct kvm *kvm, int slot)
292{
293	struct kvm_vcpu *vcpu = &kvm->vcpus[slot];
294
295	mutex_lock(&vcpu->mutex);
296	if (!vcpu->vmcs) {
297		mutex_unlock(&vcpu->mutex);
298		return NULL;
299	}
300	kvm_arch_ops->vcpu_load(vcpu);
301	return vcpu;
302}
303
304static void vcpu_put(struct kvm_vcpu *vcpu)
305{
306	kvm_arch_ops->vcpu_put(vcpu);
307	mutex_unlock(&vcpu->mutex);
308}
309
310static struct kvm *kvm_create_vm(void)
311{
312	struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
313	int i;
314
315	if (!kvm)
316		return ERR_PTR(-ENOMEM);
317
318	spin_lock_init(&kvm->lock);
319	INIT_LIST_HEAD(&kvm->active_mmu_pages);
	for (i = 0; i < KVM_MAX_VCPUS; ++i) {
		struct kvm_vcpu *vcpu = &kvm->vcpus[i];

		mutex_init(&vcpu->mutex);
		vcpu->cpu = -1;
		vcpu->kvm = kvm;
		vcpu->mmu.root_hpa = INVALID_PAGE;
		INIT_LIST_HEAD(&vcpu->free_pages);
	}
	/* Link the new vm into the global list once, not once per vcpu. */
	spin_lock(&kvm_lock);
	list_add(&kvm->vm_list, &vm_list);
	spin_unlock(&kvm_lock);
	return kvm;
333}
334
335static int kvm_dev_open(struct inode *inode, struct file *filp)
336{
337	return 0;
338}
339
340/*
341 * Free any memory in @free but not in @dont.
342 */
343static void kvm_free_physmem_slot(struct kvm_memory_slot *free,
344				  struct kvm_memory_slot *dont)
345{
346	int i;
347
348	if (!dont || free->phys_mem != dont->phys_mem)
349		if (free->phys_mem) {
350			for (i = 0; i < free->npages; ++i)
351				if (free->phys_mem[i])
352					__free_page(free->phys_mem[i]);
353			vfree(free->phys_mem);
354		}
355
356	if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
357		vfree(free->dirty_bitmap);
358
359	free->phys_mem = NULL;
360	free->npages = 0;
361	free->dirty_bitmap = NULL;
362}
363
364static void kvm_free_physmem(struct kvm *kvm)
365{
366	int i;
367
368	for (i = 0; i < kvm->nmemslots; ++i)
369		kvm_free_physmem_slot(&kvm->memslots[i], NULL);
370}
371
372static void free_pio_guest_pages(struct kvm_vcpu *vcpu)
373{
374	int i;
375
376	for (i = 0; i < 2; ++i)
377		if (vcpu->pio.guest_pages[i]) {
378			__free_page(vcpu->pio.guest_pages[i]);
379			vcpu->pio.guest_pages[i] = NULL;
380		}
381}
382
383static void kvm_free_vcpu(struct kvm_vcpu *vcpu)
384{
385	if (!vcpu->vmcs)
386		return;
387
388	vcpu_load(vcpu);
389	kvm_mmu_destroy(vcpu);
390	vcpu_put(vcpu);
391	kvm_arch_ops->vcpu_free(vcpu);
392	free_page((unsigned long)vcpu->run);
393	vcpu->run = NULL;
394	free_page((unsigned long)vcpu->pio_data);
395	vcpu->pio_data = NULL;
396	free_pio_guest_pages(vcpu);
397}
398
399static void kvm_free_vcpus(struct kvm *kvm)
400{
401	unsigned int i;
402
403	for (i = 0; i < KVM_MAX_VCPUS; ++i)
404		kvm_free_vcpu(&kvm->vcpus[i]);
405}
406
407static int kvm_dev_release(struct inode *inode, struct file *filp)
408{
409	return 0;
410}
411
412static void kvm_destroy_vm(struct kvm *kvm)
413{
414	spin_lock(&kvm_lock);
415	list_del(&kvm->vm_list);
416	spin_unlock(&kvm_lock);
417	kvm_free_vcpus(kvm);
418	kvm_free_physmem(kvm);
419	kfree(kvm);
420}
421
422static int kvm_vm_release(struct inode *inode, struct file *filp)
423{
424	struct kvm *kvm = filp->private_data;
425
426	kvm_destroy_vm(kvm);
427	return 0;
428}
429
430static void inject_gp(struct kvm_vcpu *vcpu)
431{
432	kvm_arch_ops->inject_gp(vcpu, 0);
433}
434
/*
 * Load the pae pdptrs.  Return true if they are all valid.
 */
438static int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
439{
440	gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
441	unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;
442	int i;
443	u64 pdpte;
444	u64 *pdpt;
445	int ret;
446	struct page *page;
447
448	spin_lock(&vcpu->kvm->lock);
449	page = gfn_to_page(vcpu->kvm, pdpt_gfn);
450	pdpt = kmap_atomic(page, KM_USER0);
451
452	ret = 1;
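	/* A present pdpte must not have any reserved bits set. */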
453	for (i = 0; i < 4; ++i) {
454		pdpte = pdpt[offset + i];
455		if ((pdpte & 1) && (pdpte & 0xfffffff0000001e6ull)) {
456			ret = 0;
457			goto out;
458		}
459	}
460
461	for (i = 0; i < 4; ++i)
462		vcpu->pdptrs[i] = pdpt[offset + i];
463
464out:
465	kunmap_atomic(pdpt, KM_USER0);
466	spin_unlock(&vcpu->kvm->lock);
467
468	return ret;
469}
470
471void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
472{
473	if (cr0 & CR0_RESEVED_BITS) {
474		printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n",
475		       cr0, vcpu->cr0);
476		inject_gp(vcpu);
477		return;
478	}
479
480	if ((cr0 & CR0_NW_MASK) && !(cr0 & CR0_CD_MASK)) {
481		printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n");
482		inject_gp(vcpu);
483		return;
484	}
485
486	if ((cr0 & CR0_PG_MASK) && !(cr0 & CR0_PE_MASK)) {
487		printk(KERN_DEBUG "set_cr0: #GP, set PG flag "
488		       "and a clear PE flag\n");
489		inject_gp(vcpu);
490		return;
491	}
492
493	if (!is_paging(vcpu) && (cr0 & CR0_PG_MASK)) {
494#ifdef CONFIG_X86_64
495		if ((vcpu->shadow_efer & EFER_LME)) {
496			int cs_db, cs_l;
497
498			if (!is_pae(vcpu)) {
499				printk(KERN_DEBUG "set_cr0: #GP, start paging "
500				       "in long mode while PAE is disabled\n");
501				inject_gp(vcpu);
502				return;
503			}
504			kvm_arch_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
505			if (cs_l) {
506				printk(KERN_DEBUG "set_cr0: #GP, start paging "
507				       "in long mode while CS.L == 1\n");
508				inject_gp(vcpu);
509				return;
510
511			}
512		} else
513#endif
514		if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->cr3)) {
515			printk(KERN_DEBUG "set_cr0: #GP, pdptrs "
516			       "reserved bits\n");
517			inject_gp(vcpu);
518			return;
519		}
520
521	}
522
523	kvm_arch_ops->set_cr0(vcpu, cr0);
524	vcpu->cr0 = cr0;
525
526	spin_lock(&vcpu->kvm->lock);
527	kvm_mmu_reset_context(vcpu);
528	spin_unlock(&vcpu->kvm->lock);
530}
531EXPORT_SYMBOL_GPL(set_cr0);
532
533void lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
534{
535	set_cr0(vcpu, (vcpu->cr0 & ~0x0ful) | (msw & 0x0f));
536}
537EXPORT_SYMBOL_GPL(lmsw);
538
539void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
540{
541	if (cr4 & CR4_RESEVED_BITS) {
542		printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n");
543		inject_gp(vcpu);
544		return;
545	}
546
547	if (is_long_mode(vcpu)) {
548		if (!(cr4 & CR4_PAE_MASK)) {
549			printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while "
550			       "in long mode\n");
551			inject_gp(vcpu);
552			return;
553		}
554	} else if (is_paging(vcpu) && !is_pae(vcpu) && (cr4 & CR4_PAE_MASK)
555		   && !load_pdptrs(vcpu, vcpu->cr3)) {
556		printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n");
557		inject_gp(vcpu);
558	}
559
560	if (cr4 & CR4_VMXE_MASK) {
561		printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n");
562		inject_gp(vcpu);
563		return;
564	}
565	kvm_arch_ops->set_cr4(vcpu, cr4);
566	spin_lock(&vcpu->kvm->lock);
567	kvm_mmu_reset_context(vcpu);
568	spin_unlock(&vcpu->kvm->lock);
569}
570EXPORT_SYMBOL_GPL(set_cr4);
571
572void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
573{
574	if (is_long_mode(vcpu)) {
575		if (cr3 & CR3_L_MODE_RESEVED_BITS) {
576			printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n");
577			inject_gp(vcpu);
578			return;
579		}
580	} else {
581		if (cr3 & CR3_RESEVED_BITS) {
582			printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n");
583			inject_gp(vcpu);
584			return;
585		}
586		if (is_paging(vcpu) && is_pae(vcpu) &&
587		    !load_pdptrs(vcpu, cr3)) {
588			printk(KERN_DEBUG "set_cr3: #GP, pdptrs "
589			       "reserved bits\n");
590			inject_gp(vcpu);
591			return;
592		}
593	}
594
595	vcpu->cr3 = cr3;
596	spin_lock(&vcpu->kvm->lock);
597	/*
598	 * Does the new cr3 value map to physical memory? (Note, we
599	 * catch an invalid cr3 even in real-mode, because it would
600	 * cause trouble later on when we turn on paging anyway.)
601	 *
602	 * A real CPU would silently accept an invalid cr3 and would
603	 * attempt to use it - with largely undefined (and often hard
604	 * to debug) behavior on the guest side.
605	 */
606	if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
607		inject_gp(vcpu);
608	else
609		vcpu->mmu.new_cr3(vcpu);
610	spin_unlock(&vcpu->kvm->lock);
611}
612EXPORT_SYMBOL_GPL(set_cr3);
613
614void set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
615{
	if (cr8 & CR8_RESEVED_BITS) {
617		printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8);
618		inject_gp(vcpu);
619		return;
620	}
621	vcpu->cr8 = cr8;
622}
623EXPORT_SYMBOL_GPL(set_cr8);
624
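/*
 * Initialize the guest fpu image: save the host state, let fpu_init()
 * produce a clean state and capture it as the guest image, restore the host
 * state, then force the guest MXCSR to its reset value.
 */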
625void fx_init(struct kvm_vcpu *vcpu)
626{
	struct __attribute__ ((__packed__)) fx_image_s {
		u16 control;	/* fcw */
		u16 status;	/* fsw */
		u16 tag;	/* ftw */
		u16 opcode;	/* fop */
		u64 ip;		/* fpu ip */
		u64 operand;	/* fpu dp */
		u32 mxcsr;
		u32 mxcsr_mask;
	} *fx_image;
638
639	fx_save(vcpu->host_fx_image);
640	fpu_init();
641	fx_save(vcpu->guest_fx_image);
642	fx_restore(vcpu->host_fx_image);
643
644	fx_image = (struct fx_image_s *)vcpu->guest_fx_image;
645	fx_image->mxcsr = 0x1f80;
646	memset(vcpu->guest_fx_image + sizeof(struct fx_image_s),
647	       0, FX_IMAGE_SIZE - sizeof(struct fx_image_s));
648}
649EXPORT_SYMBOL_GPL(fx_init);
650
651static void do_remove_write_access(struct kvm_vcpu *vcpu, int slot)
652{
653	spin_lock(&vcpu->kvm->lock);
654	kvm_mmu_slot_remove_write_access(vcpu, slot);
655	spin_unlock(&vcpu->kvm->lock);
656}
657
658/*
659 * Allocate some memory and give it an address in the guest physical address
660 * space.
661 *
662 * Discontiguous memory is allowed, mostly for framebuffers.
663 */
664static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
665					  struct kvm_memory_region *mem)
666{
667	int r;
668	gfn_t base_gfn;
669	unsigned long npages;
670	unsigned long i;
671	struct kvm_memory_slot *memslot;
672	struct kvm_memory_slot old, new;
673	int memory_config_version;
674
675	r = -EINVAL;
676	/* General sanity checks */
677	if (mem->memory_size & (PAGE_SIZE - 1))
678		goto out;
679	if (mem->guest_phys_addr & (PAGE_SIZE - 1))
680		goto out;
681	if (mem->slot >= KVM_MEMORY_SLOTS)
682		goto out;
683	if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
684		goto out;
685
686	memslot = &kvm->memslots[mem->slot];
687	base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
688	npages = mem->memory_size >> PAGE_SHIFT;
689
690	if (!npages)
691		mem->flags &= ~KVM_MEM_LOG_DIRTY_PAGES;
692
693raced:
694	spin_lock(&kvm->lock);
695
696	memory_config_version = kvm->memory_config_version;
697	new = old = *memslot;
698
699	new.base_gfn = base_gfn;
700	new.npages = npages;
701	new.flags = mem->flags;
702
703	/* Disallow changing a memory slot's size. */
704	r = -EINVAL;
705	if (npages && old.npages && npages != old.npages)
706		goto out_unlock;
707
708	/* Check for overlaps */
709	r = -EEXIST;
710	for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
711		struct kvm_memory_slot *s = &kvm->memslots[i];
712
713		if (s == memslot)
714			continue;
715		if (!((base_gfn + npages <= s->base_gfn) ||
716		      (base_gfn >= s->base_gfn + s->npages)))
717			goto out_unlock;
718	}
719	/*
720	 * Do memory allocations outside lock.  memory_config_version will
721	 * detect any races.
722	 */
723	spin_unlock(&kvm->lock);
724
725	/* Deallocate if slot is being removed */
726	if (!npages)
727		new.phys_mem = NULL;
728
729	/* Free page dirty bitmap if unneeded */
730	if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES))
731		new.dirty_bitmap = NULL;
732
733	r = -ENOMEM;
734
735	/* Allocate if a slot is being created */
736	if (npages && !new.phys_mem) {
737		new.phys_mem = vmalloc(npages * sizeof(struct page *));
738
739		if (!new.phys_mem)
740			goto out_free;
741
742		memset(new.phys_mem, 0, npages * sizeof(struct page *));
743		for (i = 0; i < npages; ++i) {
744			new.phys_mem[i] = alloc_page(GFP_HIGHUSER
745						     | __GFP_ZERO);
746			if (!new.phys_mem[i])
747				goto out_free;
			set_page_private(new.phys_mem[i], 0);
749		}
750	}
751
752	/* Allocate page dirty bitmap if needed */
753	if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) {
754		unsigned dirty_bytes = ALIGN(npages, BITS_PER_LONG) / 8;
755
756		new.dirty_bitmap = vmalloc(dirty_bytes);
757		if (!new.dirty_bitmap)
758			goto out_free;
759		memset(new.dirty_bitmap, 0, dirty_bytes);
760	}
761
762	spin_lock(&kvm->lock);
763
764	if (memory_config_version != kvm->memory_config_version) {
765		spin_unlock(&kvm->lock);
766		kvm_free_physmem_slot(&new, &old);
767		goto raced;
768	}
769
770	r = -EAGAIN;
771	if (kvm->busy)
772		goto out_unlock;
773
774	if (mem->slot >= kvm->nmemslots)
775		kvm->nmemslots = mem->slot + 1;
776
777	*memslot = new;
778	++kvm->memory_config_version;
779
780	spin_unlock(&kvm->lock);
781
782	for (i = 0; i < KVM_MAX_VCPUS; ++i) {
783		struct kvm_vcpu *vcpu;
784
785		vcpu = vcpu_load_slot(kvm, i);
786		if (!vcpu)
787			continue;
788		if (new.flags & KVM_MEM_LOG_DIRTY_PAGES)
789			do_remove_write_access(vcpu, mem->slot);
790		kvm_mmu_reset_context(vcpu);
791		vcpu_put(vcpu);
792	}
793
794	kvm_free_physmem_slot(&old, &new);
795	return 0;
796
797out_unlock:
798	spin_unlock(&kvm->lock);
799out_free:
800	kvm_free_physmem_slot(&new, &old);
801out:
802	return r;
803}
804
805/*
806 * Get (and clear) the dirty memory log for a memory slot.
807 */
808static int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
809				      struct kvm_dirty_log *log)
810{
811	struct kvm_memory_slot *memslot;
812	int r, i;
813	int n;
814	int cleared;
815	unsigned long any = 0;
816
817	spin_lock(&kvm->lock);
818
819	/*
820	 * Prevent changes to guest memory configuration even while the lock
821	 * is not taken.
822	 */
823	++kvm->busy;
824	spin_unlock(&kvm->lock);
825	r = -EINVAL;
826	if (log->slot >= KVM_MEMORY_SLOTS)
827		goto out;
828
829	memslot = &kvm->memslots[log->slot];
830	r = -ENOENT;
831	if (!memslot->dirty_bitmap)
832		goto out;
833
834	n = ALIGN(memslot->npages, BITS_PER_LONG) / 8;
835
836	for (i = 0; !any && i < n/sizeof(long); ++i)
837		any = memslot->dirty_bitmap[i];
838
839	r = -EFAULT;
840	if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n))
841		goto out;
842
843	if (any) {
844		cleared = 0;
845		for (i = 0; i < KVM_MAX_VCPUS; ++i) {
846			struct kvm_vcpu *vcpu;
847
848			vcpu = vcpu_load_slot(kvm, i);
849			if (!vcpu)
850				continue;
851			if (!cleared) {
852				do_remove_write_access(vcpu, log->slot);
853				memset(memslot->dirty_bitmap, 0, n);
854				cleared = 1;
855			}
856			kvm_arch_ops->tlb_flush(vcpu);
857			vcpu_put(vcpu);
858		}
859	}
860
861	r = 0;
862
863out:
864	spin_lock(&kvm->lock);
865	--kvm->busy;
866	spin_unlock(&kvm->lock);
867	return r;
868}
869
870/*
871 * Set a new alias region.  Aliases map a portion of physical memory into
872 * another portion.  This is useful for memory windows, for example the PC
873 * VGA region.
874 */
875static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
876					 struct kvm_memory_alias *alias)
877{
878	int r, n;
879	struct kvm_mem_alias *p;
880
881	r = -EINVAL;
882	/* General sanity checks */
883	if (alias->memory_size & (PAGE_SIZE - 1))
884		goto out;
885	if (alias->guest_phys_addr & (PAGE_SIZE - 1))
886		goto out;
887	if (alias->slot >= KVM_ALIAS_SLOTS)
888		goto out;
889	if (alias->guest_phys_addr + alias->memory_size
890	    < alias->guest_phys_addr)
891		goto out;
892	if (alias->target_phys_addr + alias->memory_size
893	    < alias->target_phys_addr)
894		goto out;
895
896	spin_lock(&kvm->lock);
897
898	p = &kvm->aliases[alias->slot];
899	p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT;
900	p->npages = alias->memory_size >> PAGE_SHIFT;
901	p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT;
902
903	for (n = KVM_ALIAS_SLOTS; n > 0; --n)
904		if (kvm->aliases[n - 1].npages)
905			break;
906	kvm->naliases = n;
907
908	spin_unlock(&kvm->lock);
909
910	vcpu_load(&kvm->vcpus[0]);
911	spin_lock(&kvm->lock);
912	kvm_mmu_zap_all(&kvm->vcpus[0]);
913	spin_unlock(&kvm->lock);
914	vcpu_put(&kvm->vcpus[0]);
915
916	return 0;
917
918out:
919	return r;
920}
921
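/*
 * Translate @gfn through the alias table; returns the target frame of the
 * covering alias, or @gfn unchanged if no alias covers it.
 */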
922static gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
923{
924	int i;
925	struct kvm_mem_alias *alias;
926
927	for (i = 0; i < kvm->naliases; ++i) {
928		alias = &kvm->aliases[i];
929		if (gfn >= alias->base_gfn
930		    && gfn < alias->base_gfn + alias->npages)
931			return alias->target_gfn + gfn - alias->base_gfn;
932	}
933	return gfn;
934}
935
936static struct kvm_memory_slot *__gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
937{
938	int i;
939
940	for (i = 0; i < kvm->nmemslots; ++i) {
941		struct kvm_memory_slot *memslot = &kvm->memslots[i];
942
943		if (gfn >= memslot->base_gfn
944		    && gfn < memslot->base_gfn + memslot->npages)
945			return memslot;
946	}
947	return NULL;
948}
949
950struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
951{
952	gfn = unalias_gfn(kvm, gfn);
953	return __gfn_to_memslot(kvm, gfn);
954}
955
956struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
957{
958	struct kvm_memory_slot *slot;
959
960	gfn = unalias_gfn(kvm, gfn);
961	slot = __gfn_to_memslot(kvm, gfn);
962	if (!slot)
963		return NULL;
964	return slot->phys_mem[gfn - slot->base_gfn];
965}
966EXPORT_SYMBOL_GPL(gfn_to_page);
967
968void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
969{
970	int i;
971	struct kvm_memory_slot *memslot = NULL;
972	unsigned long rel_gfn;
973
974	for (i = 0; i < kvm->nmemslots; ++i) {
975		memslot = &kvm->memslots[i];
976
977		if (gfn >= memslot->base_gfn
978		    && gfn < memslot->base_gfn + memslot->npages) {
979
			if (!memslot->dirty_bitmap)
				return;
982
983			rel_gfn = gfn - memslot->base_gfn;
984
985			/* avoid RMW */
986			if (!test_bit(rel_gfn, memslot->dirty_bitmap))
987				set_bit(rel_gfn, memslot->dirty_bitmap);
988			return;
989		}
990	}
991}
992
993static int emulator_read_std(unsigned long addr,
994			     void *val,
995			     unsigned int bytes,
996			     struct x86_emulate_ctxt *ctxt)
997{
998	struct kvm_vcpu *vcpu = ctxt->vcpu;
999	void *data = val;
1000
1001	while (bytes) {
1002		gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
1003		unsigned offset = addr & (PAGE_SIZE-1);
1004		unsigned tocopy = min(bytes, (unsigned)PAGE_SIZE - offset);
1005		unsigned long pfn;
1006		struct page *page;
1007		void *page_virt;
1008
1009		if (gpa == UNMAPPED_GVA)
1010			return X86EMUL_PROPAGATE_FAULT;
1011		pfn = gpa >> PAGE_SHIFT;
1012		page = gfn_to_page(vcpu->kvm, pfn);
1013		if (!page)
1014			return X86EMUL_UNHANDLEABLE;
1015		page_virt = kmap_atomic(page, KM_USER0);
1016
1017		memcpy(data, page_virt + offset, tocopy);
1018
1019		kunmap_atomic(page_virt, KM_USER0);
1020
1021		bytes -= tocopy;
1022		data += tocopy;
1023		addr += tocopy;
1024	}
1025
1026	return X86EMUL_CONTINUE;
1027}
1028
1029static int emulator_write_std(unsigned long addr,
1030			      const void *val,
1031			      unsigned int bytes,
1032			      struct x86_emulate_ctxt *ctxt)
1033{
1034	printk(KERN_ERR "emulator_write_std: addr %lx n %d\n",
1035	       addr, bytes);
1036	return X86EMUL_UNHANDLEABLE;
1037}
1038
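/*
 * Emulated memory read: first complete any pending mmio read, then try a
 * plain guest-memory read, and finally fall back to scheduling an mmio exit
 * to userspace.
 */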
1039static int emulator_read_emulated(unsigned long addr,
1040				  void *val,
1041				  unsigned int bytes,
1042				  struct x86_emulate_ctxt *ctxt)
1043{
1044	struct kvm_vcpu *vcpu = ctxt->vcpu;
1045
1046	if (vcpu->mmio_read_completed) {
1047		memcpy(val, vcpu->mmio_data, bytes);
1048		vcpu->mmio_read_completed = 0;
1049		return X86EMUL_CONTINUE;
1050	} else if (emulator_read_std(addr, val, bytes, ctxt)
1051		   == X86EMUL_CONTINUE)
1052		return X86EMUL_CONTINUE;
1053	else {
1054		gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
1055
1056		if (gpa == UNMAPPED_GVA)
1057			return X86EMUL_PROPAGATE_FAULT;
1058		vcpu->mmio_needed = 1;
1059		vcpu->mmio_phys_addr = gpa;
1060		vcpu->mmio_size = bytes;
1061		vcpu->mmio_is_write = 0;
1062
1063		return X86EMUL_UNHANDLEABLE;
1064	}
1065}
1066
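/*
 * Write @bytes of @val directly into guest physical memory at @gpa.  Returns
 * 1 on success, or 0 (so the caller falls back to mmio) if the range crosses
 * a page boundary or is not backed by a memory slot.
 */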
1067static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
1068			       const void *val, int bytes)
1069{
1070	struct page *page;
1071	void *virt;
1072
1073	if (((gpa + bytes - 1) >> PAGE_SHIFT) != (gpa >> PAGE_SHIFT))
1074		return 0;
1075	page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
1076	if (!page)
1077		return 0;
1078	kvm_mmu_pre_write(vcpu, gpa, bytes);
1079	mark_page_dirty(vcpu->kvm, gpa >> PAGE_SHIFT);
1080	virt = kmap_atomic(page, KM_USER0);
1081	memcpy(virt + offset_in_page(gpa), val, bytes);
1082	kunmap_atomic(virt, KM_USER0);
1083	kvm_mmu_post_write(vcpu, gpa, bytes);
1084	return 1;
1085}
1086
1087static int emulator_write_emulated(unsigned long addr,
1088				   const void *val,
1089				   unsigned int bytes,
1090				   struct x86_emulate_ctxt *ctxt)
1091{
1092	struct kvm_vcpu *vcpu = ctxt->vcpu;
1093	gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
1094
1095	if (gpa == UNMAPPED_GVA) {
1096		kvm_arch_ops->inject_page_fault(vcpu, addr, 2);
1097		return X86EMUL_PROPAGATE_FAULT;
1098	}
1099
1100	if (emulator_write_phys(vcpu, gpa, val, bytes))
1101		return X86EMUL_CONTINUE;
1102
1103	vcpu->mmio_needed = 1;
1104	vcpu->mmio_phys_addr = gpa;
1105	vcpu->mmio_size = bytes;
1106	vcpu->mmio_is_write = 1;
1107	memcpy(vcpu->mmio_data, val, bytes);
1108
1109	return X86EMUL_CONTINUE;
1110}
1111
1112static int emulator_cmpxchg_emulated(unsigned long addr,
1113				     const void *old,
1114				     const void *new,
1115				     unsigned int bytes,
1116				     struct x86_emulate_ctxt *ctxt)
1117{
1118	static int reported;
1119
1120	if (!reported) {
1121		reported = 1;
1122		printk(KERN_WARNING "kvm: emulating exchange as write\n");
1123	}
1124	return emulator_write_emulated(addr, new, bytes, ctxt);
1125}
1126
1127static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
1128{
1129	return kvm_arch_ops->get_segment_base(vcpu, seg);
1130}
1131
1132int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
1133{
1134	return X86EMUL_CONTINUE;
1135}
1136
1137int emulate_clts(struct kvm_vcpu *vcpu)
1138{
1139	unsigned long cr0;
1140
1141	cr0 = vcpu->cr0 & ~CR0_TS_MASK;
1142	kvm_arch_ops->set_cr0(vcpu, cr0);
1143	return X86EMUL_CONTINUE;
1144}
1145
int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest)
1147{
1148	struct kvm_vcpu *vcpu = ctxt->vcpu;
1149
1150	switch (dr) {
1151	case 0 ... 3:
1152		*dest = kvm_arch_ops->get_dr(vcpu, dr);
1153		return X86EMUL_CONTINUE;
1154	default:
1155		printk(KERN_DEBUG "%s: unexpected dr %u\n",
1156		       __FUNCTION__, dr);
1157		return X86EMUL_UNHANDLEABLE;
1158	}
1159}
1160
1161int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
1162{
1163	unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U;
1164	int exception;
1165
1166	kvm_arch_ops->set_dr(ctxt->vcpu, dr, value & mask, &exception);
	if (exception)
		return X86EMUL_UNHANDLEABLE;
1170	return X86EMUL_CONTINUE;
1171}
1172
1173static void report_emulation_failure(struct x86_emulate_ctxt *ctxt)
1174{
1175	static int reported;
1176	u8 opcodes[4];
1177	unsigned long rip = ctxt->vcpu->rip;
1178	unsigned long rip_linear;
1179
1180	rip_linear = rip + get_segment_base(ctxt->vcpu, VCPU_SREG_CS);
1181
1182	if (reported)
1183		return;
1184
1185	emulator_read_std(rip_linear, (void *)opcodes, 4, ctxt);
1186
1187	printk(KERN_ERR "emulation failed but !mmio_needed?"
1188	       " rip %lx %02x %02x %02x %02x\n",
1189	       rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]);
1190	reported = 1;
1191}
1192
1193struct x86_emulate_ops emulate_ops = {
1194	.read_std            = emulator_read_std,
1195	.write_std           = emulator_write_std,
1196	.read_emulated       = emulator_read_emulated,
1197	.write_emulated      = emulator_write_emulated,
1198	.cmpxchg_emulated    = emulator_cmpxchg_emulated,
1199};
1200
1201int emulate_instruction(struct kvm_vcpu *vcpu,
1202			struct kvm_run *run,
1203			unsigned long cr2,
1204			u16 error_code)
1205{
1206	struct x86_emulate_ctxt emulate_ctxt;
1207	int r;
1208	int cs_db, cs_l;
1209
1210	vcpu->mmio_fault_cr2 = cr2;
1211	kvm_arch_ops->cache_regs(vcpu);
1212
1213	kvm_arch_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
1214
1215	emulate_ctxt.vcpu = vcpu;
1216	emulate_ctxt.eflags = kvm_arch_ops->get_rflags(vcpu);
1217	emulate_ctxt.cr2 = cr2;
1218	emulate_ctxt.mode = (emulate_ctxt.eflags & X86_EFLAGS_VM)
1219		? X86EMUL_MODE_REAL : cs_l
1220		? X86EMUL_MODE_PROT64 :	cs_db
1221		? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
1222
1223	if (emulate_ctxt.mode == X86EMUL_MODE_PROT64) {
1224		emulate_ctxt.cs_base = 0;
1225		emulate_ctxt.ds_base = 0;
1226		emulate_ctxt.es_base = 0;
1227		emulate_ctxt.ss_base = 0;
1228	} else {
1229		emulate_ctxt.cs_base = get_segment_base(vcpu, VCPU_SREG_CS);
1230		emulate_ctxt.ds_base = get_segment_base(vcpu, VCPU_SREG_DS);
1231		emulate_ctxt.es_base = get_segment_base(vcpu, VCPU_SREG_ES);
1232		emulate_ctxt.ss_base = get_segment_base(vcpu, VCPU_SREG_SS);
1233	}
1234
1235	emulate_ctxt.gs_base = get_segment_base(vcpu, VCPU_SREG_GS);
1236	emulate_ctxt.fs_base = get_segment_base(vcpu, VCPU_SREG_FS);
1237
1238	vcpu->mmio_is_write = 0;
1239	r = x86_emulate_memop(&emulate_ctxt, &emulate_ops);
1240
1241	if ((r || vcpu->mmio_is_write) && run) {
1242		run->mmio.phys_addr = vcpu->mmio_phys_addr;
1243		memcpy(run->mmio.data, vcpu->mmio_data, 8);
1244		run->mmio.len = vcpu->mmio_size;
1245		run->mmio.is_write = vcpu->mmio_is_write;
1246	}
1247
1248	if (r) {
1249		if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
1250			return EMULATE_DONE;
1251		if (!vcpu->mmio_needed) {
1252			report_emulation_failure(&emulate_ctxt);
1253			return EMULATE_FAIL;
1254		}
1255		return EMULATE_DO_MMIO;
1256	}
1257
1258	kvm_arch_ops->decache_regs(vcpu);
1259	kvm_arch_ops->set_rflags(vcpu, emulate_ctxt.eflags);
1260
1261	if (vcpu->mmio_is_write) {
1262		vcpu->mmio_needed = 0;
1263		return EMULATE_DO_MMIO;
1264	}
1265
1266	return EMULATE_DONE;
1267}
1268EXPORT_SYMBOL_GPL(emulate_instruction);
1269
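/*
 * Handle a guest hypercall.  The call number and arguments are taken from
 * RAX/RDI/RSI/RDX/RCX/R8/R9 in long mode, or RBX/RAX/RCX/RDX/RSI/RDI/RBP
 * otherwise.  Unrecognized numbers are forwarded to userspace through
 * run->hypercall; the result is copied back into RAX on the next KVM_RUN
 * (see kvm_vcpu_ioctl_run()).
 */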
1270int kvm_hypercall(struct kvm_vcpu *vcpu, struct kvm_run *run)
1271{
1272	unsigned long nr, a0, a1, a2, a3, a4, a5, ret;
1273
1274	kvm_arch_ops->cache_regs(vcpu);
1275	ret = -KVM_EINVAL;
1276#ifdef CONFIG_X86_64
1277	if (is_long_mode(vcpu)) {
1278		nr = vcpu->regs[VCPU_REGS_RAX];
1279		a0 = vcpu->regs[VCPU_REGS_RDI];
1280		a1 = vcpu->regs[VCPU_REGS_RSI];
1281		a2 = vcpu->regs[VCPU_REGS_RDX];
1282		a3 = vcpu->regs[VCPU_REGS_RCX];
1283		a4 = vcpu->regs[VCPU_REGS_R8];
1284		a5 = vcpu->regs[VCPU_REGS_R9];
1285	} else
1286#endif
1287	{
1288		nr = vcpu->regs[VCPU_REGS_RBX] & -1u;
1289		a0 = vcpu->regs[VCPU_REGS_RAX] & -1u;
1290		a1 = vcpu->regs[VCPU_REGS_RCX] & -1u;
1291		a2 = vcpu->regs[VCPU_REGS_RDX] & -1u;
1292		a3 = vcpu->regs[VCPU_REGS_RSI] & -1u;
1293		a4 = vcpu->regs[VCPU_REGS_RDI] & -1u;
1294		a5 = vcpu->regs[VCPU_REGS_RBP] & -1u;
1295	}
1296	switch (nr) {
1297	default:
1298		run->hypercall.args[0] = a0;
1299		run->hypercall.args[1] = a1;
1300		run->hypercall.args[2] = a2;
1301		run->hypercall.args[3] = a3;
1302		run->hypercall.args[4] = a4;
1303		run->hypercall.args[5] = a5;
1304		run->hypercall.ret = ret;
1305		run->hypercall.longmode = is_long_mode(vcpu);
1306		kvm_arch_ops->decache_regs(vcpu);
1307		return 0;
1308	}
1309	vcpu->regs[VCPU_REGS_RAX] = ret;
1310	kvm_arch_ops->decache_regs(vcpu);
1311	return 1;
1312}
1313EXPORT_SYMBOL_GPL(kvm_hypercall);
1314
1315static u64 mk_cr_64(u64 curr_cr, u32 new_val)
1316{
1317	return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
1318}
1319
1320void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
1321{
1322	struct descriptor_table dt = { limit, base };
1323
1324	kvm_arch_ops->set_gdt(vcpu, &dt);
1325}
1326
1327void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
1328{
1329	struct descriptor_table dt = { limit, base };
1330
1331	kvm_arch_ops->set_idt(vcpu, &dt);
1332}
1333
1334void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
1335		   unsigned long *rflags)
1336{
1337	lmsw(vcpu, msw);
1338	*rflags = kvm_arch_ops->get_rflags(vcpu);
1339}
1340
1341unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
1342{
1343	kvm_arch_ops->decache_cr4_guest_bits(vcpu);
1344	switch (cr) {
1345	case 0:
1346		return vcpu->cr0;
1347	case 2:
1348		return vcpu->cr2;
1349	case 3:
1350		return vcpu->cr3;
1351	case 4:
1352		return vcpu->cr4;
1353	default:
1354		vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
1355		return 0;
1356	}
1357}
1358
1359void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
1360		     unsigned long *rflags)
1361{
1362	switch (cr) {
1363	case 0:
1364		set_cr0(vcpu, mk_cr_64(vcpu->cr0, val));
1365		*rflags = kvm_arch_ops->get_rflags(vcpu);
1366		break;
1367	case 2:
1368		vcpu->cr2 = val;
1369		break;
1370	case 3:
1371		set_cr3(vcpu, val);
1372		break;
1373	case 4:
1374		set_cr4(vcpu, mk_cr_64(vcpu->cr4, val));
1375		break;
1376	default:
1377		vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
1378	}
1379}
1380
1381/*
1382 * Register the para guest with the host:
1383 */
1384static int vcpu_register_para(struct kvm_vcpu *vcpu, gpa_t para_state_gpa)
1385{
1386	struct kvm_vcpu_para_state *para_state;
1387	hpa_t para_state_hpa, hypercall_hpa;
1388	struct page *para_state_page;
1389	unsigned char *hypercall;
1390	gpa_t hypercall_gpa;
1391
1392	printk(KERN_DEBUG "kvm: guest trying to enter paravirtual mode\n");
1393	printk(KERN_DEBUG ".... para_state_gpa: %08Lx\n", para_state_gpa);
1394
1395	/*
1396	 * Needs to be page aligned:
1397	 */
1398	if (para_state_gpa != PAGE_ALIGN(para_state_gpa))
1399		goto err_gp;
1400
1401	para_state_hpa = gpa_to_hpa(vcpu, para_state_gpa);
1402	printk(KERN_DEBUG ".... para_state_hpa: %08Lx\n", para_state_hpa);
1403	if (is_error_hpa(para_state_hpa))
1404		goto err_gp;
1405
1406	mark_page_dirty(vcpu->kvm, para_state_gpa >> PAGE_SHIFT);
1407	para_state_page = pfn_to_page(para_state_hpa >> PAGE_SHIFT);
1408	para_state = kmap_atomic(para_state_page, KM_USER0);
1409
1410	printk(KERN_DEBUG "....  guest version: %d\n", para_state->guest_version);
1411	printk(KERN_DEBUG "....           size: %d\n", para_state->size);
1412
1413	para_state->host_version = KVM_PARA_API_VERSION;
1414	/*
1415	 * We cannot support guests that try to register themselves
1416	 * with a newer API version than the host supports:
1417	 */
1418	if (para_state->guest_version > KVM_PARA_API_VERSION) {
1419		para_state->ret = -KVM_EINVAL;
1420		goto err_kunmap_skip;
1421	}
1422
1423	hypercall_gpa = para_state->hypercall_gpa;
1424	hypercall_hpa = gpa_to_hpa(vcpu, hypercall_gpa);
1425	printk(KERN_DEBUG ".... hypercall_hpa: %08Lx\n", hypercall_hpa);
1426	if (is_error_hpa(hypercall_hpa)) {
1427		para_state->ret = -KVM_EINVAL;
1428		goto err_kunmap_skip;
1429	}
1430
1431	printk(KERN_DEBUG "kvm: para guest successfully registered.\n");
1432	vcpu->para_state_page = para_state_page;
1433	vcpu->para_state_gpa = para_state_gpa;
1434	vcpu->hypercall_gpa = hypercall_gpa;
1435
1436	mark_page_dirty(vcpu->kvm, hypercall_gpa >> PAGE_SHIFT);
1437	hypercall = kmap_atomic(pfn_to_page(hypercall_hpa >> PAGE_SHIFT),
1438				KM_USER1) + (hypercall_hpa & ~PAGE_MASK);
1439	kvm_arch_ops->patch_hypercall(vcpu, hypercall);
1440	kunmap_atomic(hypercall, KM_USER1);
1441
1442	para_state->ret = 0;
1443err_kunmap_skip:
1444	kunmap_atomic(para_state, KM_USER0);
1445	return 0;
1446err_gp:
1447	return 1;
1448}
1449
1450int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1451{
1452	u64 data;
1453
1454	switch (msr) {
1455	case 0xc0010010: /* SYSCFG */
1456	case 0xc0010015: /* HWCR */
1457	case MSR_IA32_PLATFORM_ID:
1458	case MSR_IA32_P5_MC_ADDR:
1459	case MSR_IA32_P5_MC_TYPE:
1460	case MSR_IA32_MC0_CTL:
1461	case MSR_IA32_MCG_STATUS:
1462	case MSR_IA32_MCG_CAP:
1463	case MSR_IA32_MC0_MISC:
1464	case MSR_IA32_MC0_MISC+4:
1465	case MSR_IA32_MC0_MISC+8:
1466	case MSR_IA32_MC0_MISC+12:
1467	case MSR_IA32_MC0_MISC+16:
1468	case MSR_IA32_UCODE_REV:
1469	case MSR_IA32_PERF_STATUS:
1470		/* MTRR registers */
1471	case 0xfe:
1472	case 0x200 ... 0x2ff:
1473		data = 0;
1474		break;
1475	case 0xcd: /* fsb frequency */
1476		data = 3;
1477		break;
1478	case MSR_IA32_APICBASE:
1479		data = vcpu->apic_base;
1480		break;
1481	case MSR_IA32_MISC_ENABLE:
1482		data = vcpu->ia32_misc_enable_msr;
1483		break;
1484#ifdef CONFIG_X86_64
1485	case MSR_EFER:
1486		data = vcpu->shadow_efer;
1487		break;
1488#endif
1489	default:
1490		printk(KERN_ERR "kvm: unhandled rdmsr: 0x%x\n", msr);
1491		return 1;
1492	}
1493	*pdata = data;
1494	return 0;
1495}
1496EXPORT_SYMBOL_GPL(kvm_get_msr_common);
1497
1498/*
1499 * Reads an msr value (of 'msr_index') into 'pdata'.
1500 * Returns 0 on success, non-0 otherwise.
1501 * Assumes vcpu_load() was already called.
1502 */
1503static int get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
1504{
1505	return kvm_arch_ops->get_msr(vcpu, msr_index, pdata);
1506}
1507
1508#ifdef CONFIG_X86_64
1509
1510static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
1511{
1512	if (efer & EFER_RESERVED_BITS) {
1513		printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n",
1514		       efer);
1515		inject_gp(vcpu);
1516		return;
1517	}
1518
1519	if (is_paging(vcpu)
1520	    && (vcpu->shadow_efer & EFER_LME) != (efer & EFER_LME)) {
1521		printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n");
1522		inject_gp(vcpu);
1523		return;
1524	}
1525
1526	kvm_arch_ops->set_efer(vcpu, efer);
1527
1528	efer &= ~EFER_LMA;
1529	efer |= vcpu->shadow_efer & EFER_LMA;
1530
1531	vcpu->shadow_efer = efer;
1532}
1533
1534#endif
1535
1536int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1537{
1538	switch (msr) {
1539#ifdef CONFIG_X86_64
1540	case MSR_EFER:
1541		set_efer(vcpu, data);
1542		break;
1543#endif
1544	case MSR_IA32_MC0_STATUS:
1545		printk(KERN_WARNING "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n",
1546		       __FUNCTION__, data);
1547		break;
1548	case MSR_IA32_MCG_STATUS:
1549		printk(KERN_WARNING "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n",
1550			__FUNCTION__, data);
1551		break;
1552	case MSR_IA32_UCODE_REV:
1553	case MSR_IA32_UCODE_WRITE:
1554	case 0x200 ... 0x2ff: /* MTRRs */
1555		break;
1556	case MSR_IA32_APICBASE:
1557		vcpu->apic_base = data;
1558		break;
1559	case MSR_IA32_MISC_ENABLE:
1560		vcpu->ia32_misc_enable_msr = data;
1561		break;
1562	/*
1563	 * This is the 'probe whether the host is KVM' logic:
1564	 */
1565	case MSR_KVM_API_MAGIC:
1566		return vcpu_register_para(vcpu, data);
1567
1568	default:
1569		printk(KERN_ERR "kvm: unhandled wrmsr: 0x%x\n", msr);
1570		return 1;
1571	}
1572	return 0;
1573}
1574EXPORT_SYMBOL_GPL(kvm_set_msr_common);
1575
1576/*
 * Writes msr value into the appropriate "register".
1578 * Returns 0 on success, non-0 otherwise.
1579 * Assumes vcpu_load() was already called.
1580 */
1581static int set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
1582{
1583	return kvm_arch_ops->set_msr(vcpu, msr_index, data);
1584}
1585
1586void kvm_resched(struct kvm_vcpu *vcpu)
1587{
1588	if (!need_resched())
1589		return;
1590	vcpu_put(vcpu);
1591	cond_resched();
1592	vcpu_load(vcpu);
1593}
1594EXPORT_SYMBOL_GPL(kvm_resched);
1595
1596void load_msrs(struct vmx_msr_entry *e, int n)
1597{
1598	int i;
1599
1600	for (i = 0; i < n; ++i)
1601		wrmsrl(e[i].index, e[i].data);
1602}
1603EXPORT_SYMBOL_GPL(load_msrs);
1604
1605void save_msrs(struct vmx_msr_entry *e, int n)
1606{
1607	int i;
1608
1609	for (i = 0; i < n; ++i)
1610		rdmsrl(e[i].index, e[i].data);
1611}
1612EXPORT_SYMBOL_GPL(save_msrs);
1613
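/*
 * Answer CPUID from the table supplied by userspace.  Without an exact match
 * for the requested leaf, fall back to the highest entry in the same (basic
 * or extended) range; with no match at all the registers read as zero.
 */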
1614void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
1615{
1616	int i;
1617	u32 function;
1618	struct kvm_cpuid_entry *e, *best;
1619
1620	kvm_arch_ops->cache_regs(vcpu);
1621	function = vcpu->regs[VCPU_REGS_RAX];
1622	vcpu->regs[VCPU_REGS_RAX] = 0;
1623	vcpu->regs[VCPU_REGS_RBX] = 0;
1624	vcpu->regs[VCPU_REGS_RCX] = 0;
1625	vcpu->regs[VCPU_REGS_RDX] = 0;
1626	best = NULL;
1627	for (i = 0; i < vcpu->cpuid_nent; ++i) {
1628		e = &vcpu->cpuid_entries[i];
1629		if (e->function == function) {
1630			best = e;
1631			break;
1632		}
1633		/*
1634		 * Both basic or both extended?
1635		 */
1636		if (((e->function ^ function) & 0x80000000) == 0)
1637			if (!best || e->function > best->function)
1638				best = e;
1639	}
1640	if (best) {
1641		vcpu->regs[VCPU_REGS_RAX] = best->eax;
1642		vcpu->regs[VCPU_REGS_RBX] = best->ebx;
1643		vcpu->regs[VCPU_REGS_RCX] = best->ecx;
1644		vcpu->regs[VCPU_REGS_RDX] = best->edx;
1645	}
1646	kvm_arch_ops->decache_regs(vcpu);
1647	kvm_arch_ops->skip_emulated_instruction(vcpu);
1648}
1649EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
1650
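/*
 * Copy string pio data between the vcpu's pio_data page and the pinned guest
 * pages.  The guest buffer may straddle two pages, so it is mapped with
 * vmap() for the duration of the copy.
 */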
1651static int pio_copy_data(struct kvm_vcpu *vcpu)
1652{
1653	void *p = vcpu->pio_data;
1654	void *q;
1655	unsigned bytes;
1656	int nr_pages = vcpu->pio.guest_pages[1] ? 2 : 1;
1657
1658	kvm_arch_ops->vcpu_put(vcpu);
1659	q = vmap(vcpu->pio.guest_pages, nr_pages, VM_READ|VM_WRITE,
1660		 PAGE_KERNEL);
1661	if (!q) {
1662		kvm_arch_ops->vcpu_load(vcpu);
1663		free_pio_guest_pages(vcpu);
1664		return -ENOMEM;
1665	}
1666	q += vcpu->pio.guest_page_offset;
1667	bytes = vcpu->pio.size * vcpu->pio.cur_count;
1668	if (vcpu->pio.in)
1669		memcpy(q, p, bytes);
1670	else
1671		memcpy(p, q, bytes);
1672	q -= vcpu->pio.guest_page_offset;
1673	vunmap(q);
1674	kvm_arch_ops->vcpu_load(vcpu);
1675	free_pio_guest_pages(vcpu);
1676	return 0;
1677}
1678
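/*
 * Finish a pio operation after userspace has handled it: copy data back into
 * the guest for INs, advance RSI/RDI (and RCX for REP) for string I/O, and
 * skip the emulated instruction once the full count has been transferred.
 */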
1679static int complete_pio(struct kvm_vcpu *vcpu)
1680{
1681	struct kvm_pio_request *io = &vcpu->pio;
1682	long delta;
1683	int r;
1684
1685	kvm_arch_ops->cache_regs(vcpu);
1686
1687	if (!io->string) {
1688		if (io->in)
1689			memcpy(&vcpu->regs[VCPU_REGS_RAX], vcpu->pio_data,
1690			       io->size);
1691	} else {
1692		if (io->in) {
1693			r = pio_copy_data(vcpu);
1694			if (r) {
1695				kvm_arch_ops->cache_regs(vcpu);
1696				return r;
1697			}
1698		}
1699
1700		delta = 1;
1701		if (io->rep) {
1702			delta *= io->cur_count;
1703			/*
1704			 * The size of the register should really depend on
1705			 * current address size.
1706			 */
1707			vcpu->regs[VCPU_REGS_RCX] -= delta;
1708		}
1709		if (io->down)
1710			delta = -delta;
1711		delta *= io->size;
1712		if (io->in)
1713			vcpu->regs[VCPU_REGS_RDI] += delta;
1714		else
1715			vcpu->regs[VCPU_REGS_RSI] += delta;
1716	}
1717
1718	kvm_arch_ops->decache_regs(vcpu);
1719
1720	io->count -= io->cur_count;
1721	io->cur_count = 0;
1722
1723	if (!io->count)
1724		kvm_arch_ops->skip_emulated_instruction(vcpu);
1725	return 0;
1726}
1727
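/*
 * Prepare a KVM_EXIT_IO exit.  Non-string I/O goes straight through the
 * pio_data page; string I/O additionally pins one or two guest pages (two
 * when the buffer straddles a page boundary) for pio_copy_data().
 */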
1728int kvm_setup_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
1729		  int size, unsigned long count, int string, int down,
1730		  gva_t address, int rep, unsigned port)
1731{
1732	unsigned now, in_page;
1733	int i;
1734	int nr_pages = 1;
1735	struct page *page;
1736
1737	vcpu->run->exit_reason = KVM_EXIT_IO;
1738	vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
1739	vcpu->run->io.size = size;
1740	vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
1741	vcpu->run->io.count = count;
1742	vcpu->run->io.port = port;
1743	vcpu->pio.count = count;
1744	vcpu->pio.cur_count = count;
1745	vcpu->pio.size = size;
1746	vcpu->pio.in = in;
1747	vcpu->pio.string = string;
1748	vcpu->pio.down = down;
1749	vcpu->pio.guest_page_offset = offset_in_page(address);
1750	vcpu->pio.rep = rep;
1751
1752	if (!string) {
1753		kvm_arch_ops->cache_regs(vcpu);
1754		memcpy(vcpu->pio_data, &vcpu->regs[VCPU_REGS_RAX], 4);
1755		kvm_arch_ops->decache_regs(vcpu);
1756		return 0;
1757	}
1758
1759	if (!count) {
1760		kvm_arch_ops->skip_emulated_instruction(vcpu);
1761		return 1;
1762	}
1763
1766	if (!down)
1767		in_page = PAGE_SIZE - offset_in_page(address);
1768	else
1769		in_page = offset_in_page(address) + size;
1770	now = min(count, (unsigned long)in_page / size);
1771	if (!now) {
1772		/*
1773		 * String I/O straddles page boundary.  Pin two guest pages
1774		 * so that we satisfy atomicity constraints.  Do just one
1775		 * transaction to avoid complexity.
1776		 */
1777		nr_pages = 2;
1778		now = 1;
1779	}
1780	if (down) {
1781		/*
1782		 * String I/O in reverse.  Yuck.  Kill the guest, fix later.
1783		 */
1784		printk(KERN_ERR "kvm: guest string pio down\n");
1785		inject_gp(vcpu);
1786		return 1;
1787	}
1788	vcpu->run->io.count = now;
1789	vcpu->pio.cur_count = now;
1790
1791	for (i = 0; i < nr_pages; ++i) {
1792		spin_lock(&vcpu->kvm->lock);
1793		page = gva_to_page(vcpu, address + i * PAGE_SIZE);
1794		if (page)
1795			get_page(page);
1796		vcpu->pio.guest_pages[i] = page;
1797		spin_unlock(&vcpu->kvm->lock);
1798		if (!page) {
1799			inject_gp(vcpu);
1800			free_pio_guest_pages(vcpu);
1801			return 1;
1802		}
1803	}
1804
1805	if (!vcpu->pio.in)
1806		return pio_copy_data(vcpu);
1807	return 0;
1808}
1809EXPORT_SYMBOL_GPL(kvm_setup_pio);
1810
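/*
 * Run the vcpu: complete any pio or mmio left pending from the previous exit,
 * then hand control to the arch-specific run loop.
 */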
1811static int kvm_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1812{
1813	int r;
1814	sigset_t sigsaved;
1815
1816	vcpu_load(vcpu);
1817
1818	if (vcpu->sigset_active)
1819		sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
1820
1821	/* re-sync apic's tpr */
1822	vcpu->cr8 = kvm_run->cr8;
1823
1824	if (vcpu->pio.cur_count) {
1825		r = complete_pio(vcpu);
1826		if (r)
1827			goto out;
1828	}
1829
1830	if (vcpu->mmio_needed) {
1831		memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
1832		vcpu->mmio_read_completed = 1;
1833		vcpu->mmio_needed = 0;
1834		r = emulate_instruction(vcpu, kvm_run,
1835					vcpu->mmio_fault_cr2, 0);
1836		if (r == EMULATE_DO_MMIO) {
1837			/*
1838			 * Read-modify-write.  Back to userspace.
1839			 */
1840			kvm_run->exit_reason = KVM_EXIT_MMIO;
1841			r = 0;
1842			goto out;
1843		}
1844	}
1845
1846	if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) {
1847		kvm_arch_ops->cache_regs(vcpu);
1848		vcpu->regs[VCPU_REGS_RAX] = kvm_run->hypercall.ret;
1849		kvm_arch_ops->decache_regs(vcpu);
1850	}
1851
1852	r = kvm_arch_ops->run(vcpu, kvm_run);
1853
1854out:
1855	if (vcpu->sigset_active)
1856		sigprocmask(SIG_SETMASK, &sigsaved, NULL);
1857
1858	vcpu_put(vcpu);
1859	return r;
1860}
1861
1862static int kvm_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu,
1863				   struct kvm_regs *regs)
1864{
1865	vcpu_load(vcpu);
1866
1867	kvm_arch_ops->cache_regs(vcpu);
1868
1869	regs->rax = vcpu->regs[VCPU_REGS_RAX];
1870	regs->rbx = vcpu->regs[VCPU_REGS_RBX];
1871	regs->rcx = vcpu->regs[VCPU_REGS_RCX];
1872	regs->rdx = vcpu->regs[VCPU_REGS_RDX];
1873	regs->rsi = vcpu->regs[VCPU_REGS_RSI];
1874	regs->rdi = vcpu->regs[VCPU_REGS_RDI];
1875	regs->rsp = vcpu->regs[VCPU_REGS_RSP];
1876	regs->rbp = vcpu->regs[VCPU_REGS_RBP];
1877#ifdef CONFIG_X86_64
1878	regs->r8 = vcpu->regs[VCPU_REGS_R8];
1879	regs->r9 = vcpu->regs[VCPU_REGS_R9];
1880	regs->r10 = vcpu->regs[VCPU_REGS_R10];
1881	regs->r11 = vcpu->regs[VCPU_REGS_R11];
1882	regs->r12 = vcpu->regs[VCPU_REGS_R12];
1883	regs->r13 = vcpu->regs[VCPU_REGS_R13];
1884	regs->r14 = vcpu->regs[VCPU_REGS_R14];
1885	regs->r15 = vcpu->regs[VCPU_REGS_R15];
1886#endif
1887
1888	regs->rip = vcpu->rip;
1889	regs->rflags = kvm_arch_ops->get_rflags(vcpu);
1890
1891	/*
1892	 * Don't leak debug flags in case they were set for guest debugging
1893	 */
1894	if (vcpu->guest_debug.enabled && vcpu->guest_debug.singlestep)
1895		regs->rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
1896
1897	vcpu_put(vcpu);
1898
1899	return 0;
1900}
1901
1902static int kvm_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu,
1903				   struct kvm_regs *regs)
1904{
1905	vcpu_load(vcpu);
1906
1907	vcpu->regs[VCPU_REGS_RAX] = regs->rax;
1908	vcpu->regs[VCPU_REGS_RBX] = regs->rbx;
1909	vcpu->regs[VCPU_REGS_RCX] = regs->rcx;
1910	vcpu->regs[VCPU_REGS_RDX] = regs->rdx;
1911	vcpu->regs[VCPU_REGS_RSI] = regs->rsi;
1912	vcpu->regs[VCPU_REGS_RDI] = regs->rdi;
1913	vcpu->regs[VCPU_REGS_RSP] = regs->rsp;
1914	vcpu->regs[VCPU_REGS_RBP] = regs->rbp;
1915#ifdef CONFIG_X86_64
1916	vcpu->regs[VCPU_REGS_R8] = regs->r8;
1917	vcpu->regs[VCPU_REGS_R9] = regs->r9;
1918	vcpu->regs[VCPU_REGS_R10] = regs->r10;
1919	vcpu->regs[VCPU_REGS_R11] = regs->r11;
1920	vcpu->regs[VCPU_REGS_R12] = regs->r12;
1921	vcpu->regs[VCPU_REGS_R13] = regs->r13;
1922	vcpu->regs[VCPU_REGS_R14] = regs->r14;
1923	vcpu->regs[VCPU_REGS_R15] = regs->r15;
1924#endif
1925
1926	vcpu->rip = regs->rip;
1927	kvm_arch_ops->set_rflags(vcpu, regs->rflags);
1928
1929	kvm_arch_ops->decache_regs(vcpu);
1930
1931	vcpu_put(vcpu);
1932
1933	return 0;
1934}
1935
1936static void get_segment(struct kvm_vcpu *vcpu,
1937			struct kvm_segment *var, int seg)
1938{
1939	return kvm_arch_ops->get_segment(vcpu, var, seg);
1940}
1941
1942static int kvm_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
1943				    struct kvm_sregs *sregs)
1944{
1945	struct descriptor_table dt;
1946
1947	vcpu_load(vcpu);
1948
1949	get_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
1950	get_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
1951	get_segment(vcpu, &sregs->es, VCPU_SREG_ES);
1952	get_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
1953	get_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
1954	get_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
1955
1956	get_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
1957	get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
1958
1959	kvm_arch_ops->get_idt(vcpu, &dt);
1960	sregs->idt.limit = dt.limit;
1961	sregs->idt.base = dt.base;
1962	kvm_arch_ops->get_gdt(vcpu, &dt);
1963	sregs->gdt.limit = dt.limit;
1964	sregs->gdt.base = dt.base;
1965
1966	kvm_arch_ops->decache_cr4_guest_bits(vcpu);
1967	sregs->cr0 = vcpu->cr0;
1968	sregs->cr2 = vcpu->cr2;
1969	sregs->cr3 = vcpu->cr3;
1970	sregs->cr4 = vcpu->cr4;
1971	sregs->cr8 = vcpu->cr8;
1972	sregs->efer = vcpu->shadow_efer;
1973	sregs->apic_base = vcpu->apic_base;
1974
1975	memcpy(sregs->interrupt_bitmap, vcpu->irq_pending,
1976	       sizeof sregs->interrupt_bitmap);
1977
1978	vcpu_put(vcpu);
1979
1980	return 0;
1981}
1982
1983static void set_segment(struct kvm_vcpu *vcpu,
1984			struct kvm_segment *var, int seg)
1985{
1986	return kvm_arch_ops->set_segment(vcpu, var, seg);
1987}
1988
1989static int kvm_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
1990				    struct kvm_sregs *sregs)
1991{
1992	int mmu_reset_needed = 0;
1993	int i;
1994	struct descriptor_table dt;
1995
1996	vcpu_load(vcpu);
1997
1998	dt.limit = sregs->idt.limit;
1999	dt.base = sregs->idt.base;
2000	kvm_arch_ops->set_idt(vcpu, &dt);
2001	dt.limit = sregs->gdt.limit;
2002	dt.base = sregs->gdt.base;
2003	kvm_arch_ops->set_gdt(vcpu, &dt);
2004
2005	vcpu->cr2 = sregs->cr2;
2006	mmu_reset_needed |= vcpu->cr3 != sregs->cr3;
2007	vcpu->cr3 = sregs->cr3;
2008
2009	vcpu->cr8 = sregs->cr8;
2010
2011	mmu_reset_needed |= vcpu->shadow_efer != sregs->efer;
2012#ifdef CONFIG_X86_64
2013	kvm_arch_ops->set_efer(vcpu, sregs->efer);
2014#endif
2015	vcpu->apic_base = sregs->apic_base;
2016
2017	kvm_arch_ops->decache_cr4_guest_bits(vcpu);
2018
2019	mmu_reset_needed |= vcpu->cr0 != sregs->cr0;
2020	kvm_arch_ops->set_cr0(vcpu, sregs->cr0);
2021
2022	mmu_reset_needed |= vcpu->cr4 != sregs->cr4;
2023	kvm_arch_ops->set_cr4(vcpu, sregs->cr4);
2024	if (!is_long_mode(vcpu) && is_pae(vcpu))
2025		load_pdptrs(vcpu, vcpu->cr3);
2026
2027	if (mmu_reset_needed)
2028		kvm_mmu_reset_context(vcpu);
2029
2030	memcpy(vcpu->irq_pending, sregs->interrupt_bitmap,
2031	       sizeof vcpu->irq_pending);
2032	vcpu->irq_summary = 0;
2033	for (i = 0; i < NR_IRQ_WORDS; ++i)
2034		if (vcpu->irq_pending[i])
2035			__set_bit(i, &vcpu->irq_summary);
2036
2037	set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
2038	set_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
2039	set_segment(vcpu, &sregs->es, VCPU_SREG_ES);
2040	set_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
2041	set_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
2042	set_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
2043
2044	set_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
2045	set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
2046
2047	vcpu_put(vcpu);
2048
2049	return 0;
2050}
2051
2052/*
2053 * List of msr numbers which we expose to userspace through KVM_GET_MSRS
2054 * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
2055 *
2056 * This list is modified at module load time to reflect the
2057 * capabilities of the host cpu.
2058 */
2059static u32 msrs_to_save[] = {
2060	MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
2061	MSR_K6_STAR,
2062#ifdef CONFIG_X86_64
2063	MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
2064#endif
2065	MSR_IA32_TIME_STAMP_COUNTER,
2066};
2067
2068static unsigned num_msrs_to_save;
2069
2070static u32 emulated_msrs[] = {
2071	MSR_IA32_MISC_ENABLE,
2072};
2073
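/*
 * Probe each entry of msrs_to_save with rdmsr_safe() and compact the array
 * so that it only lists MSRs the host cpu actually implements.
 */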
2074static __init void kvm_init_msr_list(void)
2075{
2076	u32 dummy[2];
2077	unsigned i, j;
2078
2079	for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) {
2080		if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0)
2081			continue;
2082		if (j < i)
2083			msrs_to_save[j] = msrs_to_save[i];
2084		j++;
2085	}
2086	num_msrs_to_save = j;
2087}
2088
2089/*
2090 * Adapt set_msr() to msr_io()'s calling convention
2091 */
2092static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
2093{
2094	return set_msr(vcpu, index, *data);
2095}
2096
2097/*
2098 * Read or write a bunch of msrs. All parameters are kernel addresses.
2099 *
2100 * @return number of msrs set successfully.
2101 */
2102static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
2103		    struct kvm_msr_entry *entries,
2104		    int (*do_msr)(struct kvm_vcpu *vcpu,
2105				  unsigned index, u64 *data))
2106{
2107	int i;
2108
2109	vcpu_load(vcpu);
2110
2111	for (i = 0; i < msrs->nmsrs; ++i)
2112		if (do_msr(vcpu, entries[i].index, &entries[i].data))
2113			break;
2114
2115	vcpu_put(vcpu);
2116
2117	return i;
2118}
2119
2120/*
2121 * Read or write a bunch of msrs. Parameters are user addresses.
2122 *
2123 * @return number of msrs set successfully.
2124 */
2125static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs,
2126		  int (*do_msr)(struct kvm_vcpu *vcpu,
2127				unsigned index, u64 *data),
2128		  int writeback)
2129{
2130	struct kvm_msrs msrs;
2131	struct kvm_msr_entry *entries;
2132	int r, n;
2133	unsigned size;
2134
2135	r = -EFAULT;
2136	if (copy_from_user(&msrs, user_msrs, sizeof msrs))
2137		goto out;
2138
2139	r = -E2BIG;
2140	if (msrs.nmsrs >= MAX_IO_MSRS)
2141		goto out;
2142
2143	r = -ENOMEM;
2144	size = sizeof(struct kvm_msr_entry) * msrs.nmsrs;
2145	entries = vmalloc(size);
2146	if (!entries)
2147		goto out;
2148
2149	r = -EFAULT;
2150	if (copy_from_user(entries, user_msrs->entries, size))
2151		goto out_free;
2152
2153	r = n = __msr_io(vcpu, &msrs, entries, do_msr);
2154	if (r < 0)
2155		goto out_free;
2156
2157	r = -EFAULT;
2158	if (writeback && copy_to_user(user_msrs->entries, entries, size))
2159		goto out_free;
2160
2161	r = n;
2162
2163out_free:
2164	vfree(entries);
2165out:
2166	return r;
2167}
2168
2169/*
2170 * Translate a guest virtual address to a guest physical address.
2171 */
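/*
 * Illustrative userspace sketch, where gva holds the guest virtual
 * address to translate (error handling omitted):
 *
 *	struct kvm_translation tr = { .linear_address = gva };
 *
 *	ioctl(vcpu_fd, KVM_TRANSLATE, &tr);
 *	if (tr.valid)
 *		printf("gpa: %llx\n", tr.physical_address);
 */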
2172static int kvm_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
2173				    struct kvm_translation *tr)
2174{
2175	unsigned long vaddr = tr->linear_address;
2176	gpa_t gpa;
2177
2178	vcpu_load(vcpu);
2179	spin_lock(&vcpu->kvm->lock);
2180	gpa = vcpu->mmu.gva_to_gpa(vcpu, vaddr);
2181	tr->physical_address = gpa;
2182	tr->valid = gpa != UNMAPPED_GVA;
2183	tr->writeable = 1;
2184	tr->usermode = 0;
2185	spin_unlock(&vcpu->kvm->lock);
2186	vcpu_put(vcpu);
2187
2188	return 0;
2189}
2190
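/*
 * Queue an external interrupt (vector 0-255) for injection into the
 * guest.  Reached via the KVM_INTERRUPT vcpu ioctl, whose argument is a
 * struct kvm_interrupt carrying the vector number.
 */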
2191static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
2192				    struct kvm_interrupt *irq)
2193{
2194	if (irq->irq < 0 || irq->irq >= 256)
2195		return -EINVAL;
2196	vcpu_load(vcpu);
2197
2198	set_bit(irq->irq, vcpu->irq_pending);
2199	set_bit(irq->irq / BITS_PER_LONG, &vcpu->irq_summary);
2200
2201	vcpu_put(vcpu);
2202
2203	return 0;
2204}
2205
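/*
 * Install guest debugging state (KVM_DEBUG_GUEST) by handing the request
 * to the arch-specific set_guest_debug() hook with the vcpu loaded.
 */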
2206static int kvm_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu,
2207				      struct kvm_debug_guest *dbg)
2208{
2209	int r;
2210
2211	vcpu_load(vcpu);
2212
2213	r = kvm_arch_ops->set_guest_debug(vcpu, dbg);
2214
2215	vcpu_put(vcpu);
2216
2217	return r;
2218}
2219
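/*
 * Fault handler for mmap()ed vcpu fds.  Page 0 of the mapping is the
 * shared kvm_run structure and page KVM_PIO_PAGE_OFFSET is the PIO data
 * page, matching the 2 * PAGE_SIZE reported by KVM_GET_VCPU_MMAP_SIZE.
 * Illustrative userspace sketch (the size should really be queried with
 * KVM_GET_VCPU_MMAP_SIZE):
 *
 *	struct kvm_run *run = mmap(NULL, 2 * PAGE_SIZE,
 *				   PROT_READ | PROT_WRITE, MAP_SHARED,
 *				   vcpu_fd, 0);
 */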
2220static struct page *kvm_vcpu_nopage(struct vm_area_struct *vma,
2221				    unsigned long address,
2222				    int *type)
2223{
2224	struct kvm_vcpu *vcpu = vma->vm_file->private_data;
2225	unsigned long pgoff;
2226	struct page *page;
2227
2228	*type = VM_FAULT_MINOR;
2229	pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
2230	if (pgoff == 0)
2231		page = virt_to_page(vcpu->run);
2232	else if (pgoff == KVM_PIO_PAGE_OFFSET)
2233		page = virt_to_page(vcpu->pio_data);
2234	else
2235		return NOPAGE_SIGBUS;
2236	get_page(page);
2237	return page;
2238}
2239
2240static struct vm_operations_struct kvm_vcpu_vm_ops = {
2241	.nopage = kvm_vcpu_nopage,
2242};
2243
2244static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma)
2245{
2246	vma->vm_ops = &kvm_vcpu_vm_ops;
2247	return 0;
2248}
2249
2250static int kvm_vcpu_release(struct inode *inode, struct file *filp)
2251{
2252	struct kvm_vcpu *vcpu = filp->private_data;
2253
2254	fput(vcpu->kvm->filp);
2255	return 0;
2256}
2257
2258static struct file_operations kvm_vcpu_fops = {
2259	.release        = kvm_vcpu_release,
2260	.unlocked_ioctl = kvm_vcpu_ioctl,
2261	.compat_ioctl   = kvm_vcpu_ioctl,
2262	.mmap           = kvm_vcpu_mmap,
2263};
2264
2265/*
2266 * Allocates an inode, file and fd for the vcpu, pinning the vm's file.
2267 */
2268static int create_vcpu_fd(struct kvm_vcpu *vcpu)
2269{
2270	int fd, r;
2271	struct inode *inode;
2272	struct file *file;
2273
2274	atomic_inc(&vcpu->kvm->filp->f_count);
2275	inode = kvmfs_inode(&kvm_vcpu_fops);
2276	if (IS_ERR(inode)) {
2277		r = PTR_ERR(inode);
2278		goto out1;
2279	}
2280
2281	file = kvmfs_file(inode, vcpu);
2282	if (IS_ERR(file)) {
2283		r = PTR_ERR(file);
2284		goto out2;
2285	}
2286
2287	r = get_unused_fd();
2288	if (r < 0)
2289		goto out3;
2290	fd = r;
2291	fd_install(fd, file);
2292
2293	return fd;
2294
2295out3:
2296	fput(file);
2297out2:
2298	iput(inode);
2299out1:
2300	fput(vcpu->kvm->filp);
2301	return r;
2302}
2303
2304/*
2305 * Creates some virtual cpus.  Good luck creating more than one.
2306 */
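/*
 * Illustrative userspace sketch, creating vcpu 0 on an existing vm fd:
 *
 *	int vcpu_fd = ioctl(vm_fd, KVM_CREATE_VCPU, 0);
 */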
2307static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n)
2308{
2309	int r;
2310	struct kvm_vcpu *vcpu;
2311	struct page *page;
2312
2313	r = -EINVAL;
2314	if (!valid_vcpu(n))
2315		goto out;
2316
2317	vcpu = &kvm->vcpus[n];
2318
2319	mutex_lock(&vcpu->mutex);
2320
2321	if (vcpu->vmcs) {
2322		mutex_unlock(&vcpu->mutex);
2323		return -EEXIST;
2324	}
2325
2326	page = alloc_page(GFP_KERNEL | __GFP_ZERO);
2327	r = -ENOMEM;
2328	if (!page)
2329		goto out_unlock;
2330	vcpu->run = page_address(page);
2331
2332	page = alloc_page(GFP_KERNEL | __GFP_ZERO);
2333	r = -ENOMEM;
2334	if (!page)
2335		goto out_free_run;
2336	vcpu->pio_data = page_address(page);
2337
2338	vcpu->host_fx_image = (char *)ALIGN((hva_t)vcpu->fx_buf,
2339					   FX_IMAGE_ALIGN);
2340	vcpu->guest_fx_image = vcpu->host_fx_image + FX_IMAGE_SIZE;
2341	vcpu->cr0 = 0x10;
2342
2343	r = kvm_arch_ops->vcpu_create(vcpu);
2344	if (r < 0)
2345		goto out_free_vcpus;
2346
2347	r = kvm_mmu_create(vcpu);
2348	if (r < 0)
2349		goto out_free_vcpus;
2350
2351	kvm_arch_ops->vcpu_load(vcpu);
2352	r = kvm_mmu_setup(vcpu);
2353	if (r >= 0)
2354		r = kvm_arch_ops->vcpu_setup(vcpu);
2355	vcpu_put(vcpu);
2356
2357	if (r < 0)
2358		goto out_free_vcpus;
2359
2360	r = create_vcpu_fd(vcpu);
2361	if (r < 0)
2362		goto out_free_vcpus;
2363
2364	return r;
2365
2366out_free_vcpus:
2367	kvm_free_vcpu(vcpu);
2368out_free_run:
2369	free_page((unsigned long)vcpu->run);
2370	vcpu->run = NULL;
2371out_unlock:
2372	mutex_unlock(&vcpu->mutex);
2373out:
2374	return r;
2375}
2376
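/*
 * Copy up to KVM_MAX_CPUID_ENTRIES cpuid entries from userspace into the
 * vcpu; they are consulted when the guest's cpuid instruction is emulated.
 */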
2377static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
2378				    struct kvm_cpuid *cpuid,
2379				    struct kvm_cpuid_entry __user *entries)
2380{
2381	int r;
2382
2383	r = -E2BIG;
2384	if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
2385		goto out;
2386	r = -EFAULT;
2387	if (copy_from_user(&vcpu->cpuid_entries, entries,
2388			   cpuid->nent * sizeof(struct kvm_cpuid_entry)))
2389		goto out;
2390	vcpu->cpuid_nent = cpuid->nent;
2391	return 0;
2392
2393out:
2394	return r;
2395}
2396
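/*
 * Install (or clear, when sigset is NULL) the signal mask to be used
 * while the vcpu runs guest code.  SIGKILL and SIGSTOP are always
 * removed from the set since they cannot be blocked.
 */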
2397static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset)
2398{
2399	if (sigset) {
2400		sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP));
2401		vcpu->sigset_active = 1;
2402		vcpu->sigset = *sigset;
2403	} else
2404		vcpu->sigset_active = 0;
2405	return 0;
2406}
2407
2408/*
2409 * fxsave fpu state.  Taken from x86_64/processor.h.  To be killed when
2410 * we have asm/x86/processor.h
2411 */
2412struct fxsave {
2413	u16	cwd;
2414	u16	swd;
2415	u16	twd;
2416	u16	fop;
2417	u64	rip;
2418	u64	rdp;
2419	u32	mxcsr;
2420	u32	mxcsr_mask;
2421	u32	st_space[32];	/* 8*16 bytes for each FP-reg = 128 bytes */
2422#ifdef CONFIG_X86_64
2423	u32	xmm_space[64];	/* 16*16 bytes for each XMM-reg = 256 bytes */
2424#else
2425	u32	xmm_space[32];	/* 8*16 bytes for each XMM-reg = 128 bytes */
2426#endif
2427};
2428
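/*
 * KVM_GET_FPU/KVM_SET_FPU: convert between the guest fxsave image and
 * the flat struct kvm_fpu exchanged with userspace.
 */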
2429static int kvm_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
2430{
2431	struct fxsave *fxsave = (struct fxsave *)vcpu->guest_fx_image;
2432
2433	vcpu_load(vcpu);
2434
2435	memcpy(fpu->fpr, fxsave->st_space, 128);
2436	fpu->fcw = fxsave->cwd;
2437	fpu->fsw = fxsave->swd;
2438	fpu->ftwx = fxsave->twd;
2439	fpu->last_opcode = fxsave->fop;
2440	fpu->last_ip = fxsave->rip;
2441	fpu->last_dp = fxsave->rdp;
2442	memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space);
2443
2444	vcpu_put(vcpu);
2445
2446	return 0;
2447}
2448
2449static int kvm_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
2450{
2451	struct fxsave *fxsave = (struct fxsave *)vcpu->guest_fx_image;
2452
2453	vcpu_load(vcpu);
2454
2455	memcpy(fxsave->st_space, fpu->fpr, 128);
2456	fxsave->cwd = fpu->fcw;
2457	fxsave->swd = fpu->fsw;
2458	fxsave->twd = fpu->ftwx;
2459	fxsave->fop = fpu->last_opcode;
2460	fxsave->rip = fpu->last_ip;
2461	fxsave->rdp = fpu->last_dp;
2462	memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space);
2463
2464	vcpu_put(vcpu);
2465
2466	return 0;
2467}
2468
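/*
 * Dispatcher for the per-vcpu ioctls issued on a vcpu fd, e.g.:
 *
 *	struct kvm_regs regs;
 *
 *	ioctl(vcpu_fd, KVM_GET_REGS, &regs);
 */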
2469static long kvm_vcpu_ioctl(struct file *filp,
2470			   unsigned int ioctl, unsigned long arg)
2471{
2472	struct kvm_vcpu *vcpu = filp->private_data;
2473	void __user *argp = (void __user *)arg;
2474	int r = -EINVAL;
2475
2476	switch (ioctl) {
2477	case KVM_RUN:
2478		r = -EINVAL;
2479		if (arg)
2480			goto out;
2481		r = kvm_vcpu_ioctl_run(vcpu, vcpu->run);
2482		break;
2483	case KVM_GET_REGS: {
2484		struct kvm_regs kvm_regs;
2485
2486		memset(&kvm_regs, 0, sizeof kvm_regs);
2487		r = kvm_vcpu_ioctl_get_regs(vcpu, &kvm_regs);
2488		if (r)
2489			goto out;
2490		r = -EFAULT;
2491		if (copy_to_user(argp, &kvm_regs, sizeof kvm_regs))
2492			goto out;
2493		r = 0;
2494		break;
2495	}
2496	case KVM_SET_REGS: {
2497		struct kvm_regs kvm_regs;
2498
2499		r = -EFAULT;
2500		if (copy_from_user(&kvm_regs, argp, sizeof kvm_regs))
2501			goto out;
2502		r = kvm_vcpu_ioctl_set_regs(vcpu, &kvm_regs);
2503		if (r)
2504			goto out;
2505		r = 0;
2506		break;
2507	}
2508	case KVM_GET_SREGS: {
2509		struct kvm_sregs kvm_sregs;
2510
2511		memset(&kvm_sregs, 0, sizeof kvm_sregs);
2512		r = kvm_vcpu_ioctl_get_sregs(vcpu, &kvm_sregs);
2513		if (r)
2514			goto out;
2515		r = -EFAULT;
2516		if (copy_to_user(argp, &kvm_sregs, sizeof kvm_sregs))
2517			goto out;
2518		r = 0;
2519		break;
2520	}
2521	case KVM_SET_SREGS: {
2522		struct kvm_sregs kvm_sregs;
2523
2524		r = -EFAULT;
2525		if (copy_from_user(&kvm_sregs, argp, sizeof kvm_sregs))
2526			goto out;
2527		r = kvm_vcpu_ioctl_set_sregs(vcpu, &kvm_sregs);
2528		if (r)
2529			goto out;
2530		r = 0;
2531		break;
2532	}
2533	case KVM_TRANSLATE: {
2534		struct kvm_translation tr;
2535
2536		r = -EFAULT;
2537		if (copy_from_user(&tr, argp, sizeof tr))
2538			goto out;
2539		r = kvm_vcpu_ioctl_translate(vcpu, &tr);
2540		if (r)
2541			goto out;
2542		r = -EFAULT;
2543		if (copy_to_user(argp, &tr, sizeof tr))
2544			goto out;
2545		r = 0;
2546		break;
2547	}
2548	case KVM_INTERRUPT: {
2549		struct kvm_interrupt irq;
2550
2551		r = -EFAULT;
2552		if (copy_from_user(&irq, argp, sizeof irq))
2553			goto out;
2554		r = kvm_vcpu_ioctl_interrupt(vcpu, &irq);
2555		if (r)
2556			goto out;
2557		r = 0;
2558		break;
2559	}
2560	case KVM_DEBUG_GUEST: {
2561		struct kvm_debug_guest dbg;
2562
2563		r = -EFAULT;
2564		if (copy_from_user(&dbg, argp, sizeof dbg))
2565			goto out;
2566		r = kvm_vcpu_ioctl_debug_guest(vcpu, &dbg);
2567		if (r)
2568			goto out;
2569		r = 0;
2570		break;
2571	}
2572	case KVM_GET_MSRS:
2573		r = msr_io(vcpu, argp, get_msr, 1);
2574		break;
2575	case KVM_SET_MSRS:
2576		r = msr_io(vcpu, argp, do_set_msr, 0);
2577		break;
2578	case KVM_SET_CPUID: {
2579		struct kvm_cpuid __user *cpuid_arg = argp;
2580		struct kvm_cpuid cpuid;
2581
2582		r = -EFAULT;
2583		if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
2584			goto out;
2585		r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries);
2586		if (r)
2587			goto out;
2588		break;
2589	}
2590	case KVM_SET_SIGNAL_MASK: {
2591		struct kvm_signal_mask __user *sigmask_arg = argp;
2592		struct kvm_signal_mask kvm_sigmask;
2593		sigset_t sigset, *p;
2594
2595		p = NULL;
2596		if (argp) {
2597			r = -EFAULT;
2598			if (copy_from_user(&kvm_sigmask, argp,
2599					   sizeof kvm_sigmask))
2600				goto out;
2601			r = -EINVAL;
2602			if (kvm_sigmask.len != sizeof sigset)
2603				goto out;
2604			r = -EFAULT;
2605			if (copy_from_user(&sigset, sigmask_arg->sigset,
2606					   sizeof sigset))
2607				goto out;
2608			p = &sigset;
2609		}
2610		r = kvm_vcpu_ioctl_set_sigmask(vcpu, p);
2611		break;
2612	}
2613	case KVM_GET_FPU: {
2614		struct kvm_fpu fpu;
2615
2616		memset(&fpu, 0, sizeof fpu);
2617		r = kvm_vcpu_ioctl_get_fpu(vcpu, &fpu);
2618		if (r)
2619			goto out;
2620		r = -EFAULT;
2621		if (copy_to_user(argp, &fpu, sizeof fpu))
2622			goto out;
2623		r = 0;
2624		break;
2625	}
2626	case KVM_SET_FPU: {
2627		struct kvm_fpu fpu;
2628
2629		r = -EFAULT;
2630		if (copy_from_user(&fpu, argp, sizeof fpu))
2631			goto out;
2632		r = kvm_vcpu_ioctl_set_fpu(vcpu, &fpu);
2633		if (r)
2634			goto out;
2635		r = 0;
2636		break;
2637	}
2638	default:
2639		;
2640	}
2641out:
2642	return r;
2643}
2644
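/*
 * Dispatcher for the per-vm ioctls issued on a vm fd: vcpu creation,
 * memory region and alias setup, and dirty log retrieval.
 */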
2645static long kvm_vm_ioctl(struct file *filp,
2646			   unsigned int ioctl, unsigned long arg)
2647{
2648	struct kvm *kvm = filp->private_data;
2649	void __user *argp = (void __user *)arg;
2650	int r = -EINVAL;
2651
2652	switch (ioctl) {
2653	case KVM_CREATE_VCPU:
2654		r = kvm_vm_ioctl_create_vcpu(kvm, arg);
2655		if (r < 0)
2656			goto out;
2657		break;
2658	case KVM_SET_MEMORY_REGION: {
2659		struct kvm_memory_region kvm_mem;
2660
2661		r = -EFAULT;
2662		if (copy_from_user(&kvm_mem, argp, sizeof kvm_mem))
2663			goto out;
2664		r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_mem);
2665		if (r)
2666			goto out;
2667		break;
2668	}
2669	case KVM_GET_DIRTY_LOG: {
2670		struct kvm_dirty_log log;
2671
2672		r = -EFAULT;
2673		if (copy_from_user(&log, argp, sizeof log))
2674			goto out;
2675		r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
2676		if (r)
2677			goto out;
2678		break;
2679	}
2680	case KVM_SET_MEMORY_ALIAS: {
2681		struct kvm_memory_alias alias;
2682
2683		r = -EFAULT;
2684		if (copy_from_user(&alias, argp, sizeof alias))
2685			goto out;
2686		r = kvm_vm_ioctl_set_memory_alias(kvm, &alias);
2687		if (r)
2688			goto out;
2689		break;
2690	}
2691	default:
2692		;
2693	}
2694out:
2695	return r;
2696}
2697
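/*
 * Fault handler for mmap()ed vm fds: the page offset is interpreted as a
 * guest frame number and backed by the corresponding guest memory page
 * via gfn_to_page().
 */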
2698static struct page *kvm_vm_nopage(struct vm_area_struct *vma,
2699				  unsigned long address,
2700				  int *type)
2701{
2702	struct kvm *kvm = vma->vm_file->private_data;
2703	unsigned long pgoff;
2704	struct page *page;
2705
2706	*type = VM_FAULT_MINOR;
2707	pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
2708	page = gfn_to_page(kvm, pgoff);
2709	if (!page)
2710		return NOPAGE_SIGBUS;
2711	get_page(page);
2712	return page;
2713}
2714
2715static struct vm_operations_struct kvm_vm_vm_ops = {
2716	.nopage = kvm_vm_nopage,
2717};
2718
2719static int kvm_vm_mmap(struct file *file, struct vm_area_struct *vma)
2720{
2721	vma->vm_ops = &kvm_vm_vm_ops;
2722	return 0;
2723}
2724
2725static struct file_operations kvm_vm_fops = {
2726	.release        = kvm_vm_release,
2727	.unlocked_ioctl = kvm_vm_ioctl,
2728	.compat_ioctl   = kvm_vm_ioctl,
2729	.mmap           = kvm_vm_mmap,
2730};
2731
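/*
 * Backs KVM_CREATE_VM: allocate a vm, wrap it in a kvmfs file and return
 * a new file descriptor to userspace.  Illustrative sketch:
 *
 *	int kvm_fd = open("/dev/kvm", O_RDWR);
 *	int vm_fd  = ioctl(kvm_fd, KVM_CREATE_VM, 0);
 */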
2732static int kvm_dev_ioctl_create_vm(void)
2733{
2734	int fd, r;
2735	struct inode *inode;
2736	struct file *file;
2737	struct kvm *kvm;
2738
2739	inode = kvmfs_inode(&kvm_vm_fops);
2740	if (IS_ERR(inode)) {
2741		r = PTR_ERR(inode);
2742		goto out1;
2743	}
2744
2745	kvm = kvm_create_vm();
2746	if (IS_ERR(kvm)) {
2747		r = PTR_ERR(kvm);
2748		goto out2;
2749	}
2750
2751	file = kvmfs_file(inode, kvm);
2752	if (IS_ERR(file)) {
2753		r = PTR_ERR(file);
2754		goto out3;
2755	}
2756	kvm->filp = file;
2757
2758	r = get_unused_fd();
2759	if (r < 0)
2760		goto out4;
2761	fd = r;
2762	fd_install(fd, file);
2763
2764	return fd;
2765
2766out4:
2767	fput(file);
2768out3:
2769	kvm_destroy_vm(kvm);
2770out2:
2771	iput(inode);
2772out1:
2773	return r;
2774}
2775
2776static long kvm_dev_ioctl(struct file *filp,
2777			  unsigned int ioctl, unsigned long arg)
2778{
2779	void __user *argp = (void __user *)arg;
2780	long r = -EINVAL;
2781
2782	switch (ioctl) {
2783	case KVM_GET_API_VERSION:
2784		r = -EINVAL;
2785		if (arg)
2786			goto out;
2787		r = KVM_API_VERSION;
2788		break;
2789	case KVM_CREATE_VM:
2790		r = -EINVAL;
2791		if (arg)
2792			goto out;
2793		r = kvm_dev_ioctl_create_vm();
2794		break;
2795	case KVM_GET_MSR_INDEX_LIST: {
2796		struct kvm_msr_list __user *user_msr_list = argp;
2797		struct kvm_msr_list msr_list;
2798		unsigned n;
2799
2800		r = -EFAULT;
2801		if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list))
2802			goto out;
2803		n = msr_list.nmsrs;
2804		msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs);
2805		if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list))
2806			goto out;
2807		r = -E2BIG;
2808		if (n < msr_list.nmsrs)
2809			goto out;
2810		r = -EFAULT;
2811		if (copy_to_user(user_msr_list->indices, &msrs_to_save,
2812				 num_msrs_to_save * sizeof(u32)))
2813			goto out;
2814		if (copy_to_user(user_msr_list->indices
2815				 + num_msrs_to_save,
2816				 &emulated_msrs,
2817				 ARRAY_SIZE(emulated_msrs) * sizeof(u32)))
2818			goto out;
2819		r = 0;
2820		break;
2821	}
2822	case KVM_CHECK_EXTENSION:
2823		/*
2824		 * No extensions defined at present.
2825		 */
2826		r = 0;
2827		break;
2828	case KVM_GET_VCPU_MMAP_SIZE:
2829		r = -EINVAL;
2830		if (arg)
2831			goto out;
2832		r = 2 * PAGE_SIZE;
2833		break;
2834	default:
2835		;
2836	}
2837out:
2838	return r;
2839}
2840
2841static struct file_operations kvm_chardev_ops = {
2842	.open		= kvm_dev_open,
2843	.release        = kvm_dev_release,
2844	.unlocked_ioctl = kvm_dev_ioctl,
2845	.compat_ioctl   = kvm_dev_ioctl,
2846};
2847
2848static struct miscdevice kvm_dev = {
2849	.minor = KVM_MINOR,
2850	.name  = "kvm",
2851	.fops  = &kvm_chardev_ops,
2852};
2853
2854static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
2855                       void *v)
2856{
2857	if (val == SYS_RESTART) {
2858		/*
2859		 * Some (well, at least mine) BIOSes hang on reboot if the
2860		 * cpu is still in vmx root mode.
2861		 */
2862		printk(KERN_INFO "kvm: exiting hardware virtualization\n");
2863		on_each_cpu(kvm_arch_ops->hardware_disable, NULL, 0, 1);
2864	}
2865	return NOTIFY_OK;
2866}
2867
2868static struct notifier_block kvm_reboot_notifier = {
2869	.notifier_call = kvm_reboot,
2870	.priority = 0,
2871};
2872
2873/*
2874 * Make sure that a cpu that is being hot-unplugged does not have any vcpus
2875 * cached on it.
2876 */
2877static void decache_vcpus_on_cpu(int cpu)
2878{
2879	struct kvm *vm;
2880	struct kvm_vcpu *vcpu;
2881	int i;
2882
2883	spin_lock(&kvm_lock);
2884	list_for_each_entry(vm, &vm_list, vm_list)
2885		for (i = 0; i < KVM_MAX_VCPUS; ++i) {
2886			vcpu = &vm->vcpus[i];
2887			/*
2888			 * If the vcpu is locked, then it is running on some
2889			 * other cpu and therefore it is not cached on the
2890			 * cpu in question.
2891			 *
2892			 * If it's not locked, check the last cpu it executed
2893			 * on.
2894			 */
2895			if (mutex_trylock(&vcpu->mutex)) {
2896				if (vcpu->cpu == cpu) {
2897					kvm_arch_ops->vcpu_decache(vcpu);
2898					vcpu->cpu = -1;
2899				}
2900				mutex_unlock(&vcpu->mutex);
2901			}
2902		}
2903	spin_unlock(&kvm_lock);
2904}
2905
2906static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val,
2907			   void *v)
2908{
2909	int cpu = (long)v;
2910
2911	switch (val) {
2912	case CPU_DOWN_PREPARE:
2913	case CPU_DOWN_PREPARE_FROZEN:
2914	case CPU_UP_CANCELED:
2915	case CPU_UP_CANCELED_FROZEN:
2916		printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
2917		       cpu);
2918		decache_vcpus_on_cpu(cpu);
2919		smp_call_function_single(cpu, kvm_arch_ops->hardware_disable,
2920					 NULL, 0, 1);
2921		break;
2922	case CPU_ONLINE:
2923	case CPU_ONLINE_FROZEN:
2924		printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n",
2925		       cpu);
2926		smp_call_function_single(cpu, kvm_arch_ops->hardware_enable,
2927					 NULL, 0, 1);
2928		break;
2929	}
2930	return NOTIFY_OK;
2931}
2932
2933static struct notifier_block kvm_cpu_notifier = {
2934	.notifier_call = kvm_cpu_hotplug,
2935	.priority = 20, /* must be > scheduler priority */
2936};
2937
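/*
 * debugfs read callback: sum the per-vcpu counter at the given offset
 * over every vcpu of every vm.  Writes are silently ignored (stat_set
 * below); the files are created read-only anyway.
 */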
2938static u64 stat_get(void *_offset)
2939{
2940	unsigned offset = (long)_offset;
2941	u64 total = 0;
2942	struct kvm *kvm;
2943	struct kvm_vcpu *vcpu;
2944	int i;
2945
2946	spin_lock(&kvm_lock);
2947	list_for_each_entry(kvm, &vm_list, vm_list)
2948		for (i = 0; i < KVM_MAX_VCPUS; ++i) {
2949			vcpu = &kvm->vcpus[i];
2950			total += *(u32 *)((void *)vcpu + offset);
2951		}
2952	spin_unlock(&kvm_lock);
2953	return total;
2954}
2955
2956static void stat_set(void *offset, u64 val)
2957{
2958}
2959
2960DEFINE_SIMPLE_ATTRIBUTE(stat_fops, stat_get, stat_set, "%llu\n");
2961
2962static __init void kvm_init_debug(void)
2963{
2964	struct kvm_stats_debugfs_item *p;
2965
2966	debugfs_dir = debugfs_create_dir("kvm", NULL);
2967	for (p = debugfs_entries; p->name; ++p)
2968		p->dentry = debugfs_create_file(p->name, 0444, debugfs_dir,
2969						(void *)(long)p->offset,
2970						&stat_fops);
2971}
2972
2973static void kvm_exit_debug(void)
2974{
2975	struct kvm_stats_debugfs_item *p;
2976
2977	for (p = debugfs_entries; p->name; ++p)
2978		debugfs_remove(p->dentry);
2979	debugfs_remove(debugfs_dir);
2980}
2981
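/*
 * Turn hardware virtualization off across a suspend/resume cycle; any
 * vcpus cached on the local cpu are decached first.
 */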
2982static int kvm_suspend(struct sys_device *dev, pm_message_t state)
2983{
2984	decache_vcpus_on_cpu(raw_smp_processor_id());
2985	on_each_cpu(kvm_arch_ops->hardware_disable, NULL, 0, 1);
2986	return 0;
2987}
2988
2989static int kvm_resume(struct sys_device *dev)
2990{
2991	on_each_cpu(kvm_arch_ops->hardware_enable, NULL, 0, 1);
2992	return 0;
2993}
2994
2995static struct sysdev_class kvm_sysdev_class = {
2996	set_kset_name("kvm"),
2997	.suspend = kvm_suspend,
2998	.resume = kvm_resume,
2999};
3000
3001static struct sys_device kvm_sysdev = {
3002	.id = 0,
3003	.cls = &kvm_sysdev_class,
3004};
3005
3006hpa_t bad_page_address;
3007
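/*
 * kvmfs is an internal pseudo filesystem; it exists only to provide the
 * inodes backing the vm and vcpu file descriptors created above.
 */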
3008static int kvmfs_get_sb(struct file_system_type *fs_type, int flags,
3009			const char *dev_name, void *data, struct vfsmount *mnt)
3010{
3011	return get_sb_pseudo(fs_type, "kvm:", NULL, KVMFS_SUPER_MAGIC, mnt);
3012}
3013
3014static struct file_system_type kvm_fs_type = {
3015	.name		= "kvmfs",
3016	.get_sb		= kvmfs_get_sb,
3017	.kill_sb	= kill_anon_super,
3018};
3019
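/*
 * Called by an arch-specific backend module (Intel VMX or AMD SVM) to
 * register its kvm_arch_ops and bring up hardware virtualization on all
 * cpus.  Only one backend may be registered at a time.
 */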
3020int kvm_init_arch(struct kvm_arch_ops *ops, struct module *module)
3021{
3022	int r;
3023
3024	if (kvm_arch_ops) {
3025		printk(KERN_ERR "kvm: already loaded the other module\n");
3026		return -EEXIST;
3027	}
3028
3029	if (!ops->cpu_has_kvm_support()) {
3030		printk(KERN_ERR "kvm: no hardware support\n");
3031		return -EOPNOTSUPP;
3032	}
3033	if (ops->disabled_by_bios()) {
3034		printk(KERN_ERR "kvm: disabled by bios\n");
3035		return -EOPNOTSUPP;
3036	}
3037
3038	kvm_arch_ops = ops;
3039
3040	r = kvm_arch_ops->hardware_setup();
3041	if (r < 0)
3042		goto out;
3043
3044	on_each_cpu(kvm_arch_ops->hardware_enable, NULL, 0, 1);
3045	r = register_cpu_notifier(&kvm_cpu_notifier);
3046	if (r)
3047		goto out_free_1;
3048	register_reboot_notifier(&kvm_reboot_notifier);
3049
3050	r = sysdev_class_register(&kvm_sysdev_class);
3051	if (r)
3052		goto out_free_2;
3053
3054	r = sysdev_register(&kvm_sysdev);
3055	if (r)
3056		goto out_free_3;
3057
3058	kvm_chardev_ops.owner = module;
3059
3060	r = misc_register(&kvm_dev);
3061	if (r) {
3062		printk(KERN_ERR "kvm: misc device register failed\n");
3063		goto out_free;
3064	}
3065
3066	return r;
3067
3068out_free:
3069	sysdev_unregister(&kvm_sysdev);
3070out_free_3:
3071	sysdev_class_unregister(&kvm_sysdev_class);
3072out_free_2:
3073	unregister_reboot_notifier(&kvm_reboot_notifier);
3074	unregister_cpu_notifier(&kvm_cpu_notifier);
3075out_free_1:
3076	on_each_cpu(kvm_arch_ops->hardware_disable, NULL, 0, 1);
3077	kvm_arch_ops->hardware_unsetup();
3078out:
3079	kvm_arch_ops = NULL;
3080	return r;
3081}
3082
3083void kvm_exit_arch(void)
3084{
3085	misc_deregister(&kvm_dev);
3086	sysdev_unregister(&kvm_sysdev);
3087	sysdev_class_unregister(&kvm_sysdev_class);
3088	unregister_reboot_notifier(&kvm_reboot_notifier);
3089	unregister_cpu_notifier(&kvm_cpu_notifier);
3090	on_each_cpu(kvm_arch_ops->hardware_disable, NULL, 0, 1);
3091	kvm_arch_ops->hardware_unsetup();
3092	kvm_arch_ops = NULL;
3093}
3094
3095static __init int kvm_init(void)
3096{
3097	static struct page *bad_page;
3098	int r;
3099
3100	r = kvm_mmu_module_init();
3101	if (r)
3102		goto out4;
3103
3104	r = register_filesystem(&kvm_fs_type);
3105	if (r)
3106		goto out3;
3107
3108	kvmfs_mnt = kern_mount(&kvm_fs_type);
3109	r = PTR_ERR(kvmfs_mnt);
3110	if (IS_ERR(kvmfs_mnt))
3111		goto out2;
3112	kvm_init_debug();
3113
3114	kvm_init_msr_list();
3115
3116	if ((bad_page = alloc_page(GFP_KERNEL)) == NULL) {
3117		r = -ENOMEM;
3118		goto out;
3119	}
3120
3121	bad_page_address = page_to_pfn(bad_page) << PAGE_SHIFT;
3122	memset(__va(bad_page_address), 0, PAGE_SIZE);
3123
3124	return 0;
3125
3126out:
3127	kvm_exit_debug();
3128	mntput(kvmfs_mnt);
3129out2:
3130	unregister_filesystem(&kvm_fs_type);
3131out3:
3132	kvm_mmu_module_exit();
3133out4:
3134	return r;
3135}
3136
3137static __exit void kvm_exit(void)
3138{
3139	kvm_exit_debug();
3140	__free_page(pfn_to_page(bad_page_address >> PAGE_SHIFT));
3141	mntput(kvmfs_mnt);
3142	unregister_filesystem(&kvm_fs_type);
3143	kvm_mmu_module_exit();
3144}
3145
3146module_init(kvm_init)
3147module_exit(kvm_exit)
3148
3149EXPORT_SYMBOL_GPL(kvm_init_arch);
3150EXPORT_SYMBOL_GPL(kvm_exit_arch);
3151