1/*
2 * Kernel-based Virtual Machine driver for Linux
3 *
4 * This module enables machines with Intel VT-x extensions to run virtual
5 * machines without emulation or binary translation.
6 *
7 * Copyright (C) 2006 Qumranet, Inc.
8 *
9 * Authors:
10 *   Avi Kivity   <avi@qumranet.com>
11 *   Yaniv Kamay  <yaniv@qumranet.com>
12 *
13 * This work is licensed under the terms of the GNU GPL, version 2.  See
14 * the COPYING file in the top-level directory.
15 *
16 */
17
18#include "kvm.h"
19#include "vmx.h"
20#include <linux/module.h>
21#include <linux/kernel.h>
22#include <linux/mm.h>
23#include <linux/highmem.h>
24#include <linux/profile.h>
25#include <linux/sched.h>
26#include <asm/io.h>
27#include <asm/desc.h>
28
29#include "segment_descriptor.h"
30
31MODULE_AUTHOR("Qumranet");
32MODULE_LICENSE("GPL");
33
34static DEFINE_PER_CPU(struct vmcs *, vmxarea);
35static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
36
37#ifdef CONFIG_X86_64
38#define HOST_IS_64 1
39#else
40#define HOST_IS_64 0
41#endif
42
43static struct vmcs_descriptor {
44	int size;
45	int order;
46	u32 revision_id;
47} vmcs_descriptor;
48
49#define VMX_SEGMENT_FIELD(seg)					\
50	[VCPU_SREG_##seg] = {                                   \
51		.selector = GUEST_##seg##_SELECTOR,		\
52		.base = GUEST_##seg##_BASE,		   	\
53		.limit = GUEST_##seg##_LIMIT,		   	\
54		.ar_bytes = GUEST_##seg##_AR_BYTES,	   	\
55	}
56
57static struct kvm_vmx_segment_field {
58	unsigned selector;
59	unsigned base;
60	unsigned limit;
61	unsigned ar_bytes;
62} kvm_vmx_segment_fields[] = {
63	VMX_SEGMENT_FIELD(CS),
64	VMX_SEGMENT_FIELD(DS),
65	VMX_SEGMENT_FIELD(ES),
66	VMX_SEGMENT_FIELD(FS),
67	VMX_SEGMENT_FIELD(GS),
68	VMX_SEGMENT_FIELD(SS),
69	VMX_SEGMENT_FIELD(TR),
70	VMX_SEGMENT_FIELD(LDTR),
71};
72
73/*
 * Keep MSR_K6_STAR last, so that setup_msrs() can drop it from the
 * automatic save/load area simply by decrementing the count.
76 */
77static const u32 vmx_msr_index[] = {
78#ifdef CONFIG_X86_64
79	MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, MSR_KERNEL_GS_BASE,
80#endif
81	MSR_EFER, MSR_K6_STAR,
82};
83#define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index)
84
85#ifdef CONFIG_X86_64
86static unsigned msr_offset_kernel_gs_base;
87#define NR_64BIT_MSRS 4
88/*
 * Avoid saving/loading MSR_SYSCALL_MASK and MSR_LSTAR via the standard
 * VT MSR save/load mechanism (works around CPU erratum AA24).
91 */
92#define NR_BAD_MSRS 2
93#else
94#define NR_64BIT_MSRS 0
95#define NR_BAD_MSRS 0
96#endif
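
/*
 * The 64-bit MSRs come first in vmx_msr_index[]: NR_64BIT_MSRS lets
 * setup_msrs() skip all of them for a legacy mode guest, while
 * NR_BAD_MSRS skips just MSR_SYSCALL_MASK and MSR_LSTAR (kept out of
 * the automatic save/load area because of erratum AA24) for a long
 * mode guest.
 */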
97
98static inline int is_page_fault(u32 intr_info)
99{
100	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
101			     INTR_INFO_VALID_MASK)) ==
102		(INTR_TYPE_EXCEPTION | PF_VECTOR | INTR_INFO_VALID_MASK);
103}
104
105static inline int is_no_device(u32 intr_info)
106{
107	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
108			     INTR_INFO_VALID_MASK)) ==
109		(INTR_TYPE_EXCEPTION | NM_VECTOR | INTR_INFO_VALID_MASK);
110}
111
112static inline int is_external_interrupt(u32 intr_info)
113{
114	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
115		== (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
116}
117
118static struct vmx_msr_entry *find_msr_entry(struct kvm_vcpu *vcpu, u32 msr)
119{
120	int i;
121
122	for (i = 0; i < vcpu->nmsrs; ++i)
123		if (vcpu->guest_msrs[i].index == msr)
124			return &vcpu->guest_msrs[i];
125	return NULL;
126}
127
128static void vmcs_clear(struct vmcs *vmcs)
129{
130	u64 phys_addr = __pa(vmcs);
131	u8 error;
132
133	asm volatile (ASM_VMX_VMCLEAR_RAX "; setna %0"
134		      : "=g"(error) : "a"(&phys_addr), "m"(phys_addr)
135		      : "cc", "memory");
136	if (error)
137		printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n",
138		       vmcs, phys_addr);
139}
140
141static void __vcpu_clear(void *arg)
142{
143	struct kvm_vcpu *vcpu = arg;
144	int cpu = raw_smp_processor_id();
145
146	if (vcpu->cpu == cpu)
147		vmcs_clear(vcpu->vmcs);
148	if (per_cpu(current_vmcs, cpu) == vcpu->vmcs)
149		per_cpu(current_vmcs, cpu) = NULL;
150}
151
152static void vcpu_clear(struct kvm_vcpu *vcpu)
153{
154	if (vcpu->cpu != raw_smp_processor_id() && vcpu->cpu != -1)
155		smp_call_function_single(vcpu->cpu, __vcpu_clear, vcpu, 0, 1);
156	else
157		__vcpu_clear(vcpu);
158	vcpu->launched = 0;
159}
160
161static unsigned long vmcs_readl(unsigned long field)
162{
163	unsigned long value;
164
165	asm volatile (ASM_VMX_VMREAD_RDX_RAX
166		      : "=a"(value) : "d"(field) : "cc");
167	return value;
168}
169
170static u16 vmcs_read16(unsigned long field)
171{
172	return vmcs_readl(field);
173}
174
175static u32 vmcs_read32(unsigned long field)
176{
177	return vmcs_readl(field);
178}
179
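/*
 * On 32-bit hosts a 64-bit VMCS field is accessed as two 32-bit halves;
 * the VMCS field encoding places the high half at encoding "field + 1".
 */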
180static u64 vmcs_read64(unsigned long field)
181{
182#ifdef CONFIG_X86_64
183	return vmcs_readl(field);
184#else
185	return vmcs_readl(field) | ((u64)vmcs_readl(field+1) << 32);
186#endif
187}
188
189static noinline void vmwrite_error(unsigned long field, unsigned long value)
190{
191	printk(KERN_ERR "vmwrite error: reg %lx value %lx (err %d)\n",
192	       field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
193	dump_stack();
194}
195
196static void vmcs_writel(unsigned long field, unsigned long value)
197{
198	u8 error;
199
200	asm volatile (ASM_VMX_VMWRITE_RAX_RDX "; setna %0"
201		       : "=q"(error) : "a"(value), "d"(field) : "cc" );
202	if (unlikely(error))
203		vmwrite_error(field, value);
204}
205
206static void vmcs_write16(unsigned long field, u16 value)
207{
208	vmcs_writel(field, value);
209}
210
211static void vmcs_write32(unsigned long field, u32 value)
212{
213	vmcs_writel(field, value);
214}
215
216static void vmcs_write64(unsigned long field, u64 value)
217{
218#ifdef CONFIG_X86_64
219	vmcs_writel(field, value);
220#else
221	vmcs_writel(field, value);
222	asm volatile ("");
223	vmcs_writel(field+1, value >> 32);
224#endif
225}
226
227static void vmcs_clear_bits(unsigned long field, u32 mask)
228{
229	vmcs_writel(field, vmcs_readl(field) & ~mask);
230}
231
232static void vmcs_set_bits(unsigned long field, u32 mask)
233{
234	vmcs_writel(field, vmcs_readl(field) | mask);
235}
236
237/*
 * Switches to the specified vcpu until a matching vcpu_put(); assumes
 * the vcpu mutex is already taken.
240 */
241static void vmx_vcpu_load(struct kvm_vcpu *vcpu)
242{
243	u64 phys_addr = __pa(vcpu->vmcs);
244	int cpu;
245
246	cpu = get_cpu();
247
248	if (vcpu->cpu != cpu)
249		vcpu_clear(vcpu);
250
251	if (per_cpu(current_vmcs, cpu) != vcpu->vmcs) {
252		u8 error;
253
254		per_cpu(current_vmcs, cpu) = vcpu->vmcs;
255		asm volatile (ASM_VMX_VMPTRLD_RAX "; setna %0"
256			      : "=g"(error) : "a"(&phys_addr), "m"(phys_addr)
257			      : "cc");
258		if (error)
259			printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n",
260			       vcpu->vmcs, phys_addr);
261	}
262
263	if (vcpu->cpu != cpu) {
264		struct descriptor_table dt;
265		unsigned long sysenter_esp;
266
267		vcpu->cpu = cpu;
268		/*
269		 * Linux uses per-cpu TSS and GDT, so set these when switching
270		 * processors.
271		 */
272		vmcs_writel(HOST_TR_BASE, read_tr_base()); /* 22.2.4 */
273		get_gdt(&dt);
274		vmcs_writel(HOST_GDTR_BASE, dt.base);   /* 22.2.4 */
275
276		rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
277		vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
278	}
279}
280
281static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
282{
283	kvm_put_guest_fpu(vcpu);
284	put_cpu();
285}
286
287static void vmx_vcpu_decache(struct kvm_vcpu *vcpu)
288{
289	vcpu_clear(vcpu);
290}
291
292static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
293{
294	return vmcs_readl(GUEST_RFLAGS);
295}
296
297static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
298{
299	vmcs_writel(GUEST_RFLAGS, rflags);
300}
301
302static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
303{
304	unsigned long rip;
305	u32 interruptibility;
306
307	rip = vmcs_readl(GUEST_RIP);
308	rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
309	vmcs_writel(GUEST_RIP, rip);
310
311	/*
312	 * We emulated an instruction, so temporary interrupt blocking
313	 * should be removed, if set.
314	 */
315	interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
316	if (interruptibility & 3)
317		vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
318			     interruptibility & ~3);
319	vcpu->interrupt_window_open = 1;
320}
321
322static void vmx_inject_gp(struct kvm_vcpu *vcpu, unsigned error_code)
323{
324	printk(KERN_DEBUG "inject_general_protection: rip 0x%lx\n",
325	       vmcs_readl(GUEST_RIP));
326	vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
327	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
328		     GP_VECTOR |
329		     INTR_TYPE_EXCEPTION |
330		     INTR_INFO_DELIEVER_CODE_MASK |
331		     INTR_INFO_VALID_MASK);
332}
333
334/*
335 * Set up the vmcs to automatically save and restore system
336 * msrs.  Don't touch the 64-bit msrs if the guest is in legacy
337 * mode, as fiddling with msrs is very expensive.
338 */
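/*
 * Rough example on a 64-bit host, assuming the probe loop in
 * vmx_vcpu_setup() kept all six vmx_msr_index entries in order: for a
 * long mode guest nr_skip is NR_BAD_MSRS (2), so the save/load area
 * starts at MSR_CSTAR and covers CSTAR, KERNEL_GS_BASE, EFER and, if
 * efer.sce is set, STAR; for a legacy mode guest nr_skip is
 * NR_64BIT_MSRS (4) and only EFER is switched automatically.
 */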
339static void setup_msrs(struct kvm_vcpu *vcpu)
340{
341	int nr_skip, nr_good_msrs;
342
343	if (is_long_mode(vcpu))
344		nr_skip = NR_BAD_MSRS;
345	else
346		nr_skip = NR_64BIT_MSRS;
347	nr_good_msrs = vcpu->nmsrs - nr_skip;
348
349	/*
350	 * MSR_K6_STAR is only needed on long mode guests, and only
351	 * if efer.sce is enabled.
352	 */
353	if (find_msr_entry(vcpu, MSR_K6_STAR)) {
354		--nr_good_msrs;
355#ifdef CONFIG_X86_64
356		if (is_long_mode(vcpu) && (vcpu->shadow_efer & EFER_SCE))
357			++nr_good_msrs;
358#endif
359	}
360
361	vmcs_writel(VM_ENTRY_MSR_LOAD_ADDR,
362		    virt_to_phys(vcpu->guest_msrs + nr_skip));
363	vmcs_writel(VM_EXIT_MSR_STORE_ADDR,
364		    virt_to_phys(vcpu->guest_msrs + nr_skip));
365	vmcs_writel(VM_EXIT_MSR_LOAD_ADDR,
366		    virt_to_phys(vcpu->host_msrs + nr_skip));
367	vmcs_write32(VM_EXIT_MSR_STORE_COUNT, nr_good_msrs); /* 22.2.2 */
368	vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, nr_good_msrs);  /* 22.2.2 */
369	vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, nr_good_msrs); /* 22.2.2 */
370}
371
372/*
373 * reads and returns guest's timestamp counter "register"
374 * guest_tsc = host_tsc + tsc_offset    -- 21.3
375 */
376static u64 guest_read_tsc(void)
377{
378	u64 host_tsc, tsc_offset;
379
380	rdtscll(host_tsc);
381	tsc_offset = vmcs_read64(TSC_OFFSET);
382	return host_tsc + tsc_offset;
383}
384
385/*
386 * writes 'guest_tsc' into guest's timestamp counter "register"
387 * guest_tsc = host_tsc + tsc_offset ==> tsc_offset = guest_tsc - host_tsc
388 */
389static void guest_write_tsc(u64 guest_tsc)
390{
391	u64 host_tsc;
392
393	rdtscll(host_tsc);
394	vmcs_write64(TSC_OFFSET, guest_tsc - host_tsc);
395}
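
/*
 * Example: if the guest writes 0 to its TSC while the host TSC reads H,
 * TSC_OFFSET becomes -H, and a later guest read returns
 * host_tsc_now - H, i.e. the host cycles elapsed since the write.
 */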
396
397static void reload_tss(void)
398{
399#ifndef CONFIG_X86_64
400
	/*
	 * A VM exit restores the TR selector but forces its limit, leaving
	 * the TSS truncated; reload it by hand.  The descriptor must first
	 * be marked available (type 9), since ltr faults on a busy TSS.
	 */
404	struct descriptor_table gdt;
405	struct segment_descriptor *descs;
406
407	get_gdt(&gdt);
408	descs = (void *)gdt.base;
409	descs[GDT_ENTRY_TSS].type = 9; /* available TSS */
410	load_TR_desc();
411#endif
412}
413
414/*
415 * Reads an msr value (of 'msr_index') into 'pdata'.
416 * Returns 0 on success, non-0 otherwise.
417 * Assumes vcpu_load() was already called.
418 */
419static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
420{
421	u64 data;
422	struct vmx_msr_entry *msr;
423
424	if (!pdata) {
425		printk(KERN_ERR "BUG: get_msr called with NULL pdata\n");
426		return -EINVAL;
427	}
428
429	switch (msr_index) {
430#ifdef CONFIG_X86_64
431	case MSR_FS_BASE:
432		data = vmcs_readl(GUEST_FS_BASE);
433		break;
434	case MSR_GS_BASE:
435		data = vmcs_readl(GUEST_GS_BASE);
436		break;
437	case MSR_EFER:
438		return kvm_get_msr_common(vcpu, msr_index, pdata);
439#endif
440	case MSR_IA32_TIME_STAMP_COUNTER:
441		data = guest_read_tsc();
442		break;
443	case MSR_IA32_SYSENTER_CS:
444		data = vmcs_read32(GUEST_SYSENTER_CS);
445		break;
446	case MSR_IA32_SYSENTER_EIP:
447		data = vmcs_readl(GUEST_SYSENTER_EIP);
448		break;
449	case MSR_IA32_SYSENTER_ESP:
450		data = vmcs_readl(GUEST_SYSENTER_ESP);
451		break;
452	default:
453		msr = find_msr_entry(vcpu, msr_index);
454		if (msr) {
455			data = msr->data;
456			break;
457		}
458		return kvm_get_msr_common(vcpu, msr_index, pdata);
459	}
460
461	*pdata = data;
462	return 0;
463}
464
465/*
 * Writes msr value into the appropriate "register".
467 * Returns 0 on success, non-0 otherwise.
468 * Assumes vcpu_load() was already called.
469 */
470static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
471{
472	struct vmx_msr_entry *msr;
473	switch (msr_index) {
474#ifdef CONFIG_X86_64
475	case MSR_EFER:
476		return kvm_set_msr_common(vcpu, msr_index, data);
477	case MSR_FS_BASE:
478		vmcs_writel(GUEST_FS_BASE, data);
479		break;
480	case MSR_GS_BASE:
481		vmcs_writel(GUEST_GS_BASE, data);
482		break;
483#endif
484	case MSR_IA32_SYSENTER_CS:
485		vmcs_write32(GUEST_SYSENTER_CS, data);
486		break;
487	case MSR_IA32_SYSENTER_EIP:
488		vmcs_writel(GUEST_SYSENTER_EIP, data);
489		break;
490	case MSR_IA32_SYSENTER_ESP:
491		vmcs_writel(GUEST_SYSENTER_ESP, data);
492		break;
493	case MSR_IA32_TIME_STAMP_COUNTER:
494		guest_write_tsc(data);
495		break;
496	default:
497		msr = find_msr_entry(vcpu, msr_index);
498		if (msr) {
499			msr->data = data;
500			break;
501		}
502		return kvm_set_msr_common(vcpu, msr_index, data);
505	}
506
507	return 0;
508}
509
510/*
511 * Sync the rsp and rip registers into the vcpu structure.  This allows
512 * registers to be accessed by indexing vcpu->regs.
513 */
514static void vcpu_load_rsp_rip(struct kvm_vcpu *vcpu)
515{
516	vcpu->regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
517	vcpu->rip = vmcs_readl(GUEST_RIP);
518}
519
520/*
521 * Syncs rsp and rip back into the vmcs.  Should be called after possible
522 * modification.
523 */
524static void vcpu_put_rsp_rip(struct kvm_vcpu *vcpu)
525{
526	vmcs_writel(GUEST_RSP, vcpu->regs[VCPU_REGS_RSP]);
527	vmcs_writel(GUEST_RIP, vcpu->rip);
528}
529
530static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg)
531{
532	unsigned long dr7 = 0x400;
533	u32 exception_bitmap;
534	int old_singlestep;
535
536	exception_bitmap = vmcs_read32(EXCEPTION_BITMAP);
537	old_singlestep = vcpu->guest_debug.singlestep;
538
539	vcpu->guest_debug.enabled = dbg->enabled;
540	if (vcpu->guest_debug.enabled) {
541		int i;
542
543		dr7 |= 0x200;  /* exact */
544		for (i = 0; i < 4; ++i) {
545			if (!dbg->breakpoints[i].enabled)
546				continue;
547			vcpu->guest_debug.bp[i] = dbg->breakpoints[i].address;
548			dr7 |= 2 << (i*2);    /* global enable */
549			dr7 |= 0 << (i*4+16); /* execution breakpoint */
550		}
551
552		exception_bitmap |= (1u << 1);  /* Trap debug exceptions */
553
554		vcpu->guest_debug.singlestep = dbg->singlestep;
555	} else {
556		exception_bitmap &= ~(1u << 1); /* Ignore debug exceptions */
557		vcpu->guest_debug.singlestep = 0;
558	}
559
560	if (old_singlestep && !vcpu->guest_debug.singlestep) {
561		unsigned long flags;
562
563		flags = vmcs_readl(GUEST_RFLAGS);
564		flags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
565		vmcs_writel(GUEST_RFLAGS, flags);
566	}
567
568	vmcs_write32(EXCEPTION_BITMAP, exception_bitmap);
569	vmcs_writel(GUEST_DR7, dr7);
570
571	return 0;
572}
573
574static __init int cpu_has_kvm_support(void)
575{
576	unsigned long ecx = cpuid_ecx(1);
577	return test_bit(5, &ecx); /* CPUID.1:ECX.VMX[bit 5] -> VT */
578}
579
580static __init int vmx_disabled_by_bios(void)
581{
582	u64 msr;
583
584	rdmsrl(MSR_IA32_FEATURE_CONTROL, msr);
585	return (msr & 5) == 1; /* locked but not enabled */
586}
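
/*
 * MSR_IA32_FEATURE_CONTROL layout used above and below: bit 0 is the
 * BIOS lock bit and bit 2 enables VMXON, so 5 means "locked with VMX
 * enabled".  vmx_disabled_by_bios() reports a BIOS that locked the MSR
 * with VMX off; hardware_enable() sets both bits if the MSR was left
 * unlocked.
 */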
587
588static void hardware_enable(void *garbage)
589{
590	int cpu = raw_smp_processor_id();
591	u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
592	u64 old;
593
594	rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
595	if ((old & 5) != 5)
596		/* enable and lock */
597		wrmsrl(MSR_IA32_FEATURE_CONTROL, old | 5);
598	write_cr4(read_cr4() | CR4_VMXE);
599	asm volatile (ASM_VMX_VMXON_RAX : : "a"(&phys_addr), "m"(phys_addr)
600		      : "memory", "cc");
601}
602
603static void hardware_disable(void *garbage)
604{
605	asm volatile (ASM_VMX_VMXOFF : : : "cc");
606}
607
608static __init void setup_vmcs_descriptor(void)
609{
610	u32 vmx_msr_low, vmx_msr_high;
611
612	rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high);
613	vmcs_descriptor.size = vmx_msr_high & 0x1fff;
614	vmcs_descriptor.order = get_order(vmcs_descriptor.size);
615	vmcs_descriptor.revision_id = vmx_msr_low;
616}
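
/*
 * MSR_IA32_VMX_BASIC: the low 32 bits hold the VMCS revision identifier
 * and bits 44:32 the VMCS region size, hence the 0x1fff mask on the
 * high dword.
 */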
617
618static struct vmcs *alloc_vmcs_cpu(int cpu)
619{
620	int node = cpu_to_node(cpu);
621	struct page *pages;
622	struct vmcs *vmcs;
623
624	pages = alloc_pages_node(node, GFP_KERNEL, vmcs_descriptor.order);
625	if (!pages)
626		return NULL;
627	vmcs = page_address(pages);
628	memset(vmcs, 0, vmcs_descriptor.size);
629	vmcs->revision_id = vmcs_descriptor.revision_id; /* vmcs revision id */
630	return vmcs;
631}
632
633static struct vmcs *alloc_vmcs(void)
634{
635	return alloc_vmcs_cpu(raw_smp_processor_id());
636}
637
638static void free_vmcs(struct vmcs *vmcs)
639{
640	free_pages((unsigned long)vmcs, vmcs_descriptor.order);
641}
642
643static void free_kvm_area(void)
644{
645	int cpu;
646
647	for_each_online_cpu(cpu)
648		free_vmcs(per_cpu(vmxarea, cpu));
649}
650
653static __init int alloc_kvm_area(void)
654{
655	int cpu;
656
657	for_each_online_cpu(cpu) {
658		struct vmcs *vmcs;
659
660		vmcs = alloc_vmcs_cpu(cpu);
661		if (!vmcs) {
662			free_kvm_area();
663			return -ENOMEM;
664		}
665
666		per_cpu(vmxarea, cpu) = vmcs;
667	}
668	return 0;
669}
670
671static __init int hardware_setup(void)
672{
673	setup_vmcs_descriptor();
674	return alloc_kvm_area();
675}
676
677static __exit void hardware_unsetup(void)
678{
679	free_kvm_area();
680}
681
682static void update_exception_bitmap(struct kvm_vcpu *vcpu)
683{
684	if (vcpu->rmode.active)
685		vmcs_write32(EXCEPTION_BITMAP, ~0);
686	else
687		vmcs_write32(EXCEPTION_BITMAP, 1 << PF_VECTOR);
688}
689
690static void fix_pmode_dataseg(int seg, struct kvm_save_segment *save)
691{
692	struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
693
	if (vmcs_readl(sf->base) == save->base && (save->ar & AR_S_MASK)) {
695		vmcs_write16(sf->selector, save->selector);
696		vmcs_writel(sf->base, save->base);
697		vmcs_write32(sf->limit, save->limit);
698		vmcs_write32(sf->ar_bytes, save->ar);
699	} else {
700		u32 dpl = (vmcs_read16(sf->selector) & SELECTOR_RPL_MASK)
701			<< AR_DPL_SHIFT;
702		vmcs_write32(sf->ar_bytes, 0x93 | dpl);
703	}
704}
705
706static void enter_pmode(struct kvm_vcpu *vcpu)
707{
708	unsigned long flags;
709
710	vcpu->rmode.active = 0;
711
712	vmcs_writel(GUEST_TR_BASE, vcpu->rmode.tr.base);
713	vmcs_write32(GUEST_TR_LIMIT, vcpu->rmode.tr.limit);
714	vmcs_write32(GUEST_TR_AR_BYTES, vcpu->rmode.tr.ar);
715
716	flags = vmcs_readl(GUEST_RFLAGS);
717	flags &= ~(IOPL_MASK | X86_EFLAGS_VM);
718	flags |= (vcpu->rmode.save_iopl << IOPL_SHIFT);
719	vmcs_writel(GUEST_RFLAGS, flags);
720
721	vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~CR4_VME_MASK) |
722			(vmcs_readl(CR4_READ_SHADOW) & CR4_VME_MASK));
723
724	update_exception_bitmap(vcpu);
725
726	fix_pmode_dataseg(VCPU_SREG_ES, &vcpu->rmode.es);
727	fix_pmode_dataseg(VCPU_SREG_DS, &vcpu->rmode.ds);
728	fix_pmode_dataseg(VCPU_SREG_GS, &vcpu->rmode.gs);
729	fix_pmode_dataseg(VCPU_SREG_FS, &vcpu->rmode.fs);
730
731	vmcs_write16(GUEST_SS_SELECTOR, 0);
732	vmcs_write32(GUEST_SS_AR_BYTES, 0x93);
733
734	vmcs_write16(GUEST_CS_SELECTOR,
735		     vmcs_read16(GUEST_CS_SELECTOR) & ~SELECTOR_RPL_MASK);
736	vmcs_write32(GUEST_CS_AR_BYTES, 0x9b);
737}
738
739static int rmode_tss_base(struct kvm* kvm)
740{
741	gfn_t base_gfn = kvm->memslots[0].base_gfn + kvm->memslots[0].npages - 3;
742	return base_gfn << PAGE_SHIFT;
743}
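
/*
 * The vm86 TSS lives in the top three pages of the first memory slot;
 * init_rmode_tss() fills them with a minimal TSS and I/O bitmap.
 */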
744
745static void fix_rmode_seg(int seg, struct kvm_save_segment *save)
746{
747	struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
748
749	save->selector = vmcs_read16(sf->selector);
750	save->base = vmcs_readl(sf->base);
751	save->limit = vmcs_read32(sf->limit);
752	save->ar = vmcs_read32(sf->ar_bytes);
753	vmcs_write16(sf->selector, vmcs_readl(sf->base) >> 4);
754	vmcs_write32(sf->limit, 0xffff);
755	vmcs_write32(sf->ar_bytes, 0xf3);
756}
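
/*
 * Example: a protected mode data segment whose base is 0x12340 becomes
 * the vm86 segment 0x1234 (base >> 4) with a 64K limit and AR byte 0xf3
 * (present, DPL 3, writable data), which is what vm86 mode expects.
 */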
757
758static void enter_rmode(struct kvm_vcpu *vcpu)
759{
760	unsigned long flags;
761
762	vcpu->rmode.active = 1;
763
764	vcpu->rmode.tr.base = vmcs_readl(GUEST_TR_BASE);
765	vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm));
766
767	vcpu->rmode.tr.limit = vmcs_read32(GUEST_TR_LIMIT);
768	vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1);
769
770	vcpu->rmode.tr.ar = vmcs_read32(GUEST_TR_AR_BYTES);
771	vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
772
773	flags = vmcs_readl(GUEST_RFLAGS);
774	vcpu->rmode.save_iopl = (flags & IOPL_MASK) >> IOPL_SHIFT;
775
776	flags |= IOPL_MASK | X86_EFLAGS_VM;
777
778	vmcs_writel(GUEST_RFLAGS, flags);
779	vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | CR4_VME_MASK);
780	update_exception_bitmap(vcpu);
781
782	vmcs_write16(GUEST_SS_SELECTOR, vmcs_readl(GUEST_SS_BASE) >> 4);
783	vmcs_write32(GUEST_SS_LIMIT, 0xffff);
784	vmcs_write32(GUEST_SS_AR_BYTES, 0xf3);
785
786	vmcs_write32(GUEST_CS_AR_BYTES, 0xf3);
787	vmcs_write32(GUEST_CS_LIMIT, 0xffff);
788	if (vmcs_readl(GUEST_CS_BASE) == 0xffff0000)
789		vmcs_writel(GUEST_CS_BASE, 0xf0000);
790	vmcs_write16(GUEST_CS_SELECTOR, vmcs_readl(GUEST_CS_BASE) >> 4);
791
792	fix_rmode_seg(VCPU_SREG_ES, &vcpu->rmode.es);
793	fix_rmode_seg(VCPU_SREG_DS, &vcpu->rmode.ds);
794	fix_rmode_seg(VCPU_SREG_GS, &vcpu->rmode.gs);
795	fix_rmode_seg(VCPU_SREG_FS, &vcpu->rmode.fs);
796}
797
798#ifdef CONFIG_X86_64
799
800static void enter_lmode(struct kvm_vcpu *vcpu)
801{
802	u32 guest_tr_ar;
803
804	guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES);
805	if ((guest_tr_ar & AR_TYPE_MASK) != AR_TYPE_BUSY_64_TSS) {
806		printk(KERN_DEBUG "%s: tss fixup for long mode. \n",
807		       __FUNCTION__);
808		vmcs_write32(GUEST_TR_AR_BYTES,
809			     (guest_tr_ar & ~AR_TYPE_MASK)
810			     | AR_TYPE_BUSY_64_TSS);
811	}
812
813	vcpu->shadow_efer |= EFER_LMA;
814
815	find_msr_entry(vcpu, MSR_EFER)->data |= EFER_LMA | EFER_LME;
816	vmcs_write32(VM_ENTRY_CONTROLS,
817		     vmcs_read32(VM_ENTRY_CONTROLS)
818		     | VM_ENTRY_CONTROLS_IA32E_MASK);
819}
820
821static void exit_lmode(struct kvm_vcpu *vcpu)
822{
823	vcpu->shadow_efer &= ~EFER_LMA;
824
825	vmcs_write32(VM_ENTRY_CONTROLS,
826		     vmcs_read32(VM_ENTRY_CONTROLS)
827		     & ~VM_ENTRY_CONTROLS_IA32E_MASK);
828}
829
830#endif
831
832static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
833{
834	vcpu->cr4 &= KVM_GUEST_CR4_MASK;
835	vcpu->cr4 |= vmcs_readl(GUEST_CR4) & ~KVM_GUEST_CR4_MASK;
836}
837
838static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
839{
840	if (vcpu->rmode.active && (cr0 & CR0_PE_MASK))
841		enter_pmode(vcpu);
842
843	if (!vcpu->rmode.active && !(cr0 & CR0_PE_MASK))
844		enter_rmode(vcpu);
845
846#ifdef CONFIG_X86_64
847	if (vcpu->shadow_efer & EFER_LME) {
848		if (!is_paging(vcpu) && (cr0 & CR0_PG_MASK))
849			enter_lmode(vcpu);
850		if (is_paging(vcpu) && !(cr0 & CR0_PG_MASK))
851			exit_lmode(vcpu);
852	}
853#endif
854
855	if (!(cr0 & CR0_TS_MASK)) {
856		vcpu->fpu_active = 1;
		vmcs_clear_bits(EXCEPTION_BITMAP, 1 << NM_VECTOR);
858	}
859
860	vmcs_writel(CR0_READ_SHADOW, cr0);
861	vmcs_writel(GUEST_CR0,
862		    (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON);
863	vcpu->cr0 = cr0;
864}
865
866static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
867{
868	vmcs_writel(GUEST_CR3, cr3);
869
870	if (!(vcpu->cr0 & CR0_TS_MASK)) {
871		vcpu->fpu_active = 0;
872		vmcs_set_bits(GUEST_CR0, CR0_TS_MASK);
873		vmcs_set_bits(EXCEPTION_BITMAP, 1 << NM_VECTOR);
874	}
875}
876
877static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
878{
879	vmcs_writel(CR4_READ_SHADOW, cr4);
880	vmcs_writel(GUEST_CR4, cr4 | (vcpu->rmode.active ?
881		    KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON));
882	vcpu->cr4 = cr4;
883}
884
885#ifdef CONFIG_X86_64
886
887static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
888{
889	struct vmx_msr_entry *msr = find_msr_entry(vcpu, MSR_EFER);
890
891	vcpu->shadow_efer = efer;
892	if (efer & EFER_LMA) {
893		vmcs_write32(VM_ENTRY_CONTROLS,
894				     vmcs_read32(VM_ENTRY_CONTROLS) |
895				     VM_ENTRY_CONTROLS_IA32E_MASK);
896		msr->data = efer;
897
898	} else {
899		vmcs_write32(VM_ENTRY_CONTROLS,
900				     vmcs_read32(VM_ENTRY_CONTROLS) &
901				     ~VM_ENTRY_CONTROLS_IA32E_MASK);
902
903		msr->data = efer & ~EFER_LME;
904	}
905	setup_msrs(vcpu);
906}
907
908#endif
909
910static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
911{
912	struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
913
914	return vmcs_readl(sf->base);
915}
916
917static void vmx_get_segment(struct kvm_vcpu *vcpu,
918			    struct kvm_segment *var, int seg)
919{
920	struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
921	u32 ar;
922
923	var->base = vmcs_readl(sf->base);
924	var->limit = vmcs_read32(sf->limit);
925	var->selector = vmcs_read16(sf->selector);
926	ar = vmcs_read32(sf->ar_bytes);
927	if (ar & AR_UNUSABLE_MASK)
928		ar = 0;
929	var->type = ar & 15;
930	var->s = (ar >> 4) & 1;
931	var->dpl = (ar >> 5) & 3;
932	var->present = (ar >> 7) & 1;
933	var->avl = (ar >> 12) & 1;
934	var->l = (ar >> 13) & 1;
935	var->db = (ar >> 14) & 1;
936	var->g = (ar >> 15) & 1;
937	var->unusable = (ar >> 16) & 1;
938}
939
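/*
 * VMX access rights byte layout, as unpacked above and packed below:
 * bits 3:0 type, bit 4 S, bits 6:5 DPL, bit 7 present, bit 12 AVL,
 * bit 13 L, bit 14 D/B, bit 15 G, bit 16 unusable.
 */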
940static void vmx_set_segment(struct kvm_vcpu *vcpu,
941			    struct kvm_segment *var, int seg)
942{
943	struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
944	u32 ar;
945
946	vmcs_writel(sf->base, var->base);
947	vmcs_write32(sf->limit, var->limit);
948	vmcs_write16(sf->selector, var->selector);
949	if (vcpu->rmode.active && var->s) {
950		/*
951		 * Hack real-mode segments into vm86 compatibility.
952		 */
953		if (var->base == 0xffff0000 && var->selector == 0xf000)
954			vmcs_writel(sf->base, 0xf0000);
955		ar = 0xf3;
956	} else if (var->unusable)
957		ar = 1 << 16;
958	else {
959		ar = var->type & 15;
960		ar |= (var->s & 1) << 4;
961		ar |= (var->dpl & 3) << 5;
962		ar |= (var->present & 1) << 7;
963		ar |= (var->avl & 1) << 12;
964		ar |= (var->l & 1) << 13;
965		ar |= (var->db & 1) << 14;
966		ar |= (var->g & 1) << 15;
967	}
968	if (ar == 0) /* a 0 value means unusable */
969		ar = AR_UNUSABLE_MASK;
970	vmcs_write32(sf->ar_bytes, ar);
971}
972
973static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
974{
975	u32 ar = vmcs_read32(GUEST_CS_AR_BYTES);
976
977	*db = (ar >> 14) & 1;
978	*l = (ar >> 13) & 1;
979}
980
981static void vmx_get_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
982{
983	dt->limit = vmcs_read32(GUEST_IDTR_LIMIT);
984	dt->base = vmcs_readl(GUEST_IDTR_BASE);
985}
986
987static void vmx_set_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
988{
989	vmcs_write32(GUEST_IDTR_LIMIT, dt->limit);
990	vmcs_writel(GUEST_IDTR_BASE, dt->base);
991}
992
993static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
994{
995	dt->limit = vmcs_read32(GUEST_GDTR_LIMIT);
996	dt->base = vmcs_readl(GUEST_GDTR_BASE);
997}
998
999static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
1000{
1001	vmcs_write32(GUEST_GDTR_LIMIT, dt->limit);
1002	vmcs_writel(GUEST_GDTR_BASE, dt->base);
1003}
1004
1005static int init_rmode_tss(struct kvm* kvm)
1006{
1007	struct page *p1, *p2, *p3;
1008	gfn_t fn = rmode_tss_base(kvm) >> PAGE_SHIFT;
1009	char *page;
1010
1011	p1 = gfn_to_page(kvm, fn++);
1012	p2 = gfn_to_page(kvm, fn++);
1013	p3 = gfn_to_page(kvm, fn);
1014
1015	if (!p1 || !p2 || !p3) {
1016		kvm_printf(kvm,"%s: gfn_to_page failed\n", __FUNCTION__);
1017		return 0;
1018	}
1019
1020	page = kmap_atomic(p1, KM_USER0);
1021	memset(page, 0, PAGE_SIZE);
1022	*(u16*)(page + 0x66) = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
1023	kunmap_atomic(page, KM_USER0);
1024
1025	page = kmap_atomic(p2, KM_USER0);
1026	memset(page, 0, PAGE_SIZE);
1027	kunmap_atomic(page, KM_USER0);
1028
1029	page = kmap_atomic(p3, KM_USER0);
1030	memset(page, 0, PAGE_SIZE);
1031	*(page + RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1) = ~0;
1032	kunmap_atomic(page, KM_USER0);
1033
1034	return 1;
1035}
1036
1037static void vmcs_write32_fixedbits(u32 msr, u32 vmcs_field, u32 val)
1038{
1039	u32 msr_high, msr_low;
1040
1041	rdmsr(msr, msr_low, msr_high);
1042
1043	val &= msr_high;
1044	val |= msr_low;
1045	vmcs_write32(vmcs_field, val);
1046}
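
/*
 * The VMX capability MSR read above reports, in its low dword, the
 * control bits that must be 1 and, in its high dword, the bits that may
 * be 1; masking with the high dword and or-ing in the low dword clamps
 * the requested value to what the CPU supports.
 */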
1047
1048static void seg_setup(int seg)
1049{
1050	struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
1051
1052	vmcs_write16(sf->selector, 0);
1053	vmcs_writel(sf->base, 0);
1054	vmcs_write32(sf->limit, 0xffff);
1055	vmcs_write32(sf->ar_bytes, 0x93);
1056}
1057
1058/*
1059 * Sets up the vmcs for emulated real mode.
1060 */
1061static int vmx_vcpu_setup(struct kvm_vcpu *vcpu)
1062{
1063	u32 host_sysenter_cs;
1064	u32 junk;
1065	unsigned long a;
1066	struct descriptor_table dt;
1067	int i;
1068	int ret = 0;
1069	extern asmlinkage void kvm_vmx_return(void);
1070
1071	if (!init_rmode_tss(vcpu->kvm)) {
1072		ret = -ENOMEM;
1073		goto out;
1074	}
1075
1076	memset(vcpu->regs, 0, sizeof(vcpu->regs));
1077	vcpu->regs[VCPU_REGS_RDX] = get_rdx_init_val();
1078	vcpu->cr8 = 0;
1079	vcpu->apic_base = 0xfee00000 |
1080			/*for vcpu 0*/ MSR_IA32_APICBASE_BSP |
1081			MSR_IA32_APICBASE_ENABLE;
1082
1083	fx_init(vcpu);
1084
1085	/*
1086	 * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode
1087	 * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4.  Sigh.
1088	 */
1089	vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
1090	vmcs_writel(GUEST_CS_BASE, 0x000f0000);
1091	vmcs_write32(GUEST_CS_LIMIT, 0xffff);
1092	vmcs_write32(GUEST_CS_AR_BYTES, 0x9b);
1093
1094	seg_setup(VCPU_SREG_DS);
1095	seg_setup(VCPU_SREG_ES);
1096	seg_setup(VCPU_SREG_FS);
1097	seg_setup(VCPU_SREG_GS);
1098	seg_setup(VCPU_SREG_SS);
1099
1100	vmcs_write16(GUEST_TR_SELECTOR, 0);
1101	vmcs_writel(GUEST_TR_BASE, 0);
1102	vmcs_write32(GUEST_TR_LIMIT, 0xffff);
1103	vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
1104
1105	vmcs_write16(GUEST_LDTR_SELECTOR, 0);
1106	vmcs_writel(GUEST_LDTR_BASE, 0);
1107	vmcs_write32(GUEST_LDTR_LIMIT, 0xffff);
1108	vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082);
1109
1110	vmcs_write32(GUEST_SYSENTER_CS, 0);
1111	vmcs_writel(GUEST_SYSENTER_ESP, 0);
1112	vmcs_writel(GUEST_SYSENTER_EIP, 0);
1113
1114	vmcs_writel(GUEST_RFLAGS, 0x02);
1115	vmcs_writel(GUEST_RIP, 0xfff0);
1116	vmcs_writel(GUEST_RSP, 0);
1117
	/* TODO: dr0 = dr1 = dr2 = dr3 = 0; dr6 = 0xffff0ff0 */
1119	vmcs_writel(GUEST_DR7, 0x400);
1120
1121	vmcs_writel(GUEST_GDTR_BASE, 0);
1122	vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);
1123
1124	vmcs_writel(GUEST_IDTR_BASE, 0);
1125	vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);
1126
1127	vmcs_write32(GUEST_ACTIVITY_STATE, 0);
1128	vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
1129	vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0);
1130
1131	/* I/O */
1132	vmcs_write64(IO_BITMAP_A, 0);
1133	vmcs_write64(IO_BITMAP_B, 0);
1134
1135	guest_write_tsc(0);
1136
1137	vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
1138
1139	/* Special registers */
1140	vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
1141
1142	/* Control */
1143	vmcs_write32_fixedbits(MSR_IA32_VMX_PINBASED_CTLS,
1144			       PIN_BASED_VM_EXEC_CONTROL,
1145			       PIN_BASED_EXT_INTR_MASK   /* 20.6.1 */
1146			       | PIN_BASED_NMI_EXITING   /* 20.6.1 */
1147			);
1148	vmcs_write32_fixedbits(MSR_IA32_VMX_PROCBASED_CTLS,
1149			       CPU_BASED_VM_EXEC_CONTROL,
1150			       CPU_BASED_HLT_EXITING         /* 20.6.2 */
1151			       | CPU_BASED_CR8_LOAD_EXITING    /* 20.6.2 */
1152			       | CPU_BASED_CR8_STORE_EXITING   /* 20.6.2 */
1153			       | CPU_BASED_UNCOND_IO_EXITING   /* 20.6.2 */
1154			       | CPU_BASED_MOV_DR_EXITING
1155			       | CPU_BASED_USE_TSC_OFFSETING   /* 21.3 */
1156			);
1157
1158	vmcs_write32(EXCEPTION_BITMAP, 1 << PF_VECTOR);
1159	vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
1160	vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
1161	vmcs_write32(CR3_TARGET_COUNT, 0);           /* 22.2.1 */
1162
1163	vmcs_writel(HOST_CR0, read_cr0());  /* 22.2.3 */
1164	vmcs_writel(HOST_CR4, read_cr4());  /* 22.2.3, 22.2.5 */
1165	vmcs_writel(HOST_CR3, read_cr3());
1166
1167	vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS);  /* 22.2.4 */
1168	vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
1169	vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
1170	vmcs_write16(HOST_FS_SELECTOR, read_fs());    /* 22.2.4 */
1171	vmcs_write16(HOST_GS_SELECTOR, read_gs());    /* 22.2.4 */
1172	vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
1173#ifdef CONFIG_X86_64
1174	rdmsrl(MSR_FS_BASE, a);
1175	vmcs_writel(HOST_FS_BASE, a); /* 22.2.4 */
1176	rdmsrl(MSR_GS_BASE, a);
1177	vmcs_writel(HOST_GS_BASE, a); /* 22.2.4 */
1178#else
1179	vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */
1180	vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */
1181#endif
1182
1183	vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8);  /* 22.2.4 */
1184
1185	get_idt(&dt);
1186	vmcs_writel(HOST_IDTR_BASE, dt.base);   /* 22.2.4 */
1187
1188
1189	vmcs_writel(HOST_RIP, (unsigned long)kvm_vmx_return); /* 22.2.5 */
1190
1191	rdmsr(MSR_IA32_SYSENTER_CS, host_sysenter_cs, junk);
1192	vmcs_write32(HOST_IA32_SYSENTER_CS, host_sysenter_cs);
1193	rdmsrl(MSR_IA32_SYSENTER_ESP, a);
1194	vmcs_writel(HOST_IA32_SYSENTER_ESP, a);   /* 22.2.3 */
1195	rdmsrl(MSR_IA32_SYSENTER_EIP, a);
1196	vmcs_writel(HOST_IA32_SYSENTER_EIP, a);   /* 22.2.3 */
1197
1198	for (i = 0; i < NR_VMX_MSR; ++i) {
1199		u32 index = vmx_msr_index[i];
1200		u32 data_low, data_high;
1201		u64 data;
1202		int j = vcpu->nmsrs;
1203
1204		if (rdmsr_safe(index, &data_low, &data_high) < 0)
1205			continue;
1206		if (wrmsr_safe(index, data_low, data_high) < 0)
1207			continue;
1208		data = data_low | ((u64)data_high << 32);
1209		vcpu->host_msrs[j].index = index;
1210		vcpu->host_msrs[j].reserved = 0;
1211		vcpu->host_msrs[j].data = data;
1212		vcpu->guest_msrs[j] = vcpu->host_msrs[j];
1213#ifdef CONFIG_X86_64
1214		if (index == MSR_KERNEL_GS_BASE)
1215			msr_offset_kernel_gs_base = j;
1216#endif
1217		++vcpu->nmsrs;
1218	}
1219
1220	setup_msrs(vcpu);
1221
	vmcs_write32_fixedbits(MSR_IA32_VMX_EXIT_CTLS, VM_EXIT_CONTROLS,
			       (HOST_IS_64 << 9));  /* 22.2.1, 20.7.1 */

	/* 22.2.1, 20.8.1 */
	vmcs_write32_fixedbits(MSR_IA32_VMX_ENTRY_CTLS,
			       VM_ENTRY_CONTROLS, 0);
1228	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);  /* 22.2.1 */
1229
1230#ifdef CONFIG_X86_64
1231	vmcs_writel(VIRTUAL_APIC_PAGE_ADDR, 0);
1232	vmcs_writel(TPR_THRESHOLD, 0);
1233#endif
1234
1235	vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL);
1236	vmcs_writel(CR4_GUEST_HOST_MASK, KVM_GUEST_CR4_MASK);
1237
1238	vcpu->cr0 = 0x60000010;
	vmx_set_cr0(vcpu, vcpu->cr0); /* enter rmode */
1240	vmx_set_cr4(vcpu, 0);
1241#ifdef CONFIG_X86_64
1242	vmx_set_efer(vcpu, 0);
1243#endif
1244
1245	return 0;
1246
1247out:
1248	return ret;
1249}
1250
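/*
 * Inject an interrupt into a real mode (vm86) guest by hand: fetch the
 * 4-byte IVT entry for the vector (word 0 = IP, word 1 = CS), push
 * FLAGS, CS and IP onto the guest stack, clear IF/TF/AC and point CS:IP
 * at the handler -- roughly what the CPU would do for a real mode
 * interrupt.
 */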
1251static void inject_rmode_irq(struct kvm_vcpu *vcpu, int irq)
1252{
1253	u16 ent[2];
1254	u16 cs;
1255	u16 ip;
1256	unsigned long flags;
1257	unsigned long ss_base = vmcs_readl(GUEST_SS_BASE);
1258	u16 sp =  vmcs_readl(GUEST_RSP);
1259	u32 ss_limit = vmcs_read32(GUEST_SS_LIMIT);
1260
1261	if (sp > ss_limit || sp < 6 ) {
1262		vcpu_printf(vcpu, "%s: #SS, rsp 0x%lx ss 0x%lx limit 0x%x\n",
1263			    __FUNCTION__,
1264			    vmcs_readl(GUEST_RSP),
1265			    vmcs_readl(GUEST_SS_BASE),
1266			    vmcs_read32(GUEST_SS_LIMIT));
1267		return;
1268	}
1269
1270	if (kvm_read_guest(vcpu, irq * sizeof(ent), sizeof(ent), &ent) !=
1271								sizeof(ent)) {
1272		vcpu_printf(vcpu, "%s: read guest err\n", __FUNCTION__);
1273		return;
1274	}
1275
1276	flags =  vmcs_readl(GUEST_RFLAGS);
1277	cs =  vmcs_readl(GUEST_CS_BASE) >> 4;
1278	ip =  vmcs_readl(GUEST_RIP);
1279
1280
1281	if (kvm_write_guest(vcpu, ss_base + sp - 2, 2, &flags) != 2 ||
1282	    kvm_write_guest(vcpu, ss_base + sp - 4, 2, &cs) != 2 ||
1283	    kvm_write_guest(vcpu, ss_base + sp - 6, 2, &ip) != 2) {
1284		vcpu_printf(vcpu, "%s: write guest err\n", __FUNCTION__);
1285		return;
1286	}
1287
	vmcs_writel(GUEST_RFLAGS, flags &
		    ~(X86_EFLAGS_IF | X86_EFLAGS_AC | X86_EFLAGS_TF));
	vmcs_write16(GUEST_CS_SELECTOR, ent[1]);
1291	vmcs_writel(GUEST_CS_BASE, ent[1] << 4);
1292	vmcs_writel(GUEST_RIP, ent[0]);
1293	vmcs_writel(GUEST_RSP, (vmcs_readl(GUEST_RSP) & ~0xffff) | (sp - 6));
1294}
1295
1296static void kvm_do_inject_irq(struct kvm_vcpu *vcpu)
1297{
1298	int word_index = __ffs(vcpu->irq_summary);
1299	int bit_index = __ffs(vcpu->irq_pending[word_index]);
1300	int irq = word_index * BITS_PER_LONG + bit_index;
1301
1302	clear_bit(bit_index, &vcpu->irq_pending[word_index]);
1303	if (!vcpu->irq_pending[word_index])
1304		clear_bit(word_index, &vcpu->irq_summary);
1305
1306	if (vcpu->rmode.active) {
1307		inject_rmode_irq(vcpu, irq);
1308		return;
1309	}
1310	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
1311			irq | INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
1312}
1313
1314
1315static void do_interrupt_requests(struct kvm_vcpu *vcpu,
1316				       struct kvm_run *kvm_run)
1317{
1318	u32 cpu_based_vm_exec_control;
1319
1320	vcpu->interrupt_window_open =
1321		((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
1322		 (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0);
1323
1324	if (vcpu->interrupt_window_open &&
1325	    vcpu->irq_summary &&
1326	    !(vmcs_read32(VM_ENTRY_INTR_INFO_FIELD) & INTR_INFO_VALID_MASK))
		/*
		 * Interrupts are enabled and not blocked by sti or mov ss,
		 * so inject now.
		 */
1330		kvm_do_inject_irq(vcpu);
1331
1332	cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
1333	if (!vcpu->interrupt_window_open &&
1334	    (vcpu->irq_summary || kvm_run->request_interrupt_window))
1335		/*
1336		 * Interrupts blocked.  Wait for unblock.
1337		 */
1338		cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING;
1339	else
1340		cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
1341	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
1342}
1343
1344static void kvm_guest_debug_pre(struct kvm_vcpu *vcpu)
1345{
1346	struct kvm_guest_debug *dbg = &vcpu->guest_debug;
1347
1348	set_debugreg(dbg->bp[0], 0);
1349	set_debugreg(dbg->bp[1], 1);
1350	set_debugreg(dbg->bp[2], 2);
1351	set_debugreg(dbg->bp[3], 3);
1352
1353	if (dbg->singlestep) {
1354		unsigned long flags;
1355
1356		flags = vmcs_readl(GUEST_RFLAGS);
1357		flags |= X86_EFLAGS_TF | X86_EFLAGS_RF;
1358		vmcs_writel(GUEST_RFLAGS, flags);
1359	}
1360}
1361
1362static int handle_rmode_exception(struct kvm_vcpu *vcpu,
1363				  int vec, u32 err_code)
1364{
1365	if (!vcpu->rmode.active)
1366		return 0;
1367
1368	if (vec == GP_VECTOR && err_code == 0)
1369		if (emulate_instruction(vcpu, NULL, 0, 0) == EMULATE_DONE)
1370			return 1;
1371	return 0;
1372}
1373
1374static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1375{
1376	u32 intr_info, error_code;
1377	unsigned long cr2, rip;
1378	u32 vect_info;
1379	enum emulation_result er;
1380	int r;
1381
1382	vect_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
1383	intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
1384
1385	if ((vect_info & VECTORING_INFO_VALID_MASK) &&
1386						!is_page_fault(intr_info)) {
1387		printk(KERN_ERR "%s: unexpected, vectoring info 0x%x "
1388		       "intr info 0x%x\n", __FUNCTION__, vect_info, intr_info);
1389	}
1390
1391	if (is_external_interrupt(vect_info)) {
1392		int irq = vect_info & VECTORING_INFO_VECTOR_MASK;
1393		set_bit(irq, vcpu->irq_pending);
1394		set_bit(irq / BITS_PER_LONG, &vcpu->irq_summary);
1395	}
1396
1397	if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) { /* nmi */
1398		asm ("int $2");
1399		return 1;
1400	}
1401
1402	if (is_no_device(intr_info)) {
1403		vcpu->fpu_active = 1;
1404		vmcs_clear_bits(EXCEPTION_BITMAP, 1 << NM_VECTOR);
1405		if (!(vcpu->cr0 & CR0_TS_MASK))
1406			vmcs_clear_bits(GUEST_CR0, CR0_TS_MASK);
1407		return 1;
1408	}
1409
1410	error_code = 0;
1411	rip = vmcs_readl(GUEST_RIP);
1412	if (intr_info & INTR_INFO_DELIEVER_CODE_MASK)
1413		error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
1414	if (is_page_fault(intr_info)) {
1415		cr2 = vmcs_readl(EXIT_QUALIFICATION);
1416
1417		spin_lock(&vcpu->kvm->lock);
1418		r = kvm_mmu_page_fault(vcpu, cr2, error_code);
1419		if (r < 0) {
1420			spin_unlock(&vcpu->kvm->lock);
1421			return r;
1422		}
1423		if (!r) {
1424			spin_unlock(&vcpu->kvm->lock);
1425			return 1;
1426		}
1427
1428		er = emulate_instruction(vcpu, kvm_run, cr2, error_code);
1429		spin_unlock(&vcpu->kvm->lock);
1430
1431		switch (er) {
1432		case EMULATE_DONE:
1433			return 1;
1434		case EMULATE_DO_MMIO:
1435			++vcpu->stat.mmio_exits;
1436			kvm_run->exit_reason = KVM_EXIT_MMIO;
1437			return 0;
		case EMULATE_FAIL:
1439			vcpu_printf(vcpu, "%s: emulate fail\n", __FUNCTION__);
1440			break;
1441		default:
1442			BUG();
1443		}
1444	}
1445
1446	if (vcpu->rmode.active &&
1447	    handle_rmode_exception(vcpu, intr_info & INTR_INFO_VECTOR_MASK,
1448								error_code))
1449		return 1;
1450
1451	if ((intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK)) == (INTR_TYPE_EXCEPTION | 1)) {
1452		kvm_run->exit_reason = KVM_EXIT_DEBUG;
1453		return 0;
1454	}
1455	kvm_run->exit_reason = KVM_EXIT_EXCEPTION;
1456	kvm_run->ex.exception = intr_info & INTR_INFO_VECTOR_MASK;
1457	kvm_run->ex.error_code = error_code;
1458	return 0;
1459}
1460
1461static int handle_external_interrupt(struct kvm_vcpu *vcpu,
1462				     struct kvm_run *kvm_run)
1463{
1464	++vcpu->stat.irq_exits;
1465	return 1;
1466}
1467
1468static int handle_triple_fault(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1469{
1470	kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
1471	return 0;
1472}
1473
1474static int get_io_count(struct kvm_vcpu *vcpu, unsigned long *count)
1475{
1476	u64 inst;
1477	gva_t rip;
1478	int countr_size;
1479	int i, n;
1480
1481	if ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_VM)) {
1482		countr_size = 2;
1483	} else {
1484		u32 cs_ar = vmcs_read32(GUEST_CS_AR_BYTES);
1485
1486		countr_size = (cs_ar & AR_L_MASK) ? 8:
1487			      (cs_ar & AR_DB_MASK) ? 4: 2;
1488	}
1489
1490	rip =  vmcs_readl(GUEST_RIP);
1491	if (countr_size != 8)
1492		rip += vmcs_readl(GUEST_CS_BASE);
1493
1494	n = kvm_read_guest(vcpu, rip, sizeof(inst), &inst);
1495
1496	for (i = 0; i < n; i++) {
1497		switch (((u8*)&inst)[i]) {
1498		case 0xf0:
1499		case 0xf2:
1500		case 0xf3:
1501		case 0x2e:
1502		case 0x36:
1503		case 0x3e:
1504		case 0x26:
1505		case 0x64:
1506		case 0x65:
1507		case 0x66:
1508			break;
1509		case 0x67:
1510			countr_size = (countr_size == 2) ? 4: (countr_size >> 1);
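			/* fall through */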
1511		default:
1512			goto done;
1513		}
1514	}
1515	return 0;
1516done:
1517	countr_size *= 8;
1518	*count = vcpu->regs[VCPU_REGS_RCX] & (~0ULL >> (64 - countr_size));
1520	return 1;
1521}
1522
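/*
 * I/O exit qualification, as decoded below: bits 2:0 access size minus
 * one, bit 3 direction (1 = in), bit 4 string instruction, bit 5 REP
 * prefix, bits 31:16 port number.  For REP string instructions the
 * iteration count comes from [re]cx via get_io_count() above.
 */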
1523static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1524{
1525	u64 exit_qualification;
1526	int size, down, in, string, rep;
1527	unsigned port;
1528	unsigned long count;
1529	gva_t address;
1530
1531	++vcpu->stat.io_exits;
1532	exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
1533	in = (exit_qualification & 8) != 0;
1534	size = (exit_qualification & 7) + 1;
1535	string = (exit_qualification & 16) != 0;
1536	down = (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_DF) != 0;
1537	count = 1;
1538	rep = (exit_qualification & 32) != 0;
1539	port = exit_qualification >> 16;
1540	address = 0;
1541	if (string) {
1542		if (rep && !get_io_count(vcpu, &count))
1543			return 1;
1544		address = vmcs_readl(GUEST_LINEAR_ADDRESS);
1545	}
1546	return kvm_setup_pio(vcpu, kvm_run, in, size, count, string, down,
1547			     address, rep, port);
1548}
1549
1550static void
1551vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
1552{
1553	/*
1554	 * Patch in the VMCALL instruction:
1555	 */
1556	hypercall[0] = 0x0f;
1557	hypercall[1] = 0x01;
1558	hypercall[2] = 0xc1;
1559	hypercall[3] = 0xc3;
1560}
1561
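/*
 * CR access exit qualification, as decoded below: bits 3:0 control
 * register number, bits 5:4 access type (0 mov to cr, 1 mov from cr,
 * 2 clts, 3 lmsw), bits 11:8 source/destination gpr, bits 31:16 the
 * lmsw source operand.
 */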
1562static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1563{
1564	u64 exit_qualification;
1565	int cr;
1566	int reg;
1567
1568	exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
1569	cr = exit_qualification & 15;
1570	reg = (exit_qualification >> 8) & 15;
1571	switch ((exit_qualification >> 4) & 3) {
1572	case 0: /* mov to cr */
1573		switch (cr) {
1574		case 0:
1575			vcpu_load_rsp_rip(vcpu);
1576			set_cr0(vcpu, vcpu->regs[reg]);
1577			skip_emulated_instruction(vcpu);
1578			return 1;
1579		case 3:
1580			vcpu_load_rsp_rip(vcpu);
1581			set_cr3(vcpu, vcpu->regs[reg]);
1582			skip_emulated_instruction(vcpu);
1583			return 1;
1584		case 4:
1585			vcpu_load_rsp_rip(vcpu);
1586			set_cr4(vcpu, vcpu->regs[reg]);
1587			skip_emulated_instruction(vcpu);
1588			return 1;
1589		case 8:
1590			vcpu_load_rsp_rip(vcpu);
1591			set_cr8(vcpu, vcpu->regs[reg]);
1592			skip_emulated_instruction(vcpu);
1593			return 1;
1594		};
1595		break;
1596	case 2: /* clts */
1597		vcpu_load_rsp_rip(vcpu);
1598		vcpu->fpu_active = 1;
1599		vmcs_clear_bits(EXCEPTION_BITMAP, 1 << NM_VECTOR);
1600		vmcs_clear_bits(GUEST_CR0, CR0_TS_MASK);
1601		vcpu->cr0 &= ~CR0_TS_MASK;
1602		vmcs_writel(CR0_READ_SHADOW, vcpu->cr0);
1603		skip_emulated_instruction(vcpu);
1604		return 1;
1605	case 1: /*mov from cr*/
1606		switch (cr) {
1607		case 3:
1608			vcpu_load_rsp_rip(vcpu);
1609			vcpu->regs[reg] = vcpu->cr3;
1610			vcpu_put_rsp_rip(vcpu);
1611			skip_emulated_instruction(vcpu);
1612			return 1;
1613		case 8:
1614			vcpu_load_rsp_rip(vcpu);
1615			vcpu->regs[reg] = vcpu->cr8;
1616			vcpu_put_rsp_rip(vcpu);
1617			skip_emulated_instruction(vcpu);
1618			return 1;
1619		}
1620		break;
1621	case 3: /* lmsw */
1622		lmsw(vcpu, (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f);
1623
1624		skip_emulated_instruction(vcpu);
1625		return 1;
1626	default:
1627		break;
1628	}
1629	kvm_run->exit_reason = 0;
1630	printk(KERN_ERR "kvm: unhandled control register: op %d cr %d\n",
1631	       (int)(exit_qualification >> 4) & 3, cr);
1632	return 0;
1633}
1634
1635static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1636{
1637	u64 exit_qualification;
1638	unsigned long val;
1639	int dr, reg;
1640
1641	exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
1642	dr = exit_qualification & 7;
1643	reg = (exit_qualification >> 8) & 15;
1644	vcpu_load_rsp_rip(vcpu);
1645	if (exit_qualification & 16) {
1646		/* mov from dr */
1647		switch (dr) {
1648		case 6:
1649			val = 0xffff0ff0;
1650			break;
1651		case 7:
1652			val = 0x400;
1653			break;
1654		default:
1655			val = 0;
1656		}
1657		vcpu->regs[reg] = val;
1658	} else {
1659		/* mov to dr */
1660	}
1661	vcpu_put_rsp_rip(vcpu);
1662	skip_emulated_instruction(vcpu);
1663	return 1;
1664}
1665
1666static int handle_cpuid(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1667{
1668	kvm_emulate_cpuid(vcpu);
1669	return 1;
1670}
1671
1672static int handle_rdmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1673{
1674	u32 ecx = vcpu->regs[VCPU_REGS_RCX];
1675	u64 data;
1676
1677	if (vmx_get_msr(vcpu, ecx, &data)) {
1678		vmx_inject_gp(vcpu, 0);
1679		return 1;
1680	}
1681
1682	vcpu->regs[VCPU_REGS_RAX] = data & -1u;
1683	vcpu->regs[VCPU_REGS_RDX] = (data >> 32) & -1u;
1684	skip_emulated_instruction(vcpu);
1685	return 1;
1686}
1687
1688static int handle_wrmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1689{
1690	u32 ecx = vcpu->regs[VCPU_REGS_RCX];
1691	u64 data = (vcpu->regs[VCPU_REGS_RAX] & -1u)
1692		| ((u64)(vcpu->regs[VCPU_REGS_RDX] & -1u) << 32);
1693
1694	if (vmx_set_msr(vcpu, ecx, data) != 0) {
1695		vmx_inject_gp(vcpu, 0);
1696		return 1;
1697	}
1698
1699	skip_emulated_instruction(vcpu);
1700	return 1;
1701}
1702
1703static void post_kvm_run_save(struct kvm_vcpu *vcpu,
1704			      struct kvm_run *kvm_run)
1705{
1706	kvm_run->if_flag = (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) != 0;
1707	kvm_run->cr8 = vcpu->cr8;
1708	kvm_run->apic_base = vcpu->apic_base;
1709	kvm_run->ready_for_interrupt_injection = (vcpu->interrupt_window_open &&
1710						  vcpu->irq_summary == 0);
1711}
1712
1713static int handle_interrupt_window(struct kvm_vcpu *vcpu,
1714				   struct kvm_run *kvm_run)
1715{
1716	/*
	 * If userspace is waiting to inject interrupts, exit as soon as
	 * possible so it can do so.
1719	 */
1720	if (kvm_run->request_interrupt_window &&
1721	    !vcpu->irq_summary) {
1722		kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
1723		++vcpu->stat.irq_window_exits;
1724		return 0;
1725	}
1726	return 1;
1727}
1728
1729static int handle_halt(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1730{
1731	skip_emulated_instruction(vcpu);
1732	if (vcpu->irq_summary)
1733		return 1;
1734
1735	kvm_run->exit_reason = KVM_EXIT_HLT;
1736	++vcpu->stat.halt_exits;
1737	return 0;
1738}
1739
1740static int handle_vmcall(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1741{
1742	skip_emulated_instruction(vcpu);
1743	return kvm_hypercall(vcpu, kvm_run);
1744}
1745
1746/*
1747 * The exit handlers return 1 if the exit was handled fully and guest execution
1748 * may resume.  Otherwise they set the kvm_run parameter to indicate what needs
1749 * to be done to userspace and return 0.
1750 */
1751static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu,
1752				      struct kvm_run *kvm_run) = {
1753	[EXIT_REASON_EXCEPTION_NMI]           = handle_exception,
1754	[EXIT_REASON_EXTERNAL_INTERRUPT]      = handle_external_interrupt,
1755	[EXIT_REASON_TRIPLE_FAULT]            = handle_triple_fault,
1756	[EXIT_REASON_IO_INSTRUCTION]          = handle_io,
1757	[EXIT_REASON_CR_ACCESS]               = handle_cr,
1758	[EXIT_REASON_DR_ACCESS]               = handle_dr,
1759	[EXIT_REASON_CPUID]                   = handle_cpuid,
1760	[EXIT_REASON_MSR_READ]                = handle_rdmsr,
1761	[EXIT_REASON_MSR_WRITE]               = handle_wrmsr,
1762	[EXIT_REASON_PENDING_INTERRUPT]       = handle_interrupt_window,
1763	[EXIT_REASON_HLT]                     = handle_halt,
1764	[EXIT_REASON_VMCALL]                  = handle_vmcall,
1765};
1766
static const int kvm_vmx_max_exit_handlers =
	ARRAY_SIZE(kvm_vmx_exit_handlers);
1769
1770/*
1771 * The guest has exited.  See if we can fix it or if we need userspace
1772 * assistance.
1773 */
1774static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
1775{
1776	u32 vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
1777	u32 exit_reason = vmcs_read32(VM_EXIT_REASON);
1778
	if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
				exit_reason != EXIT_REASON_EXCEPTION_NMI)
1781		printk(KERN_WARNING "%s: unexpected, valid vectoring info and "
1782		       "exit reason is 0x%x\n", __FUNCTION__, exit_reason);
1783	if (exit_reason < kvm_vmx_max_exit_handlers
1784	    && kvm_vmx_exit_handlers[exit_reason])
1785		return kvm_vmx_exit_handlers[exit_reason](vcpu, kvm_run);
1786	else {
1787		kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
1788		kvm_run->hw.hardware_exit_reason = exit_reason;
1789	}
1790	return 0;
1791}
1792
1793/*
1794 * Check if userspace requested an interrupt window, and that the
1795 * interrupt window is open.
1796 *
1797 * No need to exit to userspace if we already have an interrupt queued.
1798 */
1799static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu,
1800					  struct kvm_run *kvm_run)
1801{
1802	return (!vcpu->irq_summary &&
1803		kvm_run->request_interrupt_window &&
1804		vcpu->interrupt_window_open &&
1805		(vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF));
1806}
1807
1808static int vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1809{
1810	u8 fail;
1811	u16 fs_sel, gs_sel, ldt_sel;
1812	int fs_gs_ldt_reload_needed;
1813	int r;
1814
1815again:
1816	/*
1817	 * Set host fs and gs selectors.  Unfortunately, 22.2.3 does not
1818	 * allow segment selectors with cpl > 0 or ti == 1.
1819	 */
1820	fs_sel = read_fs();
1821	gs_sel = read_gs();
1822	ldt_sel = read_ldt();
1823	fs_gs_ldt_reload_needed = (fs_sel & 7) | (gs_sel & 7) | ldt_sel;
1824	if (!fs_gs_ldt_reload_needed) {
1825		vmcs_write16(HOST_FS_SELECTOR, fs_sel);
1826		vmcs_write16(HOST_GS_SELECTOR, gs_sel);
1827	} else {
1828		vmcs_write16(HOST_FS_SELECTOR, 0);
1829		vmcs_write16(HOST_GS_SELECTOR, 0);
1830	}
1831
1832#ifdef CONFIG_X86_64
1833	vmcs_writel(HOST_FS_BASE, read_msr(MSR_FS_BASE));
1834	vmcs_writel(HOST_GS_BASE, read_msr(MSR_GS_BASE));
1835#else
1836	vmcs_writel(HOST_FS_BASE, segment_base(fs_sel));
1837	vmcs_writel(HOST_GS_BASE, segment_base(gs_sel));
1838#endif
1839
1840	if (!vcpu->mmio_read_completed)
1841		do_interrupt_requests(vcpu, kvm_run);
1842
1843	if (vcpu->guest_debug.enabled)
1844		kvm_guest_debug_pre(vcpu);
1845
1846	kvm_load_guest_fpu(vcpu);
1847
1848	/*
1849	 * Loading guest fpu may have cleared host cr0.ts
1850	 */
1851	vmcs_writel(HOST_CR0, read_cr0());
1852
1853#ifdef CONFIG_X86_64
1854	if (is_long_mode(vcpu)) {
1855		save_msrs(vcpu->host_msrs + msr_offset_kernel_gs_base, 1);
1856		load_msrs(vcpu->guest_msrs, NR_BAD_MSRS);
1857	}
1858#endif
1859
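	/*
	 * The world switch: HOST_RSP is refreshed with the current stack
	 * pointer, guest GPRs are loaded from vcpu->regs via the %c[...]
	 * offsets, and VMLAUNCH is used for the first entry while VMRESUME
	 * is used afterwards (vcpu->launched).  A VM exit resumes at
	 * kvm_vmx_return (HOST_RIP), where guest registers are written back
	 * and host registers restored; 'fail' is set if VMLAUNCH/VMRESUME
	 * itself failed.
	 */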
1860	asm (
1861		/* Store host registers */
1862		"pushf \n\t"
1863#ifdef CONFIG_X86_64
1864		"push %%rax; push %%rbx; push %%rdx;"
1865		"push %%rsi; push %%rdi; push %%rbp;"
1866		"push %%r8;  push %%r9;  push %%r10; push %%r11;"
1867		"push %%r12; push %%r13; push %%r14; push %%r15;"
1868		"push %%rcx \n\t"
1869		ASM_VMX_VMWRITE_RSP_RDX "\n\t"
1870#else
1871		"pusha; push %%ecx \n\t"
1872		ASM_VMX_VMWRITE_RSP_RDX "\n\t"
1873#endif
		/* Check if vmlaunch or vmresume is needed */
1875		"cmp $0, %1 \n\t"
1876		/* Load guest registers.  Don't clobber flags. */
1877#ifdef CONFIG_X86_64
1878		"mov %c[cr2](%3), %%rax \n\t"
1879		"mov %%rax, %%cr2 \n\t"
1880		"mov %c[rax](%3), %%rax \n\t"
1881		"mov %c[rbx](%3), %%rbx \n\t"
1882		"mov %c[rdx](%3), %%rdx \n\t"
1883		"mov %c[rsi](%3), %%rsi \n\t"
1884		"mov %c[rdi](%3), %%rdi \n\t"
1885		"mov %c[rbp](%3), %%rbp \n\t"
1886		"mov %c[r8](%3),  %%r8  \n\t"
1887		"mov %c[r9](%3),  %%r9  \n\t"
1888		"mov %c[r10](%3), %%r10 \n\t"
1889		"mov %c[r11](%3), %%r11 \n\t"
1890		"mov %c[r12](%3), %%r12 \n\t"
1891		"mov %c[r13](%3), %%r13 \n\t"
1892		"mov %c[r14](%3), %%r14 \n\t"
1893		"mov %c[r15](%3), %%r15 \n\t"
1894		"mov %c[rcx](%3), %%rcx \n\t" /* kills %3 (rcx) */
1895#else
1896		"mov %c[cr2](%3), %%eax \n\t"
1897		"mov %%eax,   %%cr2 \n\t"
1898		"mov %c[rax](%3), %%eax \n\t"
1899		"mov %c[rbx](%3), %%ebx \n\t"
1900		"mov %c[rdx](%3), %%edx \n\t"
1901		"mov %c[rsi](%3), %%esi \n\t"
1902		"mov %c[rdi](%3), %%edi \n\t"
1903		"mov %c[rbp](%3), %%ebp \n\t"
1904		"mov %c[rcx](%3), %%ecx \n\t" /* kills %3 (ecx) */
1905#endif
1906		/* Enter guest mode */
1907		"jne launched \n\t"
1908		ASM_VMX_VMLAUNCH "\n\t"
1909		"jmp kvm_vmx_return \n\t"
1910		"launched: " ASM_VMX_VMRESUME "\n\t"
1911		".globl kvm_vmx_return \n\t"
1912		"kvm_vmx_return: "
1913		/* Save guest registers, load host registers, keep flags */
1914#ifdef CONFIG_X86_64
1915		"xchg %3,     (%%rsp) \n\t"
1916		"mov %%rax, %c[rax](%3) \n\t"
1917		"mov %%rbx, %c[rbx](%3) \n\t"
1918		"pushq (%%rsp); popq %c[rcx](%3) \n\t"
1919		"mov %%rdx, %c[rdx](%3) \n\t"
1920		"mov %%rsi, %c[rsi](%3) \n\t"
1921		"mov %%rdi, %c[rdi](%3) \n\t"
1922		"mov %%rbp, %c[rbp](%3) \n\t"
1923		"mov %%r8,  %c[r8](%3) \n\t"
1924		"mov %%r9,  %c[r9](%3) \n\t"
1925		"mov %%r10, %c[r10](%3) \n\t"
1926		"mov %%r11, %c[r11](%3) \n\t"
1927		"mov %%r12, %c[r12](%3) \n\t"
1928		"mov %%r13, %c[r13](%3) \n\t"
1929		"mov %%r14, %c[r14](%3) \n\t"
1930		"mov %%r15, %c[r15](%3) \n\t"
1931		"mov %%cr2, %%rax   \n\t"
1932		"mov %%rax, %c[cr2](%3) \n\t"
1933		"mov (%%rsp), %3 \n\t"
1934
1935		"pop  %%rcx; pop  %%r15; pop  %%r14; pop  %%r13; pop  %%r12;"
1936		"pop  %%r11; pop  %%r10; pop  %%r9;  pop  %%r8;"
1937		"pop  %%rbp; pop  %%rdi; pop  %%rsi;"
1938		"pop  %%rdx; pop  %%rbx; pop  %%rax \n\t"
1939#else
1940		"xchg %3, (%%esp) \n\t"
1941		"mov %%eax, %c[rax](%3) \n\t"
1942		"mov %%ebx, %c[rbx](%3) \n\t"
1943		"pushl (%%esp); popl %c[rcx](%3) \n\t"
1944		"mov %%edx, %c[rdx](%3) \n\t"
1945		"mov %%esi, %c[rsi](%3) \n\t"
1946		"mov %%edi, %c[rdi](%3) \n\t"
1947		"mov %%ebp, %c[rbp](%3) \n\t"
1948		"mov %%cr2, %%eax  \n\t"
1949		"mov %%eax, %c[cr2](%3) \n\t"
1950		"mov (%%esp), %3 \n\t"
1951
1952		"pop %%ecx; popa \n\t"
1953#endif
1954		"setbe %0 \n\t"
1955		"popf \n\t"
1956	      : "=q" (fail)
1957	      : "r"(vcpu->launched), "d"((unsigned long)HOST_RSP),
1958		"c"(vcpu),
1959		[rax]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RAX])),
1960		[rbx]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RBX])),
1961		[rcx]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RCX])),
1962		[rdx]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RDX])),
1963		[rsi]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RSI])),
1964		[rdi]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RDI])),
1965		[rbp]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RBP])),
1966#ifdef CONFIG_X86_64
1967		[r8 ]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R8 ])),
1968		[r9 ]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R9 ])),
1969		[r10]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R10])),
1970		[r11]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R11])),
1971		[r12]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R12])),
1972		[r13]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R13])),
1973		[r14]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R14])),
1974		[r15]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R15])),
1975#endif
1976		[cr2]"i"(offsetof(struct kvm_vcpu, cr2))
1977	      : "cc", "memory" );
1978
1979	/*
	 * Reload the segment selectors ASAP; a functional kernel needs
	 * __KERNEL_PDA in %fs on i386 and 0 in %gs on x86_64 for the
	 * per-cpu data area to work.
1983	 */
1984	if (fs_gs_ldt_reload_needed) {
1985		load_ldt(ldt_sel);
1986		load_fs(fs_sel);
1987		/*
1988		 * If we have to reload gs, we must take care to
1989		 * preserve our gs base.
1990		 */
1991		local_irq_disable();
1992		load_gs(gs_sel);
1993#ifdef CONFIG_X86_64
1994		wrmsrl(MSR_GS_BASE, vmcs_readl(HOST_GS_BASE));
1995#endif
1996		local_irq_enable();
1997
1998		reload_tss();
1999	}
2000	++vcpu->stat.exits;
2001
2002#ifdef CONFIG_X86_64
2003	if (is_long_mode(vcpu)) {
2004		save_msrs(vcpu->guest_msrs, NR_BAD_MSRS);
2005		load_msrs(vcpu->host_msrs, NR_BAD_MSRS);
2006	}
2007#endif
2008
2009	vcpu->interrupt_window_open = (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0;
2010
2011	asm ("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
2012
2013	if (fail) {
2014		kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
2015		kvm_run->fail_entry.hardware_entry_failure_reason
2016			= vmcs_read32(VM_INSTRUCTION_ERROR);
2017		r = 0;
2018	} else {
2019		/*
2020		 * Profile KVM exit RIPs:
2021		 */
2022		if (unlikely(prof_on == KVM_PROFILING))
2023			profile_hit(KVM_PROFILING, (void *)vmcs_readl(GUEST_RIP));
2024
2025		vcpu->launched = 1;
2026		r = kvm_handle_exit(kvm_run, vcpu);
2027		if (r > 0) {
			/* Give scheduler a chance to reschedule. */
2029			if (signal_pending(current)) {
2030				++vcpu->stat.signal_exits;
2031				post_kvm_run_save(vcpu, kvm_run);
2032				kvm_run->exit_reason = KVM_EXIT_INTR;
2033				return -EINTR;
2034			}
2035
2036			if (dm_request_for_irq_injection(vcpu, kvm_run)) {
2037				++vcpu->stat.request_irq_exits;
2038				post_kvm_run_save(vcpu, kvm_run);
2039				kvm_run->exit_reason = KVM_EXIT_INTR;
2040				return -EINTR;
2041			}
2042
2043			kvm_resched(vcpu);
2044			goto again;
2045		}
2046	}
2047
2048	post_kvm_run_save(vcpu, kvm_run);
2049	return r;
2050}
2051
2052static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
2053{
2054	vmcs_writel(GUEST_CR3, vmcs_readl(GUEST_CR3));
2055}
2056
2057static void vmx_inject_page_fault(struct kvm_vcpu *vcpu,
2058				  unsigned long addr,
2059				  u32 err_code)
2060{
2061	u32 vect_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
2062
2063	++vcpu->stat.pf_guest;
2064
2065	if (is_page_fault(vect_info)) {
2066		printk(KERN_DEBUG "inject_page_fault: "
2067		       "double fault 0x%lx @ 0x%lx\n",
2068		       addr, vmcs_readl(GUEST_RIP));
2069		vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, 0);
2070		vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
2071			     DF_VECTOR |
2072			     INTR_TYPE_EXCEPTION |
2073			     INTR_INFO_DELIEVER_CODE_MASK |
2074			     INTR_INFO_VALID_MASK);
2075		return;
2076	}
2077	vcpu->cr2 = addr;
2078	vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, err_code);
2079	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
2080		     PF_VECTOR |
2081		     INTR_TYPE_EXCEPTION |
2082		     INTR_INFO_DELIEVER_CODE_MASK |
2083		     INTR_INFO_VALID_MASK);
2084
2085}
2086
2087static void vmx_free_vmcs(struct kvm_vcpu *vcpu)
2088{
2089	if (vcpu->vmcs) {
2090		on_each_cpu(__vcpu_clear, vcpu, 0, 1);
2091		free_vmcs(vcpu->vmcs);
2092		vcpu->vmcs = NULL;
2093	}
2094}
2095
2096static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
2097{
2098	vmx_free_vmcs(vcpu);
2099}
2100
2101static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
2102{
2103	struct vmcs *vmcs;
2104
2105	vcpu->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL);
2106	if (!vcpu->guest_msrs)
2107		return -ENOMEM;
2108
2109	vcpu->host_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL);
2110	if (!vcpu->host_msrs)
2111		goto out_free_guest_msrs;
2112
2113	vmcs = alloc_vmcs();
2114	if (!vmcs)
2115		goto out_free_msrs;
2116
2117	vmcs_clear(vmcs);
2118	vcpu->vmcs = vmcs;
2119	vcpu->launched = 0;
2120	vcpu->fpu_active = 1;
2121
2122	return 0;
2123
2124out_free_msrs:
2125	kfree(vcpu->host_msrs);
2126	vcpu->host_msrs = NULL;
2127
2128out_free_guest_msrs:
2129	kfree(vcpu->guest_msrs);
2130	vcpu->guest_msrs = NULL;
2131
2132	return -ENOMEM;
2133}
2134
2135static struct kvm_arch_ops vmx_arch_ops = {
2136	.cpu_has_kvm_support = cpu_has_kvm_support,
2137	.disabled_by_bios = vmx_disabled_by_bios,
2138	.hardware_setup = hardware_setup,
2139	.hardware_unsetup = hardware_unsetup,
2140	.hardware_enable = hardware_enable,
2141	.hardware_disable = hardware_disable,
2142
2143	.vcpu_create = vmx_create_vcpu,
2144	.vcpu_free = vmx_free_vcpu,
2145
2146	.vcpu_load = vmx_vcpu_load,
2147	.vcpu_put = vmx_vcpu_put,
2148	.vcpu_decache = vmx_vcpu_decache,
2149
2150	.set_guest_debug = set_guest_debug,
2151	.get_msr = vmx_get_msr,
2152	.set_msr = vmx_set_msr,
2153	.get_segment_base = vmx_get_segment_base,
2154	.get_segment = vmx_get_segment,
2155	.set_segment = vmx_set_segment,
2156	.get_cs_db_l_bits = vmx_get_cs_db_l_bits,
2157	.decache_cr4_guest_bits = vmx_decache_cr4_guest_bits,
2158	.set_cr0 = vmx_set_cr0,
2159	.set_cr3 = vmx_set_cr3,
2160	.set_cr4 = vmx_set_cr4,
2161#ifdef CONFIG_X86_64
2162	.set_efer = vmx_set_efer,
2163#endif
2164	.get_idt = vmx_get_idt,
2165	.set_idt = vmx_set_idt,
2166	.get_gdt = vmx_get_gdt,
2167	.set_gdt = vmx_set_gdt,
2168	.cache_regs = vcpu_load_rsp_rip,
2169	.decache_regs = vcpu_put_rsp_rip,
2170	.get_rflags = vmx_get_rflags,
2171	.set_rflags = vmx_set_rflags,
2172
2173	.tlb_flush = vmx_flush_tlb,
2174	.inject_page_fault = vmx_inject_page_fault,
2175
2176	.inject_gp = vmx_inject_gp,
2177
2178	.run = vmx_vcpu_run,
2179	.skip_emulated_instruction = skip_emulated_instruction,
2180	.vcpu_setup = vmx_vcpu_setup,
2181	.patch_hypercall = vmx_patch_hypercall,
2182};
2183
2184static int __init vmx_init(void)
2185{
2186	return kvm_init_arch(&vmx_arch_ops, THIS_MODULE);
2187}
2188
2189static void __exit vmx_exit(void)
2190{
2191	kvm_exit_arch();
2192}
2193
2194module_init(vmx_init)
2195module_exit(vmx_exit)
2196