• Home
  • History
  • Annotate
  • Line#
  • Navigate
  • Raw
  • Download
  • only in /asuswrt-rt-n18u-9.0.0.4.380.2695/release/src-rt-6.x.4708/linux/linux-2.6/arch/x86/xen/
1/*
2 * Core of Xen paravirt_ops implementation.
3 *
4 * This file contains the xen_paravirt_ops structure itself, and the
5 * implementations for:
6 * - privileged instructions
7 * - interrupt flags
8 * - segment operations
9 * - booting and setup
10 *
11 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
12 */
13
14#include <linux/cpu.h>
15#include <linux/kernel.h>
16#include <linux/init.h>
17#include <linux/smp.h>
18#include <linux/preempt.h>
19#include <linux/hardirq.h>
20#include <linux/percpu.h>
21#include <linux/delay.h>
22#include <linux/start_kernel.h>
23#include <linux/sched.h>
24#include <linux/kprobes.h>
25#include <linux/bootmem.h>
26#include <linux/module.h>
27#include <linux/mm.h>
28#include <linux/page-flags.h>
29#include <linux/highmem.h>
30#include <linux/console.h>
31#include <linux/pci.h>
32#include <linux/gfp.h>
33
34#include <xen/xen.h>
35#include <xen/interface/xen.h>
36#include <xen/interface/version.h>
37#include <xen/interface/physdev.h>
38#include <xen/interface/vcpu.h>
39#include <xen/interface/memory.h>
40#include <xen/features.h>
41#include <xen/page.h>
42#include <xen/hvm.h>
43#include <xen/hvc-console.h>
44
45#include <asm/paravirt.h>
46#include <asm/apic.h>
47#include <asm/page.h>
48#include <asm/xen/hypercall.h>
49#include <asm/xen/hypervisor.h>
50#include <asm/fixmap.h>
51#include <asm/processor.h>
52#include <asm/proto.h>
53#include <asm/msr-index.h>
54#include <asm/traps.h>
55#include <asm/setup.h>
56#include <asm/desc.h>
57#include <asm/pgalloc.h>
58#include <asm/pgtable.h>
59#include <asm/tlbflush.h>
60#include <asm/reboot.h>
61#include <asm/setup.h>
62#include <asm/stackprotector.h>
63#include <asm/hypervisor.h>
64
65#include "xen-ops.h"
66#include "mmu.h"
67#include "multicalls.h"
68
/* Make the hypercall page symbol available to GPL modules. */
EXPORT_SYMBOL_GPL(hypercall_page);

/*
 * xen_vcpu:      per-CPU pointer to the vcpu_info that CPU is using.
 * xen_vcpu_info: per-CPU vcpu_info storage that xen_vcpu_setup() tries
 *                to register with the hypervisor; on success xen_vcpu
 *                points here instead of into the shared_info page.
 */
DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu);
DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info);

/*
 * Starts out as XEN_NATIVE; xen_start_kernel() and init_hvm_pv_info()
 * change it to XEN_PV_DOMAIN and XEN_HVM_DOMAIN respectively.
 */
enum xen_domain_type xen_domain_type = XEN_NATIVE;
EXPORT_SYMBOL_GPL(xen_domain_type);

/*
 * xen_start_kernel() returns immediately if this is NULL.  It is not
 * assigned anywhere in the code visible in this file.
 */
struct start_info *xen_start_info;
EXPORT_SYMBOL_GPL(xen_start_info);

/* Stand-in that HYPERVISOR_shared_info points at until the real page is mapped. */
struct shared_info xen_dummy_shared_info;

/* Address of CPU 0's percpu gdt_page, recorded by xen_start_kernel(). */
void *xen_initial_gdt;

/*
 * Reserve one page of brk space; xen_hvm_init_shared_info() obtains its
 * shared_info page with extend_brk(PAGE_SIZE, PAGE_SIZE).
 */
RESERVE_BRK(shared_info_page_brk, PAGE_SIZE);
/* Set to 1 by xen_hvm_guest_init() when XENFEAT_hvm_callback_vector is present. */
__read_mostly int xen_have_vector_callback;
EXPORT_SYMBOL_GPL(xen_have_vector_callback);

/*
 * Point at some empty memory to start with. We map the real shared_info
 * page as soon as fixmap is up and running.
 */
struct shared_info *HYPERVISOR_shared_info = (void *)&xen_dummy_shared_info;
93
/*
 * Flag to determine whether vcpu info placement is available on all
 * VCPUs.  We assume it is to start with, and then set it to zero on
 * the first failure.  This is because it can succeed on some VCPUs
 * and not others, since it can involve hypervisor memory allocation,
 * or because the guest failed to guarantee all the appropriate
 * constraints on all VCPUs (i.e. the buffer can't cross a page boundary).
 *
 * Note that any particular CPU may be using a placed vcpu structure,
 * but we can only optimise if they all are.
 *
 * 0: not available, 1: available
 */
static int have_vcpu_info_placement = 1;
108
/*
 * Cap setup_max_cpus at MAX_VIRT_CPUS.  Compiles to an empty function
 * when CONFIG_SMP is not set.
 */
static void clamp_max_cpus(void)
{
#ifdef CONFIG_SMP
	if (setup_max_cpus <= MAX_VIRT_CPUS)
		return;

	setup_max_cpus = MAX_VIRT_CPUS;
#endif
}
116
117static void xen_vcpu_setup(int cpu)
118{
119	struct vcpu_register_vcpu_info info;
120	int err;
121	struct vcpu_info *vcpup;
122
123	BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);
124
125	if (cpu < MAX_VIRT_CPUS)
126		per_cpu(xen_vcpu,cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];
127
128	if (!have_vcpu_info_placement) {
129		if (cpu >= MAX_VIRT_CPUS)
130			clamp_max_cpus();
131		return;
132	}
133
134	vcpup = &per_cpu(xen_vcpu_info, cpu);
135	info.mfn = arbitrary_virt_to_mfn(vcpup);
136	info.offset = offset_in_page(vcpup);
137
138	printk(KERN_DEBUG "trying to map vcpu_info %d at %p, mfn %llx, offset %d\n",
139	       cpu, vcpup, info.mfn, info.offset);
140
141	/* Check to see if the hypervisor will put the vcpu_info
142	   structure where we want it, which allows direct access via
143	   a percpu-variable. */
144	err = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_info, cpu, &info);
145
146	if (err) {
147		printk(KERN_DEBUG "register_vcpu_info failed: err=%d\n", err);
148		have_vcpu_info_placement = 0;
149		clamp_max_cpus();
150	} else {
151		/* This cpu is using the registered vcpu info, even if
152		   later ones fail to. */
153		per_cpu(xen_vcpu, cpu) = vcpup;
154
155		printk(KERN_DEBUG "cpu %d using vcpu_info at %p\n",
156		       cpu, vcpup);
157	}
158}
159
160/*
161 * On restore, set the vcpu placement up again.
162 * If it fails, then we're in a bad state, since
163 * we can't back out from using it...
164 */
165void xen_vcpu_restore(void)
166{
167	int cpu;
168
169	for_each_online_cpu(cpu) {
170		bool other_cpu = (cpu != smp_processor_id());
171
172		if (other_cpu &&
173		    HYPERVISOR_vcpu_op(VCPUOP_down, cpu, NULL))
174			BUG();
175
176		xen_setup_runstate_info(cpu);
177
178		if (have_vcpu_info_placement)
179			xen_vcpu_setup(cpu);
180
181		if (other_cpu &&
182		    HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL))
183			BUG();
184	}
185}
186
187static void __init xen_banner(void)
188{
189	unsigned version = HYPERVISOR_xen_version(XENVER_version, NULL);
190	struct xen_extraversion extra;
191	HYPERVISOR_xen_version(XENVER_extraversion, &extra);
192
193	printk(KERN_INFO "Booting paravirtualized kernel on %s\n",
194	       pv_info.name);
195	printk(KERN_INFO "Xen version: %d.%d%s%s\n",
196	       version >> 16, version & 0xffff, extra.extraversion,
197	       xen_feature(XENFEAT_mmu_pt_update_preserve_ad) ? " (preserve-AD)" : "");
198}
199
/*
 * Masks ANDed into the EDX/ECX results of CPUID leaf 1 by xen_cpuid().
 * They start as all-ones (nothing hidden) and are narrowed by
 * xen_init_cpuid_mask().
 */
static __read_mostly unsigned int cpuid_leaf1_edx_mask = ~0;
static __read_mostly unsigned int cpuid_leaf1_ecx_mask = ~0;
202
203static void xen_cpuid(unsigned int *ax, unsigned int *bx,
204		      unsigned int *cx, unsigned int *dx)
205{
206	unsigned maskebx = ~0;
207	unsigned maskecx = ~0;
208	unsigned maskedx = ~0;
209
210	/*
211	 * Mask out inconvenient features, to try and disable as many
212	 * unsupported kernel subsystems as possible.
213	 */
214	switch (*ax) {
215	case 1:
216		maskecx = cpuid_leaf1_ecx_mask;
217		maskedx = cpuid_leaf1_edx_mask;
218		break;
219
220	case 0xb:
221		/* Suppress extended topology stuff */
222		maskebx = 0;
223		break;
224	}
225
226	asm(XEN_EMULATE_PREFIX "cpuid"
227		: "=a" (*ax),
228		  "=b" (*bx),
229		  "=c" (*cx),
230		  "=d" (*dx)
231		: "0" (*ax), "2" (*cx));
232
233	*bx &= maskebx;
234	*cx &= maskecx;
235	*dx &= maskedx;
236}
237
238static __init void xen_init_cpuid_mask(void)
239{
240	unsigned int ax, bx, cx, dx;
241
242	cpuid_leaf1_edx_mask =
243		~((1 << X86_FEATURE_MCE)  |  /* disable MCE */
244		  (1 << X86_FEATURE_MCA)  |  /* disable MCA */
245		  (1 << X86_FEATURE_ACC));   /* thermal monitoring */
246
247	if (!xen_initial_domain())
248		cpuid_leaf1_edx_mask &=
249			~((1 << X86_FEATURE_APIC) |  /* disable local APIC */
250			  (1 << X86_FEATURE_ACPI));  /* disable ACPI */
251
252	ax = 1;
253	cx = 0;
254	xen_cpuid(&ax, &bx, &cx, &dx);
255
256	/* cpuid claims we support xsave; try enabling it to see what happens */
257	if (cx & (1 << (X86_FEATURE_XSAVE % 32))) {
258		unsigned long cr4;
259
260		set_in_cr4(X86_CR4_OSXSAVE);
261
262		cr4 = read_cr4();
263
264		if ((cr4 & X86_CR4_OSXSAVE) == 0)
265			cpuid_leaf1_ecx_mask &= ~(1 << (X86_FEATURE_XSAVE % 32));
266
267		clear_in_cr4(X86_CR4_OSXSAVE);
268	}
269}
270
/* Set debug register @reg to @val via hypercall; any result is discarded. */
static void xen_set_debugreg(int reg, unsigned long val)
{
	(void)HYPERVISOR_set_debugreg(reg, val);
}
275
/* Return the value the hypervisor reports for debug register @reg. */
static unsigned long xen_get_debugreg(int reg)
{
	unsigned long value = HYPERVISOR_get_debugreg(reg);

	return value;
}
280
/*
 * End-of-context-switch hook: flush any queued multicalls first, then
 * run the generic paravirt end-of-switch handling for @next.
 */
static void xen_end_context_switch(struct task_struct *next)
{
	xen_mc_flush();

	paravirt_end_context_switch(next);
}
286
/* store_tr hook for Xen: always reports a task register value of 0. */
static unsigned long xen_store_tr(void)
{
	return 0UL;
}
291
292/*
293 * Set the page permissions for a particular virtual address.  If the
294 * address is a vmalloc mapping (or other non-linear mapping), then
295 * find the linear mapping of the page and also set its protections to
296 * match.
297 */
298static void set_aliased_prot(void *v, pgprot_t prot)
299{
300	int level;
301	pte_t *ptep;
302	pte_t pte;
303	unsigned long pfn;
304	struct page *page;
305
306	ptep = lookup_address((unsigned long)v, &level);
307	BUG_ON(ptep == NULL);
308
309	pfn = pte_pfn(*ptep);
310	page = pfn_to_page(pfn);
311
312	pte = pfn_pte(pfn, prot);
313
314	if (HYPERVISOR_update_va_mapping((unsigned long)v, pte, 0))
315		BUG();
316
317	if (!PageHighMem(page)) {
318		void *av = __va(PFN_PHYS(pfn));
319
320		if (av != v)
321			if (HYPERVISOR_update_va_mapping((unsigned long)av, pte, 0))
322				BUG();
323	} else
324		kmap_flush_unused();
325}
326
327static void xen_alloc_ldt(struct desc_struct *ldt, unsigned entries)
328{
329	const unsigned entries_per_page = PAGE_SIZE / LDT_ENTRY_SIZE;
330	int i;
331
332	for(i = 0; i < entries; i += entries_per_page)
333		set_aliased_prot(ldt + i, PAGE_KERNEL_RO);
334}
335
336static void xen_free_ldt(struct desc_struct *ldt, unsigned entries)
337{
338	const unsigned entries_per_page = PAGE_SIZE / LDT_ENTRY_SIZE;
339	int i;
340
341	for(i = 0; i < entries; i += entries_per_page)
342		set_aliased_prot(ldt + i, PAGE_KERNEL);
343}
344
345static void xen_set_ldt(const void *addr, unsigned entries)
346{
347	struct mmuext_op *op;
348	struct multicall_space mcs = xen_mc_entry(sizeof(*op));
349
350	op = mcs.args;
351	op->cmd = MMUEXT_SET_LDT;
352	op->arg1.linear_addr = (unsigned long)addr;
353	op->arg2.nr_ents = entries;
354
355	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
356
357	xen_mc_issue(PARAVIRT_LAZY_CPU);
358}
359
360static void xen_load_gdt(const struct desc_ptr *dtr)
361{
362	unsigned long va = dtr->address;
363	unsigned int size = dtr->size + 1;
364	unsigned pages = (size + PAGE_SIZE - 1) / PAGE_SIZE;
365	unsigned long frames[pages];
366	int f;
367
368	/*
369	 * A GDT can be up to 64k in size, which corresponds to 8192
370	 * 8-byte entries, or 16 4k pages..
371	 */
372
373	BUG_ON(size > 65536);
374	BUG_ON(va & ~PAGE_MASK);
375
376	for (f = 0; va < dtr->address + size; va += PAGE_SIZE, f++) {
377		int level;
378		pte_t *ptep;
379		unsigned long pfn, mfn;
380		void *virt;
381
382		/*
383		 * The GDT is per-cpu and is in the percpu data area.
384		 * That can be virtually mapped, so we need to do a
385		 * page-walk to get the underlying MFN for the
386		 * hypercall.  The page can also be in the kernel's
387		 * linear range, so we need to RO that mapping too.
388		 */
389		ptep = lookup_address(va, &level);
390		BUG_ON(ptep == NULL);
391
392		pfn = pte_pfn(*ptep);
393		mfn = pfn_to_mfn(pfn);
394		virt = __va(PFN_PHYS(pfn));
395
396		frames[f] = mfn;
397
398		make_lowmem_page_readonly((void *)va);
399		make_lowmem_page_readonly(virt);
400	}
401
402	if (HYPERVISOR_set_gdt(frames, size / sizeof(struct desc_struct)))
403		BUG();
404}
405
406/*
407 * load_gdt for early boot, when the gdt is only mapped once
408 */
409static __init void xen_load_gdt_boot(const struct desc_ptr *dtr)
410{
411	unsigned long va = dtr->address;
412	unsigned int size = dtr->size + 1;
413	unsigned pages = (size + PAGE_SIZE - 1) / PAGE_SIZE;
414	unsigned long frames[pages];
415	int f;
416
417	/*
418	 * A GDT can be up to 64k in size, which corresponds to 8192
419	 * 8-byte entries, or 16 4k pages..
420	 */
421
422	BUG_ON(size > 65536);
423	BUG_ON(va & ~PAGE_MASK);
424
425	for (f = 0; va < dtr->address + size; va += PAGE_SIZE, f++) {
426		pte_t pte;
427		unsigned long pfn, mfn;
428
429		pfn = virt_to_pfn(va);
430		mfn = pfn_to_mfn(pfn);
431
432		pte = pfn_pte(pfn, PAGE_KERNEL_RO);
433
434		if (HYPERVISOR_update_va_mapping((unsigned long)va, pte, 0))
435			BUG();
436
437		frames[f] = mfn;
438	}
439
440	if (HYPERVISOR_set_gdt(frames, size / sizeof(struct desc_struct)))
441		BUG();
442}
443
444static void load_TLS_descriptor(struct thread_struct *t,
445				unsigned int cpu, unsigned int i)
446{
447	struct desc_struct *gdt = get_cpu_gdt_table(cpu);
448	xmaddr_t maddr = arbitrary_virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]);
449	struct multicall_space mc = __xen_mc_entry(0);
450
451	MULTI_update_descriptor(mc.mc, maddr.maddr, t->tls_array[i]);
452}
453
454static void xen_load_tls(struct thread_struct *t, unsigned int cpu)
455{
456	if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU) {
457#ifdef CONFIG_X86_32
458		lazy_load_gs(0);
459#else
460		loadsegment(fs, 0);
461#endif
462	}
463
464	xen_mc_batch();
465
466	load_TLS_descriptor(t, cpu, 0);
467	load_TLS_descriptor(t, cpu, 1);
468	load_TLS_descriptor(t, cpu, 2);
469
470	xen_mc_issue(PARAVIRT_LAZY_CPU);
471}
472
473#ifdef CONFIG_X86_64
474static void xen_load_gs_index(unsigned int idx)
475{
476	if (HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, idx))
477		BUG();
478}
479#endif
480
481static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum,
482				const void *ptr)
483{
484	xmaddr_t mach_lp = arbitrary_virt_to_machine(&dt[entrynum]);
485	u64 entry = *(u64 *)ptr;
486
487	preempt_disable();
488
489	xen_mc_flush();
490	if (HYPERVISOR_update_descriptor(mach_lp.maddr, entry))
491		BUG();
492
493	preempt_enable();
494}
495
/*
 * Convert IDT gate @val for @vector into a Xen trap_info entry.
 *
 * Returns 1 when @info has been filled in and should be passed to Xen,
 * 0 when the gate must be skipped (not a trap/interrupt gate, or, on
 * 64-bit, a handler that is not registered with Xen).  Note that
 * info->vector is written before the 64-bit checks, so @info may be
 * partially modified even when 0 is returned.
 */
static int cvt_gate_to_trap(int vector, const gate_desc *val,
			    struct trap_info *info)
{
	unsigned long addr;

	/* Only trap and interrupt gates have a Xen equivalent. */
	if (val->type != GATE_TRAP && val->type != GATE_INTERRUPT)
		return 0;

	info->vector = vector;

	addr = gate_offset(*val);
#ifdef CONFIG_X86_64
	/*
	 * Look for known traps using IST, and substitute them
	 * appropriately.  The debugger ones are the only ones we care
	 * about.  Xen will handle faults like double_fault and
	 * machine_check, so we should never see them.  Warn if
	 * there's an unexpected IST-using fault handler.
	 */
	if (addr == (unsigned long)debug)
		addr = (unsigned long)xen_debug;
	else if (addr == (unsigned long)int3)
		addr = (unsigned long)xen_int3;
	else if (addr == (unsigned long)stack_segment)
		addr = (unsigned long)xen_stack_segment;
	else if (addr == (unsigned long)double_fault ||
		 addr == (unsigned long)nmi) {
		/* Don't need to handle these */
		return 0;
#ifdef CONFIG_X86_MCE
	} else if (addr == (unsigned long)machine_check) {
		return 0;
#endif
	} else {
		/* Some other trap using IST? */
		if (WARN_ON(val->ist != 0))
			return 0;
	}
#endif	/* CONFIG_X86_64 */
	info->address = addr;

	info->cs = gate_segment(*val);
	/* Low bits of flags carry the gate's DPL. */
	info->flags = val->dpl;
	/* interrupt gates clear IF */
	if (val->type == GATE_INTERRUPT)
		info->flags |= 1 << 2;

	return 1;
}
545
/*
 * Locations of each CPU's IDT.  Recorded by xen_load_idt() and consulted
 * by xen_write_idt_entry() and xen_copy_trap_info().
 */
static DEFINE_PER_CPU(struct desc_ptr, idt_desc);
548
549/* Set an IDT entry.  If the entry is part of the current IDT, then
550   also update Xen. */
551static void xen_write_idt_entry(gate_desc *dt, int entrynum, const gate_desc *g)
552{
553	unsigned long p = (unsigned long)&dt[entrynum];
554	unsigned long start, end;
555
556	preempt_disable();
557
558	start = __get_cpu_var(idt_desc).address;
559	end = start + __get_cpu_var(idt_desc).size + 1;
560
561	xen_mc_flush();
562
563	native_write_idt_entry(dt, entrynum, g);
564
565	if (p >= start && (p + 8) <= end) {
566		struct trap_info info[2];
567
568		info[1].address = 0;
569
570		if (cvt_gate_to_trap(entrynum, g, &info[0]))
571			if (HYPERVISOR_set_trap_table(info))
572				BUG();
573	}
574
575	preempt_enable();
576}
577
578static void xen_convert_trap_info(const struct desc_ptr *desc,
579				  struct trap_info *traps)
580{
581	unsigned in, out, count;
582
583	count = (desc->size+1) / sizeof(gate_desc);
584	BUG_ON(count > 256);
585
586	for (in = out = 0; in < count; in++) {
587		gate_desc *entry = (gate_desc*)(desc->address) + in;
588
589		if (cvt_gate_to_trap(in, entry, &traps[out]))
590			out++;
591	}
592	traps[out].address = 0;
593}
594
595void xen_copy_trap_info(struct trap_info *traps)
596{
597	const struct desc_ptr *desc = &__get_cpu_var(idt_desc);
598
599	xen_convert_trap_info(desc, traps);
600}
601
602/* Load a new IDT into Xen.  In principle this can be per-CPU, so we
603   hold a spinlock to protect the static traps[] array (static because
604   it avoids allocation, and saves stack space). */
605static void xen_load_idt(const struct desc_ptr *desc)
606{
607	static DEFINE_SPINLOCK(lock);
608	static struct trap_info traps[257];
609
610	spin_lock(&lock);
611
612	__get_cpu_var(idt_desc) = *desc;
613
614	xen_convert_trap_info(desc, traps);
615
616	xen_mc_flush();
617	if (HYPERVISOR_set_trap_table(traps))
618		BUG();
619
620	spin_unlock(&lock);
621}
622
623/* Write a GDT descriptor entry.  Ignore LDT descriptors, since
624   they're handled differently. */
625static void xen_write_gdt_entry(struct desc_struct *dt, int entry,
626				const void *desc, int type)
627{
628	preempt_disable();
629
630	switch (type) {
631	case DESC_LDT:
632	case DESC_TSS:
633		/* ignore */
634		break;
635
636	default: {
637		xmaddr_t maddr = arbitrary_virt_to_machine(&dt[entry]);
638
639		xen_mc_flush();
640		if (HYPERVISOR_update_descriptor(maddr.maddr, *(u64 *)desc))
641			BUG();
642	}
643
644	}
645
646	preempt_enable();
647}
648
649/*
650 * Version of write_gdt_entry for use at early boot-time needed to
651 * update an entry as simply as possible.
652 */
653static __init void xen_write_gdt_entry_boot(struct desc_struct *dt, int entry,
654					    const void *desc, int type)
655{
656	switch (type) {
657	case DESC_LDT:
658	case DESC_TSS:
659		/* ignore */
660		break;
661
662	default: {
663		xmaddr_t maddr = virt_to_machine(&dt[entry]);
664
665		if (HYPERVISOR_update_descriptor(maddr.maddr, *(u64 *)desc))
666			dt[entry] = *(struct desc_struct *)desc;
667	}
668
669	}
670}
671
672static void xen_load_sp0(struct tss_struct *tss,
673			 struct thread_struct *thread)
674{
675	struct multicall_space mcs = xen_mc_entry(0);
676	MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->sp0);
677	xen_mc_issue(PARAVIRT_LAZY_CPU);
678}
679
680static void xen_set_iopl_mask(unsigned mask)
681{
682	struct physdev_set_iopl set_iopl;
683
684	/* Force the change at ring 0. */
685	set_iopl.iopl = (mask == 0) ? 1 : (mask >> 12) & 3;
686	HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
687}
688
/* io_delay hook: intentionally does nothing. */
static void xen_io_delay(void)
{
	/* no delay */
}
692
693#ifdef CONFIG_X86_LOCAL_APIC
694static u32 xen_apic_read(u32 reg)
695{
696	return 0;
697}
698
699static void xen_apic_write(u32 reg, u32 val)
700{
701	/* Warn to see if there's any stray references */
702	WARN_ON(1);
703}
704
705static u64 xen_apic_icr_read(void)
706{
707	return 0;
708}
709
710static void xen_apic_icr_write(u32 low, u32 id)
711{
712	/* Warn to see if there's any stray references */
713	WARN_ON(1);
714}
715
/* APIC wait-for-ICR-idle stub: returns immediately. */
static void xen_apic_wait_icr_idle(void)
{
	/* nothing to wait for */
}
720
721static u32 xen_safe_apic_wait_icr_idle(void)
722{
723        return 0;
724}
725
726static void set_xen_basic_apic_ops(void)
727{
728	apic->read = xen_apic_read;
729	apic->write = xen_apic_write;
730	apic->icr_read = xen_apic_icr_read;
731	apic->icr_write = xen_apic_icr_write;
732	apic->wait_icr_idle = xen_apic_wait_icr_idle;
733	apic->safe_wait_icr_idle = xen_safe_apic_wait_icr_idle;
734}
735
736#endif
737
738static void xen_clts(void)
739{
740	struct multicall_space mcs;
741
742	mcs = xen_mc_entry(0);
743
744	MULTI_fpu_taskswitch(mcs.mc, 0);
745
746	xen_mc_issue(PARAVIRT_LAZY_CPU);
747}
748
749static DEFINE_PER_CPU(unsigned long, xen_cr0_value);
750
751static unsigned long xen_read_cr0(void)
752{
753	unsigned long cr0 = percpu_read(xen_cr0_value);
754
755	if (unlikely(cr0 == 0)) {
756		cr0 = native_read_cr0();
757		percpu_write(xen_cr0_value, cr0);
758	}
759
760	return cr0;
761}
762
763static void xen_write_cr0(unsigned long cr0)
764{
765	struct multicall_space mcs;
766
767	percpu_write(xen_cr0_value, cr0);
768
769	/* Only pay attention to cr0.TS; everything else is
770	   ignored. */
771	mcs = xen_mc_entry(0);
772
773	MULTI_fpu_taskswitch(mcs.mc, (cr0 & X86_CR0_TS) != 0);
774
775	xen_mc_issue(PARAVIRT_LAZY_CPU);
776}
777
778static void xen_write_cr4(unsigned long cr4)
779{
780	cr4 &= ~X86_CR4_PGE;
781	cr4 &= ~X86_CR4_PSE;
782
783	native_write_cr4(cr4);
784}
785
/*
 * write_msr hook.
 *
 * @msr:  MSR index
 * @low:  low 32 bits of the value
 * @high: high 32 bits of the value
 *
 * On 64-bit, writes to the FS/GS base MSRs are turned into
 * HYPERVISOR_set_segment_base() calls (-EIO if the hypercall fails).
 * The fast-syscall MSRs are accepted and ignored.  Everything else goes
 * through native_write_msr_safe(), whose result is returned.
 *
 * Returns 0 on success or a negative error.
 */
static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
{
	int ret;

	ret = 0;

	switch (msr) {
#ifdef CONFIG_X86_64
		/* Declarations only; no code runs before the first case label. */
		unsigned which;
		u64 base;

	/*
	 * Note the mapping: MSR_KERNEL_GS_BASE selects the *user* GS base
	 * and MSR_GS_BASE the *kernel* GS base.
	 */
	case MSR_FS_BASE:		which = SEGBASE_FS; goto set;
	case MSR_KERNEL_GS_BASE:	which = SEGBASE_GS_USER; goto set;
	case MSR_GS_BASE:		which = SEGBASE_GS_KERNEL; goto set;

	set:
		base = ((u64)high << 32) | low;
		if (HYPERVISOR_set_segment_base(which, base) != 0)
			ret = -EIO;
		break;
#endif

	case MSR_STAR:
	case MSR_CSTAR:
	case MSR_LSTAR:
	case MSR_SYSCALL_MASK:
	case MSR_IA32_SYSENTER_CS:
	case MSR_IA32_SYSENTER_ESP:
	case MSR_IA32_SYSENTER_EIP:
		/* Fast syscall setup is all done in hypercalls, so
		   these are all ignored.  Stub them out here to stop
		   Xen console noise. */
		break;

	default:
		ret = native_write_msr_safe(msr, low, high);
	}

	return ret;
}
826
827void xen_setup_shared_info(void)
828{
829	if (!xen_feature(XENFEAT_auto_translated_physmap)) {
830		set_fixmap(FIX_PARAVIRT_BOOTMAP,
831			   xen_start_info->shared_info);
832
833		HYPERVISOR_shared_info =
834			(struct shared_info *)fix_to_virt(FIX_PARAVIRT_BOOTMAP);
835	} else
836		HYPERVISOR_shared_info =
837			(struct shared_info *)__va(xen_start_info->shared_info);
838
839#ifndef CONFIG_SMP
840	/* In UP this is as good a place as any to set up shared info */
841	xen_setup_vcpu_info_placement();
842#endif
843
844	xen_setup_mfn_list_list();
845}
846
847/* This is called once we have the cpu_possible_map */
848void xen_setup_vcpu_info_placement(void)
849{
850	int cpu;
851
852	for_each_possible_cpu(cpu)
853		xen_vcpu_setup(cpu);
854
855	/* xen_vcpu_setup managed to place the vcpu_info within the
856	   percpu area for all cpus, so make use of it */
857	if (have_vcpu_info_placement) {
858		printk(KERN_INFO "Xen: using vcpu_info placement\n");
859
860		pv_irq_ops.save_fl = __PV_IS_CALLEE_SAVE(xen_save_fl_direct);
861		pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(xen_restore_fl_direct);
862		pv_irq_ops.irq_disable = __PV_IS_CALLEE_SAVE(xen_irq_disable_direct);
863		pv_irq_ops.irq_enable = __PV_IS_CALLEE_SAVE(xen_irq_enable_direct);
864		pv_mmu_ops.read_cr2 = xen_read_cr2_direct;
865	}
866}
867
/*
 * pv_init_ops.patch hook: produce the replacement instructions for one
 * paravirt call site.
 *
 * @type:     paravirt patch type identifying the operation
 * @clobbers: passed through to paravirt_patch_default()
 * @insnbuf:  buffer receiving the replacement instructions
 * @addr:     address of the call site being patched
 * @len:      space available at the call site
 *
 * For the four irq-flag operations, and only while vcpu_info placement
 * is in use, the body of the matching xen_*_direct routine is copied
 * inline if it fits, and the word at that routine's reloc marker is
 * adjusted for the move.  Every other case falls back to
 * paravirt_patch_default().
 *
 * Returns whatever the patch helper used returns.
 */
static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,
			  unsigned long addr, unsigned len)
{
	char *start, *end, *reloc;
	unsigned ret;

	start = end = reloc = NULL;

/*
 * Emits a case label for op.x that, when placement is available, records
 * the bounds and reloc marker of xen_<x>_direct and then jumps to the
 * shared patch_site code.
 */
#define SITE(op, x)							\
	case PARAVIRT_PATCH(op.x):					\
	if (have_vcpu_info_placement) {					\
		start = (char *)xen_##x##_direct;			\
		end = xen_##x##_direct_end;				\
		reloc = xen_##x##_direct_reloc;				\
	}								\
	goto patch_site

	switch (type) {
		SITE(pv_irq_ops, irq_enable);
		SITE(pv_irq_ops, irq_disable);
		SITE(pv_irq_ops, save_fl);
		SITE(pv_irq_ops, restore_fl);
#undef SITE

	patch_site:
		/* No inline body recorded, or it does not fit: use the default. */
		if (start == NULL || (end-start) > len)
			goto default_patch;

		ret = paravirt_patch_insns(insnbuf, len, start, end);

		/* Note: because reloc is assigned from something that
		   appears to be an array, gcc assumes it's non-null,
		   but doesn't know its relationship with start and
		   end. */
		if (reloc > start && reloc < end) {
			int reloc_off = reloc - start;
			long *relocp = (long *)(insnbuf + reloc_off);
			/* Distance the code moved from its original home to the call site. */
			long delta = start - (char *)addr;

			*relocp += delta;
		}
		break;

	default_patch:
	default:
		ret = paravirt_patch_default(type, clobbers, insnbuf,
					     addr, len);
		break;
	}

	return ret;
}
920
/*
 * pv_info template; copied into the live pv_info by xen_start_kernel()
 * and init_hvm_pv_info().
 * NOTE(review): const data tagged __initdata; kernel convention for
 * const init data is __initconst - confirm before changing.
 */
static const struct pv_info xen_info __initdata = {
	.paravirt_enabled = 1,
	.shared_kernel_pmd = 0,

	.name = "Xen",
};
927
/* pv_init_ops template; copied into the live pv_init_ops by xen_start_kernel(). */
static const struct pv_init_ops xen_init_ops __initdata = {
	.patch = xen_patch,
};
931
/*
 * pv_cpu_ops template; copied into the live pv_cpu_ops by
 * xen_start_kernel().  Entries are a mix of Xen-specific hooks defined
 * in this file, native_* helpers where no interception is done, and
 * paravirt_nop where the operation is not needed under Xen.
 */
static const struct pv_cpu_ops xen_cpu_ops __initdata = {
	.cpuid = xen_cpuid,

	.set_debugreg = xen_set_debugreg,
	.get_debugreg = xen_get_debugreg,

	.clts = xen_clts,

	.read_cr0 = xen_read_cr0,
	.write_cr0 = xen_write_cr0,

	.read_cr4 = native_read_cr4,
	.read_cr4_safe = native_read_cr4_safe,
	.write_cr4 = xen_write_cr4,

	.wbinvd = native_wbinvd,

	/* MSR reads use the native "safe" variant; writes are filtered. */
	.read_msr = native_read_msr_safe,
	.write_msr = xen_write_msr_safe,
	.read_tsc = native_read_tsc,
	.read_pmc = native_read_pmc,

	.iret = xen_iret,
	.irq_enable_sysexit = xen_sysexit,
#ifdef CONFIG_X86_64
	.usergs_sysret32 = xen_sysret32,
	.usergs_sysret64 = xen_sysret64,
#endif

	.load_tr_desc = paravirt_nop,
	.set_ldt = xen_set_ldt,
	.load_gdt = xen_load_gdt,
	.load_idt = xen_load_idt,
	.load_tls = xen_load_tls,
#ifdef CONFIG_X86_64
	.load_gs_index = xen_load_gs_index,
#endif

	.alloc_ldt = xen_alloc_ldt,
	.free_ldt = xen_free_ldt,

	.store_gdt = native_store_gdt,
	.store_idt = native_store_idt,
	.store_tr = xen_store_tr,

	.write_ldt_entry = xen_write_ldt_entry,
	.write_gdt_entry = xen_write_gdt_entry,
	.write_idt_entry = xen_write_idt_entry,
	.load_sp0 = xen_load_sp0,

	.set_iopl_mask = xen_set_iopl_mask,
	.io_delay = xen_io_delay,

	/* Xen takes care of %gs when switching to usermode for us */
	.swapgs = paravirt_nop,

	.start_context_switch = paravirt_start_context_switch,
	.end_context_switch = xen_end_context_switch,
};
991
/*
 * pv_apic_ops template; copied into the live pv_apic_ops by
 * xen_start_kernel().  With a local APIC configured the startup IPI
 * hook is a no-op; otherwise the structure is left empty.
 */
static const struct pv_apic_ops xen_apic_ops __initdata = {
#ifdef CONFIG_X86_LOCAL_APIC
	.startup_ipi_hook = paravirt_nop,
#endif
};
997
998static void xen_reboot(int reason)
999{
1000	struct sched_shutdown r = { .reason = reason };
1001
1002	if (HYPERVISOR_sched_op(SCHEDOP_shutdown, &r))
1003		BUG();
1004}
1005
1006static void xen_restart(char *msg)
1007{
1008	xen_reboot(SHUTDOWN_reboot);
1009}
1010
1011static void xen_emergency_restart(void)
1012{
1013	xen_reboot(SHUTDOWN_reboot);
1014}
1015
1016static void xen_machine_halt(void)
1017{
1018	xen_reboot(SHUTDOWN_poweroff);
1019}
1020
1021static void xen_crash_shutdown(struct pt_regs *regs)
1022{
1023	xen_reboot(SHUTDOWN_crash);
1024}
1025
1026static int
1027xen_panic_event(struct notifier_block *this, unsigned long event, void *ptr)
1028{
1029	xen_reboot(SHUTDOWN_crash);
1030	return NOTIFY_DONE;
1031}
1032
/* Notifier block registered on panic_notifier_list by xen_panic_handler_init(). */
static struct notifier_block xen_panic_block = {
	.notifier_call= xen_panic_event,
};
1036
1037int xen_panic_handler_init(void)
1038{
1039	atomic_notifier_chain_register(&panic_notifier_list, &xen_panic_block);
1040	return 0;
1041}
1042
/*
 * machine_ops template; copied into the live machine_ops by
 * xen_start_kernel().  halt, power_off and shutdown all map to a
 * hypervisor poweroff request.
 * NOTE(review): const data tagged __initdata; kernel convention for
 * const init data is __initconst - confirm before changing.
 */
static const struct machine_ops __initdata xen_machine_ops = {
	.restart = xen_restart,
	.halt = xen_machine_halt,
	.power_off = xen_machine_halt,
	.shutdown = xen_machine_halt,
	.crash_shutdown = xen_crash_shutdown,
	.emergency_restart = xen_emergency_restart,
};
1051
/*
 * Set up the GDT and segment registers for -fstack-protector.  Until
 * we do this, we have to be careful not to call any stack-protected
 * function, which is most of the kernel.
 *
 * The boot-time GDT hooks are installed only for the duration of the
 * canary-segment setup and GDT switch on CPU 0; the regular hooks are
 * put back before returning.
 */
static void __init xen_setup_stackprotector(void)
{
	/* Temporarily use the early-boot variants of the GDT hooks. */
	pv_cpu_ops.write_gdt_entry = xen_write_gdt_entry_boot;
	pv_cpu_ops.load_gdt = xen_load_gdt_boot;

	setup_stack_canary_segment(0);
	switch_to_new_gdt(0);

	/* Back to the normal hooks for everything that follows. */
	pv_cpu_ops.write_gdt_entry = xen_write_gdt_entry;
	pv_cpu_ops.load_gdt = xen_load_gdt;
}
1068
/*
 * First C function to be called on Xen boot.
 *
 * Returns immediately when xen_start_info is NULL.  Otherwise marks the
 * domain as PV, installs the Xen paravirt operation tables and x86_init
 * hooks, prepares MMU/GDT/CPUID/APIC state, builds the kernel
 * pagetable, fills in boot_params and finally enters the generic
 * architecture start-up code.  The statement order matters: much of the
 * kernel cannot be called until the steps before it have run.
 */
asmlinkage void __init xen_start_kernel(void)
{
	pgd_t *pgd;

	if (!xen_start_info)
		return;

	xen_domain_type = XEN_PV_DOMAIN;

	/* Install Xen paravirt ops */
	pv_info = xen_info;
	pv_init_ops = xen_init_ops;
	pv_cpu_ops = xen_cpu_ops;
	pv_apic_ops = xen_apic_ops;

	x86_init.resources.memory_setup = xen_memory_setup;
	x86_init.oem.arch_setup = xen_arch_setup;
	x86_init.oem.banner = xen_banner;

	xen_init_time_ops();

	/*
	 * Set up some pagetable state before starting to set any ptes.
	 */

	xen_init_mmu_ops();

	/* Prevent unwanted bits from being set in PTEs. */
	__supported_pte_mask &= ~_PAGE_GLOBAL;
	if (!xen_initial_domain())
		__supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD);

	__supported_pte_mask |= _PAGE_IOMAP;

	/*
	 * Prevent page tables from being allocated in highmem, even
	 * if CONFIG_HIGHPTE is enabled.
	 */
	__userpte_alloc_gfp &= ~__GFP_HIGHMEM;

	/* Work out if we support NX */
	x86_configure_nx();

	xen_setup_features();

	/* Get mfn list */
	if (!xen_feature(XENFEAT_auto_translated_physmap))
		xen_build_dynamic_phys_to_machine();

	/*
	 * Set up kernel GDT and segment registers, mainly so that
	 * -fstack-protector code can be executed.
	 */
	xen_setup_stackprotector();

	xen_init_irq_ops();
	xen_init_cpuid_mask();

#ifdef CONFIG_X86_LOCAL_APIC
	/*
	 * set up the basic apic ops.
	 */
	set_xen_basic_apic_ops();
#endif

	/* Use the AD-preserving PTE protection-change hooks when Xen offers them. */
	if (xen_feature(XENFEAT_mmu_pt_update_preserve_ad)) {
		pv_mmu_ops.ptep_modify_prot_start = xen_ptep_modify_prot_start;
		pv_mmu_ops.ptep_modify_prot_commit = xen_ptep_modify_prot_commit;
	}

	machine_ops = xen_machine_ops;

	/*
	 * The only reliable way to retain the initial address of the
	 * percpu gdt_page is to remember it here, so we can go and
	 * mark it RW later, when the initial percpu area is freed.
	 */
	xen_initial_gdt = &per_cpu(gdt_page, 0);

	xen_smp_init();

	pgd = (pgd_t *)xen_start_info->pt_base;

	/*
	 * NOTE(review): these two mask adjustments repeat, unchanged, the
	 * ones made right after xen_init_mmu_ops() above; they look
	 * redundant - confirm nothing in between can undo them before
	 * removing.
	 */
	if (!xen_initial_domain())
		__supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD);

	__supported_pte_mask |= _PAGE_IOMAP;
	/* Don't do the full vcpu_info placement stuff until we have a
	   possible map and a non-dummy shared_info. */
	per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0];

	local_irq_disable();
	early_boot_irqs_off();

	xen_raw_console_write("mapping kernel into physical memory\n");
	pgd = xen_setup_kernel_pagetable(pgd, xen_start_info->nr_pages);

	init_mm.pgd = pgd;

	/* keep using Xen gdt for now; no urgent need to change it */

	/*
	 * Kernel privilege level: ring 1 on 32-bit unless Xen offers
	 * supervisor-mode kernels, ring 0 on 64-bit.
	 */
#ifdef CONFIG_X86_32
	pv_info.kernel_rpl = 1;
	if (xen_feature(XENFEAT_supervisor_mode_kernel))
		pv_info.kernel_rpl = 0;
#else
	pv_info.kernel_rpl = 0;
#endif

	/* set the limit of our address space */
	xen_reserve_top();

#ifdef CONFIG_X86_32
	/* set up basic CPUID stuff */
	cpu_detect(&new_cpu_data);
	new_cpu_data.hard_math = 1;
	new_cpu_data.wp_works_ok = 1;
	new_cpu_data.x86_capability[0] = cpuid_edx(1);
#endif

	/* Poke various useful things into boot_params */
	boot_params.hdr.type_of_loader = (9 << 4) | 0;
	/* mod_start/mod_len describe the ramdisk, if one was supplied. */
	boot_params.hdr.ramdisk_image = xen_start_info->mod_start
		? __pa(xen_start_info->mod_start) : 0;
	boot_params.hdr.ramdisk_size = xen_start_info->mod_len;
	boot_params.hdr.cmd_line_ptr = __pa(xen_start_info->cmd_line);

	if (!xen_initial_domain()) {
		add_preferred_console("xenboot", 0, NULL);
		add_preferred_console("tty", 0, NULL);
		add_preferred_console("hvc", 0, NULL);
	} else {
		/* Make sure ACS will be enabled */
		pci_request_acs();
	}


	xen_raw_console_write("about to get started...\n");

	xen_setup_runstate_info(0);

	/* Start the world */
#ifdef CONFIG_X86_32
	i386_start_kernel();
#else
	x86_64_start_reservations((char *)__pa_symbol(&boot_params));
#endif
}
1218
/*
 * Locate the Xen hypervisor CPUID leaves.
 *
 * Scans the leaves 0x40000000..0x4000ff00 in steps of 0x100 for one
 * whose EBX:ECX:EDX spell "XenVMMXenVMM" and whose EAX shows at least
 * two further leaves above the base.
 *
 * Returns the base leaf, or 0 if no Xen signature was found.
 */
static uint32_t xen_cpuid_base(void)
{
	uint32_t base, eax, ebx, ecx, edx;
	char signature[13];

	for (base = 0x40000000; base < 0x40010000; base += 0x100) {
		cpuid(base, &eax, &ebx, &ecx, &edx);

		/*
		 * Assemble the signature bytewise; storing through
		 * uint32_t pointers into a char buffer would violate
		 * aliasing rules and assume the buffer is 4-byte aligned.
		 */
		memcpy(signature + 0, &ebx, sizeof(ebx));
		memcpy(signature + 4, &ecx, sizeof(ecx));
		memcpy(signature + 8, &edx, sizeof(edx));
		signature[12] = 0;

		if (!strcmp("XenVMMXenVMM", signature) && ((eax - base) >= 2))
			return base;
	}

	return 0;
}
1237
1238static int init_hvm_pv_info(int *major, int *minor)
1239{
1240	uint32_t eax, ebx, ecx, edx, pages, msr, base;
1241	u64 pfn;
1242
1243	base = xen_cpuid_base();
1244	cpuid(base + 1, &eax, &ebx, &ecx, &edx);
1245
1246	*major = eax >> 16;
1247	*minor = eax & 0xffff;
1248	printk(KERN_INFO "Xen version %d.%d.\n", *major, *minor);
1249
1250	cpuid(base + 2, &pages, &msr, &ecx, &edx);
1251
1252	pfn = __pa(hypercall_page);
1253	wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32));
1254
1255	xen_setup_features();
1256
1257	pv_info = xen_info;
1258	pv_info.kernel_rpl = 0;
1259
1260	xen_domain_type = XEN_HVM_DOMAIN;
1261
1262	return 0;
1263}
1264
1265void xen_hvm_init_shared_info(void)
1266{
1267	int cpu;
1268	struct xen_add_to_physmap xatp;
1269	static struct shared_info *shared_info_page = 0;
1270
1271	if (!shared_info_page)
1272		shared_info_page = (struct shared_info *)
1273			extend_brk(PAGE_SIZE, PAGE_SIZE);
1274	xatp.domid = DOMID_SELF;
1275	xatp.idx = 0;
1276	xatp.space = XENMAPSPACE_shared_info;
1277	xatp.gpfn = __pa(shared_info_page) >> PAGE_SHIFT;
1278	if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp))
1279		BUG();
1280
1281	HYPERVISOR_shared_info = (struct shared_info *)shared_info_page;
1282
1283	/* xen_vcpu is a pointer to the vcpu_info struct in the shared_info
1284	 * page, we use it in the event channel upcall and in some pvclock
1285	 * related functions. We don't need the vcpu_info placement
1286	 * optimizations because we don't use any pv_mmu or pv_irq op on
1287	 * HVM.
1288	 * When xen_hvm_init_shared_info is run at boot time only vcpu 0 is
1289	 * online but xen_hvm_init_shared_info is run at resume time too and
1290	 * in that case multiple vcpus might be online. */
1291	for_each_online_cpu(cpu) {
1292		per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];
1293	}
1294}
1295
1296#ifdef CONFIG_XEN_PVHVM
1297static int __cpuinit xen_hvm_cpu_notify(struct notifier_block *self,
1298				    unsigned long action, void *hcpu)
1299{
1300	int cpu = (long)hcpu;
1301	switch (action) {
1302	case CPU_UP_PREPARE:
1303		per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];
1304		break;
1305	default:
1306		break;
1307	}
1308	return NOTIFY_OK;
1309}
1310
/* Hotplug notifier block registered by xen_hvm_guest_init(). */
static struct notifier_block __cpuinitdata xen_hvm_cpu_notifier = {
	.notifier_call	= xen_hvm_cpu_notify,
};
1314
1315static void __init xen_hvm_guest_init(void)
1316{
1317	int r;
1318	int major, minor;
1319
1320	r = init_hvm_pv_info(&major, &minor);
1321	if (r < 0)
1322		return;
1323
1324	xen_hvm_init_shared_info();
1325
1326	if (xen_feature(XENFEAT_hvm_callback_vector))
1327		xen_have_vector_callback = 1;
1328	register_cpu_notifier(&xen_hvm_cpu_notifier);
1329	xen_unplug_emulated_devices();
1330	have_vcpu_info_placement = 0;
1331	x86_init.irqs.intr_init = xen_init_IRQ;
1332	xen_hvm_init_time_ops();
1333	xen_hvm_init_mmu_ops();
1334}
1335
1336static bool __init xen_hvm_platform(void)
1337{
1338	if (xen_pv_domain())
1339		return false;
1340
1341	if (!xen_cpuid_base())
1342		return false;
1343
1344	return true;
1345}
1346
/*
 * Hypervisor descriptor for Xen HVM guests: names the platform and
 * supplies its detection and platform-init callbacks.
 */
const __refconst struct hypervisor_x86 x86_hyper_xen_hvm = {
	.name			= "Xen HVM",
	.detect			= xen_hvm_platform,
	.init_platform		= xen_hvm_guest_init,
};
EXPORT_SYMBOL(x86_hyper_xen_hvm);
1353#endif
1354