/*
 * Core of Xen paravirt_ops implementation.
 *
 * This file contains the xen_paravirt_ops structure itself, and the
 * implementations for:
 * - privileged instructions
 * - interrupt flags
 * - segment operations
 * - booting and setup
 *
 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
 */

#include <linux/cpu.h>
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/smp.h>
#include <linux/preempt.h>
#include <linux/hardirq.h>
#include <linux/percpu.h>
#include <linux/delay.h>
#include <linux/start_kernel.h>
#include <linux/sched.h>
#include <linux/kprobes.h>
#include <linux/bootmem.h>
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/page-flags.h>
#include <linux/highmem.h>
#include <linux/console.h>
#include <linux/pci.h>
#include <linux/gfp.h>

#include <xen/xen.h>
#include <xen/interface/xen.h>
#include <xen/interface/version.h>
#include <xen/interface/physdev.h>
#include <xen/interface/vcpu.h>
#include <xen/interface/memory.h>
#include <xen/features.h>
#include <xen/page.h>
#include <xen/hvm.h>
#include <xen/hvc-console.h>

#include <asm/paravirt.h>
#include <asm/apic.h>
#include <asm/page.h>
#include <asm/xen/hypercall.h>
#include <asm/xen/hypervisor.h>
#include <asm/fixmap.h>
#include <asm/processor.h>
#include <asm/proto.h>
#include <asm/msr-index.h>
#include <asm/traps.h>
#include <asm/setup.h>
#include <asm/desc.h>
#include <asm/pgalloc.h>
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include <asm/reboot.h>
#include <asm/stackprotector.h>
#include <asm/hypervisor.h>

#include "xen-ops.h"
#include "mmu.h"
#include "multicalls.h"

EXPORT_SYMBOL_GPL(hypercall_page);

DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu);
DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info);

enum xen_domain_type xen_domain_type = XEN_NATIVE;
EXPORT_SYMBOL_GPL(xen_domain_type);

struct start_info *xen_start_info;
EXPORT_SYMBOL_GPL(xen_start_info);

struct shared_info xen_dummy_shared_info;

void *xen_initial_gdt;

RESERVE_BRK(shared_info_page_brk, PAGE_SIZE);
__read_mostly int xen_have_vector_callback;
EXPORT_SYMBOL_GPL(xen_have_vector_callback);

/*
 * Point at some empty memory to start with.  We map the real shared_info
 * page as soon as fixmap is up and running.
 */
struct shared_info *HYPERVISOR_shared_info = (void *)&xen_dummy_shared_info;

/*
 * Flag to determine whether vcpu info placement is available on all
 * VCPUs.  We assume it is to start with, and then set it to zero on
 * the first failure.  This is because it can succeed on some VCPUs
 * and not others, since it can involve hypervisor memory allocation,
 * or because the guest failed to guarantee all the appropriate
 * constraints on all VCPUs (i.e. the buffer can't cross a page
 * boundary).
 *
 * Note that any particular CPU may be using a placed vcpu structure,
 * but we can only optimise if they all are.
 *
 * 0: not available, 1: available
 */
static int have_vcpu_info_placement = 1;

static void clamp_max_cpus(void)
{
#ifdef CONFIG_SMP
	if (setup_max_cpus > MAX_VIRT_CPUS)
		setup_max_cpus = MAX_VIRT_CPUS;
#endif
}

static void xen_vcpu_setup(int cpu)
{
	struct vcpu_register_vcpu_info info;
	int err;
	struct vcpu_info *vcpup;

	BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);

	if (cpu < MAX_VIRT_CPUS)
		per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];

	if (!have_vcpu_info_placement) {
		if (cpu >= MAX_VIRT_CPUS)
			clamp_max_cpus();
		return;
	}

	vcpup = &per_cpu(xen_vcpu_info, cpu);
	info.mfn = arbitrary_virt_to_mfn(vcpup);
	info.offset = offset_in_page(vcpup);

	printk(KERN_DEBUG "trying to map vcpu_info %d at %p, mfn %llx, offset %d\n",
	       cpu, vcpup, info.mfn, info.offset);

	/* Check to see if the hypervisor will put the vcpu_info
	   structure where we want it, which allows direct access via
	   a percpu variable. */
	err = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_info, cpu, &info);

	if (err) {
		printk(KERN_DEBUG "register_vcpu_info failed: err=%d\n", err);
		have_vcpu_info_placement = 0;
		clamp_max_cpus();
	} else {
		/* This cpu is using the registered vcpu info, even if
		   later ones fail to. */
		per_cpu(xen_vcpu, cpu) = vcpup;

		printk(KERN_DEBUG "cpu %d using vcpu_info at %p\n",
		       cpu, vcpup);
	}
}

/*
 * On restore, set the vcpu placement up again.
 * If it fails, then we're in a bad state, since
 * we can't back out from using it...
 */
void xen_vcpu_restore(void)
{
	int cpu;

	for_each_online_cpu(cpu) {
		bool other_cpu = (cpu != smp_processor_id());

		if (other_cpu &&
		    HYPERVISOR_vcpu_op(VCPUOP_down, cpu, NULL))
			BUG();

		xen_setup_runstate_info(cpu);

		if (have_vcpu_info_placement)
			xen_vcpu_setup(cpu);

		if (other_cpu &&
		    HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL))
			BUG();
	}
}

static void __init xen_banner(void)
{
	unsigned version = HYPERVISOR_xen_version(XENVER_version, NULL);
	struct xen_extraversion extra;
	HYPERVISOR_xen_version(XENVER_extraversion, &extra);

	printk(KERN_INFO "Booting paravirtualized kernel on %s\n",
	       pv_info.name);
	printk(KERN_INFO "Xen version: %d.%d%s%s\n",
	       version >> 16, version & 0xffff, extra.extraversion,
	       xen_feature(XENFEAT_mmu_pt_update_preserve_ad) ? " (preserve-AD)" : "");
}

static __read_mostly unsigned int cpuid_leaf1_edx_mask = ~0;
static __read_mostly unsigned int cpuid_leaf1_ecx_mask = ~0;

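/*
 * The XEN_EMULATE_PREFIX below forces the cpuid instruction to trap
 * to the hypervisor, so Xen sees (and can filter) every invocation;
 * the masks set up above are then applied on top of whatever the
 * hypervisor returns.
 */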
static void xen_cpuid(unsigned int *ax, unsigned int *bx,
		      unsigned int *cx, unsigned int *dx)
{
	unsigned maskebx = ~0;
	unsigned maskecx = ~0;
	unsigned maskedx = ~0;

	/*
	 * Mask out inconvenient features, to try and disable as many
	 * unsupported kernel subsystems as possible.
	 */
	switch (*ax) {
	case 1:
		maskecx = cpuid_leaf1_ecx_mask;
		maskedx = cpuid_leaf1_edx_mask;
		break;

	case 0xb:
		/* Suppress extended topology stuff */
		maskebx = 0;
		break;
	}

	asm(XEN_EMULATE_PREFIX "cpuid"
		: "=a" (*ax),
		  "=b" (*bx),
		  "=c" (*cx),
		  "=d" (*dx)
		: "0" (*ax), "2" (*cx));

	*bx &= maskebx;
	*cx &= maskecx;
	*dx &= maskedx;
}

static __init void xen_init_cpuid_mask(void)
{
	unsigned int ax, bx, cx, dx;

	cpuid_leaf1_edx_mask =
		~((1 << X86_FEATURE_MCE)  |  /* disable MCE */
		  (1 << X86_FEATURE_MCA)  |  /* disable MCA */
		  (1 << X86_FEATURE_ACC));   /* thermal monitoring */

	if (!xen_initial_domain())
		cpuid_leaf1_edx_mask &=
			~((1 << X86_FEATURE_APIC) |  /* disable local APIC */
			  (1 << X86_FEATURE_ACPI));  /* disable ACPI */

	ax = 1;
	cx = 0;
	xen_cpuid(&ax, &bx, &cx, &dx);

	/* cpuid claims we support xsave; try enabling it to see what happens */
	if (cx & (1 << (X86_FEATURE_XSAVE % 32))) {
		unsigned long cr4;

		set_in_cr4(X86_CR4_OSXSAVE);

		cr4 = read_cr4();

		if ((cr4 & X86_CR4_OSXSAVE) == 0)
			cpuid_leaf1_ecx_mask &= ~(1 << (X86_FEATURE_XSAVE % 32));

		clear_in_cr4(X86_CR4_OSXSAVE);
	}
}

static void xen_set_debugreg(int reg, unsigned long val)
{
	HYPERVISOR_set_debugreg(reg, val);
}

static unsigned long xen_get_debugreg(int reg)
{
	return HYPERVISOR_get_debugreg(reg);
}

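/*
 * Flush any batched hypercalls from the outgoing task before the
 * context switch is considered complete.
 */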
static void xen_end_context_switch(struct task_struct *next)
{
	xen_mc_flush();
	paravirt_end_context_switch(next);
}

static unsigned long xen_store_tr(void)
{
	return 0;
}

/*
 * Set the page permissions for a particular virtual address.  If the
 * address is a vmalloc mapping (or other non-linear mapping), then
 * find the linear mapping of the page and also set its protections to
 * match.
 */
static void set_aliased_prot(void *v, pgprot_t prot)
{
	int level;
	pte_t *ptep;
	pte_t pte;
	unsigned long pfn;
	struct page *page;

	ptep = lookup_address((unsigned long)v, &level);
	BUG_ON(ptep == NULL);

	pfn = pte_pfn(*ptep);
	page = pfn_to_page(pfn);

	pte = pfn_pte(pfn, prot);

	if (HYPERVISOR_update_va_mapping((unsigned long)v, pte, 0))
		BUG();

	if (!PageHighMem(page)) {
		void *av = __va(PFN_PHYS(pfn));

		if (av != v)
			if (HYPERVISOR_update_va_mapping((unsigned long)av, pte, 0))
				BUG();
	} else
		kmap_flush_unused();
}

static void xen_alloc_ldt(struct desc_struct *ldt, unsigned entries)
{
	const unsigned entries_per_page = PAGE_SIZE / LDT_ENTRY_SIZE;
	int i;

	for (i = 0; i < entries; i += entries_per_page)
		set_aliased_prot(ldt + i, PAGE_KERNEL_RO);
}

static void xen_free_ldt(struct desc_struct *ldt, unsigned entries)
{
	const unsigned entries_per_page = PAGE_SIZE / LDT_ENTRY_SIZE;
	int i;

	for (i = 0; i < entries; i += entries_per_page)
		set_aliased_prot(ldt + i, PAGE_KERNEL);
}

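/*
 * Point Xen at a new LDT.  The update goes through the multicall
 * queue so it can be batched with other lazy-CPU operations.
 */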
static void xen_set_ldt(const void *addr, unsigned entries)
{
	struct mmuext_op *op;
	struct multicall_space mcs = xen_mc_entry(sizeof(*op));

	op = mcs.args;
	op->cmd = MMUEXT_SET_LDT;
	op->arg1.linear_addr = (unsigned long)addr;
	op->arg2.nr_ents = entries;

	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);

	xen_mc_issue(PARAVIRT_LAZY_CPU);
}

static void xen_load_gdt(const struct desc_ptr *dtr)
{
	unsigned long va = dtr->address;
	unsigned int size = dtr->size + 1;
	unsigned pages = (size + PAGE_SIZE - 1) / PAGE_SIZE;
	unsigned long frames[pages];
	int f;

	/*
	 * A GDT can be up to 64k in size, which corresponds to 8192
	 * 8-byte entries, or 16 4k pages.
	 */

	BUG_ON(size > 65536);
	BUG_ON(va & ~PAGE_MASK);

	for (f = 0; va < dtr->address + size; va += PAGE_SIZE, f++) {
		int level;
		pte_t *ptep;
		unsigned long pfn, mfn;
		void *virt;

		/*
		 * The GDT is per-cpu and is in the percpu data area.
		 * That can be virtually mapped, so we need to do a
		 * page-walk to get the underlying MFN for the
		 * hypercall.  The page can also be in the kernel's
		 * linear range, so we need to RO that mapping too.
		 */
		ptep = lookup_address(va, &level);
		BUG_ON(ptep == NULL);

		pfn = pte_pfn(*ptep);
		mfn = pfn_to_mfn(pfn);
		virt = __va(PFN_PHYS(pfn));

		frames[f] = mfn;

		make_lowmem_page_readonly((void *)va);
		make_lowmem_page_readonly(virt);
	}

	if (HYPERVISOR_set_gdt(frames, size / sizeof(struct desc_struct)))
		BUG();
}

/*
 * load_gdt for early boot, when the gdt is only mapped once
 */
static __init void xen_load_gdt_boot(const struct desc_ptr *dtr)
{
	unsigned long va = dtr->address;
	unsigned int size = dtr->size + 1;
	unsigned pages = (size + PAGE_SIZE - 1) / PAGE_SIZE;
	unsigned long frames[pages];
	int f;

	/*
	 * A GDT can be up to 64k in size, which corresponds to 8192
	 * 8-byte entries, or 16 4k pages.
	 */

	BUG_ON(size > 65536);
	BUG_ON(va & ~PAGE_MASK);

	for (f = 0; va < dtr->address + size; va += PAGE_SIZE, f++) {
		pte_t pte;
		unsigned long pfn, mfn;

		pfn = virt_to_pfn(va);
		mfn = pfn_to_mfn(pfn);

		pte = pfn_pte(pfn, PAGE_KERNEL_RO);

		if (HYPERVISOR_update_va_mapping((unsigned long)va, pte, 0))
			BUG();

		frames[f] = mfn;
	}

	if (HYPERVISOR_set_gdt(frames, size / sizeof(struct desc_struct)))
		BUG();
}

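/*
 * TLS descriptor updates are queued as multicalls so that all three
 * GDT slots can be handed to the hypervisor in one batch.
 */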
static void load_TLS_descriptor(struct thread_struct *t,
				unsigned int cpu, unsigned int i)
{
	struct desc_struct *gdt = get_cpu_gdt_table(cpu);
	xmaddr_t maddr = arbitrary_virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]);
	struct multicall_space mc = __xen_mc_entry(0);

	MULTI_update_descriptor(mc.mc, maddr.maddr, t->tls_array[i]);
}

static void xen_load_tls(struct thread_struct *t, unsigned int cpu)
{
	if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU) {
#ifdef CONFIG_X86_32
		lazy_load_gs(0);
#else
		loadsegment(fs, 0);
#endif
	}

	xen_mc_batch();

	load_TLS_descriptor(t, cpu, 0);
	load_TLS_descriptor(t, cpu, 1);
	load_TLS_descriptor(t, cpu, 2);

	xen_mc_issue(PARAVIRT_LAZY_CPU);
}

#ifdef CONFIG_X86_64
static void xen_load_gs_index(unsigned int idx)
{
	if (HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, idx))
		BUG();
}
#endif

static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum,
				const void *ptr)
{
	xmaddr_t mach_lp = arbitrary_virt_to_machine(&dt[entrynum]);
	u64 entry = *(u64 *)ptr;

	preempt_disable();

	xen_mc_flush();
	if (HYPERVISOR_update_descriptor(mach_lp.maddr, entry))
		BUG();

	preempt_enable();
}

static int cvt_gate_to_trap(int vector, const gate_desc *val,
			    struct trap_info *info)
{
	unsigned long addr;

	if (val->type != GATE_TRAP && val->type != GATE_INTERRUPT)
		return 0;

	info->vector = vector;

	addr = gate_offset(*val);
#ifdef CONFIG_X86_64
	/*
	 * Look for known traps using IST, and substitute them
	 * appropriately.  The debugger ones are the only ones we care
	 * about.  Xen will handle faults like double_fault and
	 * machine_check, so we should never see them.  Warn if
	 * there's an unexpected IST-using fault handler.
	 */
	if (addr == (unsigned long)debug)
		addr = (unsigned long)xen_debug;
	else if (addr == (unsigned long)int3)
		addr = (unsigned long)xen_int3;
	else if (addr == (unsigned long)stack_segment)
		addr = (unsigned long)xen_stack_segment;
	else if (addr == (unsigned long)double_fault ||
		 addr == (unsigned long)nmi) {
		/* Don't need to handle these */
		return 0;
#ifdef CONFIG_X86_MCE
	} else if (addr == (unsigned long)machine_check) {
		return 0;
#endif
	} else {
		/* Some other trap using IST? */
		if (WARN_ON(val->ist != 0))
			return 0;
	}
#endif	/* CONFIG_X86_64 */
	info->address = addr;

	info->cs = gate_segment(*val);
	info->flags = val->dpl;
	/* interrupt gates clear IF */
	if (val->type == GATE_INTERRUPT)
		info->flags |= 1 << 2;

	return 1;
}

/* Locations of each CPU's IDT */
static DEFINE_PER_CPU(struct desc_ptr, idt_desc);

/* Set an IDT entry.  If the entry is part of the current IDT, then
   also update Xen. */
static void xen_write_idt_entry(gate_desc *dt, int entrynum, const gate_desc *g)
{
	unsigned long p = (unsigned long)&dt[entrynum];
	unsigned long start, end;

	preempt_disable();

	start = __get_cpu_var(idt_desc).address;
	end = start + __get_cpu_var(idt_desc).size + 1;

	xen_mc_flush();

	native_write_idt_entry(dt, entrynum, g);

	if (p >= start && (p + 8) <= end) {
		struct trap_info info[2];

		info[1].address = 0;

		if (cvt_gate_to_trap(entrynum, g, &info[0]))
			if (HYPERVISOR_set_trap_table(info))
				BUG();
	}

	preempt_enable();
}

static void xen_convert_trap_info(const struct desc_ptr *desc,
				  struct trap_info *traps)
{
	unsigned in, out, count;

	count = (desc->size+1) / sizeof(gate_desc);
	BUG_ON(count > 256);

	for (in = out = 0; in < count; in++) {
		gate_desc *entry = (gate_desc *)(desc->address) + in;

		if (cvt_gate_to_trap(in, entry, &traps[out]))
			out++;
	}
	traps[out].address = 0;
}

void xen_copy_trap_info(struct trap_info *traps)
{
	const struct desc_ptr *desc = &__get_cpu_var(idt_desc);

	xen_convert_trap_info(desc, traps);
}

/* Load a new IDT into Xen.  In principle this can be per-cpu, so we
   hold a spinlock to protect the static traps[] array (static because
   it avoids allocation, and saves stack space). */
static void xen_load_idt(const struct desc_ptr *desc)
{
	static DEFINE_SPINLOCK(lock);
	static struct trap_info traps[257];

	spin_lock(&lock);

	__get_cpu_var(idt_desc) = *desc;

	xen_convert_trap_info(desc, traps);

	xen_mc_flush();
	if (HYPERVISOR_set_trap_table(traps))
		BUG();

	spin_unlock(&lock);
}

/* Write a GDT descriptor entry.  Ignore LDT descriptors, since
   they're handled differently. */
static void xen_write_gdt_entry(struct desc_struct *dt, int entry,
				const void *desc, int type)
{
	preempt_disable();

	switch (type) {
	case DESC_LDT:
	case DESC_TSS:
		/* ignore */
		break;

	default: {
		xmaddr_t maddr = arbitrary_virt_to_machine(&dt[entry]);

		xen_mc_flush();
		if (HYPERVISOR_update_descriptor(maddr.maddr, *(u64 *)desc))
			BUG();
	}

	}

	preempt_enable();
}

/*
 * Version of write_gdt_entry for use at early boot-time needed to
 * update an entry as simply as possible.
 */
static __init void xen_write_gdt_entry_boot(struct desc_struct *dt, int entry,
					    const void *desc, int type)
{
	switch (type) {
	case DESC_LDT:
	case DESC_TSS:
		/* ignore */
		break;

	default: {
		xmaddr_t maddr = virt_to_machine(&dt[entry]);

		if (HYPERVISOR_update_descriptor(maddr.maddr, *(u64 *)desc))
			dt[entry] = *(struct desc_struct *)desc;
	}

	}
}

static void xen_load_sp0(struct tss_struct *tss,
			 struct thread_struct *thread)
{
	struct multicall_space mcs = xen_mc_entry(0);
	MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->sp0);
	xen_mc_issue(PARAVIRT_LAZY_CPU);
}

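/*
 * PV guests can't use the iopl instruction directly; extract the
 * IOPL field from the eflags-format mask (bits 12-13) and ask Xen to
 * make the change via a physdev hypercall.
 */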
static void xen_set_iopl_mask(unsigned mask)
{
	struct physdev_set_iopl set_iopl;

	/* Force the change at ring 0. */
	set_iopl.iopl = (mask == 0) ? 1 : (mask >> 12) & 3;
	HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
}

static void xen_io_delay(void)
{
}

#ifdef CONFIG_X86_LOCAL_APIC
static u32 xen_apic_read(u32 reg)
{
	return 0;
}

static void xen_apic_write(u32 reg, u32 val)
{
	/* Warn to see if there are any stray references */
	WARN_ON(1);
}

static u64 xen_apic_icr_read(void)
{
	return 0;
}

static void xen_apic_icr_write(u32 low, u32 id)
{
	/* Warn to see if there are any stray references */
	WARN_ON(1);
}

static void xen_apic_wait_icr_idle(void)
{
	return;
}

static u32 xen_safe_apic_wait_icr_idle(void)
{
	return 0;
}

static void set_xen_basic_apic_ops(void)
{
	apic->read = xen_apic_read;
	apic->write = xen_apic_write;
	apic->icr_read = xen_apic_icr_read;
	apic->icr_write = xen_apic_icr_write;
	apic->wait_icr_idle = xen_apic_wait_icr_idle;
	apic->safe_wait_icr_idle = xen_safe_apic_wait_icr_idle;
}

#endif

static void xen_clts(void)
{
	struct multicall_space mcs;

	mcs = xen_mc_entry(0);

	MULTI_fpu_taskswitch(mcs.mc, 0);

	xen_mc_issue(PARAVIRT_LAZY_CPU);
}

static DEFINE_PER_CPU(unsigned long, xen_cr0_value);

static unsigned long xen_read_cr0(void)
{
	unsigned long cr0 = percpu_read(xen_cr0_value);

	if (unlikely(cr0 == 0)) {
		cr0 = native_read_cr0();
		percpu_write(xen_cr0_value, cr0);
	}

	return cr0;
}

static void xen_write_cr0(unsigned long cr0)
{
	struct multicall_space mcs;

	percpu_write(xen_cr0_value, cr0);

	/* Only pay attention to cr0.TS; everything else is
	   ignored. */
	mcs = xen_mc_entry(0);

	MULTI_fpu_taskswitch(mcs.mc, (cr0 & X86_CR0_TS) != 0);

	xen_mc_issue(PARAVIRT_LAZY_CPU);
}

static void xen_write_cr4(unsigned long cr4)
{
	cr4 &= ~X86_CR4_PGE;
	cr4 &= ~X86_CR4_PSE;

	native_write_cr4(cr4);
}

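/*
 * The segment-base MSRs map onto set_segment_base hypercalls, and
 * the fast-syscall MSRs are silently ignored (Xen sets those up
 * itself); everything else takes the native wrmsr_safe path.
 */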
static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
{
	int ret;

	ret = 0;

	switch (msr) {
#ifdef CONFIG_X86_64
		unsigned which;
		u64 base;

	case MSR_FS_BASE:		which = SEGBASE_FS; goto set;
	case MSR_KERNEL_GS_BASE:	which = SEGBASE_GS_USER; goto set;
	case MSR_GS_BASE:		which = SEGBASE_GS_KERNEL; goto set;

	set:
		base = ((u64)high << 32) | low;
		if (HYPERVISOR_set_segment_base(which, base) != 0)
			ret = -EIO;
		break;
#endif

	case MSR_STAR:
	case MSR_CSTAR:
	case MSR_LSTAR:
	case MSR_SYSCALL_MASK:
	case MSR_IA32_SYSENTER_CS:
	case MSR_IA32_SYSENTER_ESP:
	case MSR_IA32_SYSENTER_EIP:
		/* Fast syscall setup is all done in hypercalls, so
		   these are all ignored.  Stub them out here to stop
		   Xen console noise. */
		break;

	default:
		ret = native_write_msr_safe(msr, low, high);
	}

	return ret;
}

void xen_setup_shared_info(void)
{
	if (!xen_feature(XENFEAT_auto_translated_physmap)) {
		set_fixmap(FIX_PARAVIRT_BOOTMAP,
			   xen_start_info->shared_info);

		HYPERVISOR_shared_info =
			(struct shared_info *)fix_to_virt(FIX_PARAVIRT_BOOTMAP);
	} else
		HYPERVISOR_shared_info =
			(struct shared_info *)__va(xen_start_info->shared_info);

#ifndef CONFIG_SMP
	/* In UP this is as good a place as any to set up shared info */
	xen_setup_vcpu_info_placement();
#endif

	xen_setup_mfn_list_list();
}

/* This is called once we have the cpu_possible_map */
void xen_setup_vcpu_info_placement(void)
{
	int cpu;

	for_each_possible_cpu(cpu)
		xen_vcpu_setup(cpu);

	/* xen_vcpu_setup managed to place the vcpu_info within the
	   percpu area for all cpus, so make use of it */
	if (have_vcpu_info_placement) {
		printk(KERN_INFO "Xen: using vcpu_info placement\n");

		pv_irq_ops.save_fl = __PV_IS_CALLEE_SAVE(xen_save_fl_direct);
		pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(xen_restore_fl_direct);
		pv_irq_ops.irq_disable = __PV_IS_CALLEE_SAVE(xen_irq_disable_direct);
		pv_irq_ops.irq_enable = __PV_IS_CALLEE_SAVE(xen_irq_enable_direct);
		pv_mmu_ops.read_cr2 = xen_read_cr2_direct;
	}
}

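/*
 * Inline-patch the hot irq pseudo-ops with their "direct" variants
 * when vcpu_info placement is in effect; anything else falls back to
 * the default paravirt patcher.
 */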
static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,
			  unsigned long addr, unsigned len)
{
	char *start, *end, *reloc;
	unsigned ret;

	start = end = reloc = NULL;

#define SITE(op, x)							\
	case PARAVIRT_PATCH(op.x):					\
	if (have_vcpu_info_placement) {					\
		start = (char *)xen_##x##_direct;			\
		end = xen_##x##_direct_end;				\
		reloc = xen_##x##_direct_reloc;				\
	}								\
	goto patch_site

	switch (type) {
		SITE(pv_irq_ops, irq_enable);
		SITE(pv_irq_ops, irq_disable);
		SITE(pv_irq_ops, save_fl);
		SITE(pv_irq_ops, restore_fl);
#undef SITE

	patch_site:
		if (start == NULL || (end-start) > len)
			goto default_patch;

		ret = paravirt_patch_insns(insnbuf, len, start, end);

		/* Note: because reloc is assigned from something that
		   appears to be an array, gcc assumes it's non-null,
		   but doesn't know its relationship with start and
		   end. */
		if (reloc > start && reloc < end) {
			int reloc_off = reloc - start;
			long *relocp = (long *)(insnbuf + reloc_off);
			long delta = start - (char *)addr;

			*relocp += delta;
		}
		break;

	default_patch:
	default:
		ret = paravirt_patch_default(type, clobbers, insnbuf,
					     addr, len);
		break;
	}

	return ret;
}

static const struct pv_info xen_info __initdata = {
	.paravirt_enabled = 1,
	.shared_kernel_pmd = 0,

	.name = "Xen",
};

static const struct pv_init_ops xen_init_ops __initdata = {
	.patch = xen_patch,
};

static const struct pv_cpu_ops xen_cpu_ops __initdata = {
	.cpuid = xen_cpuid,

	.set_debugreg = xen_set_debugreg,
	.get_debugreg = xen_get_debugreg,

	.clts = xen_clts,

	.read_cr0 = xen_read_cr0,
	.write_cr0 = xen_write_cr0,

	.read_cr4 = native_read_cr4,
	.read_cr4_safe = native_read_cr4_safe,
	.write_cr4 = xen_write_cr4,

	.wbinvd = native_wbinvd,

	.read_msr = native_read_msr_safe,
	.write_msr = xen_write_msr_safe,
	.read_tsc = native_read_tsc,
	.read_pmc = native_read_pmc,

	.iret = xen_iret,
	.irq_enable_sysexit = xen_sysexit,
#ifdef CONFIG_X86_64
	.usergs_sysret32 = xen_sysret32,
	.usergs_sysret64 = xen_sysret64,
#endif

	.load_tr_desc = paravirt_nop,
	.set_ldt = xen_set_ldt,
	.load_gdt = xen_load_gdt,
	.load_idt = xen_load_idt,
	.load_tls = xen_load_tls,
#ifdef CONFIG_X86_64
	.load_gs_index = xen_load_gs_index,
#endif

	.alloc_ldt = xen_alloc_ldt,
	.free_ldt = xen_free_ldt,

	.store_gdt = native_store_gdt,
	.store_idt = native_store_idt,
	.store_tr = xen_store_tr,

	.write_ldt_entry = xen_write_ldt_entry,
	.write_gdt_entry = xen_write_gdt_entry,
	.write_idt_entry = xen_write_idt_entry,
	.load_sp0 = xen_load_sp0,

	.set_iopl_mask = xen_set_iopl_mask,
	.io_delay = xen_io_delay,

	/* Xen takes care of %gs when switching to usermode for us */
	.swapgs = paravirt_nop,

	.start_context_switch = paravirt_start_context_switch,
	.end_context_switch = xen_end_context_switch,
};

static const struct pv_apic_ops xen_apic_ops __initdata = {
#ifdef CONFIG_X86_LOCAL_APIC
	.startup_ipi_hook = paravirt_nop,
#endif
};

static void xen_reboot(int reason)
{
	struct sched_shutdown r = { .reason = reason };

	if (HYPERVISOR_sched_op(SCHEDOP_shutdown, &r))
		BUG();
}

static void xen_restart(char *msg)
{
	xen_reboot(SHUTDOWN_reboot);
}

static void xen_emergency_restart(void)
{
	xen_reboot(SHUTDOWN_reboot);
}

static void xen_machine_halt(void)
{
	xen_reboot(SHUTDOWN_poweroff);
}

static void xen_crash_shutdown(struct pt_regs *regs)
{
	xen_reboot(SHUTDOWN_crash);
}

static int
xen_panic_event(struct notifier_block *this, unsigned long event, void *ptr)
{
	xen_reboot(SHUTDOWN_crash);
	return NOTIFY_DONE;
}

static struct notifier_block xen_panic_block = {
	.notifier_call = xen_panic_event,
};

int xen_panic_handler_init(void)
{
	atomic_notifier_chain_register(&panic_notifier_list, &xen_panic_block);
	return 0;
}

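/* Every shutdown-like operation above funnels into the shutdown hypercall. */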
static const struct machine_ops __initdata xen_machine_ops = {
	.restart = xen_restart,
	.halt = xen_machine_halt,
	.power_off = xen_machine_halt,
	.shutdown = xen_machine_halt,
	.crash_shutdown = xen_crash_shutdown,
	.emergency_restart = xen_emergency_restart,
};

/*
 * Set up the GDT and segment registers for -fstack-protector.  Until
 * we do this, we have to be careful not to call any stack-protected
 * function, which is most of the kernel.
 */
static void __init xen_setup_stackprotector(void)
{
	pv_cpu_ops.write_gdt_entry = xen_write_gdt_entry_boot;
	pv_cpu_ops.load_gdt = xen_load_gdt_boot;

	setup_stack_canary_segment(0);
	switch_to_new_gdt(0);

	pv_cpu_ops.write_gdt_entry = xen_write_gdt_entry;
	pv_cpu_ops.load_gdt = xen_load_gdt;
}

/* First C function to be called on Xen boot */
asmlinkage void __init xen_start_kernel(void)
{
	pgd_t *pgd;

	if (!xen_start_info)
		return;

	xen_domain_type = XEN_PV_DOMAIN;

	/* Install Xen paravirt ops */
	pv_info = xen_info;
	pv_init_ops = xen_init_ops;
	pv_cpu_ops = xen_cpu_ops;
	pv_apic_ops = xen_apic_ops;

	x86_init.resources.memory_setup = xen_memory_setup;
	x86_init.oem.arch_setup = xen_arch_setup;
	x86_init.oem.banner = xen_banner;

	xen_init_time_ops();

	/*
	 * Set up some pagetable state before starting to set any ptes.
	 */

	xen_init_mmu_ops();

	/* Prevent unwanted bits from being set in PTEs. */
	__supported_pte_mask &= ~_PAGE_GLOBAL;
	if (!xen_initial_domain())
		__supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD);

	__supported_pte_mask |= _PAGE_IOMAP;

	/*
	 * Prevent page tables from being allocated in highmem, even
	 * if CONFIG_HIGHPTE is enabled.
	 */
	__userpte_alloc_gfp &= ~__GFP_HIGHMEM;

	/* Work out if we support NX */
	x86_configure_nx();

	xen_setup_features();

	/* Get mfn list */
	if (!xen_feature(XENFEAT_auto_translated_physmap))
		xen_build_dynamic_phys_to_machine();

	/*
	 * Set up kernel GDT and segment registers, mainly so that
	 * -fstack-protector code can be executed.
	 */
	xen_setup_stackprotector();

	xen_init_irq_ops();
	xen_init_cpuid_mask();

#ifdef CONFIG_X86_LOCAL_APIC
	/*
	 * set up the basic apic ops.
	 */
	set_xen_basic_apic_ops();
#endif

	if (xen_feature(XENFEAT_mmu_pt_update_preserve_ad)) {
		pv_mmu_ops.ptep_modify_prot_start = xen_ptep_modify_prot_start;
		pv_mmu_ops.ptep_modify_prot_commit = xen_ptep_modify_prot_commit;
	}

	machine_ops = xen_machine_ops;

	/*
	 * The only reliable way to retain the initial address of the
	 * percpu gdt_page is to remember it here, so we can go and
	 * mark it RW later, when the initial percpu area is freed.
	 */
	xen_initial_gdt = &per_cpu(gdt_page, 0);

	xen_smp_init();

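	/* The initial pagetables are the ones the domain builder gave us. */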
	pgd = (pgd_t *)xen_start_info->pt_base;

	if (!xen_initial_domain())
		__supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD);

	__supported_pte_mask |= _PAGE_IOMAP;

	/* Don't do the full vcpu_info placement stuff until we have a
	   possible map and a non-dummy shared_info. */
	per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0];

	local_irq_disable();
	early_boot_irqs_off();

	xen_raw_console_write("mapping kernel into physical memory\n");
	pgd = xen_setup_kernel_pagetable(pgd, xen_start_info->nr_pages);

	init_mm.pgd = pgd;

	/* keep using Xen gdt for now; no urgent need to change it */

#ifdef CONFIG_X86_32
	pv_info.kernel_rpl = 1;
	if (xen_feature(XENFEAT_supervisor_mode_kernel))
		pv_info.kernel_rpl = 0;
#else
	pv_info.kernel_rpl = 0;
#endif

	/* set the limit of our address space */
	xen_reserve_top();

#ifdef CONFIG_X86_32
	/* set up basic CPUID stuff */
	cpu_detect(&new_cpu_data);
	new_cpu_data.hard_math = 1;
	new_cpu_data.wp_works_ok = 1;
	new_cpu_data.x86_capability[0] = cpuid_edx(1);
#endif

	/* Poke various useful things into boot_params */
	boot_params.hdr.type_of_loader = (9 << 4) | 0;
	boot_params.hdr.ramdisk_image = xen_start_info->mod_start
		? __pa(xen_start_info->mod_start) : 0;
	boot_params.hdr.ramdisk_size = xen_start_info->mod_len;
	boot_params.hdr.cmd_line_ptr = __pa(xen_start_info->cmd_line);

	if (!xen_initial_domain()) {
		add_preferred_console("xenboot", 0, NULL);
		add_preferred_console("tty", 0, NULL);
		add_preferred_console("hvc", 0, NULL);
	} else {
		/* Make sure ACS will be enabled */
		pci_request_acs();
	}

	xen_raw_console_write("about to get started...\n");

	xen_setup_runstate_info(0);

	/* Start the world */
#ifdef CONFIG_X86_32
	i386_start_kernel();
#else
	x86_64_start_reservations((char *)__pa_symbol(&boot_params));
#endif
}

static uint32_t xen_cpuid_base(void)
{
	uint32_t base, eax, ebx, ecx, edx;
	char signature[13];

	for (base = 0x40000000; base < 0x40010000; base += 0x100) {
		cpuid(base, &eax, &ebx, &ecx, &edx);
		*(uint32_t *)(signature + 0) = ebx;
		*(uint32_t *)(signature + 4) = ecx;
		*(uint32_t *)(signature + 8) = edx;
		signature[12] = 0;

		if (!strcmp("XenVMMXenVMM", signature) && ((eax - base) >= 2))
			return base;
	}

	return 0;
}

static int init_hvm_pv_info(int *major, int *minor)
{
	uint32_t eax, ebx, ecx, edx, pages, msr, base;
	u64 pfn;

	base = xen_cpuid_base();
	cpuid(base + 1, &eax, &ebx, &ecx, &edx);

	*major = eax >> 16;
	*minor = eax & 0xffff;
	printk(KERN_INFO "Xen version %d.%d.\n", *major, *minor);

	cpuid(base + 2, &pages, &msr, &ecx, &edx);

	pfn = __pa(hypercall_page);
	wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32));

	xen_setup_features();

	pv_info = xen_info;
	pv_info.kernel_rpl = 0;

	xen_domain_type = XEN_HVM_DOMAIN;

	return 0;
}

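/*
 * Map the shared_info page at a guest pseudo-physical address taken
 * from the brk area.  This is also called on resume, when the
 * mapping must be re-established.
 */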
void xen_hvm_init_shared_info(void)
{
	int cpu;
	struct xen_add_to_physmap xatp;
	static struct shared_info *shared_info_page = 0;

	if (!shared_info_page)
		shared_info_page = (struct shared_info *)
			extend_brk(PAGE_SIZE, PAGE_SIZE);
	xatp.domid = DOMID_SELF;
	xatp.idx = 0;
	xatp.space = XENMAPSPACE_shared_info;
	xatp.gpfn = __pa(shared_info_page) >> PAGE_SHIFT;
	if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp))
		BUG();

	HYPERVISOR_shared_info = (struct shared_info *)shared_info_page;

	/* xen_vcpu is a pointer to the vcpu_info struct in the shared_info
	 * page.  We use it in the event channel upcall and in some pvclock
	 * related functions.  We don't need the vcpu_info placement
	 * optimizations because we don't use any pv_mmu or pv_irq op on
	 * HVM.
	 * When xen_hvm_init_shared_info is run at boot time only vcpu 0 is
	 * online, but xen_hvm_init_shared_info is run at resume time too,
	 * and in that case multiple vcpus might be online. */
	for_each_online_cpu(cpu) {
		per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];
	}
}

#ifdef CONFIG_XEN_PVHVM
static int __cpuinit xen_hvm_cpu_notify(struct notifier_block *self,
					unsigned long action, void *hcpu)
{
	int cpu = (long)hcpu;
	switch (action) {
	case CPU_UP_PREPARE:
		per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];
		break;
	default:
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block __cpuinitdata xen_hvm_cpu_notifier = {
	.notifier_call = xen_hvm_cpu_notify,
};

static void __init xen_hvm_guest_init(void)
{
	int r;
	int major, minor;

	r = init_hvm_pv_info(&major, &minor);
	if (r < 0)
		return;

	xen_hvm_init_shared_info();

	if (xen_feature(XENFEAT_hvm_callback_vector))
		xen_have_vector_callback = 1;
	register_cpu_notifier(&xen_hvm_cpu_notifier);
	xen_unplug_emulated_devices();
	have_vcpu_info_placement = 0;
	x86_init.irqs.intr_init = xen_init_IRQ;
	xen_hvm_init_time_ops();
	xen_hvm_init_mmu_ops();
}

static bool __init xen_hvm_platform(void)
{
	if (xen_pv_domain())
		return false;

	if (!xen_cpuid_base())
		return false;

	return true;
}

const __refconst struct hypervisor_x86 x86_hyper_xen_hvm = {
	.name			= "Xen HVM",
	.detect			= xen_hvm_platform,
	.init_platform		= xen_hvm_guest_init,
};
EXPORT_SYMBOL(x86_hyper_xen_hvm);
#endif