mp_machdep.c revision 223758
/*-
 * Copyright (c) 1996, by Steve Passe
 * Copyright (c) 2008, by Kip Macy
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. The name of the developer may NOT be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/i386/xen/mp_machdep.c 223758 2011-07-04 12:04:52Z attilio $");

#include "opt_apic.h"
#include "opt_cpu.h"
#include "opt_kstack_pages.h"
#include "opt_mp_watchdog.h"
#include "opt_pmap.h"
#include "opt_sched.h"
#include "opt_smp.h"

#if !defined(lint)
#if !defined(SMP)
#error How did you get here?
#endif

#ifndef DEV_APIC
#error The apic device is required for SMP, add "device apic" to your config file.
#endif
#if defined(CPU_DISABLE_CMPXCHG) && !defined(COMPILING_LINT)
#error SMP not supported with CPU_DISABLE_CMPXCHG
#endif
#endif /* not lint */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bus.h>
#include <sys/cons.h>	/* cngetc() */
#include <sys/cpuset.h>
#ifdef GPROF
#include <sys/gmon.h>
#endif
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/memrange.h>
#include <sys/mutex.h>
#include <sys/pcpu.h>
#include <sys/proc.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/sysctl.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>
#include <vm/vm_page.h>

#include <x86/apicreg.h>
#include <machine/md_var.h>
#include <machine/mp_watchdog.h>
#include <machine/pcb.h>
#include <machine/psl.h>
#include <machine/smp.h>
#include <machine/specialreg.h>
#include <machine/pcpu.h>

#include <machine/xen/xen-os.h>
#include <xen/evtchn.h>
#include <xen/xen_intr.h>
#include <xen/hypervisor.h>
#include <xen/interface/vcpu.h>

int	mp_naps;		/* # of application processors */
int	boot_cpu_id = -1;	/* designated BSP */

extern struct pcpu __pcpu[];

static int bootAP;
static union descriptor *bootAPgdt;

static char resched_name[NR_CPUS][15];
static char callfunc_name[NR_CPUS][15];

/* Free these after use */
void *bootstacks[MAXCPU];

struct pcb stoppcbs[MAXCPU];
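/*
 * Overview of the TLB shootdown rendezvous implemented below: the
 * initiating CPU publishes its request through a struct _call_data
 * (a function id plus up to two addresses), zeroes smp_tlb_wait, and
 * kicks the other CPUs with CALL_FUNCTION_VECTOR.  Each responder runs
 * the requested handler and increments smp_tlb_wait; the initiator
 * spins until the count reaches the number of responders.
 */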
/* Variables needed for SMP TLB shootdown. */
vm_offset_t smp_tlb_addr1;
vm_offset_t smp_tlb_addr2;
volatile int smp_tlb_wait;

typedef void call_data_func_t(uintptr_t, uintptr_t);

static u_int logical_cpus;
static volatile cpuset_t ipi_nmi_pending;

/* used to hold the APs until we are ready to release them */
static struct mtx ap_boot_mtx;

/* Set to 1 once we're ready to let the APs out of the pen. */
static volatile int aps_ready = 0;

/*
 * Store data from cpu_add() until later in the boot when we actually setup
 * the APs.
 */
struct cpu_info {
	int	cpu_present:1;
	int	cpu_bsp:1;
	int	cpu_disabled:1;
} static cpu_info[MAX_APIC_ID + 1];
int cpu_apic_ids[MAXCPU];
int apic_cpuids[MAX_APIC_ID + 1];

/* Holds pending bitmap based IPIs per CPU */
static volatile u_int cpu_ipi_pending[MAXCPU];

static int cpu_logical;
static int cpu_cores;

static void	assign_cpu_ids(void);
static void	set_interrupt_apic_ids(void);
int	start_all_aps(void);
static int	start_ap(int cpu);
static void	release_aps(void *dummy);

static u_int	hyperthreading_cpus;
static cpuset_t	hyperthreading_cpus_mask;

extern void Xhypervisor_callback(void);
extern void failsafe_callback(void);
extern void pmap_lazyfix_action(void);

struct cpu_group *
cpu_topo(void)
{
	if (cpu_cores == 0)
		cpu_cores = 1;
	if (cpu_logical == 0)
		cpu_logical = 1;
	if (mp_ncpus % (cpu_cores * cpu_logical) != 0) {
		printf("WARNING: Non-uniform processors.\n");
		printf("WARNING: Using suboptimal topology.\n");
		return (smp_topo_none());
	}
	/*
	 * No multi-core or hyper-threaded.
	 */
	if (cpu_logical * cpu_cores == 1)
		return (smp_topo_none());
	/*
	 * Only HTT no multi-core.
	 */
	if (cpu_logical > 1 && cpu_cores == 1)
		return (smp_topo_1level(CG_SHARE_L1, cpu_logical, CG_FLAG_HTT));
	/*
	 * Only multi-core no HTT.
	 */
	if (cpu_cores > 1 && cpu_logical == 1)
		return (smp_topo_1level(CG_SHARE_NONE, cpu_cores, 0));
	/*
	 * Both HTT and multi-core.
	 */
	return (smp_topo_2level(CG_SHARE_NONE, cpu_cores,
	    CG_SHARE_L1, cpu_logical, CG_FLAG_HTT));
}

/*
 * Calculate usable address in base memory for AP trampoline code.
 */
u_int
mp_bootaddress(u_int basemem)
{

	return (basemem);
}

void
cpu_add(u_int apic_id, char boot_cpu)
{

	if (apic_id > MAX_APIC_ID) {
		panic("SMP: APIC ID %d too high", apic_id);
		return;
	}
	KASSERT(cpu_info[apic_id].cpu_present == 0, ("CPU %d added twice",
	    apic_id));
	cpu_info[apic_id].cpu_present = 1;
	if (boot_cpu) {
		KASSERT(boot_cpu_id == -1,
		    ("CPU %d claims to be BSP, but CPU %d already is", apic_id,
		    boot_cpu_id));
		boot_cpu_id = apic_id;
		cpu_info[apic_id].cpu_bsp = 1;
	}
	if (mp_ncpus < MAXCPU)
		mp_ncpus++;
	if (bootverbose)
		printf("SMP: Added CPU %d (%s)\n", apic_id, boot_cpu ? "BSP" :
		    "AP");
}

void
cpu_mp_setmaxid(void)
{

	mp_maxid = MAXCPU - 1;
}

int
cpu_mp_probe(void)
{

	/*
	 * Always record BSP in CPU map so that the mbuf init code works
	 * correctly.
	 */
	CPU_SETOF(0, &all_cpus);
	if (mp_ncpus == 0) {
		/*
		 * No CPUs were found, so this must be a UP system.  Setup
		 * the variables to represent a system with a single CPU
		 * with an id of 0.
		 */
		mp_ncpus = 1;
		return (0);
	}

	/* At least one CPU was found. */
	if (mp_ncpus == 1) {
		/*
		 * One CPU was found, so this must be a UP system with
		 * an I/O APIC.
		 */
		return (0);
	}

	/* At least two CPUs were found. */
	return (1);
}
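/*
 * Note on the Xen PV bring-up path: there is no INIT/SIPI sequence and
 * no real-mode trampoline here (mp_bootaddress() above is a stub).  Each
 * AP is a hypervisor vcpu, constructed by cpu_initialize_context() via
 * VCPUOP_initialise and made runnable with VCPUOP_up; it enters the
 * kernel directly at init_secondary().
 */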
/*
 * Initialize the IPI handlers and start up the APs.
 */
void
cpu_mp_start(void)
{
	int i;

	/* Initialize the logical ID to APIC ID table. */
	for (i = 0; i < MAXCPU; i++) {
		cpu_apic_ids[i] = -1;
		cpu_ipi_pending[i] = 0;
	}

	/* Set boot_cpu_id if needed. */
	if (boot_cpu_id == -1) {
		boot_cpu_id = PCPU_GET(apic_id);
		cpu_info[boot_cpu_id].cpu_bsp = 1;
	} else
		KASSERT(boot_cpu_id == PCPU_GET(apic_id),
		    ("BSP's APIC ID doesn't match boot_cpu_id"));
	cpu_apic_ids[0] = boot_cpu_id;
	apic_cpuids[boot_cpu_id] = 0;

	assign_cpu_ids();

	/* Start each Application Processor */
	start_all_aps();

	/* Setup the initial logical CPUs info. */
	logical_cpus = 0;
	CPU_ZERO(&logical_cpus_mask);
	if (cpu_feature & CPUID_HTT)
		logical_cpus = (cpu_procinfo & CPUID_HTT_CORES) >> 16;

	set_interrupt_apic_ids();
}

static void
iv_rendezvous(uintptr_t a, uintptr_t b)
{
	smp_rendezvous_action();
}

static void
iv_invltlb(uintptr_t a, uintptr_t b)
{
	xen_tlb_flush();
}

static void
iv_invlpg(uintptr_t a, uintptr_t b)
{
	xen_invlpg(a);
}

static void
iv_invlrng(uintptr_t a, uintptr_t b)
{
	vm_offset_t start = (vm_offset_t)a;
	vm_offset_t end = (vm_offset_t)b;

	while (start < end) {
		xen_invlpg(start);
		start += PAGE_SIZE;
	}
}

static void
iv_invlcache(uintptr_t a, uintptr_t b)
{

	wbinvd();
	atomic_add_int(&smp_tlb_wait, 1);
}

static void
iv_lazypmap(uintptr_t a, uintptr_t b)
{
	pmap_lazyfix_action();
	atomic_add_int(&smp_tlb_wait, 1);
}

/*
 * These start from "IPI offset" APIC_IPI_INTS
 */
static call_data_func_t *ipi_vectors[6] =
{
	iv_rendezvous,
	iv_invltlb,
	iv_invlpg,
	iv_invlrng,
	iv_invlcache,
	iv_lazypmap,
};
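/*
 * Dispatch model for the handlers above: only two event-channel "vectors"
 * exist per CPU.  RESCHEDULE_VECTOR carries the bitmap IPIs (IPI_PREEMPT,
 * IPI_AST) latched in cpu_ipi_pending[], while CALL_FUNCTION_VECTOR
 * carries function IPIs, demultiplexed through ipi_vectors[] above using
 * func_id - APIC_IPI_INTS as the index.
 */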
/*
 * Reschedule call back.  Nothing to do, all the work is done
 * automatically when we return from the interrupt.
 */
static int
smp_reschedule_interrupt(void *unused)
{
	int cpu = PCPU_GET(cpuid);
	u_int ipi_bitmap;

	ipi_bitmap = atomic_readandclear_int(&cpu_ipi_pending[cpu]);

	if (ipi_bitmap & (1 << IPI_PREEMPT)) {
#ifdef COUNT_IPIS
		(*ipi_preempt_counts[cpu])++;
#endif
		sched_preempt(curthread);
	}

	if (ipi_bitmap & (1 << IPI_AST)) {
#ifdef COUNT_IPIS
		(*ipi_ast_counts[cpu])++;
#endif
		/* Nothing to do for AST */
	}
	return (FILTER_HANDLED);
}

struct _call_data {
	uint16_t func_id;
	uint16_t wait;
	uintptr_t arg1;
	uintptr_t arg2;
	atomic_t started;
	atomic_t finished;
};

static struct _call_data *call_data;

static int
smp_call_function_interrupt(void *unused)
{
	call_data_func_t *func;
	uintptr_t arg1 = call_data->arg1;
	uintptr_t arg2 = call_data->arg2;
	int wait = call_data->wait;
	atomic_t *started = &call_data->started;
	atomic_t *finished = &call_data->finished;

	/* We only handle function IPIs, not bitmap IPIs */
	if (call_data->func_id < APIC_IPI_INTS ||
	    call_data->func_id > IPI_BITMAP_VECTOR)
		panic("invalid function id %u", call_data->func_id);

	func = ipi_vectors[call_data->func_id - APIC_IPI_INTS];
	/*
	 * Notify initiating CPU that I've grabbed the data and am
	 * about to execute the function.
	 */
	mb();
	atomic_inc(started);
	/*
	 * At this point the info structure may be out of scope unless wait==1.
	 */
	(*func)(arg1, arg2);

	if (wait) {
		mb();
		atomic_inc(finished);
	}
	atomic_add_int(&smp_tlb_wait, 1);
	return (FILTER_HANDLED);
}

/*
 * Print various information about the SMP system hardware and setup.
 */
void
cpu_mp_announce(void)
{
	int i, x;

	/* List CPUs */
	printf(" cpu0 (BSP): APIC ID: %2d\n", boot_cpu_id);
	for (i = 1, x = 0; x <= MAX_APIC_ID; x++) {
		if (!cpu_info[x].cpu_present || cpu_info[x].cpu_bsp)
			continue;
		if (cpu_info[x].cpu_disabled)
			printf("  cpu (AP): APIC ID: %2d (disabled)\n", x);
		else {
			KASSERT(i < mp_ncpus,
			    ("mp_ncpus and actual cpus are out of whack"));
			printf(" cpu%d (AP): APIC ID: %2d\n", i++, x);
		}
	}
}

static int
xen_smp_intr_init(unsigned int cpu)
{
	int rc;
	unsigned int irq;

	per_cpu(resched_irq, cpu) = per_cpu(callfunc_irq, cpu) = -1;

	sprintf(resched_name[cpu], "resched%u", cpu);
	rc = bind_ipi_to_irqhandler(RESCHEDULE_VECTOR,
	    cpu,
	    resched_name[cpu],
	    smp_reschedule_interrupt,
	    INTR_TYPE_TTY, &irq);
	/* bail out before using 'irq' if the resched IPI failed to bind */
	if (rc < 0)
		goto fail;

	printf("[XEN] IPI cpu=%d irq=%d vector=RESCHEDULE_VECTOR (%d)\n",
	    cpu, irq, RESCHEDULE_VECTOR);

	per_cpu(resched_irq, cpu) = irq;

	sprintf(callfunc_name[cpu], "callfunc%u", cpu);
	rc = bind_ipi_to_irqhandler(CALL_FUNCTION_VECTOR,
	    cpu,
	    callfunc_name[cpu],
	    smp_call_function_interrupt,
	    INTR_TYPE_TTY, &irq);
	if (rc < 0)
		goto fail;
	per_cpu(callfunc_irq, cpu) = irq;

	printf("[XEN] IPI cpu=%d irq=%d vector=CALL_FUNCTION_VECTOR (%d)\n",
	    cpu, irq, CALL_FUNCTION_VECTOR);

	if ((cpu != 0) && ((rc = ap_cpu_initclocks(cpu)) != 0))
		goto fail;

	return (0);

 fail:
	if (per_cpu(resched_irq, cpu) >= 0)
		unbind_from_irqhandler(per_cpu(resched_irq, cpu));
	if (per_cpu(callfunc_irq, cpu) >= 0)
		unbind_from_irqhandler(per_cpu(callfunc_irq, cpu));
	return (rc);
}

static void
xen_smp_intr_init_cpus(void *unused)
{
	int i;

	for (i = 0; i < mp_ncpus; i++)
		xen_smp_intr_init(i);
}
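/*
 * init_secondary() below is the AP entry point; cpu_initialize_context()
 * points the new vcpu's eip at it.  The #if 0 blocks preserve the native
 * lapic-based steps for reference; they are presumed inapplicable here
 * because the hypervisor, not the guest, programs the local APIC.
 */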
#define MTOPSIZE (1<<(14 + PAGE_SHIFT))

/*
 * AP CPUs call this to initialize themselves.
 */
void
init_secondary(void)
{
	vm_offset_t addr;
	u_int	cpuid;
	int	gsel_tss;

	/* bootAP is set in start_ap() to our ID. */
	PCPU_SET(currentldt, _default_ldt);
	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
#if 0
	gdt[bootAP * NGDT + GPROC0_SEL].sd.sd_type = SDT_SYS386TSS;
#endif
	PCPU_SET(common_tss.tss_esp0, 0); /* not used until after switch */
	PCPU_SET(common_tss.tss_ss0, GSEL(GDATA_SEL, SEL_KPL));
	PCPU_SET(common_tss.tss_ioopt, (sizeof (struct i386tss)) << 16);
#if 0
	PCPU_SET(tss_gdt, &gdt[bootAP * NGDT + GPROC0_SEL].sd);

	PCPU_SET(common_tssd, *PCPU_GET(tss_gdt));
#endif
	PCPU_SET(fsgs_gdt, &gdt[GUFS_SEL].sd);

	/*
	 * Set to a known state:
	 * Set by mpboot.s: CR0_PG, CR0_PE
	 * Set by cpu_setregs: CR0_NE, CR0_MP, CR0_TS, CR0_WP, CR0_AM
	 */
	/*
	 * signal our startup to the BSP.
	 */
	mp_naps++;

	/* Spin until the BSP releases the APs. */
	while (!aps_ready)
		ia32_pause();

	/* BSP may have changed PTD while we were waiting */
	invltlb();
	for (addr = 0; addr < NKPT * NBPDR - 1; addr += PAGE_SIZE)
		invlpg(addr);

	/* set up FPU state on the AP */
	npxinit();
#if 0
	/* set up SSE registers */
	enable_sse();
#endif
#if 0 && defined(PAE)
	/* Enable the PTE no-execute bit. */
	if ((amd_feature & AMDID_NX) != 0) {
		uint64_t msr;

		msr = rdmsr(MSR_EFER) | EFER_NXE;
		wrmsr(MSR_EFER, msr);
	}
#endif
#if 0
	/* A quick check from sanity claus */
	if (PCPU_GET(apic_id) != lapic_id()) {
		printf("SMP: cpuid = %d\n", PCPU_GET(cpuid));
		printf("SMP: actual apic_id = %d\n", lapic_id());
		printf("SMP: correct apic_id = %d\n", PCPU_GET(apic_id));
		panic("cpuid mismatch! boom!!");
	}
#endif

	/* Initialize curthread. */
	KASSERT(PCPU_GET(idlethread) != NULL, ("no idle thread"));
	PCPU_SET(curthread, PCPU_GET(idlethread));

	mtx_lock_spin(&ap_boot_mtx);
#if 0
	/* Init local apic for irq's */
	lapic_setup(1);
#endif
	smp_cpus++;

	cpuid = PCPU_GET(cpuid);
	CTR1(KTR_SMP, "SMP: AP CPU #%d Launched", cpuid);
	printf("SMP: AP CPU #%d Launched!\n", cpuid);

	/* Determine if we are a logical CPU. */
	if (logical_cpus > 1 && PCPU_GET(apic_id) % logical_cpus != 0)
		CPU_SET(cpuid, &logical_cpus_mask);

	/* Determine if we are a hyperthread. */
	if (hyperthreading_cpus > 1 &&
	    PCPU_GET(apic_id) % hyperthreading_cpus != 0)
		CPU_SET(cpuid, &hyperthreading_cpus_mask);
#if 0
	if (bootverbose)
		lapic_dump("AP");
#endif
	if (smp_cpus == mp_ncpus) {
		/* enable IPI's, tlb shootdown, freezes etc */
		atomic_store_rel_int(&smp_started, 1);
		smp_active = 1;	/* historic */
	}

	mtx_unlock_spin(&ap_boot_mtx);

	/* wait until all the APs are up */
	while (smp_started == 0)
		ia32_pause();

	PCPU_SET(curthread, PCPU_GET(idlethread));

	/* Start per-CPU event timers. */
	cpu_initclocks_ap();

	/* enter the scheduler */
	sched_throw(NULL);

	panic("scheduler returned us to %s", __func__);
	/* NOTREACHED */
}

/*******************************************************************
 * local functions and data
 */

/*
 * We tell the I/O APIC code about all the CPUs we want to receive
 * interrupts.  If we don't want certain CPUs to receive IRQs we
 * can simply not tell the I/O APIC code about them in this function.
 * We also do not tell it about the BSP since it tells itself about
 * the BSP internally to work with UP kernels and on UP machines.
 */
static void
set_interrupt_apic_ids(void)
{
	u_int i, apic_id;

	for (i = 0; i < MAXCPU; i++) {
		apic_id = cpu_apic_ids[i];
		if (apic_id == -1)
			continue;
		if (cpu_info[apic_id].cpu_bsp)
			continue;
		if (cpu_info[apic_id].cpu_disabled)
			continue;

		/* Don't let hyperthreads service interrupts. */
		if (hyperthreading_cpus > 1 &&
		    apic_id % hyperthreading_cpus != 0)
			continue;

		intr_add_cpu(i);
	}
}
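/*
 * assign_cpu_ids() below honors the standard device-hint mechanism, so a
 * CPU can be kept out of service from loader.conf, e.g.
 *
 *	hint.lapic.2.disabled="1"
 *
 * which resource_disabled("lapic", 2) then reports as disabled.
 */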
/*
 * Assign logical CPU IDs to local APICs.
 */
static void
assign_cpu_ids(void)
{
	u_int i;

	/* Check for explicitly disabled CPUs. */
	for (i = 0; i <= MAX_APIC_ID; i++) {
		if (!cpu_info[i].cpu_present || cpu_info[i].cpu_bsp)
			continue;

		/* Don't use this CPU if it has been disabled by a tunable. */
		if (resource_disabled("lapic", i)) {
			cpu_info[i].cpu_disabled = 1;
			continue;
		}
	}

	/*
	 * Assign CPU IDs to local APIC IDs and disable any CPUs
	 * beyond MAXCPU.  CPU 0 has already been assigned to the BSP,
	 * so we only have to assign IDs for APs.
	 */
	mp_ncpus = 1;
	for (i = 0; i <= MAX_APIC_ID; i++) {
		if (!cpu_info[i].cpu_present || cpu_info[i].cpu_bsp ||
		    cpu_info[i].cpu_disabled)
			continue;

		if (mp_ncpus < MAXCPU) {
			cpu_apic_ids[mp_ncpus] = i;
			apic_cpuids[i] = mp_ncpus;
			mp_ncpus++;
		} else
			cpu_info[i].cpu_disabled = 1;
	}
	KASSERT(mp_maxid >= mp_ncpus - 1,
	    ("%s: counters out of sync: max %d, count %d", __func__, mp_maxid,
	    mp_ncpus));
}

/*
 * start each AP in our list
 */
/* Lowest 1MB is already mapped: don't touch */
#define TMPMAP_START 1
int
start_all_aps(void)
{
	int x, apic_id, cpu;
	struct pcpu *pc;

	mtx_init(&ap_boot_mtx, "ap boot", NULL, MTX_SPIN);

	/* set up temporary P==V mapping for AP boot */
	/* XXX this is a hack, we should boot the AP on its own stack/PTD */

	/* start each AP */
	for (cpu = 1; cpu < mp_ncpus; cpu++) {
		apic_id = cpu_apic_ids[cpu];

		bootAP = cpu;
		/* each AP gets its own one-page, 512-entry GDT */
		bootAPgdt = gdt + (512*cpu);

		/* Get per-cpu data */
		pc = &__pcpu[bootAP];
		pcpu_init(pc, bootAP, sizeof(struct pcpu));
		dpcpu_init((void *)kmem_alloc(kernel_map, DPCPU_SIZE), bootAP);
		pc->pc_apic_id = cpu_apic_ids[bootAP];
		pc->pc_prvspace = pc;
		pc->pc_curthread = NULL;

		gdt_segs[GPRIV_SEL].ssd_base = (int) pc;
		gdt_segs[GPROC0_SEL].ssd_base = (int) &pc->pc_common_tss;

		PT_SET_MA(bootAPgdt, VTOM(bootAPgdt) | PG_V | PG_RW);
		bzero(bootAPgdt, PAGE_SIZE);
		for (x = 0; x < NGDT; x++)
			ssdtosd(&gdt_segs[x], &bootAPgdt[x].sd);
		PT_SET_MA(bootAPgdt, vtomach(bootAPgdt) | PG_V);
#ifdef notyet

		if (HYPERVISOR_vcpu_op(VCPUOP_get_physid, cpu, &cpu_id) == 0) {
			apicid = xen_vcpu_physid_to_x86_apicid(cpu_id.phys_id);
			acpiid = xen_vcpu_physid_to_x86_acpiid(cpu_id.phys_id);
#ifdef CONFIG_ACPI
			if (acpiid != 0xff)
				x86_acpiid_to_apicid[acpiid] = apicid;
#endif
		}
#endif

		/* attempt to start the Application Processor */
		if (!start_ap(cpu)) {
			printf("AP #%d (PHY# %d) failed!\n", cpu, apic_id);
			/* better panic as the AP may be running loose */
			printf("panic y/n? [y] ");
			if (cngetc() != 'n')
				panic("bye-bye");
		}

		CPU_SET(cpu, &all_cpus);	/* record AP in CPU map */
	}

	pmap_invalidate_range(kernel_pmap, 0, NKPT * NBPDR - 1);

	/* number of APs actually started */
	return (mp_naps);
}
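/*
 * A PV guest does not load an IDT with lidt; it registers a table of
 * trap_info_t entries with the hypervisor instead.  smp_trap_init()
 * below copies the kernel's trap_table into the new vcpu's context so
 * an AP starts with the same trap handlers as the BSP.
 */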
extern uint8_t *pcpu_boot_stack;
extern trap_info_t trap_table[];

static void
smp_trap_init(trap_info_t *trap_ctxt)
{
	const trap_info_t *t = trap_table;

	for (t = trap_table; t->address; t++) {
		trap_ctxt[t->vector].flags = t->flags;
		trap_ctxt[t->vector].cs = t->cs;
		trap_ctxt[t->vector].address = t->address;
	}
}

extern int nkpt;
static void
cpu_initialize_context(unsigned int cpu)
{
	/* vcpu_guest_context_t is too large to allocate on the stack.
	 * Hence we allocate statically and protect it with a lock */
	vm_page_t m[NPGPTD + 2];	/* sized to match the layout below */
	static vcpu_guest_context_t ctxt;
	vm_offset_t boot_stack;
	vm_offset_t newPTD;
	vm_paddr_t ma[NPGPTD];
	static int color;
	int i;

	/*
	 * Pages [0..NPGPTD-1]: PTD
	 * Page  [NPGPTD]:      boot stack
	 * Page  [NPGPTD + 1]:  PDPT
	 */
	for (i = 0; i < NPGPTD + 2; i++) {
		m[i] = vm_page_alloc(NULL, color++,
		    VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
		    VM_ALLOC_ZERO);

		pmap_zero_page(m[i]);
	}
	boot_stack = kmem_alloc_nofault(kernel_map, PAGE_SIZE);
	newPTD = kmem_alloc_nofault(kernel_map, NPGPTD * PAGE_SIZE);
	ma[0] = VM_PAGE_TO_MACH(m[0])|PG_V;

#ifdef PAE
	pmap_kenter(boot_stack, VM_PAGE_TO_PHYS(m[NPGPTD + 1]));
	for (i = 0; i < NPGPTD; i++) {
		((vm_paddr_t *)boot_stack)[i] =
		    ma[i] = VM_PAGE_TO_MACH(m[i])|PG_V;
	}
#endif

	/*
	 * Copy cpu0 IdlePTD to new IdlePTD - copying only
	 * kernel mappings
	 */
	pmap_qenter(newPTD, m, 4);

	memcpy((uint8_t *)newPTD + KPTDI*sizeof(vm_paddr_t),
	    (uint8_t *)PTOV(IdlePTD) + KPTDI*sizeof(vm_paddr_t),
	    nkpt*sizeof(vm_paddr_t));

	pmap_qremove(newPTD, 4);
	kmem_free(kernel_map, newPTD, NPGPTD * PAGE_SIZE);
	/*
	 * map actual idle stack to boot_stack
	 */
	pmap_kenter(boot_stack, VM_PAGE_TO_PHYS(m[NPGPTD]));

	xen_pgdpt_pin(VM_PAGE_TO_MACH(m[NPGPTD + 1]));
	vm_page_lock_queues();
	for (i = 0; i < 4; i++) {
		int pdir = (PTDPTDI + i) / NPDEPG;
		int curoffset = (PTDPTDI + i) % NPDEPG;

		/* install the recursive PTD self-mapping entries */
		xen_queue_pt_update((vm_paddr_t)
		    ((ma[pdir] & ~PG_V) + (curoffset*sizeof(vm_paddr_t))),
		    ma[i]);
	}
	PT_UPDATES_FLUSH();
	vm_page_unlock_queues();

	memset(&ctxt, 0, sizeof(ctxt));
	ctxt.flags = VGCF_IN_KERNEL;
	ctxt.user_regs.ds = GSEL(GDATA_SEL, SEL_KPL);
	ctxt.user_regs.es = GSEL(GDATA_SEL, SEL_KPL);
	ctxt.user_regs.fs = GSEL(GPRIV_SEL, SEL_KPL);
	ctxt.user_regs.gs = GSEL(GDATA_SEL, SEL_KPL);
	ctxt.user_regs.cs = GSEL(GCODE_SEL, SEL_KPL);
	ctxt.user_regs.ss = GSEL(GDATA_SEL, SEL_KPL);
	ctxt.user_regs.eip = (unsigned long)init_secondary;
	ctxt.user_regs.eflags = PSL_KERNEL | 0x1000; /* IOPL_RING1 */

	memset(&ctxt.fpu_ctxt, 0, sizeof(ctxt.fpu_ctxt));

	smp_trap_init(ctxt.trap_ctxt);

	ctxt.ldt_ents = 0;
	/* gdt_frames[] holds machine frame numbers, not virtual addresses */
	ctxt.gdt_frames[0] =
	    (uint32_t)((uint64_t)vtomach(bootAPgdt) >> PAGE_SHIFT);
	ctxt.gdt_ents = 512;

#ifdef __i386__
	ctxt.user_regs.esp = boot_stack + PAGE_SIZE;

	ctxt.kernel_ss = GSEL(GDATA_SEL, SEL_KPL);
	ctxt.kernel_sp = boot_stack + PAGE_SIZE;

	ctxt.event_callback_cs = GSEL(GCODE_SEL, SEL_KPL);
	ctxt.event_callback_eip = (unsigned long)Xhypervisor_callback;
	ctxt.failsafe_callback_cs = GSEL(GCODE_SEL, SEL_KPL);
	ctxt.failsafe_callback_eip = (unsigned long)failsafe_callback;

	ctxt.ctrlreg[3] = VM_PAGE_TO_MACH(m[NPGPTD + 1]);
#else /* __x86_64__ */
	ctxt.user_regs.esp = idle->thread.rsp0 - sizeof(struct pt_regs);
	ctxt.kernel_ss = GSEL(GDATA_SEL, SEL_KPL);
	ctxt.kernel_sp = idle->thread.rsp0;

	ctxt.event_callback_eip = (unsigned long)hypervisor_callback;
	ctxt.failsafe_callback_eip = (unsigned long)failsafe_callback;
	ctxt.syscall_callback_eip = (unsigned long)system_call;

	ctxt.ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(init_level4_pgt));

	ctxt.gs_base_kernel = (unsigned long)(cpu_pda(cpu));
#endif

	printf("gdtpfn=%lx pdptpfn=%lx\n",
	    ctxt.gdt_frames[0],
	    ctxt.ctrlreg[3] >> PAGE_SHIFT);

	PANIC_IF(HYPERVISOR_vcpu_op(VCPUOP_initialise, cpu, &ctxt));
	DELAY(3000);
	PANIC_IF(HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL));
}
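/*
 * AP startup handshake, in brief: start_ap() snapshots mp_naps before
 * cpu_initialize_context() constructs and unpauses the vcpu.  The AP
 * enters init_secondary(), increments mp_naps, and then spins on
 * aps_ready until release_aps() runs at SI_SUB_SMP.  The BSP polls
 * mp_naps for up to five seconds before declaring the AP dead.
 */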
/*
 * This function starts the AP (application processor) identified
 * by the CPU ID 'cpu'.  It does quite a "song and dance"
 * to accomplish this.  This is necessary because of the nuances
 * of the different hardware we might encounter.  It isn't pretty,
 * but it seems to work.
 */

int cpus;
static int
start_ap(int cpu)
{
	int ms;

	/* used as a watchpoint to signal AP startup */
	cpus = mp_naps;

	cpu_initialize_context(cpu);

	/* Wait up to 5 seconds for it to start. */
	for (ms = 0; ms < 5000; ms++) {
		if (mp_naps > cpus)
			return (1);	/* return SUCCESS */
		DELAY(1000);
	}
	return (0);	/* return FAILURE */
}
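/*
 * Bitmap IPIs coalesce: the requested bit is ORed into the target CPU's
 * cpu_ipi_pending word with a compare-and-swap loop, and the event is
 * only delivered when the word was previously empty; a CPU that already
 * has work pending picks the new bit up in smp_reschedule_interrupt().
 */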
/*
 * Send an IPI to a specific CPU.
 */
static void
ipi_send_cpu(int cpu, u_int ipi)
{
	u_int bitmap, old_pending, new_pending;

	if (IPI_IS_BITMAPED(ipi)) {
		bitmap = 1 << ipi;
		ipi = IPI_BITMAP_VECTOR;
		do {
			old_pending = cpu_ipi_pending[cpu];
			new_pending = old_pending | bitmap;
		} while (!atomic_cmpset_int(&cpu_ipi_pending[cpu],
		    old_pending, new_pending));
		if (!old_pending)
			ipi_pcpu(cpu, RESCHEDULE_VECTOR);
	} else {
		KASSERT(call_data != NULL, ("call_data not set"));
		ipi_pcpu(cpu, CALL_FUNCTION_VECTOR);
	}
}

/*
 * Flush the TLB on all other CPUs.
 */
static void
smp_tlb_shootdown(u_int vector, vm_offset_t addr1, vm_offset_t addr2)
{
	u_int ncpu;
	struct _call_data data;

	ncpu = mp_ncpus - 1;	/* does not shootdown self */
	if (ncpu < 1)
		return;		/* no other cpus */
	if (!(read_eflags() & PSL_I))
		panic("%s: interrupts disabled", __func__);
	mtx_lock_spin(&smp_ipi_mtx);
	KASSERT(call_data == NULL, ("call_data isn't null?!"));
	call_data = &data;
	call_data->func_id = vector;
	call_data->wait = 0;	/* responders need not signal 'finished' */
	call_data->arg1 = addr1;
	call_data->arg2 = addr2;
	atomic_store_rel_int(&smp_tlb_wait, 0);
	ipi_all_but_self(vector);
	while (smp_tlb_wait < ncpu)
		ia32_pause();
	call_data = NULL;
	mtx_unlock_spin(&smp_ipi_mtx);
}

static void
smp_targeted_tlb_shootdown(cpuset_t mask, u_int vector, vm_offset_t addr1,
    vm_offset_t addr2)
{
	int cpu, ncpu, othercpus;
	struct _call_data data;

	othercpus = mp_ncpus - 1;
	if (CPU_ISFULLSET(&mask)) {
		if (othercpus < 1)
			return;
	} else {
		CPU_CLR(PCPU_GET(cpuid), &mask);
		if (CPU_EMPTY(&mask))
			return;
	}
	if (!(read_eflags() & PSL_I))
		panic("%s: interrupts disabled", __func__);
	mtx_lock_spin(&smp_ipi_mtx);
	KASSERT(call_data == NULL, ("call_data isn't null?!"));
	call_data = &data;
	call_data->func_id = vector;
	call_data->wait = 0;	/* responders need not signal 'finished' */
	call_data->arg1 = addr1;
	call_data->arg2 = addr2;
	atomic_store_rel_int(&smp_tlb_wait, 0);
	if (CPU_ISFULLSET(&mask)) {
		ncpu = othercpus;
		ipi_all_but_self(vector);
	} else {
		ncpu = 0;
		while ((cpu = cpusetobj_ffs(&mask)) != 0) {
			cpu--;
			CPU_CLR(cpu, &mask);
			CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu,
			    vector);
			ipi_send_cpu(cpu, vector);
			ncpu++;
		}
	}
	while (smp_tlb_wait < ncpu)
		ia32_pause();
	call_data = NULL;
	mtx_unlock_spin(&smp_ipi_mtx);
}

void
smp_cache_flush(void)
{

	if (smp_started)
		smp_tlb_shootdown(IPI_INVLCACHE, 0, 0);
}

void
smp_invltlb(void)
{

	if (smp_started) {
		smp_tlb_shootdown(IPI_INVLTLB, 0, 0);
	}
}

void
smp_invlpg(vm_offset_t addr)
{

	if (smp_started) {
		smp_tlb_shootdown(IPI_INVLPG, addr, 0);
	}
}

void
smp_invlpg_range(vm_offset_t addr1, vm_offset_t addr2)
{

	if (smp_started) {
		smp_tlb_shootdown(IPI_INVLRNG, addr1, addr2);
	}
}

void
smp_masked_invltlb(cpuset_t mask)
{

	if (smp_started) {
		smp_targeted_tlb_shootdown(mask, IPI_INVLTLB, 0, 0);
	}
}

void
smp_masked_invlpg(cpuset_t mask, vm_offset_t addr)
{

	if (smp_started) {
		smp_targeted_tlb_shootdown(mask, IPI_INVLPG, addr, 0);
	}
}

void
smp_masked_invlpg_range(cpuset_t mask, vm_offset_t addr1, vm_offset_t addr2)
{

	if (smp_started) {
		smp_targeted_tlb_shootdown(mask, IPI_INVLRNG, addr1, addr2);
	}
}
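/*
 * The IPI_STOP_HARD bookkeeping below mirrors the native i386 code: an
 * NMI carries no payload identifying its source, so the sender first
 * sets the target's bit in ipi_nmi_pending, letting ipi_nmi_handler()
 * tell a stop request apart from an unrelated NMI.
 */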
/*
 * Send an IPI to a set of CPUs.
 */
void
ipi_selected(cpuset_t cpus, u_int ipi)
{
	int cpu;

	/*
	 * IPI_STOP_HARD maps to a NMI and the trap handler needs some
	 * help to identify the source.  Record the mask of receiving
	 * CPUs for this purpose.
	 */
	if (ipi == IPI_STOP_HARD)
		CPU_OR_ATOMIC(&ipi_nmi_pending, &cpus);

	while ((cpu = cpusetobj_ffs(&cpus)) != 0) {
		cpu--;
		CPU_CLR(cpu, &cpus);
		CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi);
		ipi_send_cpu(cpu, ipi);
	}
}

/*
 * Send an IPI to a specific CPU.
 */
void
ipi_cpu(int cpu, u_int ipi)
{

	/*
	 * IPI_STOP_HARD maps to a NMI and the trap handler needs some
	 * help to identify the source.  Record the receiving CPU for
	 * this purpose.
	 */
	if (ipi == IPI_STOP_HARD)
		CPU_SET_ATOMIC(cpu, &ipi_nmi_pending);

	CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi);
	ipi_send_cpu(cpu, ipi);
}

/*
 * Send an IPI to all CPUs EXCEPT myself.
 */
void
ipi_all_but_self(u_int ipi)
{
	cpuset_t other_cpus;

	/*
	 * IPI_STOP_HARD maps to a NMI and the trap handler needs some
	 * help to identify the source.  Record the mask of receiving
	 * CPUs for this purpose.
	 */
	other_cpus = all_cpus;
	CPU_CLR(PCPU_GET(cpuid), &other_cpus);
	if (ipi == IPI_STOP_HARD)
		CPU_OR_ATOMIC(&ipi_nmi_pending, &other_cpus);

	CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi);
	ipi_selected(other_cpus, ipi);
}

int
ipi_nmi_handler(void)
{
	u_int cpuid;

	/*
	 * As long as there is not a simple way to know about a NMI's
	 * source, if the bitmask for the current CPU is present in
	 * the global pending bitword an IPI_STOP_HARD has been issued
	 * and should be handled.
	 */
	cpuid = PCPU_GET(cpuid);
	if (!CPU_ISSET(cpuid, &ipi_nmi_pending))
		return (1);

	CPU_CLR_ATOMIC(cpuid, &ipi_nmi_pending);
	cpustop_handler();
	return (0);
}

/*
 * Handle an IPI_STOP by saving our current context and spinning until we
 * are resumed.
 */
void
cpustop_handler(void)
{
	int cpu;

	cpu = PCPU_GET(cpuid);

	savectx(&stoppcbs[cpu]);

	/* Indicate that we are stopped */
	CPU_SET_ATOMIC(cpu, &stopped_cpus);

	/* Wait for restart */
	while (!CPU_ISSET(cpu, &started_cpus))
		ia32_pause();

	CPU_CLR_ATOMIC(cpu, &started_cpus);
	CPU_CLR_ATOMIC(cpu, &stopped_cpus);

	if (cpu == 0 && cpustop_restartfunc != NULL) {
		cpustop_restartfunc();
		cpustop_restartfunc = NULL;
	}
}

/*
 * This is called once the rest of the system is up and running and we're
 * ready to let the APs out of the pen.
 */
static void
release_aps(void *dummy __unused)
{

	if (mp_ncpus == 1)
		return;
	atomic_store_rel_int(&aps_ready, 1);
	while (smp_started == 0)
		ia32_pause();
}
SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, release_aps, NULL);
SYSINIT(start_ipis, SI_SUB_INTR, SI_ORDER_ANY, xen_smp_intr_init_cpus, NULL);