mp_machdep.c revision 189420
1/*- 2 * Copyright (c) 1996, by Steve Passe 3 * Copyright (c) 2008, by Kip Macy 4 * All rights reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 2. The name of the developer may NOT be used to endorse or promote products 12 * derived from this software without specific prior written permission. 13 * 14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 24 * SUCH DAMAGE. 25 */ 26 27#include <sys/cdefs.h> 28__FBSDID("$FreeBSD: head/sys/i386/xen/mp_machdep.c 189420 2009-03-05 18:43:54Z jhb $"); 29 30#include "opt_apic.h" 31#include "opt_cpu.h" 32#include "opt_kstack_pages.h" 33#include "opt_mp_watchdog.h" 34#include "opt_sched.h" 35#include "opt_smp.h" 36 37#if !defined(lint) 38#if !defined(SMP) 39#error How did you get here? 40#endif 41 42#ifndef DEV_APIC 43#error The apic device is required for SMP, add "device apic" to your config file. 44#endif 45#if defined(CPU_DISABLE_CMPXCHG) && !defined(COMPILING_LINT) 46#error SMP not supported with CPU_DISABLE_CMPXCHG 47#endif 48#endif /* not lint */ 49 50#include <sys/param.h> 51#include <sys/systm.h> 52#include <sys/bus.h> 53#include <sys/cons.h> /* cngetc() */ 54#ifdef GPROF 55#include <sys/gmon.h> 56#endif 57#include <sys/kernel.h> 58#include <sys/ktr.h> 59#include <sys/lock.h> 60#include <sys/malloc.h> 61#include <sys/memrange.h> 62#include <sys/mutex.h> 63#include <sys/pcpu.h> 64#include <sys/proc.h> 65#include <sys/sched.h> 66#include <sys/smp.h> 67#include <sys/sysctl.h> 68 69#include <vm/vm.h> 70#include <vm/vm_param.h> 71#include <vm/pmap.h> 72#include <vm/vm_kern.h> 73#include <vm/vm_extern.h> 74#include <vm/vm_page.h> 75 76#include <machine/apicreg.h> 77#include <machine/md_var.h> 78#include <machine/mp_watchdog.h> 79#include <machine/pcb.h> 80#include <machine/psl.h> 81#include <machine/smp.h> 82#include <machine/specialreg.h> 83#include <machine/pcpu.h> 84 85 86 87#include <machine/xen/xen-os.h> 88#include <xen/evtchn.h> 89#include <xen/xen_intr.h> 90#include <xen/hypervisor.h> 91#include <xen/interface/vcpu.h> 92 93#define stop_cpus_with_nmi 0 94 95 96int mp_naps; /* # of Applications processors */ 97int boot_cpu_id = -1; /* designated BSP */ 98 99extern struct pcpu __pcpu[]; 100 101static int bootAP; 102static union descriptor *bootAPgdt; 103 104static char resched_name[NR_CPUS][15]; 105static char callfunc_name[NR_CPUS][15]; 106 107/* Free these after use */ 108void *bootstacks[MAXCPU]; 109 110/* Hotwire a 0->4MB V==P mapping */ 111extern pt_entry_t *KPTphys; 112 113struct pcb stoppcbs[MAXCPU]; 114 115/* Variables needed for SMP tlb shootdown. */ 116vm_offset_t smp_tlb_addr1; 117vm_offset_t smp_tlb_addr2; 118volatile int smp_tlb_wait; 119 120typedef void call_data_func_t(uintptr_t , uintptr_t); 121 122static u_int logical_cpus; 123 124/* used to hold the AP's until we are ready to release them */ 125static struct mtx ap_boot_mtx; 126 127/* Set to 1 once we're ready to let the APs out of the pen. */ 128static volatile int aps_ready = 0; 129 130/* 131 * Store data from cpu_add() until later in the boot when we actually setup 132 * the APs. 133 */ 134struct cpu_info { 135 int cpu_present:1; 136 int cpu_bsp:1; 137 int cpu_disabled:1; 138} static cpu_info[MAX_APIC_ID + 1]; 139int cpu_apic_ids[MAXCPU]; 140int apic_cpuids[MAX_APIC_ID + 1]; 141 142/* Holds pending bitmap based IPIs per CPU */ 143static volatile u_int cpu_ipi_pending[MAXCPU]; 144 145static void assign_cpu_ids(void); 146static void set_interrupt_apic_ids(void); 147int start_all_aps(void); 148static int start_ap(int apic_id); 149static void release_aps(void *dummy); 150 151static u_int hyperthreading_cpus; 152static cpumask_t hyperthreading_cpus_mask; 153 154extern void Xhypervisor_callback(void); 155extern void failsafe_callback(void); 156extern void pmap_lazyfix_action(void); 157 158struct cpu_group * 159cpu_topo(void) 160{ 161 if (cpu_cores == 0) 162 cpu_cores = 1; 163 if (cpu_logical == 0) 164 cpu_logical = 1; 165 if (mp_ncpus % (cpu_cores * cpu_logical) != 0) { 166 printf("WARNING: Non-uniform processors.\n"); 167 printf("WARNING: Using suboptimal topology.\n"); 168 return (smp_topo_none()); 169 } 170 /* 171 * No multi-core or hyper-threaded. 172 */ 173 if (cpu_logical * cpu_cores == 1) 174 return (smp_topo_none()); 175 /* 176 * Only HTT no multi-core. 177 */ 178 if (cpu_logical > 1 && cpu_cores == 1) 179 return (smp_topo_1level(CG_SHARE_L1, cpu_logical, CG_FLAG_HTT)); 180 /* 181 * Only multi-core no HTT. 182 */ 183 if (cpu_cores > 1 && cpu_logical == 1) 184 return (smp_topo_1level(CG_SHARE_NONE, cpu_cores, 0)); 185 /* 186 * Both HTT and multi-core. 187 */ 188 return (smp_topo_2level(CG_SHARE_NONE, cpu_cores, 189 CG_SHARE_L1, cpu_logical, CG_FLAG_HTT)); 190} 191 192/* 193 * Calculate usable address in base memory for AP trampoline code. 194 */ 195u_int 196mp_bootaddress(u_int basemem) 197{ 198 199 return (basemem); 200} 201 202void 203cpu_add(u_int apic_id, char boot_cpu) 204{ 205 206 if (apic_id > MAX_APIC_ID) { 207 panic("SMP: APIC ID %d too high", apic_id); 208 return; 209 } 210 KASSERT(cpu_info[apic_id].cpu_present == 0, ("CPU %d added twice", 211 apic_id)); 212 cpu_info[apic_id].cpu_present = 1; 213 if (boot_cpu) { 214 KASSERT(boot_cpu_id == -1, 215 ("CPU %d claims to be BSP, but CPU %d already is", apic_id, 216 boot_cpu_id)); 217 boot_cpu_id = apic_id; 218 cpu_info[apic_id].cpu_bsp = 1; 219 } 220 if (mp_ncpus < MAXCPU) 221 mp_ncpus++; 222 if (bootverbose) 223 printf("SMP: Added CPU %d (%s)\n", apic_id, boot_cpu ? "BSP" : 224 "AP"); 225} 226 227void 228cpu_mp_setmaxid(void) 229{ 230 231 mp_maxid = MAXCPU - 1; 232} 233 234int 235cpu_mp_probe(void) 236{ 237 238 /* 239 * Always record BSP in CPU map so that the mbuf init code works 240 * correctly. 241 */ 242 all_cpus = 1; 243 if (mp_ncpus == 0) { 244 /* 245 * No CPUs were found, so this must be a UP system. Setup 246 * the variables to represent a system with a single CPU 247 * with an id of 0. 248 */ 249 mp_ncpus = 1; 250 return (0); 251 } 252 253 /* At least one CPU was found. */ 254 if (mp_ncpus == 1) { 255 /* 256 * One CPU was found, so this must be a UP system with 257 * an I/O APIC. 258 */ 259 return (0); 260 } 261 262 /* At least two CPUs were found. */ 263 return (1); 264} 265 266/* 267 * Initialize the IPI handlers and start up the AP's. 268 */ 269void 270cpu_mp_start(void) 271{ 272 int i; 273 274 /* Initialize the logical ID to APIC ID table. */ 275 for (i = 0; i < MAXCPU; i++) { 276 cpu_apic_ids[i] = -1; 277 cpu_ipi_pending[i] = 0; 278 } 279 280 /* Set boot_cpu_id if needed. */ 281 if (boot_cpu_id == -1) { 282 boot_cpu_id = PCPU_GET(apic_id); 283 cpu_info[boot_cpu_id].cpu_bsp = 1; 284 } else 285 KASSERT(boot_cpu_id == PCPU_GET(apic_id), 286 ("BSP's APIC ID doesn't match boot_cpu_id")); 287 cpu_apic_ids[0] = boot_cpu_id; 288 apic_cpuids[boot_cpu_id] = 0; 289 290 assign_cpu_ids(); 291 292 /* Start each Application Processor */ 293 start_all_aps(); 294 295 /* Setup the initial logical CPUs info. */ 296 logical_cpus = logical_cpus_mask = 0; 297 if (cpu_feature & CPUID_HTT) 298 logical_cpus = (cpu_procinfo & CPUID_HTT_CORES) >> 16; 299 300 set_interrupt_apic_ids(); 301} 302 303 304static void 305iv_rendezvous(uintptr_t a, uintptr_t b) 306{ 307 smp_rendezvous_action(); 308} 309 310static void 311iv_invltlb(uintptr_t a, uintptr_t b) 312{ 313 xen_tlb_flush(); 314} 315 316static void 317iv_invlpg(uintptr_t a, uintptr_t b) 318{ 319 xen_invlpg(a); 320} 321 322static void 323iv_invlrng(uintptr_t a, uintptr_t b) 324{ 325 vm_offset_t start = (vm_offset_t)a; 326 vm_offset_t end = (vm_offset_t)b; 327 328 while (start < end) { 329 xen_invlpg(start); 330 start += PAGE_SIZE; 331 } 332} 333 334 335static void 336iv_invlcache(uintptr_t a, uintptr_t b) 337{ 338 339 wbinvd(); 340 atomic_add_int(&smp_tlb_wait, 1); 341} 342 343static void 344iv_lazypmap(uintptr_t a, uintptr_t b) 345{ 346 pmap_lazyfix_action(); 347 atomic_add_int(&smp_tlb_wait, 1); 348} 349 350 351static void 352iv_noop(uintptr_t a, uintptr_t b) 353{ 354 atomic_add_int(&smp_tlb_wait, 1); 355} 356 357static call_data_func_t *ipi_vectors[IPI_BITMAP_VECTOR] = 358{ 359 iv_noop, 360 iv_noop, 361 iv_rendezvous, 362 iv_invltlb, 363 iv_invlpg, 364 iv_invlrng, 365 iv_invlcache, 366 iv_lazypmap, 367}; 368 369/* 370 * Reschedule call back. Nothing to do, 371 * all the work is done automatically when 372 * we return from the interrupt. 373 */ 374static int 375smp_reschedule_interrupt(void *unused) 376{ 377 int cpu = PCPU_GET(cpuid); 378 u_int ipi_bitmap; 379 380 ipi_bitmap = atomic_readandclear_int(&cpu_ipi_pending[cpu]); 381 382 if (ipi_bitmap & (1 << IPI_PREEMPT)) { 383#ifdef COUNT_IPIS 384 (*ipi_preempt_counts[cpu])++; 385#endif 386 sched_preempt(curthread); 387 } 388 389 if (ipi_bitmap & (1 << IPI_AST)) { 390#ifdef COUNT_IPIS 391 (*ipi_ast_counts[cpu])++; 392#endif 393 /* Nothing to do for AST */ 394 } 395 return (FILTER_HANDLED); 396} 397 398struct _call_data { 399 uint16_t func_id; 400 uint16_t wait; 401 uintptr_t arg1; 402 uintptr_t arg2; 403 atomic_t started; 404 atomic_t finished; 405}; 406 407static struct _call_data *call_data; 408 409static int 410smp_call_function_interrupt(void *unused) 411{ 412 call_data_func_t *func; 413 uintptr_t arg1 = call_data->arg1; 414 uintptr_t arg2 = call_data->arg2; 415 int wait = call_data->wait; 416 atomic_t *started = &call_data->started; 417 atomic_t *finished = &call_data->finished; 418 419 if (call_data->func_id > IPI_BITMAP_VECTOR) 420 panic("invalid function id %u", call_data->func_id); 421 422 func = ipi_vectors[call_data->func_id]; 423 /* 424 * Notify initiating CPU that I've grabbed the data and am 425 * about to execute the function 426 */ 427 mb(); 428 atomic_inc(started); 429 /* 430 * At this point the info structure may be out of scope unless wait==1 431 */ 432 (*func)(arg1, arg2); 433 434 if (wait) { 435 mb(); 436 atomic_inc(finished); 437 } 438 atomic_add_int(&smp_tlb_wait, 1); 439 return (FILTER_HANDLED); 440} 441 442/* 443 * Print various information about the SMP system hardware and setup. 444 */ 445void 446cpu_mp_announce(void) 447{ 448 int i, x; 449 450 /* List CPUs */ 451 printf(" cpu0 (BSP): APIC ID: %2d\n", boot_cpu_id); 452 for (i = 1, x = 0; x <= MAX_APIC_ID; x++) { 453 if (!cpu_info[x].cpu_present || cpu_info[x].cpu_bsp) 454 continue; 455 if (cpu_info[x].cpu_disabled) 456 printf(" cpu (AP): APIC ID: %2d (disabled)\n", x); 457 else { 458 KASSERT(i < mp_ncpus, 459 ("mp_ncpus and actual cpus are out of whack")); 460 printf(" cpu%d (AP): APIC ID: %2d\n", i++, x); 461 } 462 } 463} 464 465static int 466xen_smp_intr_init(unsigned int cpu) 467{ 468 int rc; 469 unsigned int irq; 470 471 per_cpu(resched_irq, cpu) = per_cpu(callfunc_irq, cpu) = -1; 472 473 sprintf(resched_name[cpu], "resched%u", cpu); 474 rc = bind_ipi_to_irqhandler(RESCHEDULE_VECTOR, 475 cpu, 476 resched_name[cpu], 477 smp_reschedule_interrupt, 478 INTR_FAST|INTR_TYPE_TTY|INTR_MPSAFE, &irq); 479 480 printf("cpu=%d irq=%d vector=%d\n", 481 cpu, rc, RESCHEDULE_VECTOR); 482 483 per_cpu(resched_irq, cpu) = irq; 484 485 sprintf(callfunc_name[cpu], "callfunc%u", cpu); 486 rc = bind_ipi_to_irqhandler(CALL_FUNCTION_VECTOR, 487 cpu, 488 callfunc_name[cpu], 489 smp_call_function_interrupt, 490 INTR_FAST|INTR_TYPE_TTY|INTR_MPSAFE, &irq); 491 if (rc < 0) 492 goto fail; 493 per_cpu(callfunc_irq, cpu) = irq; 494 495 printf("cpu=%d irq=%d vector=%d\n", 496 cpu, rc, CALL_FUNCTION_VECTOR); 497 498 499 if ((cpu != 0) && ((rc = ap_cpu_initclocks(cpu)) != 0)) 500 goto fail; 501 502 return 0; 503 504 fail: 505 if (per_cpu(resched_irq, cpu) >= 0) 506 unbind_from_irqhandler(per_cpu(resched_irq, cpu)); 507 if (per_cpu(callfunc_irq, cpu) >= 0) 508 unbind_from_irqhandler(per_cpu(callfunc_irq, cpu)); 509 return rc; 510} 511 512static void 513xen_smp_intr_init_cpus(void *unused) 514{ 515 int i; 516 517 for (i = 0; i < mp_ncpus; i++) 518 xen_smp_intr_init(i); 519} 520 521#define MTOPSIZE (1<<(14 + PAGE_SHIFT)) 522 523/* 524 * AP CPU's call this to initialize themselves. 525 */ 526void 527init_secondary(void) 528{ 529 vm_offset_t addr; 530 int gsel_tss; 531 532 533 /* bootAP is set in start_ap() to our ID. */ 534 PCPU_SET(currentldt, _default_ldt); 535 gsel_tss = GSEL(GPROC0_SEL, SEL_KPL); 536#if 0 537 gdt[bootAP * NGDT + GPROC0_SEL].sd.sd_type = SDT_SYS386TSS; 538#endif 539 PCPU_SET(common_tss.tss_esp0, 0); /* not used until after switch */ 540 PCPU_SET(common_tss.tss_ss0, GSEL(GDATA_SEL, SEL_KPL)); 541 PCPU_SET(common_tss.tss_ioopt, (sizeof (struct i386tss)) << 16); 542#if 0 543 PCPU_SET(tss_gdt, &gdt[bootAP * NGDT + GPROC0_SEL].sd); 544 545 PCPU_SET(common_tssd, *PCPU_GET(tss_gdt)); 546#endif 547 PCPU_SET(fsgs_gdt, &gdt[GUFS_SEL].sd); 548 549 /* 550 * Set to a known state: 551 * Set by mpboot.s: CR0_PG, CR0_PE 552 * Set by cpu_setregs: CR0_NE, CR0_MP, CR0_TS, CR0_WP, CR0_AM 553 */ 554 /* 555 * signal our startup to the BSP. 556 */ 557 mp_naps++; 558 559 /* Spin until the BSP releases the AP's. */ 560 while (!aps_ready) 561 ia32_pause(); 562 563 /* BSP may have changed PTD while we were waiting */ 564 invltlb(); 565 for (addr = 0; addr < NKPT * NBPDR - 1; addr += PAGE_SIZE) 566 invlpg(addr); 567 568 /* set up FPU state on the AP */ 569 npxinit(); 570#if 0 571 572 /* set up SSE registers */ 573 enable_sse(); 574#endif 575#if 0 && defined(PAE) 576 /* Enable the PTE no-execute bit. */ 577 if ((amd_feature & AMDID_NX) != 0) { 578 uint64_t msr; 579 580 msr = rdmsr(MSR_EFER) | EFER_NXE; 581 wrmsr(MSR_EFER, msr); 582 } 583#endif 584#if 0 585 /* A quick check from sanity claus */ 586 if (PCPU_GET(apic_id) != lapic_id()) { 587 printf("SMP: cpuid = %d\n", PCPU_GET(cpuid)); 588 printf("SMP: actual apic_id = %d\n", lapic_id()); 589 printf("SMP: correct apic_id = %d\n", PCPU_GET(apic_id)); 590 panic("cpuid mismatch! boom!!"); 591 } 592#endif 593 594 /* Initialize curthread. */ 595 KASSERT(PCPU_GET(idlethread) != NULL, ("no idle thread")); 596 PCPU_SET(curthread, PCPU_GET(idlethread)); 597 598 mtx_lock_spin(&ap_boot_mtx); 599#if 0 600 601 /* Init local apic for irq's */ 602 lapic_setup(1); 603#endif 604 smp_cpus++; 605 606 CTR1(KTR_SMP, "SMP: AP CPU #%d Launched", PCPU_GET(cpuid)); 607 printf("SMP: AP CPU #%d Launched!\n", PCPU_GET(cpuid)); 608 609 /* Determine if we are a logical CPU. */ 610 if (logical_cpus > 1 && PCPU_GET(apic_id) % logical_cpus != 0) 611 logical_cpus_mask |= PCPU_GET(cpumask); 612 613 /* Determine if we are a hyperthread. */ 614 if (hyperthreading_cpus > 1 && 615 PCPU_GET(apic_id) % hyperthreading_cpus != 0) 616 hyperthreading_cpus_mask |= PCPU_GET(cpumask); 617 618 /* Build our map of 'other' CPUs. */ 619 PCPU_SET(other_cpus, all_cpus & ~PCPU_GET(cpumask)); 620#if 0 621 if (bootverbose) 622 lapic_dump("AP"); 623#endif 624 if (smp_cpus == mp_ncpus) { 625 /* enable IPI's, tlb shootdown, freezes etc */ 626 atomic_store_rel_int(&smp_started, 1); 627 smp_active = 1; /* historic */ 628 } 629 630 mtx_unlock_spin(&ap_boot_mtx); 631 632 /* wait until all the AP's are up */ 633 while (smp_started == 0) 634 ia32_pause(); 635 636 637 PCPU_SET(curthread, PCPU_GET(idlethread)); 638 /* enter the scheduler */ 639 sched_throw(NULL); 640 641 panic("scheduler returned us to %s", __func__); 642 /* NOTREACHED */ 643} 644 645/******************************************************************* 646 * local functions and data 647 */ 648 649/* 650 * We tell the I/O APIC code about all the CPUs we want to receive 651 * interrupts. If we don't want certain CPUs to receive IRQs we 652 * can simply not tell the I/O APIC code about them in this function. 653 * We also do not tell it about the BSP since it tells itself about 654 * the BSP internally to work with UP kernels and on UP machines. 655 */ 656static void 657set_interrupt_apic_ids(void) 658{ 659 u_int i, apic_id; 660 661 for (i = 0; i < MAXCPU; i++) { 662 apic_id = cpu_apic_ids[i]; 663 if (apic_id == -1) 664 continue; 665 if (cpu_info[apic_id].cpu_bsp) 666 continue; 667 if (cpu_info[apic_id].cpu_disabled) 668 continue; 669 670 /* Don't let hyperthreads service interrupts. */ 671 if (hyperthreading_cpus > 1 && 672 apic_id % hyperthreading_cpus != 0) 673 continue; 674 675 intr_add_cpu(i); 676 } 677} 678 679/* 680 * Assign logical CPU IDs to local APICs. 681 */ 682static void 683assign_cpu_ids(void) 684{ 685 u_int i; 686 687 /* Check for explicitly disabled CPUs. */ 688 for (i = 0; i <= MAX_APIC_ID; i++) { 689 if (!cpu_info[i].cpu_present || cpu_info[i].cpu_bsp) 690 continue; 691 692 /* Don't use this CPU if it has been disabled by a tunable. */ 693 if (resource_disabled("lapic", i)) { 694 cpu_info[i].cpu_disabled = 1; 695 continue; 696 } 697 } 698 699 /* 700 * Assign CPU IDs to local APIC IDs and disable any CPUs 701 * beyond MAXCPU. CPU 0 has already been assigned to the BSP, 702 * so we only have to assign IDs for APs. 703 */ 704 mp_ncpus = 1; 705 for (i = 0; i <= MAX_APIC_ID; i++) { 706 if (!cpu_info[i].cpu_present || cpu_info[i].cpu_bsp || 707 cpu_info[i].cpu_disabled) 708 continue; 709 710 if (mp_ncpus < MAXCPU) { 711 cpu_apic_ids[mp_ncpus] = i; 712 apic_cpuids[i] = mp_ncpus; 713 mp_ncpus++; 714 } else 715 cpu_info[i].cpu_disabled = 1; 716 } 717 KASSERT(mp_maxid >= mp_ncpus - 1, 718 ("%s: counters out of sync: max %d, count %d", __func__, mp_maxid, 719 mp_ncpus)); 720} 721 722/* 723 * start each AP in our list 724 */ 725/* Lowest 1MB is already mapped: don't touch*/ 726#define TMPMAP_START 1 727int 728start_all_aps(void) 729{ 730 int x,apic_id, cpu; 731 struct pcpu *pc; 732 733 mtx_init(&ap_boot_mtx, "ap boot", NULL, MTX_SPIN); 734 735 /* set up temporary P==V mapping for AP boot */ 736 /* XXX this is a hack, we should boot the AP on its own stack/PTD */ 737 738 /* start each AP */ 739 for (cpu = 1; cpu < mp_ncpus; cpu++) { 740 apic_id = cpu_apic_ids[cpu]; 741 742 743 bootAP = cpu; 744 bootAPgdt = gdt + (512*cpu); 745 746 /* Get per-cpu data */ 747 pc = &__pcpu[bootAP]; 748 pcpu_init(pc, bootAP, sizeof(struct pcpu)); 749 pc->pc_apic_id = cpu_apic_ids[bootAP]; 750 pc->pc_prvspace = pc; 751 pc->pc_curthread = 0; 752 753 gdt_segs[GPRIV_SEL].ssd_base = (int) pc; 754 gdt_segs[GPROC0_SEL].ssd_base = (int) &pc->pc_common_tss; 755 756 PT_SET_MA(bootAPgdt, xpmap_ptom(VTOP(bootAPgdt)) | PG_V | PG_RW); 757 bzero(bootAPgdt, PAGE_SIZE); 758 for (x = 0; x < NGDT; x++) 759 ssdtosd(&gdt_segs[x], &bootAPgdt[x].sd); 760 PT_SET_MA(bootAPgdt, vtomach(bootAPgdt) | PG_V); 761#ifdef notyet 762 763 if (HYPERVISOR_vcpu_op(VCPUOP_get_physid, cpu, &cpu_id) == 0) { 764 apicid = xen_vcpu_physid_to_x86_apicid(cpu_id.phys_id); 765 acpiid = xen_vcpu_physid_to_x86_acpiid(cpu_id.phys_id); 766#ifdef CONFIG_ACPI 767 if (acpiid != 0xff) 768 x86_acpiid_to_apicid[acpiid] = apicid; 769#endif 770 } 771#endif 772 773 /* attempt to start the Application Processor */ 774 if (!start_ap(cpu)) { 775 printf("AP #%d (PHY# %d) failed!\n", cpu, apic_id); 776 /* better panic as the AP may be running loose */ 777 printf("panic y/n? [y] "); 778 if (cngetc() != 'n') 779 panic("bye-bye"); 780 } 781 782 all_cpus |= (1 << cpu); /* record AP in CPU map */ 783 } 784 785 786 /* build our map of 'other' CPUs */ 787 PCPU_SET(other_cpus, all_cpus & ~PCPU_GET(cpumask)); 788 789 pmap_invalidate_range(kernel_pmap, 0, NKPT * NBPDR - 1); 790 791 /* number of APs actually started */ 792 return mp_naps; 793} 794 795extern uint8_t *pcpu_boot_stack; 796extern trap_info_t trap_table[]; 797 798static void 799smp_trap_init(trap_info_t *trap_ctxt) 800{ 801 const trap_info_t *t = trap_table; 802 803 for (t = trap_table; t->address; t++) { 804 trap_ctxt[t->vector].flags = t->flags; 805 trap_ctxt[t->vector].cs = t->cs; 806 trap_ctxt[t->vector].address = t->address; 807 } 808} 809 810extern int nkpt; 811static void 812cpu_initialize_context(unsigned int cpu) 813{ 814 /* vcpu_guest_context_t is too large to allocate on the stack. 815 * Hence we allocate statically and protect it with a lock */ 816 vm_page_t m[4]; 817 static vcpu_guest_context_t ctxt; 818 vm_offset_t boot_stack; 819 vm_offset_t newPTD; 820 vm_paddr_t ma[NPGPTD]; 821 static int color; 822 int i; 823 824 /* 825 * Page 0,[0-3] PTD 826 * Page 1, [4] boot stack 827 * Page [5] PDPT 828 * 829 */ 830 for (i = 0; i < NPGPTD + 2; i++) { 831 m[i] = vm_page_alloc(NULL, color++, 832 VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | 833 VM_ALLOC_ZERO); 834 835 pmap_zero_page(m[i]); 836 837 } 838 boot_stack = kmem_alloc_nofault(kernel_map, 1); 839 newPTD = kmem_alloc_nofault(kernel_map, NPGPTD); 840 ma[0] = xpmap_ptom(VM_PAGE_TO_PHYS(m[0]))|PG_V; 841 842#ifdef PAE 843 pmap_kenter(boot_stack, VM_PAGE_TO_PHYS(m[NPGPTD + 1])); 844 for (i = 0; i < NPGPTD; i++) { 845 ((vm_paddr_t *)boot_stack)[i] = 846 ma[i] = 847 xpmap_ptom(VM_PAGE_TO_PHYS(m[i]))|PG_V; 848 } 849#endif 850 851 /* 852 * Copy cpu0 IdlePTD to new IdlePTD - copying only 853 * kernel mappings 854 */ 855 pmap_qenter(newPTD, m, 4); 856 857 memcpy((uint8_t *)newPTD + KPTDI*sizeof(vm_paddr_t), 858 (uint8_t *)PTOV(IdlePTD) + KPTDI*sizeof(vm_paddr_t), 859 nkpt*sizeof(vm_paddr_t)); 860 861 pmap_qremove(newPTD, 4); 862 kmem_free(kernel_map, newPTD, 4); 863 /* 864 * map actual idle stack to boot_stack 865 */ 866 pmap_kenter(boot_stack, VM_PAGE_TO_PHYS(m[NPGPTD])); 867 868 869 xen_pgdpt_pin(xpmap_ptom(VM_PAGE_TO_PHYS(m[NPGPTD + 1]))); 870 vm_page_lock_queues(); 871 for (i = 0; i < 4; i++) { 872 int pdir = (PTDPTDI + i) / NPDEPG; 873 int curoffset = (PTDPTDI + i) % NPDEPG; 874 875 xen_queue_pt_update((vm_paddr_t) 876 ((ma[pdir] & ~PG_V) + (curoffset*sizeof(vm_paddr_t))), 877 ma[i]); 878 } 879 PT_UPDATES_FLUSH(); 880 vm_page_unlock_queues(); 881 882 memset(&ctxt, 0, sizeof(ctxt)); 883 ctxt.flags = VGCF_IN_KERNEL; 884 ctxt.user_regs.ds = GSEL(GDATA_SEL, SEL_KPL); 885 ctxt.user_regs.es = GSEL(GDATA_SEL, SEL_KPL); 886 ctxt.user_regs.fs = GSEL(GPRIV_SEL, SEL_KPL); 887 ctxt.user_regs.gs = GSEL(GDATA_SEL, SEL_KPL); 888 ctxt.user_regs.cs = GSEL(GCODE_SEL, SEL_KPL); 889 ctxt.user_regs.ss = GSEL(GDATA_SEL, SEL_KPL); 890 ctxt.user_regs.eip = (unsigned long)init_secondary; 891 ctxt.user_regs.eflags = PSL_KERNEL | 0x1000; /* IOPL_RING1 */ 892 893 memset(&ctxt.fpu_ctxt, 0, sizeof(ctxt.fpu_ctxt)); 894 895 smp_trap_init(ctxt.trap_ctxt); 896 897 ctxt.ldt_ents = 0; 898 ctxt.gdt_frames[0] = (uint32_t)((uint64_t)vtomach(bootAPgdt) >> PAGE_SHIFT); 899 ctxt.gdt_ents = 512; 900 901#ifdef __i386__ 902 ctxt.user_regs.esp = boot_stack + PAGE_SIZE; 903 904 ctxt.kernel_ss = GSEL(GDATA_SEL, SEL_KPL); 905 ctxt.kernel_sp = boot_stack + PAGE_SIZE; 906 907 ctxt.event_callback_cs = GSEL(GCODE_SEL, SEL_KPL); 908 ctxt.event_callback_eip = (unsigned long)Xhypervisor_callback; 909 ctxt.failsafe_callback_cs = GSEL(GCODE_SEL, SEL_KPL); 910 ctxt.failsafe_callback_eip = (unsigned long)failsafe_callback; 911 912 ctxt.ctrlreg[3] = xpmap_ptom(VM_PAGE_TO_PHYS(m[NPGPTD + 1])); 913#else /* __x86_64__ */ 914 ctxt.user_regs.esp = idle->thread.rsp0 - sizeof(struct pt_regs); 915 ctxt.kernel_ss = GSEL(GDATA_SEL, SEL_KPL); 916 ctxt.kernel_sp = idle->thread.rsp0; 917 918 ctxt.event_callback_eip = (unsigned long)hypervisor_callback; 919 ctxt.failsafe_callback_eip = (unsigned long)failsafe_callback; 920 ctxt.syscall_callback_eip = (unsigned long)system_call; 921 922 ctxt.ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(init_level4_pgt)); 923 924 ctxt.gs_base_kernel = (unsigned long)(cpu_pda(cpu)); 925#endif 926 927 printf("gdtpfn=%lx pdptpfn=%lx\n", 928 ctxt.gdt_frames[0], 929 ctxt.ctrlreg[3] >> PAGE_SHIFT); 930 931 PANIC_IF(HYPERVISOR_vcpu_op(VCPUOP_initialise, cpu, &ctxt)); 932 DELAY(3000); 933 PANIC_IF(HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL)); 934} 935 936/* 937 * This function starts the AP (application processor) identified 938 * by the APIC ID 'physicalCpu'. It does quite a "song and dance" 939 * to accomplish this. This is necessary because of the nuances 940 * of the different hardware we might encounter. It isn't pretty, 941 * but it seems to work. 942 */ 943 944int cpus; 945static int 946start_ap(int apic_id) 947{ 948 int ms; 949 950 /* used as a watchpoint to signal AP startup */ 951 cpus = mp_naps; 952 953 cpu_initialize_context(apic_id); 954 955 /* Wait up to 5 seconds for it to start. */ 956 for (ms = 0; ms < 5000; ms++) { 957 if (mp_naps > cpus) 958 return 1; /* return SUCCESS */ 959 DELAY(1000); 960 } 961 return 0; /* return FAILURE */ 962} 963 964/* 965 * Flush the TLB on all other CPU's 966 */ 967static void 968smp_tlb_shootdown(u_int vector, vm_offset_t addr1, vm_offset_t addr2) 969{ 970 u_int ncpu; 971 struct _call_data data; 972 973 call_data = &data; 974 975 ncpu = mp_ncpus - 1; /* does not shootdown self */ 976 if (ncpu < 1) 977 return; /* no other cpus */ 978 if (!(read_eflags() & PSL_I)) 979 panic("%s: interrupts disabled", __func__); 980 mtx_lock_spin(&smp_ipi_mtx); 981 call_data->func_id = vector; 982 call_data->arg1 = addr1; 983 call_data->arg2 = addr2; 984 atomic_store_rel_int(&smp_tlb_wait, 0); 985 ipi_all_but_self(vector); 986 while (smp_tlb_wait < ncpu) 987 ia32_pause(); 988 call_data = NULL; 989 mtx_unlock_spin(&smp_ipi_mtx); 990} 991 992static void 993smp_targeted_tlb_shootdown(u_int mask, u_int vector, vm_offset_t addr1, vm_offset_t addr2) 994{ 995 int ncpu, othercpus; 996 struct _call_data data; 997 998 othercpus = mp_ncpus - 1; 999 if (mask == (u_int)-1) { 1000 ncpu = othercpus; 1001 if (ncpu < 1) 1002 return; 1003 } else { 1004 mask &= ~PCPU_GET(cpumask); 1005 if (mask == 0) 1006 return; 1007 ncpu = bitcount32(mask); 1008 if (ncpu > othercpus) { 1009 /* XXX this should be a panic offence */ 1010 printf("SMP: tlb shootdown to %d other cpus (only have %d)\n", 1011 ncpu, othercpus); 1012 ncpu = othercpus; 1013 } 1014 /* XXX should be a panic, implied by mask == 0 above */ 1015 if (ncpu < 1) 1016 return; 1017 } 1018 if (!(read_eflags() & PSL_I)) 1019 panic("%s: interrupts disabled", __func__); 1020 mtx_lock_spin(&smp_ipi_mtx); 1021 call_data = &data; 1022 call_data->func_id = vector; 1023 call_data->arg1 = addr1; 1024 call_data->arg2 = addr2; 1025 atomic_store_rel_int(&smp_tlb_wait, 0); 1026 if (mask == (u_int)-1) 1027 ipi_all_but_self(vector); 1028 else 1029 ipi_selected(mask, vector); 1030 while (smp_tlb_wait < ncpu) 1031 ia32_pause(); 1032 call_data = NULL; 1033 mtx_unlock_spin(&smp_ipi_mtx); 1034} 1035 1036void 1037smp_cache_flush(void) 1038{ 1039 1040 if (smp_started) 1041 smp_tlb_shootdown(IPI_INVLCACHE, 0, 0); 1042} 1043 1044void 1045smp_invltlb(void) 1046{ 1047 1048 if (smp_started) { 1049 smp_tlb_shootdown(IPI_INVLTLB, 0, 0); 1050 } 1051} 1052 1053void 1054smp_invlpg(vm_offset_t addr) 1055{ 1056 1057 if (smp_started) { 1058 smp_tlb_shootdown(IPI_INVLPG, addr, 0); 1059 } 1060} 1061 1062void 1063smp_invlpg_range(vm_offset_t addr1, vm_offset_t addr2) 1064{ 1065 1066 if (smp_started) { 1067 smp_tlb_shootdown(IPI_INVLRNG, addr1, addr2); 1068 } 1069} 1070 1071void 1072smp_masked_invltlb(u_int mask) 1073{ 1074 1075 if (smp_started) { 1076 smp_targeted_tlb_shootdown(mask, IPI_INVLTLB, 0, 0); 1077 } 1078} 1079 1080void 1081smp_masked_invlpg(u_int mask, vm_offset_t addr) 1082{ 1083 1084 if (smp_started) { 1085 smp_targeted_tlb_shootdown(mask, IPI_INVLPG, addr, 0); 1086 } 1087} 1088 1089void 1090smp_masked_invlpg_range(u_int mask, vm_offset_t addr1, vm_offset_t addr2) 1091{ 1092 1093 if (smp_started) { 1094 smp_targeted_tlb_shootdown(mask, IPI_INVLRNG, addr1, addr2); 1095 } 1096} 1097 1098/* 1099 * send an IPI to a set of cpus. 1100 */ 1101void 1102ipi_selected(uint32_t cpus, u_int ipi) 1103{ 1104 int cpu; 1105 u_int bitmap = 0; 1106 u_int old_pending; 1107 u_int new_pending; 1108 1109 if (IPI_IS_BITMAPED(ipi)) { 1110 bitmap = 1 << ipi; 1111 ipi = IPI_BITMAP_VECTOR; 1112 } 1113 1114#ifdef STOP_NMI 1115 if (ipi == IPI_STOP && stop_cpus_with_nmi) { 1116 ipi_nmi_selected(cpus); 1117 return; 1118 } 1119#endif 1120 CTR3(KTR_SMP, "%s: cpus: %x ipi: %x", __func__, cpus, ipi); 1121 while ((cpu = ffs(cpus)) != 0) { 1122 cpu--; 1123 cpus &= ~(1 << cpu); 1124 1125 KASSERT(cpu_apic_ids[cpu] != -1, 1126 ("IPI to non-existent CPU %d", cpu)); 1127 1128 if (bitmap) { 1129 do { 1130 old_pending = cpu_ipi_pending[cpu]; 1131 new_pending = old_pending | bitmap; 1132 } while (!atomic_cmpset_int(&cpu_ipi_pending[cpu],old_pending, new_pending)); 1133 1134 if (!old_pending) 1135 ipi_pcpu(cpu, RESCHEDULE_VECTOR); 1136 continue; 1137 1138 } 1139 1140 KASSERT(call_data != NULL, ("call_data not set")); 1141 ipi_pcpu(cpu, CALL_FUNCTION_VECTOR); 1142 } 1143} 1144 1145/* 1146 * send an IPI to all CPUs EXCEPT myself 1147 */ 1148void 1149ipi_all_but_self(u_int ipi) 1150{ 1151 1152 if (IPI_IS_BITMAPED(ipi) || (ipi == IPI_STOP && stop_cpus_with_nmi)) { 1153 ipi_selected(PCPU_GET(other_cpus), ipi); 1154 return; 1155 } 1156 CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi); 1157 ipi_selected(PCPU_GET(other_cpus), ipi); 1158} 1159 1160#ifdef STOP_NMI 1161/* 1162 * send NMI IPI to selected CPUs 1163 */ 1164 1165#define BEFORE_SPIN 1000000 1166 1167void 1168ipi_nmi_selected(u_int32_t cpus) 1169{ 1170 int cpu; 1171 register_t icrlo; 1172 1173 icrlo = APIC_DELMODE_NMI | APIC_DESTMODE_PHY | APIC_LEVEL_ASSERT 1174 | APIC_TRIGMOD_EDGE; 1175 1176 CTR2(KTR_SMP, "%s: cpus: %x nmi", __func__, cpus); 1177 1178 atomic_set_int(&ipi_nmi_pending, cpus); 1179 1180 while ((cpu = ffs(cpus)) != 0) { 1181 cpu--; 1182 cpus &= ~(1 << cpu); 1183 1184 KASSERT(cpu_apic_ids[cpu] != -1, 1185 ("IPI NMI to non-existent CPU %d", cpu)); 1186 1187 /* Wait for an earlier IPI to finish. */ 1188 if (!lapic_ipi_wait(BEFORE_SPIN)) 1189 panic("ipi_nmi_selected: previous IPI has not cleared"); 1190 1191 lapic_ipi_raw(icrlo, cpu_apic_ids[cpu]); 1192 } 1193} 1194 1195int 1196ipi_nmi_handler(void) 1197{ 1198 int cpumask = PCPU_GET(cpumask); 1199 1200 if (!(ipi_nmi_pending & cpumask)) 1201 return 1; 1202 1203 atomic_clear_int(&ipi_nmi_pending, cpumask); 1204 cpustop_handler(); 1205 return 0; 1206} 1207 1208#endif /* STOP_NMI */ 1209 1210/* 1211 * Handle an IPI_STOP by saving our current context and spinning until we 1212 * are resumed. 1213 */ 1214void 1215cpustop_handler(void) 1216{ 1217 int cpu = PCPU_GET(cpuid); 1218 int cpumask = PCPU_GET(cpumask); 1219 1220 savectx(&stoppcbs[cpu]); 1221 1222 /* Indicate that we are stopped */ 1223 atomic_set_int(&stopped_cpus, cpumask); 1224 1225 /* Wait for restart */ 1226 while (!(started_cpus & cpumask)) 1227 ia32_pause(); 1228 1229 atomic_clear_int(&started_cpus, cpumask); 1230 atomic_clear_int(&stopped_cpus, cpumask); 1231 1232 if (cpu == 0 && cpustop_restartfunc != NULL) { 1233 cpustop_restartfunc(); 1234 cpustop_restartfunc = NULL; 1235 } 1236} 1237 1238/* 1239 * This is called once the rest of the system is up and running and we're 1240 * ready to let the AP's out of the pen. 1241 */ 1242static void 1243release_aps(void *dummy __unused) 1244{ 1245 1246 if (mp_ncpus == 1) 1247 return; 1248 atomic_store_rel_int(&aps_ready, 1); 1249 while (smp_started == 0) 1250 ia32_pause(); 1251} 1252SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, release_aps, NULL); 1253SYSINIT(start_ipis, SI_SUB_INTR, SI_ORDER_ANY, xen_smp_intr_init_cpus, NULL); 1254 1255