/* mp_machdep.c, revision 193094 */
1/*- 2 * Copyright (c) 1996, by Steve Passe 3 * Copyright (c) 2008, by Kip Macy 4 * All rights reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 2. The name of the developer may NOT be used to endorse or promote products 12 * derived from this software without specific prior written permission. 13 * 14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 24 * SUCH DAMAGE. 25 */ 26 27#include <sys/cdefs.h> 28__FBSDID("$FreeBSD: head/sys/i386/xen/mp_machdep.c 193094 2009-05-30 14:59:08Z adrian $"); 29 30#include "opt_apic.h" 31#include "opt_cpu.h" 32#include "opt_kstack_pages.h" 33#include "opt_mp_watchdog.h" 34#include "opt_sched.h" 35#include "opt_smp.h" 36 37#if !defined(lint) 38#if !defined(SMP) 39#error How did you get here? 40#endif 41 42#ifndef DEV_APIC 43#error The apic device is required for SMP, add "device apic" to your config file. 
44#endif 45#if defined(CPU_DISABLE_CMPXCHG) && !defined(COMPILING_LINT) 46#error SMP not supported with CPU_DISABLE_CMPXCHG 47#endif 48#endif /* not lint */ 49 50#include <sys/param.h> 51#include <sys/systm.h> 52#include <sys/bus.h> 53#include <sys/cons.h> /* cngetc() */ 54#ifdef GPROF 55#include <sys/gmon.h> 56#endif 57#include <sys/kernel.h> 58#include <sys/ktr.h> 59#include <sys/lock.h> 60#include <sys/malloc.h> 61#include <sys/memrange.h> 62#include <sys/mutex.h> 63#include <sys/pcpu.h> 64#include <sys/proc.h> 65#include <sys/sched.h> 66#include <sys/smp.h> 67#include <sys/sysctl.h> 68 69#include <vm/vm.h> 70#include <vm/vm_param.h> 71#include <vm/pmap.h> 72#include <vm/vm_kern.h> 73#include <vm/vm_extern.h> 74#include <vm/vm_page.h> 75 76#include <machine/apicreg.h> 77#include <machine/md_var.h> 78#include <machine/mp_watchdog.h> 79#include <machine/pcb.h> 80#include <machine/psl.h> 81#include <machine/smp.h> 82#include <machine/specialreg.h> 83#include <machine/pcpu.h> 84 85 86 87#include <machine/xen/xen-os.h> 88#include <xen/evtchn.h> 89#include <xen/xen_intr.h> 90#include <xen/hypervisor.h> 91#include <xen/interface/vcpu.h> 92 93#define stop_cpus_with_nmi 0 94 95 96int mp_naps; /* # of Applications processors */ 97int boot_cpu_id = -1; /* designated BSP */ 98 99extern struct pcpu __pcpu[]; 100 101static int bootAP; 102static union descriptor *bootAPgdt; 103 104static char resched_name[NR_CPUS][15]; 105static char callfunc_name[NR_CPUS][15]; 106 107/* Free these after use */ 108void *bootstacks[MAXCPU]; 109 110/* Hotwire a 0->4MB V==P mapping */ 111extern pt_entry_t *KPTphys; 112 113struct pcb stoppcbs[MAXCPU]; 114 115/* Variables needed for SMP tlb shootdown. 
*/
vm_offset_t smp_tlb_addr1;	/* start address for a ranged shootdown */
vm_offset_t smp_tlb_addr2;	/* end address for a ranged shootdown */
volatile int smp_tlb_wait;	/* # of CPUs that acknowledged the shootdown */

/* Signature of the per-IPI handlers dispatched through ipi_vectors[]. */
typedef void call_data_func_t(uintptr_t , uintptr_t);

static u_int logical_cpus;

/* used to hold the AP's until we are ready to release them */
static struct mtx ap_boot_mtx;

/* Set to 1 once we're ready to let the APs out of the pen. */
static volatile int aps_ready = 0;

/*
 * Store data from cpu_add() until later in the boot when we actually setup
 * the APs.
 */
struct cpu_info {
	int	cpu_present:1;
	int	cpu_bsp:1;
	int	cpu_disabled:1;
} static cpu_info[MAX_APIC_ID + 1];
int cpu_apic_ids[MAXCPU];		/* logical CPU id -> local APIC id */
int apic_cpuids[MAX_APIC_ID + 1];	/* local APIC id -> logical CPU id */

/* Holds pending bitmap based IPIs per CPU */
static volatile u_int cpu_ipi_pending[MAXCPU];

static int cpu_logical;			/* logical (HTT) CPUs per core */
static int cpu_cores;			/* cores per package */

static void	assign_cpu_ids(void);
static void	set_interrupt_apic_ids(void);
int	start_all_aps(void);
static int	start_ap(int apic_id);
static void	release_aps(void *dummy);

static u_int	hyperthreading_cpus;
static cpumask_t	hyperthreading_cpus_mask;

extern void Xhypervisor_callback(void);
extern void failsafe_callback(void);
extern void pmap_lazyfix_action(void);

/*
 * Build the CPU topology tree handed to the scheduler.  Falls back to a
 * flat topology when the core/thread counts do not evenly divide
 * mp_ncpus.
 */
struct cpu_group *
cpu_topo(void)
{
	if (cpu_cores == 0)
		cpu_cores = 1;
	if (cpu_logical == 0)
		cpu_logical = 1;
	if (mp_ncpus % (cpu_cores * cpu_logical) != 0) {
		printf("WARNING: Non-uniform processors.\n");
		printf("WARNING: Using suboptimal topology.\n");
		return (smp_topo_none());
	}
	/*
	 * No multi-core or hyper-threaded.
	 */
	if (cpu_logical * cpu_cores == 1)
		return (smp_topo_none());
	/*
	 * Only HTT no multi-core.
	 */
	if (cpu_logical > 1 && cpu_cores == 1)
		return (smp_topo_1level(CG_SHARE_L1, cpu_logical, CG_FLAG_HTT));
	/*
	 * Only multi-core no HTT.
	 */
	if (cpu_cores > 1 && cpu_logical == 1)
		return (smp_topo_1level(CG_SHARE_NONE, cpu_cores, 0));
	/*
	 * Both HTT and multi-core.
	 */
	return (smp_topo_2level(CG_SHARE_NONE, cpu_cores,
	    CG_SHARE_L1, cpu_logical, CG_FLAG_HTT));
}

/*
 * Calculate usable address in base memory for AP trampoline code.
 * Under Xen no real-mode trampoline is copied into low memory, so
 * base memory is returned unchanged.
 */
u_int
mp_bootaddress(u_int basemem)
{

	return (basemem);
}

/*
 * Record one CPU discovered during enumeration.  Marks it present,
 * notes whether it claims to be the BSP, and bumps mp_ncpus (capped
 * at MAXCPU).
 */
void
cpu_add(u_int apic_id, char boot_cpu)
{

	if (apic_id > MAX_APIC_ID) {
		panic("SMP: APIC ID %d too high", apic_id);
		return;		/* NOTREACHED: panic() does not return */
	}
	KASSERT(cpu_info[apic_id].cpu_present == 0, ("CPU %d added twice",
	    apic_id));
	cpu_info[apic_id].cpu_present = 1;
	if (boot_cpu) {
		KASSERT(boot_cpu_id == -1,
		    ("CPU %d claims to be BSP, but CPU %d already is", apic_id,
		    boot_cpu_id));
		boot_cpu_id = apic_id;
		cpu_info[apic_id].cpu_bsp = 1;
	}
	if (mp_ncpus < MAXCPU)
		mp_ncpus++;
	if (bootverbose)
		printf("SMP: Added CPU %d (%s)\n", apic_id, boot_cpu ? "BSP" :
		    "AP");
}

/* Upper bound on CPU ids; fixed at MAXCPU - 1 regardless of probe results. */
void
cpu_mp_setmaxid(void)
{

	mp_maxid = MAXCPU - 1;
}

/*
 * Report whether this is an SMP machine: returns 1 when two or more
 * CPUs were enumerated, 0 for a UP configuration.
 */
int
cpu_mp_probe(void)
{

	/*
	 * Always record BSP in CPU map so that the mbuf init code works
	 * correctly.
	 */
	all_cpus = 1;
	if (mp_ncpus == 0) {
		/*
		 * No CPUs were found, so this must be a UP system.  Setup
		 * the variables to represent a system with a single CPU
		 * with an id of 0.
		 */
		mp_ncpus = 1;
		return (0);
	}

	/* At least one CPU was found. */
	if (mp_ncpus == 1) {
		/*
		 * One CPU was found, so this must be a UP system with
		 * an I/O APIC.
		 */
		return (0);
	}

	/* At least two CPUs were found. */
	return (1);
}

/*
 * Initialize the IPI handlers and start up the AP's.
 */
void
cpu_mp_start(void)
{
	int i;

	/* Initialize the logical ID to APIC ID table.
*/ 278 for (i = 0; i < MAXCPU; i++) { 279 cpu_apic_ids[i] = -1; 280 cpu_ipi_pending[i] = 0; 281 } 282 283 /* Set boot_cpu_id if needed. */ 284 if (boot_cpu_id == -1) { 285 boot_cpu_id = PCPU_GET(apic_id); 286 cpu_info[boot_cpu_id].cpu_bsp = 1; 287 } else 288 KASSERT(boot_cpu_id == PCPU_GET(apic_id), 289 ("BSP's APIC ID doesn't match boot_cpu_id")); 290 cpu_apic_ids[0] = boot_cpu_id; 291 apic_cpuids[boot_cpu_id] = 0; 292 293 assign_cpu_ids(); 294 295 /* Start each Application Processor */ 296 start_all_aps(); 297 298 /* Setup the initial logical CPUs info. */ 299 logical_cpus = logical_cpus_mask = 0; 300 if (cpu_feature & CPUID_HTT) 301 logical_cpus = (cpu_procinfo & CPUID_HTT_CORES) >> 16; 302 303 set_interrupt_apic_ids(); 304} 305 306 307static void 308iv_rendezvous(uintptr_t a, uintptr_t b) 309{ 310 smp_rendezvous_action(); 311} 312 313static void 314iv_invltlb(uintptr_t a, uintptr_t b) 315{ 316 xen_tlb_flush(); 317} 318 319static void 320iv_invlpg(uintptr_t a, uintptr_t b) 321{ 322 xen_invlpg(a); 323} 324 325static void 326iv_invlrng(uintptr_t a, uintptr_t b) 327{ 328 vm_offset_t start = (vm_offset_t)a; 329 vm_offset_t end = (vm_offset_t)b; 330 331 while (start < end) { 332 xen_invlpg(start); 333 start += PAGE_SIZE; 334 } 335} 336 337 338static void 339iv_invlcache(uintptr_t a, uintptr_t b) 340{ 341 342 wbinvd(); 343 atomic_add_int(&smp_tlb_wait, 1); 344} 345 346static void 347iv_lazypmap(uintptr_t a, uintptr_t b) 348{ 349 pmap_lazyfix_action(); 350 atomic_add_int(&smp_tlb_wait, 1); 351} 352 353 354static void 355iv_noop(uintptr_t a, uintptr_t b) 356{ 357 atomic_add_int(&smp_tlb_wait, 1); 358} 359 360static call_data_func_t *ipi_vectors[IPI_BITMAP_VECTOR] = 361{ 362 iv_noop, 363 iv_noop, 364 iv_rendezvous, 365 iv_invltlb, 366 iv_invlpg, 367 iv_invlrng, 368 iv_invlcache, 369 iv_lazypmap, 370}; 371 372/* 373 * Reschedule call back. Nothing to do, 374 * all the work is done automatically when 375 * we return from the interrupt. 
376 */ 377static int 378smp_reschedule_interrupt(void *unused) 379{ 380 int cpu = PCPU_GET(cpuid); 381 u_int ipi_bitmap; 382 383 ipi_bitmap = atomic_readandclear_int(&cpu_ipi_pending[cpu]); 384 385 if (ipi_bitmap & (1 << IPI_PREEMPT)) { 386#ifdef COUNT_IPIS 387 (*ipi_preempt_counts[cpu])++; 388#endif 389 sched_preempt(curthread); 390 } 391 392 if (ipi_bitmap & (1 << IPI_AST)) { 393#ifdef COUNT_IPIS 394 (*ipi_ast_counts[cpu])++; 395#endif 396 /* Nothing to do for AST */ 397 } 398 return (FILTER_HANDLED); 399} 400 401struct _call_data { 402 uint16_t func_id; 403 uint16_t wait; 404 uintptr_t arg1; 405 uintptr_t arg2; 406 atomic_t started; 407 atomic_t finished; 408}; 409 410static struct _call_data *call_data; 411 412static int 413smp_call_function_interrupt(void *unused) 414{ 415 call_data_func_t *func; 416 uintptr_t arg1 = call_data->arg1; 417 uintptr_t arg2 = call_data->arg2; 418 int wait = call_data->wait; 419 atomic_t *started = &call_data->started; 420 atomic_t *finished = &call_data->finished; 421 422 if (call_data->func_id > IPI_BITMAP_VECTOR) 423 panic("invalid function id %u", call_data->func_id); 424 425 func = ipi_vectors[call_data->func_id]; 426 /* 427 * Notify initiating CPU that I've grabbed the data and am 428 * about to execute the function 429 */ 430 mb(); 431 atomic_inc(started); 432 /* 433 * At this point the info structure may be out of scope unless wait==1 434 */ 435 (*func)(arg1, arg2); 436 437 if (wait) { 438 mb(); 439 atomic_inc(finished); 440 } 441 atomic_add_int(&smp_tlb_wait, 1); 442 return (FILTER_HANDLED); 443} 444 445/* 446 * Print various information about the SMP system hardware and setup. 
447 */ 448void 449cpu_mp_announce(void) 450{ 451 int i, x; 452 453 /* List CPUs */ 454 printf(" cpu0 (BSP): APIC ID: %2d\n", boot_cpu_id); 455 for (i = 1, x = 0; x <= MAX_APIC_ID; x++) { 456 if (!cpu_info[x].cpu_present || cpu_info[x].cpu_bsp) 457 continue; 458 if (cpu_info[x].cpu_disabled) 459 printf(" cpu (AP): APIC ID: %2d (disabled)\n", x); 460 else { 461 KASSERT(i < mp_ncpus, 462 ("mp_ncpus and actual cpus are out of whack")); 463 printf(" cpu%d (AP): APIC ID: %2d\n", i++, x); 464 } 465 } 466} 467 468static int 469xen_smp_intr_init(unsigned int cpu) 470{ 471 int rc; 472 unsigned int irq; 473 474 per_cpu(resched_irq, cpu) = per_cpu(callfunc_irq, cpu) = -1; 475 476 sprintf(resched_name[cpu], "resched%u", cpu); 477 rc = bind_ipi_to_irqhandler(RESCHEDULE_VECTOR, 478 cpu, 479 resched_name[cpu], 480 smp_reschedule_interrupt, 481 INTR_FAST|INTR_TYPE_TTY|INTR_MPSAFE, &irq); 482 483 printf("[XEN] IPI cpu=%d irq=%d vector=RESCHEDULE_VECTOR (%d)\n", 484 cpu, irq, RESCHEDULE_VECTOR); 485 486 per_cpu(resched_irq, cpu) = irq; 487 488 sprintf(callfunc_name[cpu], "callfunc%u", cpu); 489 rc = bind_ipi_to_irqhandler(CALL_FUNCTION_VECTOR, 490 cpu, 491 callfunc_name[cpu], 492 smp_call_function_interrupt, 493 INTR_FAST|INTR_TYPE_TTY|INTR_MPSAFE, &irq); 494 if (rc < 0) 495 goto fail; 496 per_cpu(callfunc_irq, cpu) = irq; 497 498 printf("[XEN] IPI cpu=%d irq=%d vector=CALL_FUNCTION_VECTOR (%d)\n", 499 cpu, irq, CALL_FUNCTION_VECTOR); 500 501 502 if ((cpu != 0) && ((rc = ap_cpu_initclocks(cpu)) != 0)) 503 goto fail; 504 505 return 0; 506 507 fail: 508 if (per_cpu(resched_irq, cpu) >= 0) 509 unbind_from_irqhandler(per_cpu(resched_irq, cpu)); 510 if (per_cpu(callfunc_irq, cpu) >= 0) 511 unbind_from_irqhandler(per_cpu(callfunc_irq, cpu)); 512 return rc; 513} 514 515static void 516xen_smp_intr_init_cpus(void *unused) 517{ 518 int i; 519 520 for (i = 0; i < mp_ncpus; i++) 521 xen_smp_intr_init(i); 522} 523 524#define MTOPSIZE (1<<(14 + PAGE_SHIFT)) 525 526/* 527 * AP CPU's call this to 
initialize themselves.
 */
void
init_secondary(void)
{
	vm_offset_t addr;
	int	gsel_tss;

	/* bootAP is set in start_ap() to our ID. */
	PCPU_SET(currentldt, _default_ldt);
	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
#if 0
	gdt[bootAP * NGDT + GPROC0_SEL].sd.sd_type = SDT_SYS386TSS;
#endif
	PCPU_SET(common_tss.tss_esp0, 0); /* not used until after switch */
	PCPU_SET(common_tss.tss_ss0, GSEL(GDATA_SEL, SEL_KPL));
	PCPU_SET(common_tss.tss_ioopt, (sizeof (struct i386tss)) << 16);
#if 0
	PCPU_SET(tss_gdt, &gdt[bootAP * NGDT + GPROC0_SEL].sd);

	PCPU_SET(common_tssd, *PCPU_GET(tss_gdt));
#endif
	PCPU_SET(fsgs_gdt, &gdt[GUFS_SEL].sd);

	/*
	 * Set to a known state:
	 *	Set by mpboot.s: CR0_PG, CR0_PE
	 *	Set by cpu_setregs: CR0_NE, CR0_MP, CR0_TS, CR0_WP, CR0_AM
	 */
	/*
	 * signal our startup to the BSP.
	 */
	mp_naps++;

	/* Spin until the BSP releases the AP's. */
	while (!aps_ready)
		ia32_pause();

	/* BSP may have changed PTD while we were waiting */
	invltlb();
	for (addr = 0; addr < NKPT * NBPDR - 1; addr += PAGE_SIZE)
		invlpg(addr);

	/* set up FPU state on the AP */
	npxinit();
#if 0
	/* set up SSE registers */
	enable_sse();
#endif
#if 0 && defined(PAE)
	/* Enable the PTE no-execute bit. */
	if ((amd_feature & AMDID_NX) != 0) {
		uint64_t msr;

		msr = rdmsr(MSR_EFER) | EFER_NXE;
		wrmsr(MSR_EFER, msr);
	}
#endif
#if 0
	/* A quick check from sanity claus */
	if (PCPU_GET(apic_id) != lapic_id()) {
		printf("SMP: cpuid = %d\n", PCPU_GET(cpuid));
		printf("SMP: actual apic_id = %d\n", lapic_id());
		printf("SMP: correct apic_id = %d\n", PCPU_GET(apic_id));
		panic("cpuid mismatch! boom!!");
	}
#endif

	/* Initialize curthread. */
	KASSERT(PCPU_GET(idlethread) != NULL, ("no idle thread"));
	PCPU_SET(curthread, PCPU_GET(idlethread));

	mtx_lock_spin(&ap_boot_mtx);
#if 0
	/* Init local apic for irq's */
	lapic_setup(1);
#endif
	smp_cpus++;

	CTR1(KTR_SMP, "SMP: AP CPU #%d Launched", PCPU_GET(cpuid));
	printf("SMP: AP CPU #%d Launched!\n", PCPU_GET(cpuid));

	/* Determine if we are a logical CPU. */
	if (logical_cpus > 1 && PCPU_GET(apic_id) % logical_cpus != 0)
		logical_cpus_mask |= PCPU_GET(cpumask);

	/* Determine if we are a hyperthread. */
	if (hyperthreading_cpus > 1 &&
	    PCPU_GET(apic_id) % hyperthreading_cpus != 0)
		hyperthreading_cpus_mask |= PCPU_GET(cpumask);

	/* Build our map of 'other' CPUs. */
	PCPU_SET(other_cpus, all_cpus & ~PCPU_GET(cpumask));
#if 0
	if (bootverbose)
		lapic_dump("AP");
#endif
	/* The last AP to arrive releases everyone by setting smp_started. */
	if (smp_cpus == mp_ncpus) {
		/* enable IPI's, tlb shootdown, freezes etc */
		atomic_store_rel_int(&smp_started, 1);
		smp_active = 1;	 /* historic */
	}

	mtx_unlock_spin(&ap_boot_mtx);

	/* wait until all the AP's are up */
	while (smp_started == 0)
		ia32_pause();

	PCPU_SET(curthread, PCPU_GET(idlethread));
	/* enter the scheduler */
	sched_throw(NULL);

	panic("scheduler returned us to %s", __func__);
	/* NOTREACHED */
}

/*******************************************************************
 * local functions and data
 */

/*
 * We tell the I/O APIC code about all the CPUs we want to receive
 * interrupts.  If we don't want certain CPUs to receive IRQs we
 * can simply not tell the I/O APIC code about them in this function.
 * We also do not tell it about the BSP since it tells itself about
 * the BSP internally to work with UP kernels and on UP machines.
658 */ 659static void 660set_interrupt_apic_ids(void) 661{ 662 u_int i, apic_id; 663 664 for (i = 0; i < MAXCPU; i++) { 665 apic_id = cpu_apic_ids[i]; 666 if (apic_id == -1) 667 continue; 668 if (cpu_info[apic_id].cpu_bsp) 669 continue; 670 if (cpu_info[apic_id].cpu_disabled) 671 continue; 672 673 /* Don't let hyperthreads service interrupts. */ 674 if (hyperthreading_cpus > 1 && 675 apic_id % hyperthreading_cpus != 0) 676 continue; 677 678 intr_add_cpu(i); 679 } 680} 681 682/* 683 * Assign logical CPU IDs to local APICs. 684 */ 685static void 686assign_cpu_ids(void) 687{ 688 u_int i; 689 690 /* Check for explicitly disabled CPUs. */ 691 for (i = 0; i <= MAX_APIC_ID; i++) { 692 if (!cpu_info[i].cpu_present || cpu_info[i].cpu_bsp) 693 continue; 694 695 /* Don't use this CPU if it has been disabled by a tunable. */ 696 if (resource_disabled("lapic", i)) { 697 cpu_info[i].cpu_disabled = 1; 698 continue; 699 } 700 } 701 702 /* 703 * Assign CPU IDs to local APIC IDs and disable any CPUs 704 * beyond MAXCPU. CPU 0 has already been assigned to the BSP, 705 * so we only have to assign IDs for APs. 
706 */ 707 mp_ncpus = 1; 708 for (i = 0; i <= MAX_APIC_ID; i++) { 709 if (!cpu_info[i].cpu_present || cpu_info[i].cpu_bsp || 710 cpu_info[i].cpu_disabled) 711 continue; 712 713 if (mp_ncpus < MAXCPU) { 714 cpu_apic_ids[mp_ncpus] = i; 715 apic_cpuids[i] = mp_ncpus; 716 mp_ncpus++; 717 } else 718 cpu_info[i].cpu_disabled = 1; 719 } 720 KASSERT(mp_maxid >= mp_ncpus - 1, 721 ("%s: counters out of sync: max %d, count %d", __func__, mp_maxid, 722 mp_ncpus)); 723} 724 725/* 726 * start each AP in our list 727 */ 728/* Lowest 1MB is already mapped: don't touch*/ 729#define TMPMAP_START 1 730int 731start_all_aps(void) 732{ 733 int x,apic_id, cpu; 734 struct pcpu *pc; 735 736 mtx_init(&ap_boot_mtx, "ap boot", NULL, MTX_SPIN); 737 738 /* set up temporary P==V mapping for AP boot */ 739 /* XXX this is a hack, we should boot the AP on its own stack/PTD */ 740 741 /* start each AP */ 742 for (cpu = 1; cpu < mp_ncpus; cpu++) { 743 apic_id = cpu_apic_ids[cpu]; 744 745 746 bootAP = cpu; 747 bootAPgdt = gdt + (512*cpu); 748 749 /* Get per-cpu data */ 750 pc = &__pcpu[bootAP]; 751 pcpu_init(pc, bootAP, sizeof(struct pcpu)); 752 pc->pc_apic_id = cpu_apic_ids[bootAP]; 753 pc->pc_prvspace = pc; 754 pc->pc_curthread = 0; 755 756 gdt_segs[GPRIV_SEL].ssd_base = (int) pc; 757 gdt_segs[GPROC0_SEL].ssd_base = (int) &pc->pc_common_tss; 758 759 PT_SET_MA(bootAPgdt, xpmap_ptom(VTOP(bootAPgdt)) | PG_V | PG_RW); 760 bzero(bootAPgdt, PAGE_SIZE); 761 for (x = 0; x < NGDT; x++) 762 ssdtosd(&gdt_segs[x], &bootAPgdt[x].sd); 763 PT_SET_MA(bootAPgdt, vtomach(bootAPgdt) | PG_V); 764#ifdef notyet 765 766 if (HYPERVISOR_vcpu_op(VCPUOP_get_physid, cpu, &cpu_id) == 0) { 767 apicid = xen_vcpu_physid_to_x86_apicid(cpu_id.phys_id); 768 acpiid = xen_vcpu_physid_to_x86_acpiid(cpu_id.phys_id); 769#ifdef CONFIG_ACPI 770 if (acpiid != 0xff) 771 x86_acpiid_to_apicid[acpiid] = apicid; 772#endif 773 } 774#endif 775 776 /* attempt to start the Application Processor */ 777 if (!start_ap(cpu)) { 778 printf("AP #%d (PHY# 
%d) failed!\n", cpu, apic_id); 779 /* better panic as the AP may be running loose */ 780 printf("panic y/n? [y] "); 781 if (cngetc() != 'n') 782 panic("bye-bye"); 783 } 784 785 all_cpus |= (1 << cpu); /* record AP in CPU map */ 786 } 787 788 789 /* build our map of 'other' CPUs */ 790 PCPU_SET(other_cpus, all_cpus & ~PCPU_GET(cpumask)); 791 792 pmap_invalidate_range(kernel_pmap, 0, NKPT * NBPDR - 1); 793 794 /* number of APs actually started */ 795 return mp_naps; 796} 797 798extern uint8_t *pcpu_boot_stack; 799extern trap_info_t trap_table[]; 800 801static void 802smp_trap_init(trap_info_t *trap_ctxt) 803{ 804 const trap_info_t *t = trap_table; 805 806 for (t = trap_table; t->address; t++) { 807 trap_ctxt[t->vector].flags = t->flags; 808 trap_ctxt[t->vector].cs = t->cs; 809 trap_ctxt[t->vector].address = t->address; 810 } 811} 812 813extern int nkpt; 814static void 815cpu_initialize_context(unsigned int cpu) 816{ 817 /* vcpu_guest_context_t is too large to allocate on the stack. 818 * Hence we allocate statically and protect it with a lock */ 819 vm_page_t m[4]; 820 static vcpu_guest_context_t ctxt; 821 vm_offset_t boot_stack; 822 vm_offset_t newPTD; 823 vm_paddr_t ma[NPGPTD]; 824 static int color; 825 int i; 826 827 /* 828 * Page 0,[0-3] PTD 829 * Page 1, [4] boot stack 830 * Page [5] PDPT 831 * 832 */ 833 for (i = 0; i < NPGPTD + 2; i++) { 834 m[i] = vm_page_alloc(NULL, color++, 835 VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | 836 VM_ALLOC_ZERO); 837 838 pmap_zero_page(m[i]); 839 840 } 841 boot_stack = kmem_alloc_nofault(kernel_map, 1); 842 newPTD = kmem_alloc_nofault(kernel_map, NPGPTD); 843 ma[0] = xpmap_ptom(VM_PAGE_TO_PHYS(m[0]))|PG_V; 844 845#ifdef PAE 846 pmap_kenter(boot_stack, VM_PAGE_TO_PHYS(m[NPGPTD + 1])); 847 for (i = 0; i < NPGPTD; i++) { 848 ((vm_paddr_t *)boot_stack)[i] = 849 ma[i] = 850 xpmap_ptom(VM_PAGE_TO_PHYS(m[i]))|PG_V; 851 } 852#endif 853 854 /* 855 * Copy cpu0 IdlePTD to new IdlePTD - copying only 856 * kernel mappings 857 */ 858 
pmap_qenter(newPTD, m, 4);
	/*
	 * NOTE(review): 4 pages are mapped/unmapped here while newPTD was
	 * allocated with kmem_alloc_nofault(kernel_map, NPGPTD) (NPGPTD
	 * bytes -> one page of KVA), and kmem_free() below is passed a
	 * size of 4 bytes — the units look inconsistent; confirm.
	 */
	memcpy((uint8_t *)newPTD + KPTDI*sizeof(vm_paddr_t),
	    (uint8_t *)PTOV(IdlePTD) + KPTDI*sizeof(vm_paddr_t),
	    nkpt*sizeof(vm_paddr_t));

	pmap_qremove(newPTD, 4);
	kmem_free(kernel_map, newPTD, 4);
	/*
	 * map actual idle stack to boot_stack
	 */
	pmap_kenter(boot_stack, VM_PAGE_TO_PHYS(m[NPGPTD]));

	/* Pin the new PDPT and point the PTD recursive slots at it. */
	xen_pgdpt_pin(xpmap_ptom(VM_PAGE_TO_PHYS(m[NPGPTD + 1])));
	vm_page_lock_queues();
	for (i = 0; i < 4; i++) {
		int pdir = (PTDPTDI + i) / NPDEPG;
		int curoffset = (PTDPTDI + i) % NPDEPG;

		xen_queue_pt_update((vm_paddr_t)
		    ((ma[pdir] & ~PG_V) + (curoffset*sizeof(vm_paddr_t))),
		    ma[i]);
	}
	PT_UPDATES_FLUSH();
	vm_page_unlock_queues();

	/* Build the initial register state the new vcpu starts with. */
	memset(&ctxt, 0, sizeof(ctxt));
	ctxt.flags = VGCF_IN_KERNEL;
	ctxt.user_regs.ds = GSEL(GDATA_SEL, SEL_KPL);
	ctxt.user_regs.es = GSEL(GDATA_SEL, SEL_KPL);
	ctxt.user_regs.fs = GSEL(GPRIV_SEL, SEL_KPL);
	ctxt.user_regs.gs = GSEL(GDATA_SEL, SEL_KPL);
	ctxt.user_regs.cs = GSEL(GCODE_SEL, SEL_KPL);
	ctxt.user_regs.ss = GSEL(GDATA_SEL, SEL_KPL);
	ctxt.user_regs.eip = (unsigned long)init_secondary;
	ctxt.user_regs.eflags = PSL_KERNEL | 0x1000; /* IOPL_RING1 */

	memset(&ctxt.fpu_ctxt, 0, sizeof(ctxt.fpu_ctxt));

	smp_trap_init(ctxt.trap_ctxt);

	ctxt.ldt_ents = 0;
	ctxt.gdt_frames[0] = (uint32_t)((uint64_t)vtomach(bootAPgdt) >> PAGE_SHIFT);
	ctxt.gdt_ents      = 512;

#ifdef __i386__
	ctxt.user_regs.esp = boot_stack + PAGE_SIZE;

	ctxt.kernel_ss = GSEL(GDATA_SEL, SEL_KPL);
	ctxt.kernel_sp = boot_stack + PAGE_SIZE;

	ctxt.event_callback_cs     = GSEL(GCODE_SEL, SEL_KPL);
	ctxt.event_callback_eip    = (unsigned long)Xhypervisor_callback;
	ctxt.failsafe_callback_cs  = GSEL(GCODE_SEL, SEL_KPL);
	ctxt.failsafe_callback_eip = (unsigned long)failsafe_callback;

	ctxt.ctrlreg[3] = xpmap_ptom(VM_PAGE_TO_PHYS(m[NPGPTD + 1]));
#else /* __x86_64__ */
	ctxt.user_regs.esp = idle->thread.rsp0 - sizeof(struct pt_regs);
	ctxt.kernel_ss = GSEL(GDATA_SEL, SEL_KPL);
	ctxt.kernel_sp = idle->thread.rsp0;

	ctxt.event_callback_eip    = (unsigned long)hypervisor_callback;
	ctxt.failsafe_callback_eip = (unsigned long)failsafe_callback;
	ctxt.syscall_callback_eip  = (unsigned long)system_call;

	ctxt.ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(init_level4_pgt));

	ctxt.gs_base_kernel = (unsigned long)(cpu_pda(cpu));
#endif

	printf("gdtpfn=%lx pdptpfn=%lx\n",
	    ctxt.gdt_frames[0],
	    ctxt.ctrlreg[3] >> PAGE_SHIFT);

	/* Hand the context to Xen and bring the vcpu online. */
	PANIC_IF(HYPERVISOR_vcpu_op(VCPUOP_initialise, cpu, &ctxt));
	DELAY(3000);
	PANIC_IF(HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL));
}

/*
 * This function starts the AP (application processor) identified
 * by the APIC ID 'physicalCpu'.  It does quite a "song and dance"
 * to accomplish this.  This is necessary because of the nuances
 * of the different hardware we might encounter.  It isn't pretty,
 * but it seems to work.
 */

int cpus;
static int
start_ap(int apic_id)
{
	int ms;

	/* used as a watchpoint to signal AP startup */
	cpus = mp_naps;

	cpu_initialize_context(apic_id);

	/* Wait up to 5 seconds for it to start.
*/ 959 for (ms = 0; ms < 5000; ms++) { 960 if (mp_naps > cpus) 961 return 1; /* return SUCCESS */ 962 DELAY(1000); 963 } 964 return 0; /* return FAILURE */ 965} 966 967/* 968 * Flush the TLB on all other CPU's 969 */ 970static void 971smp_tlb_shootdown(u_int vector, vm_offset_t addr1, vm_offset_t addr2) 972{ 973 u_int ncpu; 974 struct _call_data data; 975 976 call_data = &data; 977 978 ncpu = mp_ncpus - 1; /* does not shootdown self */ 979 if (ncpu < 1) 980 return; /* no other cpus */ 981 if (!(read_eflags() & PSL_I)) 982 panic("%s: interrupts disabled", __func__); 983 mtx_lock_spin(&smp_ipi_mtx); 984 call_data->func_id = vector; 985 call_data->arg1 = addr1; 986 call_data->arg2 = addr2; 987 atomic_store_rel_int(&smp_tlb_wait, 0); 988 ipi_all_but_self(vector); 989 while (smp_tlb_wait < ncpu) 990 ia32_pause(); 991 call_data = NULL; 992 mtx_unlock_spin(&smp_ipi_mtx); 993} 994 995static void 996smp_targeted_tlb_shootdown(cpumask_t mask, u_int vector, vm_offset_t addr1, vm_offset_t addr2) 997{ 998 int ncpu, othercpus; 999 struct _call_data data; 1000 1001 othercpus = mp_ncpus - 1; 1002 if (mask == (u_int)-1) { 1003 ncpu = othercpus; 1004 if (ncpu < 1) 1005 return; 1006 } else { 1007 mask &= ~PCPU_GET(cpumask); 1008 if (mask == 0) 1009 return; 1010 ncpu = bitcount32(mask); 1011 if (ncpu > othercpus) { 1012 /* XXX this should be a panic offence */ 1013 printf("SMP: tlb shootdown to %d other cpus (only have %d)\n", 1014 ncpu, othercpus); 1015 ncpu = othercpus; 1016 } 1017 /* XXX should be a panic, implied by mask == 0 above */ 1018 if (ncpu < 1) 1019 return; 1020 } 1021 if (!(read_eflags() & PSL_I)) 1022 panic("%s: interrupts disabled", __func__); 1023 mtx_lock_spin(&smp_ipi_mtx); 1024 call_data = &data; 1025 call_data->func_id = vector; 1026 call_data->arg1 = addr1; 1027 call_data->arg2 = addr2; 1028 atomic_store_rel_int(&smp_tlb_wait, 0); 1029 if (mask == (u_int)-1) 1030 ipi_all_but_self(vector); 1031 else 1032 ipi_selected(mask, vector); 1033 while (smp_tlb_wait < ncpu) 
1034 ia32_pause(); 1035 call_data = NULL; 1036 mtx_unlock_spin(&smp_ipi_mtx); 1037} 1038 1039void 1040smp_cache_flush(void) 1041{ 1042 1043 if (smp_started) 1044 smp_tlb_shootdown(IPI_INVLCACHE, 0, 0); 1045} 1046 1047void 1048smp_invltlb(void) 1049{ 1050 1051 if (smp_started) { 1052 smp_tlb_shootdown(IPI_INVLTLB, 0, 0); 1053 } 1054} 1055 1056void 1057smp_invlpg(vm_offset_t addr) 1058{ 1059 1060 if (smp_started) { 1061 smp_tlb_shootdown(IPI_INVLPG, addr, 0); 1062 } 1063} 1064 1065void 1066smp_invlpg_range(vm_offset_t addr1, vm_offset_t addr2) 1067{ 1068 1069 if (smp_started) { 1070 smp_tlb_shootdown(IPI_INVLRNG, addr1, addr2); 1071 } 1072} 1073 1074void 1075smp_masked_invltlb(cpumask_t mask) 1076{ 1077 1078 if (smp_started) { 1079 smp_targeted_tlb_shootdown(mask, IPI_INVLTLB, 0, 0); 1080 } 1081} 1082 1083void 1084smp_masked_invlpg(cpumask_t mask, vm_offset_t addr) 1085{ 1086 1087 if (smp_started) { 1088 smp_targeted_tlb_shootdown(mask, IPI_INVLPG, addr, 0); 1089 } 1090} 1091 1092void 1093smp_masked_invlpg_range(cpumask_t mask, vm_offset_t addr1, vm_offset_t addr2) 1094{ 1095 1096 if (smp_started) { 1097 smp_targeted_tlb_shootdown(mask, IPI_INVLRNG, addr1, addr2); 1098 } 1099} 1100 1101/* 1102 * send an IPI to a set of cpus. 
*/
void
ipi_selected(cpumask_t cpus, u_int ipi)
{
	int cpu;
	u_int bitmap = 0;
	u_int old_pending;
	u_int new_pending;

	/*
	 * Bitmapped IPIs are accumulated in cpu_ipi_pending[] and
	 * delivered through the single RESCHEDULE_VECTOR channel.
	 */
	if (IPI_IS_BITMAPED(ipi)) {
		bitmap = 1 << ipi;
		ipi = IPI_BITMAP_VECTOR;
	}

	CTR3(KTR_SMP, "%s: cpus: %x ipi: %x", __func__, cpus, ipi);
	while ((cpu = ffs(cpus)) != 0) {
		cpu--;
		cpus &= ~(1 << cpu);

		KASSERT(cpu_apic_ids[cpu] != -1,
		    ("IPI to non-existent CPU %d", cpu));

		if (bitmap) {
			/* Atomically merge the new bits into the pending set. */
			do {
				old_pending = cpu_ipi_pending[cpu];
				new_pending = old_pending | bitmap;
			} while  (!atomic_cmpset_int(&cpu_ipi_pending[cpu],old_pending, new_pending));

			/* Only kick the CPU if it wasn't already pending. */
			if (!old_pending)
				ipi_pcpu(cpu, RESCHEDULE_VECTOR);
			continue;

		} else {
			/* Function IPIs require a published argument block. */
			KASSERT(call_data != NULL, ("call_data not set"));
			ipi_pcpu(cpu, CALL_FUNCTION_VECTOR);
		}
	}
}

/*
 * send an IPI to all CPUs EXCEPT myself
 */
void
ipi_all_but_self(u_int ipi)
{

	/*
	 * NOTE(review): both branches perform the same call; the early
	 * branch only skips the CTR2 trace — confirm this is intended.
	 */
	if (IPI_IS_BITMAPED(ipi) || (ipi == IPI_STOP && stop_cpus_with_nmi)) {
		ipi_selected(PCPU_GET(other_cpus), ipi);
		return;
	}
	CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi);
	ipi_selected(PCPU_GET(other_cpus), ipi);
}

/*
 * Handle an IPI_STOP by saving our current context and spinning until we
 * are resumed.
 */
void
cpustop_handler(void)
{
	int cpu = PCPU_GET(cpuid);
	int cpumask = PCPU_GET(cpumask);

	savectx(&stoppcbs[cpu]);

	/* Indicate that we are stopped */
	atomic_set_int(&stopped_cpus, cpumask);

	/* Wait for restart */
	while (!(started_cpus & cpumask))
	    ia32_pause();

	atomic_clear_int(&started_cpus, cpumask);
	atomic_clear_int(&stopped_cpus, cpumask);

	/* CPU 0 runs any deferred restart callback on behalf of everyone. */
	if (cpu == 0 && cpustop_restartfunc != NULL) {
		cpustop_restartfunc();
		cpustop_restartfunc = NULL;
	}
}

/*
 * This is called once the rest of the system is up and running and we're
 * ready to let the AP's out of the pen.
 */
static void
release_aps(void *dummy __unused)
{

	if (mp_ncpus == 1)
		return;
	/* init_secondary() spins on aps_ready; this releases every AP. */
	atomic_store_rel_int(&aps_ready, 1);
	while (smp_started == 0)
		ia32_pause();
}

SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, release_aps, NULL);
SYSINIT(start_ipis, SI_SUB_INTR, SI_ORDER_ANY, xen_smp_intr_init_cpus, NULL);