/*-
 * Copyright (c) 1996, by Steve Passe
 * Copyright (c) 2003, by Peter Wemm
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. The name of the developer may NOT be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
25163086Sflz */ 26163086Sflz 27163086Sflz#include <sys/cdefs.h> 28163086Sflz__FBSDID("$FreeBSD: head/sys/amd64/amd64/mp_machdep.c 176734 2008-03-02 07:58:42Z jeff $"); 29163086Sflz 30163086Sflz#include "opt_cpu.h" 31163086Sflz#include "opt_kstack_pages.h" 32163086Sflz#include "opt_mp_watchdog.h" 33163086Sflz#include "opt_sched.h" 34163086Sflz 3520031Sphk#include <sys/param.h> 36174990Sache#include <sys/systm.h> 3774606Sache#include <sys/bus.h> 38163086Sflz#ifdef GPROF 39163086Sflz#include <sys/gmon.h> 40163086Sflz#endif 41163086Sflz#include <sys/kernel.h> 42163086Sflz#include <sys/ktr.h> 43163086Sflz#include <sys/lock.h> 44163086Sflz#include <sys/malloc.h> 4520031Sphk#include <sys/memrange.h> 46174990Sache#include <sys/mutex.h> 4720031Sphk#include <sys/pcpu.h> 48163086Sflz#include <sys/proc.h> 49163086Sflz#include <sys/sched.h> 50163086Sflz#include <sys/smp.h> 51163086Sflz#include <sys/sysctl.h> 52163086Sflz 53163086Sflz#include <vm/vm.h> 54163086Sflz#include <vm/vm_param.h> 5520031Sphk#include <vm/pmap.h> 5620031Sphk#include <vm/vm_kern.h> 5720031Sphk#include <vm/vm_extern.h> 5820031Sphk 5920031Sphk#include <machine/apicreg.h> 6020031Sphk#include <machine/md_var.h> 6120031Sphk#include <machine/mp_watchdog.h> 6274570Sache#include <machine/pcb.h> 6320031Sphk#include <machine/psl.h> 6420031Sphk#include <machine/smp.h> 6520031Sphk#include <machine/specialreg.h> 6654090Sache#include <machine/tss.h> 6720031Sphk 6820031Sphk#define WARMBOOT_TARGET 0 6920031Sphk#define WARMBOOT_OFF (KERNBASE + 0x0467) 7020031Sphk#define WARMBOOT_SEG (KERNBASE + 0x0469) 7120031Sphk 7220031Sphk#define CMOS_REG (0x70) 7320031Sphk#define CMOS_DATA (0x71) 7420031Sphk#define BIOS_RESET (0x0f) 7520031Sphk#define BIOS_WARM (0x0a) 7620031Sphk 7720031Sphk/* lock region used by kernel profiling */ 7854090Sacheint mcount_lock; 7953943Sache 80174990Sacheint mp_naps; /* # of Applications processors */ 8153943Sacheint boot_cpu_id = -1; /* designated BSP */ 82163086Sflzextern int nkpt; 83163086Sflz 
84163086Sflzextern struct pcpu __pcpu[]; 85163086Sflz 86163086Sflz/* AP uses this during bootstrap. Do not staticize. */ 87163086Sflzchar *bootSTK; 88163086Sflzstatic int bootAP; 89163086Sflz 90163086Sflz/* Free these after use */ 91163086Sflzvoid *bootstacks[MAXCPU]; 92163086Sflz 93163086Sflz/* Temporary holder for double fault stack */ 9453943Sachechar *doublefault_stack; 9574413Sache 9653943Sache/* Hotwire a 0->4MB V==P mapping */ 9774413Sacheextern pt_entry_t *KPTphys; 9853961Sache 9974413Sache/* SMP page table page */ 10053961Sacheextern pt_entry_t *SMPpt; 10174413Sache 10274413Sachestruct pcb stoppcbs[MAXCPU]; 103 104/* Variables needed for SMP tlb shootdown. */ 105vm_offset_t smp_tlb_addr1; 106vm_offset_t smp_tlb_addr2; 107volatile int smp_tlb_wait; 108 109extern inthand_t IDTVEC(fast_syscall), IDTVEC(fast_syscall32); 110 111#ifdef STOP_NMI 112volatile cpumask_t ipi_nmi_pending; 113 114static void ipi_nmi_selected(u_int32_t cpus); 115#endif 116 117/* 118 * Local data and functions. 119 */ 120 121#ifdef STOP_NMI 122/* 123 * Provide an alternate method of stopping other CPUs. If another CPU has 124 * disabled interrupts the conventional STOP IPI will be blocked. This 125 * NMI-based stop should get through in that case. 126 */ 127static int stop_cpus_with_nmi = 1; 128SYSCTL_INT(_debug, OID_AUTO, stop_cpus_with_nmi, CTLTYPE_INT | CTLFLAG_RW, 129 &stop_cpus_with_nmi, 0, ""); 130TUNABLE_INT("debug.stop_cpus_with_nmi", &stop_cpus_with_nmi); 131#else 132#define stop_cpus_with_nmi 0 133#endif 134 135static u_int logical_cpus; 136 137/* used to hold the AP's until we are ready to release them */ 138static struct mtx ap_boot_mtx; 139 140/* Set to 1 once we're ready to let the APs out of the pen. */ 141static volatile int aps_ready = 0; 142 143/* 144 * Store data from cpu_add() until later in the boot when we actually setup 145 * the APs. 
146 */ 147struct cpu_info { 148 int cpu_present:1; 149 int cpu_bsp:1; 150 int cpu_disabled:1; 151} static cpu_info[MAX_APIC_ID + 1]; 152int cpu_apic_ids[MAXCPU]; 153 154/* Holds pending bitmap based IPIs per CPU */ 155static volatile u_int cpu_ipi_pending[MAXCPU]; 156 157static u_int boot_address; 158 159static void assign_cpu_ids(void); 160static void set_interrupt_apic_ids(void); 161static int start_all_aps(void); 162static int start_ap(int apic_id); 163static void release_aps(void *dummy); 164 165static int hlt_logical_cpus; 166static u_int hyperthreading_cpus; 167static cpumask_t hyperthreading_cpus_mask; 168static int hyperthreading_allowed = 1; 169static struct sysctl_ctx_list logical_cpu_clist; 170static u_int bootMP_size; 171 172static void 173mem_range_AP_init(void) 174{ 175 if (mem_range_softc.mr_op && mem_range_softc.mr_op->initAP) 176 mem_range_softc.mr_op->initAP(&mem_range_softc); 177} 178 179struct cpu_group * 180cpu_topo(void) 181{ 182 if (cpu_cores == 0) 183 cpu_cores = 1; 184 if (cpu_logical == 0) 185 cpu_logical = 1; 186 if (mp_ncpus % (cpu_cores * cpu_logical) != 0) { 187 printf("WARNING: Non-uniform processors.\n"); 188 printf("WARNING: Using suboptimal topology.\n"); 189 return (smp_topo_none()); 190 } 191 /* 192 * No multi-core or hyper-threaded. 193 */ 194 if (cpu_logical * cpu_cores == 1) 195 return (smp_topo_none()); 196 /* 197 * Only HTT no multi-core. 198 */ 199 if (cpu_logical > 1 && cpu_cores == 1) 200 return (smp_topo_1level(CG_SHARE_L1, cpu_logical, CG_FLAG_HTT)); 201 /* 202 * Only multi-core no HTT. 203 */ 204 if (cpu_cores > 1 && cpu_logical == 1) 205 return (smp_topo_1level(CG_SHARE_NONE, cpu_cores, 0)); 206 /* 207 * Both HTT and multi-core. 208 */ 209 return (smp_topo_2level(CG_SHARE_NONE, cpu_cores, 210 CG_SHARE_L1, cpu_logical, CG_FLAG_HTT)); 211} 212 213/* 214 * Calculate usable address in base memory for AP trampoline code. 
215 */ 216u_int 217mp_bootaddress(u_int basemem) 218{ 219 220 bootMP_size = mptramp_end - mptramp_start; 221 boot_address = trunc_page(basemem * 1024); /* round down to 4k boundary */ 222 if (((basemem * 1024) - boot_address) < bootMP_size) 223 boot_address -= PAGE_SIZE; /* not enough, lower by 4k */ 224 /* 3 levels of page table pages */ 225 mptramp_pagetables = boot_address - (PAGE_SIZE * 3); 226 227 return mptramp_pagetables; 228} 229 230void 231cpu_add(u_int apic_id, char boot_cpu) 232{ 233 234 if (apic_id > MAX_APIC_ID) { 235 panic("SMP: APIC ID %d too high", apic_id); 236 return; 237 } 238 KASSERT(cpu_info[apic_id].cpu_present == 0, ("CPU %d added twice", 239 apic_id)); 240 cpu_info[apic_id].cpu_present = 1; 241 if (boot_cpu) { 242 KASSERT(boot_cpu_id == -1, 243 ("CPU %d claims to be BSP, but CPU %d already is", apic_id, 244 boot_cpu_id)); 245 boot_cpu_id = apic_id; 246 cpu_info[apic_id].cpu_bsp = 1; 247 } 248 if (mp_ncpus < MAXCPU) { 249 mp_ncpus++; 250 mp_maxid = mp_ncpus -1; 251 } 252 if (bootverbose) 253 printf("SMP: Added CPU %d (%s)\n", apic_id, boot_cpu ? "BSP" : 254 "AP"); 255} 256 257void 258cpu_mp_setmaxid(void) 259{ 260 261 /* 262 * mp_maxid should be already set by calls to cpu_add(). 263 * Just sanity check its value here. 264 */ 265 if (mp_ncpus == 0) 266 KASSERT(mp_maxid == 0, 267 ("%s: mp_ncpus is zero, but mp_maxid is not", __func__)); 268 else if (mp_ncpus == 1) 269 mp_maxid = 0; 270 else 271 KASSERT(mp_maxid >= mp_ncpus - 1, 272 ("%s: counters out of sync: max %d, count %d", __func__, 273 mp_maxid, mp_ncpus)); 274} 275 276int 277cpu_mp_probe(void) 278{ 279 280 /* 281 * Always record BSP in CPU map so that the mbuf init code works 282 * correctly. 283 */ 284 all_cpus = 1; 285 if (mp_ncpus == 0) { 286 /* 287 * No CPUs were found, so this must be a UP system. Setup 288 * the variables to represent a system with a single CPU 289 * with an id of 0. 290 */ 291 mp_ncpus = 1; 292 return (0); 293 } 294 295 /* At least one CPU was found. 
*/ 296 if (mp_ncpus == 1) { 297 /* 298 * One CPU was found, so this must be a UP system with 299 * an I/O APIC. 300 */ 301 mp_maxid = 0; 302 return (0); 303 } 304 305 /* At least two CPUs were found. */ 306 return (1); 307} 308 309/* 310 * Initialize the IPI handlers and start up the AP's. 311 */ 312void 313cpu_mp_start(void) 314{ 315 int i; 316 u_int threads_per_cache, p[4]; 317 318 /* Initialize the logical ID to APIC ID table. */ 319 for (i = 0; i < MAXCPU; i++) { 320 cpu_apic_ids[i] = -1; 321 cpu_ipi_pending[i] = 0; 322 } 323 324 /* Install an inter-CPU IPI for TLB invalidation */ 325 setidt(IPI_INVLTLB, IDTVEC(invltlb), SDT_SYSIGT, SEL_KPL, 0); 326 setidt(IPI_INVLPG, IDTVEC(invlpg), SDT_SYSIGT, SEL_KPL, 0); 327 setidt(IPI_INVLRNG, IDTVEC(invlrng), SDT_SYSIGT, SEL_KPL, 0); 328 329 /* Install an inter-CPU IPI for cache invalidation. */ 330 setidt(IPI_INVLCACHE, IDTVEC(invlcache), SDT_SYSIGT, SEL_KPL, 0); 331 332 /* Install an inter-CPU IPI for all-CPU rendezvous */ 333 setidt(IPI_RENDEZVOUS, IDTVEC(rendezvous), SDT_SYSIGT, SEL_KPL, 0); 334 335 /* Install generic inter-CPU IPI handler */ 336 setidt(IPI_BITMAP_VECTOR, IDTVEC(ipi_intr_bitmap_handler), 337 SDT_SYSIGT, SEL_KPL, 0); 338 339 /* Install an inter-CPU IPI for CPU stop/restart */ 340 setidt(IPI_STOP, IDTVEC(cpustop), SDT_SYSIGT, SEL_KPL, 0); 341 342 /* Set boot_cpu_id if needed. */ 343 if (boot_cpu_id == -1) { 344 boot_cpu_id = PCPU_GET(apic_id); 345 cpu_info[boot_cpu_id].cpu_bsp = 1; 346 } else 347 KASSERT(boot_cpu_id == PCPU_GET(apic_id), 348 ("BSP's APIC ID doesn't match boot_cpu_id")); 349 cpu_apic_ids[0] = boot_cpu_id; 350 351 assign_cpu_ids(); 352 353 /* Start each Application Processor */ 354 start_all_aps(); 355 356 /* Setup the initial logical CPUs info. */ 357 logical_cpus = logical_cpus_mask = 0; 358 if (cpu_feature & CPUID_HTT) 359 logical_cpus = (cpu_procinfo & CPUID_HTT_CORES) >> 16; 360 361 /* 362 * Work out if hyperthreading is *really* enabled. 
This 363 * is made really ugly by the fact that processors lie: Dual 364 * core processors claim to be hyperthreaded even when they're 365 * not, presumably because they want to be treated the same 366 * way as HTT with respect to per-cpu software licensing. 367 * At the time of writing (May 12, 2005) the only hyperthreaded 368 * cpus are from Intel, and Intel's dual-core processors can be 369 * identified via the "deterministic cache parameters" cpuid 370 * calls. 371 */ 372 /* 373 * First determine if this is an Intel processor which claims 374 * to have hyperthreading support. 375 */ 376 if ((cpu_feature & CPUID_HTT) && 377 (strcmp(cpu_vendor, "GenuineIntel") == 0)) { 378 /* 379 * If the "deterministic cache parameters" cpuid calls 380 * are available, use them. 381 */ 382 if (cpu_high >= 4) { 383 /* Ask the processor about the L1 cache. */ 384 for (i = 0; i < 1; i++) { 385 cpuid_count(4, i, p); 386 threads_per_cache = ((p[0] & 0x3ffc000) >> 14) + 1; 387 if (hyperthreading_cpus < threads_per_cache) 388 hyperthreading_cpus = threads_per_cache; 389 if ((p[0] & 0x1f) == 0) 390 break; 391 } 392 } 393 394 /* 395 * If the deterministic cache parameters are not 396 * available, or if no caches were reported to exist, 397 * just accept what the HTT flag indicated. 398 */ 399 if (hyperthreading_cpus == 0) 400 hyperthreading_cpus = logical_cpus; 401 } 402 403 set_interrupt_apic_ids(); 404} 405 406 407/* 408 * Print various information about the SMP system hardware and setup. 
409 */ 410void 411cpu_mp_announce(void) 412{ 413 int i, x; 414 415 /* List CPUs */ 416 printf(" cpu0 (BSP): APIC ID: %2d\n", boot_cpu_id); 417 for (i = 1, x = 0; x <= MAX_APIC_ID; x++) { 418 if (!cpu_info[x].cpu_present || cpu_info[x].cpu_bsp) 419 continue; 420 if (cpu_info[x].cpu_disabled) 421 printf(" cpu (AP): APIC ID: %2d (disabled)\n", x); 422 else { 423 KASSERT(i < mp_ncpus, 424 ("mp_ncpus and actual cpus are out of whack")); 425 printf(" cpu%d (AP): APIC ID: %2d\n", i++, x); 426 } 427 } 428} 429 430/* 431 * AP CPU's call this to initialize themselves. 432 */ 433void 434init_secondary(void) 435{ 436 struct pcpu *pc; 437 u_int64_t msr, cr0; 438 int cpu, gsel_tss; 439 440 /* Set by the startup code for us to use */ 441 cpu = bootAP; 442 443 /* Init tss */ 444 common_tss[cpu] = common_tss[0]; 445 common_tss[cpu].tss_rsp0 = 0; /* not used until after switch */ 446 common_tss[cpu].tss_iobase = sizeof(struct amd64tss); 447 common_tss[cpu].tss_ist1 = (long)&doublefault_stack[PAGE_SIZE]; 448 449 gdt_segs[GPROC0_SEL].ssd_base = (long) &common_tss[cpu]; 450 ssdtosyssd(&gdt_segs[GPROC0_SEL], 451 (struct system_segment_descriptor *)&gdt[GPROC0_SEL]); 452 453 lgdt(&r_gdt); /* does magic intra-segment return */ 454 455 /* Get per-cpu data */ 456 pc = &__pcpu[cpu]; 457 458 /* prime data page for it to use */ 459 pcpu_init(pc, cpu, sizeof(struct pcpu)); 460 pc->pc_apic_id = cpu_apic_ids[cpu]; 461 pc->pc_prvspace = pc; 462 pc->pc_curthread = 0; 463 pc->pc_tssp = &common_tss[cpu]; 464 pc->pc_rsp0 = 0; 465 466 wrmsr(MSR_FSBASE, 0); /* User value */ 467 wrmsr(MSR_GSBASE, (u_int64_t)pc); 468 wrmsr(MSR_KGSBASE, (u_int64_t)pc); /* XXX User value while we're in the kernel */ 469 470 lidt(&r_idt); 471 472 gsel_tss = GSEL(GPROC0_SEL, SEL_KPL); 473 ltr(gsel_tss); 474 475 /* 476 * Set to a known state: 477 * Set by mpboot.s: CR0_PG, CR0_PE 478 * Set by cpu_setregs: CR0_NE, CR0_MP, CR0_TS, CR0_WP, CR0_AM 479 */ 480 cr0 = rcr0(); 481 cr0 &= ~(CR0_CD | CR0_NW | CR0_EM); 482 load_cr0(cr0); 
483 484 /* Set up the fast syscall stuff */ 485 msr = rdmsr(MSR_EFER) | EFER_SCE; 486 wrmsr(MSR_EFER, msr); 487 wrmsr(MSR_LSTAR, (u_int64_t)IDTVEC(fast_syscall)); 488 wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32)); 489 msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) | 490 ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48); 491 wrmsr(MSR_STAR, msr); 492 wrmsr(MSR_SF_MASK, PSL_NT|PSL_T|PSL_I|PSL_C|PSL_D); 493 494 /* Disable local APIC just to be sure. */ 495 lapic_disable(); 496 497 /* signal our startup to the BSP. */ 498 mp_naps++; 499 500 /* Spin until the BSP releases the AP's. */ 501 while (!aps_ready) 502 ia32_pause(); 503 504 /* Initialize the PAT MSR. */ 505 pmap_init_pat(); 506 507 /* set up CPU registers and state */ 508 cpu_setregs(); 509 510 /* set up SSE/NX registers */ 511 initializecpu(); 512 513 /* set up FPU state on the AP */ 514 fpuinit(); 515 516 /* A quick check from sanity claus */ 517 if (PCPU_GET(apic_id) != lapic_id()) { 518 printf("SMP: cpuid = %d\n", PCPU_GET(cpuid)); 519 printf("SMP: actual apic_id = %d\n", lapic_id()); 520 printf("SMP: correct apic_id = %d\n", PCPU_GET(apic_id)); 521 panic("cpuid mismatch! boom!!"); 522 } 523 524 /* Initialize curthread. */ 525 KASSERT(PCPU_GET(idlethread) != NULL, ("no idle thread")); 526 PCPU_SET(curthread, PCPU_GET(idlethread)); 527 528 mtx_lock_spin(&ap_boot_mtx); 529 530 /* Init local apic for irq's */ 531 lapic_setup(1); 532 533 /* Set memory range attributes for this CPU to match the BSP */ 534 mem_range_AP_init(); 535 536 smp_cpus++; 537 538 CTR1(KTR_SMP, "SMP: AP CPU #%d Launched", PCPU_GET(cpuid)); 539 printf("SMP: AP CPU #%d Launched!\n", PCPU_GET(cpuid)); 540 541 /* Determine if we are a logical CPU. */ 542 if (logical_cpus > 1 && PCPU_GET(apic_id) % logical_cpus != 0) 543 logical_cpus_mask |= PCPU_GET(cpumask); 544 545 /* Determine if we are a hyperthread. 
*/ 546 if (hyperthreading_cpus > 1 && 547 PCPU_GET(apic_id) % hyperthreading_cpus != 0) 548 hyperthreading_cpus_mask |= PCPU_GET(cpumask); 549 550 /* Build our map of 'other' CPUs. */ 551 PCPU_SET(other_cpus, all_cpus & ~PCPU_GET(cpumask)); 552 553 if (bootverbose) 554 lapic_dump("AP"); 555 556 if (smp_cpus == mp_ncpus) { 557 /* enable IPI's, tlb shootdown, freezes etc */ 558 atomic_store_rel_int(&smp_started, 1); 559 smp_active = 1; /* historic */ 560 } 561 562 /* 563 * Enable global pages TLB extension 564 * This also implicitly flushes the TLB 565 */ 566 567 load_cr4(rcr4() | CR4_PGE); 568 569 mtx_unlock_spin(&ap_boot_mtx); 570 571 /* wait until all the AP's are up */ 572 while (smp_started == 0) 573 ia32_pause(); 574 575 sched_throw(NULL); 576 577 panic("scheduler returned us to %s", __func__); 578 /* NOTREACHED */ 579} 580 581/******************************************************************* 582 * local functions and data 583 */ 584 585/* 586 * We tell the I/O APIC code about all the CPUs we want to receive 587 * interrupts. If we don't want certain CPUs to receive IRQs we 588 * can simply not tell the I/O APIC code about them in this function. 589 * We also do not tell it about the BSP since it tells itself about 590 * the BSP internally to work with UP kernels and on UP machines. 591 */ 592static void 593set_interrupt_apic_ids(void) 594{ 595 u_int i, apic_id; 596 597 for (i = 0; i < MAXCPU; i++) { 598 apic_id = cpu_apic_ids[i]; 599 if (apic_id == -1) 600 continue; 601 if (cpu_info[apic_id].cpu_bsp) 602 continue; 603 if (cpu_info[apic_id].cpu_disabled) 604 continue; 605 606 /* Don't let hyperthreads service interrupts. */ 607 if (hyperthreading_cpus > 1 && 608 apic_id % hyperthreading_cpus != 0) 609 continue; 610 611 intr_add_cpu(i); 612 } 613} 614 615/* 616 * Assign logical CPU IDs to local APICs. 617 */ 618static void 619assign_cpu_ids(void) 620{ 621 u_int i; 622 623 /* Check for explicitly disabled CPUs. 
*/ 624 for (i = 0; i <= MAX_APIC_ID; i++) { 625 if (!cpu_info[i].cpu_present || cpu_info[i].cpu_bsp) 626 continue; 627 628 /* Don't use this CPU if it has been disabled by a tunable. */ 629 if (resource_disabled("lapic", i)) { 630 cpu_info[i].cpu_disabled = 1; 631 continue; 632 } 633 } 634 635 /* 636 * Assign CPU IDs to local APIC IDs and disable any CPUs 637 * beyond MAXCPU. CPU 0 has already been assigned to the BSP, 638 * so we only have to assign IDs for APs. 639 */ 640 mp_ncpus = 1; 641 for (i = 0; i <= MAX_APIC_ID; i++) { 642 if (!cpu_info[i].cpu_present || cpu_info[i].cpu_bsp || 643 cpu_info[i].cpu_disabled) 644 continue; 645 646 if (mp_ncpus < MAXCPU) { 647 cpu_apic_ids[mp_ncpus] = i; 648 mp_ncpus++; 649 } else 650 cpu_info[i].cpu_disabled = 1; 651 } 652 KASSERT(mp_maxid >= mp_ncpus - 1, 653 ("%s: counters out of sync: max %d, count %d", __func__, mp_maxid, 654 mp_ncpus)); 655} 656 657/* 658 * start each AP in our list 659 */ 660static int 661start_all_aps(void) 662{ 663 vm_offset_t va = boot_address + KERNBASE; 664 u_int64_t *pt4, *pt3, *pt2; 665 u_int32_t mpbioswarmvec; 666 int apic_id, cpu, i; 667 u_char mpbiosreason; 668 669 mtx_init(&ap_boot_mtx, "ap boot", NULL, MTX_SPIN); 670 671 /* install the AP 1st level boot code */ 672 pmap_kenter(va, boot_address); 673 pmap_invalidate_page(kernel_pmap, va); 674 bcopy(mptramp_start, (void *)va, bootMP_size); 675 676 /* Locate the page tables, they'll be below the trampoline */ 677 pt4 = (u_int64_t *)(uintptr_t)(mptramp_pagetables + KERNBASE); 678 pt3 = pt4 + (PAGE_SIZE) / sizeof(u_int64_t); 679 pt2 = pt3 + (PAGE_SIZE) / sizeof(u_int64_t); 680 681 /* Create the initial 1GB replicated page tables */ 682 for (i = 0; i < 512; i++) { 683 /* Each slot of the level 4 pages points to the same level 3 page */ 684 pt4[i] = (u_int64_t)(uintptr_t)(mptramp_pagetables + PAGE_SIZE); 685 pt4[i] |= PG_V | PG_RW | PG_U; 686 687 /* Each slot of the level 3 pages points to the same level 2 page */ 688 pt3[i] = 
(u_int64_t)(uintptr_t)(mptramp_pagetables + (2 * PAGE_SIZE)); 689 pt3[i] |= PG_V | PG_RW | PG_U; 690 691 /* The level 2 page slots are mapped with 2MB pages for 1GB. */ 692 pt2[i] = i * (2 * 1024 * 1024); 693 pt2[i] |= PG_V | PG_RW | PG_PS | PG_U; 694 } 695 696 /* save the current value of the warm-start vector */ 697 mpbioswarmvec = *((u_int32_t *) WARMBOOT_OFF); 698 outb(CMOS_REG, BIOS_RESET); 699 mpbiosreason = inb(CMOS_DATA); 700 701 /* setup a vector to our boot code */ 702 *((volatile u_short *) WARMBOOT_OFF) = WARMBOOT_TARGET; 703 *((volatile u_short *) WARMBOOT_SEG) = (boot_address >> 4); 704 outb(CMOS_REG, BIOS_RESET); 705 outb(CMOS_DATA, BIOS_WARM); /* 'warm-start' */ 706 707 /* start each AP */ 708 for (cpu = 1; cpu < mp_ncpus; cpu++) { 709 apic_id = cpu_apic_ids[cpu]; 710 711 /* allocate and set up an idle stack data page */ 712 bootstacks[cpu] = (void *)kmem_alloc(kernel_map, KSTACK_PAGES * PAGE_SIZE); 713 doublefault_stack = (char *)kmem_alloc(kernel_map, PAGE_SIZE); 714 715 bootSTK = (char *)bootstacks[cpu] + KSTACK_PAGES * PAGE_SIZE - 8; 716 bootAP = cpu; 717 718 /* attempt to start the Application Processor */ 719 if (!start_ap(apic_id)) { 720 /* restore the warmstart vector */ 721 *(u_int32_t *) WARMBOOT_OFF = mpbioswarmvec; 722 panic("AP #%d (PHY# %d) failed!", cpu, apic_id); 723 } 724 725 all_cpus |= (1 << cpu); /* record AP in CPU map */ 726 } 727 728 /* build our map of 'other' CPUs */ 729 PCPU_SET(other_cpus, all_cpus & ~PCPU_GET(cpumask)); 730 731 /* restore the warmstart vector */ 732 *(u_int32_t *) WARMBOOT_OFF = mpbioswarmvec; 733 734 outb(CMOS_REG, BIOS_RESET); 735 outb(CMOS_DATA, mpbiosreason); 736 737 /* number of APs actually started */ 738 return mp_naps; 739} 740 741 742/* 743 * This function starts the AP (application processor) identified 744 * by the APIC ID 'physicalCpu'. It does quite a "song and dance" 745 * to accomplish this. This is necessary because of the nuances 746 * of the different hardware we might encounter. 
It isn't pretty, 747 * but it seems to work. 748 */ 749static int 750start_ap(int apic_id) 751{ 752 int vector, ms; 753 int cpus; 754 755 /* calculate the vector */ 756 vector = (boot_address >> 12) & 0xff; 757 758 /* used as a watchpoint to signal AP startup */ 759 cpus = mp_naps; 760 761 /* 762 * first we do an INIT/RESET IPI this INIT IPI might be run, reseting 763 * and running the target CPU. OR this INIT IPI might be latched (P5 764 * bug), CPU waiting for STARTUP IPI. OR this INIT IPI might be 765 * ignored. 766 */ 767 768 /* do an INIT IPI: assert RESET */ 769 lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE | 770 APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, apic_id); 771 772 /* wait for pending status end */ 773 lapic_ipi_wait(-1); 774 775 /* do an INIT IPI: deassert RESET */ 776 lapic_ipi_raw(APIC_DEST_ALLESELF | APIC_TRIGMOD_LEVEL | 777 APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, 0); 778 779 /* wait for pending status end */ 780 DELAY(10000); /* wait ~10mS */ 781 lapic_ipi_wait(-1); 782 783 /* 784 * next we do a STARTUP IPI: the previous INIT IPI might still be 785 * latched, (P5 bug) this 1st STARTUP would then terminate 786 * immediately, and the previously started INIT IPI would continue. OR 787 * the previous INIT IPI has already run. and this STARTUP IPI will 788 * run. OR the previous INIT IPI was ignored. and this STARTUP IPI 789 * will run. 790 */ 791 792 /* do a STARTUP IPI */ 793 lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE | 794 APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP | 795 vector, apic_id); 796 lapic_ipi_wait(-1); 797 DELAY(200); /* wait ~200uS */ 798 799 /* 800 * finally we do a 2nd STARTUP IPI: this 2nd STARTUP IPI should run IF 801 * the previous STARTUP IPI was cancelled by a latched INIT IPI. OR 802 * this STARTUP IPI will be ignored, as only ONE STARTUP IPI is 803 * recognized after hardware RESET or INIT IPI. 
804 */ 805 806 lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE | 807 APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP | 808 vector, apic_id); 809 lapic_ipi_wait(-1); 810 DELAY(200); /* wait ~200uS */ 811 812 /* Wait up to 5 seconds for it to start. */ 813 for (ms = 0; ms < 5000; ms++) { 814 if (mp_naps > cpus) 815 return 1; /* return SUCCESS */ 816 DELAY(1000); 817 } 818 return 0; /* return FAILURE */ 819} 820 821/* 822 * Flush the TLB on all other CPU's 823 */ 824static void 825smp_tlb_shootdown(u_int vector, vm_offset_t addr1, vm_offset_t addr2) 826{ 827 u_int ncpu; 828 829 ncpu = mp_ncpus - 1; /* does not shootdown self */ 830 if (ncpu < 1) 831 return; /* no other cpus */ 832 if (!(read_rflags() & PSL_I)) 833 panic("%s: interrupts disabled", __func__); 834 mtx_lock_spin(&smp_ipi_mtx); 835 smp_tlb_addr1 = addr1; 836 smp_tlb_addr2 = addr2; 837 atomic_store_rel_int(&smp_tlb_wait, 0); 838 ipi_all_but_self(vector); 839 while (smp_tlb_wait < ncpu) 840 ia32_pause(); 841 mtx_unlock_spin(&smp_ipi_mtx); 842} 843 844static void 845smp_targeted_tlb_shootdown(u_int mask, u_int vector, vm_offset_t addr1, vm_offset_t addr2) 846{ 847 int ncpu, othercpus; 848 849 othercpus = mp_ncpus - 1; 850 if (mask == (u_int)-1) { 851 ncpu = othercpus; 852 if (ncpu < 1) 853 return; 854 } else { 855 mask &= ~PCPU_GET(cpumask); 856 if (mask == 0) 857 return; 858 ncpu = bitcount32(mask); 859 if (ncpu > othercpus) { 860 /* XXX this should be a panic offence */ 861 printf("SMP: tlb shootdown to %d other cpus (only have %d)\n", 862 ncpu, othercpus); 863 ncpu = othercpus; 864 } 865 /* XXX should be a panic, implied by mask == 0 above */ 866 if (ncpu < 1) 867 return; 868 } 869 if (!(read_rflags() & PSL_I)) 870 panic("%s: interrupts disabled", __func__); 871 mtx_lock_spin(&smp_ipi_mtx); 872 smp_tlb_addr1 = addr1; 873 smp_tlb_addr2 = addr2; 874 atomic_store_rel_int(&smp_tlb_wait, 0); 875 if (mask == (u_int)-1) 876 ipi_all_but_self(vector); 877 else 878 ipi_selected(mask, vector); 879 
while (smp_tlb_wait < ncpu) 880 ia32_pause(); 881 mtx_unlock_spin(&smp_ipi_mtx); 882} 883 884void 885smp_cache_flush(void) 886{ 887 888 if (smp_started) 889 smp_tlb_shootdown(IPI_INVLCACHE, 0, 0); 890} 891 892void 893smp_invltlb(void) 894{ 895 896 if (smp_started) { 897 smp_tlb_shootdown(IPI_INVLTLB, 0, 0); 898 } 899} 900 901void 902smp_invlpg(vm_offset_t addr) 903{ 904 905 if (smp_started) 906 smp_tlb_shootdown(IPI_INVLPG, addr, 0); 907} 908 909void 910smp_invlpg_range(vm_offset_t addr1, vm_offset_t addr2) 911{ 912 913 if (smp_started) { 914 smp_tlb_shootdown(IPI_INVLRNG, addr1, addr2); 915 } 916} 917 918void 919smp_masked_invltlb(u_int mask) 920{ 921 922 if (smp_started) { 923 smp_targeted_tlb_shootdown(mask, IPI_INVLTLB, 0, 0); 924 } 925} 926 927void 928smp_masked_invlpg(u_int mask, vm_offset_t addr) 929{ 930 931 if (smp_started) { 932 smp_targeted_tlb_shootdown(mask, IPI_INVLPG, addr, 0); 933 } 934} 935 936void 937smp_masked_invlpg_range(u_int mask, vm_offset_t addr1, vm_offset_t addr2) 938{ 939 940 if (smp_started) { 941 smp_targeted_tlb_shootdown(mask, IPI_INVLRNG, addr1, addr2); 942 } 943} 944 945void 946ipi_bitmap_handler(struct trapframe frame) 947{ 948 int cpu = PCPU_GET(cpuid); 949 u_int ipi_bitmap; 950 951 ipi_bitmap = atomic_readandclear_int(&cpu_ipi_pending[cpu]); 952 953 if (ipi_bitmap & (1 << IPI_PREEMPT)) { 954 struct thread *running_thread = curthread; 955 thread_lock(running_thread); 956 if (running_thread->td_critnest > 1) 957 running_thread->td_owepreempt = 1; 958 else 959 mi_switch(SW_INVOL | SW_PREEMPT, NULL); 960 thread_unlock(running_thread); 961 } 962 963 /* Nothing to do for AST */ 964} 965 966/* 967 * send an IPI to a set of cpus. 
 */
void
ipi_selected(u_int32_t cpus, u_int ipi)
{
	int cpu;
	u_int bitmap = 0;
	u_int old_pending;
	u_int new_pending;

	/*
	 * Bitmapped IPIs share the single IPI_BITMAP_VECTOR; the actual
	 * request is recorded as a bit in the target's cpu_ipi_pending
	 * word rather than as a distinct vector.
	 */
	if (IPI_IS_BITMAPED(ipi)) {
		bitmap = 1 << ipi;
		ipi = IPI_BITMAP_VECTOR;
	}

#ifdef STOP_NMI
	/* Optionally deliver IPI_STOP as an NMI instead of a normal IPI. */
	if (ipi == IPI_STOP && stop_cpus_with_nmi) {
		ipi_nmi_selected(cpus);
		return;
	}
#endif
	CTR3(KTR_SMP, "%s: cpus: %x ipi: %x", __func__, cpus, ipi);
	/* Walk every CPU set in the mask; ffs() returns a 1-based index. */
	while ((cpu = ffs(cpus)) != 0) {
		cpu--;
		cpus &= ~(1 << cpu);

		KASSERT(cpu_apic_ids[cpu] != -1,
		    ("IPI to non-existent CPU %d", cpu));

		if (bitmap) {
			/*
			 * Atomically merge this request into the target's
			 * pending bitmap.
			 */
			do {
				old_pending = cpu_ipi_pending[cpu];
				new_pending = old_pending | bitmap;
			} while (!atomic_cmpset_int(&cpu_ipi_pending[cpu],old_pending, new_pending));

			/*
			 * If other bits were already pending, an
			 * IPI_BITMAP_VECTOR interrupt is already on its way
			 * to this CPU, so don't send a second one.
			 */
			if (old_pending)
				continue;
		}

		lapic_ipi_vectored(ipi, cpu_apic_ids[cpu]);
	}

}

/*
 * send an IPI INTerrupt containing 'vector' to all CPUs, including myself
 */
void
ipi_all(u_int ipi)
{

	/*
	 * Bitmapped IPIs (and NMI-delivered stops) cannot use the APIC
	 * broadcast shorthand; fall back to per-CPU delivery.
	 */
	if (IPI_IS_BITMAPED(ipi) || (ipi == IPI_STOP && stop_cpus_with_nmi)) {
		ipi_selected(all_cpus, ipi);
		return;
	}
	CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi);
	lapic_ipi_vectored(ipi, APIC_IPI_DEST_ALL);
}

/*
 * send an IPI to all CPUs EXCEPT myself
 */
void
ipi_all_but_self(u_int ipi)
{

	/* Same broadcast-shorthand restriction as ipi_all(). */
	if (IPI_IS_BITMAPED(ipi) || (ipi == IPI_STOP && stop_cpus_with_nmi)) {
		ipi_selected(PCPU_GET(other_cpus), ipi);
		return;
	}
	CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi);
	lapic_ipi_vectored(ipi, APIC_IPI_DEST_OTHERS);
}

/*
 * send an IPI to myself
 */
void
ipi_self(u_int ipi)
{

	/* Same broadcast-shorthand restriction as ipi_all(). */
	if (IPI_IS_BITMAPED(ipi) || (ipi == IPI_STOP && stop_cpus_with_nmi)) {
		ipi_selected(PCPU_GET(cpumask), ipi);
		return;
	}
	CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi);
	lapic_ipi_vectored(ipi, APIC_IPI_DEST_SELF);
}

#ifdef STOP_NMI
/*
 * send NMI IPI to selected CPUs
 */

/* Spins allowed waiting for a previous IPI to be delivered. */
#define BEFORE_SPIN 1000000

void
ipi_nmi_selected(u_int32_t cpus)
{
	int cpu;
	register_t icrlo;

	/* Build the ICR low word for an edge-triggered NMI to a physical ID. */
	icrlo = APIC_DELMODE_NMI | APIC_DESTMODE_PHY | APIC_LEVEL_ASSERT
		| APIC_TRIGMOD_EDGE;

	CTR2(KTR_SMP, "%s: cpus: %x nmi", __func__, cpus);

	/*
	 * Publish the target set first so ipi_nmi_handler() on each target
	 * can recognize this NMI as a stop request.
	 */
	atomic_set_int(&ipi_nmi_pending, cpus);

	while ((cpu = ffs(cpus)) != 0) {
		cpu--;
		cpus &= ~(1 << cpu);

		KASSERT(cpu_apic_ids[cpu] != -1,
		    ("IPI NMI to non-existent CPU %d", cpu));

		/* Wait for an earlier IPI to finish. */
		if (!lapic_ipi_wait(BEFORE_SPIN))
			panic("ipi_nmi_selected: previous IPI has not cleared");

		lapic_ipi_raw(icrlo, cpu_apic_ids[cpu]);
	}
}

/*
 * Called from the NMI handler.  Returns 1 if this NMI was not a pending
 * stop request (caller should treat it as a genuine NMI), 0 if it was a
 * stop request that has now been handled.
 */
int
ipi_nmi_handler(void)
{
	int cpumask = PCPU_GET(cpumask);

	if (!(ipi_nmi_pending & cpumask))
		return 1;

	atomic_clear_int(&ipi_nmi_pending, cpumask);
	cpustop_handler();
	return 0;
}

#endif /* STOP_NMI */

/*
 * Handle an IPI_STOP by saving our current context and spinning until we
 * are resumed.
 */
void
cpustop_handler(void)
{
	int cpu = PCPU_GET(cpuid);
	int cpumask = PCPU_GET(cpumask);

	/* Save our context so a debugger/dump can inspect this CPU. */
	savectx(&stoppcbs[cpu]);

	/* Indicate that we are stopped */
	atomic_set_int(&stopped_cpus, cpumask);

	/* Wait for restart */
	while (!(started_cpus & cpumask))
		ia32_pause();

	atomic_clear_int(&started_cpus, cpumask);
	atomic_clear_int(&stopped_cpus, cpumask);

	/* Only the BSP (cpu 0) runs the one-shot restart hook. */
	if (cpu == 0 && cpustop_restartfunc != NULL) {
		cpustop_restartfunc();
		cpustop_restartfunc = NULL;
	}
}

/*
 * This is called once the rest of the system is up and running and we're
 * ready to let the AP's out of the pen.
 */
static void
release_aps(void *dummy __unused)
{

	if (mp_ncpus == 1)
		return;
	/* Release the APs spinning on aps_ready, then wait for smp_started. */
	atomic_store_rel_int(&aps_ready, 1);
	while (smp_started == 0)
		ia32_pause();
}
SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, release_aps, NULL);

/*
 * Sysctl handler for machdep.hlt_cpus: set/report the bitmap of CPUs that
 * should be halted in the idle loop.
 */
static int
sysctl_hlt_cpus(SYSCTL_HANDLER_ARGS)
{
	u_int mask;
	int error;

	mask = hlt_cpus_mask;
	error = sysctl_handle_int(oidp, &mask, 0, req);
	if (error || !req->newptr)
		return (error);

	/* Keep hlt_logical_cpus consistent with the new mask. */
	if (logical_cpus_mask != 0 &&
	    (mask & logical_cpus_mask) == logical_cpus_mask)
		hlt_logical_cpus = 1;
	else
		hlt_logical_cpus = 0;

	/* Disallowed hyperthreads are always halted. */
	if (! hyperthreading_allowed)
		mask |= hyperthreading_cpus_mask;

	/* Never halt every CPU: if all are requested, keep CPU 0 running. */
	if ((mask & all_cpus) == all_cpus)
		mask &= ~(1<<0);
	hlt_cpus_mask = mask;
	return (error);
}
SYSCTL_PROC(_machdep, OID_AUTO, hlt_cpus, CTLTYPE_INT|CTLFLAG_RW,
    0, 0, sysctl_hlt_cpus, "IU",
    "Bitmap of CPUs to halt. 101 (binary) will halt CPUs 0 and 2.");

/*
 * Sysctl handler for machdep.hlt_logical_cpus: halt (1) or run (0) all
 * logical CPUs by folding logical_cpus_mask in or out of hlt_cpus_mask.
 */
static int
sysctl_hlt_logical_cpus(SYSCTL_HANDLER_ARGS)
{
	int disable, error;

	disable = hlt_logical_cpus;
	error = sysctl_handle_int(oidp, &disable, 0, req);
	if (error || !req->newptr)
		return (error);

	if (disable)
		hlt_cpus_mask |= logical_cpus_mask;
	else
		hlt_cpus_mask &= ~logical_cpus_mask;

	/* Disallowed hyperthreads stay halted regardless. */
	if (! hyperthreading_allowed)
		hlt_cpus_mask |= hyperthreading_cpus_mask;

	/* Never halt every CPU: keep CPU 0 running. */
	if ((hlt_cpus_mask & all_cpus) == all_cpus)
		hlt_cpus_mask &= ~(1<<0);

	hlt_logical_cpus = disable;
	return (error);
}

/*
 * Sysctl handler for machdep.hyperthreading_allowed: when disallowed,
 * force all hyperthread siblings into the halt mask.
 */
static int
sysctl_hyperthreading_allowed(SYSCTL_HANDLER_ARGS)
{
	int allowed, error;

	allowed = hyperthreading_allowed;
	error = sysctl_handle_int(oidp, &allowed, 0, req);
	if (error || !req->newptr)
		return (error);

	if (allowed)
		hlt_cpus_mask &= ~hyperthreading_cpus_mask;
	else
		hlt_cpus_mask |= hyperthreading_cpus_mask;

	/* Keep hlt_logical_cpus consistent with the updated mask. */
	if (logical_cpus_mask != 0 &&
	    (hlt_cpus_mask & logical_cpus_mask) == logical_cpus_mask)
		hlt_logical_cpus = 1;
	else
		hlt_logical_cpus = 0;

	/* Never halt every CPU: keep CPU 0 running. */
	if ((hlt_cpus_mask & all_cpus) == all_cpus)
		hlt_cpus_mask &= ~(1<<0);

	hyperthreading_allowed = allowed;
	return (error);
}

/*
 * Late SMP setup: register the logical-CPU / hyperthreading sysctls and
 * apply the machdep.* tunables to the initial halt mask.
 */
static void
cpu_hlt_setup(void *dummy __unused)
{

	if (logical_cpus_mask != 0) {
		TUNABLE_INT_FETCH("machdep.hlt_logical_cpus",
		    &hlt_logical_cpus);
		sysctl_ctx_init(&logical_cpu_clist);
		SYSCTL_ADD_PROC(&logical_cpu_clist,
		    SYSCTL_STATIC_CHILDREN(_machdep), OID_AUTO,
		    "hlt_logical_cpus", CTLTYPE_INT|CTLFLAG_RW, 0, 0,
		    sysctl_hlt_logical_cpus, "IU", "");
		SYSCTL_ADD_UINT(&logical_cpu_clist,
		    SYSCTL_STATIC_CHILDREN(_machdep), OID_AUTO,
		    "logical_cpus_mask", CTLTYPE_INT|CTLFLAG_RD,
		    &logical_cpus_mask, 0, "");

		if (hlt_logical_cpus)
			hlt_cpus_mask |= logical_cpus_mask;

		/*
		 * If necessary for security purposes, force
		 * hyperthreading off, regardless of the value
		 * of hlt_logical_cpus.
		 */
		if (hyperthreading_cpus_mask) {
			TUNABLE_INT_FETCH("machdep.hyperthreading_allowed",
			    &hyperthreading_allowed);
			SYSCTL_ADD_PROC(&logical_cpu_clist,
			    SYSCTL_STATIC_CHILDREN(_machdep), OID_AUTO,
			    "hyperthreading_allowed", CTLTYPE_INT|CTLFLAG_RW,
			    0, 0, sysctl_hyperthreading_allowed, "IU", "");
			if (! hyperthreading_allowed)
				hlt_cpus_mask |= hyperthreading_cpus_mask;
		}
	}
}
SYSINIT(cpu_hlt, SI_SUB_SMP, SI_ORDER_ANY, cpu_hlt_setup, NULL);

/*
 * Called from the idle loop: if this CPU is in hlt_cpus_mask, halt here
 * until it is removed from the mask.  Returns non-zero iff we halted.
 */
int
mp_grab_cpu_hlt(void)
{
	u_int mask = PCPU_GET(cpumask);
#ifdef MP_WATCHDOG
	u_int cpuid = PCPU_GET(cpuid);
#endif
	int retval;

#ifdef MP_WATCHDOG
	ap_watchdog(cpuid);
#endif

	retval = mask & hlt_cpus_mask;
	/*
	 * "sti; hlt" enables interrupts and halts; the CPU wakes on the
	 * next interrupt and re-checks whether it is still masked out.
	 */
	while (mask & hlt_cpus_mask)
		__asm __volatile("sti; hlt" : : : "memory");
	return (retval);
}