/*-
 * Copyright (c) 1996, by Steve Passe
 * Copyright (c) 2003, by Peter Wemm
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. The name of the developer may NOT be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/amd64/amd64/mp_machdep.c 211176 2010-08-11 10:51:27Z attilio $");

#include "opt_cpu.h"
#include "opt_kstack_pages.h"
#include "opt_mp_watchdog.h"
#include "opt_sched.h"
#include "opt_smp.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bus.h>
#ifdef GPROF
#include <sys/gmon.h>
#endif
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/memrange.h>
#include <sys/mutex.h>
#include <sys/pcpu.h>
#include <sys/proc.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/sysctl.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>

#include <machine/apicreg.h>
#include <machine/clock.h>
#include <machine/cputypes.h>
#include <machine/cpufunc.h>
#include <machine/mca.h>
#include <machine/md_var.h>
#include <machine/mp_watchdog.h>
#include <machine/pcb.h>
#include <machine/psl.h>
#include <machine/smp.h>
#include <machine/specialreg.h>
#include <machine/tss.h>

#define WARMBOOT_TARGET		0
#define WARMBOOT_OFF		(KERNBASE + 0x0467)
#define WARMBOOT_SEG		(KERNBASE + 0x0469)

#define CMOS_REG		(0x70)
#define CMOS_DATA		(0x71)
#define BIOS_RESET		(0x0f)
#define BIOS_WARM		(0x0a)

/* lock region used by kernel profiling */
int	mcount_lock;

int	mp_naps;		/* # of Application Processors */
int	boot_cpu_id = -1;	/* designated BSP */

extern	struct pcpu __pcpu[];

/* AP uses this during bootstrap.  Do not staticize.  */
char *bootSTK;
static int bootAP;

/* Free these after use */
void *bootstacks[MAXCPU];

/* Temporary variables for init_secondary() */
char *doublefault_stack;
char *nmi_stack;
void *dpcpu;

struct pcb stoppcbs[MAXCPU];
struct pcb **susppcbs = NULL;

/* Variables needed for SMP tlb shootdown. */
vm_offset_t smp_tlb_addr1;
vm_offset_t smp_tlb_addr2;
volatile int smp_tlb_wait;
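/*
 * These variables implement a simple rendezvous with the low-level IPI
 * handlers: the initiator publishes smp_tlb_addr1/addr2, zeroes
 * smp_tlb_wait, sends the invalidation vector, and then spins until
 * every target has incremented smp_tlb_wait (see smp_tlb_shootdown()
 * below).
 */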

#ifdef COUNT_IPIS
/* Interrupt counts. */
static u_long *ipi_preempt_counts[MAXCPU];
static u_long *ipi_ast_counts[MAXCPU];
u_long *ipi_invltlb_counts[MAXCPU];
u_long *ipi_invlrng_counts[MAXCPU];
u_long *ipi_invlpg_counts[MAXCPU];
u_long *ipi_invlcache_counts[MAXCPU];
u_long *ipi_rendezvous_counts[MAXCPU];
u_long *ipi_lazypmap_counts[MAXCPU];
static u_long *ipi_hardclock_counts[MAXCPU];
static u_long *ipi_statclock_counts[MAXCPU];
#endif

extern inthand_t IDTVEC(fast_syscall), IDTVEC(fast_syscall32);

/*
 * Local data and functions.
 */

static cpumask_t logical_cpus;
static volatile cpumask_t ipi_nmi_pending;

/* used to hold the APs until we are ready to release them */
static struct mtx ap_boot_mtx;

/* Set to 1 once we're ready to let the APs out of the pen. */
static volatile int aps_ready = 0;

/*
 * Store data from cpu_add() until later in the boot when we actually set up
 * the APs.
 */
struct cpu_info {
        int     cpu_present:1;
        int     cpu_bsp:1;
        int     cpu_disabled:1;
        int     cpu_hyperthread:1;
} static cpu_info[MAX_APIC_ID + 1];
int cpu_apic_ids[MAXCPU];
int apic_cpuids[MAX_APIC_ID + 1];

/* Holds pending bitmap based IPIs per CPU */
static volatile u_int cpu_ipi_pending[MAXCPU];

static u_int boot_address;
static int cpu_logical;
static int cpu_cores;

static void	assign_cpu_ids(void);
static void	set_interrupt_apic_ids(void);
static int	start_all_aps(void);
static int	start_ap(int apic_id);
static void	release_aps(void *dummy);

static cpumask_t	hlt_logical_cpus;
static cpumask_t	hyperthreading_cpus;
static cpumask_t	hyperthreading_cpus_mask;
static int	hyperthreading_allowed = 1;
static struct	sysctl_ctx_list logical_cpu_clist;
static u_int	bootMP_size;

static void
mem_range_AP_init(void)
{
        if (mem_range_softc.mr_op && mem_range_softc.mr_op->initAP)
                mem_range_softc.mr_op->initAP(&mem_range_softc);
}

static void
topo_probe_0xb(void)
{
        int logical;
        int p[4];
        int bits;
        int type;
        int cnt;
        int i;
        int x;

        /* We only support two levels for now. */
        for (i = 0; i < 3; i++) {
                cpuid_count(0x0B, i, p);
                bits = p[0] & 0x1f;
                logical = p[1] &= 0xffff;
                type = (p[2] >> 8) & 0xff;
                if (type == 0 || logical == 0)
                        break;
                for (cnt = 0, x = 0; x <= MAX_APIC_ID; x++) {
                        if (!cpu_info[x].cpu_present ||
                            cpu_info[x].cpu_disabled)
                                continue;
                        if (x >> bits == boot_cpu_id >> bits)
                                cnt++;
                }
                if (type == CPUID_TYPE_SMT)
                        cpu_logical = cnt;
                else if (type == CPUID_TYPE_CORE)
                        cpu_cores = cnt;
        }
        if (cpu_logical == 0)
                cpu_logical = 1;
        cpu_cores /= cpu_logical;
}
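/*
 * Fallback topology probe for CPUs without CPUID leaf 0xB: combine the
 * leaf 1 HTT logical-CPU count with either AMD's CMP core count or
 * Intel's "deterministic cache parameters" (leaf 4) core count to
 * split the logical CPUs into cores and threads.
 */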
static void
topo_probe_0x4(void)
{
        u_int threads_per_cache, p[4];
        u_int htt, cmp;
        int i;

        htt = cmp = 1;
        /*
         * If this CPU supports HTT or CMP then mention the
         * number of physical/logical cores it contains.
         */
        if (cpu_feature & CPUID_HTT)
                htt = (cpu_procinfo & CPUID_HTT_CORES) >> 16;
        if (cpu_vendor_id == CPU_VENDOR_AMD && (amd_feature2 & AMDID2_CMP))
                cmp = (cpu_procinfo2 & AMDID_CMP_CORES) + 1;
        else if (cpu_vendor_id == CPU_VENDOR_INTEL && (cpu_high >= 4)) {
                cpuid_count(4, 0, p);
                if ((p[0] & 0x1f) != 0)
                        cmp = ((p[0] >> 26) & 0x3f) + 1;
        }
        cpu_cores = cmp;
        cpu_logical = htt / cmp;

        /* Set up the initial logical CPUs info. */
        if (cpu_feature & CPUID_HTT)
                logical_cpus = (cpu_procinfo & CPUID_HTT_CORES) >> 16;

        /*
         * Work out if hyperthreading is *really* enabled.  This
         * is made really ugly by the fact that processors lie: Dual
         * core processors claim to be hyperthreaded even when they're
         * not, presumably because they want to be treated the same
         * way as HTT with respect to per-cpu software licensing.
         * At the time of writing (May 12, 2005) the only hyperthreaded
         * cpus are from Intel, and Intel's dual-core processors can be
         * identified via the "deterministic cache parameters" cpuid
         * calls.
         */
        /*
         * First determine if this is an Intel processor which claims
         * to have hyperthreading support.
         */
        if ((cpu_feature & CPUID_HTT) && cpu_vendor_id == CPU_VENDOR_INTEL) {
                /*
                 * If the "deterministic cache parameters" cpuid calls
                 * are available, use them.
                 */
                if (cpu_high >= 4) {
                        /* Ask the processor about the L1 cache. */
                        for (i = 0; i < 1; i++) {
                                cpuid_count(4, i, p);
                                threads_per_cache = ((p[0] & 0x3ffc000) >> 14) + 1;
                                if (hyperthreading_cpus < threads_per_cache)
                                        hyperthreading_cpus = threads_per_cache;
                                if ((p[0] & 0x1f) == 0)
                                        break;
                        }
                }

                /*
                 * If the deterministic cache parameters are not
                 * available, or if no caches were reported to exist,
                 * just accept what the HTT flag indicated.
                 */
                if (hyperthreading_cpus == 0)
                        hyperthreading_cpus = logical_cpus;
        }
}

static void
topo_probe(void)
{
        static int cpu_topo_probed = 0;

        if (cpu_topo_probed)
                return;

        logical_cpus = logical_cpus_mask = 0;
        if (cpu_high >= 0xb)
                topo_probe_0xb();
        else if (cpu_high)
                topo_probe_0x4();
        if (cpu_cores == 0)
                cpu_cores = mp_ncpus > 0 ? mp_ncpus : 1;
        if (cpu_logical == 0)
                cpu_logical = 1;
        cpu_topo_probed = 1;
}

struct cpu_group *
cpu_topo(void)
{
        int cg_flags;

        /*
         * Determine whether any threading flags are
         * necessary.
         */
        topo_probe();
        if (cpu_logical > 1 && hyperthreading_cpus)
                cg_flags = CG_FLAG_HTT;
        else if (cpu_logical > 1)
                cg_flags = CG_FLAG_SMT;
        else
                cg_flags = 0;
        if (mp_ncpus % (cpu_cores * cpu_logical) != 0) {
                printf("WARNING: Non-uniform processors.\n");
                printf("WARNING: Using suboptimal topology.\n");
                return (smp_topo_none());
        }
        /*
         * No multi-core or hyper-threaded.
         */
        if (cpu_logical * cpu_cores == 1)
                return (smp_topo_none());
        /*
         * Only HTT no multi-core.
         */
        if (cpu_logical > 1 && cpu_cores == 1)
                return (smp_topo_1level(CG_SHARE_L1, cpu_logical, cg_flags));
        /*
         * Only multi-core no HTT.
         */
        if (cpu_cores > 1 && cpu_logical == 1)
                return (smp_topo_1level(CG_SHARE_L2, cpu_cores, cg_flags));
        /*
         * Both HTT and multi-core.
         */
        return (smp_topo_2level(CG_SHARE_L2, cpu_cores,
            CG_SHARE_L1, cpu_logical, cg_flags));
}
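/*
 * The AP startup trampoline runs in real mode and therefore must live
 * below 1MB.  mp_bootaddress() carves it out of the top of base
 * memory, with the three bootstrap page-table pages (filled in by
 * start_all_aps()) placed immediately below the trampoline code
 * itself.
 */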
/*
 * Calculate usable address in base memory for AP trampoline code.
 */
u_int
mp_bootaddress(u_int basemem)
{

        bootMP_size = mptramp_end - mptramp_start;
        boot_address = trunc_page(basemem * 1024); /* round down to 4k boundary */
        if (((basemem * 1024) - boot_address) < bootMP_size)
                boot_address -= PAGE_SIZE;	/* not enough, lower by 4k */
        /* 3 levels of page table pages */
        mptramp_pagetables = boot_address - (PAGE_SIZE * 3);

        return mptramp_pagetables;
}

void
cpu_add(u_int apic_id, char boot_cpu)
{

        if (apic_id > MAX_APIC_ID) {
                panic("SMP: APIC ID %d too high", apic_id);
                return;
        }
        KASSERT(cpu_info[apic_id].cpu_present == 0, ("CPU %d added twice",
            apic_id));
        cpu_info[apic_id].cpu_present = 1;
        if (boot_cpu) {
                KASSERT(boot_cpu_id == -1,
                    ("CPU %d claims to be BSP, but CPU %d already is", apic_id,
                    boot_cpu_id));
                boot_cpu_id = apic_id;
                cpu_info[apic_id].cpu_bsp = 1;
        }
        if (mp_ncpus < MAXCPU) {
                mp_ncpus++;
                mp_maxid = mp_ncpus - 1;
        }
        if (bootverbose)
                printf("SMP: Added CPU %d (%s)\n", apic_id, boot_cpu ? "BSP" :
                    "AP");
}

void
cpu_mp_setmaxid(void)
{

        /*
         * mp_maxid should be already set by calls to cpu_add().
         * Just sanity check its value here.
         */
        if (mp_ncpus == 0)
                KASSERT(mp_maxid == 0,
                    ("%s: mp_ncpus is zero, but mp_maxid is not", __func__));
        else if (mp_ncpus == 1)
                mp_maxid = 0;
        else
                KASSERT(mp_maxid >= mp_ncpus - 1,
                    ("%s: counters out of sync: max %d, count %d", __func__,
                    mp_maxid, mp_ncpus));
}

int
cpu_mp_probe(void)
{

        /*
         * Always record BSP in CPU map so that the mbuf init code works
         * correctly.
         */
        all_cpus = 1;
        if (mp_ncpus == 0) {
                /*
                 * No CPUs were found, so this must be a UP system.  Setup
                 * the variables to represent a system with a single CPU
                 * with an id of 0.
                 */
                mp_ncpus = 1;
                return (0);
        }

        /* At least one CPU was found. */
        if (mp_ncpus == 1) {
                /*
                 * One CPU was found, so this must be a UP system with
                 * an I/O APIC.
                 */
                mp_maxid = 0;
                return (0);
        }

        /* At least two CPUs were found. */
        return (1);
}

/*
 * Initialize the IPI handlers and start up the APs.
 */
void
cpu_mp_start(void)
{
        int i;

        /* Initialize the logical ID to APIC ID table. */
        for (i = 0; i < MAXCPU; i++) {
                cpu_apic_ids[i] = -1;
                cpu_ipi_pending[i] = 0;
        }

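        /*
         * All handlers below are installed as interrupt gates
         * (SDT_SYSIGT), so interrupts are masked on entry, and with a
         * DPL of SEL_KPL they cannot be raised from user mode with an
         * int instruction.
         */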
        /* Install an inter-CPU IPI for TLB invalidation */
        setidt(IPI_INVLTLB, IDTVEC(invltlb), SDT_SYSIGT, SEL_KPL, 0);
        setidt(IPI_INVLPG, IDTVEC(invlpg), SDT_SYSIGT, SEL_KPL, 0);
        setidt(IPI_INVLRNG, IDTVEC(invlrng), SDT_SYSIGT, SEL_KPL, 0);

        /* Install an inter-CPU IPI for cache invalidation. */
        setidt(IPI_INVLCACHE, IDTVEC(invlcache), SDT_SYSIGT, SEL_KPL, 0);

        /* Install an inter-CPU IPI for all-CPU rendezvous */
        setidt(IPI_RENDEZVOUS, IDTVEC(rendezvous), SDT_SYSIGT, SEL_KPL, 0);

        /* Install generic inter-CPU IPI handler */
        setidt(IPI_BITMAP_VECTOR, IDTVEC(ipi_intr_bitmap_handler),
            SDT_SYSIGT, SEL_KPL, 0);

        /* Install an inter-CPU IPI for CPU stop/restart */
        setidt(IPI_STOP, IDTVEC(cpustop), SDT_SYSIGT, SEL_KPL, 0);

        /* Install an inter-CPU IPI for CPU suspend/resume */
        setidt(IPI_SUSPEND, IDTVEC(cpususpend), SDT_SYSIGT, SEL_KPL, 0);

        /* Set boot_cpu_id if needed. */
        if (boot_cpu_id == -1) {
                boot_cpu_id = PCPU_GET(apic_id);
                cpu_info[boot_cpu_id].cpu_bsp = 1;
        } else
                KASSERT(boot_cpu_id == PCPU_GET(apic_id),
                    ("BSP's APIC ID doesn't match boot_cpu_id"));

        /* Probe logical/physical core configuration. */
        topo_probe();

        assign_cpu_ids();

        /* Start each Application Processor */
        start_all_aps();

        set_interrupt_apic_ids();
}

/*
 * Print various information about the SMP system hardware and setup.
 */
void
cpu_mp_announce(void)
{
        const char *hyperthread;
        int i;

        printf("FreeBSD/SMP: %d package(s) x %d core(s)",
            mp_ncpus / (cpu_cores * cpu_logical), cpu_cores);
        if (hyperthreading_cpus > 1)
                printf(" x %d HTT threads", cpu_logical);
        else if (cpu_logical > 1)
                printf(" x %d SMT threads", cpu_logical);
        printf("\n");

        /* List active CPUs first. */
        printf(" cpu0 (BSP): APIC ID: %2d\n", boot_cpu_id);
        for (i = 1; i < mp_ncpus; i++) {
                if (cpu_info[cpu_apic_ids[i]].cpu_hyperthread)
                        hyperthread = "/HT";
                else
                        hyperthread = "";
                printf(" cpu%d (AP%s): APIC ID: %2d\n", i, hyperthread,
                    cpu_apic_ids[i]);
        }

        /* List disabled CPUs last. */
        for (i = 0; i <= MAX_APIC_ID; i++) {
                if (!cpu_info[i].cpu_present || !cpu_info[i].cpu_disabled)
                        continue;
                if (cpu_info[i].cpu_hyperthread)
                        hyperthread = "/HT";
                else
                        hyperthread = "";
                printf(" cpu (AP%s): APIC ID: %2d (disabled)\n", hyperthread,
                    i);
        }
}

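/*
 * Each AP enters C code here from the real-mode trampoline
 * (mptramp_start), which has already switched it to long mode on the
 * identity-mapped page tables built in start_all_aps() and loaded the
 * stack pointer from bootSTK, with bootAP naming this CPU.
 */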
/*
 * AP CPUs call this to initialize themselves.
 */
void
init_secondary(void)
{
        struct pcpu *pc;
        struct nmi_pcpu *np;
        u_int64_t msr, cr0;
        int cpu, gsel_tss, x;
        struct region_descriptor ap_gdt;

        /* Set by the startup code for us to use */
        cpu = bootAP;

        /* Init tss */
        common_tss[cpu] = common_tss[0];
        common_tss[cpu].tss_rsp0 = 0;   /* not used until after switch */
        common_tss[cpu].tss_iobase = sizeof(struct amd64tss) +
            IOPAGES * PAGE_SIZE;
        common_tss[cpu].tss_ist1 = (long)&doublefault_stack[PAGE_SIZE];

        /* The NMI stack runs on IST2. */
        np = ((struct nmi_pcpu *) &nmi_stack[PAGE_SIZE]) - 1;
        common_tss[cpu].tss_ist2 = (long) np;

        /* Prepare private GDT */
        gdt_segs[GPROC0_SEL].ssd_base = (long) &common_tss[cpu];
        for (x = 0; x < NGDT; x++) {
                if (x != GPROC0_SEL && x != (GPROC0_SEL + 1) &&
                    x != GUSERLDT_SEL && x != (GUSERLDT_SEL + 1))
                        ssdtosd(&gdt_segs[x], &gdt[NGDT * cpu + x]);
        }
        ssdtosyssd(&gdt_segs[GPROC0_SEL],
            (struct system_segment_descriptor *)&gdt[NGDT * cpu + GPROC0_SEL]);
        ap_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
        ap_gdt.rd_base = (long) &gdt[NGDT * cpu];
        lgdt(&ap_gdt);			/* does magic intra-segment return */

        /* Get per-cpu data */
        pc = &__pcpu[cpu];

        /* prime data page for it to use */
        pcpu_init(pc, cpu, sizeof(struct pcpu));
        dpcpu_init(dpcpu, cpu);
        pc->pc_apic_id = cpu_apic_ids[cpu];
        pc->pc_prvspace = pc;
        pc->pc_curthread = 0;
        pc->pc_tssp = &common_tss[cpu];
        pc->pc_commontssp = &common_tss[cpu];
        pc->pc_rsp0 = 0;
        pc->pc_tss = (struct system_segment_descriptor *)&gdt[NGDT * cpu +
            GPROC0_SEL];
        pc->pc_fs32p = &gdt[NGDT * cpu + GUFS32_SEL];
        pc->pc_gs32p = &gdt[NGDT * cpu + GUGS32_SEL];
        pc->pc_ldt = (struct system_segment_descriptor *)&gdt[NGDT * cpu +
            GUSERLDT_SEL];

        /* Save the per-cpu pointer for use by the NMI handler. */
        np->np_pcpu = (register_t) pc;

        wrmsr(MSR_FSBASE, 0);		/* User value */
        wrmsr(MSR_GSBASE, (u_int64_t)pc);
        wrmsr(MSR_KGSBASE, (u_int64_t)pc);	/* XXX User value while we're in the kernel */

        lidt(&r_idt);

        gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
        ltr(gsel_tss);

        /*
         * Set to a known state:
         * Set by mpboot.s: CR0_PG, CR0_PE
         * Set by cpu_setregs: CR0_NE, CR0_MP, CR0_TS, CR0_WP, CR0_AM
         */
        cr0 = rcr0();
        cr0 &= ~(CR0_CD | CR0_NW | CR0_EM);
        load_cr0(cr0);

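        /*
         * SYSCALL/SYSRET setup: EFER_SCE enables the instructions,
         * MSR_LSTAR and MSR_CSTAR hold the 64-bit and 32-bit entry
         * points, MSR_STAR packs the kernel CS selector for SYSCALL
         * into bits 47:32 and the SYSRET base selector into bits
         * 63:48, and MSR_SF_MASK lists the rflags bits to clear on
         * kernel entry.
         */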
        /* Set up the fast syscall stuff */
        msr = rdmsr(MSR_EFER) | EFER_SCE;
        wrmsr(MSR_EFER, msr);
        wrmsr(MSR_LSTAR, (u_int64_t)IDTVEC(fast_syscall));
        wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32));
        msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
            ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48);
        wrmsr(MSR_STAR, msr);
        wrmsr(MSR_SF_MASK, PSL_NT|PSL_T|PSL_I|PSL_C|PSL_D);

        /* Disable local APIC just to be sure. */
        lapic_disable();

        /* signal our startup to the BSP. */
        mp_naps++;

        /* Spin until the BSP releases the APs. */
        while (!aps_ready)
                ia32_pause();

        /* Initialize the PAT MSR. */
        pmap_init_pat();

        /* set up CPU registers and state */
        cpu_setregs();

        /* set up SSE/NX registers */
        initializecpu();

        /* set up FPU state on the AP */
        fpuinit();

        /* A quick check from sanity claus */
        if (PCPU_GET(apic_id) != lapic_id()) {
                printf("SMP: cpuid = %d\n", PCPU_GET(cpuid));
                printf("SMP: actual apic_id = %d\n", lapic_id());
                printf("SMP: correct apic_id = %d\n", PCPU_GET(apic_id));
                panic("cpuid mismatch! boom!!");
        }

        /* Initialize curthread. */
        KASSERT(PCPU_GET(idlethread) != NULL, ("no idle thread"));
        PCPU_SET(curthread, PCPU_GET(idlethread));

        mca_init();

        mtx_lock_spin(&ap_boot_mtx);

        /* Init local APIC for IRQs */
        lapic_setup(1);

        /* Set memory range attributes for this CPU to match the BSP */
        mem_range_AP_init();

        smp_cpus++;

        CTR1(KTR_SMP, "SMP: AP CPU #%d Launched", PCPU_GET(cpuid));
        printf("SMP: AP CPU #%d Launched!\n", PCPU_GET(cpuid));

        /* Determine if we are a logical CPU. */
        if (logical_cpus > 1 && PCPU_GET(apic_id) % logical_cpus != 0)
                logical_cpus_mask |= PCPU_GET(cpumask);

        /* Determine if we are a hyperthread. */
        if (hyperthreading_cpus > 1 &&
            PCPU_GET(apic_id) % hyperthreading_cpus != 0)
                hyperthreading_cpus_mask |= PCPU_GET(cpumask);

        /* Build our map of 'other' CPUs. */
        PCPU_SET(other_cpus, all_cpus & ~PCPU_GET(cpumask));

        if (bootverbose)
                lapic_dump("AP");

        if (smp_cpus == mp_ncpus) {
                /* enable IPIs, tlb shootdown, freezes etc */
                atomic_store_rel_int(&smp_started, 1);
                smp_active = 1;	 /* historic */
        }

        /*
         * Enable global pages TLB extension
         * This also implicitly flushes the TLB
         */

        load_cr4(rcr4() | CR4_PGE);
        load_ds(_udatasel);
        load_es(_udatasel);
        load_fs(_ufssel);
        mtx_unlock_spin(&ap_boot_mtx);

        /* Wait until all the APs are up. */
        while (smp_started == 0)
                ia32_pause();

        /* Start per-CPU event timers. */
        cpu_initclocks_ap();

        sched_throw(NULL);

        panic("scheduler returned us to %s", __func__);
        /* NOTREACHED */
}

/*******************************************************************
 * local functions and data
 */

/*
 * We tell the I/O APIC code about all the CPUs we want to receive
 * interrupts.  If we don't want certain CPUs to receive IRQs we
 * can simply not tell the I/O APIC code about them in this function.
 * We also do not tell it about the BSP since it tells itself about
 * the BSP internally to work with UP kernels and on UP machines.
 */
static void
set_interrupt_apic_ids(void)
{
        u_int i, apic_id;

        for (i = 0; i < MAXCPU; i++) {
                apic_id = cpu_apic_ids[i];
                if (apic_id == -1)
                        continue;
                if (cpu_info[apic_id].cpu_bsp)
                        continue;
                if (cpu_info[apic_id].cpu_disabled)
                        continue;

                /* Don't let hyperthreads service interrupts. */
                if (hyperthreading_cpus > 1 &&
                    apic_id % hyperthreading_cpus != 0)
                        continue;

                intr_add_cpu(i);
        }
}

/*
 * Assign logical CPU IDs to local APICs.
 */
static void
assign_cpu_ids(void)
{
        u_int i;

        TUNABLE_INT_FETCH("machdep.hyperthreading_allowed",
            &hyperthreading_allowed);

        /* Check for explicitly disabled CPUs. */
        for (i = 0; i <= MAX_APIC_ID; i++) {
                if (!cpu_info[i].cpu_present || cpu_info[i].cpu_bsp)
                        continue;

                if (hyperthreading_cpus > 1 && i % hyperthreading_cpus != 0) {
                        cpu_info[i].cpu_hyperthread = 1;
#if defined(SCHED_ULE)
                        /*
                         * Don't use HT CPU if it has been disabled by a
                         * tunable.
                         */
                        if (hyperthreading_allowed == 0) {
                                cpu_info[i].cpu_disabled = 1;
                                continue;
                        }
#endif
                }

                /* Don't use this CPU if it has been disabled by a tunable. */
                if (resource_disabled("lapic", i)) {
                        cpu_info[i].cpu_disabled = 1;
                        continue;
                }
        }

        /*
         * Assign CPU IDs to local APIC IDs and disable any CPUs
         * beyond MAXCPU.  CPU 0 is always assigned to the BSP.
         *
         * To minimize confusion for userland, we attempt to number
         * CPUs such that all threads and cores in a package are
         * grouped together.  For now we assume that the BSP is always
         * the first thread in a package and just start adding APs
         * starting with the BSP's APIC ID.
         */
        mp_ncpus = 1;
        cpu_apic_ids[0] = boot_cpu_id;
        apic_cpuids[boot_cpu_id] = 0;
        for (i = boot_cpu_id + 1; i != boot_cpu_id;
            i == MAX_APIC_ID ? i = 0 : i++) {
                if (!cpu_info[i].cpu_present || cpu_info[i].cpu_bsp ||
                    cpu_info[i].cpu_disabled)
                        continue;

                if (mp_ncpus < MAXCPU) {
                        cpu_apic_ids[mp_ncpus] = i;
                        apic_cpuids[i] = mp_ncpus;
                        mp_ncpus++;
                } else
                        cpu_info[i].cpu_disabled = 1;
        }
        KASSERT(mp_maxid >= mp_ncpus - 1,
            ("%s: counters out of sync: max %d, count %d", __func__, mp_maxid,
            mp_ncpus));
}

/*
 * start each AP in our list
 */
static int
start_all_aps(void)
{
        vm_offset_t va = boot_address + KERNBASE;
        u_int64_t *pt4, *pt3, *pt2;
        u_int32_t mpbioswarmvec;
        int apic_id, cpu, i;
        u_char mpbiosreason;

        mtx_init(&ap_boot_mtx, "ap boot", NULL, MTX_SPIN);

        /* install the AP 1st level boot code */
        pmap_kenter(va, boot_address);
        pmap_invalidate_page(kernel_pmap, va);
        bcopy(mptramp_start, (void *)va, bootMP_size);

        /* Locate the page tables, they'll be below the trampoline */
        pt4 = (u_int64_t *)(uintptr_t)(mptramp_pagetables + KERNBASE);
        pt3 = pt4 + (PAGE_SIZE) / sizeof(u_int64_t);
        pt2 = pt3 + (PAGE_SIZE) / sizeof(u_int64_t);

        /* Create the initial 1GB replicated page tables */
        for (i = 0; i < 512; i++) {
                /* Each slot of the level 4 pages points to the same level 3 page */
                pt4[i] = (u_int64_t)(uintptr_t)(mptramp_pagetables + PAGE_SIZE);
                pt4[i] |= PG_V | PG_RW | PG_U;

                /* Each slot of the level 3 pages points to the same level 2 page */
                pt3[i] = (u_int64_t)(uintptr_t)(mptramp_pagetables + (2 * PAGE_SIZE));
                pt3[i] |= PG_V | PG_RW | PG_U;

                /* The level 2 page slots are mapped with 2MB pages for 1GB. */
                pt2[i] = i * (2 * 1024 * 1024);
                pt2[i] |= PG_V | PG_RW | PG_PS | PG_U;
        }

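        /*
         * Program the BIOS warm-boot machinery: setting the CMOS
         * shutdown status byte (0x0F) to BIOS_WARM makes a CPU that
         * comes out of INIT jump through the real-mode vector at
         * 40:67, which is pointed at our trampoline below.
         */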
        /* save the current value of the warm-start vector */
        mpbioswarmvec = *((u_int32_t *) WARMBOOT_OFF);
        outb(CMOS_REG, BIOS_RESET);
        mpbiosreason = inb(CMOS_DATA);

        /* set up a vector to our boot code */
        *((volatile u_short *) WARMBOOT_OFF) = WARMBOOT_TARGET;
        *((volatile u_short *) WARMBOOT_SEG) = (boot_address >> 4);
        outb(CMOS_REG, BIOS_RESET);
        outb(CMOS_DATA, BIOS_WARM);	/* 'warm-start' */

        /* start each AP */
        for (cpu = 1; cpu < mp_ncpus; cpu++) {
                apic_id = cpu_apic_ids[cpu];

                /* allocate and set up an idle stack data page */
                bootstacks[cpu] = (void *)kmem_alloc(kernel_map, KSTACK_PAGES * PAGE_SIZE);
                doublefault_stack = (char *)kmem_alloc(kernel_map, PAGE_SIZE);
                nmi_stack = (char *)kmem_alloc(kernel_map, PAGE_SIZE);
                dpcpu = (void *)kmem_alloc(kernel_map, DPCPU_SIZE);

                bootSTK = (char *)bootstacks[cpu] + KSTACK_PAGES * PAGE_SIZE - 8;
                bootAP = cpu;

                /* attempt to start the Application Processor */
                if (!start_ap(apic_id)) {
                        /* restore the warmstart vector */
                        *(u_int32_t *) WARMBOOT_OFF = mpbioswarmvec;
                        panic("AP #%d (PHY# %d) failed!", cpu, apic_id);
                }

                all_cpus |= (1 << cpu);	/* record AP in CPU map */
        }

        /* build our map of 'other' CPUs */
        PCPU_SET(other_cpus, all_cpus & ~PCPU_GET(cpumask));

        /* restore the warmstart vector */
        *(u_int32_t *) WARMBOOT_OFF = mpbioswarmvec;

        outb(CMOS_REG, BIOS_RESET);
        outb(CMOS_DATA, mpbiosreason);

        /* number of APs actually started */
        return mp_naps;
}

/*
 * This function starts the AP (application processor) identified by the
 * APIC ID 'apic_id'.  It does quite a "song and dance" to accomplish this.
 * This is necessary because of the nuances of the different hardware we
 * might encounter.  It isn't pretty, but it seems to work.
 */
static int
start_ap(int apic_id)
{
        int vector, ms;
        int cpus;

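        /*
         * The STARTUP IPI carries an 8-bit vector that the target CPU
         * multiplies by 4096 to form its real-mode start address,
         * which is why the trampoline must be page-aligned and below
         * 1MB.
         */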
        /* calculate the vector */
        vector = (boot_address >> 12) & 0xff;

        /* used as a watchpoint to signal AP startup */
        cpus = mp_naps;

        /*
         * first we do an INIT IPI: this INIT IPI might be run, resetting
         * and running the target CPU.  OR this INIT IPI might be latched (P5
         * bug), CPU waiting for STARTUP IPI.  OR this INIT IPI might be
         * ignored.
         */

        /* do an INIT IPI: assert RESET */
        lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
            APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, apic_id);

        /* wait for pending status end */
        lapic_ipi_wait(-1);

        /* do an INIT IPI: deassert RESET */
        lapic_ipi_raw(APIC_DEST_ALLESELF | APIC_TRIGMOD_LEVEL |
            APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, 0);

        /* wait for pending status end */
        DELAY(10000);		/* wait ~10 ms */
        lapic_ipi_wait(-1);

        /*
         * next we do a STARTUP IPI: the previous INIT IPI might still be
         * latched (P5 bug), this 1st STARTUP would then terminate
         * immediately, and the previously started INIT IPI would continue.  OR
         * the previous INIT IPI has already run, and this STARTUP IPI will
         * run.  OR the previous INIT IPI was ignored, and this STARTUP IPI
         * will run.
         */

        /* do a STARTUP IPI */
        lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
            APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP |
            vector, apic_id);
        lapic_ipi_wait(-1);
        DELAY(200);		/* wait ~200 us */

        /*
         * finally we do a 2nd STARTUP IPI: this 2nd STARTUP IPI should run IF
         * the previous STARTUP IPI was cancelled by a latched INIT IPI.  OR
         * this STARTUP IPI will be ignored, as only ONE STARTUP IPI is
         * recognized after hardware RESET or INIT IPI.
         */

        lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
            APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP |
            vector, apic_id);
        lapic_ipi_wait(-1);
        DELAY(200);		/* wait ~200 us */

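        /*
         * The AP announces itself by bumping mp_naps early in
         * init_secondary(), so all we can do here is poll for that.
         */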
        /* Wait up to 5 seconds for it to start. */
        for (ms = 0; ms < 5000; ms++) {
                if (mp_naps > cpus)
                        return 1;	/* return SUCCESS */
                DELAY(1000);
        }
        return 0;		/* return FAILURE */
}

#ifdef COUNT_XINVLTLB_HITS
u_int xhits_gbl[MAXCPU];
u_int xhits_pg[MAXCPU];
u_int xhits_rng[MAXCPU];
SYSCTL_NODE(_debug, OID_AUTO, xhits, CTLFLAG_RW, 0, "");
SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, global, CTLFLAG_RW, &xhits_gbl,
    sizeof(xhits_gbl), "IU", "");
SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, page, CTLFLAG_RW, &xhits_pg,
    sizeof(xhits_pg), "IU", "");
SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, range, CTLFLAG_RW, &xhits_rng,
    sizeof(xhits_rng), "IU", "");

u_int ipi_global;
u_int ipi_page;
u_int ipi_range;
u_int ipi_range_size;
SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_global, CTLFLAG_RW, &ipi_global, 0, "");
SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_page, CTLFLAG_RW, &ipi_page, 0, "");
SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range, CTLFLAG_RW, &ipi_range, 0, "");
SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range_size, CTLFLAG_RW, &ipi_range_size,
    0, "");

u_int ipi_masked_global;
u_int ipi_masked_page;
u_int ipi_masked_range;
u_int ipi_masked_range_size;
SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_global, CTLFLAG_RW,
    &ipi_masked_global, 0, "");
SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_page, CTLFLAG_RW,
    &ipi_masked_page, 0, "");
SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_range, CTLFLAG_RW,
    &ipi_masked_range, 0, "");
SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_range_size, CTLFLAG_RW,
    &ipi_masked_range_size, 0, "");
#endif /* COUNT_XINVLTLB_HITS */

/*
 * Flush the TLB on all other CPUs.
 */
static void
smp_tlb_shootdown(u_int vector, vm_offset_t addr1, vm_offset_t addr2)
{
        u_int ncpu;

        ncpu = mp_ncpus - 1;	/* does not shootdown self */
        if (ncpu < 1)
                return;		/* no other cpus */
        if (!(read_rflags() & PSL_I))
                panic("%s: interrupts disabled", __func__);
        mtx_lock_spin(&smp_ipi_mtx);
        smp_tlb_addr1 = addr1;
        smp_tlb_addr2 = addr2;
        atomic_store_rel_int(&smp_tlb_wait, 0);
        ipi_all_but_self(vector);
        while (smp_tlb_wait < ncpu)
                ia32_pause();
        mtx_unlock_spin(&smp_ipi_mtx);
}

static void
smp_targeted_tlb_shootdown(cpumask_t mask, u_int vector, vm_offset_t addr1,
    vm_offset_t addr2)
{
        int ncpu, othercpus;

        othercpus = mp_ncpus - 1;
        if (mask == (u_int)-1) {
                ncpu = othercpus;
                if (ncpu < 1)
                        return;
        } else {
                mask &= ~PCPU_GET(cpumask);
                if (mask == 0)
                        return;
                ncpu = bitcount32(mask);
                if (ncpu > othercpus) {
                        /* XXX this should be a panic offence */
                        printf("SMP: tlb shootdown to %d other cpus (only have %d)\n",
                            ncpu, othercpus);
                        ncpu = othercpus;
                }
                /* XXX should be a panic, implied by mask == 0 above */
                if (ncpu < 1)
                        return;
        }
        if (!(read_rflags() & PSL_I))
                panic("%s: interrupts disabled", __func__);
        mtx_lock_spin(&smp_ipi_mtx);
        smp_tlb_addr1 = addr1;
        smp_tlb_addr2 = addr2;
        atomic_store_rel_int(&smp_tlb_wait, 0);
        if (mask == (u_int)-1)
                ipi_all_but_self(vector);
        else
                ipi_selected(mask, vector);
        while (smp_tlb_wait < ncpu)
                ia32_pause();
        mtx_unlock_spin(&smp_ipi_mtx);
}

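/*
 * Bitmapped IPIs (preempt, AST, hardclock, statclock) share the single
 * IPI_BITMAP_VECTOR: a request is recorded in the target's
 * cpu_ipi_pending word, and only the first setter sends the actual
 * vector, so back-to-back requests piggyback on one interrupt.
 */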
/*
 * Send an IPI to the specified CPU, handling the bitmap logic.
 */
static void
ipi_send_cpu(int cpu, u_int ipi)
{
        u_int bitmap, old_pending, new_pending;

        KASSERT(cpu_apic_ids[cpu] != -1, ("IPI to non-existent CPU %d", cpu));

        if (IPI_IS_BITMAPED(ipi)) {
                bitmap = 1 << ipi;
                ipi = IPI_BITMAP_VECTOR;
                do {
                        old_pending = cpu_ipi_pending[cpu];
                        new_pending = old_pending | bitmap;
                } while (!atomic_cmpset_int(&cpu_ipi_pending[cpu],
                    old_pending, new_pending));
                if (old_pending)
                        return;
        }
        lapic_ipi_vectored(ipi, cpu_apic_ids[cpu]);
}

void
smp_cache_flush(void)
{

        if (smp_started)
                smp_tlb_shootdown(IPI_INVLCACHE, 0, 0);
}

void
smp_invltlb(void)
{

        if (smp_started) {
                smp_tlb_shootdown(IPI_INVLTLB, 0, 0);
#ifdef COUNT_XINVLTLB_HITS
                ipi_global++;
#endif
        }
}

void
smp_invlpg(vm_offset_t addr)
{

        if (smp_started) {
                smp_tlb_shootdown(IPI_INVLPG, addr, 0);
#ifdef COUNT_XINVLTLB_HITS
                ipi_page++;
#endif
        }
}

void
smp_invlpg_range(vm_offset_t addr1, vm_offset_t addr2)
{

        if (smp_started) {
                smp_tlb_shootdown(IPI_INVLRNG, addr1, addr2);
#ifdef COUNT_XINVLTLB_HITS
                ipi_range++;
                ipi_range_size += (addr2 - addr1) / PAGE_SIZE;
#endif
        }
}

void
smp_masked_invltlb(cpumask_t mask)
{

        if (smp_started) {
                smp_targeted_tlb_shootdown(mask, IPI_INVLTLB, 0, 0);
#ifdef COUNT_XINVLTLB_HITS
                ipi_masked_global++;
#endif
        }
}

void
smp_masked_invlpg(cpumask_t mask, vm_offset_t addr)
{

        if (smp_started) {
                smp_targeted_tlb_shootdown(mask, IPI_INVLPG, addr, 0);
#ifdef COUNT_XINVLTLB_HITS
                ipi_masked_page++;
#endif
        }
}

void
smp_masked_invlpg_range(cpumask_t mask, vm_offset_t addr1, vm_offset_t addr2)
{

        if (smp_started) {
                smp_targeted_tlb_shootdown(mask, IPI_INVLRNG, addr1, addr2);
#ifdef COUNT_XINVLTLB_HITS
                ipi_masked_range++;
                ipi_masked_range_size += (addr2 - addr1) / PAGE_SIZE;
#endif
        }
}

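/*
 * Drain this CPU's pending-IPI bitmap: atomically read and clear it,
 * then run the handler for each bit that was set.
 */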
void
ipi_bitmap_handler(struct trapframe frame)
{
        int cpu = PCPU_GET(cpuid);
        u_int ipi_bitmap;

        ipi_bitmap = atomic_readandclear_int(&cpu_ipi_pending[cpu]);

        if (ipi_bitmap & (1 << IPI_PREEMPT)) {
#ifdef COUNT_IPIS
                (*ipi_preempt_counts[cpu])++;
#endif
                sched_preempt(curthread);
        }
        if (ipi_bitmap & (1 << IPI_AST)) {
#ifdef COUNT_IPIS
                (*ipi_ast_counts[cpu])++;
#endif
                /* Nothing to do for AST */
        }
        if (ipi_bitmap & (1 << IPI_HARDCLOCK)) {
#ifdef COUNT_IPIS
                (*ipi_hardclock_counts[cpu])++;
#endif
                hardclockintr(&frame);
        }
        if (ipi_bitmap & (1 << IPI_STATCLOCK)) {
#ifdef COUNT_IPIS
                (*ipi_statclock_counts[cpu])++;
#endif
                statclockintr(&frame);
        }
}

/*
 * Send an IPI to a set of CPUs.
 */
void
ipi_selected(cpumask_t cpus, u_int ipi)
{
        int cpu;

        /*
         * IPI_STOP_HARD maps to an NMI and the trap handler needs a bit
         * of help in order to understand what the source is.
         * Set the mask of receiving CPUs for this purpose.
         */
        if (ipi == IPI_STOP_HARD)
                atomic_set_int(&ipi_nmi_pending, cpus);

        CTR3(KTR_SMP, "%s: cpus: %x ipi: %x", __func__, cpus, ipi);
        while ((cpu = ffs(cpus)) != 0) {
                cpu--;
                cpus &= ~(1 << cpu);
                ipi_send_cpu(cpu, ipi);
        }
}

/*
 * Send an IPI to a specific CPU.
 */
void
ipi_cpu(int cpu, u_int ipi)
{

        /*
         * IPI_STOP_HARD maps to an NMI and the trap handler needs a bit
         * of help in order to understand what the source is.
         * Set the mask of receiving CPUs for this purpose.
         */
        if (ipi == IPI_STOP_HARD)
                atomic_set_int(&ipi_nmi_pending, 1 << cpu);

        CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi);
        ipi_send_cpu(cpu, ipi);
}

/*
 * Send an IPI to all CPUs EXCEPT myself.
 */
void
ipi_all_but_self(u_int ipi)
{

        if (IPI_IS_BITMAPED(ipi)) {
                ipi_selected(PCPU_GET(other_cpus), ipi);
                return;
        }

        /*
         * IPI_STOP_HARD maps to an NMI and the trap handler needs a bit
         * of help in order to understand what the source is.
         * Set the mask of receiving CPUs for this purpose.
         */
        if (ipi == IPI_STOP_HARD)
                atomic_set_int(&ipi_nmi_pending, PCPU_GET(other_cpus));

        CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi);
        lapic_ipi_vectored(ipi, APIC_IPI_DEST_OTHERS);
}

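/*
 * Called from the NMI trap handler.  Returns 0 once the NMI has been
 * recognized as an IPI_STOP_HARD and handled here, and 1 when it
 * belongs to someone else and normal NMI processing should continue.
 */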
int
ipi_nmi_handler()
{
        cpumask_t cpumask;

        /*
         * As long as there is not a simple way to know about an NMI's
         * source, if the bitmask for the current CPU is present in
         * the global pending bitword an IPI_STOP_HARD has been issued
         * and should be handled.
         */
        cpumask = PCPU_GET(cpumask);
        if ((ipi_nmi_pending & cpumask) == 0)
                return (1);

        atomic_clear_int(&ipi_nmi_pending, cpumask);
        cpustop_handler();
        return (0);
}

/*
 * Handle an IPI_STOP by saving our current context and spinning until we
 * are resumed.
 */
void
cpustop_handler(void)
{
        cpumask_t cpumask;
        u_int cpu;

        sched_pin();
        cpu = PCPU_GET(cpuid);
        cpumask = PCPU_GET(cpumask);
        sched_unpin();

        savectx(&stoppcbs[cpu]);

        /* Indicate that we are stopped */
        atomic_set_int(&stopped_cpus, cpumask);

        /* Wait for restart */
        while (!(started_cpus & cpumask))
                ia32_pause();

        atomic_clear_int(&started_cpus, cpumask);
        atomic_clear_int(&stopped_cpus, cpumask);

        if (cpu == 0 && cpustop_restartfunc != NULL) {
                cpustop_restartfunc();
                cpustop_restartfunc = NULL;
        }
}

/*
 * Handle an IPI_SUSPEND by saving our current context and spinning until we
 * are resumed.
 */
void
cpususpend_handler(void)
{
        cpumask_t cpumask;
        register_t cr3, rf;
        u_int cpu;

        sched_pin();
        cpu = PCPU_GET(cpuid);
        cpumask = PCPU_GET(cpumask);
        sched_unpin();

        rf = intr_disable();
        cr3 = rcr3();

        if (savectx(susppcbs[cpu])) {
                wbinvd();
                atomic_set_int(&stopped_cpus, cpumask);
        }

        /* Wait for resume */
        while (!(started_cpus & cpumask))
                ia32_pause();

        atomic_clear_int(&started_cpus, cpumask);
        atomic_clear_int(&stopped_cpus, cpumask);

        /* Restore CR3 and enable interrupts */
        load_cr3(cr3);
        mca_resume();
        lapic_setup(0);
        intr_restore(rf);
}

/*
 * This is called once the rest of the system is up and running and we're
 * ready to let the APs out of the pen.
 */
static void
release_aps(void *dummy __unused)
{

        if (mp_ncpus == 1)
                return;
        atomic_store_rel_int(&aps_ready, 1);
        while (smp_started == 0)
                ia32_pause();
}
SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, release_aps, NULL);

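/*
 * The sysctl handlers below all manipulate hlt_cpus_mask, the set of
 * CPUs parked in hlt by the idle code (see mp_grab_cpu_hlt()).  Each
 * one clears bit 0 whenever the mask would otherwise cover every CPU,
 * so at least one CPU always stays runnable.
 */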
static int
sysctl_hlt_cpus(SYSCTL_HANDLER_ARGS)
{
        cpumask_t mask;
        int error;

        mask = hlt_cpus_mask;
        error = sysctl_handle_int(oidp, &mask, 0, req);
        if (error || !req->newptr)
                return (error);

        if (logical_cpus_mask != 0 &&
            (mask & logical_cpus_mask) == logical_cpus_mask)
                hlt_logical_cpus = 1;
        else
                hlt_logical_cpus = 0;

        if (! hyperthreading_allowed)
                mask |= hyperthreading_cpus_mask;

        if ((mask & all_cpus) == all_cpus)
                mask &= ~(1<<0);
        hlt_cpus_mask = mask;
        return (error);
}
SYSCTL_PROC(_machdep, OID_AUTO, hlt_cpus, CTLTYPE_INT|CTLFLAG_RW,
    0, 0, sysctl_hlt_cpus, "IU",
    "Bitmap of CPUs to halt.  101 (binary) will halt CPUs 0 and 2.");

static int
sysctl_hlt_logical_cpus(SYSCTL_HANDLER_ARGS)
{
        int disable, error;

        disable = hlt_logical_cpus;
        error = sysctl_handle_int(oidp, &disable, 0, req);
        if (error || !req->newptr)
                return (error);

        if (disable)
                hlt_cpus_mask |= logical_cpus_mask;
        else
                hlt_cpus_mask &= ~logical_cpus_mask;

        if (! hyperthreading_allowed)
                hlt_cpus_mask |= hyperthreading_cpus_mask;

        if ((hlt_cpus_mask & all_cpus) == all_cpus)
                hlt_cpus_mask &= ~(1<<0);

        hlt_logical_cpus = disable;
        return (error);
}

static int
sysctl_hyperthreading_allowed(SYSCTL_HANDLER_ARGS)
{
        int allowed, error;

        allowed = hyperthreading_allowed;
        error = sysctl_handle_int(oidp, &allowed, 0, req);
        if (error || !req->newptr)
                return (error);

#ifdef SCHED_ULE
        /*
         * SCHED_ULE doesn't allow enabling/disabling HT cores at
         * run-time.
         */
        if (allowed != hyperthreading_allowed)
                return (ENOTSUP);
        return (error);
#endif

        if (allowed)
                hlt_cpus_mask &= ~hyperthreading_cpus_mask;
        else
                hlt_cpus_mask |= hyperthreading_cpus_mask;

        if (logical_cpus_mask != 0 &&
            (hlt_cpus_mask & logical_cpus_mask) == logical_cpus_mask)
                hlt_logical_cpus = 1;
        else
                hlt_logical_cpus = 0;

        if ((hlt_cpus_mask & all_cpus) == all_cpus)
                hlt_cpus_mask &= ~(1<<0);

        hyperthreading_allowed = allowed;
        return (error);
}

static void
cpu_hlt_setup(void *dummy __unused)
{

        if (logical_cpus_mask != 0) {
                TUNABLE_INT_FETCH("machdep.hlt_logical_cpus",
                    &hlt_logical_cpus);
                sysctl_ctx_init(&logical_cpu_clist);
                SYSCTL_ADD_PROC(&logical_cpu_clist,
                    SYSCTL_STATIC_CHILDREN(_machdep), OID_AUTO,
                    "hlt_logical_cpus", CTLTYPE_INT|CTLFLAG_RW, 0, 0,
                    sysctl_hlt_logical_cpus, "IU", "");
                SYSCTL_ADD_UINT(&logical_cpu_clist,
                    SYSCTL_STATIC_CHILDREN(_machdep), OID_AUTO,
                    "logical_cpus_mask", CTLTYPE_INT|CTLFLAG_RD,
                    &logical_cpus_mask, 0, "");

                if (hlt_logical_cpus)
                        hlt_cpus_mask |= logical_cpus_mask;

                /*
                 * If necessary for security purposes, force
                 * hyperthreading off, regardless of the value
                 * of hlt_logical_cpus.
                 */
                if (hyperthreading_cpus_mask) {
                        SYSCTL_ADD_PROC(&logical_cpu_clist,
                            SYSCTL_STATIC_CHILDREN(_machdep), OID_AUTO,
                            "hyperthreading_allowed", CTLTYPE_INT|CTLFLAG_RW,
                            0, 0, sysctl_hyperthreading_allowed, "IU", "");
                        if (! hyperthreading_allowed)
                                hlt_cpus_mask |= hyperthreading_cpus_mask;
                }
        }
}
SYSINIT(cpu_hlt, SI_SUB_SMP, SI_ORDER_ANY, cpu_hlt_setup, NULL);

int
mp_grab_cpu_hlt(void)
{
        cpumask_t mask;
#ifdef MP_WATCHDOG
        u_int cpuid;
#endif
        int retval;

#ifdef MP_WATCHDOG
        sched_pin();
        mask = PCPU_GET(cpumask);
        cpuid = PCPU_GET(cpuid);
        sched_unpin();
        ap_watchdog(cpuid);
#else
        mask = PCPU_GET(cpumask);
#endif

        retval = mask & hlt_cpus_mask;
        while (mask & hlt_cpus_mask)
                __asm __volatile("sti; hlt" : : : "memory");
        return (retval);
}

#ifdef COUNT_IPIS
/*
 * Set up interrupt counters for IPI handlers.
 */
static void
mp_ipi_intrcnt(void *dummy)
{
        char buf[64];
        int i;

        CPU_FOREACH(i) {
                snprintf(buf, sizeof(buf), "cpu%d:invltlb", i);
                intrcnt_add(buf, &ipi_invltlb_counts[i]);
                snprintf(buf, sizeof(buf), "cpu%d:invlrng", i);
                intrcnt_add(buf, &ipi_invlrng_counts[i]);
                snprintf(buf, sizeof(buf), "cpu%d:invlpg", i);
                intrcnt_add(buf, &ipi_invlpg_counts[i]);
                snprintf(buf, sizeof(buf), "cpu%d:preempt", i);
                intrcnt_add(buf, &ipi_preempt_counts[i]);
                snprintf(buf, sizeof(buf), "cpu%d:ast", i);
                intrcnt_add(buf, &ipi_ast_counts[i]);
                snprintf(buf, sizeof(buf), "cpu%d:rendezvous", i);
                intrcnt_add(buf, &ipi_rendezvous_counts[i]);
                snprintf(buf, sizeof(buf), "cpu%d:lazypmap", i);
                intrcnt_add(buf, &ipi_lazypmap_counts[i]);
                snprintf(buf, sizeof(buf), "cpu%d:hardclock", i);
                intrcnt_add(buf, &ipi_hardclock_counts[i]);
                snprintf(buf, sizeof(buf), "cpu%d:statclock", i);
                intrcnt_add(buf, &ipi_statclock_counts[i]);
        }
}
SYSINIT(mp_ipi_intrcnt, SI_SUB_INTR, SI_ORDER_MIDDLE, mp_ipi_intrcnt, NULL);
#endif