/* mp_x86.c revision 189023 */
1/*- 2 * Copyright (c) 1996, by Steve Passe 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. The name of the developer may NOT be used to endorse or promote products 11 * derived from this software without specific prior written permission. 12 * 13 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 14 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 16 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 17 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 18 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 19 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 20 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 21 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 22 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 23 * SUCH DAMAGE. 24 */ 25 26#include <sys/cdefs.h> 27__FBSDID("$FreeBSD: head/sys/i386/i386/mp_machdep.c 189023 2009-02-25 01:49:01Z sobomax $"); 28 29#include "opt_apic.h" 30#include "opt_cpu.h" 31#include "opt_kstack_pages.h" 32#include "opt_mp_watchdog.h" 33#include "opt_sched.h" 34#include "opt_smp.h" 35 36#if !defined(lint) 37#if !defined(SMP) 38#error How did you get here? 39#endif 40 41#ifndef DEV_APIC 42#error The apic device is required for SMP, add "device apic" to your config file. 
43#endif 44#if defined(CPU_DISABLE_CMPXCHG) && !defined(COMPILING_LINT) 45#error SMP not supported with CPU_DISABLE_CMPXCHG 46#endif 47#endif /* not lint */ 48 49#include <sys/param.h> 50#include <sys/systm.h> 51#include <sys/bus.h> 52#include <sys/cons.h> /* cngetc() */ 53#ifdef GPROF 54#include <sys/gmon.h> 55#endif 56#include <sys/kernel.h> 57#include <sys/ktr.h> 58#include <sys/lock.h> 59#include <sys/malloc.h> 60#include <sys/memrange.h> 61#include <sys/mutex.h> 62#include <sys/pcpu.h> 63#include <sys/proc.h> 64#include <sys/sched.h> 65#include <sys/smp.h> 66#include <sys/sysctl.h> 67 68#include <vm/vm.h> 69#include <vm/vm_param.h> 70#include <vm/pmap.h> 71#include <vm/vm_kern.h> 72#include <vm/vm_extern.h> 73 74#include <machine/apicreg.h> 75#include <machine/cputypes.h> 76#include <machine/md_var.h> 77#include <machine/mp_watchdog.h> 78#include <machine/pcb.h> 79#include <machine/psl.h> 80#include <machine/smp.h> 81#include <machine/specialreg.h> 82 83#define WARMBOOT_TARGET 0 84#define WARMBOOT_OFF (KERNBASE + 0x0467) 85#define WARMBOOT_SEG (KERNBASE + 0x0469) 86 87#define CMOS_REG (0x70) 88#define CMOS_DATA (0x71) 89#define BIOS_RESET (0x0f) 90#define BIOS_WARM (0x0a) 91 92/* 93 * this code MUST be enabled here and in mpboot.s. 94 * it follows the very early stages of AP boot by placing values in CMOS ram. 95 * it NORMALLY will never be needed and thus the primitive method for enabling. 
96 * 97#define CHECK_POINTS 98 */ 99 100#if defined(CHECK_POINTS) && !defined(PC98) 101#define CHECK_READ(A) (outb(CMOS_REG, (A)), inb(CMOS_DATA)) 102#define CHECK_WRITE(A,D) (outb(CMOS_REG, (A)), outb(CMOS_DATA, (D))) 103 104#define CHECK_INIT(D); \ 105 CHECK_WRITE(0x34, (D)); \ 106 CHECK_WRITE(0x35, (D)); \ 107 CHECK_WRITE(0x36, (D)); \ 108 CHECK_WRITE(0x37, (D)); \ 109 CHECK_WRITE(0x38, (D)); \ 110 CHECK_WRITE(0x39, (D)); 111 112#define CHECK_PRINT(S); \ 113 printf("%s: %d, %d, %d, %d, %d, %d\n", \ 114 (S), \ 115 CHECK_READ(0x34), \ 116 CHECK_READ(0x35), \ 117 CHECK_READ(0x36), \ 118 CHECK_READ(0x37), \ 119 CHECK_READ(0x38), \ 120 CHECK_READ(0x39)); 121 122#else /* CHECK_POINTS */ 123 124#define CHECK_INIT(D) 125#define CHECK_PRINT(S) 126#define CHECK_WRITE(A, D) 127 128#endif /* CHECK_POINTS */ 129 130/* lock region used by kernel profiling */ 131int mcount_lock; 132 133int mp_naps; /* # of Applications processors */ 134int boot_cpu_id = -1; /* designated BSP */ 135 136extern struct pcpu __pcpu[]; 137 138/* AP uses this during bootstrap. Do not staticize. */ 139char *bootSTK; 140static int bootAP; 141 142/* Free these after use */ 143void *bootstacks[MAXCPU]; 144 145/* Hotwire a 0->4MB V==P mapping */ 146extern pt_entry_t *KPTphys; 147 148struct pcb stoppcbs[MAXCPU]; 149 150/* Variables needed for SMP tlb shootdown. */ 151vm_offset_t smp_tlb_addr1; 152vm_offset_t smp_tlb_addr2; 153volatile int smp_tlb_wait; 154 155#ifdef STOP_NMI 156volatile cpumask_t ipi_nmi_pending; 157 158static void ipi_nmi_selected(u_int32_t cpus); 159#endif 160 161#ifdef COUNT_IPIS 162/* Interrupt counts. */ 163static u_long *ipi_preempt_counts[MAXCPU]; 164static u_long *ipi_ast_counts[MAXCPU]; 165u_long *ipi_invltlb_counts[MAXCPU]; 166u_long *ipi_invlrng_counts[MAXCPU]; 167u_long *ipi_invlpg_counts[MAXCPU]; 168u_long *ipi_invlcache_counts[MAXCPU]; 169u_long *ipi_rendezvous_counts[MAXCPU]; 170u_long *ipi_lazypmap_counts[MAXCPU]; 171#endif 172 173/* 174 * Local data and functions. 
175 */ 176 177#ifdef STOP_NMI 178/* 179 * Provide an alternate method of stopping other CPUs. If another CPU has 180 * disabled interrupts the conventional STOP IPI will be blocked. This 181 * NMI-based stop should get through in that case. 182 */ 183static int stop_cpus_with_nmi = 1; 184SYSCTL_INT(_debug, OID_AUTO, stop_cpus_with_nmi, CTLTYPE_INT | CTLFLAG_RW, 185 &stop_cpus_with_nmi, 0, ""); 186TUNABLE_INT("debug.stop_cpus_with_nmi", &stop_cpus_with_nmi); 187#else 188#define stop_cpus_with_nmi 0 189#endif 190 191static u_int logical_cpus; 192 193/* used to hold the AP's until we are ready to release them */ 194static struct mtx ap_boot_mtx; 195 196/* Set to 1 once we're ready to let the APs out of the pen. */ 197static volatile int aps_ready = 0; 198 199/* 200 * Store data from cpu_add() until later in the boot when we actually setup 201 * the APs. 202 */ 203struct cpu_info { 204 int cpu_present:1; 205 int cpu_bsp:1; 206 int cpu_disabled:1; 207 int cpu_hyperthread:1; 208} static cpu_info[MAX_APIC_ID + 1]; 209int cpu_apic_ids[MAXCPU]; 210int apic_cpuids[MAX_APIC_ID + 1]; 211 212/* Holds pending bitmap based IPIs per CPU */ 213static volatile u_int cpu_ipi_pending[MAXCPU]; 214 215static u_int boot_address; 216 217static void assign_cpu_ids(void); 218static void install_ap_tramp(void); 219static void set_interrupt_apic_ids(void); 220static int start_all_aps(void); 221static int start_ap(int apic_id); 222static void release_aps(void *dummy); 223 224static int hlt_logical_cpus; 225static u_int hyperthreading_cpus; 226static cpumask_t hyperthreading_cpus_mask; 227static int hyperthreading_allowed = 1; 228static struct sysctl_ctx_list logical_cpu_clist; 229 230static void 231mem_range_AP_init(void) 232{ 233 if (mem_range_softc.mr_op && mem_range_softc.mr_op->initAP) 234 mem_range_softc.mr_op->initAP(&mem_range_softc); 235} 236 237struct cpu_group * 238cpu_topo(void) 239{ 240 if (cpu_cores == 0) 241 cpu_cores = 1; 242 if (cpu_logical == 0) 243 cpu_logical = 1; 244 if 
(mp_ncpus % (cpu_cores * cpu_logical) != 0) { 245 printf("WARNING: Non-uniform processors.\n"); 246 printf("WARNING: Using suboptimal topology.\n"); 247 return (smp_topo_none()); 248 } 249 /* 250 * No multi-core or hyper-threaded. 251 */ 252 if (cpu_logical * cpu_cores == 1) 253 return (smp_topo_none()); 254 /* 255 * Only HTT no multi-core. 256 */ 257 if (cpu_logical > 1 && cpu_cores == 1) 258 return (smp_topo_1level(CG_SHARE_L1, cpu_logical, CG_FLAG_HTT)); 259 /* 260 * Only multi-core no HTT. 261 */ 262 if (cpu_cores > 1 && cpu_logical == 1) 263 return (smp_topo_1level(CG_SHARE_NONE, cpu_cores, 0)); 264 /* 265 * Both HTT and multi-core. 266 */ 267 return (smp_topo_2level(CG_SHARE_NONE, cpu_cores, 268 CG_SHARE_L1, cpu_logical, CG_FLAG_HTT)); 269} 270 271 272/* 273 * Calculate usable address in base memory for AP trampoline code. 274 */ 275u_int 276mp_bootaddress(u_int basemem) 277{ 278 279 boot_address = trunc_page(basemem); /* round down to 4k boundary */ 280 if ((basemem - boot_address) < bootMP_size) 281 boot_address -= PAGE_SIZE; /* not enough, lower by 4k */ 282 283 return boot_address; 284} 285 286void 287cpu_add(u_int apic_id, char boot_cpu) 288{ 289 290 if (apic_id > MAX_APIC_ID) { 291 panic("SMP: APIC ID %d too high", apic_id); 292 return; 293 } 294 KASSERT(cpu_info[apic_id].cpu_present == 0, ("CPU %d added twice", 295 apic_id)); 296 cpu_info[apic_id].cpu_present = 1; 297 if (boot_cpu) { 298 KASSERT(boot_cpu_id == -1, 299 ("CPU %d claims to be BSP, but CPU %d already is", apic_id, 300 boot_cpu_id)); 301 boot_cpu_id = apic_id; 302 cpu_info[apic_id].cpu_bsp = 1; 303 } 304 if (mp_ncpus < MAXCPU) 305 mp_ncpus++; 306 if (bootverbose) 307 printf("SMP: Added CPU %d (%s)\n", apic_id, boot_cpu ? "BSP" : 308 "AP"); 309} 310 311void 312cpu_mp_setmaxid(void) 313{ 314 315 mp_maxid = MAXCPU - 1; 316} 317 318int 319cpu_mp_probe(void) 320{ 321 322 /* 323 * Always record BSP in CPU map so that the mbuf init code works 324 * correctly. 
325 */ 326 all_cpus = 1; 327 if (mp_ncpus == 0) { 328 /* 329 * No CPUs were found, so this must be a UP system. Setup 330 * the variables to represent a system with a single CPU 331 * with an id of 0. 332 */ 333 mp_ncpus = 1; 334 return (0); 335 } 336 337 /* At least one CPU was found. */ 338 if (mp_ncpus == 1) { 339 /* 340 * One CPU was found, so this must be a UP system with 341 * an I/O APIC. 342 */ 343 return (0); 344 } 345 346 /* At least two CPUs were found. */ 347 return (1); 348} 349 350/* 351 * Initialize the IPI handlers and start up the AP's. 352 */ 353void 354cpu_mp_start(void) 355{ 356 int i; 357 u_int threads_per_cache, p[4]; 358 359 /* Initialize the logical ID to APIC ID table. */ 360 for (i = 0; i < MAXCPU; i++) { 361 cpu_apic_ids[i] = -1; 362 cpu_ipi_pending[i] = 0; 363 } 364 365 /* Install an inter-CPU IPI for TLB invalidation */ 366 setidt(IPI_INVLTLB, IDTVEC(invltlb), 367 SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); 368 setidt(IPI_INVLPG, IDTVEC(invlpg), 369 SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); 370 setidt(IPI_INVLRNG, IDTVEC(invlrng), 371 SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); 372 373 /* Install an inter-CPU IPI for cache invalidation. */ 374 setidt(IPI_INVLCACHE, IDTVEC(invlcache), 375 SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); 376 377 /* Install an inter-CPU IPI for lazy pmap release */ 378 setidt(IPI_LAZYPMAP, IDTVEC(lazypmap), 379 SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); 380 381 /* Install an inter-CPU IPI for all-CPU rendezvous */ 382 setidt(IPI_RENDEZVOUS, IDTVEC(rendezvous), 383 SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); 384 385 /* Install generic inter-CPU IPI handler */ 386 setidt(IPI_BITMAP_VECTOR, IDTVEC(ipi_intr_bitmap_handler), 387 SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); 388 389 /* Install an inter-CPU IPI for CPU stop/restart */ 390 setidt(IPI_STOP, IDTVEC(cpustop), 391 SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); 392 393 394 /* Set boot_cpu_id if needed. 
*/ 395 if (boot_cpu_id == -1) { 396 boot_cpu_id = PCPU_GET(apic_id); 397 cpu_info[boot_cpu_id].cpu_bsp = 1; 398 } else 399 KASSERT(boot_cpu_id == PCPU_GET(apic_id), 400 ("BSP's APIC ID doesn't match boot_cpu_id")); 401 cpu_apic_ids[0] = boot_cpu_id; 402 apic_cpuids[boot_cpu_id] = 0; 403 404 /* Setup the initial logical CPUs info. */ 405 logical_cpus = logical_cpus_mask = 0; 406 if (cpu_feature & CPUID_HTT) 407 logical_cpus = (cpu_procinfo & CPUID_HTT_CORES) >> 16; 408 409 /* 410 * Work out if hyperthreading is *really* enabled. This 411 * is made really ugly by the fact that processors lie: Dual 412 * core processors claim to be hyperthreaded even when they're 413 * not, presumably because they want to be treated the same 414 * way as HTT with respect to per-cpu software licensing. 415 * At the time of writing (May 12, 2005) the only hyperthreaded 416 * cpus are from Intel, and Intel's dual-core processors can be 417 * identified via the "deterministic cache parameters" cpuid 418 * calls. 419 */ 420 /* 421 * First determine if this is an Intel processor which claims 422 * to have hyperthreading support. 423 */ 424 if ((cpu_feature & CPUID_HTT) && cpu_vendor_id == CPU_VENDOR_INTEL) { 425 /* 426 * If the "deterministic cache parameters" cpuid calls 427 * are available, use them. 428 */ 429 if (cpu_high >= 4) { 430 /* Ask the processor about the L1 cache. */ 431 for (i = 0; i < 1; i++) { 432 cpuid_count(4, i, p); 433 threads_per_cache = ((p[0] & 0x3ffc000) >> 14) + 1; 434 if (hyperthreading_cpus < threads_per_cache) 435 hyperthreading_cpus = threads_per_cache; 436 if ((p[0] & 0x1f) == 0) 437 break; 438 } 439 } 440 441 /* 442 * If the deterministic cache parameters are not 443 * available, or if no caches were reported to exist, 444 * just accept what the HTT flag indicated. 
445 */ 446 if (hyperthreading_cpus == 0) 447 hyperthreading_cpus = logical_cpus; 448 } 449 450 assign_cpu_ids(); 451 452 /* Start each Application Processor */ 453 start_all_aps(); 454 455 set_interrupt_apic_ids(); 456} 457 458 459/* 460 * Print various information about the SMP system hardware and setup. 461 */ 462void 463cpu_mp_announce(void) 464{ 465 int i, x; 466 const char *hyperthread; 467 468 /* List CPUs */ 469 printf(" cpu0 (BSP): APIC ID: %2d\n", boot_cpu_id); 470 for (i = 1, x = 0; x <= MAX_APIC_ID; x++) { 471 if (!cpu_info[x].cpu_present || cpu_info[x].cpu_bsp) 472 continue; 473 if (cpu_info[x].cpu_hyperthread) { 474 hyperthread = "/HT"; 475 } else { 476 hyperthread = ""; 477 } 478 if (cpu_info[x].cpu_disabled) 479 printf(" cpu (AP%s): APIC ID: %2d (disabled)\n", 480 hyperthread, x); 481 else { 482 KASSERT(i < mp_ncpus, 483 ("mp_ncpus and actual cpus are out of whack")); 484 printf(" cpu%d (AP%s): APIC ID: %2d\n", i++, 485 hyperthread, x); 486 } 487 } 488} 489 490/* 491 * AP CPU's call this to initialize themselves. 492 */ 493void 494init_secondary(void) 495{ 496 struct pcpu *pc; 497 vm_offset_t addr; 498 int gsel_tss; 499 int x, myid; 500 u_int cr0; 501 502 /* bootAP is set in start_ap() to our ID. 
*/ 503 myid = bootAP; 504 505 /* Get per-cpu data */ 506 pc = &__pcpu[myid]; 507 508 /* prime data page for it to use */ 509 pcpu_init(pc, myid, sizeof(struct pcpu)); 510 pc->pc_apic_id = cpu_apic_ids[myid]; 511 pc->pc_prvspace = pc; 512 pc->pc_curthread = 0; 513 514 gdt_segs[GPRIV_SEL].ssd_base = (int) pc; 515 gdt_segs[GPROC0_SEL].ssd_base = (int) &pc->pc_common_tss; 516 517 for (x = 0; x < NGDT; x++) { 518 ssdtosd(&gdt_segs[x], &gdt[myid * NGDT + x].sd); 519 } 520 521 r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1; 522 r_gdt.rd_base = (int) &gdt[myid * NGDT]; 523 lgdt(&r_gdt); /* does magic intra-segment return */ 524 525 lidt(&r_idt); 526 527 lldt(_default_ldt); 528 PCPU_SET(currentldt, _default_ldt); 529 530 gsel_tss = GSEL(GPROC0_SEL, SEL_KPL); 531 gdt[myid * NGDT + GPROC0_SEL].sd.sd_type = SDT_SYS386TSS; 532 PCPU_SET(common_tss.tss_esp0, 0); /* not used until after switch */ 533 PCPU_SET(common_tss.tss_ss0, GSEL(GDATA_SEL, SEL_KPL)); 534 PCPU_SET(common_tss.tss_ioopt, (sizeof (struct i386tss)) << 16); 535 PCPU_SET(tss_gdt, &gdt[myid * NGDT + GPROC0_SEL].sd); 536 PCPU_SET(common_tssd, *PCPU_GET(tss_gdt)); 537 ltr(gsel_tss); 538 539 PCPU_SET(fsgs_gdt, &gdt[myid * NGDT + GUFS_SEL].sd); 540 541 /* 542 * Set to a known state: 543 * Set by mpboot.s: CR0_PG, CR0_PE 544 * Set by cpu_setregs: CR0_NE, CR0_MP, CR0_TS, CR0_WP, CR0_AM 545 */ 546 cr0 = rcr0(); 547 cr0 &= ~(CR0_CD | CR0_NW | CR0_EM); 548 load_cr0(cr0); 549 CHECK_WRITE(0x38, 5); 550 551 /* Disable local APIC just to be sure. */ 552 lapic_disable(); 553 554 /* signal our startup to the BSP. */ 555 mp_naps++; 556 CHECK_WRITE(0x39, 6); 557 558 /* Spin until the BSP releases the AP's. */ 559 while (!aps_ready) 560 ia32_pause(); 561 562 /* BSP may have changed PTD while we were waiting */ 563 invltlb(); 564 for (addr = 0; addr < NKPT * NBPDR - 1; addr += PAGE_SIZE) 565 invlpg(addr); 566 567#if defined(I586_CPU) && !defined(NO_F00F_HACK) 568 lidt(&r_idt); 569#endif 570 571 /* Initialize the PAT MSR if present. 
*/ 572 pmap_init_pat(); 573 574 /* set up CPU registers and state */ 575 cpu_setregs(); 576 577 /* set up FPU state on the AP */ 578 npxinit(__INITIAL_NPXCW__); 579 580 /* set up SSE registers */ 581 enable_sse(); 582 583#ifdef PAE 584 /* Enable the PTE no-execute bit. */ 585 if ((amd_feature & AMDID_NX) != 0) { 586 uint64_t msr; 587 588 msr = rdmsr(MSR_EFER) | EFER_NXE; 589 wrmsr(MSR_EFER, msr); 590 } 591#endif 592 593 /* A quick check from sanity claus */ 594 if (PCPU_GET(apic_id) != lapic_id()) { 595 printf("SMP: cpuid = %d\n", PCPU_GET(cpuid)); 596 printf("SMP: actual apic_id = %d\n", lapic_id()); 597 printf("SMP: correct apic_id = %d\n", PCPU_GET(apic_id)); 598 panic("cpuid mismatch! boom!!"); 599 } 600 601 /* Initialize curthread. */ 602 KASSERT(PCPU_GET(idlethread) != NULL, ("no idle thread")); 603 PCPU_SET(curthread, PCPU_GET(idlethread)); 604 605 mtx_lock_spin(&ap_boot_mtx); 606 607 /* Init local apic for irq's */ 608 lapic_setup(1); 609 610 /* Set memory range attributes for this CPU to match the BSP */ 611 mem_range_AP_init(); 612 613 smp_cpus++; 614 615 CTR1(KTR_SMP, "SMP: AP CPU #%d Launched", PCPU_GET(cpuid)); 616 printf("SMP: AP CPU #%d Launched!\n", PCPU_GET(cpuid)); 617 618 /* Determine if we are a logical CPU. */ 619 if (logical_cpus > 1 && PCPU_GET(apic_id) % logical_cpus != 0) 620 logical_cpus_mask |= PCPU_GET(cpumask); 621 622 /* Determine if we are a hyperthread. */ 623 if (hyperthreading_cpus > 1 && 624 PCPU_GET(apic_id) % hyperthreading_cpus != 0) 625 hyperthreading_cpus_mask |= PCPU_GET(cpumask); 626 627 /* Build our map of 'other' CPUs. 
*/ 628 PCPU_SET(other_cpus, all_cpus & ~PCPU_GET(cpumask)); 629 630 if (bootverbose) 631 lapic_dump("AP"); 632 633 if (smp_cpus == mp_ncpus) { 634 /* enable IPI's, tlb shootdown, freezes etc */ 635 atomic_store_rel_int(&smp_started, 1); 636 smp_active = 1; /* historic */ 637 } 638 639 mtx_unlock_spin(&ap_boot_mtx); 640 641 /* wait until all the AP's are up */ 642 while (smp_started == 0) 643 ia32_pause(); 644 645 /* enter the scheduler */ 646 sched_throw(NULL); 647 648 panic("scheduler returned us to %s", __func__); 649 /* NOTREACHED */ 650} 651 652/******************************************************************* 653 * local functions and data 654 */ 655 656/* 657 * We tell the I/O APIC code about all the CPUs we want to receive 658 * interrupts. If we don't want certain CPUs to receive IRQs we 659 * can simply not tell the I/O APIC code about them in this function. 660 * We also do not tell it about the BSP since it tells itself about 661 * the BSP internally to work with UP kernels and on UP machines. 662 */ 663static void 664set_interrupt_apic_ids(void) 665{ 666 u_int i, apic_id; 667 668 for (i = 0; i < MAXCPU; i++) { 669 apic_id = cpu_apic_ids[i]; 670 if (apic_id == -1) 671 continue; 672 if (cpu_info[apic_id].cpu_bsp) 673 continue; 674 if (cpu_info[apic_id].cpu_disabled) 675 continue; 676 677 /* Don't let hyperthreads service interrupts. */ 678 if (hyperthreading_cpus > 1 && 679 apic_id % hyperthreading_cpus != 0) 680 continue; 681 682 intr_add_cpu(i); 683 } 684} 685 686/* 687 * Assign logical CPU IDs to local APICs. 688 */ 689static void 690assign_cpu_ids(void) 691{ 692 u_int i; 693 694 TUNABLE_INT_FETCH("machdep.hyperthreading_allowed", 695 &hyperthreading_allowed); 696 697 /* Check for explicitly disabled CPUs. 
*/ 698 for (i = 0; i <= MAX_APIC_ID; i++) { 699 if (!cpu_info[i].cpu_present || cpu_info[i].cpu_bsp) 700 continue; 701 702 if (hyperthreading_cpus > 1 && i % hyperthreading_cpus != 0) { 703 cpu_info[i].cpu_hyperthread = 1; 704#if defined(SCHED_ULE) 705 /* 706 * Don't use HT CPU if it has been disabled by a 707 * tunable. 708 */ 709 if (hyperthreading_allowed == 0) { 710 cpu_info[i].cpu_disabled = 1; 711 continue; 712 } 713#endif 714 } 715 716 /* Don't use this CPU if it has been disabled by a tunable. */ 717 if (resource_disabled("lapic", i)) { 718 cpu_info[i].cpu_disabled = 1; 719 continue; 720 } 721 } 722 723 /* 724 * Assign CPU IDs to local APIC IDs and disable any CPUs 725 * beyond MAXCPU. CPU 0 has already been assigned to the BSP, 726 * so we only have to assign IDs for APs. 727 */ 728 mp_ncpus = 1; 729 for (i = 0; i <= MAX_APIC_ID; i++) { 730 if (!cpu_info[i].cpu_present || cpu_info[i].cpu_bsp || 731 cpu_info[i].cpu_disabled) 732 continue; 733 734 if (mp_ncpus < MAXCPU) { 735 cpu_apic_ids[mp_ncpus] = i; 736 apic_cpuids[i] = mp_ncpus; 737 mp_ncpus++; 738 } else 739 cpu_info[i].cpu_disabled = 1; 740 } 741 KASSERT(mp_maxid >= mp_ncpus - 1, 742 ("%s: counters out of sync: max %d, count %d", __func__, mp_maxid, 743 mp_ncpus)); 744} 745 746/* 747 * start each AP in our list 748 */ 749/* Lowest 1MB is already mapped: don't touch*/ 750#define TMPMAP_START 1 751static int 752start_all_aps(void) 753{ 754#ifndef PC98 755 u_char mpbiosreason; 756#endif 757 uintptr_t kptbase; 758 u_int32_t mpbioswarmvec; 759 int apic_id, cpu, i; 760 761 mtx_init(&ap_boot_mtx, "ap boot", NULL, MTX_SPIN); 762 763 /* install the AP 1st level boot code */ 764 install_ap_tramp(); 765 766 /* save the current value of the warm-start vector */ 767 mpbioswarmvec = *((u_int32_t *) WARMBOOT_OFF); 768#ifndef PC98 769 outb(CMOS_REG, BIOS_RESET); 770 mpbiosreason = inb(CMOS_DATA); 771#endif 772 773 /* set up temporary P==V mapping for AP boot */ 774 /* XXX this is a hack, we should boot the AP on its 
own stack/PTD */ 775 776 kptbase = (uintptr_t)(void *)KPTphys; 777 for (i = TMPMAP_START; i < NKPT; i++) 778 PTD[i] = (pd_entry_t)(PG_V | PG_RW | 779 ((kptbase + i * PAGE_SIZE) & PG_FRAME)); 780 invltlb(); 781 782 /* start each AP */ 783 for (cpu = 1; cpu < mp_ncpus; cpu++) { 784 apic_id = cpu_apic_ids[cpu]; 785 786 /* allocate and set up a boot stack data page */ 787 bootstacks[cpu] = (char *)kmem_alloc(kernel_map, KSTACK_PAGES * PAGE_SIZE); 788 789 /* setup a vector to our boot code */ 790 *((volatile u_short *) WARMBOOT_OFF) = WARMBOOT_TARGET; 791 *((volatile u_short *) WARMBOOT_SEG) = (boot_address >> 4); 792#ifndef PC98 793 outb(CMOS_REG, BIOS_RESET); 794 outb(CMOS_DATA, BIOS_WARM); /* 'warm-start' */ 795#endif 796 797 bootSTK = (char *)bootstacks[cpu] + KSTACK_PAGES * PAGE_SIZE - 4; 798 bootAP = cpu; 799 800 /* attempt to start the Application Processor */ 801 CHECK_INIT(99); /* setup checkpoints */ 802 if (!start_ap(apic_id)) { 803 printf("AP #%d (PHY# %d) failed!\n", cpu, apic_id); 804 CHECK_PRINT("trace"); /* show checkpoints */ 805 /* better panic as the AP may be running loose */ 806 printf("panic y/n? [y] "); 807 if (cngetc() != 'n') 808 panic("bye-bye"); 809 } 810 CHECK_PRINT("trace"); /* show checkpoints */ 811 812 all_cpus |= (1 << cpu); /* record AP in CPU map */ 813 } 814 815 /* build our map of 'other' CPUs */ 816 PCPU_SET(other_cpus, all_cpus & ~PCPU_GET(cpumask)); 817 818 /* restore the warmstart vector */ 819 *(u_int32_t *) WARMBOOT_OFF = mpbioswarmvec; 820 821#ifndef PC98 822 outb(CMOS_REG, BIOS_RESET); 823 outb(CMOS_DATA, mpbiosreason); 824#endif 825 826 /* Undo V==P hack from above */ 827 for (i = TMPMAP_START; i < NKPT; i++) 828 PTD[i] = 0; 829 pmap_invalidate_range(kernel_pmap, 0, NKPT * NBPDR - 1); 830 831 /* number of APs actually started */ 832 return mp_naps; 833} 834 835/* 836 * load the 1st level AP boot code into base memory. 
837 */ 838 839/* targets for relocation */ 840extern void bigJump(void); 841extern void bootCodeSeg(void); 842extern void bootDataSeg(void); 843extern void MPentry(void); 844extern u_int MP_GDT; 845extern u_int mp_gdtbase; 846 847static void 848install_ap_tramp(void) 849{ 850 int x; 851 int size = *(int *) ((u_long) & bootMP_size); 852 vm_offset_t va = boot_address + KERNBASE; 853 u_char *src = (u_char *) ((u_long) bootMP); 854 u_char *dst = (u_char *) va; 855 u_int boot_base = (u_int) bootMP; 856 u_int8_t *dst8; 857 u_int16_t *dst16; 858 u_int32_t *dst32; 859 860 KASSERT (size <= PAGE_SIZE, 861 ("'size' do not fit into PAGE_SIZE, as expected.")); 862 pmap_kenter(va, boot_address); 863 pmap_invalidate_page (kernel_pmap, va); 864 for (x = 0; x < size; ++x) 865 *dst++ = *src++; 866 867 /* 868 * modify addresses in code we just moved to basemem. unfortunately we 869 * need fairly detailed info about mpboot.s for this to work. changes 870 * to mpboot.s might require changes here. 871 */ 872 873 /* boot code is located in KERNEL space */ 874 dst = (u_char *) va; 875 876 /* modify the lgdt arg */ 877 dst32 = (u_int32_t *) (dst + ((u_int) & mp_gdtbase - boot_base)); 878 *dst32 = boot_address + ((u_int) & MP_GDT - boot_base); 879 880 /* modify the ljmp target for MPentry() */ 881 dst32 = (u_int32_t *) (dst + ((u_int) bigJump - boot_base) + 1); 882 *dst32 = ((u_int) MPentry - KERNBASE); 883 884 /* modify the target for boot code segment */ 885 dst16 = (u_int16_t *) (dst + ((u_int) bootCodeSeg - boot_base)); 886 dst8 = (u_int8_t *) (dst16 + 1); 887 *dst16 = (u_int) boot_address & 0xffff; 888 *dst8 = ((u_int) boot_address >> 16) & 0xff; 889 890 /* modify the target for boot data segment */ 891 dst16 = (u_int16_t *) (dst + ((u_int) bootDataSeg - boot_base)); 892 dst8 = (u_int8_t *) (dst16 + 1); 893 *dst16 = (u_int) boot_address & 0xffff; 894 *dst8 = ((u_int) boot_address >> 16) & 0xff; 895} 896 897/* 898 * This function starts the AP (application processor) identified 899 * by 
the APIC ID 'physicalCpu'. It does quite a "song and dance" 900 * to accomplish this. This is necessary because of the nuances 901 * of the different hardware we might encounter. It isn't pretty, 902 * but it seems to work. 903 */ 904static int 905start_ap(int apic_id) 906{ 907 int vector, ms; 908 int cpus; 909 910 /* calculate the vector */ 911 vector = (boot_address >> 12) & 0xff; 912 913 /* used as a watchpoint to signal AP startup */ 914 cpus = mp_naps; 915 916 /* 917 * first we do an INIT/RESET IPI this INIT IPI might be run, reseting 918 * and running the target CPU. OR this INIT IPI might be latched (P5 919 * bug), CPU waiting for STARTUP IPI. OR this INIT IPI might be 920 * ignored. 921 */ 922 923 /* do an INIT IPI: assert RESET */ 924 lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE | 925 APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, apic_id); 926 927 /* wait for pending status end */ 928 lapic_ipi_wait(-1); 929 930 /* do an INIT IPI: deassert RESET */ 931 lapic_ipi_raw(APIC_DEST_ALLESELF | APIC_TRIGMOD_LEVEL | 932 APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, 0); 933 934 /* wait for pending status end */ 935 DELAY(10000); /* wait ~10mS */ 936 lapic_ipi_wait(-1); 937 938 /* 939 * next we do a STARTUP IPI: the previous INIT IPI might still be 940 * latched, (P5 bug) this 1st STARTUP would then terminate 941 * immediately, and the previously started INIT IPI would continue. OR 942 * the previous INIT IPI has already run. and this STARTUP IPI will 943 * run. OR the previous INIT IPI was ignored. and this STARTUP IPI 944 * will run. 945 */ 946 947 /* do a STARTUP IPI */ 948 lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE | 949 APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP | 950 vector, apic_id); 951 lapic_ipi_wait(-1); 952 DELAY(200); /* wait ~200uS */ 953 954 /* 955 * finally we do a 2nd STARTUP IPI: this 2nd STARTUP IPI should run IF 956 * the previous STARTUP IPI was cancelled by a latched INIT IPI. 
OR 957 * this STARTUP IPI will be ignored, as only ONE STARTUP IPI is 958 * recognized after hardware RESET or INIT IPI. 959 */ 960 961 lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE | 962 APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP | 963 vector, apic_id); 964 lapic_ipi_wait(-1); 965 DELAY(200); /* wait ~200uS */ 966 967 /* Wait up to 5 seconds for it to start. */ 968 for (ms = 0; ms < 5000; ms++) { 969 if (mp_naps > cpus) 970 return 1; /* return SUCCESS */ 971 DELAY(1000); 972 } 973 return 0; /* return FAILURE */ 974} 975 976#ifdef COUNT_XINVLTLB_HITS 977u_int xhits_gbl[MAXCPU]; 978u_int xhits_pg[MAXCPU]; 979u_int xhits_rng[MAXCPU]; 980SYSCTL_NODE(_debug, OID_AUTO, xhits, CTLFLAG_RW, 0, ""); 981SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, global, CTLFLAG_RW, &xhits_gbl, 982 sizeof(xhits_gbl), "IU", ""); 983SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, page, CTLFLAG_RW, &xhits_pg, 984 sizeof(xhits_pg), "IU", ""); 985SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, range, CTLFLAG_RW, &xhits_rng, 986 sizeof(xhits_rng), "IU", ""); 987 988u_int ipi_global; 989u_int ipi_page; 990u_int ipi_range; 991u_int ipi_range_size; 992SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_global, CTLFLAG_RW, &ipi_global, 0, ""); 993SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_page, CTLFLAG_RW, &ipi_page, 0, ""); 994SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range, CTLFLAG_RW, &ipi_range, 0, ""); 995SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range_size, CTLFLAG_RW, &ipi_range_size, 996 0, ""); 997 998u_int ipi_masked_global; 999u_int ipi_masked_page; 1000u_int ipi_masked_range; 1001u_int ipi_masked_range_size; 1002SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_global, CTLFLAG_RW, 1003 &ipi_masked_global, 0, ""); 1004SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_page, CTLFLAG_RW, 1005 &ipi_masked_page, 0, ""); 1006SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_range, CTLFLAG_RW, 1007 &ipi_masked_range, 0, ""); 1008SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_range_size, CTLFLAG_RW, 1009 &ipi_masked_range_size, 0, 
""); 1010#endif /* COUNT_XINVLTLB_HITS */ 1011 1012/* 1013 * Flush the TLB on all other CPU's 1014 */ 1015static void 1016smp_tlb_shootdown(u_int vector, vm_offset_t addr1, vm_offset_t addr2) 1017{ 1018 u_int ncpu; 1019 1020 ncpu = mp_ncpus - 1; /* does not shootdown self */ 1021 if (ncpu < 1) 1022 return; /* no other cpus */ 1023 if (!(read_eflags() & PSL_I)) 1024 panic("%s: interrupts disabled", __func__); 1025 mtx_lock_spin(&smp_ipi_mtx); 1026 smp_tlb_addr1 = addr1; 1027 smp_tlb_addr2 = addr2; 1028 atomic_store_rel_int(&smp_tlb_wait, 0); 1029 ipi_all_but_self(vector); 1030 while (smp_tlb_wait < ncpu) 1031 ia32_pause(); 1032 mtx_unlock_spin(&smp_ipi_mtx); 1033} 1034 1035static void 1036smp_targeted_tlb_shootdown(u_int mask, u_int vector, vm_offset_t addr1, vm_offset_t addr2) 1037{ 1038 int ncpu, othercpus; 1039 1040 othercpus = mp_ncpus - 1; 1041 if (mask == (u_int)-1) { 1042 ncpu = othercpus; 1043 if (ncpu < 1) 1044 return; 1045 } else { 1046 mask &= ~PCPU_GET(cpumask); 1047 if (mask == 0) 1048 return; 1049 ncpu = bitcount32(mask); 1050 if (ncpu > othercpus) { 1051 /* XXX this should be a panic offence */ 1052 printf("SMP: tlb shootdown to %d other cpus (only have %d)\n", 1053 ncpu, othercpus); 1054 ncpu = othercpus; 1055 } 1056 /* XXX should be a panic, implied by mask == 0 above */ 1057 if (ncpu < 1) 1058 return; 1059 } 1060 if (!(read_eflags() & PSL_I)) 1061 panic("%s: interrupts disabled", __func__); 1062 mtx_lock_spin(&smp_ipi_mtx); 1063 smp_tlb_addr1 = addr1; 1064 smp_tlb_addr2 = addr2; 1065 atomic_store_rel_int(&smp_tlb_wait, 0); 1066 if (mask == (u_int)-1) 1067 ipi_all_but_self(vector); 1068 else 1069 ipi_selected(mask, vector); 1070 while (smp_tlb_wait < ncpu) 1071 ia32_pause(); 1072 mtx_unlock_spin(&smp_ipi_mtx); 1073} 1074 1075void 1076smp_cache_flush(void) 1077{ 1078 1079 if (smp_started) 1080 smp_tlb_shootdown(IPI_INVLCACHE, 0, 0); 1081} 1082 1083void 1084smp_invltlb(void) 1085{ 1086 1087 if (smp_started) { 1088 smp_tlb_shootdown(IPI_INVLTLB, 0, 0); 
1089#ifdef COUNT_XINVLTLB_HITS 1090 ipi_global++; 1091#endif 1092 } 1093} 1094 1095void 1096smp_invlpg(vm_offset_t addr) 1097{ 1098 1099 if (smp_started) { 1100 smp_tlb_shootdown(IPI_INVLPG, addr, 0); 1101#ifdef COUNT_XINVLTLB_HITS 1102 ipi_page++; 1103#endif 1104 } 1105} 1106 1107void 1108smp_invlpg_range(vm_offset_t addr1, vm_offset_t addr2) 1109{ 1110 1111 if (smp_started) { 1112 smp_tlb_shootdown(IPI_INVLRNG, addr1, addr2); 1113#ifdef COUNT_XINVLTLB_HITS 1114 ipi_range++; 1115 ipi_range_size += (addr2 - addr1) / PAGE_SIZE; 1116#endif 1117 } 1118} 1119 1120void 1121smp_masked_invltlb(u_int mask) 1122{ 1123 1124 if (smp_started) { 1125 smp_targeted_tlb_shootdown(mask, IPI_INVLTLB, 0, 0); 1126#ifdef COUNT_XINVLTLB_HITS 1127 ipi_masked_global++; 1128#endif 1129 } 1130} 1131 1132void 1133smp_masked_invlpg(u_int mask, vm_offset_t addr) 1134{ 1135 1136 if (smp_started) { 1137 smp_targeted_tlb_shootdown(mask, IPI_INVLPG, addr, 0); 1138#ifdef COUNT_XINVLTLB_HITS 1139 ipi_masked_page++; 1140#endif 1141 } 1142} 1143 1144void 1145smp_masked_invlpg_range(u_int mask, vm_offset_t addr1, vm_offset_t addr2) 1146{ 1147 1148 if (smp_started) { 1149 smp_targeted_tlb_shootdown(mask, IPI_INVLRNG, addr1, addr2); 1150#ifdef COUNT_XINVLTLB_HITS 1151 ipi_masked_range++; 1152 ipi_masked_range_size += (addr2 - addr1) / PAGE_SIZE; 1153#endif 1154 } 1155} 1156 1157void 1158ipi_bitmap_handler(struct trapframe frame) 1159{ 1160 int cpu = PCPU_GET(cpuid); 1161 u_int ipi_bitmap; 1162 1163 ipi_bitmap = atomic_readandclear_int(&cpu_ipi_pending[cpu]); 1164 1165 if (ipi_bitmap & (1 << IPI_PREEMPT)) { 1166#ifdef COUNT_IPIS 1167 (*ipi_preempt_counts[cpu])++; 1168#endif 1169 sched_preempt(curthread); 1170 } 1171 1172 if (ipi_bitmap & (1 << IPI_AST)) { 1173#ifdef COUNT_IPIS 1174 (*ipi_ast_counts[cpu])++; 1175#endif 1176 /* Nothing to do for AST */ 1177 } 1178} 1179 1180/* 1181 * send an IPI to a set of cpus. 
 */
void
ipi_selected(u_int32_t cpus, u_int ipi)
{
	int cpu;
	u_int bitmap = 0;
	u_int old_pending;
	u_int new_pending;

	/*
	 * Bitmapped IPIs share a single vector; the actual IPI id is
	 * carried in the target CPU's pending bitmap instead.
	 */
	if (IPI_IS_BITMAPED(ipi)) {
		bitmap = 1 << ipi;
		ipi = IPI_BITMAP_VECTOR;
	}

#ifdef STOP_NMI
	if (ipi == IPI_STOP && stop_cpus_with_nmi) {
		ipi_nmi_selected(cpus);
		return;
	}
#endif
	CTR3(KTR_SMP, "%s: cpus: %x ipi: %x", __func__, cpus, ipi);
	while ((cpu = ffs(cpus)) != 0) {
		cpu--;
		cpus &= ~(1 << cpu);

		KASSERT(cpu_apic_ids[cpu] != -1,
		    ("IPI to non-existent CPU %d", cpu));

		if (bitmap) {
			/* Atomically merge our bits into the target's set. */
			do {
				old_pending = cpu_ipi_pending[cpu];
				new_pending = old_pending | bitmap;
			} while (!atomic_cmpset_int(&cpu_ipi_pending[cpu],old_pending, new_pending));

			/*
			 * If bits were already pending, a bitmap IPI is
			 * already in flight to this CPU; skip sending
			 * another one.
			 */
			if (old_pending)
				continue;
		}

		lapic_ipi_vectored(ipi, cpu_apic_ids[cpu]);
	}

}

/*
 * send an IPI to all CPUs EXCEPT myself
 */
void
ipi_all_but_self(u_int ipi)
{

	/* Bitmapped IPIs and NMI-based stops need the per-CPU path. */
	if (IPI_IS_BITMAPED(ipi) || (ipi == IPI_STOP && stop_cpus_with_nmi)) {
		ipi_selected(PCPU_GET(other_cpus), ipi);
		return;
	}
	CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi);
	lapic_ipi_vectored(ipi, APIC_IPI_DEST_OTHERS);
}

#ifdef STOP_NMI
/*
 * send NMI IPI to selected CPUs
 */

#define BEFORE_SPIN 1000000

void
ipi_nmi_selected(u_int32_t cpus)
{
	int cpu;
	register_t icrlo;

	icrlo = APIC_DELMODE_NMI | APIC_DESTMODE_PHY | APIC_LEVEL_ASSERT
		| APIC_TRIGMOD_EDGE;

	CTR2(KTR_SMP, "%s: cpus: %x nmi", __func__, cpus);

	/* Flag every target as pending before any NMI can be delivered. */
	atomic_set_int(&ipi_nmi_pending, cpus);

	while ((cpu = ffs(cpus)) != 0) {
		cpu--;
		cpus &= ~(1 << cpu);

		KASSERT(cpu_apic_ids[cpu] != -1,
		    ("IPI NMI to non-existent CPU %d", cpu));

		/* Wait for an earlier IPI to finish.
		 */
		if (!lapic_ipi_wait(BEFORE_SPIN))
			panic("ipi_nmi_selected: previous IPI has not cleared");

		lapic_ipi_raw(icrlo, cpu_apic_ids[cpu]);
	}
}

/*
 * NMI entry hook: returns 0 if this NMI was one of our stop IPIs (and
 * handles it via cpustop_handler()), 1 to tell the caller it was some
 * other NMI.
 */
int
ipi_nmi_handler(void)
{
	int cpumask = PCPU_GET(cpumask);

	if (!(ipi_nmi_pending & cpumask))
		return 1;

	atomic_clear_int(&ipi_nmi_pending, cpumask);
	cpustop_handler();
	return 0;
}

#endif /* STOP_NMI */

/*
 * Handle an IPI_STOP by saving our current context and spinning until we
 * are resumed.
 */
void
cpustop_handler(void)
{
	int cpu = PCPU_GET(cpuid);
	int cpumask = PCPU_GET(cpumask);

	savectx(&stoppcbs[cpu]);

	/* Indicate that we are stopped */
	atomic_set_int(&stopped_cpus, cpumask);

	/* Wait for restart */
	while (!(started_cpus & cpumask))
	    ia32_pause();

	atomic_clear_int(&started_cpus, cpumask);
	atomic_clear_int(&stopped_cpus, cpumask);

	/* Only the BSP runs the optional restart callback. */
	if (cpu == 0 && cpustop_restartfunc != NULL) {
		cpustop_restartfunc();
		cpustop_restartfunc = NULL;
	}
}

/*
 * This is called once the rest of the system is up and running and we're
 * ready to let the AP's out of the pen.
 */
static void
release_aps(void *dummy __unused)
{

	if (mp_ncpus == 1)
		return;
	atomic_store_rel_int(&aps_ready, 1);
	while (smp_started == 0)
		ia32_pause();
}
SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, release_aps, NULL);

/*
 * Sysctl handler for machdep.hlt_cpus: accept a new halt mask, keep
 * hlt_logical_cpus coherent with it, and never allow the full CPU set
 * (in particular CPU 0) to be halted at once.
 */
static int
sysctl_hlt_cpus(SYSCTL_HANDLER_ARGS)
{
	u_int mask;
	int error;

	mask = hlt_cpus_mask;
	error = sysctl_handle_int(oidp, &mask, 0, req);
	if (error || !req->newptr)
		return (error);

	/* Track whether all logical (HT) CPUs are now halted. */
	if (logical_cpus_mask != 0 &&
	    (mask & logical_cpus_mask) == logical_cpus_mask)
		hlt_logical_cpus = 1;
	else
		hlt_logical_cpus = 0;

	if (!
 hyperthreading_allowed)
		mask |= hyperthreading_cpus_mask;

	/* Refuse to halt everything; keep CPU 0 runnable. */
	if ((mask & all_cpus) == all_cpus)
		mask &= ~(1<<0);
	hlt_cpus_mask = mask;
	return (error);
}
SYSCTL_PROC(_machdep, OID_AUTO, hlt_cpus, CTLTYPE_INT|CTLFLAG_RW,
    0, 0, sysctl_hlt_cpus, "IU",
    "Bitmap of CPUs to halt.  101 (binary) will halt CPUs 0 and 2.");

/*
 * Sysctl handler for machdep.hlt_logical_cpus: halt or resume all
 * logical (HT) CPUs at once by editing hlt_cpus_mask.
 */
static int
sysctl_hlt_logical_cpus(SYSCTL_HANDLER_ARGS)
{
	int disable, error;

	disable = hlt_logical_cpus;
	error = sysctl_handle_int(oidp, &disable, 0, req);
	if (error || !req->newptr)
		return (error);

	if (disable)
		hlt_cpus_mask |= logical_cpus_mask;
	else
		hlt_cpus_mask &= ~logical_cpus_mask;

	/* Hyperthreads stay halted while HT use is administratively off. */
	if (! hyperthreading_allowed)
		hlt_cpus_mask |= hyperthreading_cpus_mask;

	/* Refuse to halt everything; keep CPU 0 runnable. */
	if ((hlt_cpus_mask & all_cpus) == all_cpus)
		hlt_cpus_mask &= ~(1<<0);

	hlt_logical_cpus = disable;
	return (error);
}

/*
 * Sysctl handler for machdep.hyperthreading_allowed: enable or disable
 * use of the hyperthread siblings via hlt_cpus_mask.
 */
static int
sysctl_hyperthreading_allowed(SYSCTL_HANDLER_ARGS)
{
	int allowed, error;

	allowed = hyperthreading_allowed;
	error = sysctl_handle_int(oidp, &allowed, 0, req);
	if (error || !req->newptr)
		return (error);

#ifdef SCHED_ULE
	/*
	 * SCHED_ULE doesn't allow enabling/disabling HT cores at
	 * run time.
	 */
	if (allowed != hyperthreading_allowed)
		return (ENOTSUP);
	return (error);
#endif

	if (allowed)
		hlt_cpus_mask &= ~hyperthreading_cpus_mask;
	else
		hlt_cpus_mask |= hyperthreading_cpus_mask;

	/* Keep the hlt_logical_cpus flag coherent with the new mask. */
	if (logical_cpus_mask != 0 &&
	    (hlt_cpus_mask & logical_cpus_mask) == logical_cpus_mask)
		hlt_logical_cpus = 1;
	else
		hlt_logical_cpus = 0;

	/* Refuse to halt everything; keep CPU 0 runnable. */
	if ((hlt_cpus_mask & all_cpus) == all_cpus)
		hlt_cpus_mask &= ~(1<<0);

	hyperthreading_allowed = allowed;
	return (error);
}

/*
 * Boot-time setup: if logical (HT) CPUs exist, register the sysctls
 * above and apply the machdep.hlt_logical_cpus tunable.
 */
static void
cpu_hlt_setup(void *dummy __unused)
{

	if (logical_cpus_mask != 0) {
		TUNABLE_INT_FETCH("machdep.hlt_logical_cpus",
		    &hlt_logical_cpus);
		sysctl_ctx_init(&logical_cpu_clist);
		SYSCTL_ADD_PROC(&logical_cpu_clist,
		    SYSCTL_STATIC_CHILDREN(_machdep), OID_AUTO,
		    "hlt_logical_cpus", CTLTYPE_INT|CTLFLAG_RW, 0, 0,
		    sysctl_hlt_logical_cpus, "IU", "");
		SYSCTL_ADD_UINT(&logical_cpu_clist,
		    SYSCTL_STATIC_CHILDREN(_machdep), OID_AUTO,
		    "logical_cpus_mask", CTLTYPE_INT|CTLFLAG_RD,
		    &logical_cpus_mask, 0, "");

		if (hlt_logical_cpus)
			hlt_cpus_mask |= logical_cpus_mask;

		/*
		 * If necessary for security purposes, force
		 * hyperthreading off, regardless of the value
		 * of hlt_logical_cpus.
		 */
		if (hyperthreading_cpus_mask) {
			SYSCTL_ADD_PROC(&logical_cpu_clist,
			    SYSCTL_STATIC_CHILDREN(_machdep), OID_AUTO,
			    "hyperthreading_allowed", CTLTYPE_INT|CTLFLAG_RW,
			    0, 0, sysctl_hyperthreading_allowed, "IU", "");
			if (!
hyperthreading_allowed) 1457 hlt_cpus_mask |= hyperthreading_cpus_mask; 1458 } 1459 } 1460} 1461SYSINIT(cpu_hlt, SI_SUB_SMP, SI_ORDER_ANY, cpu_hlt_setup, NULL); 1462 1463int 1464mp_grab_cpu_hlt(void) 1465{ 1466 u_int mask = PCPU_GET(cpumask); 1467#ifdef MP_WATCHDOG 1468 u_int cpuid = PCPU_GET(cpuid); 1469#endif 1470 int retval; 1471 1472#ifdef MP_WATCHDOG 1473 ap_watchdog(cpuid); 1474#endif 1475 1476 retval = mask & hlt_cpus_mask; 1477 while (mask & hlt_cpus_mask) 1478 __asm __volatile("sti; hlt" : : : "memory"); 1479 return (retval); 1480} 1481 1482#ifdef COUNT_IPIS 1483/* 1484 * Setup interrupt counters for IPI handlers. 1485 */ 1486static void 1487mp_ipi_intrcnt(void *dummy) 1488{ 1489 char buf[64]; 1490 int i; 1491 1492 for (i = 0; i < mp_maxid; i++) { 1493 if (CPU_ABSENT(i)) 1494 continue; 1495 snprintf(buf, sizeof(buf), "cpu%d: invltlb", i); 1496 intrcnt_add(buf, &ipi_invltlb_counts[i]); 1497 snprintf(buf, sizeof(buf), "cpu%d: invlrng", i); 1498 intrcnt_add(buf, &ipi_invlrng_counts[i]); 1499 snprintf(buf, sizeof(buf), "cpu%d: invlpg", i); 1500 intrcnt_add(buf, &ipi_invlpg_counts[i]); 1501 snprintf(buf, sizeof(buf), "cpu%d: preempt", i); 1502 intrcnt_add(buf, &ipi_preempt_counts[i]); 1503 snprintf(buf, sizeof(buf), "cpu%d: ast", i); 1504 intrcnt_add(buf, &ipi_ast_counts[i]); 1505 snprintf(buf, sizeof(buf), "cpu%d: rendezvous", i); 1506 intrcnt_add(buf, &ipi_rendezvous_counts[i]); 1507 snprintf(buf, sizeof(buf), "cpu%d: lazypmap", i); 1508 intrcnt_add(buf, &ipi_lazypmap_counts[i]); 1509 } 1510} 1511SYSINIT(mp_ipi_intrcnt, SI_SUB_INTR, SI_ORDER_MIDDLE, mp_ipi_intrcnt, NULL); 1512#endif 1513