mp_machdep.c revision 214630
1/*- 2 * Copyright (c) 1996, by Steve Passe 3 * Copyright (c) 2003, by Peter Wemm 4 * All rights reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 2. The name of the developer may NOT be used to endorse or promote products 12 * derived from this software without specific prior written permission. 13 * 14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 24 * SUCH DAMAGE. 
25 */ 26 27#include <sys/cdefs.h> 28__FBSDID("$FreeBSD: head/sys/amd64/amd64/mp_machdep.c 214630 2010-11-01 17:40:35Z jhb $"); 29 30#include "opt_cpu.h" 31#include "opt_kstack_pages.h" 32#include "opt_mp_watchdog.h" 33#include "opt_sched.h" 34#include "opt_smp.h" 35 36#include <sys/param.h> 37#include <sys/systm.h> 38#include <sys/bus.h> 39#ifdef GPROF 40#include <sys/gmon.h> 41#endif 42#include <sys/kernel.h> 43#include <sys/ktr.h> 44#include <sys/lock.h> 45#include <sys/malloc.h> 46#include <sys/memrange.h> 47#include <sys/mutex.h> 48#include <sys/pcpu.h> 49#include <sys/proc.h> 50#include <sys/sched.h> 51#include <sys/smp.h> 52#include <sys/sysctl.h> 53 54#include <vm/vm.h> 55#include <vm/vm_param.h> 56#include <vm/pmap.h> 57#include <vm/vm_kern.h> 58#include <vm/vm_extern.h> 59 60#include <machine/apicreg.h> 61#include <machine/clock.h> 62#include <machine/cputypes.h> 63#include <machine/cpufunc.h> 64#include <x86/mca.h> 65#include <machine/md_var.h> 66#include <machine/mp_watchdog.h> 67#include <machine/pcb.h> 68#include <machine/psl.h> 69#include <machine/smp.h> 70#include <machine/specialreg.h> 71#include <machine/tss.h> 72 73#define WARMBOOT_TARGET 0 74#define WARMBOOT_OFF (KERNBASE + 0x0467) 75#define WARMBOOT_SEG (KERNBASE + 0x0469) 76 77#define CMOS_REG (0x70) 78#define CMOS_DATA (0x71) 79#define BIOS_RESET (0x0f) 80#define BIOS_WARM (0x0a) 81 82/* lock region used by kernel profiling */ 83int mcount_lock; 84 85int mp_naps; /* # of Applications processors */ 86int boot_cpu_id = -1; /* designated BSP */ 87 88extern struct pcpu __pcpu[]; 89 90/* AP uses this during bootstrap. Do not staticize. */ 91char *bootSTK; 92static int bootAP; 93 94/* Free these after use */ 95void *bootstacks[MAXCPU]; 96 97/* Temporary variables for init_secondary() */ 98char *doublefault_stack; 99char *nmi_stack; 100void *dpcpu; 101 102struct pcb stoppcbs[MAXCPU]; 103struct pcb **susppcbs = NULL; 104 105/* Variables needed for SMP tlb shootdown. 
*/ 106vm_offset_t smp_tlb_addr1; 107vm_offset_t smp_tlb_addr2; 108volatile int smp_tlb_wait; 109 110#ifdef COUNT_IPIS 111/* Interrupt counts. */ 112static u_long *ipi_preempt_counts[MAXCPU]; 113static u_long *ipi_ast_counts[MAXCPU]; 114u_long *ipi_invltlb_counts[MAXCPU]; 115u_long *ipi_invlrng_counts[MAXCPU]; 116u_long *ipi_invlpg_counts[MAXCPU]; 117u_long *ipi_invlcache_counts[MAXCPU]; 118u_long *ipi_rendezvous_counts[MAXCPU]; 119u_long *ipi_lazypmap_counts[MAXCPU]; 120static u_long *ipi_hardclock_counts[MAXCPU]; 121#endif 122 123extern inthand_t IDTVEC(fast_syscall), IDTVEC(fast_syscall32); 124 125/* 126 * Local data and functions. 127 */ 128 129static volatile cpumask_t ipi_nmi_pending; 130 131/* used to hold the AP's until we are ready to release them */ 132static struct mtx ap_boot_mtx; 133 134/* Set to 1 once we're ready to let the APs out of the pen. */ 135static volatile int aps_ready = 0; 136 137/* 138 * Store data from cpu_add() until later in the boot when we actually setup 139 * the APs. 
 */
struct cpu_info {
	int	cpu_present:1;
	int	cpu_bsp:1;
	int	cpu_disabled:1;
	int	cpu_hyperthread:1;
} static cpu_info[MAX_APIC_ID + 1];
int cpu_apic_ids[MAXCPU];
int apic_cpuids[MAX_APIC_ID + 1];

/* Holds pending bitmap based IPIs per CPU */
static volatile u_int cpu_ipi_pending[MAXCPU];

static u_int boot_address;
static int cpu_logical;			/* logical cpus per core */
static int cpu_cores;			/* cores per package */

static void	assign_cpu_ids(void);
static void	set_interrupt_apic_ids(void);
static int	start_all_aps(void);
static int	start_ap(int apic_id);
static void	release_aps(void *dummy);

static int hlt_logical_cpus;
static u_int hyperthreading_cpus;	/* logical cpus sharing L1 cache */
static cpumask_t hyperthreading_cpus_mask;
static int hyperthreading_allowed = 1;
static struct sysctl_ctx_list logical_cpu_clist;
static u_int bootMP_size;

/*
 * Propagate the BSP's memory range attributes to a newly started AP
 * via the attached mem_range driver, if any.
 */
static void
mem_range_AP_init(void)
{
	if (mem_range_softc.mr_op && mem_range_softc.mr_op->initAP)
		mem_range_softc.mr_op->initAP(&mem_range_softc);
}

/*
 * Topology probe for AMD CPUs: cores-per-package comes from the CMP
 * core count CPUID bits; one logical CPU per core.
 */
static void
topo_probe_amd(void)
{

	/* AMD processors do not support HTT. */
	cpu_cores = (amd_feature2 & AMDID2_CMP) != 0 ?
	    (cpu_procinfo2 & AMDID_CMP_CORES) + 1 : 1;
	cpu_logical = 1;
}

/*
 * Round up to the next power of two, if necessary, and then
 * take log2.
 * Returns -1 if argument is zero.
 */
static __inline int
mask_width(u_int x)
{

	return (fls(x << (1 - powerof2(x))) - 1);
}

/*
 * Topology probe via CPUID leaf 4 (deterministic cache parameters):
 * derives cpu_cores and cpu_logical by counting present logical CPUs
 * that share the BSP's package and core ID bits.
 */
static void
topo_probe_0x4(void)
{
	u_int p[4];
	int pkg_id_bits;
	int core_id_bits;
	int max_cores;
	int max_logical;
	int id;

	/* Both zero and one here mean one logical processor per package. */
	max_logical = (cpu_feature & CPUID_HTT) != 0 ?
	    (cpu_procinfo & CPUID_HTT_CORES) >> 16 : 1;
	if (max_logical <= 1)
		return;

	/*
	 * Because of uniformity assumption we examine only
	 * those logical processors that belong to the same
	 * package as BSP.  Further, we count number of
	 * logical processors that belong to the same core
	 * as BSP thus deducing number of threads per core.
	 */
	cpuid_count(0x04, 0, p);
	max_cores = ((p[0] >> 26) & 0x3f) + 1;
	core_id_bits = mask_width(max_logical/max_cores);
	if (core_id_bits < 0)
		return;
	pkg_id_bits = core_id_bits + mask_width(max_cores);

	for (id = 0; id <= MAX_APIC_ID; id++) {
		/* Check logical CPU availability. */
		if (!cpu_info[id].cpu_present || cpu_info[id].cpu_disabled)
			continue;
		/* Check if logical CPU has the same package ID. */
		if ((id >> pkg_id_bits) != (boot_cpu_id >> pkg_id_bits))
			continue;
		cpu_cores++;
		/* Check if logical CPU has the same package and core IDs. */
		if ((id >> core_id_bits) == (boot_cpu_id >> core_id_bits))
			cpu_logical++;
	}

	/*
	 * cpu_cores counted every logical CPU in the package above;
	 * divide by threads-per-core to get true core count.
	 */
	cpu_cores /= cpu_logical;
	hyperthreading_cpus = cpu_logical;
}

/*
 * Topology probe via CPUID leaf 11 (x2APIC extended topology); falls
 * back to topo_probe_0x4() if the leaf is not actually implemented.
 */
static void
topo_probe_0xb(void)
{
	u_int p[4];
	int bits;
	int cnt;
	int i;
	int logical;
	int type;
	int x;

	/* We only support three levels for now. */
	for (i = 0; i < 3; i++) {
		cpuid_count(0x0b, i, p);

		/* Fall back if CPU leaf 11 doesn't really exist. */
		if (i == 0 && p[1] == 0) {
			topo_probe_0x4();
			return;
		}

		bits = p[0] & 0x1f;
		logical = p[1] &= 0xffff;
		type = (p[2] >> 8) & 0xff;
		if (type == 0 || logical == 0)
			break;
		/*
		 * Because of uniformity assumption we examine only
		 * those logical processors that belong to the same
		 * package as BSP.
		 */
		for (cnt = 0, x = 0; x <= MAX_APIC_ID; x++) {
			if (!cpu_info[x].cpu_present ||
			    cpu_info[x].cpu_disabled)
				continue;
			if (x >> bits == boot_cpu_id >> bits)
				cnt++;
		}
		if (type == CPUID_TYPE_SMT)
			cpu_logical = cnt;
		else if (type == CPUID_TYPE_CORE)
			cpu_cores = cnt;
	}
	if (cpu_logical == 0)
		cpu_logical = 1;
	/* The CORE level counted logical CPUs; reduce to cores. */
	cpu_cores /= cpu_logical;
}

/*
 * Both topology discovery code and code that consumes topology
 * information assume top-down uniformity of the topology.
 * That is, all physical packages must be identical and each
 * core in a package must have the same number of threads.
 * Topology information is queried only on BSP, on which this
 * code runs and for which it can query CPUID information.
 * Then topology is extrapolated on all packages using the
 * uniformity assumption.
 */
static void
topo_probe(void)
{
	static int cpu_topo_probed = 0;	/* run-once latch */

	if (cpu_topo_probed)
		return;

	logical_cpus_mask = 0;
	if (cpu_vendor_id == CPU_VENDOR_AMD)
		topo_probe_amd();
	else if (cpu_vendor_id == CPU_VENDOR_INTEL) {
		/*
		 * See Intel(R) 64 Architecture Processor
		 * Topology Enumeration article for details.
		 *
		 * Note that 0x1 <= cpu_high < 4 case should be
		 * compatible with topo_probe_0x4() logic when
		 * CPUID.1:EBX[23:16] > 0 (cpu_cores will be 1)
		 * or it should trigger the fallback otherwise.
		 */
		if (cpu_high >= 0xb)
			topo_probe_0xb();
		else if (cpu_high >= 0x1)
			topo_probe_0x4();
	}

	/*
	 * Fallback: assume each logical CPU is in separate
	 * physical package.  That is, no multi-core, no SMT.
	 */
	if (cpu_cores == 0)
		cpu_cores = 1;
	if (cpu_logical == 0)
		cpu_logical = 1;
	cpu_topo_probed = 1;
}

/*
 * Build the scheduler's view of the CPU topology from the probed
 * package/core/thread counts.  Degrades to a flat topology when the
 * CPU count is not an exact multiple of cores-per-package times
 * threads-per-core (non-uniform system).
 */
struct cpu_group *
cpu_topo(void)
{
	int cg_flags;

	/*
	 * Determine whether any threading flags are
	 * necessary.
	 */
	topo_probe();
	if (cpu_logical > 1 && hyperthreading_cpus)
		cg_flags = CG_FLAG_HTT;
	else if (cpu_logical > 1)
		cg_flags = CG_FLAG_SMT;
	else
		cg_flags = 0;
	if (mp_ncpus % (cpu_cores * cpu_logical) != 0) {
		printf("WARNING: Non-uniform processors.\n");
		printf("WARNING: Using suboptimal topology.\n");
		return (smp_topo_none());
	}
	/*
	 * No multi-core or hyper-threaded.
	 */
	if (cpu_logical * cpu_cores == 1)
		return (smp_topo_none());
	/*
	 * Only HTT no multi-core.
	 */
	if (cpu_logical > 1 && cpu_cores == 1)
		return (smp_topo_1level(CG_SHARE_L1, cpu_logical, cg_flags));
	/*
	 * Only multi-core no HTT.
	 */
	if (cpu_cores > 1 && cpu_logical == 1)
		return (smp_topo_1level(CG_SHARE_L2, cpu_cores, cg_flags));
	/*
	 * Both HTT and multi-core.
	 */
	return (smp_topo_2level(CG_SHARE_L2, cpu_cores,
	    CG_SHARE_L1, cpu_logical, cg_flags));
}

/*
 * Calculate usable address in base memory for AP trampoline code.
 */
u_int
mp_bootaddress(u_int basemem)
{

	bootMP_size = mptramp_end - mptramp_start;
	boot_address = trunc_page(basemem * 1024); /* round down to 4k boundary */
	if (((basemem * 1024) - boot_address) < bootMP_size)
		boot_address -= PAGE_SIZE;	/* not enough, lower by 4k */
	/* 3 levels of page table pages */
	mptramp_pagetables = boot_address - (PAGE_SIZE * 3);

	/* Returns the lowest address consumed: the trampoline page tables. */
	return mptramp_pagetables;
}

/*
 * Record a CPU discovered by the MP table / ACPI enumeration code.
 * Marks the APIC ID present in cpu_info[], remembers the BSP when
 * boot_cpu is set, and bumps mp_ncpus/mp_maxid (capped at MAXCPU).
 */
void
cpu_add(u_int apic_id, char boot_cpu)
{

	if (apic_id > MAX_APIC_ID) {
		panic("SMP: APIC ID %d too high", apic_id);
		return;		/* NOTREACHED: panic() does not return */
	}
	KASSERT(cpu_info[apic_id].cpu_present == 0, ("CPU %d added twice",
	    apic_id));
	cpu_info[apic_id].cpu_present = 1;
	if (boot_cpu) {
		KASSERT(boot_cpu_id == -1,
		    ("CPU %d claims to be BSP, but CPU %d already is", apic_id,
		    boot_cpu_id));
		boot_cpu_id = apic_id;
		cpu_info[apic_id].cpu_bsp = 1;
	}
	if (mp_ncpus < MAXCPU) {
		mp_ncpus++;
		mp_maxid = mp_ncpus - 1;
	}
	if (bootverbose)
		printf("SMP: Added CPU %d (%s)\n", apic_id, boot_cpu ? "BSP" :
		    "AP");
}

void
cpu_mp_setmaxid(void)
{

	/*
	 * mp_maxid should be already set by calls to cpu_add().
	 * Just sanity check its value here.
	 */
	if (mp_ncpus == 0)
		KASSERT(mp_maxid == 0,
		    ("%s: mp_ncpus is zero, but mp_maxid is not", __func__));
	else if (mp_ncpus == 1)
		mp_maxid = 0;
	else
		KASSERT(mp_maxid >= mp_ncpus - 1,
		    ("%s: counters out of sync: max %d, count %d", __func__,
		    mp_maxid, mp_ncpus));
}

/*
 * Returns non-zero when the system should be started as SMP
 * (two or more CPUs found), zero for a UP system.
 */
int
cpu_mp_probe(void)
{

	/*
	 * Always record BSP in CPU map so that the mbuf init code works
	 * correctly.
	 */
	all_cpus = 1;
	if (mp_ncpus == 0) {
		/*
		 * No CPUs were found, so this must be a UP system.  Setup
		 * the variables to represent a system with a single CPU
		 * with an id of 0.
		 */
		mp_ncpus = 1;
		return (0);
	}

	/* At least one CPU was found. */
	if (mp_ncpus == 1) {
		/*
		 * One CPU was found, so this must be a UP system with
		 * an I/O APIC.
		 */
		mp_maxid = 0;
		return (0);
	}

	/* At least two CPUs were found. */
	return (1);
}

/*
 * Initialize the IPI handlers and start up the AP's.
 */
void
cpu_mp_start(void)
{
	int i;

	/* Initialize the logical ID to APIC ID table. */
	for (i = 0; i < MAXCPU; i++) {
		cpu_apic_ids[i] = -1;
		cpu_ipi_pending[i] = 0;
	}

	/* Install an inter-CPU IPI for TLB invalidation */
	setidt(IPI_INVLTLB, IDTVEC(invltlb), SDT_SYSIGT, SEL_KPL, 0);
	setidt(IPI_INVLPG, IDTVEC(invlpg), SDT_SYSIGT, SEL_KPL, 0);
	setidt(IPI_INVLRNG, IDTVEC(invlrng), SDT_SYSIGT, SEL_KPL, 0);

	/* Install an inter-CPU IPI for cache invalidation. */
	setidt(IPI_INVLCACHE, IDTVEC(invlcache), SDT_SYSIGT, SEL_KPL, 0);

	/* Install an inter-CPU IPI for all-CPU rendezvous */
	setidt(IPI_RENDEZVOUS, IDTVEC(rendezvous), SDT_SYSIGT, SEL_KPL, 0);

	/* Install generic inter-CPU IPI handler */
	setidt(IPI_BITMAP_VECTOR, IDTVEC(ipi_intr_bitmap_handler),
	    SDT_SYSIGT, SEL_KPL, 0);

	/* Install an inter-CPU IPI for CPU stop/restart */
	setidt(IPI_STOP, IDTVEC(cpustop), SDT_SYSIGT, SEL_KPL, 0);

	/* Install an inter-CPU IPI for CPU suspend/resume */
	setidt(IPI_SUSPEND, IDTVEC(cpususpend), SDT_SYSIGT, SEL_KPL, 0);

	/* Set boot_cpu_id if needed. */
	if (boot_cpu_id == -1) {
		boot_cpu_id = PCPU_GET(apic_id);
		cpu_info[boot_cpu_id].cpu_bsp = 1;
	} else
		KASSERT(boot_cpu_id == PCPU_GET(apic_id),
		    ("BSP's APIC ID doesn't match boot_cpu_id"));

	/* Probe logical/physical core configuration. */
	topo_probe();

	assign_cpu_ids();

	/* Start each Application Processor */
	start_all_aps();

	set_interrupt_apic_ids();
}


/*
 * Print various information about the SMP system hardware and setup.
 */
void
cpu_mp_announce(void)
{
	const char *hyperthread;
	int i;

	printf("FreeBSD/SMP: %d package(s) x %d core(s)",
	    mp_ncpus / (cpu_cores * cpu_logical), cpu_cores);
	if (hyperthreading_cpus > 1)
	    printf(" x %d HTT threads", cpu_logical);
	else if (cpu_logical > 1)
	    printf(" x %d SMT threads", cpu_logical);
	printf("\n");

	/* List active CPUs first. */
	printf(" cpu0 (BSP): APIC ID: %2d\n", boot_cpu_id);
	for (i = 1; i < mp_ncpus; i++) {
		if (cpu_info[cpu_apic_ids[i]].cpu_hyperthread)
			hyperthread = "/HT";
		else
			hyperthread = "";
		printf(" cpu%d (AP%s): APIC ID: %2d\n", i, hyperthread,
		    cpu_apic_ids[i]);
	}

	/* List disabled CPUs last. */
	for (i = 0; i <= MAX_APIC_ID; i++) {
		if (!cpu_info[i].cpu_present || !cpu_info[i].cpu_disabled)
			continue;
		if (cpu_info[i].cpu_hyperthread)
			hyperthread = "/HT";
		else
			hyperthread = "";
		printf(" cpu (AP%s): APIC ID: %2d (disabled)\n", hyperthread,
		    i);
	}
}

/*
 * AP CPU's call this to initialize themselves.
 *
 * Entered from the AP trampoline with paging enabled.  Sets up this
 * CPU's TSS, GDT, per-CPU data, MSRs and control registers, waits for
 * the BSP to release the APs, and finally enters the scheduler via
 * sched_throw() — this function never returns.
 */
void
init_secondary(void)
{
	struct pcpu *pc;
	struct nmi_pcpu *np;
	u_int64_t msr, cr0;
	int cpu, gsel_tss, x;
	struct region_descriptor ap_gdt;

	/* Set by the startup code for us to use */
	cpu = bootAP;

	/* Init tss */
	common_tss[cpu] = common_tss[0];
	common_tss[cpu].tss_rsp0 = 0; /* not used until after switch */
	common_tss[cpu].tss_iobase = sizeof(struct amd64tss) +
	    IOPAGES * PAGE_SIZE;
	/* Double faults run on a dedicated per-AP stack via IST1. */
	common_tss[cpu].tss_ist1 = (long)&doublefault_stack[PAGE_SIZE];

	/* The NMI stack runs on IST2. */
	np = ((struct nmi_pcpu *) &nmi_stack[PAGE_SIZE]) - 1;
	common_tss[cpu].tss_ist2 = (long) np;

	/* Prepare private GDT */
	gdt_segs[GPROC0_SEL].ssd_base = (long) &common_tss[cpu];
	for (x = 0; x < NGDT; x++) {
		/*
		 * Skip the double-wide system descriptors (TSS and LDT);
		 * the TSS one is installed below with ssdtosyssd().
		 */
		if (x != GPROC0_SEL && x != (GPROC0_SEL + 1) &&
		    x != GUSERLDT_SEL && x != (GUSERLDT_SEL + 1))
			ssdtosd(&gdt_segs[x], &gdt[NGDT * cpu + x]);
	}
	ssdtosyssd(&gdt_segs[GPROC0_SEL],
	    (struct system_segment_descriptor *)&gdt[NGDT * cpu + GPROC0_SEL]);
	ap_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
	ap_gdt.rd_base = (long) &gdt[NGDT * cpu];
	lgdt(&ap_gdt);			/* does magic intra-segment return */

	/* Get per-cpu data */
	pc = &__pcpu[cpu];

	/* prime data page for it to use */
	pcpu_init(pc, cpu, sizeof(struct pcpu));
	dpcpu_init(dpcpu, cpu);
	pc->pc_apic_id = cpu_apic_ids[cpu];
	pc->pc_prvspace = pc;
	pc->pc_curthread = 0;
	pc->pc_tssp = &common_tss[cpu];
	pc->pc_commontssp = &common_tss[cpu];
	pc->pc_rsp0 = 0;
	pc->pc_tss = (struct system_segment_descriptor *)&gdt[NGDT * cpu +
	    GPROC0_SEL];
	pc->pc_fs32p = &gdt[NGDT * cpu + GUFS32_SEL];
	pc->pc_gs32p = &gdt[NGDT * cpu + GUGS32_SEL];
	pc->pc_ldt = (struct system_segment_descriptor *)&gdt[NGDT * cpu +
	    GUSERLDT_SEL];

	/* Save the per-cpu pointer for use by the NMI handler. */
	np->np_pcpu = (register_t) pc;

	wrmsr(MSR_FSBASE, 0);		/* User value */
	wrmsr(MSR_GSBASE, (u_int64_t)pc);
	wrmsr(MSR_KGSBASE, (u_int64_t)pc);	/* XXX User value while we're in the kernel */

	lidt(&r_idt);

	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
	ltr(gsel_tss);

	/*
	 * Set to a known state:
	 * Set by mpboot.s: CR0_PG, CR0_PE
	 * Set by cpu_setregs: CR0_NE, CR0_MP, CR0_TS, CR0_WP, CR0_AM
	 */
	cr0 = rcr0();
	cr0 &= ~(CR0_CD | CR0_NW | CR0_EM);
	load_cr0(cr0);

	/* Set up the fast syscall stuff */
	msr = rdmsr(MSR_EFER) | EFER_SCE;
	wrmsr(MSR_EFER, msr);
	wrmsr(MSR_LSTAR, (u_int64_t)IDTVEC(fast_syscall));
	wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32));
	msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
	    ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48);
	wrmsr(MSR_STAR, msr);
	wrmsr(MSR_SF_MASK, PSL_NT|PSL_T|PSL_I|PSL_C|PSL_D);

	/* Disable local APIC just to be sure. */
	lapic_disable();

	/* signal our startup to the BSP. */
	mp_naps++;

	/* Spin until the BSP releases the AP's. */
	while (!aps_ready)
		ia32_pause();

	/* Initialize the PAT MSR. */
	pmap_init_pat();

	/* set up CPU registers and state */
	cpu_setregs();

	/* set up SSE/NX registers */
	initializecpu();

	/* set up FPU state on the AP */
	fpuinit();

	/* A quick check from sanity claus */
	if (PCPU_GET(apic_id) != lapic_id()) {
		printf("SMP: cpuid = %d\n", PCPU_GET(cpuid));
		printf("SMP: actual apic_id = %d\n", lapic_id());
		printf("SMP: correct apic_id = %d\n", PCPU_GET(apic_id));
		panic("cpuid mismatch! boom!!");
	}

	/* Initialize curthread. */
	KASSERT(PCPU_GET(idlethread) != NULL, ("no idle thread"));
	PCPU_SET(curthread, PCPU_GET(idlethread));

	mca_init();

	mtx_lock_spin(&ap_boot_mtx);

	/* Init local apic for irq's */
	lapic_setup(1);

	/* Set memory range attributes for this CPU to match the BSP */
	mem_range_AP_init();

	smp_cpus++;

	CTR1(KTR_SMP, "SMP: AP CPU #%d Launched", PCPU_GET(cpuid));
	printf("SMP: AP CPU #%d Launched!\n", PCPU_GET(cpuid));

	/* Determine if we are a logical CPU. */
	/* XXX Calculation depends on cpu_logical being a power of 2, e.g. 2 */
	if (cpu_logical > 1 && PCPU_GET(apic_id) % cpu_logical != 0)
		logical_cpus_mask |= PCPU_GET(cpumask);

	/* Determine if we are a hyperthread. */
	if (hyperthreading_cpus > 1 &&
	    PCPU_GET(apic_id) % hyperthreading_cpus != 0)
		hyperthreading_cpus_mask |= PCPU_GET(cpumask);

	/* Build our map of 'other' CPUs. */
	PCPU_SET(other_cpus, all_cpus & ~PCPU_GET(cpumask));

	if (bootverbose)
		lapic_dump("AP");

	/* The last AP to come up flips the global "SMP is running" switch. */
	if (smp_cpus == mp_ncpus) {
		/* enable IPI's, tlb shootdown, freezes etc */
		atomic_store_rel_int(&smp_started, 1);
		smp_active = 1;	 /* historic */
	}

	/*
	 * Enable global pages TLB extension
	 * This also implicitly flushes the TLB
	 */

	load_cr4(rcr4() | CR4_PGE);
	load_ds(_udatasel);
	load_es(_udatasel);
	load_fs(_ufssel);
	mtx_unlock_spin(&ap_boot_mtx);

	/* Wait until all the AP's are up. */
	while (smp_started == 0)
		ia32_pause();

	/* Start per-CPU event timers. */
	cpu_initclocks_ap();

	sched_throw(NULL);

	panic("scheduler returned us to %s", __func__);
	/* NOTREACHED */
}

/*******************************************************************
 * local functions and data
 */

/*
 * We tell the I/O APIC code about all the CPUs we want to receive
 * interrupts.
If we don't want certain CPUs to receive IRQs we
 * can simply not tell the I/O APIC code about them in this function.
 * We also do not tell it about the BSP since it tells itself about
 * the BSP internally to work with UP kernels and on UP machines.
 */
static void
set_interrupt_apic_ids(void)
{
	u_int i, apic_id;

	for (i = 0; i < MAXCPU; i++) {
		apic_id = cpu_apic_ids[i];
		if (apic_id == -1)
			continue;
		if (cpu_info[apic_id].cpu_bsp)
			continue;
		if (cpu_info[apic_id].cpu_disabled)
			continue;

		/* Don't let hyperthreads service interrupts. */
		if (hyperthreading_cpus > 1 &&
		    apic_id % hyperthreading_cpus != 0)
			continue;

		intr_add_cpu(i);
	}
}

/*
 * Assign logical CPU IDs to local APICs.
 */
static void
assign_cpu_ids(void)
{
	u_int i;

	TUNABLE_INT_FETCH("machdep.hyperthreading_allowed",
	    &hyperthreading_allowed);

	/* Check for explicitly disabled CPUs. */
	for (i = 0; i <= MAX_APIC_ID; i++) {
		if (!cpu_info[i].cpu_present || cpu_info[i].cpu_bsp)
			continue;

		if (hyperthreading_cpus > 1 && i % hyperthreading_cpus != 0) {
			cpu_info[i].cpu_hyperthread = 1;
#if defined(SCHED_ULE)
			/*
			 * Don't use HT CPU if it has been disabled by a
			 * tunable.
			 */
			if (hyperthreading_allowed == 0) {
				cpu_info[i].cpu_disabled = 1;
				continue;
			}
#endif
		}

		/* Don't use this CPU if it has been disabled by a tunable. */
		if (resource_disabled("lapic", i)) {
			cpu_info[i].cpu_disabled = 1;
			continue;
		}
	}

	/*
	 * Assign CPU IDs to local APIC IDs and disable any CPUs
	 * beyond MAXCPU.  CPU 0 is always assigned to the BSP.
	 *
	 * To minimize confusion for userland, we attempt to number
	 * CPUs such that all threads and cores in a package are
	 * grouped together.  For now we assume that the BSP is always
	 * the first thread in a package and just start adding APs
	 * starting with the BSP's APIC ID.
	 */
	mp_ncpus = 1;
	cpu_apic_ids[0] = boot_cpu_id;
	apic_cpuids[boot_cpu_id] = 0;
	/* Walk APIC IDs circularly, wrapping at MAX_APIC_ID back to 0. */
	for (i = boot_cpu_id + 1; i != boot_cpu_id;
	     i == MAX_APIC_ID ? i = 0 : i++) {
		if (!cpu_info[i].cpu_present || cpu_info[i].cpu_bsp ||
		    cpu_info[i].cpu_disabled)
			continue;

		if (mp_ncpus < MAXCPU) {
			cpu_apic_ids[mp_ncpus] = i;
			apic_cpuids[i] = mp_ncpus;
			mp_ncpus++;
		} else
			cpu_info[i].cpu_disabled = 1;
	}
	KASSERT(mp_maxid >= mp_ncpus - 1,
	    ("%s: counters out of sync: max %d, count %d", __func__, mp_maxid,
	    mp_ncpus));
}

/*
 * start each AP in our list
 *
 * Copies the trampoline into base memory, builds the identity-mapped
 * startup page tables, arms the BIOS warm-reset vector, then boots
 * each AP in turn via start_ap().  Returns the number of APs that
 * actually announced themselves (mp_naps).
 */
static int
start_all_aps(void)
{
	vm_offset_t va = boot_address + KERNBASE;
	u_int64_t *pt4, *pt3, *pt2;
	u_int32_t mpbioswarmvec;
	int apic_id, cpu, i;
	u_char mpbiosreason;

	mtx_init(&ap_boot_mtx, "ap boot", NULL, MTX_SPIN);

	/* install the AP 1st level boot code */
	pmap_kenter(va, boot_address);
	pmap_invalidate_page(kernel_pmap, va);
	bcopy(mptramp_start, (void *)va, bootMP_size);

	/* Locate the page tables, they'll be below the trampoline */
	pt4 = (u_int64_t *)(uintptr_t)(mptramp_pagetables + KERNBASE);
	pt3 = pt4 + (PAGE_SIZE) / sizeof(u_int64_t);
	pt2 = pt3 + (PAGE_SIZE) / sizeof(u_int64_t);

	/* Create the initial 1GB replicated page tables */
	for (i = 0; i < 512; i++) {
		/* Each slot of the level 4 pages points to the same level 3 page */
		pt4[i] = (u_int64_t)(uintptr_t)(mptramp_pagetables + PAGE_SIZE);
		pt4[i] |= PG_V | PG_RW | PG_U;

		/* Each slot of the level 3 pages points to the same level 2 page */
		pt3[i] = (u_int64_t)(uintptr_t)(mptramp_pagetables + (2 * PAGE_SIZE));
		pt3[i] |= PG_V | PG_RW | PG_U;

		/* The level 2 page slots are mapped with 2MB pages for 1GB. */
		pt2[i] = i * (2 * 1024 * 1024);
		pt2[i] |= PG_V | PG_RW | PG_PS | PG_U;
	}

	/* save the current value of the warm-start vector */
	mpbioswarmvec = *((u_int32_t *) WARMBOOT_OFF);
	outb(CMOS_REG, BIOS_RESET);
	mpbiosreason = inb(CMOS_DATA);

	/* setup a vector to our boot code */
	*((volatile u_short *) WARMBOOT_OFF) = WARMBOOT_TARGET;
	*((volatile u_short *) WARMBOOT_SEG) = (boot_address >> 4);
	outb(CMOS_REG, BIOS_RESET);
	outb(CMOS_DATA, BIOS_WARM);	/* 'warm-start' */

	/* start each AP */
	for (cpu = 1; cpu < mp_ncpus; cpu++) {
		apic_id = cpu_apic_ids[cpu];

		/* allocate and set up an idle stack data page */
		bootstacks[cpu] = (void *)kmem_alloc(kernel_map, KSTACK_PAGES * PAGE_SIZE);
		doublefault_stack = (char *)kmem_alloc(kernel_map, PAGE_SIZE);
		nmi_stack = (char *)kmem_alloc(kernel_map, PAGE_SIZE);
		dpcpu = (void *)kmem_alloc(kernel_map, DPCPU_SIZE);

		/* -8: leave room for the trampoline's initial frame. */
		bootSTK = (char *)bootstacks[cpu] + KSTACK_PAGES * PAGE_SIZE - 8;
		bootAP = cpu;

		/* attempt to start the Application Processor */
		if (!start_ap(apic_id)) {
			/* restore the warmstart vector */
			*(u_int32_t *) WARMBOOT_OFF = mpbioswarmvec;
			panic("AP #%d (PHY# %d) failed!", cpu, apic_id);
		}

		all_cpus |= (1 << cpu);		/* record AP in CPU map */
	}

	/* build our map of 'other' CPUs */
	PCPU_SET(other_cpus, all_cpus & ~PCPU_GET(cpumask));

	/* restore the warmstart vector */
	*(u_int32_t *) WARMBOOT_OFF = mpbioswarmvec;

	outb(CMOS_REG, BIOS_RESET);
	outb(CMOS_DATA, mpbiosreason);

	/* number of APs actually started */
	return mp_naps;
}


/*
 * This function starts the AP (application processor) identified
 * by the APIC ID 'physicalCpu'.  It does quite a "song and dance"
 * to accomplish this.  This is necessary because of the nuances
 * of the different hardware we might encounter.  It isn't pretty,
 * but it seems to work.
 *
 * Returns 1 if the AP announced itself within 5 seconds, 0 otherwise.
 */
static int
start_ap(int apic_id)
{
	int vector, ms;
	int cpus;

	/* calculate the vector */
	vector = (boot_address >> 12) & 0xff;

	/* used as a watchpoint to signal AP startup */
	cpus = mp_naps;

	/*
	 * first we do an INIT/RESET IPI this INIT IPI might be run, resetting
	 * and running the target CPU. OR this INIT IPI might be latched (P5
	 * bug), CPU waiting for STARTUP IPI. OR this INIT IPI might be
	 * ignored.
	 */

	/* do an INIT IPI: assert RESET */
	lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
	    APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, apic_id);

	/* wait for pending status end */
	lapic_ipi_wait(-1);

	/* do an INIT IPI: deassert RESET */
	lapic_ipi_raw(APIC_DEST_ALLESELF | APIC_TRIGMOD_LEVEL |
	    APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, 0);

	/* wait for pending status end */
	DELAY(10000);		/* wait ~10mS */
	lapic_ipi_wait(-1);

	/*
	 * next we do a STARTUP IPI: the previous INIT IPI might still be
	 * latched, (P5 bug) this 1st STARTUP would then terminate
	 * immediately, and the previously started INIT IPI would continue. OR
	 * the previous INIT IPI has already run. and this STARTUP IPI will
	 * run. OR the previous INIT IPI was ignored. and this STARTUP IPI
	 * will run.
	 */

	/* do a STARTUP IPI */
	lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
	    APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP |
	    vector, apic_id);
	lapic_ipi_wait(-1);
	DELAY(200);		/* wait ~200uS */

	/*
	 * finally we do a 2nd STARTUP IPI: this 2nd STARTUP IPI should run IF
	 * the previous STARTUP IPI was cancelled by a latched INIT IPI. OR
	 * this STARTUP IPI will be ignored, as only ONE STARTUP IPI is
	 * recognized after hardware RESET or INIT IPI.
	 */

	lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
	    APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP |
	    vector, apic_id);
	lapic_ipi_wait(-1);
	DELAY(200);		/* wait ~200uS */

	/* Wait up to 5 seconds for it to start. */
	for (ms = 0; ms < 5000; ms++) {
		/* The AP bumps mp_naps from init_secondary(). */
		if (mp_naps > cpus)
			return 1;	/* return SUCCESS */
		DELAY(1000);
	}
	return 0;		/* return FAILURE */
}

#ifdef COUNT_XINVLTLB_HITS
u_int xhits_gbl[MAXCPU];
u_int xhits_pg[MAXCPU];
u_int xhits_rng[MAXCPU];
SYSCTL_NODE(_debug, OID_AUTO, xhits, CTLFLAG_RW, 0, "");
SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, global, CTLFLAG_RW, &xhits_gbl,
    sizeof(xhits_gbl), "IU", "");
SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, page, CTLFLAG_RW, &xhits_pg,
    sizeof(xhits_pg), "IU", "");
SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, range, CTLFLAG_RW, &xhits_rng,
    sizeof(xhits_rng), "IU", "");

u_int ipi_global;
u_int ipi_page;
u_int ipi_range;
u_int ipi_range_size;
SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_global, CTLFLAG_RW, &ipi_global, 0, "");
SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_page, CTLFLAG_RW, &ipi_page, 0, "");
SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range, CTLFLAG_RW, &ipi_range, 0, "");
SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range_size, CTLFLAG_RW, &ipi_range_size,
    0, "");

u_int ipi_masked_global;
u_int ipi_masked_page;
u_int ipi_masked_range;
u_int ipi_masked_range_size;
SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_global, CTLFLAG_RW,
    &ipi_masked_global, 0, "");
SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_page, CTLFLAG_RW,
    &ipi_masked_page, 0, "");
SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_range, CTLFLAG_RW,
    &ipi_masked_range, 0, "");
SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_range_size, CTLFLAG_RW,
    &ipi_masked_range_size, 0, "");
#endif /* COUNT_XINVLTLB_HITS */

/*
 * Flush the TLB on all
other CPU's
 *
 * Publishes addr1/addr2 for the IPI handlers, sends the given vector
 * to all other CPUs, and spins until every one has acknowledged by
 * bumping smp_tlb_wait.  Serialized by smp_ipi_mtx.
 */
static void
smp_tlb_shootdown(u_int vector, vm_offset_t addr1, vm_offset_t addr2)
{
	u_int ncpu;

	ncpu = mp_ncpus - 1;	/* does not shootdown self */
	if (ncpu < 1)
		return;		/* no other cpus */
	if (!(read_rflags() & PSL_I))
		panic("%s: interrupts disabled", __func__);
	mtx_lock_spin(&smp_ipi_mtx);
	smp_tlb_addr1 = addr1;
	smp_tlb_addr2 = addr2;
	atomic_store_rel_int(&smp_tlb_wait, 0);
	ipi_all_but_self(vector);
	/* Each target increments smp_tlb_wait from its IPI handler. */
	while (smp_tlb_wait < ncpu)
		ia32_pause();
	mtx_unlock_spin(&smp_ipi_mtx);
}

/*
 * As smp_tlb_shootdown(), but only the CPUs in 'mask' are targeted;
 * a mask of all ones means all CPUs but self.  The calling CPU is
 * always stripped from the mask.
 */
static void
smp_targeted_tlb_shootdown(cpumask_t mask, u_int vector, vm_offset_t addr1, vm_offset_t addr2)
{
	int ncpu, othercpus;

	othercpus = mp_ncpus - 1;
	if (mask == (cpumask_t)-1) {
		ncpu = othercpus;
		if (ncpu < 1)
			return;
	} else {
		mask &= ~PCPU_GET(cpumask);
		if (mask == 0)
			return;
		ncpu = bitcount32(mask);
		if (ncpu > othercpus) {
			/* XXX this should be a panic offence */
			printf("SMP: tlb shootdown to %d other cpus (only have %d)\n",
			    ncpu, othercpus);
			ncpu = othercpus;
		}
		/* XXX should be a panic, implied by mask == 0 above */
		if (ncpu < 1)
			return;
	}
	if (!(read_rflags() & PSL_I))
		panic("%s: interrupts disabled", __func__);
	mtx_lock_spin(&smp_ipi_mtx);
	smp_tlb_addr1 = addr1;
	smp_tlb_addr2 = addr2;
	atomic_store_rel_int(&smp_tlb_wait, 0);
	if (mask == (cpumask_t)-1)
		ipi_all_but_self(vector);
	else
		ipi_selected(mask, vector);
	while (smp_tlb_wait < ncpu)
		ia32_pause();
	mtx_unlock_spin(&smp_ipi_mtx);
}

/*
 * Send an IPI to specified CPU handling the bitmap logic.
 *
 * Bitmapped IPIs share a single interrupt vector; the specific
 * request is recorded atomically in cpu_ipi_pending[cpu].  If a
 * bitmapped IPI was already pending for the target, the vector is
 * not re-sent.
 */
static void
ipi_send_cpu(int cpu, u_int ipi)
{
	u_int bitmap, old_pending, new_pending;

	KASSERT(cpu_apic_ids[cpu] != -1, ("IPI to non-existent CPU %d", cpu));

	if (IPI_IS_BITMAPED(ipi)) {
		bitmap = 1 << ipi;
		ipi = IPI_BITMAP_VECTOR;
		do {
			old_pending = cpu_ipi_pending[cpu];
			new_pending = old_pending | bitmap;
		} while (!atomic_cmpset_int(&cpu_ipi_pending[cpu],
		    old_pending, new_pending));
		/* Vector already in flight; the handler will see our bit. */
		if (old_pending)
			return;
	}
	lapic_ipi_vectored(ipi, cpu_apic_ids[cpu]);
}

void
smp_cache_flush(void)
{

	if (smp_started)
		smp_tlb_shootdown(IPI_INVLCACHE, 0, 0);
}

void
smp_invltlb(void)
{

	if (smp_started) {
		smp_tlb_shootdown(IPI_INVLTLB, 0, 0);
#ifdef COUNT_XINVLTLB_HITS
		ipi_global++;
#endif
	}
}

void
smp_invlpg(vm_offset_t addr)
{

	if (smp_started) {
		smp_tlb_shootdown(IPI_INVLPG, addr, 0);
#ifdef COUNT_XINVLTLB_HITS
		ipi_page++;
#endif
	}
}

void
smp_invlpg_range(vm_offset_t addr1, vm_offset_t addr2)
{

	if (smp_started) {
		smp_tlb_shootdown(IPI_INVLRNG, addr1, addr2);
#ifdef COUNT_XINVLTLB_HITS
		ipi_range++;
		ipi_range_size += (addr2 - addr1) / PAGE_SIZE;
#endif
	}
}

void
smp_masked_invltlb(cpumask_t mask)
{

	if (smp_started) {
		smp_targeted_tlb_shootdown(mask, IPI_INVLTLB, 0, 0);
#ifdef COUNT_XINVLTLB_HITS
		ipi_masked_global++;
#endif
	}
}

void
smp_masked_invlpg(cpumask_t mask, vm_offset_t addr)
{

	if (smp_started) {
		smp_targeted_tlb_shootdown(mask, IPI_INVLPG, addr, 0);
#ifdef COUNT_XINVLTLB_HITS
		ipi_masked_page++;
#endif
	}
}

void
smp_masked_invlpg_range(cpumask_t mask, vm_offset_t addr1, vm_offset_t addr2)
{

	if (smp_started) {
		smp_targeted_tlb_shootdown(mask, IPI_INVLRNG, addr1, addr2);
#ifdef COUNT_XINVLTLB_HITS
		ipi_masked_range++;
		ipi_masked_range_size += (addr2 - addr1) / PAGE_SIZE;
#endif
	}
}

/*
 * Handler for the shared bitmap IPI vector: drains this CPU's pending
 * bitmap atomically and dispatches each request (preempt, AST,
 * hardclock) in turn.
 */
void
ipi_bitmap_handler(struct trapframe frame)
{
	struct trapframe *oldframe;
	struct thread *td;
	int cpu = PCPU_GET(cpuid);
	u_int ipi_bitmap;

	critical_enter();
	td = curthread;
	td->td_intr_nesting_level++;
	oldframe = td->td_intr_frame;
	td->td_intr_frame = &frame;
	ipi_bitmap = atomic_readandclear_int(&cpu_ipi_pending[cpu]);
	if (ipi_bitmap & (1 << IPI_PREEMPT)) {
#ifdef COUNT_IPIS
		(*ipi_preempt_counts[cpu])++;
#endif
		sched_preempt(td);
	}
	if (ipi_bitmap & (1 << IPI_AST)) {
#ifdef COUNT_IPIS
		(*ipi_ast_counts[cpu])++;
#endif
		/* Nothing to do for AST */
	}
	if (ipi_bitmap & (1 << IPI_HARDCLOCK)) {
#ifdef COUNT_IPIS
		(*ipi_hardclock_counts[cpu])++;
#endif
		hardclockintr();
	}
	td->td_intr_frame = oldframe;
	td->td_intr_nesting_level--;
	critical_exit();
}

/*
 * send an IPI to a set of cpus.
 */
void
ipi_selected(cpumask_t cpus, u_int ipi)
{
	int cpu;

	/*
	 * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit
	 * of help in order to understand what is the source.
	 * Set the mask of receiving CPUs for this purpose.
	 */
	if (ipi == IPI_STOP_HARD)
		atomic_set_int(&ipi_nmi_pending, cpus);

	CTR3(KTR_SMP, "%s: cpus: %x ipi: %x", __func__, cpus, ipi);
	/* Peel off the lowest set bit until the mask is empty. */
	while ((cpu = ffs(cpus)) != 0) {
		cpu--;
		cpus &= ~(1 << cpu);
		ipi_send_cpu(cpu, ipi);
	}
}

/*
 * send an IPI to a specific CPU.
 */
void
ipi_cpu(int cpu, u_int ipi)
{

	/*
	 * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit
	 * of help in order to understand what is the source.
	 * Set the mask of receiving CPUs for this purpose.
1306 */ 1307 if (ipi == IPI_STOP_HARD) 1308 atomic_set_int(&ipi_nmi_pending, 1 << cpu); 1309 1310 CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi); 1311 ipi_send_cpu(cpu, ipi); 1312} 1313 1314/* 1315 * send an IPI to all CPUs EXCEPT myself 1316 */ 1317void 1318ipi_all_but_self(u_int ipi) 1319{ 1320 1321 if (IPI_IS_BITMAPED(ipi)) { 1322 ipi_selected(PCPU_GET(other_cpus), ipi); 1323 return; 1324 } 1325 1326 /* 1327 * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit 1328 * of help in order to understand what is the source. 1329 * Set the mask of receiving CPUs for this purpose. 1330 */ 1331 if (ipi == IPI_STOP_HARD) 1332 atomic_set_int(&ipi_nmi_pending, PCPU_GET(other_cpus)); 1333 1334 CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi); 1335 lapic_ipi_vectored(ipi, APIC_IPI_DEST_OTHERS); 1336} 1337 1338int 1339ipi_nmi_handler() 1340{ 1341 cpumask_t cpumask; 1342 1343 /* 1344 * As long as there is not a simple way to know about a NMI's 1345 * source, if the bitmask for the current CPU is present in 1346 * the global pending bitword an IPI_STOP_HARD has been issued 1347 * and should be handled. 1348 */ 1349 cpumask = PCPU_GET(cpumask); 1350 if ((ipi_nmi_pending & cpumask) == 0) 1351 return (1); 1352 1353 atomic_clear_int(&ipi_nmi_pending, cpumask); 1354 cpustop_handler(); 1355 return (0); 1356} 1357 1358/* 1359 * Handle an IPI_STOP by saving our current context and spinning until we 1360 * are resumed. 
1361 */ 1362void 1363cpustop_handler(void) 1364{ 1365 cpumask_t cpumask; 1366 u_int cpu; 1367 1368 cpu = PCPU_GET(cpuid); 1369 cpumask = PCPU_GET(cpumask); 1370 1371 savectx(&stoppcbs[cpu]); 1372 1373 /* Indicate that we are stopped */ 1374 atomic_set_int(&stopped_cpus, cpumask); 1375 1376 /* Wait for restart */ 1377 while (!(started_cpus & cpumask)) 1378 ia32_pause(); 1379 1380 atomic_clear_int(&started_cpus, cpumask); 1381 atomic_clear_int(&stopped_cpus, cpumask); 1382 1383 if (cpu == 0 && cpustop_restartfunc != NULL) { 1384 cpustop_restartfunc(); 1385 cpustop_restartfunc = NULL; 1386 } 1387} 1388 1389/* 1390 * Handle an IPI_SUSPEND by saving our current context and spinning until we 1391 * are resumed. 1392 */ 1393void 1394cpususpend_handler(void) 1395{ 1396 cpumask_t cpumask; 1397 register_t cr3, rf; 1398 u_int cpu; 1399 1400 cpu = PCPU_GET(cpuid); 1401 cpumask = PCPU_GET(cpumask); 1402 1403 rf = intr_disable(); 1404 cr3 = rcr3(); 1405 1406 if (savectx(susppcbs[cpu])) { 1407 wbinvd(); 1408 atomic_set_int(&stopped_cpus, cpumask); 1409 } else { 1410 PCPU_SET(switchtime, 0); 1411 PCPU_SET(switchticks, ticks); 1412 } 1413 1414 /* Wait for resume */ 1415 while (!(started_cpus & cpumask)) 1416 ia32_pause(); 1417 1418 atomic_clear_int(&started_cpus, cpumask); 1419 atomic_clear_int(&stopped_cpus, cpumask); 1420 1421 /* Restore CR3 and enable interrupts */ 1422 load_cr3(cr3); 1423 mca_resume(); 1424 lapic_setup(0); 1425 intr_restore(rf); 1426} 1427 1428/* 1429 * This is called once the rest of the system is up and running and we're 1430 * ready to let the AP's out of the pen. 
1431 */ 1432static void 1433release_aps(void *dummy __unused) 1434{ 1435 1436 if (mp_ncpus == 1) 1437 return; 1438 atomic_store_rel_int(&aps_ready, 1); 1439 while (smp_started == 0) 1440 ia32_pause(); 1441} 1442SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, release_aps, NULL); 1443 1444static int 1445sysctl_hlt_cpus(SYSCTL_HANDLER_ARGS) 1446{ 1447 cpumask_t mask; 1448 int error; 1449 1450 mask = hlt_cpus_mask; 1451 error = sysctl_handle_int(oidp, &mask, 0, req); 1452 if (error || !req->newptr) 1453 return (error); 1454 1455 if (logical_cpus_mask != 0 && 1456 (mask & logical_cpus_mask) == logical_cpus_mask) 1457 hlt_logical_cpus = 1; 1458 else 1459 hlt_logical_cpus = 0; 1460 1461 if (! hyperthreading_allowed) 1462 mask |= hyperthreading_cpus_mask; 1463 1464 if ((mask & all_cpus) == all_cpus) 1465 mask &= ~(1<<0); 1466 hlt_cpus_mask = mask; 1467 return (error); 1468} 1469SYSCTL_PROC(_machdep, OID_AUTO, hlt_cpus, CTLTYPE_INT|CTLFLAG_RW, 1470 0, 0, sysctl_hlt_cpus, "IU", 1471 "Bitmap of CPUs to halt. 101 (binary) will halt CPUs 0 and 2."); 1472 1473static int 1474sysctl_hlt_logical_cpus(SYSCTL_HANDLER_ARGS) 1475{ 1476 int disable, error; 1477 1478 disable = hlt_logical_cpus; 1479 error = sysctl_handle_int(oidp, &disable, 0, req); 1480 if (error || !req->newptr) 1481 return (error); 1482 1483 if (disable) 1484 hlt_cpus_mask |= logical_cpus_mask; 1485 else 1486 hlt_cpus_mask &= ~logical_cpus_mask; 1487 1488 if (! 
hyperthreading_allowed) 1489 hlt_cpus_mask |= hyperthreading_cpus_mask; 1490 1491 if ((hlt_cpus_mask & all_cpus) == all_cpus) 1492 hlt_cpus_mask &= ~(1<<0); 1493 1494 hlt_logical_cpus = disable; 1495 return (error); 1496} 1497 1498static int 1499sysctl_hyperthreading_allowed(SYSCTL_HANDLER_ARGS) 1500{ 1501 int allowed, error; 1502 1503 allowed = hyperthreading_allowed; 1504 error = sysctl_handle_int(oidp, &allowed, 0, req); 1505 if (error || !req->newptr) 1506 return (error); 1507 1508#ifdef SCHED_ULE 1509 /* 1510 * SCHED_ULE doesn't allow enabling/disabling HT cores at 1511 * run-time. 1512 */ 1513 if (allowed != hyperthreading_allowed) 1514 return (ENOTSUP); 1515 return (error); 1516#endif 1517 1518 if (allowed) 1519 hlt_cpus_mask &= ~hyperthreading_cpus_mask; 1520 else 1521 hlt_cpus_mask |= hyperthreading_cpus_mask; 1522 1523 if (logical_cpus_mask != 0 && 1524 (hlt_cpus_mask & logical_cpus_mask) == logical_cpus_mask) 1525 hlt_logical_cpus = 1; 1526 else 1527 hlt_logical_cpus = 0; 1528 1529 if ((hlt_cpus_mask & all_cpus) == all_cpus) 1530 hlt_cpus_mask &= ~(1<<0); 1531 1532 hyperthreading_allowed = allowed; 1533 return (error); 1534} 1535 1536static void 1537cpu_hlt_setup(void *dummy __unused) 1538{ 1539 1540 if (logical_cpus_mask != 0) { 1541 TUNABLE_INT_FETCH("machdep.hlt_logical_cpus", 1542 &hlt_logical_cpus); 1543 sysctl_ctx_init(&logical_cpu_clist); 1544 SYSCTL_ADD_PROC(&logical_cpu_clist, 1545 SYSCTL_STATIC_CHILDREN(_machdep), OID_AUTO, 1546 "hlt_logical_cpus", CTLTYPE_INT|CTLFLAG_RW, 0, 0, 1547 sysctl_hlt_logical_cpus, "IU", ""); 1548 SYSCTL_ADD_UINT(&logical_cpu_clist, 1549 SYSCTL_STATIC_CHILDREN(_machdep), OID_AUTO, 1550 "logical_cpus_mask", CTLTYPE_INT|CTLFLAG_RD, 1551 &logical_cpus_mask, 0, ""); 1552 1553 if (hlt_logical_cpus) 1554 hlt_cpus_mask |= logical_cpus_mask; 1555 1556 /* 1557 * If necessary for security purposes, force 1558 * hyperthreading off, regardless of the value 1559 * of hlt_logical_cpus. 
1560 */ 1561 if (hyperthreading_cpus_mask) { 1562 SYSCTL_ADD_PROC(&logical_cpu_clist, 1563 SYSCTL_STATIC_CHILDREN(_machdep), OID_AUTO, 1564 "hyperthreading_allowed", CTLTYPE_INT|CTLFLAG_RW, 1565 0, 0, sysctl_hyperthreading_allowed, "IU", ""); 1566 if (! hyperthreading_allowed) 1567 hlt_cpus_mask |= hyperthreading_cpus_mask; 1568 } 1569 } 1570} 1571SYSINIT(cpu_hlt, SI_SUB_SMP, SI_ORDER_ANY, cpu_hlt_setup, NULL); 1572 1573int 1574mp_grab_cpu_hlt(void) 1575{ 1576 cpumask_t mask; 1577#ifdef MP_WATCHDOG 1578 u_int cpuid; 1579#endif 1580 int retval; 1581 1582 mask = PCPU_GET(cpumask); 1583#ifdef MP_WATCHDOG 1584 cpuid = PCPU_GET(cpuid); 1585 ap_watchdog(cpuid); 1586#endif 1587 1588 retval = 0; 1589 while (mask & hlt_cpus_mask) { 1590 retval = 1; 1591 __asm __volatile("sti; hlt" : : : "memory"); 1592 } 1593 return (retval); 1594} 1595 1596#ifdef COUNT_IPIS 1597/* 1598 * Setup interrupt counters for IPI handlers. 1599 */ 1600static void 1601mp_ipi_intrcnt(void *dummy) 1602{ 1603 char buf[64]; 1604 int i; 1605 1606 CPU_FOREACH(i) { 1607 snprintf(buf, sizeof(buf), "cpu%d:invltlb", i); 1608 intrcnt_add(buf, &ipi_invltlb_counts[i]); 1609 snprintf(buf, sizeof(buf), "cpu%d:invlrng", i); 1610 intrcnt_add(buf, &ipi_invlrng_counts[i]); 1611 snprintf(buf, sizeof(buf), "cpu%d:invlpg", i); 1612 intrcnt_add(buf, &ipi_invlpg_counts[i]); 1613 snprintf(buf, sizeof(buf), "cpu%d:preempt", i); 1614 intrcnt_add(buf, &ipi_preempt_counts[i]); 1615 snprintf(buf, sizeof(buf), "cpu%d:ast", i); 1616 intrcnt_add(buf, &ipi_ast_counts[i]); 1617 snprintf(buf, sizeof(buf), "cpu%d:rendezvous", i); 1618 intrcnt_add(buf, &ipi_rendezvous_counts[i]); 1619 snprintf(buf, sizeof(buf), "cpu%d:lazypmap", i); 1620 intrcnt_add(buf, &ipi_lazypmap_counts[i]); 1621 snprintf(buf, sizeof(buf), "cpu%d:hardclock", i); 1622 intrcnt_add(buf, &ipi_hardclock_counts[i]); 1623 } 1624} 1625SYSINIT(mp_ipi_intrcnt, SI_SUB_INTR, SI_ORDER_MIDDLE, mp_ipi_intrcnt, NULL); 1626#endif 1627 1628