mp_machdep.c revision 222756
1/*- 2 * Copyright (c) 1996, by Steve Passe 3 * Copyright (c) 2003, by Peter Wemm 4 * All rights reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 2. The name of the developer may NOT be used to endorse or promote products 12 * derived from this software without specific prior written permission. 13 * 14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 24 * SUCH DAMAGE. 
25 */ 26 27#include <sys/cdefs.h> 28__FBSDID("$FreeBSD: head/sys/amd64/amd64/mp_machdep.c 222756 2011-06-06 14:23:13Z avg $"); 29 30#include "opt_cpu.h" 31#include "opt_kstack_pages.h" 32#include "opt_mp_watchdog.h" 33#include "opt_sched.h" 34#include "opt_smp.h" 35 36#include <sys/param.h> 37#include <sys/systm.h> 38#include <sys/bus.h> 39#ifdef GPROF 40#include <sys/gmon.h> 41#endif 42#include <sys/kernel.h> 43#include <sys/ktr.h> 44#include <sys/lock.h> 45#include <sys/malloc.h> 46#include <sys/memrange.h> 47#include <sys/mutex.h> 48#include <sys/pcpu.h> 49#include <sys/proc.h> 50#include <sys/sched.h> 51#include <sys/smp.h> 52#include <sys/sysctl.h> 53 54#include <vm/vm.h> 55#include <vm/vm_param.h> 56#include <vm/pmap.h> 57#include <vm/vm_kern.h> 58#include <vm/vm_extern.h> 59 60#include <x86/apicreg.h> 61#include <machine/clock.h> 62#include <machine/cputypes.h> 63#include <machine/cpufunc.h> 64#include <x86/mca.h> 65#include <machine/md_var.h> 66#include <machine/mp_watchdog.h> 67#include <machine/pcb.h> 68#include <machine/psl.h> 69#include <machine/smp.h> 70#include <machine/specialreg.h> 71#include <machine/tss.h> 72 73#define WARMBOOT_TARGET 0 74#define WARMBOOT_OFF (KERNBASE + 0x0467) 75#define WARMBOOT_SEG (KERNBASE + 0x0469) 76 77#define CMOS_REG (0x70) 78#define CMOS_DATA (0x71) 79#define BIOS_RESET (0x0f) 80#define BIOS_WARM (0x0a) 81 82/* lock region used by kernel profiling */ 83int mcount_lock; 84 85int mp_naps; /* # of Applications processors */ 86int boot_cpu_id = -1; /* designated BSP */ 87 88extern struct pcpu __pcpu[]; 89 90/* AP uses this during bootstrap. Do not staticize. */ 91char *bootSTK; 92static int bootAP; 93 94/* Free these after use */ 95void *bootstacks[MAXCPU]; 96 97/* Temporary variables for init_secondary() */ 98char *doublefault_stack; 99char *nmi_stack; 100void *dpcpu; 101 102struct pcb stoppcbs[MAXCPU]; 103struct pcb **susppcbs = NULL; 104 105/* Variables needed for SMP tlb shootdown. 
 */
vm_offset_t smp_tlb_addr1;	/* first address for INVLPG/INVLRNG IPIs */
vm_offset_t smp_tlb_addr2;	/* end address for INVLRNG IPIs */
volatile int smp_tlb_wait;	/* count of CPUs that acked the shootdown */

#ifdef COUNT_IPIS
/* Interrupt counts. */
static u_long *ipi_preempt_counts[MAXCPU];
static u_long *ipi_ast_counts[MAXCPU];
u_long *ipi_invltlb_counts[MAXCPU];
u_long *ipi_invlrng_counts[MAXCPU];
u_long *ipi_invlpg_counts[MAXCPU];
u_long *ipi_invlcache_counts[MAXCPU];
u_long *ipi_rendezvous_counts[MAXCPU];
static u_long *ipi_hardclock_counts[MAXCPU];
#endif

extern inthand_t IDTVEC(fast_syscall), IDTVEC(fast_syscall32);

/*
 * Local data and functions.
 */

static volatile cpumask_t ipi_nmi_pending;

/* used to hold the AP's until we are ready to release them */
static struct mtx ap_boot_mtx;

/* Set to 1 once we're ready to let the APs out of the pen. */
static volatile int aps_ready = 0;

/*
 * Store data from cpu_add() until later in the boot when we actually setup
 * the APs.
 */
struct cpu_info {
	int	cpu_present:1;
	int	cpu_bsp:1;
	int	cpu_disabled:1;
	int	cpu_hyperthread:1;
} static cpu_info[MAX_APIC_ID + 1];
int cpu_apic_ids[MAXCPU];		/* logical CPU id -> local APIC id */
int apic_cpuids[MAX_APIC_ID + 1];	/* local APIC id -> logical CPU id */

/* Holds pending bitmap based IPIs per CPU */
static volatile u_int cpu_ipi_pending[MAXCPU];

static u_int boot_address;
static int cpu_logical;			/* logical cpus per core */
static int cpu_cores;			/* cores per package */

static void	assign_cpu_ids(void);
static void	set_interrupt_apic_ids(void);
static int	start_all_aps(void);
static int	start_ap(int apic_id);
static void	release_aps(void *dummy);

static int hlt_logical_cpus;
static u_int hyperthreading_cpus;	/* logical cpus sharing L1 cache */
static cpumask_t hyperthreading_cpus_mask;
static int hyperthreading_allowed = 1;
static struct sysctl_ctx_list logical_cpu_clist;
static u_int bootMP_size;

/*
 * Invoke the mem_range driver's AP-initialization hook, if one is
 * registered, so this AP's memory range attributes match the BSP's.
 */
static void
mem_range_AP_init(void)
{
	if (mem_range_softc.mr_op && mem_range_softc.mr_op->initAP)
		mem_range_softc.mr_op->initAP(&mem_range_softc);
}

/*
 * Determine cpu_cores (cores per package) on AMD CPUs from the
 * extended-feature CPUID data; cpu_logical is forced to 1.
 */
static void
topo_probe_amd(void)
{
	int core_id_bits;
	int id;

	/* AMD processors do not support HTT. */
	cpu_logical = 1;

	if ((amd_feature2 & AMDID2_CMP) == 0) {
		cpu_cores = 1;
		return;
	}

	core_id_bits = (cpu_procinfo2 & AMDID_COREID_SIZE) >>
	    AMDID_COREID_SIZE_SHIFT;
	if (core_id_bits == 0) {
		cpu_cores = (cpu_procinfo2 & AMDID_CMP_CORES) + 1;
		return;
	}

	/* Fam 10h and newer should get here. */
	for (id = 0; id <= MAX_APIC_ID; id++) {
		/* Check logical CPU availability. */
		if (!cpu_info[id].cpu_present || cpu_info[id].cpu_disabled)
			continue;
		/* Check if logical CPU has the same package ID. */
		if ((id >> core_id_bits) != (boot_cpu_id >> core_id_bits))
			continue;
		cpu_cores++;
	}
}

/*
 * Round up to the next power of two, if necessary, and then
 * take log2.
 * Returns -1 if argument is zero.
 */
static __inline int
mask_width(u_int x)
{

	return (fls(x << (1 - powerof2(x))) - 1);
}

/*
 * Derive cpu_cores and cpu_logical on Intel CPUs from CPUID leaf 0x4
 * (and leaf 0x1's HTT count), by counting the APIC IDs that share the
 * BSP's package and core ID fields.
 */
static void
topo_probe_0x4(void)
{
	u_int p[4];
	int pkg_id_bits;
	int core_id_bits;
	int max_cores;
	int max_logical;
	int id;

	/* Both zero and one here mean one logical processor per package. */
	max_logical = (cpu_feature & CPUID_HTT) != 0 ?
	    (cpu_procinfo & CPUID_HTT_CORES) >> 16 : 1;
	if (max_logical <= 1)
		return;

	/*
	 * Because of uniformity assumption we examine only
	 * those logical processors that belong to the same
	 * package as BSP.  Further, we count number of
	 * logical processors that belong to the same core
	 * as BSP thus deducing number of threads per core.
	 */
	if (cpu_high >= 0x4) {
		cpuid_count(0x04, 0, p);
		max_cores = ((p[0] >> 26) & 0x3f) + 1;
	} else
		max_cores = 1;
	core_id_bits = mask_width(max_logical/max_cores);
	if (core_id_bits < 0)
		return;
	pkg_id_bits = core_id_bits + mask_width(max_cores);

	for (id = 0; id <= MAX_APIC_ID; id++) {
		/* Check logical CPU availability. */
		if (!cpu_info[id].cpu_present || cpu_info[id].cpu_disabled)
			continue;
		/* Check if logical CPU has the same package ID. */
		if ((id >> pkg_id_bits) != (boot_cpu_id >> pkg_id_bits))
			continue;
		cpu_cores++;
		/* Check if logical CPU has the same package and core IDs. */
		if ((id >> core_id_bits) == (boot_cpu_id >> core_id_bits))
			cpu_logical++;
	}

	KASSERT(cpu_cores >= 1 && cpu_logical >= 1,
	    ("topo_probe_0x4 couldn't find BSP"));

	/* cpu_cores counted threads in the package; reduce to cores. */
	cpu_cores /= cpu_logical;
	hyperthreading_cpus = cpu_logical;
}

/*
 * Derive cpu_cores and cpu_logical on Intel CPUs from CPUID leaf 0xb
 * (x2APIC topology enumeration); falls back to topo_probe_0x4() when
 * the leaf is not actually implemented.
 */
static void
topo_probe_0xb(void)
{
	u_int p[4];
	int bits;
	int cnt;
	int i;
	int logical;
	int type;
	int x;

	/* We only support three levels for now. */
	for (i = 0; i < 3; i++) {
		cpuid_count(0x0b, i, p);

		/* Fall back if CPU leaf 11 doesn't really exist. */
		if (i == 0 && p[1] == 0) {
			topo_probe_0x4();
			return;
		}

		bits = p[0] & 0x1f;
		logical = p[1] &= 0xffff;
		type = (p[2] >> 8) & 0xff;
		if (type == 0 || logical == 0)
			break;
		/*
		 * Because of uniformity assumption we examine only
		 * those logical processors that belong to the same
		 * package as BSP.
		 */
		for (cnt = 0, x = 0; x <= MAX_APIC_ID; x++) {
			if (!cpu_info[x].cpu_present ||
			    cpu_info[x].cpu_disabled)
				continue;
			if (x >> bits == boot_cpu_id >> bits)
				cnt++;
		}
		if (type == CPUID_TYPE_SMT)
			cpu_logical = cnt;
		else if (type == CPUID_TYPE_CORE)
			cpu_cores = cnt;
	}
	if (cpu_logical == 0)
		cpu_logical = 1;
	/* The core count included threads; reduce to cores per package. */
	cpu_cores /= cpu_logical;
}

/*
 * Both topology discovery code and code that consumes topology
 * information assume top-down uniformity of the topology.
 * That is, all physical packages must be identical and each
 * core in a package must have the same number of threads.
 * Topology information is queried only on BSP, on which this
 * code runs and for which it can query CPUID information.
 * Then topology is extrapolated on all packages using the
 * uniformity assumption.
 */
static void
topo_probe(void)
{
	static int cpu_topo_probed = 0;	/* probe only once */

	if (cpu_topo_probed)
		return;

	logical_cpus_mask = 0;
	if (mp_ncpus <= 1)
		cpu_cores = cpu_logical = 1;
	else if (cpu_vendor_id == CPU_VENDOR_AMD)
		topo_probe_amd();
	else if (cpu_vendor_id == CPU_VENDOR_INTEL) {
		/*
		 * See Intel(R) 64 Architecture Processor
		 * Topology Enumeration article for details.
		 *
		 * Note that 0x1 <= cpu_high < 4 case should be
		 * compatible with topo_probe_0x4() logic when
		 * CPUID.1:EBX[23:16] > 0 (cpu_cores will be 1)
		 * or it should trigger the fallback otherwise.
		 */
		if (cpu_high >= 0xb)
			topo_probe_0xb();
		else if (cpu_high >= 0x1)
			topo_probe_0x4();
	}

	/*
	 * Fallback: assume each logical CPU is in separate
	 * physical package.  That is, no multi-core, no SMT.
	 */
	if (cpu_cores == 0 || cpu_logical == 0)
		cpu_cores = cpu_logical = 1;
	cpu_topo_probed = 1;
}

/*
 * Build the scheduler's CPU topology tree from the probed
 * cores-per-package / threads-per-core counts.
 */
struct cpu_group *
cpu_topo(void)
{
	int cg_flags;

	/*
	 * Determine whether any threading flags are
	 * necessry.
	 */
	topo_probe();
	if (cpu_logical > 1 && hyperthreading_cpus)
		cg_flags = CG_FLAG_HTT;
	else if (cpu_logical > 1)
		cg_flags = CG_FLAG_SMT;
	else
		cg_flags = 0;
	if (mp_ncpus % (cpu_cores * cpu_logical) != 0) {
		printf("WARNING: Non-uniform processors.\n");
		printf("WARNING: Using suboptimal topology.\n");
		return (smp_topo_none());
	}
	/*
	 * No multi-core or hyper-threaded.
	 */
	if (cpu_logical * cpu_cores == 1)
		return (smp_topo_none());
	/*
	 * Only HTT no multi-core.
	 */
	if (cpu_logical > 1 && cpu_cores == 1)
		return (smp_topo_1level(CG_SHARE_L1, cpu_logical, cg_flags));
	/*
	 * Only multi-core no HTT.
	 */
	if (cpu_cores > 1 && cpu_logical == 1)
		return (smp_topo_1level(CG_SHARE_L2, cpu_cores, cg_flags));
	/*
	 * Both HTT and multi-core.
	 */
	return (smp_topo_2level(CG_SHARE_L2, cpu_cores,
	    CG_SHARE_L1, cpu_logical, cg_flags));
}

/*
 * Calculate usable address in base memory for AP trampoline code.
 */
u_int
mp_bootaddress(u_int basemem)
{

	bootMP_size = mptramp_end - mptramp_start;
	boot_address = trunc_page(basemem * 1024); /* round down to 4k boundary */
	if (((basemem * 1024) - boot_address) < bootMP_size)
		boot_address -= PAGE_SIZE;	/* not enough, lower by 4k */
	/* 3 levels of page table pages */
	mptramp_pagetables = boot_address - (PAGE_SIZE * 3);

	return mptramp_pagetables;
}

/*
 * Record a CPU discovered during early boot by its local APIC ID and,
 * if boot_cpu is set, mark it as the BSP.  Updates mp_ncpus/mp_maxid.
 */
void
cpu_add(u_int apic_id, char boot_cpu)
{

	if (apic_id > MAX_APIC_ID) {
		panic("SMP: APIC ID %d too high", apic_id);
		return;
	}
	KASSERT(cpu_info[apic_id].cpu_present == 0, ("CPU %d added twice",
	    apic_id));
	cpu_info[apic_id].cpu_present = 1;
	if (boot_cpu) {
		KASSERT(boot_cpu_id == -1,
		    ("CPU %d claims to be BSP, but CPU %d already is", apic_id,
		    boot_cpu_id));
		boot_cpu_id = apic_id;
		cpu_info[apic_id].cpu_bsp = 1;
	}
	if (mp_ncpus < MAXCPU) {
		mp_ncpus++;
		mp_maxid = mp_ncpus - 1;
	}
	if (bootverbose)
		printf("SMP: Added CPU %d (%s)\n", apic_id, boot_cpu ? "BSP" :
		    "AP");
}

void
cpu_mp_setmaxid(void)
{

	/*
	 * mp_maxid should be already set by calls to cpu_add().
	 * Just sanity check its value here.
	 */
	if (mp_ncpus == 0)
		KASSERT(mp_maxid == 0,
		    ("%s: mp_ncpus is zero, but mp_maxid is not", __func__));
	else if (mp_ncpus == 1)
		mp_maxid = 0;
	else
		KASSERT(mp_maxid >= mp_ncpus - 1,
		    ("%s: counters out of sync: max %d, count %d", __func__,
		    mp_maxid, mp_ncpus));
}

/*
 * Returns non-zero when at least two CPUs were found (true MP system),
 * zero for a UP system; also seeds all_cpus with the BSP.
 */
int
cpu_mp_probe(void)
{

	/*
	 * Always record BSP in CPU map so that the mbuf init code works
	 * correctly.
	 */
	all_cpus = 1;
	if (mp_ncpus == 0) {
		/*
		 * No CPUs were found, so this must be a UP system.  Setup
		 * the variables to represent a system with a single CPU
		 * with an id of 0.
		 */
		mp_ncpus = 1;
		return (0);
	}

	/* At least one CPU was found. */
	if (mp_ncpus == 1) {
		/*
		 * One CPU was found, so this must be a UP system with
		 * an I/O APIC.
		 */
		mp_maxid = 0;
		return (0);
	}

	/* At least two CPUs were found. */
	return (1);
}

/*
 * Initialize the IPI handlers and start up the AP's.
 */
void
cpu_mp_start(void)
{
	int i;

	/* Initialize the logical ID to APIC ID table. */
	for (i = 0; i < MAXCPU; i++) {
		cpu_apic_ids[i] = -1;
		cpu_ipi_pending[i] = 0;
	}

	/* Install an inter-CPU IPI for TLB invalidation */
	setidt(IPI_INVLTLB, IDTVEC(invltlb), SDT_SYSIGT, SEL_KPL, 0);
	setidt(IPI_INVLPG, IDTVEC(invlpg), SDT_SYSIGT, SEL_KPL, 0);
	setidt(IPI_INVLRNG, IDTVEC(invlrng), SDT_SYSIGT, SEL_KPL, 0);

	/* Install an inter-CPU IPI for cache invalidation. */
	setidt(IPI_INVLCACHE, IDTVEC(invlcache), SDT_SYSIGT, SEL_KPL, 0);

	/* Install an inter-CPU IPI for all-CPU rendezvous */
	setidt(IPI_RENDEZVOUS, IDTVEC(rendezvous), SDT_SYSIGT, SEL_KPL, 0);

	/* Install generic inter-CPU IPI handler */
	setidt(IPI_BITMAP_VECTOR, IDTVEC(ipi_intr_bitmap_handler),
	    SDT_SYSIGT, SEL_KPL, 0);

	/* Install an inter-CPU IPI for CPU stop/restart */
	setidt(IPI_STOP, IDTVEC(cpustop), SDT_SYSIGT, SEL_KPL, 0);

	/* Install an inter-CPU IPI for CPU suspend/resume */
	setidt(IPI_SUSPEND, IDTVEC(cpususpend), SDT_SYSIGT, SEL_KPL, 0);

	/* Set boot_cpu_id if needed. */
	if (boot_cpu_id == -1) {
		boot_cpu_id = PCPU_GET(apic_id);
		cpu_info[boot_cpu_id].cpu_bsp = 1;
	} else
		KASSERT(boot_cpu_id == PCPU_GET(apic_id),
		    ("BSP's APIC ID doesn't match boot_cpu_id"));

	/* Probe logical/physical core configuration. */
	topo_probe();

	assign_cpu_ids();

	/* Start each Application Processor */
	start_all_aps();

	set_interrupt_apic_ids();
}


/*
 * Print various information about the SMP system hardware and setup.
 */
void
cpu_mp_announce(void)
{
	const char *hyperthread;
	int i;

	printf("FreeBSD/SMP: %d package(s) x %d core(s)",
	    mp_ncpus / (cpu_cores * cpu_logical), cpu_cores);
	if (hyperthreading_cpus > 1)
	    printf(" x %d HTT threads", cpu_logical);
	else if (cpu_logical > 1)
	    printf(" x %d SMT threads", cpu_logical);
	printf("\n");

	/* List active CPUs first. */
	printf(" cpu0 (BSP): APIC ID: %2d\n", boot_cpu_id);
	for (i = 1; i < mp_ncpus; i++) {
		if (cpu_info[cpu_apic_ids[i]].cpu_hyperthread)
			hyperthread = "/HT";
		else
			hyperthread = "";
		printf(" cpu%d (AP%s): APIC ID: %2d\n", i, hyperthread,
		    cpu_apic_ids[i]);
	}

	/* List disabled CPUs last. */
	for (i = 0; i <= MAX_APIC_ID; i++) {
		if (!cpu_info[i].cpu_present || !cpu_info[i].cpu_disabled)
			continue;
		if (cpu_info[i].cpu_hyperthread)
			hyperthread = "/HT";
		else
			hyperthread = "";
		printf(" cpu (AP%s): APIC ID: %2d (disabled)\n", hyperthread,
		    i);
	}
}

/*
 * AP CPU's call this to initialize themselves.
 */
void
init_secondary(void)
{
	struct pcpu *pc;
	struct nmi_pcpu *np;
	u_int64_t msr, cr0;
	int cpu, gsel_tss, x;
	struct region_descriptor ap_gdt;

	/* Set by the startup code for us to use */
	cpu = bootAP;

	/* Init tss */
	common_tss[cpu] = common_tss[0];
	common_tss[cpu].tss_rsp0 = 0;	/* not used until after switch */
	common_tss[cpu].tss_iobase = sizeof(struct amd64tss) +
	    IOPAGES * PAGE_SIZE;
	common_tss[cpu].tss_ist1 = (long)&doublefault_stack[PAGE_SIZE];

	/* The NMI stack runs on IST2. */
	np = ((struct nmi_pcpu *) &nmi_stack[PAGE_SIZE]) - 1;
	common_tss[cpu].tss_ist2 = (long) np;

	/* Prepare private GDT */
	gdt_segs[GPROC0_SEL].ssd_base = (long) &common_tss[cpu];
	for (x = 0; x < NGDT; x++) {
		if (x != GPROC0_SEL && x != (GPROC0_SEL + 1) &&
		    x != GUSERLDT_SEL && x != (GUSERLDT_SEL + 1))
			ssdtosd(&gdt_segs[x], &gdt[NGDT * cpu + x]);
	}
	ssdtosyssd(&gdt_segs[GPROC0_SEL],
	    (struct system_segment_descriptor *)&gdt[NGDT * cpu + GPROC0_SEL]);
	ap_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
	ap_gdt.rd_base = (long) &gdt[NGDT * cpu];
	lgdt(&ap_gdt);			/* does magic intra-segment return */

	/* Get per-cpu data */
	pc = &__pcpu[cpu];

	/* prime data page for it to use */
	pcpu_init(pc, cpu, sizeof(struct pcpu));
	dpcpu_init(dpcpu, cpu);
	pc->pc_apic_id = cpu_apic_ids[cpu];
	pc->pc_prvspace = pc;
	pc->pc_curthread = 0;
	pc->pc_tssp = &common_tss[cpu];
	pc->pc_commontssp = &common_tss[cpu];
	pc->pc_rsp0 = 0;
	pc->pc_tss = (struct system_segment_descriptor *)&gdt[NGDT * cpu +
	    GPROC0_SEL];
	pc->pc_fs32p = &gdt[NGDT * cpu + GUFS32_SEL];
	pc->pc_gs32p = &gdt[NGDT * cpu + GUGS32_SEL];
	pc->pc_ldt = (struct system_segment_descriptor *)&gdt[NGDT * cpu +
	    GUSERLDT_SEL];

	/* Save the per-cpu pointer for use by the NMI handler. */
	np->np_pcpu = (register_t) pc;

	wrmsr(MSR_FSBASE, 0);		/* User value */
	wrmsr(MSR_GSBASE, (u_int64_t)pc);
	wrmsr(MSR_KGSBASE, (u_int64_t)pc);	/* XXX User value while we're in the kernel */

	lidt(&r_idt);

	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
	ltr(gsel_tss);

	/*
	 * Set to a known state:
	 * Set by mpboot.s:	CR0_PG, CR0_PE
	 * Set by cpu_setregs:	CR0_NE, CR0_MP, CR0_TS, CR0_WP, CR0_AM
	 */
	cr0 = rcr0();
	cr0 &= ~(CR0_CD | CR0_NW | CR0_EM);
	load_cr0(cr0);

	/* Set up the fast syscall stuff */
	msr = rdmsr(MSR_EFER) | EFER_SCE;
	wrmsr(MSR_EFER, msr);
	wrmsr(MSR_LSTAR, (u_int64_t)IDTVEC(fast_syscall));
	wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32));
	msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
	    ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48);
	wrmsr(MSR_STAR, msr);
	wrmsr(MSR_SF_MASK, PSL_NT|PSL_T|PSL_I|PSL_C|PSL_D);

	/* Disable local APIC just to be sure. */
	lapic_disable();

	/* signal our startup to the BSP. */
	mp_naps++;

	/* Spin until the BSP releases the AP's. */
	while (!aps_ready)
		ia32_pause();

	/* Initialize the PAT MSR. */
	pmap_init_pat();

	/* set up CPU registers and state */
	cpu_setregs();

	/* set up SSE/NX registers */
	initializecpu();

	/* set up FPU state on the AP */
	fpuinit();

	/* A quick check from sanity claus */
	if (PCPU_GET(apic_id) != lapic_id()) {
		printf("SMP: cpuid = %d\n", PCPU_GET(cpuid));
		printf("SMP: actual apic_id = %d\n", lapic_id());
		printf("SMP: correct apic_id = %d\n", PCPU_GET(apic_id));
		panic("cpuid mismatch! boom!!");
	}

	/* Initialize curthread. */
	KASSERT(PCPU_GET(idlethread) != NULL, ("no idle thread"));
	PCPU_SET(curthread, PCPU_GET(idlethread));

	mca_init();

	mtx_lock_spin(&ap_boot_mtx);

	/* Init local apic for irq's */
	lapic_setup(1);

	/* Set memory range attributes for this CPU to match the BSP */
	mem_range_AP_init();

	smp_cpus++;

	CTR1(KTR_SMP, "SMP: AP CPU #%d Launched", PCPU_GET(cpuid));
	printf("SMP: AP CPU #%d Launched!\n", PCPU_GET(cpuid));

	/* Determine if we are a logical CPU. */
	/* XXX Calculation depends on cpu_logical being a power of 2, e.g. 2 */
	if (cpu_logical > 1 && PCPU_GET(apic_id) % cpu_logical != 0)
		logical_cpus_mask |= PCPU_GET(cpumask);

	/* Determine if we are a hyperthread. */
	if (hyperthreading_cpus > 1 &&
	    PCPU_GET(apic_id) % hyperthreading_cpus != 0)
		hyperthreading_cpus_mask |= PCPU_GET(cpumask);

	/* Build our map of 'other' CPUs. */
	PCPU_SET(other_cpus, all_cpus & ~PCPU_GET(cpumask));

	if (bootverbose)
		lapic_dump("AP");

	if (smp_cpus == mp_ncpus) {
		/* enable IPI's, tlb shootdown, freezes etc */
		atomic_store_rel_int(&smp_started, 1);
		smp_active = 1;	 /* historic */
	}

	/*
	 * Enable global pages TLB extension
	 * This also implicitly flushes the TLB
	 */
	load_cr4(rcr4() | CR4_PGE);
	load_ds(_udatasel);
	load_es(_udatasel);
	load_fs(_ufssel);
	mtx_unlock_spin(&ap_boot_mtx);

	/* Wait until all the AP's are up. */
	while (smp_started == 0)
		ia32_pause();

	/* Start per-CPU event timers. */
	cpu_initclocks_ap();

	sched_throw(NULL);

	panic("scheduler returned us to %s", __func__);
	/* NOTREACHED */
}

/*******************************************************************
 * local functions and data
 */

/*
 * We tell the I/O APIC code about all the CPUs we want to receive
 * interrupts.
If we don't want certain CPUs to receive IRQs we
 * can simply not tell the I/O APIC code about them in this function.
 * We also do not tell it about the BSP since it tells itself about
 * the BSP internally to work with UP kernels and on UP machines.
 */
static void
set_interrupt_apic_ids(void)
{
	u_int i, apic_id;

	for (i = 0; i < MAXCPU; i++) {
		apic_id = cpu_apic_ids[i];
		if (apic_id == -1)
			continue;
		if (cpu_info[apic_id].cpu_bsp)
			continue;
		if (cpu_info[apic_id].cpu_disabled)
			continue;

		/* Don't let hyperthreads service interrupts. */
		if (hyperthreading_cpus > 1 &&
		    apic_id % hyperthreading_cpus != 0)
			continue;

		intr_add_cpu(i);
	}
}

/*
 * Assign logical CPU IDs to local APICs.
 */
static void
assign_cpu_ids(void)
{
	u_int i;

	TUNABLE_INT_FETCH("machdep.hyperthreading_allowed",
	    &hyperthreading_allowed);

	/* Check for explicitly disabled CPUs. */
	for (i = 0; i <= MAX_APIC_ID; i++) {
		if (!cpu_info[i].cpu_present || cpu_info[i].cpu_bsp)
			continue;

		if (hyperthreading_cpus > 1 && i % hyperthreading_cpus != 0) {
			cpu_info[i].cpu_hyperthread = 1;
#if defined(SCHED_ULE)
			/*
			 * Don't use HT CPU if it has been disabled by a
			 * tunable.
			 */
			if (hyperthreading_allowed == 0) {
				cpu_info[i].cpu_disabled = 1;
				continue;
			}
#endif
		}

		/* Don't use this CPU if it has been disabled by a tunable. */
		if (resource_disabled("lapic", i)) {
			cpu_info[i].cpu_disabled = 1;
			continue;
		}
	}

	/*
	 * Assign CPU IDs to local APIC IDs and disable any CPUs
	 * beyond MAXCPU.  CPU 0 is always assigned to the BSP.
	 *
	 * To minimize confusion for userland, we attempt to number
	 * CPUs such that all threads and cores in a package are
	 * grouped together.  For now we assume that the BSP is always
	 * the first thread in a package and just start adding APs
	 * starting with the BSP's APIC ID.
	 */
	mp_ncpus = 1;
	cpu_apic_ids[0] = boot_cpu_id;
	apic_cpuids[boot_cpu_id] = 0;
	/* Walk APIC IDs circularly, wrapping past MAX_APIC_ID to 0. */
	for (i = boot_cpu_id + 1; i != boot_cpu_id;
	     i == MAX_APIC_ID ? i = 0 : i++) {
		if (!cpu_info[i].cpu_present || cpu_info[i].cpu_bsp ||
		    cpu_info[i].cpu_disabled)
			continue;

		if (mp_ncpus < MAXCPU) {
			cpu_apic_ids[mp_ncpus] = i;
			apic_cpuids[i] = mp_ncpus;
			mp_ncpus++;
		} else
			cpu_info[i].cpu_disabled = 1;
	}
	KASSERT(mp_maxid >= mp_ncpus - 1,
	    ("%s: counters out of sync: max %d, count %d", __func__, mp_maxid,
	    mp_ncpus));
}

/*
 * start each AP in our list
 */
static int
start_all_aps(void)
{
	vm_offset_t va = boot_address + KERNBASE;
	u_int64_t *pt4, *pt3, *pt2;
	u_int32_t mpbioswarmvec;
	int apic_id, cpu, i;
	u_char mpbiosreason;

	mtx_init(&ap_boot_mtx, "ap boot", NULL, MTX_SPIN);

	/* install the AP 1st level boot code */
	pmap_kenter(va, boot_address);
	pmap_invalidate_page(kernel_pmap, va);
	bcopy(mptramp_start, (void *)va, bootMP_size);

	/* Locate the page tables, they'll be below the trampoline */
	pt4 = (u_int64_t *)(uintptr_t)(mptramp_pagetables + KERNBASE);
	pt3 = pt4 + (PAGE_SIZE) / sizeof(u_int64_t);
	pt2 = pt3 + (PAGE_SIZE) / sizeof(u_int64_t);

	/* Create the initial 1GB replicated page tables */
	for (i = 0; i < 512; i++) {
		/* Each slot of the level 4 pages points to the same level 3 page */
		pt4[i] = (u_int64_t)(uintptr_t)(mptramp_pagetables + PAGE_SIZE);
		pt4[i] |= PG_V | PG_RW | PG_U;

		/* Each slot of the level 3 pages points to the same level 2 page */
		pt3[i] = (u_int64_t)(uintptr_t)(mptramp_pagetables + (2 * PAGE_SIZE));
		pt3[i] |= PG_V | PG_RW | PG_U;

		/* The level 2 page slots are mapped with 2MB pages for 1GB. */
		pt2[i] = i * (2 * 1024 * 1024);
		pt2[i] |= PG_V | PG_RW | PG_PS | PG_U;
	}

	/* save the current value of the warm-start vector */
	mpbioswarmvec = *((u_int32_t *) WARMBOOT_OFF);
	outb(CMOS_REG, BIOS_RESET);
	mpbiosreason = inb(CMOS_DATA);

	/* setup a vector to our boot code */
	*((volatile u_short *) WARMBOOT_OFF) = WARMBOOT_TARGET;
	*((volatile u_short *) WARMBOOT_SEG) = (boot_address >> 4);
	outb(CMOS_REG, BIOS_RESET);
	outb(CMOS_DATA, BIOS_WARM);	/* 'warm-start' */

	/* start each AP */
	for (cpu = 1; cpu < mp_ncpus; cpu++) {
		apic_id = cpu_apic_ids[cpu];

		/* allocate and set up an idle stack data page */
		bootstacks[cpu] = (void *)kmem_alloc(kernel_map, KSTACK_PAGES * PAGE_SIZE);
		doublefault_stack = (char *)kmem_alloc(kernel_map, PAGE_SIZE);
		nmi_stack = (char *)kmem_alloc(kernel_map, PAGE_SIZE);
		dpcpu = (void *)kmem_alloc(kernel_map, DPCPU_SIZE);

		bootSTK = (char *)bootstacks[cpu] + KSTACK_PAGES * PAGE_SIZE - 8;
		bootAP = cpu;

		/* attempt to start the Application Processor */
		if (!start_ap(apic_id)) {
			/* restore the warmstart vector */
			*(u_int32_t *) WARMBOOT_OFF = mpbioswarmvec;
			panic("AP #%d (PHY# %d) failed!", cpu, apic_id);
		}

		all_cpus |= (1 << cpu);		/* record AP in CPU map */
	}

	/* build our map of 'other' CPUs */
	PCPU_SET(other_cpus, all_cpus & ~PCPU_GET(cpumask));

	/* restore the warmstart vector */
	*(u_int32_t *) WARMBOOT_OFF = mpbioswarmvec;

	outb(CMOS_REG, BIOS_RESET);
	outb(CMOS_DATA, mpbiosreason);

	/* number of APs actually started */
	return mp_naps;
}


/*
 * This function starts the AP (application processor) identified
 * by the APIC ID 'physicalCpu'.  It does quite a "song and dance"
 * to accomplish this.  This is necessary because of the nuances
 * of the different hardware we might encounter.  It isn't pretty,
 * but it seems to work.
 */
static int
start_ap(int apic_id)
{
	int vector, ms;
	int cpus;

	/* calculate the vector */
	vector = (boot_address >> 12) & 0xff;

	/* used as a watchpoint to signal AP startup */
	cpus = mp_naps;

	/*
	 * first we do an INIT/RESET IPI this INIT IPI might be run, reseting
	 * and running the target CPU. OR this INIT IPI might be latched (P5
	 * bug), CPU waiting for STARTUP IPI. OR this INIT IPI might be
	 * ignored.
	 */

	/* do an INIT IPI: assert RESET */
	lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
	    APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, apic_id);

	/* wait for pending status end */
	lapic_ipi_wait(-1);

	/* do an INIT IPI: deassert RESET */
	lapic_ipi_raw(APIC_DEST_ALLESELF | APIC_TRIGMOD_LEVEL |
	    APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, 0);

	/* wait for pending status end */
	DELAY(10000);		/* wait ~10mS */
	lapic_ipi_wait(-1);

	/*
	 * next we do a STARTUP IPI: the previous INIT IPI might still be
	 * latched, (P5 bug) this 1st STARTUP would then terminate
	 * immediately, and the previously started INIT IPI would continue. OR
	 * the previous INIT IPI has already run. and this STARTUP IPI will
	 * run. OR the previous INIT IPI was ignored. and this STARTUP IPI
	 * will run.
	 */

	/* do a STARTUP IPI */
	lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
	    APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP |
	    vector, apic_id);
	lapic_ipi_wait(-1);
	DELAY(200);		/* wait ~200uS */

	/*
	 * finally we do a 2nd STARTUP IPI: this 2nd STARTUP IPI should run IF
	 * the previous STARTUP IPI was cancelled by a latched INIT IPI. OR
	 * this STARTUP IPI will be ignored, as only ONE STARTUP IPI is
	 * recognized after hardware RESET or INIT IPI.
	 */

	lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
	    APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP |
	    vector, apic_id);
	lapic_ipi_wait(-1);
	DELAY(200);		/* wait ~200uS */

	/* Wait up to 5 seconds for it to start. */
	for (ms = 0; ms < 5000; ms++) {
		/* init_secondary() bumps mp_naps once the AP is alive. */
		if (mp_naps > cpus)
			return 1;	/* return SUCCESS */
		DELAY(1000);
	}
	return 0;		/* return FAILURE */
}

#ifdef COUNT_XINVLTLB_HITS
u_int xhits_gbl[MAXCPU];
u_int xhits_pg[MAXCPU];
u_int xhits_rng[MAXCPU];
SYSCTL_NODE(_debug, OID_AUTO, xhits, CTLFLAG_RW, 0, "");
SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, global, CTLFLAG_RW, &xhits_gbl,
    sizeof(xhits_gbl), "IU", "");
SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, page, CTLFLAG_RW, &xhits_pg,
    sizeof(xhits_pg), "IU", "");
SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, range, CTLFLAG_RW, &xhits_rng,
    sizeof(xhits_rng), "IU", "");

u_int ipi_global;
u_int ipi_page;
u_int ipi_range;
u_int ipi_range_size;
SYSCTL_UINT(_debug_xhits, OID_AUTO, ipi_global, CTLFLAG_RW, &ipi_global, 0, "");
SYSCTL_UINT(_debug_xhits, OID_AUTO, ipi_page, CTLFLAG_RW, &ipi_page, 0, "");
SYSCTL_UINT(_debug_xhits, OID_AUTO, ipi_range, CTLFLAG_RW, &ipi_range, 0, "");
SYSCTL_UINT(_debug_xhits, OID_AUTO, ipi_range_size, CTLFLAG_RW,
    &ipi_range_size, 0, "");

u_int ipi_masked_global;
u_int ipi_masked_page;
u_int ipi_masked_range;
u_int ipi_masked_range_size;
SYSCTL_UINT(_debug_xhits, OID_AUTO, ipi_masked_global, CTLFLAG_RW,
    &ipi_masked_global, 0, "");
SYSCTL_UINT(_debug_xhits, OID_AUTO, ipi_masked_page, CTLFLAG_RW,
    &ipi_masked_page, 0, "");
SYSCTL_UINT(_debug_xhits, OID_AUTO, ipi_masked_range, CTLFLAG_RW,
    &ipi_masked_range, 0, "");
SYSCTL_UINT(_debug_xhits, OID_AUTO, ipi_masked_range_size, CTLFLAG_RW,
    &ipi_masked_range_size, 0, "");
#endif /* COUNT_XINVLTLB_HITS */

/*
 * Flush the TLB
 * on all other CPU's
 *
 * The shootdown arguments are published in smp_tlb_addr1/smp_tlb_addr2
 * under smp_ipi_mtx, then "vector" is sent to every other CPU and the
 * caller spins until smp_tlb_wait reaches the number of other CPUs.
 * (Presumably the IPI handlers increment smp_tlb_wait; they are not
 * visible in this part of the file.)  Must be called with interrupts
 * enabled; panics otherwise.
 */
static void
smp_tlb_shootdown(u_int vector, vm_offset_t addr1, vm_offset_t addr2)
{
	u_int ncpu;

	ncpu = mp_ncpus - 1;	/* does not shootdown self */
	if (ncpu < 1)
		return;		/* no other cpus */
	if (!(read_rflags() & PSL_I))
		panic("%s: interrupts disabled", __func__);
	mtx_lock_spin(&smp_ipi_mtx);
	smp_tlb_addr1 = addr1;
	smp_tlb_addr2 = addr2;
	atomic_store_rel_int(&smp_tlb_wait, 0);
	ipi_all_but_self(vector);
	/* Wait until every other CPU has acknowledged the IPI. */
	while (smp_tlb_wait < ncpu)
		ia32_pause();
	mtx_unlock_spin(&smp_ipi_mtx);
}

/*
 * Targeted variant of smp_tlb_shootdown(): same protocol, but limited
 * to the CPUs in "mask".  A mask of (cpumask_t)-1 means all CPUs but
 * the caller; otherwise the caller's own bit is stripped from the mask.
 */
static void
smp_targeted_tlb_shootdown(cpumask_t mask, u_int vector, vm_offset_t addr1, vm_offset_t addr2)
{
	int ncpu, othercpus;

	othercpus = mp_ncpus - 1;
	if (mask == (cpumask_t)-1) {
		ncpu = othercpus;
		if (ncpu < 1)
			return;
	} else {
		mask &= ~PCPU_GET(cpumask);
		if (mask == 0)
			return;
		ncpu = bitcount32(mask);
		if (ncpu > othercpus) {
			/* XXX this should be a panic offence */
			printf("SMP: tlb shootdown to %d other cpus (only have %d)\n",
			    ncpu, othercpus);
			ncpu = othercpus;
		}
		/* XXX should be a panic, implied by mask == 0 above */
		if (ncpu < 1)
			return;
	}
	if (!(read_rflags() & PSL_I))
		panic("%s: interrupts disabled", __func__);
	mtx_lock_spin(&smp_ipi_mtx);
	smp_tlb_addr1 = addr1;
	smp_tlb_addr2 = addr2;
	atomic_store_rel_int(&smp_tlb_wait, 0);
	if (mask == (cpumask_t)-1)
		ipi_all_but_self(vector);
	else
		ipi_selected(mask, vector);
	/* Wait until every targeted CPU has acknowledged the IPI. */
	while (smp_tlb_wait < ncpu)
		ia32_pause();
	mtx_unlock_spin(&smp_ipi_mtx);
}

/*
 * Send an IPI to specified CPU handling the bitmap logic.
 */
static void
ipi_send_cpu(int cpu, u_int ipi)
{
	u_int bitmap, old_pending, new_pending;

	KASSERT(cpu_apic_ids[cpu] != -1, ("IPI to non-existent CPU %d", cpu));

	if (IPI_IS_BITMAPED(ipi)) {
		/*
		 * Bitmapped IPIs share one vector: record the request
		 * in the target CPU's pending word and only raise
		 * IPI_BITMAP_VECTOR if nothing was pending yet, so
		 * back-to-back requests are coalesced into one IPI.
		 */
		bitmap = 1 << ipi;
		ipi = IPI_BITMAP_VECTOR;
		do {
			old_pending = cpu_ipi_pending[cpu];
			new_pending = old_pending | bitmap;
		} while (!atomic_cmpset_int(&cpu_ipi_pending[cpu],
		    old_pending, new_pending));
		if (old_pending)
			return;
	}
	lapic_ipi_vectored(ipi, cpu_apic_ids[cpu]);
}

/* Flush caches on all other CPUs (IPI_INVLCACHE). */
void
smp_cache_flush(void)
{

	if (smp_started)
		smp_tlb_shootdown(IPI_INVLCACHE, 0, 0);
}

/* Invalidate the whole TLB on all other CPUs. */
void
smp_invltlb(void)
{

	if (smp_started) {
		smp_tlb_shootdown(IPI_INVLTLB, 0, 0);
#ifdef COUNT_XINVLTLB_HITS
		ipi_global++;
#endif
	}
}

/* Invalidate a single page mapping on all other CPUs. */
void
smp_invlpg(vm_offset_t addr)
{

	if (smp_started) {
		smp_tlb_shootdown(IPI_INVLPG, addr, 0);
#ifdef COUNT_XINVLTLB_HITS
		ipi_page++;
#endif
	}
}

/* Invalidate the range [addr1, addr2) on all other CPUs. */
void
smp_invlpg_range(vm_offset_t addr1, vm_offset_t addr2)
{

	if (smp_started) {
		smp_tlb_shootdown(IPI_INVLRNG, addr1, addr2);
#ifdef COUNT_XINVLTLB_HITS
		ipi_range++;
		ipi_range_size += (addr2 - addr1) / PAGE_SIZE;
#endif
	}
}

/* Invalidate the whole TLB on the CPUs in "mask". */
void
smp_masked_invltlb(cpumask_t mask)
{

	if (smp_started) {
		smp_targeted_tlb_shootdown(mask, IPI_INVLTLB, 0, 0);
#ifdef COUNT_XINVLTLB_HITS
		ipi_masked_global++;
#endif
	}
}

/* Invalidate a single page mapping on the CPUs in "mask". */
void
smp_masked_invlpg(cpumask_t mask, vm_offset_t addr)
{

	if (smp_started) {
		smp_targeted_tlb_shootdown(mask, IPI_INVLPG, addr, 0);
#ifdef COUNT_XINVLTLB_HITS
		ipi_masked_page++;
#endif
	}
}

/* Invalidate the range [addr1, addr2) on the CPUs in "mask". */
void
smp_masked_invlpg_range(cpumask_t mask, vm_offset_t addr1, vm_offset_t addr2)
{

	if (smp_started) {
		smp_targeted_tlb_shootdown(mask, IPI_INVLRNG, addr1, addr2);
#ifdef COUNT_XINVLTLB_HITS
		ipi_masked_range++;
		ipi_masked_range_size += (addr2 - addr1) / PAGE_SIZE;
#endif
	}
}

/*
 * Handler for the coalesced "bitmap" IPIs: atomically read and clear
 * this CPU's pending word and run every service whose bit was set.
 * The trap frame is published in td_intr_frame for the duration so
 * interrupt-level consumers (e.g. the scheduler/clock paths invoked
 * below) can see it.
 */
void
ipi_bitmap_handler(struct trapframe frame)
{
	struct trapframe *oldframe;
	struct thread *td;
	int cpu = PCPU_GET(cpuid);
	u_int ipi_bitmap;

	critical_enter();
	td = curthread;
	td->td_intr_nesting_level++;
	oldframe = td->td_intr_frame;
	td->td_intr_frame = &frame;
	ipi_bitmap = atomic_readandclear_int(&cpu_ipi_pending[cpu]);
	if (ipi_bitmap & (1 << IPI_PREEMPT)) {
#ifdef COUNT_IPIS
		(*ipi_preempt_counts[cpu])++;
#endif
		sched_preempt(td);
	}
	if (ipi_bitmap & (1 << IPI_AST)) {
#ifdef COUNT_IPIS
		(*ipi_ast_counts[cpu])++;
#endif
		/* Nothing to do for AST */
	}
	if (ipi_bitmap & (1 << IPI_HARDCLOCK)) {
#ifdef COUNT_IPIS
		(*ipi_hardclock_counts[cpu])++;
#endif
		hardclockintr();
	}
	td->td_intr_frame = oldframe;
	td->td_intr_nesting_level--;
	critical_exit();
}

/*
 * send an IPI to a set of cpus.
 */
void
ipi_selected(cpumask_t cpus, u_int ipi)
{
	int cpu;

	/*
	 * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit
	 * of help in order to understand what is the source.
	 * Set the mask of receiving CPUs for this purpose.
	 */
	if (ipi == IPI_STOP_HARD)
		atomic_set_int(&ipi_nmi_pending, cpus);

	CTR3(KTR_SMP, "%s: cpus: %x ipi: %x", __func__, cpus, ipi);
	/* Deliver to each CPU whose bit is set, lowest bit first. */
	while ((cpu = ffs(cpus)) != 0) {
		cpu--;
		cpus &= ~(1 << cpu);
		ipi_send_cpu(cpu, ipi);
	}
}

/*
 * send an IPI to a specific CPU.
 */
void
ipi_cpu(int cpu, u_int ipi)
{

	/*
	 * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit
	 * of help in order to understand what is the source.
	 * Set the mask of receiving CPUs for this purpose.
1334 */ 1335 if (ipi == IPI_STOP_HARD) 1336 atomic_set_int(&ipi_nmi_pending, 1 << cpu); 1337 1338 CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi); 1339 ipi_send_cpu(cpu, ipi); 1340} 1341 1342/* 1343 * send an IPI to all CPUs EXCEPT myself 1344 */ 1345void 1346ipi_all_but_self(u_int ipi) 1347{ 1348 1349 if (IPI_IS_BITMAPED(ipi)) { 1350 ipi_selected(PCPU_GET(other_cpus), ipi); 1351 return; 1352 } 1353 1354 /* 1355 * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit 1356 * of help in order to understand what is the source. 1357 * Set the mask of receiving CPUs for this purpose. 1358 */ 1359 if (ipi == IPI_STOP_HARD) 1360 atomic_set_int(&ipi_nmi_pending, PCPU_GET(other_cpus)); 1361 1362 CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi); 1363 lapic_ipi_vectored(ipi, APIC_IPI_DEST_OTHERS); 1364} 1365 1366int 1367ipi_nmi_handler() 1368{ 1369 cpumask_t cpumask; 1370 1371 /* 1372 * As long as there is not a simple way to know about a NMI's 1373 * source, if the bitmask for the current CPU is present in 1374 * the global pending bitword an IPI_STOP_HARD has been issued 1375 * and should be handled. 1376 */ 1377 cpumask = PCPU_GET(cpumask); 1378 if ((ipi_nmi_pending & cpumask) == 0) 1379 return (1); 1380 1381 atomic_clear_int(&ipi_nmi_pending, cpumask); 1382 cpustop_handler(); 1383 return (0); 1384} 1385 1386/* 1387 * Handle an IPI_STOP by saving our current context and spinning until we 1388 * are resumed. 
1389 */ 1390void 1391cpustop_handler(void) 1392{ 1393 cpumask_t cpumask; 1394 u_int cpu; 1395 1396 cpu = PCPU_GET(cpuid); 1397 cpumask = PCPU_GET(cpumask); 1398 1399 savectx(&stoppcbs[cpu]); 1400 1401 /* Indicate that we are stopped */ 1402 atomic_set_int(&stopped_cpus, cpumask); 1403 1404 /* Wait for restart */ 1405 while (!(started_cpus & cpumask)) 1406 ia32_pause(); 1407 1408 atomic_clear_int(&started_cpus, cpumask); 1409 atomic_clear_int(&stopped_cpus, cpumask); 1410 1411 if (cpu == 0 && cpustop_restartfunc != NULL) { 1412 cpustop_restartfunc(); 1413 cpustop_restartfunc = NULL; 1414 } 1415} 1416 1417/* 1418 * Handle an IPI_SUSPEND by saving our current context and spinning until we 1419 * are resumed. 1420 */ 1421void 1422cpususpend_handler(void) 1423{ 1424 cpumask_t cpumask; 1425 register_t cr3, rf; 1426 u_int cpu; 1427 1428 cpu = PCPU_GET(cpuid); 1429 cpumask = PCPU_GET(cpumask); 1430 1431 rf = intr_disable(); 1432 cr3 = rcr3(); 1433 1434 if (savectx(susppcbs[cpu])) { 1435 wbinvd(); 1436 atomic_set_int(&stopped_cpus, cpumask); 1437 } else { 1438 pmap_init_pat(); 1439 PCPU_SET(switchtime, 0); 1440 PCPU_SET(switchticks, ticks); 1441 } 1442 1443 /* Wait for resume */ 1444 while (!(started_cpus & cpumask)) 1445 ia32_pause(); 1446 1447 atomic_clear_int(&started_cpus, cpumask); 1448 atomic_clear_int(&stopped_cpus, cpumask); 1449 1450 /* Restore CR3 and enable interrupts */ 1451 load_cr3(cr3); 1452 mca_resume(); 1453 lapic_setup(0); 1454 intr_restore(rf); 1455} 1456 1457/* 1458 * This is called once the rest of the system is up and running and we're 1459 * ready to let the AP's out of the pen. 
 */
static void
release_aps(void *dummy __unused)
{

	if (mp_ncpus == 1)
		return;
	atomic_store_rel_int(&aps_ready, 1);
	/* Spin until the APs come up and set smp_started. */
	while (smp_started == 0)
		ia32_pause();
}
SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, release_aps, NULL);

/*
 * Sysctl handler for machdep.hlt_cpus: install a new mask of CPUs to
 * halt, keeping hlt_logical_cpus in sync and never letting every CPU
 * (in particular CPU 0) be halted at once.
 */
static int
sysctl_hlt_cpus(SYSCTL_HANDLER_ARGS)
{
	cpumask_t mask;
	int error;

	mask = hlt_cpus_mask;
	error = sysctl_handle_int(oidp, &mask, 0, req);
	if (error || !req->newptr)
		return (error);

	/* Track whether all logical (HT) CPUs are being halted. */
	if (logical_cpus_mask != 0 &&
	    (mask & logical_cpus_mask) == logical_cpus_mask)
		hlt_logical_cpus = 1;
	else
		hlt_logical_cpus = 0;

	if (! hyperthreading_allowed)
		mask |= hyperthreading_cpus_mask;

	/* Never halt every CPU; always keep CPU 0 running. */
	if ((mask & all_cpus) == all_cpus)
		mask &= ~(1<<0);
	hlt_cpus_mask = mask;
	return (error);
}
SYSCTL_PROC(_machdep, OID_AUTO, hlt_cpus, CTLTYPE_INT|CTLFLAG_RW,
    0, 0, sysctl_hlt_cpus, "IU",
    "Bitmap of CPUs to halt. 101 (binary) will halt CPUs 0 and 2.");

/*
 * Sysctl handler for machdep.hlt_logical_cpus: halt (1) or un-halt (0)
 * all logical CPUs, subject to the hyperthreading_allowed policy and
 * the keep-CPU-0-running rule.
 */
static int
sysctl_hlt_logical_cpus(SYSCTL_HANDLER_ARGS)
{
	int disable, error;

	disable = hlt_logical_cpus;
	error = sysctl_handle_int(oidp, &disable, 0, req);
	if (error || !req->newptr)
		return (error);

	if (disable)
		hlt_cpus_mask |= logical_cpus_mask;
	else
		hlt_cpus_mask &= ~logical_cpus_mask;

	if (! hyperthreading_allowed)
		hlt_cpus_mask |= hyperthreading_cpus_mask;

	/* Never halt every CPU; always keep CPU 0 running. */
	if ((hlt_cpus_mask & all_cpus) == all_cpus)
		hlt_cpus_mask &= ~(1<<0);

	hlt_logical_cpus = disable;
	return (error);
}

/*
 * Sysctl handler for machdep.hyperthreading_allowed: enable or disable
 * use of the hyperthreading sibling CPUs by adjusting hlt_cpus_mask.
 * Rejected at run-time under SCHED_ULE.
 */
static int
sysctl_hyperthreading_allowed(SYSCTL_HANDLER_ARGS)
{
	int allowed, error;

	allowed = hyperthreading_allowed;
	error = sysctl_handle_int(oidp, &allowed, 0, req);
	if (error || !req->newptr)
		return (error);

#ifdef SCHED_ULE
	/*
	 * SCHED_ULE doesn't allow enabling/disabling HT cores at
	 * run-time.
	 */
	if (allowed != hyperthreading_allowed)
		return (ENOTSUP);
	return (error);
#endif

	if (allowed)
		hlt_cpus_mask &= ~hyperthreading_cpus_mask;
	else
		hlt_cpus_mask |= hyperthreading_cpus_mask;

	/* Keep hlt_logical_cpus consistent with the new mask. */
	if (logical_cpus_mask != 0 &&
	    (hlt_cpus_mask & logical_cpus_mask) == logical_cpus_mask)
		hlt_logical_cpus = 1;
	else
		hlt_logical_cpus = 0;

	/* Never halt every CPU; always keep CPU 0 running. */
	if ((hlt_cpus_mask & all_cpus) == all_cpus)
		hlt_cpus_mask &= ~(1<<0);

	hyperthreading_allowed = allowed;
	return (error);
}

/*
 * Late SMP setup: register the logical-CPU sysctls and apply the
 * machdep.hlt_logical_cpus tunable and the hyperthreading policy to
 * hlt_cpus_mask.  Only does anything when logical CPUs exist.
 */
static void
cpu_hlt_setup(void *dummy __unused)
{

	if (logical_cpus_mask != 0) {
		TUNABLE_INT_FETCH("machdep.hlt_logical_cpus",
		    &hlt_logical_cpus);
		sysctl_ctx_init(&logical_cpu_clist);
		SYSCTL_ADD_PROC(&logical_cpu_clist,
		    SYSCTL_STATIC_CHILDREN(_machdep), OID_AUTO,
		    "hlt_logical_cpus", CTLTYPE_INT|CTLFLAG_RW, 0, 0,
		    sysctl_hlt_logical_cpus, "IU", "");
		SYSCTL_ADD_UINT(&logical_cpu_clist,
		    SYSCTL_STATIC_CHILDREN(_machdep), OID_AUTO,
		    "logical_cpus_mask", CTLTYPE_INT|CTLFLAG_RD,
		    &logical_cpus_mask, 0, "");

		if (hlt_logical_cpus)
			hlt_cpus_mask |= logical_cpus_mask;

		/*
		 * If necessary for security purposes, force
		 * hyperthreading off, regardless of the value
		 * of hlt_logical_cpus.
		 */
		if (hyperthreading_cpus_mask) {
			SYSCTL_ADD_PROC(&logical_cpu_clist,
			    SYSCTL_STATIC_CHILDREN(_machdep), OID_AUTO,
			    "hyperthreading_allowed", CTLTYPE_INT|CTLFLAG_RW,
			    0, 0, sysctl_hyperthreading_allowed, "IU", "");
			if (! hyperthreading_allowed)
				hlt_cpus_mask |= hyperthreading_cpus_mask;
		}
	}
}
SYSINIT(cpu_hlt, SI_SUB_SMP, SI_ORDER_ANY, cpu_hlt_setup, NULL);

/*
 * Halt this CPU ("sti; hlt") for as long as its bit stays in
 * hlt_cpus_mask.  Returns 1 if the CPU halted at least once,
 * 0 otherwise.
 */
int
mp_grab_cpu_hlt(void)
{
	cpumask_t mask;
#ifdef MP_WATCHDOG
	u_int cpuid;
#endif
	int retval;

	mask = PCPU_GET(cpumask);
#ifdef MP_WATCHDOG
	cpuid = PCPU_GET(cpuid);
	ap_watchdog(cpuid);
#endif

	retval = 0;
	while (mask & hlt_cpus_mask) {
		retval = 1;
		__asm __volatile("sti; hlt" : : : "memory");
	}
	return (retval);
}

#ifdef COUNT_IPIS
/*
 * Setup interrupt counters for IPI handlers.
 */
static void
mp_ipi_intrcnt(void *dummy)
{
	char buf[64];
	int i;

	/* One named counter per IPI type per CPU. */
	CPU_FOREACH(i) {
		snprintf(buf, sizeof(buf), "cpu%d:invltlb", i);
		intrcnt_add(buf, &ipi_invltlb_counts[i]);
		snprintf(buf, sizeof(buf), "cpu%d:invlrng", i);
		intrcnt_add(buf, &ipi_invlrng_counts[i]);
		snprintf(buf, sizeof(buf), "cpu%d:invlpg", i);
		intrcnt_add(buf, &ipi_invlpg_counts[i]);
		snprintf(buf, sizeof(buf), "cpu%d:preempt", i);
		intrcnt_add(buf, &ipi_preempt_counts[i]);
		snprintf(buf, sizeof(buf), "cpu%d:ast", i);
		intrcnt_add(buf, &ipi_ast_counts[i]);
		snprintf(buf, sizeof(buf), "cpu%d:rendezvous", i);
		intrcnt_add(buf, &ipi_rendezvous_counts[i]);
		snprintf(buf, sizeof(buf), "cpu%d:hardclock", i);
		intrcnt_add(buf, &ipi_hardclock_counts[i]);
	}
}
SYSINIT(mp_ipi_intrcnt, SI_SUB_INTR, SI_ORDER_MIDDLE, mp_ipi_intrcnt, NULL);
#endif