mp_machdep.c revision 233704
1/*- 2 * Copyright (c) 1996, by Steve Passe 3 * Copyright (c) 2003, by Peter Wemm 4 * All rights reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 2. The name of the developer may NOT be used to endorse or promote products 12 * derived from this software without specific prior written permission. 13 * 14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 24 * SUCH DAMAGE. 25 */ 26 27#include <sys/cdefs.h> 28__FBSDID("$FreeBSD: head/sys/amd64/amd64/mp_machdep.c 233704 2012-03-30 17:03:06Z jkim $"); 29 30#include "opt_cpu.h" 31#include "opt_kstack_pages.h" 32#include "opt_sched.h" 33#include "opt_smp.h" 34 35#include <sys/param.h> 36#include <sys/systm.h> 37#include <sys/bus.h> 38#include <sys/cpuset.h> 39#ifdef GPROF 40#include <sys/gmon.h> 41#endif 42#include <sys/kernel.h> 43#include <sys/ktr.h> 44#include <sys/lock.h> 45#include <sys/malloc.h> 46#include <sys/memrange.h> 47#include <sys/mutex.h> 48#include <sys/pcpu.h> 49#include <sys/proc.h> 50#include <sys/sched.h> 51#include <sys/smp.h> 52#include <sys/sysctl.h> 53 54#include <vm/vm.h> 55#include <vm/vm_param.h> 56#include <vm/pmap.h> 57#include <vm/vm_kern.h> 58#include <vm/vm_extern.h> 59 60#include <x86/apicreg.h> 61#include <machine/clock.h> 62#include <machine/cputypes.h> 63#include <machine/cpufunc.h> 64#include <x86/mca.h> 65#include <machine/md_var.h> 66#include <machine/pcb.h> 67#include <machine/psl.h> 68#include <machine/smp.h> 69#include <machine/specialreg.h> 70#include <machine/tss.h> 71 72#define WARMBOOT_TARGET 0 73#define WARMBOOT_OFF (KERNBASE + 0x0467) 74#define WARMBOOT_SEG (KERNBASE + 0x0469) 75 76#define CMOS_REG (0x70) 77#define CMOS_DATA (0x71) 78#define BIOS_RESET (0x0f) 79#define BIOS_WARM (0x0a) 80 81/* lock region used by kernel profiling */ 82int mcount_lock; 83 84int mp_naps; /* # of Applications processors */ 85int boot_cpu_id = -1; /* designated BSP */ 86 87extern struct pcpu __pcpu[]; 88 89/* AP uses this during bootstrap. Do not staticize. */ 90char *bootSTK; 91static int bootAP; 92 93/* Free these after use */ 94void *bootstacks[MAXCPU]; 95 96/* Temporary variables for init_secondary() */ 97char *doublefault_stack; 98char *nmi_stack; 99void *dpcpu; 100 101struct pcb stoppcbs[MAXCPU]; 102struct pcb **susppcbs; 103void **suspfpusave; 104 105/* Variables needed for SMP tlb shootdown. */ 106vm_offset_t smp_tlb_addr1; 107vm_offset_t smp_tlb_addr2; 108volatile int smp_tlb_wait; 109 110#ifdef COUNT_IPIS 111/* Interrupt counts. */ 112static u_long *ipi_preempt_counts[MAXCPU]; 113static u_long *ipi_ast_counts[MAXCPU]; 114u_long *ipi_invltlb_counts[MAXCPU]; 115u_long *ipi_invlrng_counts[MAXCPU]; 116u_long *ipi_invlpg_counts[MAXCPU]; 117u_long *ipi_invlcache_counts[MAXCPU]; 118u_long *ipi_rendezvous_counts[MAXCPU]; 119static u_long *ipi_hardclock_counts[MAXCPU]; 120#endif 121 122extern inthand_t IDTVEC(fast_syscall), IDTVEC(fast_syscall32); 123 124/* 125 * Local data and functions. 126 */ 127 128static volatile cpuset_t ipi_nmi_pending; 129 130/* used to hold the AP's until we are ready to release them */ 131static struct mtx ap_boot_mtx; 132 133/* Set to 1 once we're ready to let the APs out of the pen. */ 134static volatile int aps_ready = 0; 135 136/* 137 * Store data from cpu_add() until later in the boot when we actually setup 138 * the APs. 139 */ 140struct cpu_info { 141 int cpu_present:1; 142 int cpu_bsp:1; 143 int cpu_disabled:1; 144 int cpu_hyperthread:1; 145} static cpu_info[MAX_APIC_ID + 1]; 146int cpu_apic_ids[MAXCPU]; 147int apic_cpuids[MAX_APIC_ID + 1]; 148 149/* Holds pending bitmap based IPIs per CPU */ 150static volatile u_int cpu_ipi_pending[MAXCPU]; 151 152static u_int boot_address; 153static int cpu_logical; /* logical cpus per core */ 154static int cpu_cores; /* cores per package */ 155 156static void assign_cpu_ids(void); 157static void set_interrupt_apic_ids(void); 158static int start_all_aps(void); 159static int start_ap(int apic_id); 160static void release_aps(void *dummy); 161 162static u_int hyperthreading_cpus; /* logical cpus sharing L1 cache */ 163static int hyperthreading_allowed = 1; 164static u_int bootMP_size; 165 166static void 167mem_range_AP_init(void) 168{ 169 if (mem_range_softc.mr_op && mem_range_softc.mr_op->initAP) 170 mem_range_softc.mr_op->initAP(&mem_range_softc); 171} 172 173static void 174topo_probe_amd(void) 175{ 176 int core_id_bits; 177 int id; 178 179 /* AMD processors do not support HTT. */ 180 cpu_logical = 1; 181 182 if ((amd_feature2 & AMDID2_CMP) == 0) { 183 cpu_cores = 1; 184 return; 185 } 186 187 core_id_bits = (cpu_procinfo2 & AMDID_COREID_SIZE) >> 188 AMDID_COREID_SIZE_SHIFT; 189 if (core_id_bits == 0) { 190 cpu_cores = (cpu_procinfo2 & AMDID_CMP_CORES) + 1; 191 return; 192 } 193 194 /* Fam 10h and newer should get here. */ 195 for (id = 0; id <= MAX_APIC_ID; id++) { 196 /* Check logical CPU availability. */ 197 if (!cpu_info[id].cpu_present || cpu_info[id].cpu_disabled) 198 continue; 199 /* Check if logical CPU has the same package ID. */ 200 if ((id >> core_id_bits) != (boot_cpu_id >> core_id_bits)) 201 continue; 202 cpu_cores++; 203 } 204} 205 206/* 207 * Round up to the next power of two, if necessary, and then 208 * take log2. 209 * Returns -1 if argument is zero. 210 */ 211static __inline int 212mask_width(u_int x) 213{ 214 215 return (fls(x << (1 - powerof2(x))) - 1); 216} 217 218static void 219topo_probe_0x4(void) 220{ 221 u_int p[4]; 222 int pkg_id_bits; 223 int core_id_bits; 224 int max_cores; 225 int max_logical; 226 int id; 227 228 /* Both zero and one here mean one logical processor per package. */ 229 max_logical = (cpu_feature & CPUID_HTT) != 0 ? 230 (cpu_procinfo & CPUID_HTT_CORES) >> 16 : 1; 231 if (max_logical <= 1) 232 return; 233 234 /* 235 * Because of uniformity assumption we examine only 236 * those logical processors that belong to the same 237 * package as BSP. Further, we count number of 238 * logical processors that belong to the same core 239 * as BSP thus deducing number of threads per core. 240 */ 241 if (cpu_high >= 0x4) { 242 cpuid_count(0x04, 0, p); 243 max_cores = ((p[0] >> 26) & 0x3f) + 1; 244 } else 245 max_cores = 1; 246 core_id_bits = mask_width(max_logical/max_cores); 247 if (core_id_bits < 0) 248 return; 249 pkg_id_bits = core_id_bits + mask_width(max_cores); 250 251 for (id = 0; id <= MAX_APIC_ID; id++) { 252 /* Check logical CPU availability. */ 253 if (!cpu_info[id].cpu_present || cpu_info[id].cpu_disabled) 254 continue; 255 /* Check if logical CPU has the same package ID. */ 256 if ((id >> pkg_id_bits) != (boot_cpu_id >> pkg_id_bits)) 257 continue; 258 cpu_cores++; 259 /* Check if logical CPU has the same package and core IDs. */ 260 if ((id >> core_id_bits) == (boot_cpu_id >> core_id_bits)) 261 cpu_logical++; 262 } 263 264 KASSERT(cpu_cores >= 1 && cpu_logical >= 1, 265 ("topo_probe_0x4 couldn't find BSP")); 266 267 cpu_cores /= cpu_logical; 268 hyperthreading_cpus = cpu_logical; 269} 270 271static void 272topo_probe_0xb(void) 273{ 274 u_int p[4]; 275 int bits; 276 int cnt; 277 int i; 278 int logical; 279 int type; 280 int x; 281 282 /* We only support three levels for now. */ 283 for (i = 0; i < 3; i++) { 284 cpuid_count(0x0b, i, p); 285 286 /* Fall back if CPU leaf 11 doesn't really exist. */ 287 if (i == 0 && p[1] == 0) { 288 topo_probe_0x4(); 289 return; 290 } 291 292 bits = p[0] & 0x1f; 293 logical = p[1] &= 0xffff; 294 type = (p[2] >> 8) & 0xff; 295 if (type == 0 || logical == 0) 296 break; 297 /* 298 * Because of uniformity assumption we examine only 299 * those logical processors that belong to the same 300 * package as BSP. 301 */ 302 for (cnt = 0, x = 0; x <= MAX_APIC_ID; x++) { 303 if (!cpu_info[x].cpu_present || 304 cpu_info[x].cpu_disabled) 305 continue; 306 if (x >> bits == boot_cpu_id >> bits) 307 cnt++; 308 } 309 if (type == CPUID_TYPE_SMT) 310 cpu_logical = cnt; 311 else if (type == CPUID_TYPE_CORE) 312 cpu_cores = cnt; 313 } 314 if (cpu_logical == 0) 315 cpu_logical = 1; 316 cpu_cores /= cpu_logical; 317} 318 319/* 320 * Both topology discovery code and code that consumes topology 321 * information assume top-down uniformity of the topology. 322 * That is, all physical packages must be identical and each 323 * core in a package must have the same number of threads. 324 * Topology information is queried only on BSP, on which this 325 * code runs and for which it can query CPUID information. 326 * Then topology is extrapolated on all packages using the 327 * uniformity assumption. 328 */ 329static void 330topo_probe(void) 331{ 332 static int cpu_topo_probed = 0; 333 334 if (cpu_topo_probed) 335 return; 336 337 CPU_ZERO(&logical_cpus_mask); 338 if (mp_ncpus <= 1) 339 cpu_cores = cpu_logical = 1; 340 else if (cpu_vendor_id == CPU_VENDOR_AMD) 341 topo_probe_amd(); 342 else if (cpu_vendor_id == CPU_VENDOR_INTEL) { 343 /* 344 * See Intel(R) 64 Architecture Processor 345 * Topology Enumeration article for details. 346 * 347 * Note that 0x1 <= cpu_high < 4 case should be 348 * compatible with topo_probe_0x4() logic when 349 * CPUID.1:EBX[23:16] > 0 (cpu_cores will be 1) 350 * or it should trigger the fallback otherwise. 351 */ 352 if (cpu_high >= 0xb) 353 topo_probe_0xb(); 354 else if (cpu_high >= 0x1) 355 topo_probe_0x4(); 356 } 357 358 /* 359 * Fallback: assume each logical CPU is in separate 360 * physical package. That is, no multi-core, no SMT. 361 */ 362 if (cpu_cores == 0 || cpu_logical == 0) 363 cpu_cores = cpu_logical = 1; 364 cpu_topo_probed = 1; 365} 366 367struct cpu_group * 368cpu_topo(void) 369{ 370 int cg_flags; 371 372 /* 373 * Determine whether any threading flags are 374 * necessry. 375 */ 376 topo_probe(); 377 if (cpu_logical > 1 && hyperthreading_cpus) 378 cg_flags = CG_FLAG_HTT; 379 else if (cpu_logical > 1) 380 cg_flags = CG_FLAG_SMT; 381 else 382 cg_flags = 0; 383 if (mp_ncpus % (cpu_cores * cpu_logical) != 0) { 384 printf("WARNING: Non-uniform processors.\n"); 385 printf("WARNING: Using suboptimal topology.\n"); 386 return (smp_topo_none()); 387 } 388 /* 389 * No multi-core or hyper-threaded. 390 */ 391 if (cpu_logical * cpu_cores == 1) 392 return (smp_topo_none()); 393 /* 394 * Only HTT no multi-core. 395 */ 396 if (cpu_logical > 1 && cpu_cores == 1) 397 return (smp_topo_1level(CG_SHARE_L1, cpu_logical, cg_flags)); 398 /* 399 * Only multi-core no HTT. 400 */ 401 if (cpu_cores > 1 && cpu_logical == 1) 402 return (smp_topo_1level(CG_SHARE_L2, cpu_cores, cg_flags)); 403 /* 404 * Both HTT and multi-core. 405 */ 406 return (smp_topo_2level(CG_SHARE_L2, cpu_cores, 407 CG_SHARE_L1, cpu_logical, cg_flags)); 408} 409 410/* 411 * Calculate usable address in base memory for AP trampoline code. 412 */ 413u_int 414mp_bootaddress(u_int basemem) 415{ 416 417 bootMP_size = mptramp_end - mptramp_start; 418 boot_address = trunc_page(basemem * 1024); /* round down to 4k boundary */ 419 if (((basemem * 1024) - boot_address) < bootMP_size) 420 boot_address -= PAGE_SIZE; /* not enough, lower by 4k */ 421 /* 3 levels of page table pages */ 422 mptramp_pagetables = boot_address - (PAGE_SIZE * 3); 423 424 return mptramp_pagetables; 425} 426 427void 428cpu_add(u_int apic_id, char boot_cpu) 429{ 430 431 if (apic_id > MAX_APIC_ID) { 432 panic("SMP: APIC ID %d too high", apic_id); 433 return; 434 } 435 KASSERT(cpu_info[apic_id].cpu_present == 0, ("CPU %d added twice", 436 apic_id)); 437 cpu_info[apic_id].cpu_present = 1; 438 if (boot_cpu) { 439 KASSERT(boot_cpu_id == -1, 440 ("CPU %d claims to be BSP, but CPU %d already is", apic_id, 441 boot_cpu_id)); 442 boot_cpu_id = apic_id; 443 cpu_info[apic_id].cpu_bsp = 1; 444 } 445 if (mp_ncpus < MAXCPU) { 446 mp_ncpus++; 447 mp_maxid = mp_ncpus - 1; 448 } 449 if (bootverbose) 450 printf("SMP: Added CPU %d (%s)\n", apic_id, boot_cpu ? "BSP" : 451 "AP"); 452} 453 454void 455cpu_mp_setmaxid(void) 456{ 457 458 /* 459 * mp_maxid should be already set by calls to cpu_add(). 460 * Just sanity check its value here. 461 */ 462 if (mp_ncpus == 0) 463 KASSERT(mp_maxid == 0, 464 ("%s: mp_ncpus is zero, but mp_maxid is not", __func__)); 465 else if (mp_ncpus == 1) 466 mp_maxid = 0; 467 else 468 KASSERT(mp_maxid >= mp_ncpus - 1, 469 ("%s: counters out of sync: max %d, count %d", __func__, 470 mp_maxid, mp_ncpus)); 471} 472 473int 474cpu_mp_probe(void) 475{ 476 477 /* 478 * Always record BSP in CPU map so that the mbuf init code works 479 * correctly. 480 */ 481 CPU_SETOF(0, &all_cpus); 482 if (mp_ncpus == 0) { 483 /* 484 * No CPUs were found, so this must be a UP system. Setup 485 * the variables to represent a system with a single CPU 486 * with an id of 0. 487 */ 488 mp_ncpus = 1; 489 return (0); 490 } 491 492 /* At least one CPU was found. */ 493 if (mp_ncpus == 1) { 494 /* 495 * One CPU was found, so this must be a UP system with 496 * an I/O APIC. 497 */ 498 mp_maxid = 0; 499 return (0); 500 } 501 502 /* At least two CPUs were found. */ 503 return (1); 504} 505 506/* 507 * Initialize the IPI handlers and start up the AP's. 508 */ 509void 510cpu_mp_start(void) 511{ 512 int i; 513 514 /* Initialize the logical ID to APIC ID table. */ 515 for (i = 0; i < MAXCPU; i++) { 516 cpu_apic_ids[i] = -1; 517 cpu_ipi_pending[i] = 0; 518 } 519 520 /* Install an inter-CPU IPI for TLB invalidation */ 521 setidt(IPI_INVLTLB, IDTVEC(invltlb), SDT_SYSIGT, SEL_KPL, 0); 522 setidt(IPI_INVLPG, IDTVEC(invlpg), SDT_SYSIGT, SEL_KPL, 0); 523 setidt(IPI_INVLRNG, IDTVEC(invlrng), SDT_SYSIGT, SEL_KPL, 0); 524 525 /* Install an inter-CPU IPI for cache invalidation. */ 526 setidt(IPI_INVLCACHE, IDTVEC(invlcache), SDT_SYSIGT, SEL_KPL, 0); 527 528 /* Install an inter-CPU IPI for all-CPU rendezvous */ 529 setidt(IPI_RENDEZVOUS, IDTVEC(rendezvous), SDT_SYSIGT, SEL_KPL, 0); 530 531 /* Install generic inter-CPU IPI handler */ 532 setidt(IPI_BITMAP_VECTOR, IDTVEC(ipi_intr_bitmap_handler), 533 SDT_SYSIGT, SEL_KPL, 0); 534 535 /* Install an inter-CPU IPI for CPU stop/restart */ 536 setidt(IPI_STOP, IDTVEC(cpustop), SDT_SYSIGT, SEL_KPL, 0); 537 538 /* Install an inter-CPU IPI for CPU suspend/resume */ 539 setidt(IPI_SUSPEND, IDTVEC(cpususpend), SDT_SYSIGT, SEL_KPL, 0); 540 541 /* Set boot_cpu_id if needed. */ 542 if (boot_cpu_id == -1) { 543 boot_cpu_id = PCPU_GET(apic_id); 544 cpu_info[boot_cpu_id].cpu_bsp = 1; 545 } else 546 KASSERT(boot_cpu_id == PCPU_GET(apic_id), 547 ("BSP's APIC ID doesn't match boot_cpu_id")); 548 549 /* Probe logical/physical core configuration. */ 550 topo_probe(); 551 552 assign_cpu_ids(); 553 554 /* Start each Application Processor */ 555 start_all_aps(); 556 557 set_interrupt_apic_ids(); 558} 559 560 561/* 562 * Print various information about the SMP system hardware and setup. 563 */ 564void 565cpu_mp_announce(void) 566{ 567 const char *hyperthread; 568 int i; 569 570 printf("FreeBSD/SMP: %d package(s) x %d core(s)", 571 mp_ncpus / (cpu_cores * cpu_logical), cpu_cores); 572 if (hyperthreading_cpus > 1) 573 printf(" x %d HTT threads", cpu_logical); 574 else if (cpu_logical > 1) 575 printf(" x %d SMT threads", cpu_logical); 576 printf("\n"); 577 578 /* List active CPUs first. */ 579 printf(" cpu0 (BSP): APIC ID: %2d\n", boot_cpu_id); 580 for (i = 1; i < mp_ncpus; i++) { 581 if (cpu_info[cpu_apic_ids[i]].cpu_hyperthread) 582 hyperthread = "/HT"; 583 else 584 hyperthread = ""; 585 printf(" cpu%d (AP%s): APIC ID: %2d\n", i, hyperthread, 586 cpu_apic_ids[i]); 587 } 588 589 /* List disabled CPUs last. */ 590 for (i = 0; i <= MAX_APIC_ID; i++) { 591 if (!cpu_info[i].cpu_present || !cpu_info[i].cpu_disabled) 592 continue; 593 if (cpu_info[i].cpu_hyperthread) 594 hyperthread = "/HT"; 595 else 596 hyperthread = ""; 597 printf(" cpu (AP%s): APIC ID: %2d (disabled)\n", hyperthread, 598 i); 599 } 600} 601 602/* 603 * AP CPU's call this to initialize themselves. 604 */ 605void 606init_secondary(void) 607{ 608 struct pcpu *pc; 609 struct nmi_pcpu *np; 610 u_int64_t msr, cr0; 611 u_int cpuid; 612 int cpu, gsel_tss, x; 613 struct region_descriptor ap_gdt; 614 615 /* Set by the startup code for us to use */ 616 cpu = bootAP; 617 618 /* Init tss */ 619 common_tss[cpu] = common_tss[0]; 620 common_tss[cpu].tss_rsp0 = 0; /* not used until after switch */ 621 common_tss[cpu].tss_iobase = sizeof(struct amd64tss) + 622 IOPAGES * PAGE_SIZE; 623 common_tss[cpu].tss_ist1 = (long)&doublefault_stack[PAGE_SIZE]; 624 625 /* The NMI stack runs on IST2. */ 626 np = ((struct nmi_pcpu *) &nmi_stack[PAGE_SIZE]) - 1; 627 common_tss[cpu].tss_ist2 = (long) np; 628 629 /* Prepare private GDT */ 630 gdt_segs[GPROC0_SEL].ssd_base = (long) &common_tss[cpu]; 631 for (x = 0; x < NGDT; x++) { 632 if (x != GPROC0_SEL && x != (GPROC0_SEL + 1) && 633 x != GUSERLDT_SEL && x != (GUSERLDT_SEL + 1)) 634 ssdtosd(&gdt_segs[x], &gdt[NGDT * cpu + x]); 635 } 636 ssdtosyssd(&gdt_segs[GPROC0_SEL], 637 (struct system_segment_descriptor *)&gdt[NGDT * cpu + GPROC0_SEL]); 638 ap_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1; 639 ap_gdt.rd_base = (long) &gdt[NGDT * cpu]; 640 lgdt(&ap_gdt); /* does magic intra-segment return */ 641 642 /* Get per-cpu data */ 643 pc = &__pcpu[cpu]; 644 645 /* prime data page for it to use */ 646 pcpu_init(pc, cpu, sizeof(struct pcpu)); 647 dpcpu_init(dpcpu, cpu); 648 pc->pc_apic_id = cpu_apic_ids[cpu]; 649 pc->pc_prvspace = pc; 650 pc->pc_curthread = 0; 651 pc->pc_tssp = &common_tss[cpu]; 652 pc->pc_commontssp = &common_tss[cpu]; 653 pc->pc_rsp0 = 0; 654 pc->pc_tss = (struct system_segment_descriptor *)&gdt[NGDT * cpu + 655 GPROC0_SEL]; 656 pc->pc_fs32p = &gdt[NGDT * cpu + GUFS32_SEL]; 657 pc->pc_gs32p = &gdt[NGDT * cpu + GUGS32_SEL]; 658 pc->pc_ldt = (struct system_segment_descriptor *)&gdt[NGDT * cpu + 659 GUSERLDT_SEL]; 660 661 /* Save the per-cpu pointer for use by the NMI handler. */ 662 np->np_pcpu = (register_t) pc; 663 664 wrmsr(MSR_FSBASE, 0); /* User value */ 665 wrmsr(MSR_GSBASE, (u_int64_t)pc); 666 wrmsr(MSR_KGSBASE, (u_int64_t)pc); /* XXX User value while we're in the kernel */ 667 668 lidt(&r_idt); 669 670 gsel_tss = GSEL(GPROC0_SEL, SEL_KPL); 671 ltr(gsel_tss); 672 673 /* 674 * Set to a known state: 675 * Set by mpboot.s: CR0_PG, CR0_PE 676 * Set by cpu_setregs: CR0_NE, CR0_MP, CR0_TS, CR0_WP, CR0_AM 677 */ 678 cr0 = rcr0(); 679 cr0 &= ~(CR0_CD | CR0_NW | CR0_EM); 680 load_cr0(cr0); 681 682 /* Set up the fast syscall stuff */ 683 msr = rdmsr(MSR_EFER) | EFER_SCE; 684 wrmsr(MSR_EFER, msr); 685 wrmsr(MSR_LSTAR, (u_int64_t)IDTVEC(fast_syscall)); 686 wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32)); 687 msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) | 688 ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48); 689 wrmsr(MSR_STAR, msr); 690 wrmsr(MSR_SF_MASK, PSL_NT|PSL_T|PSL_I|PSL_C|PSL_D); 691 692 /* Disable local APIC just to be sure. */ 693 lapic_disable(); 694 695 /* signal our startup to the BSP. */ 696 mp_naps++; 697 698 /* Spin until the BSP releases the AP's. */ 699 while (!aps_ready) 700 ia32_pause(); 701 702 /* Initialize the PAT MSR. */ 703 pmap_init_pat(); 704 705 /* set up CPU registers and state */ 706 cpu_setregs(); 707 708 /* set up SSE/NX registers */ 709 initializecpu(); 710 711 /* set up FPU state on the AP */ 712 fpuinit(); 713 714 /* A quick check from sanity claus */ 715 cpuid = PCPU_GET(cpuid); 716 if (PCPU_GET(apic_id) != lapic_id()) { 717 printf("SMP: cpuid = %d\n", cpuid); 718 printf("SMP: actual apic_id = %d\n", lapic_id()); 719 printf("SMP: correct apic_id = %d\n", PCPU_GET(apic_id)); 720 panic("cpuid mismatch! boom!!"); 721 } 722 723 /* Initialize curthread. */ 724 KASSERT(PCPU_GET(idlethread) != NULL, ("no idle thread")); 725 PCPU_SET(curthread, PCPU_GET(idlethread)); 726 727 mca_init(); 728 729 mtx_lock_spin(&ap_boot_mtx); 730 731 /* Init local apic for irq's */ 732 lapic_setup(1); 733 734 /* Set memory range attributes for this CPU to match the BSP */ 735 mem_range_AP_init(); 736 737 smp_cpus++; 738 739 CTR1(KTR_SMP, "SMP: AP CPU #%d Launched", cpuid); 740 printf("SMP: AP CPU #%d Launched!\n", cpuid); 741 742 /* Determine if we are a logical CPU. */ 743 /* XXX Calculation depends on cpu_logical being a power of 2, e.g. 2 */ 744 if (cpu_logical > 1 && PCPU_GET(apic_id) % cpu_logical != 0) 745 CPU_SET(cpuid, &logical_cpus_mask); 746 747 if (bootverbose) 748 lapic_dump("AP"); 749 750 if (smp_cpus == mp_ncpus) { 751 /* enable IPI's, tlb shootdown, freezes etc */ 752 atomic_store_rel_int(&smp_started, 1); 753 smp_active = 1; /* historic */ 754 } 755 756 /* 757 * Enable global pages TLB extension 758 * This also implicitly flushes the TLB 759 */ 760 761 load_cr4(rcr4() | CR4_PGE); 762 load_ds(_udatasel); 763 load_es(_udatasel); 764 load_fs(_ufssel); 765 mtx_unlock_spin(&ap_boot_mtx); 766 767 /* Wait until all the AP's are up. */ 768 while (smp_started == 0) 769 ia32_pause(); 770 771 /* Start per-CPU event timers. */ 772 cpu_initclocks_ap(); 773 774 sched_throw(NULL); 775 776 panic("scheduler returned us to %s", __func__); 777 /* NOTREACHED */ 778} 779 780/******************************************************************* 781 * local functions and data 782 */ 783 784/* 785 * We tell the I/O APIC code about all the CPUs we want to receive 786 * interrupts. If we don't want certain CPUs to receive IRQs we 787 * can simply not tell the I/O APIC code about them in this function. 788 * We also do not tell it about the BSP since it tells itself about 789 * the BSP internally to work with UP kernels and on UP machines. 790 */ 791static void 792set_interrupt_apic_ids(void) 793{ 794 u_int i, apic_id; 795 796 for (i = 0; i < MAXCPU; i++) { 797 apic_id = cpu_apic_ids[i]; 798 if (apic_id == -1) 799 continue; 800 if (cpu_info[apic_id].cpu_bsp) 801 continue; 802 if (cpu_info[apic_id].cpu_disabled) 803 continue; 804 805 /* Don't let hyperthreads service interrupts. */ 806 if (hyperthreading_cpus > 1 && 807 apic_id % hyperthreading_cpus != 0) 808 continue; 809 810 intr_add_cpu(i); 811 } 812} 813 814/* 815 * Assign logical CPU IDs to local APICs. 816 */ 817static void 818assign_cpu_ids(void) 819{ 820 u_int i; 821 822 TUNABLE_INT_FETCH("machdep.hyperthreading_allowed", 823 &hyperthreading_allowed); 824 825 /* Check for explicitly disabled CPUs. */ 826 for (i = 0; i <= MAX_APIC_ID; i++) { 827 if (!cpu_info[i].cpu_present || cpu_info[i].cpu_bsp) 828 continue; 829 830 if (hyperthreading_cpus > 1 && i % hyperthreading_cpus != 0) { 831 cpu_info[i].cpu_hyperthread = 1; 832 833 /* 834 * Don't use HT CPU if it has been disabled by a 835 * tunable. 836 */ 837 if (hyperthreading_allowed == 0) { 838 cpu_info[i].cpu_disabled = 1; 839 continue; 840 } 841 } 842 843 /* Don't use this CPU if it has been disabled by a tunable. */ 844 if (resource_disabled("lapic", i)) { 845 cpu_info[i].cpu_disabled = 1; 846 continue; 847 } 848 } 849 850 if (hyperthreading_allowed == 0 && hyperthreading_cpus > 1) { 851 hyperthreading_cpus = 0; 852 cpu_logical = 1; 853 } 854 855 /* 856 * Assign CPU IDs to local APIC IDs and disable any CPUs 857 * beyond MAXCPU. CPU 0 is always assigned to the BSP. 858 * 859 * To minimize confusion for userland, we attempt to number 860 * CPUs such that all threads and cores in a package are 861 * grouped together. For now we assume that the BSP is always 862 * the first thread in a package and just start adding APs 863 * starting with the BSP's APIC ID. 864 */ 865 mp_ncpus = 1; 866 cpu_apic_ids[0] = boot_cpu_id; 867 apic_cpuids[boot_cpu_id] = 0; 868 for (i = boot_cpu_id + 1; i != boot_cpu_id; 869 i == MAX_APIC_ID ? i = 0 : i++) { 870 if (!cpu_info[i].cpu_present || cpu_info[i].cpu_bsp || 871 cpu_info[i].cpu_disabled) 872 continue; 873 874 if (mp_ncpus < MAXCPU) { 875 cpu_apic_ids[mp_ncpus] = i; 876 apic_cpuids[i] = mp_ncpus; 877 mp_ncpus++; 878 } else 879 cpu_info[i].cpu_disabled = 1; 880 } 881 KASSERT(mp_maxid >= mp_ncpus - 1, 882 ("%s: counters out of sync: max %d, count %d", __func__, mp_maxid, 883 mp_ncpus)); 884} 885 886/* 887 * start each AP in our list 888 */ 889static int 890start_all_aps(void) 891{ 892 vm_offset_t va = boot_address + KERNBASE; 893 u_int64_t *pt4, *pt3, *pt2; 894 u_int32_t mpbioswarmvec; 895 int apic_id, cpu, i; 896 u_char mpbiosreason; 897 898 mtx_init(&ap_boot_mtx, "ap boot", NULL, MTX_SPIN); 899 900 /* install the AP 1st level boot code */ 901 pmap_kenter(va, boot_address); 902 pmap_invalidate_page(kernel_pmap, va); 903 bcopy(mptramp_start, (void *)va, bootMP_size); 904 905 /* Locate the page tables, they'll be below the trampoline */ 906 pt4 = (u_int64_t *)(uintptr_t)(mptramp_pagetables + KERNBASE); 907 pt3 = pt4 + (PAGE_SIZE) / sizeof(u_int64_t); 908 pt2 = pt3 + (PAGE_SIZE) / sizeof(u_int64_t); 909 910 /* Create the initial 1GB replicated page tables */ 911 for (i = 0; i < 512; i++) { 912 /* Each slot of the level 4 pages points to the same level 3 page */ 913 pt4[i] = (u_int64_t)(uintptr_t)(mptramp_pagetables + PAGE_SIZE); 914 pt4[i] |= PG_V | PG_RW | PG_U; 915 916 /* Each slot of the level 3 pages points to the same level 2 page */ 917 pt3[i] = (u_int64_t)(uintptr_t)(mptramp_pagetables + (2 * PAGE_SIZE)); 918 pt3[i] |= PG_V | PG_RW | PG_U; 919 920 /* The level 2 page slots are mapped with 2MB pages for 1GB. */ 921 pt2[i] = i * (2 * 1024 * 1024); 922 pt2[i] |= PG_V | PG_RW | PG_PS | PG_U; 923 } 924 925 /* save the current value of the warm-start vector */ 926 mpbioswarmvec = *((u_int32_t *) WARMBOOT_OFF); 927 outb(CMOS_REG, BIOS_RESET); 928 mpbiosreason = inb(CMOS_DATA); 929 930 /* setup a vector to our boot code */ 931 *((volatile u_short *) WARMBOOT_OFF) = WARMBOOT_TARGET; 932 *((volatile u_short *) WARMBOOT_SEG) = (boot_address >> 4); 933 outb(CMOS_REG, BIOS_RESET); 934 outb(CMOS_DATA, BIOS_WARM); /* 'warm-start' */ 935 936 /* start each AP */ 937 for (cpu = 1; cpu < mp_ncpus; cpu++) { 938 apic_id = cpu_apic_ids[cpu]; 939 940 /* allocate and set up an idle stack data page */ 941 bootstacks[cpu] = (void *)kmem_alloc(kernel_map, KSTACK_PAGES * PAGE_SIZE); 942 doublefault_stack = (char *)kmem_alloc(kernel_map, PAGE_SIZE); 943 nmi_stack = (char *)kmem_alloc(kernel_map, PAGE_SIZE); 944 dpcpu = (void *)kmem_alloc(kernel_map, DPCPU_SIZE); 945 946 bootSTK = (char *)bootstacks[cpu] + KSTACK_PAGES * PAGE_SIZE - 8; 947 bootAP = cpu; 948 949 /* attempt to start the Application Processor */ 950 if (!start_ap(apic_id)) { 951 /* restore the warmstart vector */ 952 *(u_int32_t *) WARMBOOT_OFF = mpbioswarmvec; 953 panic("AP #%d (PHY# %d) failed!", cpu, apic_id); 954 } 955 956 CPU_SET(cpu, &all_cpus); /* record AP in CPU map */ 957 } 958 959 /* restore the warmstart vector */ 960 *(u_int32_t *) WARMBOOT_OFF = mpbioswarmvec; 961 962 outb(CMOS_REG, BIOS_RESET); 963 outb(CMOS_DATA, mpbiosreason); 964 965 /* number of APs actually started */ 966 return mp_naps; 967} 968 969 970/* 971 * This function starts the AP (application processor) identified 972 * by the APIC ID 'physicalCpu'. It does quite a "song and dance" 973 * to accomplish this. This is necessary because of the nuances 974 * of the different hardware we might encounter. It isn't pretty, 975 * but it seems to work. 976 */ 977static int 978start_ap(int apic_id) 979{ 980 int vector, ms; 981 int cpus; 982 983 /* calculate the vector */ 984 vector = (boot_address >> 12) & 0xff; 985 986 /* used as a watchpoint to signal AP startup */ 987 cpus = mp_naps; 988 989 /* 990 * first we do an INIT/RESET IPI this INIT IPI might be run, reseting 991 * and running the target CPU. OR this INIT IPI might be latched (P5 992 * bug), CPU waiting for STARTUP IPI. OR this INIT IPI might be 993 * ignored. 994 */ 995 996 /* do an INIT IPI: assert RESET */ 997 lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE | 998 APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, apic_id); 999 1000 /* wait for pending status end */ 1001 lapic_ipi_wait(-1); 1002 1003 /* do an INIT IPI: deassert RESET */ 1004 lapic_ipi_raw(APIC_DEST_ALLESELF | APIC_TRIGMOD_LEVEL | 1005 APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, 0); 1006 1007 /* wait for pending status end */ 1008 DELAY(10000); /* wait ~10mS */ 1009 lapic_ipi_wait(-1); 1010 1011 /* 1012 * next we do a STARTUP IPI: the previous INIT IPI might still be 1013 * latched, (P5 bug) this 1st STARTUP would then terminate 1014 * immediately, and the previously started INIT IPI would continue. OR 1015 * the previous INIT IPI has already run. and this STARTUP IPI will 1016 * run. OR the previous INIT IPI was ignored. and this STARTUP IPI 1017 * will run. 1018 */ 1019 1020 /* do a STARTUP IPI */ 1021 lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE | 1022 APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP | 1023 vector, apic_id); 1024 lapic_ipi_wait(-1); 1025 DELAY(200); /* wait ~200uS */ 1026 1027 /* 1028 * finally we do a 2nd STARTUP IPI: this 2nd STARTUP IPI should run IF 1029 * the previous STARTUP IPI was cancelled by a latched INIT IPI. OR 1030 * this STARTUP IPI will be ignored, as only ONE STARTUP IPI is 1031 * recognized after hardware RESET or INIT IPI. 1032 */ 1033 1034 lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE | 1035 APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP | 1036 vector, apic_id); 1037 lapic_ipi_wait(-1); 1038 DELAY(200); /* wait ~200uS */ 1039 1040 /* Wait up to 5 seconds for it to start. */ 1041 for (ms = 0; ms < 5000; ms++) { 1042 if (mp_naps > cpus) 1043 return 1; /* return SUCCESS */ 1044 DELAY(1000); 1045 } 1046 return 0; /* return FAILURE */ 1047} 1048 1049#ifdef COUNT_XINVLTLB_HITS 1050u_int xhits_gbl[MAXCPU]; 1051u_int xhits_pg[MAXCPU]; 1052u_int xhits_rng[MAXCPU]; 1053static SYSCTL_NODE(_debug, OID_AUTO, xhits, CTLFLAG_RW, 0, ""); 1054SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, global, CTLFLAG_RW, &xhits_gbl, 1055 sizeof(xhits_gbl), "IU", ""); 1056SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, page, CTLFLAG_RW, &xhits_pg, 1057 sizeof(xhits_pg), "IU", ""); 1058SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, range, CTLFLAG_RW, &xhits_rng, 1059 sizeof(xhits_rng), "IU", ""); 1060 1061u_int ipi_global; 1062u_int ipi_page; 1063u_int ipi_range; 1064u_int ipi_range_size; 1065SYSCTL_UINT(_debug_xhits, OID_AUTO, ipi_global, CTLFLAG_RW, &ipi_global, 0, ""); 1066SYSCTL_UINT(_debug_xhits, OID_AUTO, ipi_page, CTLFLAG_RW, &ipi_page, 0, ""); 1067SYSCTL_UINT(_debug_xhits, OID_AUTO, ipi_range, CTLFLAG_RW, &ipi_range, 0, ""); 1068SYSCTL_UINT(_debug_xhits, OID_AUTO, ipi_range_size, CTLFLAG_RW, 1069 &ipi_range_size, 0, ""); 1070 1071u_int ipi_masked_global; 1072u_int ipi_masked_page; 1073u_int ipi_masked_range; 1074u_int ipi_masked_range_size; 1075SYSCTL_UINT(_debug_xhits, OID_AUTO, ipi_masked_global, CTLFLAG_RW, 1076 &ipi_masked_global, 0, ""); 1077SYSCTL_UINT(_debug_xhits, OID_AUTO, ipi_masked_page, CTLFLAG_RW, 1078 &ipi_masked_page, 0, ""); 1079SYSCTL_UINT(_debug_xhits, OID_AUTO, ipi_masked_range, CTLFLAG_RW, 1080 &ipi_masked_range, 0, ""); 1081SYSCTL_UINT(_debug_xhits, OID_AUTO, ipi_masked_range_size, CTLFLAG_RW, 1082 &ipi_masked_range_size, 0, ""); 1083#endif /* COUNT_XINVLTLB_HITS */ 1084 1085/* 1086 * Send an IPI to specified CPU handling the bitmap logic. 1087 */ 1088static void 1089ipi_send_cpu(int cpu, u_int ipi) 1090{ 1091 u_int bitmap, old_pending, new_pending; 1092 1093 KASSERT(cpu_apic_ids[cpu] != -1, ("IPI to non-existent CPU %d", cpu)); 1094 1095 if (IPI_IS_BITMAPED(ipi)) { 1096 bitmap = 1 << ipi; 1097 ipi = IPI_BITMAP_VECTOR; 1098 do { 1099 old_pending = cpu_ipi_pending[cpu]; 1100 new_pending = old_pending | bitmap; 1101 } while (!atomic_cmpset_int(&cpu_ipi_pending[cpu], 1102 old_pending, new_pending)); 1103 if (old_pending) 1104 return; 1105 } 1106 lapic_ipi_vectored(ipi, cpu_apic_ids[cpu]); 1107} 1108 1109/* 1110 * Flush the TLB on all other CPU's 1111 */ 1112static void 1113smp_tlb_shootdown(u_int vector, vm_offset_t addr1, vm_offset_t addr2) 1114{ 1115 u_int ncpu; 1116 1117 ncpu = mp_ncpus - 1; /* does not shootdown self */ 1118 if (ncpu < 1) 1119 return; /* no other cpus */ 1120 if (!(read_rflags() & PSL_I)) 1121 panic("%s: interrupts disabled", __func__); 1122 mtx_lock_spin(&smp_ipi_mtx); 1123 smp_tlb_addr1 = addr1; 1124 smp_tlb_addr2 = addr2; 1125 atomic_store_rel_int(&smp_tlb_wait, 0); 1126 ipi_all_but_self(vector); 1127 while (smp_tlb_wait < ncpu) 1128 ia32_pause(); 1129 mtx_unlock_spin(&smp_ipi_mtx); 1130} 1131 1132static void 1133smp_targeted_tlb_shootdown(cpuset_t mask, u_int vector, vm_offset_t addr1, vm_offset_t addr2) 1134{ 1135 int cpu, ncpu, othercpus; 1136 1137 othercpus = mp_ncpus - 1; 1138 if (CPU_ISFULLSET(&mask)) { 1139 if (othercpus < 1) 1140 return; 1141 } else { 1142 CPU_CLR(PCPU_GET(cpuid), &mask); 1143 if (CPU_EMPTY(&mask)) 1144 return; 1145 } 1146 if (!(read_rflags() & PSL_I)) 1147 panic("%s: interrupts disabled", __func__); 1148 mtx_lock_spin(&smp_ipi_mtx); 1149 smp_tlb_addr1 = addr1; 1150 smp_tlb_addr2 = addr2; 1151 atomic_store_rel_int(&smp_tlb_wait, 0); 1152 if (CPU_ISFULLSET(&mask)) { 1153 ncpu = othercpus; 1154 ipi_all_but_self(vector); 1155 } else { 1156 ncpu = 0; 1157 while ((cpu = cpusetobj_ffs(&mask)) != 0) { 1158 cpu--; 1159 CPU_CLR(cpu, &mask); 1160 CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, 1161 cpu, vector); 1162 ipi_send_cpu(cpu, vector); 1163 ncpu++; 1164 } 1165 } 1166 while (smp_tlb_wait < ncpu) 1167 ia32_pause(); 1168 mtx_unlock_spin(&smp_ipi_mtx); 1169} 1170 1171void 1172smp_cache_flush(void) 1173{ 1174 1175 if (smp_started) 1176 smp_tlb_shootdown(IPI_INVLCACHE, 0, 0); 1177} 1178 1179void 1180smp_invltlb(void) 1181{ 1182 1183 if (smp_started) { 1184 smp_tlb_shootdown(IPI_INVLTLB, 0, 0); 1185#ifdef COUNT_XINVLTLB_HITS 1186 ipi_global++; 1187#endif 1188 } 1189} 1190 1191void 1192smp_invlpg(vm_offset_t addr) 1193{ 1194 1195 if (smp_started) { 1196 smp_tlb_shootdown(IPI_INVLPG, addr, 0); 1197#ifdef COUNT_XINVLTLB_HITS 1198 ipi_page++; 1199#endif 1200 } 1201} 1202 1203void 1204smp_invlpg_range(vm_offset_t addr1, vm_offset_t addr2) 1205{ 1206 1207 if (smp_started) { 1208 smp_tlb_shootdown(IPI_INVLRNG, addr1, addr2); 1209#ifdef COUNT_XINVLTLB_HITS 1210 ipi_range++; 1211 ipi_range_size += (addr2 - addr1) / PAGE_SIZE; 1212#endif 1213 } 1214} 1215 1216void 1217smp_masked_invltlb(cpuset_t mask) 1218{ 1219 1220 if (smp_started) { 1221 smp_targeted_tlb_shootdown(mask, IPI_INVLTLB, 0, 0); 1222#ifdef COUNT_XINVLTLB_HITS 1223 ipi_masked_global++; 1224#endif 1225 } 1226} 1227 1228void 1229smp_masked_invlpg(cpuset_t mask, vm_offset_t addr) 1230{ 1231 1232 if (smp_started) { 1233 smp_targeted_tlb_shootdown(mask, IPI_INVLPG, addr, 0); 1234#ifdef COUNT_XINVLTLB_HITS 1235 ipi_masked_page++; 1236#endif 1237 } 1238} 1239 1240void 1241smp_masked_invlpg_range(cpuset_t mask, vm_offset_t addr1, vm_offset_t addr2) 1242{ 1243 1244 if (smp_started) { 1245 smp_targeted_tlb_shootdown(mask, IPI_INVLRNG, addr1, addr2); 1246#ifdef COUNT_XINVLTLB_HITS 1247 ipi_masked_range++; 1248 ipi_masked_range_size += (addr2 - addr1) / PAGE_SIZE; 1249#endif 1250 } 1251} 1252 1253void 1254ipi_bitmap_handler(struct trapframe frame) 1255{ 1256 struct trapframe *oldframe; 1257 struct thread *td; 1258 int cpu = PCPU_GET(cpuid); 1259 u_int ipi_bitmap; 1260 1261 critical_enter(); 1262 td = curthread; 1263 td->td_intr_nesting_level++; 1264 oldframe = td->td_intr_frame; 1265 td->td_intr_frame = &frame; 1266 ipi_bitmap = atomic_readandclear_int(&cpu_ipi_pending[cpu]); 1267 if (ipi_bitmap & (1 << IPI_PREEMPT)) { 1268#ifdef COUNT_IPIS 1269 (*ipi_preempt_counts[cpu])++; 1270#endif 1271 sched_preempt(td); 1272 } 1273 if (ipi_bitmap & (1 << IPI_AST)) { 1274#ifdef COUNT_IPIS 1275 (*ipi_ast_counts[cpu])++; 1276#endif 1277 /* Nothing to do for AST */ 1278 } 1279 if (ipi_bitmap & (1 << IPI_HARDCLOCK)) { 1280#ifdef COUNT_IPIS 1281 (*ipi_hardclock_counts[cpu])++; 1282#endif 1283 hardclockintr(); 1284 } 1285 td->td_intr_frame = oldframe; 1286 td->td_intr_nesting_level--; 1287 critical_exit(); 1288} 1289 1290/* 1291 * send an IPI to a set of cpus. 1292 */ 1293void 1294ipi_selected(cpuset_t cpus, u_int ipi) 1295{ 1296 int cpu; 1297 1298 /* 1299 * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit 1300 * of help in order to understand what is the source. 1301 * Set the mask of receiving CPUs for this purpose. 1302 */ 1303 if (ipi == IPI_STOP_HARD) 1304 CPU_OR_ATOMIC(&ipi_nmi_pending, &cpus); 1305 1306 while ((cpu = cpusetobj_ffs(&cpus)) != 0) { 1307 cpu--; 1308 CPU_CLR(cpu, &cpus); 1309 CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi); 1310 ipi_send_cpu(cpu, ipi); 1311 } 1312} 1313 1314/* 1315 * send an IPI to a specific CPU. 1316 */ 1317void 1318ipi_cpu(int cpu, u_int ipi) 1319{ 1320 1321 /* 1322 * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit 1323 * of help in order to understand what is the source. 1324 * Set the mask of receiving CPUs for this purpose. 1325 */ 1326 if (ipi == IPI_STOP_HARD) 1327 CPU_SET_ATOMIC(cpu, &ipi_nmi_pending); 1328 1329 CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi); 1330 ipi_send_cpu(cpu, ipi); 1331} 1332 1333/* 1334 * send an IPI to all CPUs EXCEPT myself 1335 */ 1336void 1337ipi_all_but_self(u_int ipi) 1338{ 1339 cpuset_t other_cpus; 1340 1341 other_cpus = all_cpus; 1342 CPU_CLR(PCPU_GET(cpuid), &other_cpus); 1343 1344 if (IPI_IS_BITMAPED(ipi)) { 1345 ipi_selected(other_cpus, ipi); 1346 return; 1347 } 1348 1349 /* 1350 * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit 1351 * of help in order to understand what is the source. 1352 * Set the mask of receiving CPUs for this purpose. 1353 */ 1354 if (ipi == IPI_STOP_HARD) 1355 CPU_OR_ATOMIC(&ipi_nmi_pending, &other_cpus); 1356 1357 CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi); 1358 lapic_ipi_vectored(ipi, APIC_IPI_DEST_OTHERS); 1359} 1360 1361int 1362ipi_nmi_handler() 1363{ 1364 u_int cpuid; 1365 1366 /* 1367 * As long as there is not a simple way to know about a NMI's 1368 * source, if the bitmask for the current CPU is present in 1369 * the global pending bitword an IPI_STOP_HARD has been issued 1370 * and should be handled. 1371 */ 1372 cpuid = PCPU_GET(cpuid); 1373 if (!CPU_ISSET(cpuid, &ipi_nmi_pending)) 1374 return (1); 1375 1376 CPU_CLR_ATOMIC(cpuid, &ipi_nmi_pending); 1377 cpustop_handler(); 1378 return (0); 1379} 1380 1381/* 1382 * Handle an IPI_STOP by saving our current context and spinning until we 1383 * are resumed. 1384 */ 1385void 1386cpustop_handler(void) 1387{ 1388 u_int cpu; 1389 1390 cpu = PCPU_GET(cpuid); 1391 1392 savectx(&stoppcbs[cpu]); 1393 1394 /* Indicate that we are stopped */ 1395 CPU_SET_ATOMIC(cpu, &stopped_cpus); 1396 1397 /* Wait for restart */ 1398 while (!CPU_ISSET(cpu, &started_cpus)) 1399 ia32_pause(); 1400 1401 CPU_CLR_ATOMIC(cpu, &started_cpus); 1402 CPU_CLR_ATOMIC(cpu, &stopped_cpus); 1403 1404 if (cpu == 0 && cpustop_restartfunc != NULL) { 1405 cpustop_restartfunc(); 1406 cpustop_restartfunc = NULL; 1407 } 1408} 1409 1410/* 1411 * Handle an IPI_SUSPEND by saving our current context and spinning until we 1412 * are resumed. 1413 */ 1414void 1415cpususpend_handler(void) 1416{ 1417 u_int cpu; 1418 1419 cpu = PCPU_GET(cpuid); 1420 1421 if (savectx(susppcbs[cpu])) { 1422 ctx_fpusave(suspfpusave[cpu]); 1423 wbinvd(); 1424 CPU_SET_ATOMIC(cpu, &stopped_cpus); 1425 } else { 1426 pmap_init_pat(); 1427 load_cr3(susppcbs[cpu]->pcb_cr3); 1428 initializecpu(); 1429 PCPU_SET(switchtime, 0); 1430 PCPU_SET(switchticks, ticks); 1431 } 1432 1433 /* Wait for resume */ 1434 while (!CPU_ISSET(cpu, &started_cpus)) 1435 ia32_pause(); 1436 1437 CPU_CLR_ATOMIC(cpu, &started_cpus); 1438 CPU_CLR_ATOMIC(cpu, &stopped_cpus); 1439 1440 /* Resume MCA and local APIC */ 1441 mca_resume(); 1442 lapic_setup(0); 1443} 1444 1445/* 1446 * This is called once the rest of the system is up and running and we're 1447 * ready to let the AP's out of the pen. 1448 */ 1449static void 1450release_aps(void *dummy __unused) 1451{ 1452 1453 if (mp_ncpus == 1) 1454 return; 1455 atomic_store_rel_int(&aps_ready, 1); 1456 while (smp_started == 0) 1457 ia32_pause(); 1458} 1459SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, release_aps, NULL); 1460 1461#ifdef COUNT_IPIS 1462/* 1463 * Setup interrupt counters for IPI handlers. 1464 */ 1465static void 1466mp_ipi_intrcnt(void *dummy) 1467{ 1468 char buf[64]; 1469 int i; 1470 1471 CPU_FOREACH(i) { 1472 snprintf(buf, sizeof(buf), "cpu%d:invltlb", i); 1473 intrcnt_add(buf, &ipi_invltlb_counts[i]); 1474 snprintf(buf, sizeof(buf), "cpu%d:invlrng", i); 1475 intrcnt_add(buf, &ipi_invlrng_counts[i]); 1476 snprintf(buf, sizeof(buf), "cpu%d:invlpg", i); 1477 intrcnt_add(buf, &ipi_invlpg_counts[i]); 1478 snprintf(buf, sizeof(buf), "cpu%d:preempt", i); 1479 intrcnt_add(buf, &ipi_preempt_counts[i]); 1480 snprintf(buf, sizeof(buf), "cpu%d:ast", i); 1481 intrcnt_add(buf, &ipi_ast_counts[i]); 1482 snprintf(buf, sizeof(buf), "cpu%d:rendezvous", i); 1483 intrcnt_add(buf, &ipi_rendezvous_counts[i]); 1484 snprintf(buf, sizeof(buf), "cpu%d:hardclock", i); 1485 intrcnt_add(buf, &ipi_hardclock_counts[i]); 1486 } 1487} 1488SYSINIT(mp_ipi_intrcnt, SI_SUB_INTR, SI_ORDER_MIDDLE, mp_ipi_intrcnt, NULL); 1489#endif 1490 1491